Hackmageddon_Parser.R
4.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#------------------------------------------------------------------------------------------------------
#-----------------------------Excel parser - Hackmageddon----------------------------------------------
#------------------------------------------------------------------------------------------------------
#' Read the attack columns of a Hackmageddon Excel workbook
#'
#' Reads the first sheet of the workbook with `xlsx::read.xlsx2()`. When the
#' result has no `Date` column the sheet is read a second time starting at
#' row 2, on the assumption that a blank row sits above the header.
#'
#' @param file path to the excel file
#' @param cols vector of column indexes to read (passed as `colIndex`); the
#'   column classes requested are numeric, character, character, character
#'
#' @return data.frame with the selected columns as found in the sheet
#' @export
#'
#' @examples
#' data.raw <- ParseHMExcel("./data/hackmageddon/file.xls", c(2, 3, 6, 5))
ParseHMExcel <- function(file, cols){
  # Refuse to continue when the workbook is missing
  if (!file.exists(file)) {
    stop(paste("Error, file [", file, "] not found"))
  }

  # All workbooks share the same column types; only the positions differ
  col.types <- c("numeric", "character", "character", "character")
  ReadSheet <- function(...) {
    xlsx::read.xlsx2(file, 1, header = TRUE, colIndex = cols, colClasses = col.types, ...)
  }

  parsed <- ReadSheet()
  if (!is.null(parsed$Date)) {
    return(parsed)
  }

  # No Date column: probably a blank first row, so skip it and read again
  ReadSheet(startRow = 2)
}
#' Clean the raw Hackmageddon data so it can be analysed
#'
#' Renames the four columns to `Date`, `Attack`, `Target` and `Country`, drops
#' rows without a date or country, drops rows whose country is `"INT"` or
#' contains `">"`, replaces line breaks inside the country cell with spaces,
#' hands the result to `FilterMultiCountry()`, removes rows with an empty
#' country and finally converts the numeric date to `POSIXct` (GMT).
#'
#' @param dataset.raw data.frame with the four raw columns, in the order
#'   date, attack, target, country
#' @param dateOffset origin the numeric date is counted from; the numeric
#'   value is multiplied by 86400 (seconds per day) before conversion
#'
#' @return data.frame with columns `Date` (POSIXct), `Attack`, `Target` and
#'   `Country`
#' @export
#'
#' @examples
#' data.pro <- ProcessHMRaw(data.raw, "1899-12-30")
ProcessHMRaw <- function(dataset.raw, dateOffset){
  # Standard column names
  cleaned <- setNames(dataset.raw, c("Date", "Attack", "Target", "Country"))

  # Rows missing the date or the country cannot be used
  cleaned <- cleaned[!is.na(cleaned$Date), ]
  cleaned <- cleaned[!is.na(cleaned$Country), ]

  # Country markers that do not identify a single country
  for (marker in c(">1", ">A", "INT")) {
    cleaned <- cleaned[cleaned$Country != marker, ]
  }
  cleaned <- cleaned[!grepl(">", cleaned$Country), ]

  # Line breaks inside the cell become spaces before the multi-country step
  cleaned$Country <- gsub("\n", " ", cleaned$Country)
  cleaned <- FilterMultiCountry(cleaned)
  cleaned <- cleaned[cleaned$Country != "", ]

  # Numeric date -> seconds since the origin -> POSIXct
  seconds <- cleaned$Date * 86400
  cleaned$Date <- as.POSIXct(seconds, tz = "GMT", origin = dateOffset)
  cleaned
}
#' Expand rows that list several countries into one row per country
#'
#' A row whose `Country` value contains a space is treated as a
#' space-separated list of countries. Each such row is replaced by one copy
#' per listed country; rows with a single country are returned unchanged and
#' come first in the result.
#'
#' Empty tokens produced by leading or repeated spaces are kept as `""` so the
#' caller can decide how to handle them.
#'
#' @param dataset.pre data.frame with a `Country` column
#'
#' @return data.frame with the single-country rows followed by the expanded
#'   multi-country rows
#' @noRd
FilterMultiCountry <- function(dataset.pre) {
  is.multi <- grepl(" ", dataset.pre$Country)
  single <- dataset.pre[!is.multi, , drop = FALSE]
  multi <- dataset.pre[is.multi, , drop = FALSE]

  # Nothing to expand: return early instead of looping over 1:0
  if (nrow(multi) == 0) {
    return(single)
  }

  # strsplit() returns one character vector per row; the number of countries
  # in a row is the length of that vector, not the length of the list
  countries <- strsplit(as.character(multi$Country), " ", fixed = TRUE)
  copies <- lengths(countries)

  # Repeat every multi-country row once per country, then overwrite Country
  expanded <- multi[rep(seq_len(nrow(multi)), times = copies), , drop = FALSE]
  expanded$Country <- unlist(countries, use.names = FALSE)

  rbind(single, expanded)
}
#' Parse every excel file in a folder
#'
#' Lists the Excel workbooks (`.xls`, `.xlsx`, `.xlsm`) directly inside
#' `folder`, parses each one with `ParseHMExcel()`, cleans it with
#' `ProcessHMRaw()` and stacks the results in file-listing order.
#'
#' @param folder path to the folder to iterate; a trailing slash is optional
#' @param cols columns to parse in each file
#' @param dateOffset origin used to compute the dates of each file
#'
#' @return data.frame with the processed rows of every workbook in the folder
#' @export
#'
#' @examples
#' data.pro <- ParseHMFolder("./data/hackmageddon/Format1/", c(2, 9, 7, 6), "1899-12-30")
ParseHMFolder <- function(folder, cols, dateOffset){
  # `pattern` is a regular expression, not a glob: anchor on the extension.
  # full.names = TRUE avoids depending on a trailing slash in `folder`.
  filelist <- list.files(folder, pattern = "\\.xls[xm]?$", ignore.case = TRUE,
                         full.names = TRUE)
  if (length(filelist) == 0) {
    stop(paste("Error, no Excel files found in folder [", folder, "]"))
  }

  # lapply() handles a single-file folder correctly, unlike 2:length(filelist)
  parsed <- lapply(filelist, function(path) {
    ProcessHMRaw(ParseHMExcel(path, cols), dateOffset)
  })
  do.call(rbind, parsed)
}
#' Load the default Hackmageddon data shipped with the package
#'
#' Parses the five folders under `./data/hackmageddon/` (paths are relative
#' to the current working directory), each with its own column positions and
#' date origin, and stacks them into a single data.frame. The original note
#' describes this data as covering 2012-2016.
#'
#' @return data.frame with the processed rows of all folders, in the order
#'   Format1, Format2, Format3, Format3.2, Format4
#' @export
#'
#' @examples
#' Attacks <- GetDefaultAttacksData()
GetDefaultAttacksData <- function() {
  # One entry per workbook layout: folder, column indexes and date origin
  layouts <- list(
    list(folder = "./data/hackmageddon/Format1/", cols = c(2, 9, 7, 6), origin = "1899-12-30"),
    list(folder = "./data/hackmageddon/Format2/", cols = c(2, 7, 5, 3), origin = "1904-01-01"),
    list(folder = "./data/hackmageddon/Format3/", cols = c(2, 9, 7, 6), origin = "1904-01-01"),
    list(folder = "./data/hackmageddon/Format3/Format3.2/", cols = c(2, 9, 7, 6), origin = "1899-12-30"),
    list(folder = "./data/hackmageddon/Format4/", cols = c(1, 9, 5, 3), origin = "1899-12-30")
  )

  # Parse each folder with its own settings
  frames <- lapply(layouts, function(layout) {
    ParseHMFolder(layout$folder, layout$cols, layout$origin)
  })

  # Stack every data.frame in the standard format
  do.call(rbind, frames)
}