|
1
2
3
4
5
6
|
#------------------------------------------------------------------------------------------------------
#-----------------------------Parser Excel - ISO survey------------------------------------------------
#------------------------------------------------------------------------------------------------------
|
|
7
8
9
10
11
12
13
14
15
16
|
#' Parse an excel raw data file from armaggedon
#'
#' @param file path to the excel file
#' @param cols list of columns index to read
#'
#' @return data.frame
#' @export
#'
#' @examples
#' data.raw <- ParseHMExcel("./data/hackmaggedon/file.xls", c(2, 3, 6, 5))
|
|
17
|
ParseHMExcel <- function(file, cols){
|
|
18
19
|
#Checks
|
|
20
21
22
|
if (!file.exists(file)) {
stop(paste("Error, file [", file, "] not found"))
}
|
|
23
24
|
#Parse data from excell, same formats but different columns because of the files
|
|
25
26
|
dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character"))
|
|
27
|
#If date is null probably theres a blanc row that should be ignored
|
|
28
29
30
31
|
if (is.null(dataset$Date)) {
dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character"), startRow = 2)
}
|
|
32
33
34
|
dataset
}
|
|
35
36
37
38
39
40
41
42
43
44
45
|
#' Prepare raw data from hackmaggedon's excel to use it
#'
#' @param dataset.raw data.frame with raw data
#' @param dateOffset origin to add the numeric date
#'
#' @return data.frame
#' @export
#'
#' @examples
|
|
46
47
|
#' data.pro <- ProcessHMRaw(data.raw, "1899-12-30")
ProcessHMRaw <- function(dataset.raw, dateOffset){
|
|
48
49
50
|
#Standar names to the columns
dataset <- setNames(dataset.raw, c("Date", "Attack", "Target", "Country"))
|
|
51
|
|
|
52
|
#Data frame changes to standarize values and make easier the joins
|
|
53
|
dataset <- dataset[!is.na(dataset$Date),]
|
|
54
|
dataset <- dataset[!is.na(dataset$Country),]
|
|
55
56
|
dataset$Country <- toupper(dataset$Country)
|
|
57
58
|
dataset <- dataset[!dataset$Country == "INT",]
dataset <- dataset[!grepl(">",dataset$Country),]
|
|
59
|
dataset <- dataset[dataset$Country != "N/A",]
|
|
60
61
|
dataset$Country <- gsub("\n"," ",dataset$Country)
dataset <- FilterMultiCountry(dataset)
|
|
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
|
dataset <- dataset[dataset$Country != "",]
dataset <- dataset[dataset$Country != "H",]
dataset <- dataset[dataset$Country != "W",]
dataset <- dataset[dataset$Country != "14",]
dataset <- dataset[dataset$Country != "EU",]
dataset <- dataset[dataset$Country != "UN",]
dataset <- dataset[dataset$Country != "TI",]
dataset <- dataset[dataset$Country != ".TI",]
dataset$Country <- gsub("G8","GI",dataset$Country)
dataset$Country <- gsub("UK","GB",dataset$Country)
dataset$Country <- gsub("EN","GB",dataset$Country)
dataset$Country <- gsub("UAE","AE",dataset$Country)
dataset$Country <- gsub("CB","KH",dataset$Country)
|
|
77
78
79
80
|
CountryNames <- data.frame(countrycode::countrycode_data$country.name, countrycode::countrycode_data$iso2c, countrycode::countrycode_data$continent)
CountryNames <- setNames(CountryNames, c("Country_large","Country", "Continent"))
dataset <- merge(x = dataset, y = CountryNames, by = "Country", all.x = TRUE)
dataset <- dataset[!is.na(dataset$Continent),]
|
|
81
82
83
|
#Format properly the date
dataset$Date <- as.POSIXct(dataset$Date*86400, tz = "GMT", origin = dateOffset)
|
|
84
|
|
|
85
86
87
88
89
90
91
92
|
#Standar Attack type
Attack.config <- read.csv("./data/hackmageddon/AttackTypeConfig.csv", header = FALSE, sep = ";")
Attack.config <- setNames(Attack.config, c("Attack", "Attack.standar"))
dataset <- merge(x = dataset, y = Attack.config, by = "Attack", all.x = TRUE)
dataset <- subset(dataset, select = -c(Attack))
|
|
93
94
|
dataset <- FilterMultiAttack(dataset)
|
|
95
96
97
|
dataset
}
|
|
98
99
100
101
102
|
#' Look for rows with more than one country target and split into multiple
#'
#' @param dataset.pre data.frame to process
#'
#' @return data.frame
|
|
103
|
FilterMultiCountry <- function(dataset.pre) {
|
|
104
105
|
#data.frame with multiple taget country rows
|
|
106
107
|
multi <- dataset.pre[grepl(" ",dataset.pre$Country),]
|
|
108
|
if (nrow(multi) == 0) { #Ignore if there are not multiple target rows
|
|
109
|
|
|
110
111
|
dataset.pre
} else {
|
|
112
|
|
|
113
114
|
#data.frame with every rows except multi ones
dataset <- dataset.pre[!grepl(" ",dataset.pre$Country),]
|
|
115
|
|
|
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
|
#Iterate over multi
for (i in 1:nrow(multi)) {
crow <- multi[i,] #current row
country_s <- strsplit(toString(crow$Country), " ")[[1]] #each country target
#Iterate over each country target
for (j in 1:length(country_s)) {
Date <- crow[1]
Attack <- crow[2]
Target <- crow[3]
Country <- country_s[j]
new.row <- data.frame(Date, Attack, Target, Country)
dataset <- rbind(dataset, new.row) #Append new row to output data.frame
}
|
|
131
132
|
}
|
|
133
134
|
dataset
}
|
|
135
|
}
|
|
136
|
|
|
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
|
#' Look for rows with more than one attack and split into multiple
#'
#' @param dataset.pre data.frame to process
#'
#' @return data.frame
FilterMultiAttack <- function(dataset.pre) {
#data.frame with multiple taget country rows
multi <- dataset.pre[grepl("/",dataset.pre$Attack.standar),]
if (nrow(multi) == 0) { #Ignore if there are not multiple target rows
dataset.pre
} else {
#data.frame with every rows except multi ones
dataset <- dataset.pre[!grepl("/",dataset.pre$Attack.standar),]
#Iterate over multi
for (i in 1:nrow(multi)) {
crow <- multi[i,] #current row
country_s <- strsplit(toString(crow$Attack.standar), "/")[[1]] #each country target
#Iterate over each country target
for (j in 1:length(country_s)) {
#Date <- crow[1]
#Attack.standar <- country_s[j]
#Target <- crow[3]
#Country <- crow[4]
new.row <- data.frame(crow)
new.row$Attack.standar <- country_s[j]
dataset <- rbind(dataset, new.row) #Append new row to output data.frame
}
}
dataset
}
}
|
|
177
|
#' Parse every excel file into a folder
|
|
178
|
#'
|
|
179
180
181
|
#' @param folder path to the folder to iterate
#' @param cols columns to parse into each file
#' @param dateOffset origin to calc the dates into each file
|
|
182
|
#'
|
|
183
|
#' @return data.frame
|
|
184
185
186
|
#' @export
#'
#' @examples
|
|
187
|
#' data.pro <- ProcessHMRaw("./data/hackmaggedon/", c(1, 5, 3) "1899-12-30")
|
|
188
|
ParseHMFolder <- function(folder, cols, dateOffset){
|
|
189
|
|
|
190
191
|
#List excel files into the folder
filelist <- list.files(folder, pattern = "*.xls*")
|
|
192
|
|
|
193
|
#Iterate for each file appending the returned data.frame
|
|
194
|
dataset <- ProcessHMRaw(ParseHMExcel(paste(folder,filelist[1],sep = ""), cols), dateOffset)
|
|
195
|
for (i in 2:length(filelist)) {
|
|
196
|
dataset <- rbind(dataset, ProcessHMRaw(ParseHMExcel(paste(folder,filelist[i],sep = ""), cols), dateOffset))
|
|
197
198
199
|
}
dataset
|
|
200
201
|
}
|
|
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
|
#' Parse the default data from the package from hackmaggedon (2012-2016)
#'
#' @return data.frame
#' @export
#'
#' @examples
#' Attacks <- GetDefaultAttacksData()
GetDefaultAttacksData <- function() {
#Parsing each different folder with the correct properties
format1 <- ParseHMFolder("./data/hackmageddon/Format1/", c(2, 9, 7, 6), "1899-12-30")
format2 <- ParseHMFolder("./data/hackmageddon/Format2/", c(2, 7, 5, 3), "1904-01-01")
format3 <- ParseHMFolder("./data/hackmageddon/Format3/", c(2, 9, 7, 6), "1904-01-01")
format3.2 <- ParseHMFolder("./data/hackmageddon/Format3/Format3.2/", c(2, 9, 7, 6), "1899-12-30")
format4 <- ParseHMFolder("./data/hackmageddon/Format4/", c(1, 9, 5, 3), "1899-12-30")
#Appending evey data.frame in the standard format
dataset <- rbind(format1, format2, format3, format3.2, format4)
|
|
220
221
222
|
dataset
}
|
|
223
|
|