Hackmageddon_Parser.R
6.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#------------------------------------------------------------------------------------------------------
#-----------------------------Parser Excel - ISO survey------------------------------------------------
#------------------------------------------------------------------------------------------------------
#' Parse an excel raw data file from armaggedon
#'
#' @param file path to the excel file
#' @param cols list of columns index to read
#'
#' @return data.frame
#' @export
#'
#' @examples
#' data.raw <- ParseHMExcel("./data/hackmaggedon/file.xls", c(2, 3, 6, 5))
ParseHMExcel <- function(file, cols){
#Checks
if (!file.exists(file)) {
stop(paste("Error, file [", file, "] not found"))
}
#Parse data from excell, same formats but different columns because of the files
dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character"))
#If date is null probably theres a blanc row that should be ignored
if (is.null(dataset$Date)) {
dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character"), startRow = 2)
}
dataset
}
#' Prepare raw data from hackmaggedon's excel to use it
#'
#' @param dataset.raw data.frame with raw data
#' @param dateOffset origin to add the numeric date
#'
#' @return data.frame
#' @export
#'
#' @examples
#' data.pro <- ProcessHMRaw(data.raw, "1899-12-30")
ProcessHMRaw <- function(dataset.raw, dateOffset){
#Standar names to the columns
dataset <- setNames(dataset.raw, c("Date", "Attack", "Target", "Country"))
#Data frame changes to standarize values and make easier the joins
dataset <- dataset[!is.na(dataset$Date),]
dataset <- dataset[!is.na(dataset$Country),]
dataset$Country <- toupper(dataset$Country)
dataset <- dataset[!dataset$Country == "INT",]
dataset <- dataset[!grepl(">",dataset$Country),]
dataset <- dataset[dataset$Country != "N/A",]
dataset$Country <- gsub("\n"," ",dataset$Country)
dataset <- FilterMultiCountry(dataset)
dataset <- dataset[dataset$Country != "",]
dataset <- dataset[dataset$Country != "H",]
dataset <- dataset[dataset$Country != "W",]
dataset <- dataset[dataset$Country != "14",]
dataset <- dataset[dataset$Country != "EU",]
dataset <- dataset[dataset$Country != "UN",]
dataset <- dataset[dataset$Country != "TI",]
dataset <- dataset[dataset$Country != ".TI",]
dataset$Country <- gsub("G8","GI",dataset$Country)
dataset$Country <- gsub("UK","GB",dataset$Country)
dataset$Country <- gsub("EN","GB",dataset$Country)
dataset$Country <- gsub("UAE","AE",dataset$Country)
dataset$Country <- gsub("CB","KH",dataset$Country)
CountryNames <- data.frame(countrycode::countrycode_data$country.name, countrycode::countrycode_data$iso2c, countrycode::countrycode_data$continent)
CountryNames <- setNames(CountryNames, c("Country_large","Country", "Continent"))
dataset <- merge(x = dataset, y = CountryNames, by = "Country", all.x = TRUE)
dataset <- dataset[!is.na(dataset$Continent),]
#Format properly the date
dataset$Date <- as.POSIXct(dataset$Date*86400, tz = "GMT", origin = dateOffset)
#Standar Attack type
Attack.config <- read.csv("./data/hackmageddon/AttackTypeConfig.csv", header = FALSE, sep = ";")
Attack.config <- setNames(Attack.config, c("Attack", "Attack.standar"))
dataset <- merge(x = dataset, y = Attack.config, by = "Attack", all.x = TRUE)
dataset <- subset(dataset, select = -c(Attack))
dataset
}
#' Look for rows with more than one country target and split into multiple
#'
#' @param dataset.pre data.frame to process
#'
#' @return data.frame
FilterMultiCountry <- function(dataset.pre) {
#data.frame with multiple taget country rows
multi <- dataset.pre[grepl(" ",dataset.pre$Country),]
if (nrow(multi) == 0) { #Ignore if there are not multiple target rows
dataset.pre
} else {
#data.frame with every rows except multi ones
dataset <- dataset.pre[!grepl(" ",dataset.pre$Country),]
#Iterate over multi
for (i in 1:nrow(multi)) {
crow <- multi[i,] #current row
country_s <- strsplit(toString(crow$Country), " ")[[1]] #each country target
#Iterate over each country target
for (j in 1:length(country_s)) {
Date <- crow[1]
Attack <- crow[2]
Target <- crow[3]
Country <- country_s[j]
new.row <- data.frame(Date, Attack, Target, Country)
dataset <- rbind(dataset, new.row) #Append new row to output data.frame
}
}
dataset
}
}
#' Parse every excel file into a folder
#'
#' @param folder path to the folder to iterate
#' @param cols columns to parse into each file
#' @param dateOffset origin to calc the dates into each file
#'
#' @return data.frame
#' @export
#'
#' @examples
#' data.pro <- ProcessHMRaw("./data/hackmaggedon/", c(1, 5, 3) "1899-12-30")
ParseHMFolder <- function(folder, cols, dateOffset){
#List excel files into the folder
filelist <- list.files(folder, pattern = "*.xls*")
#Iterate for each file appending the returned data.frame
dataset <- ProcessHMRaw(ParseHMExcel(paste(folder,filelist[1],sep = ""), cols), dateOffset)
for (i in 2:length(filelist)) {
dataset <- rbind(dataset, ProcessHMRaw(ParseHMExcel(paste(folder,filelist[i],sep = ""), cols), dateOffset))
}
dataset
}
#' Parse the default data from the package from hackmaggedon (2012-2016)
#'
#' @return data.frame
#' @export
#'
#' @examples
#' Attacks <- GetDefaultAttacksData()
GetDefaultAttacksData <- function() {
#Parsing each different folder with the correct properties
format1 <- ParseHMFolder("./data/hackmageddon/Format1/", c(2, 9, 7, 6), "1899-12-30")
format2 <- ParseHMFolder("./data/hackmageddon/Format2/", c(2, 7, 5, 3), "1904-01-01")
format3 <- ParseHMFolder("./data/hackmageddon/Format3/", c(2, 9, 7, 6), "1904-01-01")
format3.2 <- ParseHMFolder("./data/hackmageddon/Format3/Format3.2/", c(2, 9, 7, 6), "1899-12-30")
format4 <- ParseHMFolder("./data/hackmageddon/Format4/", c(1, 9, 5, 3), "1899-12-30")
#Appending evey data.frame in the standard format
dataset <- rbind(format1, format2, format3, format3.2, format4)
dataset
}