|
1
2
3
4
5
6
|
#------------------------------------------------------------------------------------------------------
#-----------------------------Parser Excel - ISO survey------------------------------------------------
#------------------------------------------------------------------------------------------------------
|
|
7
8
9
10
11
12
13
14
15
16
|
#' Parse an excel raw data file from armaggedon
#'
#' @param file path to the excel file
#' @param cols list of columns index to read
#'
#' @return data.frame
#' @export
#'
#' @examples
#' data.raw <- ParseHMExcel("./data/hackmaggedon/file.xls", c(2, 3, 6, 5))
|
|
17
|
ParseHMExcel <- function(file, cols){
|
|
18
19
|
#Checks
|
|
20
21
22
|
if (!file.exists(file)) {
stop(paste("Error, file [", file, "] not found"))
}
|
|
23
24
|
#Parse data from excell, same formats but different columns because of the files
|
|
25
26
|
dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character"))
|
|
27
|
#If date is null probably theres a blanc row that should be ignored
|
|
28
29
30
31
|
if (is.null(dataset$Date)) {
dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character"), startRow = 2)
}
|
|
32
33
34
|
dataset
}
|
|
35
36
37
38
39
40
41
42
43
44
45
|
#' Prepare raw data from hackmaggedon's excel to use it
#'
#' @param dataset.raw data.frame with raw data
#' @param dateOffset origin to add the numeric date
#'
#' @return data.frame
#' @export
#'
#' @examples
|
|
46
47
|
#' data.pro <- ProcessHMRaw(data.raw, "1899-12-30")
ProcessHMRaw <- function(dataset.raw, dateOffset){
|
|
48
49
50
|
#Standar names to the columns
dataset <- setNames(dataset.raw, c("Date", "Attack", "Target", "Country"))
|
|
51
|
|
|
52
|
#Data frame changes to standarize values and make easier the joins
|
|
53
|
dataset <- dataset[!is.na(dataset$Date),]
|
|
54
|
dataset <- dataset[!is.na(dataset$Country),]
|
|
55
56
|
dataset$Country <- toupper(dataset$Country)
|
|
57
58
|
dataset <- dataset[!dataset$Country == "INT",]
dataset <- dataset[!grepl(">",dataset$Country),]
|
|
59
|
dataset <- dataset[dataset$Country != "N/A",]
|
|
60
61
|
dataset$Country <- gsub("\n"," ",dataset$Country)
dataset <- FilterMultiCountry(dataset)
|
|
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
|
dataset <- dataset[dataset$Country != "",]
dataset <- dataset[dataset$Country != "H",]
dataset <- dataset[dataset$Country != "W",]
dataset <- dataset[dataset$Country != "14",]
dataset <- dataset[dataset$Country != "EU",]
dataset <- dataset[dataset$Country != "UN",]
dataset <- dataset[dataset$Country != "TI",]
dataset <- dataset[dataset$Country != ".TI",]
dataset$Country <- gsub("G8","GI",dataset$Country)
dataset$Country <- gsub("UK","GB",dataset$Country)
dataset$Country <- gsub("EN","GB",dataset$Country)
dataset$Country <- gsub("UAE","AE",dataset$Country)
dataset$Country <- gsub("CB","KH",dataset$Country)
|
|
77
78
79
|
#Format properly the date
dataset$Date <- as.POSIXct(dataset$Date*86400, tz = "GMT", origin = dateOffset)
|
|
80
|
|
|
81
82
83
|
dataset
}
|
|
84
85
86
87
88
|
#' Look for rows with more than one country target and split into multiple
#'
#' @param dataset.pre data.frame to process
#'
#' @return data.frame
|
|
89
|
FilterMultiCountry <- function(dataset.pre) {
|
|
90
91
|
#data.frame with multiple taget country rows
|
|
92
93
|
multi <- dataset.pre[grepl(" ",dataset.pre$Country),]
|
|
94
|
if (nrow(multi) == 0) { #Ignore if there are not multiple target rows
|
|
95
|
|
|
96
97
|
dataset.pre
} else {
|
|
98
|
|
|
99
100
|
#data.frame with every rows except multi ones
dataset <- dataset.pre[!grepl(" ",dataset.pre$Country),]
|
|
101
|
|
|
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
|
#Iterate over multi
for (i in 1:nrow(multi)) {
crow <- multi[i,] #current row
country_s <- strsplit(toString(crow$Country), " ")[[1]] #each country target
#Iterate over each country target
for (j in 1:length(country_s)) {
Date <- crow[1]
Attack <- crow[2]
Target <- crow[3]
Country <- country_s[j]
new.row <- data.frame(Date, Attack, Target, Country)
dataset <- rbind(dataset, new.row) #Append new row to output data.frame
}
|
|
117
118
|
}
|
|
119
120
|
dataset
}
|
|
121
|
}
|
|
122
123
|
#' Parse every excel file into a folder
|
|
124
|
#'
|
|
125
126
127
|
#' @param folder path to the folder to iterate
#' @param cols columns to parse into each file
#' @param dateOffset origin to calc the dates into each file
|
|
128
|
#'
|
|
129
|
#' @return data.frame
|
|
130
131
132
|
#' @export
#'
#' @examples
|
|
133
|
#' data.pro <- ProcessHMRaw("./data/hackmaggedon/", c(1, 5, 3) "1899-12-30")
|
|
134
|
ParseHMFolder <- function(folder, cols, dateOffset){
|
|
135
|
|
|
136
137
|
#List excel files into the folder
filelist <- list.files(folder, pattern = "*.xls*")
|
|
138
|
|
|
139
|
#Iterate for each file appending the returned data.frame
|
|
140
|
dataset <- ProcessHMRaw(ParseHMExcel(paste(folder,filelist[1],sep = ""), cols), dateOffset)
|
|
141
|
for (i in 2:length(filelist)) {
|
|
142
|
dataset <- rbind(dataset, ProcessHMRaw(ParseHMExcel(paste(folder,filelist[i],sep = ""), cols), dateOffset))
|
|
143
144
145
|
}
dataset
|
|
146
147
|
}
|
|
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
|
#' Parse the default data from the package from hackmaggedon (2012-2016)
#'
#' @return data.frame
#' @export
#'
#' @examples
#' Attacks <- GetDefaultAttacksData()
GetDefaultAttacksData <- function() {
#Parsing each different folder with the correct properties
format1 <- ParseHMFolder("./data/hackmageddon/Format1/", c(2, 9, 7, 6), "1899-12-30")
format2 <- ParseHMFolder("./data/hackmageddon/Format2/", c(2, 7, 5, 3), "1904-01-01")
format3 <- ParseHMFolder("./data/hackmageddon/Format3/", c(2, 9, 7, 6), "1904-01-01")
format3.2 <- ParseHMFolder("./data/hackmageddon/Format3/Format3.2/", c(2, 9, 7, 6), "1899-12-30")
format4 <- ParseHMFolder("./data/hackmageddon/Format4/", c(1, 9, 5, 3), "1899-12-30")
#Appending evey data.frame in the standard format
dataset <- rbind(format1, format2, format3, format3.2, format4)
|
|
166
167
168
|
dataset
}
|
|
169
|
|