Blame view

ISO27001effectiveness/R/Hackmageddon_Parser.R 7.19 KB
Imanol-Mikel Barba Sabariego authored
1
2
3
4
5
6
#------------------------------------------------------------------------------------------------------
#-----------------------------Parser Excel - ISO survey------------------------------------------------
#------------------------------------------------------------------------------------------------------
Miguel Tuñón authored
7
8
9
10
11
12
13
14
15
16
#' Parse an excel raw data file from armaggedon
#'
#' @param file path to the excel file
#' @param cols list of columns index to read
#'
#' @return data.frame
#' @export
#'
#' @examples
#' data.raw <- ParseHMExcel("./data/hackmaggedon/file.xls", c(2, 3, 6, 5))
Miguel Tuñón authored
17
ParseHMExcel <- function(file, cols){
Miguel Tuñón authored
18
19

  #Checks
Imanol-Mikel Barba Sabariego authored
20
21
22
  if (!file.exists(file)) {
    stop(paste("Error, file [", file, "] not found"))
  }
Miguel Tuñón authored
23
24

  #Parse data from excell, same formats but different columns because of the files
Miguel Tuñón authored
25
26
  dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character"))
Miguel Tuñón authored
27
  #If date is null probably theres a blanc row that should be ignored
Miguel Tuñón authored
28
29
30
31
  if (is.null(dataset$Date)) {
    dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character"), startRow = 2)
  }
Miguel Tuñón authored
32
33
34
  dataset
}
Miguel Tuñón authored
35
36
37
38
39
40
41
42
43
44
45


#' Prepare raw data from hackmaggedon's excel to use it
#'
#' @param dataset.raw data.frame with raw data
#' @param dateOffset origin to add the numeric date
#'
#' @return data.frame
#' @export
#'
#' @examples
46
47
#' data.pro <- ProcessHMRaw(data.raw, "1899-12-30")
ProcessHMRaw <- function(dataset.raw, dateOffset){
Miguel Tuñón authored
48
49
50

  #Standar names to the columns
  dataset <- setNames(dataset.raw, c("Date", "Attack", "Target", "Country"))
Miguel Tuñón authored
51
Miguel Tuñón authored
52
  #Data frame changes to standarize values and make easier the joins
Miguel Tuñón authored
53
  dataset <- dataset[!is.na(dataset$Date),]
54
  dataset <- dataset[!is.na(dataset$Country),]
Miguel Tuñón authored
55
56
  dataset$Country <- toupper(dataset$Country)
57
58
  dataset <- dataset[!dataset$Country == "INT",]
  dataset <- dataset[!grepl(">",dataset$Country),]
Miguel Tuñón authored
59
  dataset <- dataset[dataset$Country != "N/A",]
60
61
  dataset$Country <- gsub("\n"," ",dataset$Country)
  dataset <- FilterMultiCountry(dataset)
Miguel Tuñón authored
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
  dataset <- dataset[dataset$Country != "",]

  dataset <- dataset[dataset$Country != "H",]
  dataset <- dataset[dataset$Country != "W",]
  dataset <- dataset[dataset$Country != "14",]
  dataset <- dataset[dataset$Country != "EU",]
  dataset <- dataset[dataset$Country != "UN",]
  dataset <- dataset[dataset$Country != "TI",]
  dataset <- dataset[dataset$Country != ".TI",]
  dataset$Country <- gsub("G8","GI",dataset$Country)
  dataset$Country <- gsub("UK","GB",dataset$Country)
  dataset$Country <- gsub("EN","GB",dataset$Country)
  dataset$Country <- gsub("UAE","AE",dataset$Country)
  dataset$Country <- gsub("CB","KH",dataset$Country)
Miguel Tuñón authored
77
78
79
80
  CountryNames <- data.frame(countrycode::countrycode_data$country.name, countrycode::countrycode_data$iso2c, countrycode::countrycode_data$continent)
  CountryNames <- setNames(CountryNames, c("Country_large","Country", "Continent"))
  dataset <- merge(x = dataset, y = CountryNames, by = "Country", all.x = TRUE)
  dataset <- dataset[!is.na(dataset$Continent),]
Miguel Tuñón authored
81
82
83

  #Format properly the date
  dataset$Date <- as.POSIXct(dataset$Date*86400, tz = "GMT", origin = dateOffset)
Miguel Tuñón authored
84
Miguel Tuñón authored
85
86
87
88
89
90
91
92
  #Standar Attack type

  Attack.config <- read.csv("./data/hackmageddon/AttackTypeConfig.csv", header = FALSE, sep = ";")
  Attack.config <- setNames(Attack.config, c("Attack", "Attack.standar"))

  dataset <- merge(x = dataset, y = Attack.config, by = "Attack", all.x = TRUE)
  dataset <- subset(dataset, select = -c(Attack))
Miguel Tuñón authored
93
94
  dataset <- FilterMultiAttack(dataset)
Imanol-Mikel Barba Sabariego authored
95
96
97
  dataset
}
Miguel Tuñón authored
98
99
100
101
102
#' Look for rows with more than one country target and split into multiple
#'
#' @param dataset.pre data.frame to process
#'
#' @return data.frame
master4_cs22 authored
103
FilterMultiCountry <- function(dataset.pre) {
Miguel Tuñón authored
104
105

  #data.frame with multiple taget country rows
master4_cs22 authored
106
107
  multi <- dataset.pre[grepl(" ",dataset.pre$Country),]
Miguel Tuñón authored
108
  if (nrow(multi) == 0) { #Ignore if there are not multiple target rows
master4_cs22 authored
109
Miguel Tuñón authored
110
111
    dataset.pre
  } else {
master4_cs22 authored
112
Miguel Tuñón authored
113
114
    #data.frame with every rows except multi ones
    dataset <- dataset.pre[!grepl(" ",dataset.pre$Country),]
master4_cs22 authored
115
Miguel Tuñón authored
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
    #Iterate over multi
    for (i in 1:nrow(multi)) {
      crow <- multi[i,] #current row

      country_s <- strsplit(toString(crow$Country), " ")[[1]] #each country target

      #Iterate over each country target
      for (j in 1:length(country_s)) {
        Date <- crow[1]
        Attack <- crow[2]
        Target <- crow[3]
        Country <- country_s[j]
        new.row <- data.frame(Date, Attack, Target, Country)
        dataset <- rbind(dataset, new.row) #Append new row to output data.frame
      }
master4_cs22 authored
131
132
    }
Miguel Tuñón authored
133
134
    dataset
    }
135
}
Miguel Tuñón authored
136
Miguel Tuñón authored
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#' Look for rows with more than one attack and split into multiple
#'
#' @param dataset.pre data.frame to process
#'
#' @return data.frame
FilterMultiAttack <- function(dataset.pre) {

  #data.frame with multiple taget country rows
  multi <- dataset.pre[grepl("/",dataset.pre$Attack.standar),]

  if (nrow(multi) == 0) { #Ignore if there are not multiple target rows

    dataset.pre
  } else {

    #data.frame with every rows except multi ones
    dataset <- dataset.pre[!grepl("/",dataset.pre$Attack.standar),]

    #Iterate over multi
    for (i in 1:nrow(multi)) {
      crow <- multi[i,] #current row

      country_s <- strsplit(toString(crow$Attack.standar), "/")[[1]] #each country target

      #Iterate over each country target
      for (j in 1:length(country_s)) {
        #Date <- crow[1]
        #Attack.standar <- country_s[j]
        #Target <- crow[3]
        #Country <- crow[4]
        new.row <- data.frame(crow)
        new.row$Attack.standar <- country_s[j]
        dataset <- rbind(dataset, new.row) #Append new row to output data.frame
      }
    }

    dataset
  }
}
Miguel Tuñón authored
177
#' Parse every excel file into a folder
Miguel Tuñón authored
178
#'
Miguel Tuñón authored
179
180
181
#' @param folder path to the folder to iterate
#' @param cols columns to parse into each file
#' @param dateOffset origin to calc the dates into each file
Miguel Tuñón authored
182
#'
Miguel Tuñón authored
183
#' @return data.frame
Miguel Tuñón authored
184
185
186
#' @export
#'
#' @examples
187
#' data.pro <- ProcessHMRaw("./data/hackmaggedon/", c(1, 5, 3) "1899-12-30")
Miguel Tuñón authored
188
ParseHMFolder <- function(folder, cols, dateOffset){
Miguel Tuñón authored
189
Miguel Tuñón authored
190
191
  #List excel files into the folder
  filelist <- list.files(folder, pattern = "*.xls*")
Miguel Tuñón authored
192
Miguel Tuñón authored
193
  #Iterate for each file appending the returned data.frame
194
  dataset <- ProcessHMRaw(ParseHMExcel(paste(folder,filelist[1],sep = ""), cols), dateOffset)
Miguel Tuñón authored
195
  for (i in 2:length(filelist)) {
196
    dataset <- rbind(dataset, ProcessHMRaw(ParseHMExcel(paste(folder,filelist[i],sep = ""), cols), dateOffset))
Miguel Tuñón authored
197
198
199
  }

  dataset
Imanol-Mikel Barba Sabariego authored
200
201
}
Miguel Tuñón authored
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#' Parse the default data from the package from hackmaggedon (2012-2016)
#'
#' @return data.frame
#' @export
#'
#' @examples
#' Attacks <- GetDefaultAttacksData()
GetDefaultAttacksData <- function() {

  #Parsing each different folder with the correct properties
  format1 <- ParseHMFolder("./data/hackmageddon/Format1/", c(2, 9, 7, 6), "1899-12-30")
  format2 <- ParseHMFolder("./data/hackmageddon/Format2/", c(2, 7, 5, 3), "1904-01-01")
  format3 <- ParseHMFolder("./data/hackmageddon/Format3/", c(2, 9, 7, 6), "1904-01-01")
  format3.2 <- ParseHMFolder("./data/hackmageddon/Format3/Format3.2/", c(2, 9, 7, 6), "1899-12-30")
  format4 <- ParseHMFolder("./data/hackmageddon/Format4/", c(1, 9, 5, 3), "1899-12-30")

  #Appending evey data.frame in the standard format
  dataset <- rbind(format1, format2, format3, format3.2, format4)
Miguel Tuñón authored
220
221
222

  dataset
}
Imanol-Mikel Barba Sabariego authored
223