Blame view

ISO27001effectiveness/R/Hackmageddon_Parser.R 6.08 KB
Imanol-Mikel Barba Sabariego authored
1
2
3
4
5
6
#------------------------------------------------------------------------------------------------------
#-----------------------------Parser Excel - Hackmageddon----------------------------------------------
#------------------------------------------------------------------------------------------------------
Miguel Tuñón authored
7
8
9
10
11
12
13
14
15
16
#' Read one Hackmageddon excel export into a data.frame
#'
#' Reads the first sheet of \code{file}, keeping only the columns listed in
#' \code{cols}. The first selected column is read as numeric and the remaining
#' three as character. If the result has no \code{Date} column, the sheet is
#' read a second time starting at row 2, on the assumption that a blank row
#' sits above the real header.
#'
#' @param file path to the excel file
#' @param cols indexes of the four columns to read
#'
#' @return data.frame with the selected columns
#' @export
#'
#' @examples
#' data.raw <- ParseHMExcel("./data/hackmageddon/file.xls", c(2, 3, 6, 5))
ParseHMExcel <- function(file, cols){

  # Fail early when the path does not point to an existing file
  if (!file.exists(file)) {
    stop("Error, file [ ", file, " ] not found")
  }

  # Every export shares the same column types; only the positions differ
  col.types <- c("numeric", "character", "character", "character")

  # Single place that knows how to read the sheet; extra arguments
  # (e.g. startRow) are forwarded untouched to read.xlsx2
  read.sheet <- function(...) {
    xlsx::read.xlsx2(file, 1, header = TRUE, colIndex = cols,
                     colClasses = col.types, ...)
  }

  parsed <- read.sheet()

  # No Date column: the header was probably pushed down by a blank first row
  if (is.null(parsed$Date)) {
    parsed <- read.sheet(startRow = 2)
  }

  parsed
}
Miguel Tuñón authored
35
36
37
38
39
40
41
42
43
44
45


#' Clean and standardise raw Hackmageddon data
#'
#' Renames the four raw columns to Date, Attack, Target and Country, drops rows
#' with unusable country values, splits multi-country rows, recodes a few
#' non-standard country codes, attaches the country name and continent from
#' \code{countrycode::countrycode_data}, converts the numeric day count into a
#' POSIXct date and replaces the free-text attack type by the standard one
#' listed in \code{./data/hackmageddon/AttackTypeConfig.csv}.
#'
#' @param dataset.raw data.frame with raw data (four columns, in the order
#'   date, attack, target, country)
#' @param dateOffset origin to which the numeric date (in days) is added
#'
#' @return data.frame
#' @export
#'
#' @examples
#' data.pro <- ProcessHMRaw(data.raw, "1899-12-30")
ProcessHMRaw <- function(dataset.raw, dateOffset){

  # Standard names for the columns
  events <- setNames(dataset.raw, c("Date", "Attack", "Target", "Country"))

  # Rows without a date or a country cannot be used
  events <- events[!is.na(events$Date),]
  events <- events[!is.na(events$Country),]

  # Normalise the country codes and discard the ones that do not identify
  # a single country: "INT", anything containing ">" and "N/A"
  events$Country <- toupper(events$Country)
  unusable <- events$Country %in% c("INT", "N/A") | grepl(">", events$Country)
  events <- events[!unusable,]

  # Line breaks inside a cell separate several countries; turn them into
  # spaces so the row can be split into one row per country
  events$Country <- gsub("\n", " ", events$Country)
  events <- FilterMultiCountry(events)

  # Leftover tokens that are not country codes
  not.a.country <- c("", "H", "W", "14", "EU", "UN", "TI", ".TI")
  events <- events[!events$Country %in% not.a.country,]

  # Non-standard codes and their replacement, applied in this order
  recodes <- list(
    c("G8", "GI"),
    c("UK", "GB"),
    c("EN", "GB"),
    c("UAE", "AE"),
    c("CB", "KH")
  )
  for (recode in recodes) {
    events$Country <- gsub(recode[1], recode[2], events$Country)
  }

  # Attach country name and continent; rows whose code has no continent
  # (including codes that are not found at all) are dropped
  reference <- countrycode::countrycode_data
  CountryNames <- data.frame(reference$country.name, reference$iso2c, reference$continent)
  CountryNames <- setNames(CountryNames, c("Country_large", "Country", "Continent"))
  events <- merge(x = events, y = CountryNames, by = "Country", all.x = TRUE)
  events <- events[!is.na(events$Continent),]

  # The date is a number of days since dateOffset; 86400 seconds per day
  events$Date <- as.POSIXct(events$Date * 86400, tz = "GMT", origin = dateOffset)

  # Replace the raw attack type by its standard equivalent
  Attack.config <- read.csv("./data/hackmageddon/AttackTypeConfig.csv", header = FALSE, sep = ";")
  Attack.config <- setNames(Attack.config, c("Attack", "Attack.standar"))

  events <- merge(x = events, y = Attack.config, by = "Attack", all.x = TRUE)
  events$Attack <- NULL

  events
}
Miguel Tuñón authored
96
97
98
99
100
#' Split rows that target several countries into one row per country
#'
#' A row is considered multi-country when its \code{Country} value contains a
#' space. Each such row is replaced by one copy per space-separated token, with
#' \code{Country} set to that token and every other column repeated. Rows with
#' a single country come first in the result, followed by the expanded rows in
#' their original order.
#'
#' @param dataset.pre data.frame to process; must have a \code{Country} column
#'
#' @return data.frame with one country per row. When no row holds several
#'   countries \code{dataset.pre} is returned unchanged.
FilterMultiCountry <- function(dataset.pre) {

  # Rows whose country cell lists more than one target
  is.multi <- grepl(" ", dataset.pre$Country)

  # Nothing to split
  if (!any(is.multi)) {
    return(dataset.pre)
  }

  single <- dataset.pre[!is.multi, , drop = FALSE]
  multi <- dataset.pre[is.multi, , drop = FALSE]

  # One character vector of country tokens per multi-country row
  targets <- strsplit(as.character(multi$Country), " ")

  # Repeat each row once per token, then overwrite Country with the tokens.
  # This builds all new rows in one step instead of growing the data.frame
  # with rbind() inside a loop.
  expanded <- multi[rep(seq_len(nrow(multi)), lengths(targets)), , drop = FALSE]
  expanded$Country <- unlist(targets, use.names = FALSE)

  rbind(single, expanded)
}
Miguel Tuñón authored
134
135

#' Parse every excel file in a folder
#'
#' Lists the \code{.xls} / \code{.xlsx} files directly inside \code{folder},
#' runs each one through \code{ParseHMExcel} and \code{ProcessHMRaw} and
#' stacks the results into a single data.frame.
#'
#' @param folder path to the folder to iterate (with or without a trailing
#'   path separator)
#' @param cols columns to parse in each file
#' @param dateOffset origin used to compute the dates in each file
#'
#' @return data.frame with the processed rows of all files
#' @export
#'
#' @examples
#' data.pro <- ParseHMFolder("./data/hackmageddon/Format1/", c(2, 9, 7, 6), "1899-12-30")
ParseHMFolder <- function(folder, cols, dateOffset){

  # List excel files in the folder. `pattern` is a regular expression, not a
  # glob, so the extension is anchored explicitly.
  filelist <- list.files(folder, pattern = "\\.xlsx?$", ignore.case = TRUE)

  # Skip the "~$name.xlsx" lock files Excel leaves next to an open workbook
  filelist <- filelist[!grepl("^~\\$", filelist)]

  if (length(filelist) == 0) {
    stop(paste("Error, no excel files found in [", folder, "]"))
  }

  # Build full paths; trailing separators are stripped first so that both
  # "dir" and "dir/" give "dir/<file>"
  paths <- file.path(sub("[/\\\\]+$", "", folder), filelist)

  # Process each file, then bind once. This also works for a single file,
  # where a `2:length(filelist)` loop would count down to a missing entry.
  parsed <- lapply(paths, function(path) {
    ProcessHMRaw(ParseHMExcel(path, cols), dateOffset)
  })

  do.call(rbind, parsed)
}
Miguel Tuñón authored
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#' Load the Hackmageddon data shipped with the package (2012-2016)
#'
#' Each sub-folder of \code{./data/hackmageddon/} holds files with a different
#' column layout and date origin. Every folder is parsed with its own settings
#' through \code{ParseHMFolder} and the results are stacked into one
#' data.frame.
#'
#' @return data.frame
#' @export
#'
#' @examples
#' Attacks <- GetDefaultAttacksData()
GetDefaultAttacksData <- function() {

  # One entry per folder: path, column indexes to read, date origin
  layouts <- list(
    list("./data/hackmageddon/Format1/", c(2, 9, 7, 6), "1899-12-30"),
    list("./data/hackmageddon/Format2/", c(2, 7, 5, 3), "1904-01-01"),
    list("./data/hackmageddon/Format3/", c(2, 9, 7, 6), "1904-01-01"),
    list("./data/hackmageddon/Format3/Format3.2/", c(2, 9, 7, 6), "1899-12-30"),
    list("./data/hackmageddon/Format4/", c(1, 9, 5, 3), "1899-12-30")
  )

  # Parse the folders in the order listed above
  per.format <- lapply(layouts, function(layout) {
    ParseHMFolder(layout[[1]], layout[[2]], layout[[3]])
  })

  # Stack every data.frame, already in the standard format
  do.call(rbind, per.format)
}
Imanol-Mikel Barba Sabariego authored
181