#------------------------------------------------------------------------------------------------------ #-----------------------------Parser Excel - ISO survey------------------------------------------------ #------------------------------------------------------------------------------------------------------ #' Parse an excel raw data file from armaggedon #' #' @param file path to the excel file #' @param cols list of columns index to read #' #' @return data.frame #' @export #' #' @examples #' data.raw <- ParseHMExcel("./data/hackmaggedon/file.xls", c(2, 3, 6, 5)) ParseHMExcel <- function(file, cols){ #Checks if (!file.exists(file)) { stop(paste("Error, file [", file, "] not found")) } #Parse data from excell, same formats but different columns because of the files dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character")) #If date is null probably theres a blanc row that should be ignored if (is.null(dataset$Date)) { dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character"), startRow = 2) } dataset } #' Prepare raw data from hackmaggedon's excel to use it #' #' @param dataset.raw data.frame with raw data #' @param dateOffset origin to add the numeric date #' #' @return data.frame #' @export #' #' @examples #' data.pro <- ProcessHMRaw(data.raw, "1899-12-30") ProcessHMRaw <- function(dataset.raw, dateOffset){ #Standar names to the columns dataset <- setNames(dataset.raw, c("Date", "Attack", "Target", "Country")) #Data frame changes to standarize values and make easier the joins dataset <- dataset[!is.na(dataset$Date),] dataset <- dataset[!is.na(dataset$Country),] dataset$Country <- toupper(dataset$Country) dataset <- dataset[!dataset$Country == "INT",] dataset <- dataset[!grepl(">",dataset$Country),] dataset <- dataset[dataset$Country != "N/A",] dataset$Country <- gsub("\n"," ",dataset$Country) dataset <- FilterMultiCountry(dataset) dataset <- dataset[dataset$Country != "",] dataset <- dataset[dataset$Country != "H",] dataset <- dataset[dataset$Country != "W",] dataset <- dataset[dataset$Country != "14",] dataset <- dataset[dataset$Country != "EU",] dataset <- dataset[dataset$Country != "UN",] dataset <- dataset[dataset$Country != "TI",] dataset <- dataset[dataset$Country != ".TI",] dataset$Country <- gsub("G8","GI",dataset$Country) dataset$Country <- gsub("UK","GB",dataset$Country) dataset$Country <- gsub("EN","GB",dataset$Country) dataset$Country <- gsub("UAE","AE",dataset$Country) dataset$Country <- gsub("CB","KH",dataset$Country) CountryNames <- data.frame(countrycode::countrycode_data$country.name, countrycode::countrycode_data$iso2c, countrycode::countrycode_data$continent) CountryNames <- setNames(CountryNames, c("Country_large","Country", "Continent")) dataset <- merge(x = dataset, y = CountryNames, by = "Country", all.x = TRUE) dataset <- dataset[!is.na(dataset$Continent),] #Format properly the date dataset$Date <- as.POSIXct(dataset$Date*86400, tz = "GMT", origin = dateOffset) #Standar Attack type Attack.config <- read.csv("./data/hackmageddon/AttackTypeConfig.csv", header = FALSE, sep = ";") Attack.config <- setNames(Attack.config, c("Attack", "Attack.standar")) dataset <- merge(x = dataset, y = Attack.config, by = "Attack", all.x = TRUE) dataset <- subset(dataset, select = -c(Attack)) dataset } #' Look for rows with more than one country target and split into multiple #' #' @param dataset.pre data.frame to process #' #' @return data.frame FilterMultiCountry <- function(dataset.pre) { #data.frame with multiple taget country rows multi <- dataset.pre[grepl(" ",dataset.pre$Country),] if (nrow(multi) == 0) { #Ignore if there are not multiple target rows dataset.pre } else { #data.frame with every rows except multi ones dataset <- dataset.pre[!grepl(" ",dataset.pre$Country),] #Iterate over multi for (i in 1:nrow(multi)) { crow <- multi[i,] #current row country_s <- strsplit(toString(crow$Country), " ")[[1]] #each country target #Iterate over each country target for (j in 1:length(country_s)) { Date <- crow[1] Attack <- crow[2] Target <- crow[3] Country <- country_s[j] new.row <- data.frame(Date, Attack, Target, Country) dataset <- rbind(dataset, new.row) #Append new row to output data.frame } } dataset } } #' Parse every excel file into a folder #' #' @param folder path to the folder to iterate #' @param cols columns to parse into each file #' @param dateOffset origin to calc the dates into each file #' #' @return data.frame #' @export #' #' @examples #' data.pro <- ProcessHMRaw("./data/hackmaggedon/", c(1, 5, 3) "1899-12-30") ParseHMFolder <- function(folder, cols, dateOffset){ #List excel files into the folder filelist <- list.files(folder, pattern = "*.xls*") #Iterate for each file appending the returned data.frame dataset <- ProcessHMRaw(ParseHMExcel(paste(folder,filelist[1],sep = ""), cols), dateOffset) for (i in 2:length(filelist)) { dataset <- rbind(dataset, ProcessHMRaw(ParseHMExcel(paste(folder,filelist[i],sep = ""), cols), dateOffset)) } dataset } #' Parse the default data from the package from hackmaggedon (2012-2016) #' #' @return data.frame #' @export #' #' @examples #' Attacks <- GetDefaultAttacksData() GetDefaultAttacksData <- function() { #Parsing each different folder with the correct properties format1 <- ParseHMFolder("./data/hackmageddon/Format1/", c(2, 9, 7, 6), "1899-12-30") format2 <- ParseHMFolder("./data/hackmageddon/Format2/", c(2, 7, 5, 3), "1904-01-01") format3 <- ParseHMFolder("./data/hackmageddon/Format3/", c(2, 9, 7, 6), "1904-01-01") format3.2 <- ParseHMFolder("./data/hackmageddon/Format3/Format3.2/", c(2, 9, 7, 6), "1899-12-30") format4 <- ParseHMFolder("./data/hackmageddon/Format4/", c(1, 9, 5, 3), "1899-12-30") #Appending evey data.frame in the standard format dataset <- rbind(format1, format2, format3, format3.2, format4) dataset }