#------------------------------------------------------------------------------
#---------------------- Excel parser - Hackmageddon data ----------------------
#------------------------------------------------------------------------------

#' Parse a raw Hackmageddon Excel file
#'
#' Reads the first sheet of the workbook, keeping only the requested columns.
#' The first selected column is read as numeric and the remaining three as
#' character, so `cols` is expected to hold four column indexes.
#'
#' @param file path to the excel file
#' @param cols vector of column indexes to read (four indexes)
#'
#' @return data.frame with the selected columns as read from the sheet
#' @export
#'
#' @examples
#' \dontrun{
#' data.raw <- ParseHMExcel("./data/hackmaggedon/file.xls", c(2, 3, 6, 5))
#' }
ParseHMExcel <- function(file, cols) {
  # Checks
  if (!file.exists(file)) {
    stop(paste("Error, file [", file, "] not found"))
  }

  # Every file shares the same column types; only the column positions and
  # the presence of a leading blank row change from file to file.
  col.classes <- c("numeric", "character", "character", "character")
  read.sheet <- function(...) {
    xlsx::read.xlsx2(file, 1, header = TRUE, colIndex = cols,
                     colClasses = col.classes, ...)
  }

  dataset <- read.sheet()

  # If there is no Date column the header was probably not on the first row
  # (leading blank row), so read again starting from the second row.
  if (is.null(dataset$Date)) {
    dataset <- read.sheet(startRow = 2)
  }

  dataset
}

#' Prepare raw data from a Hackmageddon Excel file for use
#'
#' Renames the four columns to `Date`, `Attack`, `Target` and `Country`,
#' drops rows without date or country, drops rows whose country is `"INT"` or
#' contains `">"`, expands rows listing several countries into one row per
#' country and converts the numeric date (days since `dateOffset`) to POSIXct.
#'
#' @param dataset.raw data.frame with raw data (four columns)
#' @param dateOffset origin to add the numeric date to
#'
#' @return data.frame with columns Date (POSIXct, GMT), Attack, Target, Country
#' @export
#'
#' @examples
#' \dontrun{
#' data.pro <- ProcessHMRaw(data.raw, "1899-12-30")
#' }
ProcessHMRaw <- function(dataset.raw, dateOffset) {
  # Standard names for the columns
  dataset <- setNames(dataset.raw, c("Date", "Attack", "Target", "Country"))

  # Remove rows without a date or a country
  dataset <- dataset[!is.na(dataset$Date) & !is.na(dataset$Country), ]

  # Remove rows that do not point to a concrete country: "INT" and any
  # value containing ">" (this covers ">1", ">A", ...).
  unusable <- dataset$Country == "INT" |
    grepl(">", dataset$Country, fixed = TRUE)
  dataset <- dataset[!unusable, ]

  # Countries listed on several lines of one cell become space separated and
  # are then expanded into one row per country.
  dataset$Country <- gsub("\n", " ", dataset$Country)
  dataset <- FilterMultiCountry(dataset)
  dataset <- dataset[dataset$Country != "", ]

  # Format the date properly: the sheet stores days, POSIXct needs seconds
  dataset$Date <- as.POSIXct(dataset$Date * 86400, tz = "GMT",
                             origin = dateOffset)
  dataset
}

#' Expand rows that list several countries into one row per country
#'
#' Rows whose `Country` contains a space are split on spaces; every piece
#' yields a copy of the row with that single value as `Country`. Rows with a
#' single country are returned first, followed by the expanded rows. Empty
#' pieces (e.g. produced by consecutive spaces) are kept as `""` so the caller
#' can filter them.
#'
#' @param dataset.pre data.frame with a `Country` column
#'
#' @return data.frame with the same columns and one country per row
#' @noRd
FilterMultiCountry <- function(dataset.pre) {
  is.multi <- grepl(" ", dataset.pre$Country, fixed = TRUE)

  # Nothing to expand: avoids iterating over an empty selection
  if (!any(is.multi)) {
    return(dataset.pre)
  }

  single <- dataset.pre[!is.multi, , drop = FALSE]
  multi <- dataset.pre[is.multi, , drop = FALSE]

  # One character vector of countries per multi-country row
  countries <- strsplit(as.character(multi$Country), " ", fixed = TRUE)

  # Repeat each row once per country it lists, then assign the countries
  expanded <- multi[rep(seq_len(nrow(multi)), lengths(countries)), ,
                    drop = FALSE]
  expanded$Country <- unlist(countries)

  rbind(single, expanded)
}

#' Parse every excel file in a folder
#'
#' Every file whose name ends in `.xls` or `.xlsx` is parsed with
#' `ParseHMExcel()`, processed with `ProcessHMRaw()` and the results are
#' appended into a single data.frame.
#'
#' @param folder path to the folder to iterate (with or without a trailing
#'   path separator)
#' @param cols columns to parse in each file
#' @param dateOffset origin used to compute the dates in each file
#'
#' @return data.frame
#' @export
#'
#' @examples
#' \dontrun{
#' data.pro <- ParseHMFolder("./data/hackmageddon/Format1/", c(2, 9, 7, 6),
#'                           "1899-12-30")
#' }
ParseHMFolder <- function(folder, cols, dateOffset) {
  # List excel files in the folder. `pattern` is a regular expression, not a
  # glob, so the extension is anchored explicitly.
  filelist <- list.files(folder, pattern = "\\.xlsx?$", ignore.case = TRUE)
  if (length(filelist) == 0) {
    stop(paste("No Excel files found in folder [", folder, "]"))
  }

  # Drop trailing separators so the folder can be given with or without them
  folder <- sub("[/\\\\]+$", "", folder)

  # Parse and process each file, then append all the returned data.frames
  parsed <- lapply(file.path(folder, filelist), function(path) {
    ProcessHMRaw(ParseHMExcel(path, cols), dateOffset)
  })
  do.call(rbind, parsed)
}

#' Parse the default Hackmageddon data shipped with the package (2012-2016)
#'
#' @param data.dir folder holding the `Format1`, `Format2`, `Format3`
#'   (with its `Format3.2` subfolder) and `Format4` folders
#'
#' @return data.frame
#' @export
#'
#' @examples
#' \dontrun{
#' Attacks <- GetDefaultAttacksData()
#' }
GetDefaultAttacksData <- function(data.dir = "./data/hackmageddon/") {
  data.dir <- sub("[/\\\\]+$", "", data.dir)

  # Each folder groups the files sharing a column layout and a date origin
  formats <- list(
    list(folder = "Format1", cols = c(2, 9, 7, 6), origin = "1899-12-30"),
    list(folder = "Format2", cols = c(2, 7, 5, 3), origin = "1904-01-01"),
    list(folder = "Format3", cols = c(2, 9, 7, 6), origin = "1904-01-01"),
    list(folder = file.path("Format3", "Format3.2"), cols = c(2, 9, 7, 6),
         origin = "1899-12-30"),
    list(folder = "Format4", cols = c(1, 9, 5, 3), origin = "1899-12-30")
  )

  # Parse every folder with its own properties
  parsed <- lapply(formats, function(fmt) {
    ParseHMFolder(file.path(data.dir, fmt$folder), fmt$cols, fmt$origin)
  })

  # Append every data.frame in the standard format
  do.call(rbind, parsed)
}