# Hackmageddon_Parser.R 4.3 KB
#------------------------------------------------------------------------------------------------------
#-----------------------------Parser Excel - Hackmageddon----------------------------------------------
#------------------------------------------------------------------------------------------------------



#' Read the selected columns of a Hackmageddon Excel export
#'
#' Reads the first sheet of the workbook with `xlsx::read.xlsx2()`. When the
#' result has no `Date` column the sheet is read a second time starting at
#' row 2, on the assumption that a blank leading row displaced the header.
#'
#' @param file path to the excel file
#' @param cols indexes of the columns to read; they are read with the classes
#'   numeric, character, character, character
#'
#' @return data.frame with the requested columns
#' @export
#'
#' @examples
#' data.raw <- ParseHMExcel("./data/hackmaggedon/file.xls", c(2, 3, 6, 5))
ParseHMExcel <- function(file, cols){

  # Fail early when the workbook is missing
  if (!file.exists(file)) {
    stop(paste("Error, file [", file, "] not found"))
  }

  # Every file shares the same column classes; only the positions differ
  col_types <- c("numeric", "character", "character", "character")
  read_sheet <- function(...) {
    xlsx::read.xlsx2(file, 1, header = TRUE, colIndex = cols,
                     colClasses = col_types, ...)
  }

  sheet <- read_sheet()

  # No Date column: probably a blank first row, so retry from row 2
  if (is.null(sheet$Date)) {
    sheet <- read_sheet(startRow = 2)
  }

  sheet
}



#' Clean a raw Hackmageddon extract and convert its dates
#'
#' Renames the four columns to Date, Attack, Target and Country, drops rows
#' with a missing date or country, drops rows whose country is "INT" or
#' contains ">", replaces line breaks inside the country with spaces, passes
#' the result through `FilterMultiCountry()`, removes rows with an empty
#' country and finally turns the numeric day count into a POSIXct timestamp.
#'
#' @param dataset.raw data.frame with the four raw columns
#' @param dateOffset origin the numeric day count is measured from
#'
#' @return data.frame with columns Date (POSIXct, GMT), Attack, Target, Country
#' @export
#'
#' @examples
#' data.pro <- ProcessHMRaw(data.raw, "1899-12-30")
ProcessHMRaw <- function(dataset.raw, dateOffset){

  # Standard column names
  cleaned <- setNames(dataset.raw, c("Date", "Attack", "Target", "Country"))

  # Keep rows with a date and a usable country. Values such as ">1" or ">A"
  # contain ">", so the single pattern test below also rejects them.
  usable <- !is.na(cleaned$Date) &
    !is.na(cleaned$Country) &
    cleaned$Country != "INT" &
    !grepl(">", cleaned$Country)
  cleaned <- cleaned[usable, ]

  # Line breaks inside a cell separate countries just like spaces do
  cleaned$Country <- gsub("\n", " ", cleaned$Country)
  cleaned <- FilterMultiCountry(cleaned)
  cleaned <- cleaned[cleaned$Country != "", ]

  # Dates arrive as a day count from dateOffset; POSIXct wants seconds
  seconds <- cleaned$Date * 86400
  cleaned$Date <- as.POSIXct(seconds, tz = "GMT", origin = dateOffset)

  cleaned
}

# Expand rows that list several countries into one row per country.
#
# A Country value containing a space is treated as a space-separated list of
# countries. Each such row is replaced by one copy per listed country, with
# all other columns repeated. Rows with a single country are returned first,
# in their original order, followed by the expanded rows.
#
# Leading or repeated spaces produce empty country strings; these are left in
# the result for the caller to drop.
#
# dataset.pre: data.frame with a Country column.
# Returns a data.frame with the same columns as dataset.pre.
FilterMultiCountry <- function(dataset.pre) {
  has_several <- grepl(" ", dataset.pre$Country, fixed = TRUE)
  single <- dataset.pre[!has_several, , drop = FALSE]
  multi <- dataset.pre[has_several, , drop = FALSE]

  # Nothing to expand: return early so an empty selection is never indexed
  if (nrow(multi) == 0L) {
    return(single)
  }

  # One character vector of countries per multi-country row
  countries <- strsplit(as.character(multi$Country), " ", fixed = TRUE)

  # Repeat every row once per country it lists, then overwrite Country with
  # the individual values (unlist keeps them in the same row order)
  row_index <- rep(seq_len(nrow(multi)), lengths(countries))
  expanded <- multi[row_index, , drop = FALSE]
  expanded$Country <- unlist(countries, use.names = FALSE)

  rbind(single, expanded)
}

#' Parse every excel file in a folder
#'
#' Lists the Excel workbooks (.xls, .xlsx, .xlsm) directly inside `folder`,
#' runs each one through `ParseHMExcel()` and `ProcessHMRaw()`, and stacks the
#' results in the order `list.files()` returns the files.
#'
#' @param folder path to the folder to iterate, with or without a trailing
#'   path separator
#' @param cols columns to parse in each file
#' @param dateOffset origin used to compute the dates in each file
#'
#' @return data.frame with the processed rows of every file
#' @export
#'
#' @examples
#' data.pro <- ParseHMFolder("./data/hackmageddon/Format1/", c(2, 9, 7, 6), "1899-12-30")
ParseHMFolder <- function(folder, cols, dateOffset){

  # list.files() expects a regular expression, not a glob: anchor on the
  # extension so names that merely contain "xl" are not picked up
  filelist <- list.files(folder, pattern = "\\.xls[xm]?$", ignore.case = TRUE)

  if (length(filelist) == 0L) {
    stop(paste("Error, no Excel files found in [", folder, "]"))
  }

  # Keep plain concatenation when the caller already supplied the separator
  paths <- if (grepl("[/\\\\]$", folder)) {
    paste0(folder, filelist)
  } else {
    file.path(folder, filelist)
  }

  # Parse each file once and bind at the end; this also handles a folder
  # holding a single file, where 2:length(filelist) would count backwards
  parsed <- lapply(paths, function(path) {
    ProcessHMRaw(ParseHMExcel(path, cols), dateOffset)
  })

  do.call(rbind, parsed)
}

#' Load the Hackmageddon data bundled with the package (2012-2016)
#'
#' Parses each of the five bundled folders with the column positions and date
#' origin that match its file layout and stacks the results, in folder order,
#' into a single data.frame. Paths are relative to the working directory.
#'
#' @return data.frame
#' @export
#'
#' @examples
#' Attacks <- GetDefaultAttacksData()
GetDefaultAttacksData <- function() {

  # One entry per layout: folder, column indexes, date origin
  layouts <- list(
    list("./data/hackmageddon/Format1/", c(2, 9, 7, 6), "1899-12-30"),
    list("./data/hackmageddon/Format2/", c(2, 7, 5, 3), "1904-01-01"),
    list("./data/hackmageddon/Format3/", c(2, 9, 7, 6), "1904-01-01"),
    list("./data/hackmageddon/Format3/Format3.2/", c(2, 9, 7, 6), "1899-12-30"),
    list("./data/hackmageddon/Format4/", c(1, 9, 5, 3), "1899-12-30")
  )

  # Parse every folder with its own settings
  parsed <- lapply(layouts, function(layout) {
    ParseHMFolder(layout[[1]], layout[[2]], layout[[3]])
  })

  # Stack all layouts into the standard format
  do.call(rbind, parsed)
}