From 2de5507d6f309c26f053e7e88787a3fa18c633bb Mon Sep 17 00:00:00 2001 From: Miguel Tuñón Date: Tue, 20 Dec 2016 01:06:18 +0100 Subject: [PATCH] Parsers working correctly --- ISO27001effectiveness/DESCRIPTION | 1 + ISO27001effectiveness/Main.R | 15 ++++----------- ISO27001effectiveness/R/Hackmageddon_Parser.R | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------- ISO27001effectiveness/R/ISOSurvey_Parser.R | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------- ISO27001effectiveness/R/Util.R | 14 -------------- ISO27001effectiveness/data/hackmageddon/Format3/1-15 Jan 2014 Cyber Attacks Timeline.xls | Bin 868864 -> 0 bytes ISO27001effectiveness/data/hackmageddon/Format3/16-31 December 2014 Cyber Attacks Timeline.xlsx | Bin 503129 -> 0 bytes ISO27001effectiveness/data/hackmageddon/Format3/16-31 Jan 2014 Cyber Attacks Timeline.xlsx | Bin 354544 -> 0 bytes ISO27001effectiveness/data/hackmageddon/Format3/16-31 Jun 2014 Cyber Attacks Timeline.xlsx | Bin 106706 -> 0 bytes ISO27001effectiveness/data/hackmageddon/Format3/Format3.2/1-15 Jan 2014 Cyber Attacks Timeline.xls | Bin 0 -> 868864 bytes ISO27001effectiveness/data/hackmageddon/Format3/Format3.2/16-31 Jan 2014 Cyber Attacks Timeline.xlsx | Bin 0 -> 354544 bytes ISO27001effectiveness/data/test/16-31 Mar 2014 Cyber Attacks Timeline.xlsx | Bin 503141 -> 0 bytes ISO27001effectiveness/data/test/16-31 Mar 2015 Cyber Attacks Timeline.xlsx | Bin 514908 -> 0 bytes 13 files changed, 158 insertions(+), 59 deletions(-) delete mode 100644 ISO27001effectiveness/data/hackmageddon/Format3/1-15 Jan 2014 Cyber Attacks Timeline.xls delete mode 100644 ISO27001effectiveness/data/hackmageddon/Format3/16-31 Jan 2014 Cyber Attacks Timeline.xlsx delete mode 100644 ISO27001effectiveness/data/hackmageddon/Format3/16-31 Jun 2014 Cyber Attacks Timeline.xlsx create mode 100644 ISO27001effectiveness/data/hackmageddon/Format3/Format3.2/1-15 Jan 2014 Cyber Attacks Timeline.xls create mode 100644 ISO27001effectiveness/data/hackmageddon/Format3/Format3.2/16-31 Jan 2014 Cyber Attacks Timeline.xlsx delete mode 100644 ISO27001effectiveness/data/test/16-31 Mar 2014 Cyber Attacks Timeline.xlsx delete mode 100644 ISO27001effectiveness/data/test/16-31 Mar 2015 Cyber Attacks Timeline.xlsx diff --git a/ISO27001effectiveness/DESCRIPTION b/ISO27001effectiveness/DESCRIPTION index 5842733..28ba219 100644 --- a/ISO27001effectiveness/DESCRIPTION +++ b/ISO27001effectiveness/DESCRIPTION @@ -10,3 +10,4 @@ License: What license is it under? Encoding: UTF-8 LazyData: true RoxygenNote: 5.0.1 +Imports: xlsx diff --git a/ISO27001effectiveness/Main.R b/ISO27001effectiveness/Main.R index 9b8363c..cdf48b8 100644 --- a/ISO27001effectiveness/Main.R +++ b/ISO27001effectiveness/Main.R @@ -1,13 +1,6 @@ -source("./R/Util.R") -LoadLibraries() -source("./R/ISOSurvey_Parser.R") -Cert_PerCountry <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 1) -Sites_PerCountry <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 2) -Cert_PerSector <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 3) +Cert_PerCountry <- ISO27001effectiveness::GetISOSurveyCertsPerCountry() +Sites_PerCountry <- ISO27001effectiveness::GetISOSurveySitesPerCountry() +Cert_PerSector <- ISO27001effectiveness::GetISOSurveyCertsPerSector() -Cert_PerCountry <- ProccesISOSurveyByCountryRaw(Cert_PerCountry, c("X2012", "X2013", "X2014", "X2015")) -Sites_PerCountry <- ProccesISOSurveyByCountryRaw(Sites_PerCountry, c( "X2012", "X2013", "X2014", "X2015")) -#Cert_PerSector <- ProccesISOSurveyRaw(Cert_PerSector, c("X2012", "X2013", "X2014", "X2015")) - -Attacks <- GetAttacksData() +Attacks <- ISO27001effectiveness::GetDefaultAttacksData() diff --git a/ISO27001effectiveness/R/Hackmageddon_Parser.R b/ISO27001effectiveness/R/Hackmageddon_Parser.R index 7708f30..3d9add8 100644 --- a/ISO27001effectiveness/R/Hackmageddon_Parser.R +++ b/ISO27001effectiveness/R/Hackmageddon_Parser.R @@ -4,13 +4,27 @@ +#' Parse an excel raw data file from armaggedon +#' +#' @param file path to the excel file +#' @param cols list of columns index to read +#' +#' @return data.frame +#' @export +#' +#' @examples +#' data.raw <- ParseHMExcel("./data/hackmaggedon/file.xls", c(2, 3, 6, 5)) ParseHMExcel <- function(file, cols){ - print(file) + + #Checks if (!file.exists(file)) { stop(paste("Error, file [", file, "] not found")) } + + #Parse data from excell, same formats but different columns because of the files dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character")) + #If date is null probably theres a blanc row that should be ignored if (is.null(dataset$Date)) { dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character"), startRow = 2) } @@ -18,47 +32,77 @@ ParseHMExcel <- function(file, cols){ dataset } -ProccesHMRaw <- function(dataset.raw){ + + +#' Prepare raw data from hackmaggedon's excel to use it +#' +#' @param dataset.raw data.frame with raw data +#' @param dateOffset origin to add the numeric date +#' +#' @return data.frame +#' @export +#' +#' @examples +#' data.pro <- ProccesHMRaw(data.raw, "1899-12-30") +ProccesHMRaw <- function(dataset.raw, dateOffset){ + + #Standar names to the columns + dataset <- setNames(dataset.raw, c("Date", "Attack", "Target", "Country")) #Remove rows with Date NA - print(class(dataset.raw$Date)) - dataset <- dataset.raw[!is.na(dataset.raw$Date),] - dataset$Date <- as.POSIXct(dataset$Date*86400, tz ="GMT", origin ="1904-01-01") + dataset <- dataset[!is.na(dataset$Date),] + + #Format properly the date + dataset$Date <- as.POSIXct(dataset$Date*86400, tz = "GMT", origin = dateOffset) dataset } -#' Title + + +#' Parse every excel file into a folder #' -#' @param folder -#' @param cols +#' @param folder path to the folder to iterate +#' @param cols columns to parse into each file +#' @param dateOffset origin to calc the dates into each file #' -#' @return +#' @return data.frame #' @export #' #' @examples -ParseHMFolder <- function(folder, cols){ - filelist <- list.files(folder) - #frames <- lapply(paste(folder,filelist,sep = ""),ParseHMExcel) +#' data.pro <- ProccesHMRaw("./data/hackmaggedon/", c(1, 5, 3) "1899-12-30") +ParseHMFolder <- function(folder, cols, dateOffset){ - myFile <- paste(folder,filelist[1],sep = "") - dataset <- ProccesHMRaw(ParseHMExcel(myFile, cols)) + #List excel files into the folder + filelist <- list.files(folder, pattern = "*.xls*") + #Iterate for each file appending the returned data.frame + dataset <- ProccesHMRaw(ParseHMExcel(paste(folder,filelist[1],sep = ""), cols), dateOffset) for (i in 2:length(filelist)) { - dataset <- rbind(dataset, ProccesHMRaw(ParseHMExcel(paste(folder,filelist[i],sep = ""), cols))) + dataset <- rbind(dataset, ProccesHMRaw(ParseHMExcel(paste(folder,filelist[i],sep = ""), cols), dateOffset)) } dataset } -GetAttacksData <- function() { - - format1 <- ParseHMFolder("./data/hackmageddon/Format1/", c(2, 9, 7, 6)) - format2 <- ParseHMFolder("./data/hackmageddon/Format2/", c(2,7, 5, 6)) - format3 <- ParseHMFolder("./data/hackmageddon/Format3/", c(2, 9, 7, 6)) - format4 <- ParseHMFolder("./data/hackmageddon/Format4/", c(1, 9, 5, 3)) - - dataset <- rbind(format1, format2, format3, format4) +#' Parse the default data from the package from hackmaggedon (2012-2016) +#' +#' @return data.frame +#' @export +#' +#' @examples +#' Attacks <- GetDefaultAttacksData() +GetDefaultAttacksData <- function() { + + #Parsing each different folder with the correct properties + format1 <- ParseHMFolder("./data/hackmageddon/Format1/", c(2, 9, 7, 6), "1899-12-30") + format2 <- ParseHMFolder("./data/hackmageddon/Format2/", c(2, 7, 5, 3), "1904-01-01") + format3 <- ParseHMFolder("./data/hackmageddon/Format3/", c(2, 9, 7, 6), "1904-01-01") + format3.2 <- ParseHMFolder("./data/hackmageddon/Format3/Format3.2/", c(2, 9, 7, 6), "1899-12-30") + format4 <- ParseHMFolder("./data/hackmageddon/Format4/", c(1, 9, 5, 3), "1899-12-30") + + #Appending evey data.frame in the standard format + dataset <- rbind(format1, format2, format3, format3.2, format4) dataset } diff --git a/ISO27001effectiveness/R/ISOSurvey_Parser.R b/ISO27001effectiveness/R/ISOSurvey_Parser.R index d436ec5..fec6e7d 100644 --- a/ISO27001effectiveness/R/ISOSurvey_Parser.R +++ b/ISO27001effectiveness/R/ISOSurvey_Parser.R @@ -17,10 +17,12 @@ #' Cert_PerSector <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 3) ParseExcelFileRaw <- function(file, sheet){ + #Checks if (!file.exists(file)) { stop(paste("Error, file [", file, "] not found")) } + #Parse the excel file dataset <- xlsx::read.xlsx2(file, sheet,colClasses = c("character","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric"), header = TRUE, as.data.frame = TRUE) dataset @@ -28,9 +30,41 @@ ParseExcelFileRaw <- function(file, sheet){ -#' Process raw data from ISO survey +#' PRocess raw data parsed from excel file ISOSurvey27001 +#' +#' @param dataset.raw data.frame with raw data +#' @param years list of years to include preceded with a X +#' +#' @return data.frame +#' @export +#' +#' @examples +#' Cert_PerSector <- ProccesISOSurveyRaw(Cert_PerSector, c("X2010", "X2011", "X2012", "X2013", "X2014", "X2015")) +ProccesISOSurveyRaw <- function(dataset.raw, years){ + + #Change NAs to zeros + dataset <- dataset.raw + dataset[is.na(dataset)] <- 0 + + vars <- names(dataset) + + #Remove blanc rows + if ("INDUSTRIAL.SECTOR" == vars[1]) { + dataset <- dataset[!dataset$INDUSTRIAL.SECTOR == "",] + }else if ("Country" == vars[1]) { + dataset <- dataset[!dataset$Country == "",] + } + + #Remove years not included in years param + years_checked <- intersect(vars, c("Country", "INDUSTRIAL.SECTOR", years)) + dataset <- dataset[,years_checked] + + + dataset +} + +#' Join data from ISOSurvey with 2 letter code countries plus process raw.data #' -#' Proccess the raw data from ISO survey to replace NAs, normalizate country names and filter years #' @param dataset.raw raw data from ISO Survey excel file #' @param years List of years to return, c("X2006", "X2010", ...) #' @@ -40,21 +74,62 @@ ParseExcelFileRaw <- function(file, sheet){ #' #' Cert_PerCountry <- ProccesISOSurveyRaw(Cert_PerCountry, c("X2010", "X2011", "X2012", "X2013", "X2014", "X2015")) #' Sites_PerCountry <- ProccesISOSurveyRaw(Sites_PerCountry, c("X2010", "X2011", "X2012", "X2013", "X2014", "X2015")) -#' Cert_PerSector <- ProccesISOSurveyRaw(Cert_PerSector, c("X2010", "X2011", "X2012", "X2013", "X2014", "X2015")) ProccesISOSurveyByCountryRaw <- function(dataset.raw, years){ - #NAs to 0s - dataset <- dataset.raw - dataset[is.na(dataset)] <- 0 + #Standard proccess + dataset <- ProccesISOSurveyRaw(dataset.raw, years) #Translate country names to 2 letter code CountryNames <- GetCountryAbrev() - dataset <- merge(x = dataset, y = CountryNames, by = "Country", all.x = TRUE) - vars <- names(dataset) - years_checked <- intersect(vars, years) - dataset <- dataset[,c(c("Country", "country_short"), years_checked)] - dataset } + + + +#' Get data of certificates per year and country from IS27001 +#' +#' @return data.frame +#' @export +#' +#' @examples +#' Cert_PerCountry <- GetISOSurveyCertsPerCountry() +GetISOSurveyCertsPerCountry <- function() { + Cert_PerCountry <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 1) + + Cert_PerCountry <- ProccesISOSurveyByCountryRaw(Cert_PerCountry, c("X2012", "X2013", "X2014", "X2015")) + + Cert_PerCountry +} + +#' Get data of sites per year and country from IS27001 +#' +#' @return data.frame +#' @export +#' +#' @examples +#' Sites_PerCountry <- GetISOSurveySitesPerCountry() +GetISOSurveySitesPerCountry <- function() { + Sites_PerCountry <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 2) + + Sites_PerCountry <- ProccesISOSurveyByCountryRaw(Sites_PerCountry, c( "X2012", "X2013", "X2014", "X2015")) + + Sites_PerCountry +} + +#' Get data of certificates per year and sector from IS27001 +#' +#' @return data.frame +#' @export +#' +#' @examples +#' Cert_PerSector <- GetISOSurveyCertsPerSector() +GetISOSurveyCertsPerSector <- function() { + Cert_PerSector <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 3) + + Cert_PerSector <- ProccesISOSurveyRaw(Cert_PerSector, c("X2012", "X2013", "X2014", "X2015")) + + Cert_PerSector + +} diff --git a/ISO27001effectiveness/R/Util.R b/ISO27001effectiveness/R/Util.R index 829ebb4..ea739ca 100644 --- a/ISO27001effectiveness/R/Util.R +++ b/ISO27001effectiveness/R/Util.R @@ -1,19 +1,5 @@ #-----------------------------Util functions-------------------------------------------------- - -#' Install and load required libraries -#' -#' This function checks if every required library is installed to be loaded, if not they will be installed and then loaded. -#' Libraries installed: -#' xlsx to parse excel files like ISO survey source format -LoadParserLibraries <- function(){ - if (!require("xlsx")) - { - install.packages("xlsx") - if (!require("xlsx")) stop("Error while loading package [xlsx]") - } -} - #' Return the 2 letter code of a country relation #' #' Relation of country names included in the ISO Survey input file with 2 letter code diff --git a/ISO27001effectiveness/data/hackmageddon/Format3/1-15 Jan 2014 Cyber Attacks Timeline.xls b/ISO27001effectiveness/data/hackmageddon/Format3/1-15 Jan 2014 Cyber Attacks Timeline.xls deleted file mode 100644 index 9334a2f..0000000 Binary files a/ISO27001effectiveness/data/hackmageddon/Format3/1-15 Jan 2014 Cyber Attacks Timeline.xls and /dev/null differ diff --git a/ISO27001effectiveness/data/hackmageddon/Format3/16-31 December 2014 Cyber Attacks Timeline.xlsx b/ISO27001effectiveness/data/hackmageddon/Format3/16-31 December 2014 Cyber Attacks Timeline.xlsx index 471bf19..0bea2d6 100644 Binary files a/ISO27001effectiveness/data/hackmageddon/Format3/16-31 December 2014 Cyber Attacks Timeline.xlsx and b/ISO27001effectiveness/data/hackmageddon/Format3/16-31 December 2014 Cyber Attacks Timeline.xlsx differ diff --git a/ISO27001effectiveness/data/hackmageddon/Format3/16-31 Jan 2014 Cyber Attacks Timeline.xlsx b/ISO27001effectiveness/data/hackmageddon/Format3/16-31 Jan 2014 Cyber Attacks Timeline.xlsx deleted file mode 100644 index 91005f3..0000000 Binary files a/ISO27001effectiveness/data/hackmageddon/Format3/16-31 Jan 2014 Cyber Attacks Timeline.xlsx and /dev/null differ diff --git a/ISO27001effectiveness/data/hackmageddon/Format3/16-31 Jun 2014 Cyber Attacks Timeline.xlsx b/ISO27001effectiveness/data/hackmageddon/Format3/16-31 Jun 2014 Cyber Attacks Timeline.xlsx deleted file mode 100644 index 5db0c45..0000000 Binary files a/ISO27001effectiveness/data/hackmageddon/Format3/16-31 Jun 2014 Cyber Attacks Timeline.xlsx and /dev/null differ diff --git a/ISO27001effectiveness/data/hackmageddon/Format3/Format3.2/1-15 Jan 2014 Cyber Attacks Timeline.xls b/ISO27001effectiveness/data/hackmageddon/Format3/Format3.2/1-15 Jan 2014 Cyber Attacks Timeline.xls new file mode 100644 index 0000000..fd980cc Binary files /dev/null and b/ISO27001effectiveness/data/hackmageddon/Format3/Format3.2/1-15 Jan 2014 Cyber Attacks Timeline.xls differ diff --git a/ISO27001effectiveness/data/hackmageddon/Format3/Format3.2/16-31 Jan 2014 Cyber Attacks Timeline.xlsx b/ISO27001effectiveness/data/hackmageddon/Format3/Format3.2/16-31 Jan 2014 Cyber Attacks Timeline.xlsx new file mode 100644 index 0000000..91005f3 Binary files /dev/null and b/ISO27001effectiveness/data/hackmageddon/Format3/Format3.2/16-31 Jan 2014 Cyber Attacks Timeline.xlsx differ diff --git a/ISO27001effectiveness/data/test/16-31 Mar 2014 Cyber Attacks Timeline.xlsx b/ISO27001effectiveness/data/test/16-31 Mar 2014 Cyber Attacks Timeline.xlsx deleted file mode 100644 index 5780129..0000000 Binary files a/ISO27001effectiveness/data/test/16-31 Mar 2014 Cyber Attacks Timeline.xlsx and /dev/null differ diff --git a/ISO27001effectiveness/data/test/16-31 Mar 2015 Cyber Attacks Timeline.xlsx b/ISO27001effectiveness/data/test/16-31 Mar 2015 Cyber Attacks Timeline.xlsx deleted file mode 100644 index b38e8d3..0000000 Binary files a/ISO27001effectiveness/data/test/16-31 Mar 2015 Cyber Attacks Timeline.xlsx and /dev/null differ -- libgit2 0.22.2