Commit 2de5507d6f309c26f053e7e88787a3fa18c633bb

Authored by Miguel Tuñón
1 parent f376d0cf

Parsers working correctly

ISO27001effectiveness/DESCRIPTION
@@ -10,3 +10,4 @@ License: What license is it under? @@ -10,3 +10,4 @@ License: What license is it under?
10 Encoding: UTF-8 10 Encoding: UTF-8
11 LazyData: true 11 LazyData: true
12 RoxygenNote: 5.0.1 12 RoxygenNote: 5.0.1
  13 +Imports: xlsx
ISO27001effectiveness/Main.R
1 -source("./R/Util.R")  
2 -LoadLibraries()  
3 -source("./R/ISOSurvey_Parser.R")  
4 1
5 -Cert_PerCountry <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 1)  
6 -Sites_PerCountry <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 2)  
7 -Cert_PerSector <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 3) 2 +Cert_PerCountry <- ISO27001effectiveness::GetISOSurveyCertsPerCountry()
  3 +Sites_PerCountry <- ISO27001effectiveness::GetISOSurveySitesPerCountry()
  4 +Cert_PerSector <- ISO27001effectiveness::GetISOSurveyCertsPerSector()
8 5
9 -Cert_PerCountry <- ProccesISOSurveyByCountryRaw(Cert_PerCountry, c("X2012", "X2013", "X2014", "X2015"))  
10 -Sites_PerCountry <- ProccesISOSurveyByCountryRaw(Sites_PerCountry, c( "X2012", "X2013", "X2014", "X2015"))  
11 -#Cert_PerSector <- ProccesISOSurveyRaw(Cert_PerSector, c("X2012", "X2013", "X2014", "X2015"))  
12 -  
13 -Attacks <- GetAttacksData() 6 +Attacks <- ISO27001effectiveness::GetDefaultAttacksData()
ISO27001effectiveness/R/Hackmageddon_Parser.R
@@ -4,13 +4,27 @@ @@ -4,13 +4,27 @@
4 4
5 5
6 6
  7 +#' Parse a raw Excel data file from Hackmageddon
  8 +#'
  9 +#' @param file path to the excel file
  10 +#' @param cols list of columns index to read
  11 +#'
  12 +#' @return data.frame
  13 +#' @export
  14 +#'
  15 +#' @examples
  16 +#' data.raw <- ParseHMExcel("./data/hackmageddon/file.xls", c(2, 3, 6, 5))
7 ParseHMExcel <- function(file, cols){ 17 ParseHMExcel <- function(file, cols){
8 - print(file) 18 +
  19 + #Checks
9 if (!file.exists(file)) { 20 if (!file.exists(file)) {
10 stop(paste("Error, file [", file, "] not found")) 21 stop(paste("Error, file [", file, "] not found"))
11 } 22 }
  23 +
  24 + #Parse data from Excel: the files share the same format but use different columns
12 dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character")) 25 dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character"))
13 26
  27 + #If Date is null there is probably a blank first row that should be skipped
14 if (is.null(dataset$Date)) { 28 if (is.null(dataset$Date)) {
15 dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character"), startRow = 2) 29 dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character"), startRow = 2)
16 } 30 }
@@ -18,47 +32,77 @@ ParseHMExcel &lt;- function(file, cols){ @@ -18,47 +32,77 @@ ParseHMExcel &lt;- function(file, cols){
18 dataset 32 dataset
19 } 33 }
20 34
#' Prepare raw data from Hackmageddon's Excel files for analysis
#'
#' Renames the four parsed columns to the standard names (`Date`, `Attack`,
#' `Target`, `Country`), drops the rows whose date is `NA` and converts the
#' numeric date (days since `dateOffset`) to `POSIXct` in GMT.
#'
#' @param dataset.raw data.frame with exactly four columns, in the order
#'   date, attack, target, country; the date column must be numeric
#' @param dateOffset origin the numeric date is counted from,
#'   e.g. "1899-12-30"
#'
#' @return data.frame with columns Date (POSIXct, GMT), Attack, Target and
#'   Country
#' @export
#'
#' @examples
#' data.raw <- data.frame(Date = c(41640, NA), Attack = c("SQLi", "DDoS"),
#'                        Target = c("Site A", "Site B"),
#'                        Country = c("ES", "US"))
#' data.pro <- ProccesHMRaw(data.raw, "1899-12-30")
ProccesHMRaw <- function(dataset.raw, dateOffset){

  #Fail early with a clear message instead of silently producing NA column names
  if (!is.data.frame(dataset.raw) || ncol(dataset.raw) != 4) {
    stop("Error, dataset.raw must be a data.frame with 4 columns (Date, Attack, Target, Country)")
  }

  #Standard names for the columns (base assignment, no dependency on stats::setNames)
  dataset <- dataset.raw
  names(dataset) <- c("Date", "Attack", "Target", "Country")

  #Remove rows with Date NA
  dataset <- dataset[!is.na(dataset$Date), , drop = FALSE]

  #Dates are stored as days since the origin: convert them to seconds for POSIXct
  dataset$Date <- as.POSIXct(dataset$Date * 86400, tz = "GMT", origin = dateOffset)

  dataset
}
30 60
#' Parse every Excel file in a folder
#'
#' Lists the `.xls`/`.xlsx` files directly inside `folder`, parses each one
#' with `ParseHMExcel()`, normalizes it with `ProccesHMRaw()` and row-binds
#' the results in the order returned by `list.files()`.
#'
#' @param folder path to the folder to iterate
#' @param cols column indexes to parse in each file
#' @param dateOffset origin used to compute the dates in each file
#'
#' @return data.frame with the rows of every parsed file
#' @export
#'
#' @examples
#' \dontrun{
#' data.pro <- ParseHMFolder("./data/hackmageddon/Format1/", c(2, 9, 7, 6), "1899-12-30")
#' }
ParseHMFolder <- function(folder, cols, dateOffset){

  #List excel files in the folder; pattern is a regular expression, not a glob
  filepaths <- list.files(folder, pattern = "\\.xlsx?$", ignore.case = TRUE, full.names = TRUE)

  #Skip the "~$..." lock files Excel leaves next to an open workbook
  filepaths <- filepaths[!grepl("^~\\$", basename(filepaths))]

  #An empty folder is reported explicitly instead of failing on a "folderNA" path
  if (length(filepaths) == 0) {
    stop(paste("Error, no Excel files found in folder [", folder, "]"))
  }

  #Parse and process each file; works for a single file too (no 2:length() countdown)
  frames <- lapply(filepaths, function(path) {
    ProccesHMRaw(ParseHMExcel(path, cols), dateOffset)
  })

  #Bind everything once instead of growing the data.frame inside a loop
  do.call(rbind, frames)
}
53 87
#' Parse the default Hackmageddon data shipped with the package (2012-2016)
#'
#' Every folder is parsed with `ParseHMFolder()` using its own column indexes
#' and date origin, and the resulting data.frames are row-bound in folder
#' order.
#'
#' @return data.frame
#' @export
#'
#' @examples
#' Attacks <- GetDefaultAttacksData()
GetDefaultAttacksData <- function() {

  #One entry per folder: path, column indexes to read, date origin
  layouts <- list(
    list("./data/hackmageddon/Format1/", c(2, 9, 7, 6), "1899-12-30"),
    list("./data/hackmageddon/Format2/", c(2, 7, 5, 3), "1904-01-01"),
    list("./data/hackmageddon/Format3/", c(2, 9, 7, 6), "1904-01-01"),
    list("./data/hackmageddon/Format3/Format3.2/", c(2, 9, 7, 6), "1899-12-30"),
    list("./data/hackmageddon/Format4/", c(1, 9, 5, 3), "1899-12-30")
  )

  #Parse each folder with its own properties
  parsed <- lapply(layouts, function(layout) {
    ParseHMFolder(layout[[1]], layout[[2]], layout[[3]])
  })

  #Append every data.frame, already in the standard format
  do.call(rbind, parsed)
}
ISO27001effectiveness/R/ISOSurvey_Parser.R
@@ -17,10 +17,12 @@ @@ -17,10 +17,12 @@
17 #' Cert_PerSector <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 3) 17 #' Cert_PerSector <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 3)
18 ParseExcelFileRaw <- function(file, sheet){ 18 ParseExcelFileRaw <- function(file, sheet){
19 19
  20 + #Checks
20 if (!file.exists(file)) { 21 if (!file.exists(file)) {
21 stop(paste("Error, file [", file, "] not found")) 22 stop(paste("Error, file [", file, "] not found"))
22 } 23 }
23 24
  25 + #Parse the excel file
24 dataset <- xlsx::read.xlsx2(file, sheet,colClasses = c("character","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric"), header = TRUE, as.data.frame = TRUE) 26 dataset <- xlsx::read.xlsx2(file, sheet,colClasses = c("character","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric"), header = TRUE, as.data.frame = TRUE)
25 27
26 dataset 28 dataset
@@ -28,9 +30,41 @@ ParseExcelFileRaw &lt;- function(file, sheet){ @@ -28,9 +30,41 @@ ParseExcelFileRaw &lt;- function(file, sheet){
28 30
29 31
30 32
#' Process raw data parsed from the ISO 27001 survey Excel file
#'
#' Replaces `NA` values with zeros, removes the rows whose key column
#' (`INDUSTRIAL.SECTOR` or `Country`, when it is the first column) is blank
#' and keeps only the key column plus the requested year columns.
#'
#' @param dataset.raw data.frame with raw data
#' @param years character vector of year columns to keep, each preceded by
#'   an X (e.g. "X2015"); years not present in the data are ignored
#'
#' @return data.frame (also when a single column is selected)
#' @export
#'
#' @examples
#' raw <- data.frame(Country = c("Spain", ""), X2014 = c(1, NA),
#'                   X2015 = c(2, 3), stringsAsFactors = FALSE)
#' ProccesISOSurveyRaw(raw, c("X2015"))
ProccesISOSurveyRaw <- function(dataset.raw, years){

  #Change NAs to zeros
  dataset <- dataset.raw
  dataset[is.na(dataset)] <- 0

  vars <- names(dataset)

  #Remove blank rows, identified by an empty key in the first column
  if (length(vars) > 0 && vars[1] %in% c("INDUSTRIAL.SECTOR", "Country")) {
    dataset <- dataset[dataset[[vars[1]]] != "", , drop = FALSE]
  }

  #Remove years not included in years param
  years_checked <- intersect(vars, c("Country", "INDUSTRIAL.SECTOR", years))

  #drop = FALSE keeps a data.frame even when only one column survives
  dataset[, years_checked, drop = FALSE]
}
  65 +
  66 +#' Process raw ISO Survey data and join it with the 2-letter country codes
32 #' 67 #'
33 -#' Proccess the raw data from ISO survey to replace NAs, normalizate country names and filter years  
34 #' @param dataset.raw raw data from ISO Survey excel file 68 #' @param dataset.raw raw data from ISO Survey excel file
35 #' @param years List of years to return, c("X2006", "X2010", ...) 69 #' @param years List of years to return, c("X2006", "X2010", ...)
36 #' 70 #'
@@ -40,21 +74,62 @@ ParseExcelFileRaw &lt;- function(file, sheet){ @@ -40,21 +74,62 @@ ParseExcelFileRaw &lt;- function(file, sheet){
40 #' 74 #'
41 #' Cert_PerCountry <- ProccesISOSurveyByCountryRaw(Cert_PerCountry, c("X2010", "X2011", "X2012", "X2013", "X2014", "X2015")) 75 #' Cert_PerCountry <- ProccesISOSurveyByCountryRaw(Cert_PerCountry, c("X2010", "X2011", "X2012", "X2013", "X2014", "X2015"))
42 #' Sites_PerCountry <- ProccesISOSurveyByCountryRaw(Sites_PerCountry, c("X2010", "X2011", "X2012", "X2013", "X2014", "X2015")) 76 #' Sites_PerCountry <- ProccesISOSurveyByCountryRaw(Sites_PerCountry, c("X2010", "X2011", "X2012", "X2013", "X2014", "X2015"))
43 -#' Cert_PerSector <- ProccesISOSurveyRaw(Cert_PerSector, c("X2010", "X2011", "X2012", "X2013", "X2014", "X2015"))  
ProccesISOSurveyByCountryRaw <- function(dataset.raw, years){

  #Standard process first: NAs to zeros, blank rows removed, years filtered
  survey <- ProccesISOSurveyRaw(dataset.raw, years)

  #Attach the 2 letter country code; all.x keeps the countries without a match
  merge(x = survey, y = GetCountryAbrev(), by = "Country", all.x = TRUE)
}
  88 +
  89 +
  90 +
  91 +#' Get data of certificates per year and country from IS27001
  92 +#'
  93 +#' @return data.frame
  94 +#' @export
  95 +#'
  96 +#' @examples
  97 +#' Cert_PerCountry <- GetISOSurveyCertsPerCountry()
  98 +GetISOSurveyCertsPerCountry <- function() {
  99 + Cert_PerCountry <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 1)
  100 +
  101 + Cert_PerCountry <- ProccesISOSurveyByCountryRaw(Cert_PerCountry, c("X2012", "X2013", "X2014", "X2015"))
  102 +
  103 + Cert_PerCountry
  104 +}
  105 +
  106 +#' Get data of sites per year and country from IS27001
  107 +#'
  108 +#' @return data.frame
  109 +#' @export
  110 +#'
  111 +#' @examples
  112 +#' Sites_PerCountry <- GetISOSurveySitesPerCountry()
  113 +GetISOSurveySitesPerCountry <- function() {
  114 + Sites_PerCountry <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 2)
  115 +
  116 + Sites_PerCountry <- ProccesISOSurveyByCountryRaw(Sites_PerCountry, c( "X2012", "X2013", "X2014", "X2015"))
  117 +
  118 + Sites_PerCountry
  119 +}
  120 +
  121 +#' Get data of certificates per year and sector from IS27001
  122 +#'
  123 +#' @return data.frame
  124 +#' @export
  125 +#'
  126 +#' @examples
  127 +#' Cert_PerSector <- GetISOSurveyCertsPerSector()
  128 +GetISOSurveyCertsPerSector <- function() {
  129 + Cert_PerSector <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 3)
  130 +
  131 + Cert_PerSector <- ProccesISOSurveyRaw(Cert_PerSector, c("X2012", "X2013", "X2014", "X2015"))
  132 +
  133 + Cert_PerSector
  134 +
  135 +}
ISO27001effectiveness/R/Util.R
1 #-----------------------------Util functions-------------------------------------------------- 1 #-----------------------------Util functions--------------------------------------------------
2 2
3 -  
4 -#' Install and load required libraries  
5 -#'  
6 -#' This function checks if every required library is installed to be loaded, if not they will be installed and then loaded.  
7 -#' Libraries installed:  
8 -#' xlsx to parse excel files like ISO survey source format  
9 -LoadParserLibraries <- function(){  
10 - if (!require("xlsx"))  
11 - {  
12 - install.packages("xlsx")  
13 - if (!require("xlsx")) stop("Error while loading package [xlsx]")  
14 - }  
15 -}  
16 -  
17 #' Return the 2 letter code of a country relation 3 #' Return the 2 letter code of a country relation
18 #' 4 #'
19 #' Relation of country names included in the ISO Survey input file with 2 letter code 5 #' Relation of country names included in the ISO Survey input file with 2 letter code
ISO27001effectiveness/data/hackmageddon/Format3/16-31 December 2014 Cyber Attacks Timeline.xlsx
No preview for this file type
ISO27001effectiveness/data/hackmageddon/Format3/16-31 Jun 2014 Cyber Attacks Timeline.xlsx deleted
No preview for this file type
ISO27001effectiveness/data/hackmageddon/Format3/1-15 Jan 2014 Cyber Attacks Timeline.xls renamed to ISO27001effectiveness/data/hackmageddon/Format3/Format3.2/1-15 Jan 2014 Cyber Attacks Timeline.xls
No preview for this file type
ISO27001effectiveness/data/hackmageddon/Format3/16-31 Jan 2014 Cyber Attacks Timeline.xlsx renamed to ISO27001effectiveness/data/hackmageddon/Format3/Format3.2/16-31 Jan 2014 Cyber Attacks Timeline.xlsx
No preview for this file type
ISO27001effectiveness/data/test/16-31 Mar 2014 Cyber Attacks Timeline.xlsx deleted
No preview for this file type
ISO27001effectiveness/data/test/16-31 Mar 2015 Cyber Attacks Timeline.xlsx deleted
No preview for this file type