Commit 2de5507d6f309c26f053e7e88787a3fa18c633bb
1 parent
f376d0cf
Parsers working correctly
Showing
11 changed files
with
158 additions
and
59 deletions
ISO27001effectiveness/DESCRIPTION
ISO27001effectiveness/Main.R
1 | -source("./R/Util.R") | |
2 | -LoadLibraries() | |
3 | -source("./R/ISOSurvey_Parser.R") | |
4 | 1 | |
5 | -Cert_PerCountry <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 1) | |
6 | -Sites_PerCountry <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 2) | |
7 | -Cert_PerSector <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 3) | |
2 | +Cert_PerCountry <- ISO27001effectiveness::GetISOSurveyCertsPerCountry() | |
3 | +Sites_PerCountry <- ISO27001effectiveness::GetISOSurveySitesPerCountry() | |
4 | +Cert_PerSector <- ISO27001effectiveness::GetISOSurveyCertsPerSector() | |
8 | 5 | |
9 | -Cert_PerCountry <- ProccesISOSurveyByCountryRaw(Cert_PerCountry, c("X2012", "X2013", "X2014", "X2015")) | |
10 | -Sites_PerCountry <- ProccesISOSurveyByCountryRaw(Sites_PerCountry, c( "X2012", "X2013", "X2014", "X2015")) | |
11 | -#Cert_PerSector <- ProccesISOSurveyRaw(Cert_PerSector, c("X2012", "X2013", "X2014", "X2015")) | |
12 | - | |
13 | -Attacks <- GetAttacksData() | |
6 | +Attacks <- ISO27001effectiveness::GetDefaultAttacksData() | ... | ... |
ISO27001effectiveness/R/Hackmageddon_Parser.R
... | ... | @@ -4,13 +4,27 @@ |
4 | 4 | |
5 | 5 | |
6 | 6 | |
7 | +#' Parse a raw Excel data file from Hackmageddon | |
8 | +#' | |
9 | +#' @param file path to the excel file | |
10 | +#' @param cols list of columns index to read | |
11 | +#' | |
12 | +#' @return data.frame | |
13 | +#' @export | |
14 | +#' | |
15 | +#' @examples | |
16 | +#' data.raw <- ParseHMExcel("./data/hackmaggedon/file.xls", c(2, 3, 6, 5)) | |
7 | 17 | ParseHMExcel <- function(file, cols){ |
8 | - print(file) | |
18 | + | |
19 | + #Checks | |
9 | 20 | if (!file.exists(file)) { |
10 | 21 | stop(paste("Error, file [", file, "] not found")) |
11 | 22 | } |
23 | + | |
24 | + #Parse data from Excel; the files share the same format but the column indexes differ between them | |
12 | 25 | dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character")) |
13 | 26 | |
27 | + #If Date is NULL there is probably a blank first row that should be skipped | |
14 | 28 | if (is.null(dataset$Date)) { |
15 | 29 | dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character"), startRow = 2) |
16 | 30 | } |
... | ... | @@ -18,47 +32,77 @@ ParseHMExcel <- function(file, cols){ |
18 | 32 | dataset |
19 | 33 | } |
20 | 34 | |
21 | -ProccesHMRaw <- function(dataset.raw){ | |
35 | + | |
36 | + | |
37 | +#' Prepare raw data from Hackmageddon's Excel files for use | |
38 | +#' | |
39 | +#' @param dataset.raw data.frame with raw data | |
40 | +#' @param dateOffset origin date to which the numeric date (in days) is added | |
41 | +#' | |
42 | +#' @return data.frame | |
43 | +#' @export | |
44 | +#' | |
45 | +#' @examples | |
46 | +#' data.pro <- ProccesHMRaw(data.raw, "1899-12-30") | |
47 | +ProccesHMRaw <- function(dataset.raw, dateOffset){ | |
48 | + | |
49 | + #Give the columns standard names | |
50 | + dataset <- setNames(dataset.raw, c("Date", "Attack", "Target", "Country")) | |
22 | 51 | |
23 | 52 | #Remove rows with Date NA |
24 | - print(class(dataset.raw$Date)) | |
25 | - dataset <- dataset.raw[!is.na(dataset.raw$Date),] | |
26 | - dataset$Date <- as.POSIXct(dataset$Date*86400, tz ="GMT", origin ="1904-01-01") | |
53 | + dataset <- dataset[!is.na(dataset$Date),] | |
54 | + | |
55 | + #Format properly the date | |
56 | + dataset$Date <- as.POSIXct(dataset$Date*86400, tz = "GMT", origin = dateOffset) | |
27 | 57 | |
28 | 58 | dataset |
29 | 59 | } |
30 | 60 | |
31 | -#' Title | |
61 | + | |
62 | + | |
63 | +#' Parse every Excel file in a folder | |
32 | 64 | #' |
33 | -#' @param folder | |
34 | -#' @param cols | |
65 | +#' @param folder path to the folder to iterate | |
66 | +#' @param cols column indexes to read from each file | |
67 | +#' @param dateOffset origin date used to convert the numeric dates in each file | |
35 | 68 | #' |
36 | -#' @return | |
69 | +#' @return data.frame | |
37 | 70 | #' @export |
38 | 71 | #' |
39 | 72 | #' @examples |
40 | -ParseHMFolder <- function(folder, cols){ | |
41 | - filelist <- list.files(folder) | |
42 | - #frames <- lapply(paste(folder,filelist,sep = ""),ParseHMExcel) | |
73 | +#' data.pro <- ParseHMFolder("./data/hackmageddon/Format4/", c(1, 9, 5, 3), "1899-12-30") | |
74 | +ParseHMFolder <- function(folder, cols, dateOffset){ | |
43 | 75 | |
44 | - myFile <- paste(folder,filelist[1],sep = "") | |
45 | - dataset <- ProccesHMRaw(ParseHMExcel(myFile, cols)) | |
76 | + #List excel files into the folder | |
77 | + filelist <- list.files(folder, pattern = "*.xls*") | |
46 | 78 | |
79 | + #Iterate for each file appending the returned data.frame | |
80 | + dataset <- ProccesHMRaw(ParseHMExcel(paste(folder,filelist[1],sep = ""), cols), dateOffset) | |
47 | 81 | for (i in 2:length(filelist)) { |
48 | - dataset <- rbind(dataset, ProccesHMRaw(ParseHMExcel(paste(folder,filelist[i],sep = ""), cols))) | |
82 | + dataset <- rbind(dataset, ProccesHMRaw(ParseHMExcel(paste(folder,filelist[i],sep = ""), cols), dateOffset)) | |
49 | 83 | } |
50 | 84 | |
51 | 85 | dataset |
52 | 86 | } |
53 | 87 | |
54 | -GetAttacksData <- function() { | |
55 | - | |
56 | - format1 <- ParseHMFolder("./data/hackmageddon/Format1/", c(2, 9, 7, 6)) | |
57 | - format2 <- ParseHMFolder("./data/hackmageddon/Format2/", c(2,7, 5, 6)) | |
58 | - format3 <- ParseHMFolder("./data/hackmageddon/Format3/", c(2, 9, 7, 6)) | |
59 | - format4 <- ParseHMFolder("./data/hackmageddon/Format4/", c(1, 9, 5, 3)) | |
60 | - | |
61 | - dataset <- rbind(format1, format2, format3, format4) | |
88 | +#' Parse the default Hackmageddon data bundled with the package (2012-2016) | |
89 | +#' | |
90 | +#' @return data.frame | |
91 | +#' @export | |
92 | +#' | |
93 | +#' @examples | |
94 | +#' Attacks <- GetDefaultAttacksData() | |
95 | +GetDefaultAttacksData <- function() { | |
96 | + | |
97 | + #Parsing each different folder with the correct properties | |
98 | + format1 <- ParseHMFolder("./data/hackmageddon/Format1/", c(2, 9, 7, 6), "1899-12-30") | |
99 | + format2 <- ParseHMFolder("./data/hackmageddon/Format2/", c(2, 7, 5, 3), "1904-01-01") | |
100 | + format3 <- ParseHMFolder("./data/hackmageddon/Format3/", c(2, 9, 7, 6), "1904-01-01") | |
101 | + format3.2 <- ParseHMFolder("./data/hackmageddon/Format3/Format3.2/", c(2, 9, 7, 6), "1899-12-30") | |
102 | + format4 <- ParseHMFolder("./data/hackmageddon/Format4/", c(1, 9, 5, 3), "1899-12-30") | |
103 | + | |
104 | + #Append every data.frame, all in the standard format | |
105 | + dataset <- rbind(format1, format2, format3, format3.2, format4) | |
62 | 106 | |
63 | 107 | dataset |
64 | 108 | } | ... | ... |
ISO27001effectiveness/R/ISOSurvey_Parser.R
... | ... | @@ -17,10 +17,12 @@ |
17 | 17 | #' Cert_PerSector <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 3) |
18 | 18 | ParseExcelFileRaw <- function(file, sheet){ |
19 | 19 | |
20 | + #Checks | |
20 | 21 | if (!file.exists(file)) { |
21 | 22 | stop(paste("Error, file [", file, "] not found")) |
22 | 23 | } |
23 | 24 | |
25 | + #Parse the excel file | |
24 | 26 | dataset <- xlsx::read.xlsx2(file, sheet,colClasses = c("character","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric"), header = TRUE, as.data.frame = TRUE) |
25 | 27 | |
26 | 28 | dataset |
... | ... | @@ -28,9 +30,41 @@ ParseExcelFileRaw <- function(file, sheet){ |
28 | 30 | |
29 | 31 | |
30 | 32 | |
31 | -#' Process raw data from ISO survey | |
33 | +#' Process raw data parsed from the ISO 27001 survey Excel file | |
34 | +#' | |
35 | +#' @param dataset.raw data.frame with raw data | |
36 | +#' @param years vector of years to include, each prefixed with an X (e.g. "X2012") | |
37 | +#' | |
38 | +#' @return data.frame | |
39 | +#' @export | |
40 | +#' | |
41 | +#' @examples | |
42 | +#' Cert_PerSector <- ProccesISOSurveyRaw(Cert_PerSector, c("X2010", "X2011", "X2012", "X2013", "X2014", "X2015")) | |
43 | +ProccesISOSurveyRaw <- function(dataset.raw, years){ | |
44 | + | |
45 | + #Change NAs to zeros | |
46 | + dataset <- dataset.raw | |
47 | + dataset[is.na(dataset)] <- 0 | |
48 | + | |
49 | + vars <- names(dataset) | |
50 | + | |
51 | + #Remove blank rows | |
52 | + if ("INDUSTRIAL.SECTOR" == vars[1]) { | |
53 | + dataset <- dataset[!dataset$INDUSTRIAL.SECTOR == "",] | |
54 | + }else if ("Country" == vars[1]) { | |
55 | + dataset <- dataset[!dataset$Country == "",] | |
56 | + } | |
57 | + | |
58 | + #Remove years not included in years param | |
59 | + years_checked <- intersect(vars, c("Country", "INDUSTRIAL.SECTOR", years)) | |
60 | + dataset <- dataset[,years_checked] | |
61 | + | |
62 | + | |
63 | + dataset | |
64 | +} | |
65 | + | |
66 | +#' Process raw ISO Survey data and join it with the two-letter country codes | |
32 | 67 | #' |
33 | -#' Proccess the raw data from ISO survey to replace NAs, normalizate country names and filter years | |
34 | 68 | #' @param dataset.raw raw data from ISO Survey excel file |
35 | 69 | #' @param years List of years to return, c("X2006", "X2010", ...) |
36 | 70 | #' |
... | ... | @@ -40,21 +74,62 @@ ParseExcelFileRaw <- function(file, sheet){ |
40 | 74 | #' |
41 | 75 | #' Cert_PerCountry <- ProccesISOSurveyRaw(Cert_PerCountry, c("X2010", "X2011", "X2012", "X2013", "X2014", "X2015")) |
42 | 76 | #' Sites_PerCountry <- ProccesISOSurveyRaw(Sites_PerCountry, c("X2010", "X2011", "X2012", "X2013", "X2014", "X2015")) |
43 | -#' Cert_PerSector <- ProccesISOSurveyRaw(Cert_PerSector, c("X2010", "X2011", "X2012", "X2013", "X2014", "X2015")) | |
44 | 77 | ProccesISOSurveyByCountryRaw <- function(dataset.raw, years){ |
45 | 78 | |
46 | - #NAs to 0s | |
47 | - dataset <- dataset.raw | |
48 | - dataset[is.na(dataset)] <- 0 | |
79 | + #Standard processing | |
80 | + dataset <- ProccesISOSurveyRaw(dataset.raw, years) | |
49 | 81 | |
50 | 82 | #Translate country names to 2 letter code |
51 | 83 | CountryNames <- GetCountryAbrev() |
52 | - | |
53 | 84 | dataset <- merge(x = dataset, y = CountryNames, by = "Country", all.x = TRUE) |
54 | 85 | |
55 | - vars <- names(dataset) | |
56 | - years_checked <- intersect(vars, years) | |
57 | - dataset <- dataset[,c(c("Country", "country_short"), years_checked)] | |
58 | - | |
59 | 86 | dataset |
60 | 87 | } |
88 | + | |
89 | + | |
90 | + | |
91 | +#' Get data of certificates per year and country from the ISO 27001 survey | |
92 | +#' | |
93 | +#' @return data.frame | |
94 | +#' @export | |
95 | +#' | |
96 | +#' @examples | |
97 | +#' Cert_PerCountry <- GetISOSurveyCertsPerCountry() | |
98 | +GetISOSurveyCertsPerCountry <- function() { | |
99 | + Cert_PerCountry <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 1) | |
100 | + | |
101 | + Cert_PerCountry <- ProccesISOSurveyByCountryRaw(Cert_PerCountry, c("X2012", "X2013", "X2014", "X2015")) | |
102 | + | |
103 | + Cert_PerCountry | |
104 | +} | |
105 | + | |
106 | +#' Get data of sites per year and country from the ISO 27001 survey | |
107 | +#' | |
108 | +#' @return data.frame | |
109 | +#' @export | |
110 | +#' | |
111 | +#' @examples | |
112 | +#' Sites_PerCountry <- GetISOSurveySitesPerCountry() | |
113 | +GetISOSurveySitesPerCountry <- function() { | |
114 | + Sites_PerCountry <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 2) | |
115 | + | |
116 | + Sites_PerCountry <- ProccesISOSurveyByCountryRaw(Sites_PerCountry, c( "X2012", "X2013", "X2014", "X2015")) | |
117 | + | |
118 | + Sites_PerCountry | |
119 | +} | |
120 | + | |
121 | +#' Get data of certificates per year and sector from the ISO 27001 survey | |
122 | +#' | |
123 | +#' @return data.frame | |
124 | +#' @export | |
125 | +#' | |
126 | +#' @examples | |
127 | +#' Cert_PerSector <- GetISOSurveyCertsPerSector() | |
128 | +GetISOSurveyCertsPerSector <- function() { | |
129 | + Cert_PerSector <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 3) | |
130 | + | |
131 | + Cert_PerSector <- ProccesISOSurveyRaw(Cert_PerSector, c("X2012", "X2013", "X2014", "X2015")) | |
132 | + | |
133 | + Cert_PerSector | |
134 | + | |
135 | +} | ... | ... |
ISO27001effectiveness/R/Util.R
1 | 1 | #-----------------------------Util functions-------------------------------------------------- |
2 | 2 | |
3 | - | |
4 | -#' Install and load required libraries | |
5 | -#' | |
6 | -#' This function checks if every required library is installed to be loaded, if not they will be installed and then loaded. | |
7 | -#' Libraries installed: | |
8 | -#' xlsx to parse excel files like ISO survey source format | |
9 | -LoadParserLibraries <- function(){ | |
10 | - if (!require("xlsx")) | |
11 | - { | |
12 | - install.packages("xlsx") | |
13 | - if (!require("xlsx")) stop("Error while loading package [xlsx]") | |
14 | - } | |
15 | -} | |
16 | - | |
17 | 3 | #' Return the 2 letter code of a country relation |
18 | 4 | #' |
19 | 5 | #' Relation of country names included in the ISO Survey input file with 2 letter code | ... | ... |
ISO27001effectiveness/data/hackmageddon/Format3/16-31 December 2014 Cyber Attacks Timeline.xlsx
No preview for this file type
ISO27001effectiveness/data/hackmageddon/Format3/16-31 Jun 2014 Cyber Attacks Timeline.xlsx deleted
No preview for this file type
ISO27001effectiveness/data/hackmageddon/Format3/1-15 Jan 2014 Cyber Attacks Timeline.xls renamed to ISO27001effectiveness/data/hackmageddon/Format3/Format3.2/1-15 Jan 2014 Cyber Attacks Timeline.xls
No preview for this file type
ISO27001effectiveness/data/hackmageddon/Format3/16-31 Jan 2014 Cyber Attacks Timeline.xlsx renamed to ISO27001effectiveness/data/hackmageddon/Format3/Format3.2/16-31 Jan 2014 Cyber Attacks Timeline.xlsx
No preview for this file type
ISO27001effectiveness/data/test/16-31 Mar 2014 Cyber Attacks Timeline.xlsx deleted
No preview for this file type
ISO27001effectiveness/data/test/16-31 Mar 2015 Cyber Attacks Timeline.xlsx deleted
No preview for this file type