Commit 2de5507d6f309c26f053e7e88787a3fa18c633bb

Authored by Miguel Tuñón
1 parent f376d0cf

Parsers working correctly

ISO27001effectiveness/DESCRIPTION
... ... @@ -10,3 +10,4 @@ License: What license is it under?
10 10 Encoding: UTF-8
11 11 LazyData: true
12 12 RoxygenNote: 5.0.1
  13 +Imports: xlsx
... ...
ISO27001effectiveness/Main.R
1   -source("./R/Util.R")
2   -LoadLibraries()
3   -source("./R/ISOSurvey_Parser.R")
4 1  
5   -Cert_PerCountry <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 1)
6   -Sites_PerCountry <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 2)
7   -Cert_PerSector <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 3)
  2 +Cert_PerCountry <- ISO27001effectiveness::GetISOSurveyCertsPerCountry()
  3 +Sites_PerCountry <- ISO27001effectiveness::GetISOSurveySitesPerCountry()
  4 +Cert_PerSector <- ISO27001effectiveness::GetISOSurveyCertsPerSector()
8 5  
9   -Cert_PerCountry <- ProccesISOSurveyByCountryRaw(Cert_PerCountry, c("X2012", "X2013", "X2014", "X2015"))
10   -Sites_PerCountry <- ProccesISOSurveyByCountryRaw(Sites_PerCountry, c( "X2012", "X2013", "X2014", "X2015"))
11   -#Cert_PerSector <- ProccesISOSurveyRaw(Cert_PerSector, c("X2012", "X2013", "X2014", "X2015"))
12   -
13   -Attacks <- GetAttacksData()
  6 +Attacks <- ISO27001effectiveness::GetDefaultAttacksData()
... ...
ISO27001effectiveness/R/Hackmageddon_Parser.R
... ... @@ -4,13 +4,27 @@
4 4  
5 5  
6 6  
  7 +#' Parse a raw Excel data file from Hackmageddon
  8 +#'
  9 +#' @param file path to the excel file
  10 +#' @param cols vector of column indexes to read
  11 +#'
  12 +#' @return data.frame
  13 +#' @export
  14 +#'
  15 +#' @examples
  16 +#' data.raw <- ParseHMExcel("./data/hackmageddon/file.xls", c(2, 3, 6, 5))
7 17 ParseHMExcel <- function(file, cols){
8   - print(file)
  18 +
  19 + #Checks
9 20 if (!file.exists(file)) {
10 21 stop(paste("Error, file [", file, "] not found"))
11 22 }
  23 +
  24 + #Parse data from Excel; the files share the same format but use different column positions
12 25 dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character"))
13 26  
  27 + #If Date is NULL there is probably a blank first row that should be skipped
14 28 if (is.null(dataset$Date)) {
15 29 dataset <- xlsx::read.xlsx2(file, 1, header = TRUE,colIndex = cols, colClasses = c("numeric", "character", "character", "character"), startRow = 2)
16 30 }
... ... @@ -18,47 +32,77 @@ ParseHMExcel &lt;- function(file, cols){
18 32 dataset
19 33 }
20 34  
21   -ProccesHMRaw <- function(dataset.raw){
  35 +
  36 +
  37 +#' Prepare raw data parsed from a Hackmageddon Excel file for use
  38 +#'
  39 +#' @param dataset.raw data.frame with raw data
  40 +#' @param dateOffset origin date to which the numeric (day count) dates are added, e.g. "1899-12-30"
  41 +#'
  42 +#' @return data.frame
  43 +#' @export
  44 +#'
  45 +#' @examples
  46 +#' data.pro <- ProccesHMRaw(data.raw, "1899-12-30")
  47 +ProccesHMRaw <- function(dataset.raw, dateOffset){
  48 +
  49 + #Give the columns standard names
  50 + dataset <- setNames(dataset.raw, c("Date", "Attack", "Target", "Country"))
22 51  
23 52 #Remove rows with Date NA
24   - print(class(dataset.raw$Date))
25   - dataset <- dataset.raw[!is.na(dataset.raw$Date),]
26   - dataset$Date <- as.POSIXct(dataset$Date*86400, tz ="GMT", origin ="1904-01-01")
  53 + dataset <- dataset[!is.na(dataset$Date),]
  54 +
  55 + #Convert the numeric date (days since dateOffset) to POSIXct
  56 + dataset$Date <- as.POSIXct(dataset$Date*86400, tz = "GMT", origin = dateOffset)
27 57  
28 58 dataset
29 59 }
30 60  
31   -#' Title
  61 +
  62 +
  63 +#' Parse every Excel file in a folder
32 64 #'
33   -#' @param folder
34   -#' @param cols
  65 +#' @param folder path to the folder to iterate
  66 +#' @param cols columns to parse in each file
  67 +#' @param dateOffset origin used to compute the dates in each file
35 68 #'
36   -#' @return
  69 +#' @return data.frame
37 70 #' @export
38 71 #'
39 72 #' @examples
40   -ParseHMFolder <- function(folder, cols){
41   - filelist <- list.files(folder)
42   - #frames <- lapply(paste(folder,filelist,sep = ""),ParseHMExcel)
  73 +#' data.pro <- ParseHMFolder("./data/hackmageddon/Format1/", c(2, 9, 7, 6), "1899-12-30")
  74 +ParseHMFolder <- function(folder, cols, dateOffset){
43 75  
44   - myFile <- paste(folder,filelist[1],sep = "")
45   - dataset <- ProccesHMRaw(ParseHMExcel(myFile, cols))
  76 + #List Excel files in the folder (note: pattern is treated as a regular expression, not a glob)
  77 + filelist <- list.files(folder, pattern = "*.xls*")
46 78  
  79 + #Iterate over the files, appending each returned data.frame (the 2:length(filelist) loop assumes the folder holds at least two files)
  80 + dataset <- ProccesHMRaw(ParseHMExcel(paste(folder,filelist[1],sep = ""), cols), dateOffset)
47 81 for (i in 2:length(filelist)) {
48   - dataset <- rbind(dataset, ProccesHMRaw(ParseHMExcel(paste(folder,filelist[i],sep = ""), cols)))
  82 + dataset <- rbind(dataset, ProccesHMRaw(ParseHMExcel(paste(folder,filelist[i],sep = ""), cols), dateOffset))
49 83 }
50 84  
51 85 dataset
52 86 }
53 87  
54   -GetAttacksData <- function() {
55   -
56   - format1 <- ParseHMFolder("./data/hackmageddon/Format1/", c(2, 9, 7, 6))
57   - format2 <- ParseHMFolder("./data/hackmageddon/Format2/", c(2,7, 5, 6))
58   - format3 <- ParseHMFolder("./data/hackmageddon/Format3/", c(2, 9, 7, 6))
59   - format4 <- ParseHMFolder("./data/hackmageddon/Format4/", c(1, 9, 5, 3))
60   -
61   - dataset <- rbind(format1, format2, format3, format4)
  88 +#' Parse the default Hackmageddon data shipped with the package (2012-2016)
  89 +#'
  90 +#' @return data.frame
  91 +#' @export
  92 +#'
  93 +#' @examples
  94 +#' Attacks <- GetDefaultAttacksData()
  95 +GetDefaultAttacksData <- function() {
  96 +
  97 + #Parsing each different folder with the correct properties
  98 + format1 <- ParseHMFolder("./data/hackmageddon/Format1/", c(2, 9, 7, 6), "1899-12-30")
  99 + format2 <- ParseHMFolder("./data/hackmageddon/Format2/", c(2, 7, 5, 3), "1904-01-01")
  100 + format3 <- ParseHMFolder("./data/hackmageddon/Format3/", c(2, 9, 7, 6), "1904-01-01")
  101 + format3.2 <- ParseHMFolder("./data/hackmageddon/Format3/Format3.2/", c(2, 9, 7, 6), "1899-12-30")
  102 + format4 <- ParseHMFolder("./data/hackmageddon/Format4/", c(1, 9, 5, 3), "1899-12-30")
  103 +
  104 + #Append every data.frame, all in the standard format
  105 + dataset <- rbind(format1, format2, format3, format3.2, format4)
62 106  
63 107 dataset
64 108 }
... ...
ISO27001effectiveness/R/ISOSurvey_Parser.R
... ... @@ -17,10 +17,12 @@
17 17 #' Cert_PerSector <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 3)
18 18 ParseExcelFileRaw <- function(file, sheet){
19 19  
  20 + #Checks
20 21 if (!file.exists(file)) {
21 22 stop(paste("Error, file [", file, "] not found"))
22 23 }
23 24  
  25 + #Parse the excel file
24 26 dataset <- xlsx::read.xlsx2(file, sheet,colClasses = c("character","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric"), header = TRUE, as.data.frame = TRUE)
25 27  
26 28 dataset
... ... @@ -28,9 +30,41 @@ ParseExcelFileRaw &lt;- function(file, sheet){
28 30  
29 31  
30 32  
31   -#' Process raw data from ISO survey
  33 +#' Process raw data parsed from the ISO 27001 survey Excel file
  34 +#'
  35 +#' @param dataset.raw data.frame with raw data
  36 +#' @param years vector of years to include, each prefixed with an X (e.g. "X2012")
  37 +#'
  38 +#' @return data.frame
  39 +#' @export
  40 +#'
  41 +#' @examples
  42 +#' Cert_PerSector <- ProccesISOSurveyRaw(Cert_PerSector, c("X2010", "X2011", "X2012", "X2013", "X2014", "X2015"))
  43 +ProccesISOSurveyRaw <- function(dataset.raw, years){
  44 +
  45 + #Change NAs to zeros
  46 + dataset <- dataset.raw
  47 + dataset[is.na(dataset)] <- 0
  48 +
  49 + vars <- names(dataset)
  50 +
  51 + #Remove blank rows
  52 + if ("INDUSTRIAL.SECTOR" == vars[1]) {
  53 + dataset <- dataset[!dataset$INDUSTRIAL.SECTOR == "",]
  54 + }else if ("Country" == vars[1]) {
  55 + dataset <- dataset[!dataset$Country == "",]
  56 + }
  57 +
  58 + #Remove years not included in years param
  59 + years_checked <- intersect(vars, c("Country", "INDUSTRIAL.SECTOR", years))
  60 + dataset <- dataset[,years_checked]
  61 +
  62 +
  63 + dataset
  64 +}
  65 +
  66 +#' Process raw ISO Survey data and join it with the 2-letter country codes
32 67 #'
33   -#' Proccess the raw data from ISO survey to replace NAs, normalizate country names and filter years
34 68 #' @param dataset.raw raw data from ISO Survey excel file
35 69 #' @param years List of years to return, c("X2006", "X2010", ...)
36 70 #'
... ... @@ -40,21 +74,62 @@ ParseExcelFileRaw &lt;- function(file, sheet){
40 74 #'
41 75 #' Cert_PerCountry <- ProccesISOSurveyByCountryRaw(Cert_PerCountry, c("X2010", "X2011", "X2012", "X2013", "X2014", "X2015"))
42 76 #' Sites_PerCountry <- ProccesISOSurveyByCountryRaw(Sites_PerCountry, c("X2010", "X2011", "X2012", "X2013", "X2014", "X2015"))
43   -#' Cert_PerSector <- ProccesISOSurveyRaw(Cert_PerSector, c("X2010", "X2011", "X2012", "X2013", "X2014", "X2015"))
44 77 ProccesISOSurveyByCountryRaw <- function(dataset.raw, years){
45 78  
46   - #NAs to 0s
47   - dataset <- dataset.raw
48   - dataset[is.na(dataset)] <- 0
  79 + #Standard processing
  80 + dataset <- ProccesISOSurveyRaw(dataset.raw, years)
49 81  
50 82 #Translate country names to 2 letter code
51 83 CountryNames <- GetCountryAbrev()
52   -
53 84 dataset <- merge(x = dataset, y = CountryNames, by = "Country", all.x = TRUE)
54 85  
55   - vars <- names(dataset)
56   - years_checked <- intersect(vars, years)
57   - dataset <- dataset[,c(c("Country", "country_short"), years_checked)]
58   -
59 86 dataset
60 87 }
  88 +
  89 +
  90 +
  91 +#' Get data of certificates per year and country from the ISO 27001 survey
  92 +#'
  93 +#' @return data.frame
  94 +#' @export
  95 +#'
  96 +#' @examples
  97 +#' Cert_PerCountry <- GetISOSurveyCertsPerCountry()
  98 +GetISOSurveyCertsPerCountry <- function() {
  99 + Cert_PerCountry <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 1)
  100 +
  101 + Cert_PerCountry <- ProccesISOSurveyByCountryRaw(Cert_PerCountry, c("X2012", "X2013", "X2014", "X2015"))
  102 +
  103 + Cert_PerCountry
  104 +}
  105 +
  106 +#' Get data of sites per year and country from the ISO 27001 survey
  107 +#'
  108 +#' @return data.frame
  109 +#' @export
  110 +#'
  111 +#' @examples
  112 +#' Sites_PerCountry <- GetISOSurveySitesPerCountry()
  113 +GetISOSurveySitesPerCountry <- function() {
  114 + Sites_PerCountry <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 2)
  115 +
  116 + Sites_PerCountry <- ProccesISOSurveyByCountryRaw(Sites_PerCountry, c( "X2012", "X2013", "X2014", "X2015"))
  117 +
  118 + Sites_PerCountry
  119 +}
  120 +
  121 +#' Get data of certificates per year and sector from the ISO 27001 survey
  122 +#'
  123 +#' @return data.frame
  124 +#' @export
  125 +#'
  126 +#' @examples
  127 +#' Cert_PerSector <- GetISOSurveyCertsPerSector()
  128 +GetISOSurveyCertsPerSector <- function() {
  129 + Cert_PerSector <- ParseExcelFileRaw("./data/ISO/iso_27001_iso_survey2015_preprocessed.xlsx", 3)
  130 +
  131 + Cert_PerSector <- ProccesISOSurveyRaw(Cert_PerSector, c("X2012", "X2013", "X2014", "X2015"))
  132 +
  133 + Cert_PerSector
  134 +
  135 +}
... ...
ISO27001effectiveness/R/Util.R
1 1 #-----------------------------Util functions--------------------------------------------------
2 2  
3   -
4   -#' Install and load required libraries
5   -#'
6   -#' This function checks if every required library is installed to be loaded, if not they will be installed and then loaded.
7   -#' Libraries installed:
8   -#' xlsx to parse excel files like ISO survey source format
9   -LoadParserLibraries <- function(){
10   - if (!require("xlsx"))
11   - {
12   - install.packages("xlsx")
13   - if (!require("xlsx")) stop("Error while loading package [xlsx]")
14   - }
15   -}
16   -
17 3 #' Return the 2 letter code of a country relation
18 4 #'
19 5 #' Relation of country names included in the ISO Survey input file with 2 letter code
... ...
ISO27001effectiveness/data/hackmageddon/Format3/16-31 December 2014 Cyber Attacks Timeline.xlsx
No preview for this file type
ISO27001effectiveness/data/hackmageddon/Format3/16-31 Jun 2014 Cyber Attacks Timeline.xlsx deleted
No preview for this file type
ISO27001effectiveness/data/hackmageddon/Format3/1-15 Jan 2014 Cyber Attacks Timeline.xls renamed to ISO27001effectiveness/data/hackmageddon/Format3/Format3.2/1-15 Jan 2014 Cyber Attacks Timeline.xls
No preview for this file type
ISO27001effectiveness/data/hackmageddon/Format3/16-31 Jan 2014 Cyber Attacks Timeline.xlsx renamed to ISO27001effectiveness/data/hackmageddon/Format3/Format3.2/16-31 Jan 2014 Cyber Attacks Timeline.xlsx
No preview for this file type
ISO27001effectiveness/data/test/16-31 Mar 2014 Cyber Attacks Timeline.xlsx deleted
No preview for this file type
ISO27001effectiveness/data/test/16-31 Mar 2015 Cyber Attacks Timeline.xlsx deleted
No preview for this file type