# Title     : bpic_15_enrich_data.R
# Objective : Script to clean and enrich BPIC 15 data (step 1)
# Created by: G. van Hulzen

require(dplyr)
require(lubridate)
require(stringr)
require(fastDummies)

# Make the directory that contains this script the working directory, so the
# relative "data/..." paths used below resolve.
#   1. Inside RStudio, use the path of the active document.
#   2. Otherwise (e.g. `Rscript bpic_15_enrich_data.R`), use the `--file=`
#      command-line argument.
#   3. If neither yields an existing directory, keep the current directory.
# `wdir` always ends up as a single existing directory path.
wdir <- local({
  resolved <- ""

  if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
    # An unsaved document reports an empty path, which must not reach setwd().
    doc_path <- rstudioapi::getActiveDocumentContext()$path
    if (isTRUE(nzchar(doc_path))) {
      resolved <- dirname(doc_path)
    }
  }

  if (!nzchar(resolved)) {
    file_arg <- grep("^--file=", commandArgs(trailingOnly = FALSE), value = TRUE)
    if (length(file_arg) > 0) {
      # Rscript encodes spaces in the script path as "~+~".
      script_path <- gsub("~+~", " ", sub("^--file=", "", file_arg[1]), fixed = TRUE)
      resolved <- dirname(normalizePath(script_path, mustWork = FALSE))
    }
  }

  if (!nzchar(resolved) || !dir.exists(resolved)) {
    resolved <- getwd()
  }
  resolved
})
setwd(wdir)

# Read the five gzip-compressed BPIC 15 event logs (one per municipality).
# Every log is read with identical settings: first row is the header and
# strings become factors.
# NOTE(review): the read.table documentation states that `fileEncoding` applies
# only when the file is given as a character string, not as an existing
# connection such as gzfile(); confirm whether BOM handling is needed here.
read_bpic_log <- function(path) {
  read.csv(
    gzfile(path),
    header = TRUE,
    encoding = "UTF-8",
    fileEncoding = "UTF-8-BOM",
    stringsAsFactors = TRUE
  )
}

bpi_15_1 <- read_bpic_log("data/BPIC15_1.csv.gz")
bpi_15_2 <- read_bpic_log("data/BPIC15_2.csv.gz")
bpi_15_3 <- read_bpic_log("data/BPIC15_3.csv.gz")
bpi_15_4 <- read_bpic_log("data/BPIC15_4.csv.gz")
bpi_15_5 <- read_bpic_log("data/BPIC15_5.csv.gz")

### Municipality 1 ###

# Municipality 1: keep the ten event attributes used downstream under
# project-style names, derive activity/phase/weekday features, and expand the
# comma-separated case parts into one indicator column per part.
bpi_15_1_cleaned <- bpi_15_1 %>%
  # select() renames while selecting; output order follows argument order.
  select(
    CaseID = Case.ID,
    Activity = activityNameEN,
    CompleteTimestamp = Complete.Timestamp,
    Resource,
    CaseProcedure = X.case..caseProcedure,
    CaseStatus = X.case..caseStatus,
    CaseParts = X.case..parts,
    ActivityCode = concept.name,
    ResponsibleResource = X.case..Responsible_actor,
    MonitoringResource = monitoringResource
  ) %>%
  mutate(
    # A missing responsible actor is recorded as 0.
    ResponsibleResource = ifelse(is.na(ResponsibleResource), 0, ResponsibleResource),
    # First "<2 digits>_<token>_" match in the code, without the trailing "_".
    ActivityCodeAgg = sub("_$", "", str_extract(ActivityCode, "\\d{2}_([A-Z|\\d]+)_")),
    # First 2-3 digit run directly after an underscore, plus up to 6 more chars.
    ActivityOrder = str_extract(ActivityCode, "(?<=_)(\\d{2,3}.{0,6})"),
    # "phase" + first character of ActivityOrder; an NA order yields "phaseNA".
    Phase = paste0("phase", str_sub(ActivityOrder, 1, 1))
  ) %>%
  mutate(
    CompleteTimestamp = ymd_hms(CompleteTimestamp),
    CaseParts = as.character(CaseParts),
    across(
      c(CaseID, Resource, ResponsibleResource, MonitoringResource,
        ActivityCodeAgg, ActivityOrder, Phase),
      as.factor
    ),
    # Numeric weekday with the week starting on Monday (week_start = 1).
    Weekday = as.factor(wday(CompleteTimestamp, label = FALSE, week_start = 1))
  ) %>%
  # One "CaseParts_<part>" column per distinct part; NA gets no own column.
  dummy_columns(select_columns = "CaseParts", split = ",", ignore_na = TRUE)

# Municipality 1: give the generated case-part indicator columns
# identifier-friendly names (new name = generated name) and add two roll-up
# indicators. The rename fails if a listed generated column is absent.
bpi_15_1_cleaned <- bpi_15_1_cleaned %>%
  rename(all_of(c(
    CaseParts_Aanleg = "CaseParts_Aanleg (Uitvoeren werk of werkzaamheid)",
    CaseParts_Bouw = "CaseParts_Bouw",
    CaseParts_Brandveilig_Melding = "CaseParts_Brandveilig gebruik (melding)",
    CaseParts_Brandveilig_Vergunning = "CaseParts_Brandveilig gebruik (vergunning)",
    CaseParts_FloraFauna = "CaseParts_Flora en Fauna",
    CaseParts_Gebiedsbescherming = "CaseParts_Gebiedsbescherming",
    CaseParts_HandelenInStrijd = "CaseParts_Handelen in strijd met regels RO",
    CaseParts_Kap = "CaseParts_Kap",
    CaseParts_InritUitweg = "CaseParts_Inrit/Uitweg",
    CaseParts_Milieu_Melding = "CaseParts_Milieu (melding)",
    CaseParts_Milieu_Neutraal = "CaseParts_Milieu (neutraal wijziging)",
    CaseParts_Milieu_Omgevingsvergunning = "CaseParts_Milieu (omgevingsvergunning beperkte milieutoets)",
    CaseParts_Milieu_Vergunning = "CaseParts_Milieu (vergunning)",
    CaseParts_Monument = "CaseParts_Monument",
    CaseParts_Reclame = "CaseParts_Reclame",
    CaseParts_Sloop = "CaseParts_Sloop"
  ))) %>%
  mutate(
    # 1 when any "Brandveilig" indicator is set, 0 when none is (NA propagates).
    CaseParts_Brandveilig = as.integer(
      CaseParts_Brandveilig_Melding == 1 | CaseParts_Brandveilig_Vergunning == 1
    ),
    # 1 when any "Milieu" indicator is set, 0 when none is (NA propagates).
    CaseParts_Milieu = as.integer(
      CaseParts_Milieu_Melding == 1 |
        CaseParts_Milieu_Neutraal == 1 |
        CaseParts_Milieu_Omgevingsvergunning == 1 |
        CaseParts_Milieu_Vergunning == 1
    )
  )

# Drop every event belonging to case 3256221, then store the result.
# NOTE(review): the original comment gave the reason as 'Remove Case Status
# "T"'; the filter itself is on the case ID, not on CaseStatus - presumably
# this is the only such case, verify against the data.
bpi_15_1_cleaned <- filter(bpi_15_1_cleaned, CaseID != "3256221")

saveRDS(bpi_15_1_cleaned, file = "data/bpi_15_1_cleaned.RDS")

### Municipality 2 ###

# Municipality 2: keep the ten event attributes used downstream under
# project-style names, derive activity/phase/weekday features, and expand the
# comma-separated case parts into one indicator column per part.
bpi_15_2_cleaned <- bpi_15_2 %>%
  # select() renames while selecting; output order follows argument order.
  select(
    CaseID = Case.ID,
    Activity = activityNameEN,
    CompleteTimestamp = Complete.Timestamp,
    Resource,
    CaseProcedure = X.case..caseProcedure,
    CaseStatus = X.case..caseStatus,
    CaseParts = X.case..parts,
    ActivityCode = concept.name,
    ResponsibleResource = X.case..Responsible_actor,
    MonitoringResource = monitoringResource
  ) %>%
  mutate(
    # A missing responsible actor is recorded as 0.
    ResponsibleResource = ifelse(is.na(ResponsibleResource), 0, ResponsibleResource),
    # First "<2 digits>_<token>_" match in the code, without the trailing "_".
    ActivityCodeAgg = sub("_$", "", str_extract(ActivityCode, "\\d{2}_([A-Z|\\d]+)_")),
    # First 2-3 digit run directly after an underscore, plus up to 6 more chars.
    ActivityOrder = str_extract(ActivityCode, "(?<=_)(\\d{2,3}.{0,6})"),
    # "phase" + first character of ActivityOrder; an NA order yields "phaseNA".
    Phase = paste0("phase", str_sub(ActivityOrder, 1, 1))
  ) %>%
  mutate(
    CompleteTimestamp = ymd_hms(CompleteTimestamp),
    CaseParts = as.character(CaseParts),
    across(
      c(CaseID, Resource, ResponsibleResource, MonitoringResource,
        ActivityCodeAgg, ActivityOrder, Phase),
      as.factor
    ),
    # Numeric weekday with the week starting on Monday (week_start = 1).
    Weekday = as.factor(wday(CompleteTimestamp, label = FALSE, week_start = 1))
  ) %>%
  # One "CaseParts_<part>" column per distinct part; NA gets no own column.
  dummy_columns(select_columns = "CaseParts", split = ",", ignore_na = TRUE)

# Municipality 2: give the generated case-part indicator columns
# identifier-friendly names (new name = generated name) and add two roll-up
# indicators. The rename fails if a listed generated column is absent.
bpi_15_2_cleaned <- bpi_15_2_cleaned %>%
  rename(all_of(c(
    CaseParts_Aanleg = "CaseParts_Aanleg (Uitvoeren werk of werkzaamheid)",
    CaseParts_Bouw = "CaseParts_Bouw",
    CaseParts_Brandveilig_Melding = "CaseParts_Brandveilig gebruik (melding)",
    CaseParts_Brandveilig_Vergunning = "CaseParts_Brandveilig gebruik (vergunning)",
    CaseParts_Gebiedsbescherming = "CaseParts_Gebiedsbescherming",
    CaseParts_HandelenInStrijd = "CaseParts_Handelen in strijd met regels RO",
    CaseParts_Kap = "CaseParts_Kap",
    CaseParts_InritUitweg = "CaseParts_Inrit/Uitweg",
    CaseParts_Milieu_Melding = "CaseParts_Milieu (melding)",
    CaseParts_Milieu_Neutraal = "CaseParts_Milieu (neutraal wijziging)",
    CaseParts_Milieu_Omgevingsvergunning = "CaseParts_Milieu (omgevingsvergunning beperkte milieutoets)",
    CaseParts_Milieu_Vergunning = "CaseParts_Milieu (vergunning)",
    CaseParts_Monument = "CaseParts_Monument",
    CaseParts_Reclame = "CaseParts_Reclame",
    CaseParts_Sloop = "CaseParts_Sloop"
  ))) %>%
  mutate(
    # 1 when any "Brandveilig" indicator is set, 0 when none is (NA propagates).
    CaseParts_Brandveilig = as.integer(
      CaseParts_Brandveilig_Melding == 1 | CaseParts_Brandveilig_Vergunning == 1
    ),
    # 1 when any "Milieu" indicator is set, 0 when none is (NA propagates).
    CaseParts_Milieu = as.integer(
      CaseParts_Milieu_Melding == 1 |
        CaseParts_Milieu_Neutraal == 1 |
        CaseParts_Milieu_Omgevingsvergunning == 1 |
        CaseParts_Milieu_Vergunning == 1
    )
  )

# Store the cleaned municipality 2 log as an RDS file.
saveRDS(bpi_15_2_cleaned, file = "data/bpi_15_2_cleaned.RDS")

### Municipality 3 ###

# Municipality 3: keep the ten event attributes used downstream under
# project-style names, derive activity/phase/weekday features, and expand the
# comma-separated case parts into one indicator column per part.
bpi_15_3_cleaned <- bpi_15_3 %>%
  # select() renames while selecting; output order follows argument order.
  select(
    CaseID = Case.ID,
    Activity = activityNameEN,
    CompleteTimestamp = Complete.Timestamp,
    Resource,
    CaseProcedure = caseProcedure,
    CaseStatus = caseStatus,
    CaseParts = parts,
    ActivityCode = concept.name,
    ResponsibleResource = Responsible_actor,
    MonitoringResource = monitoringResource
  ) %>%
  mutate(
    # A missing responsible actor is recorded as 0.
    ResponsibleResource = ifelse(is.na(ResponsibleResource), 0, ResponsibleResource),
    # First "<2 digits>_<token>_" match in the code, without the trailing "_".
    ActivityCodeAgg = sub("_$", "", str_extract(ActivityCode, "\\d{2}_([A-Z|\\d]+)_")),
    # First 2-3 digit run directly after an underscore, plus up to 6 more chars.
    ActivityOrder = str_extract(ActivityCode, "(?<=_)(\\d{2,3}.{0,6})"),
    # "phase" + first character of ActivityOrder; an NA order yields "phaseNA".
    Phase = paste0("phase", str_sub(ActivityOrder, 1, 1))
  ) %>%
  mutate(
    CompleteTimestamp = ymd_hms(CompleteTimestamp),
    CaseParts = as.character(CaseParts),
    across(
      c(CaseID, Resource, ResponsibleResource, MonitoringResource,
        ActivityCodeAgg, ActivityOrder, Phase),
      as.factor
    ),
    # Numeric weekday with the week starting on Monday (week_start = 1).
    Weekday = as.factor(wday(CompleteTimestamp, label = FALSE, week_start = 1))
  ) %>%
  # One "CaseParts_<part>" column per distinct part; NA gets no own column.
  dummy_columns(select_columns = "CaseParts", split = ",", ignore_na = TRUE)

# Municipality 3: give the generated case-part indicator columns
# identifier-friendly names (new name = generated name) and add two roll-up
# indicators. The rename fails if a listed generated column is absent.
bpi_15_3_cleaned <- bpi_15_3_cleaned %>%
  rename(all_of(c(
    CaseParts_Aanleg = "CaseParts_Aanleg (Uitvoeren werk of werkzaamheid)",
    CaseParts_Bouw = "CaseParts_Bouw",
    CaseParts_Brandveilig_Melding = "CaseParts_Brandveilig gebruik (melding)",
    CaseParts_Brandveilig_Vergunning = "CaseParts_Brandveilig gebruik (vergunning)",
    CaseParts_FloraFauna = "CaseParts_Flora en Fauna",
    CaseParts_Gebiedsbescherming = "CaseParts_Gebiedsbescherming",
    CaseParts_HandelenInStrijd = "CaseParts_Handelen in strijd met regels RO",
    CaseParts_Kap = "CaseParts_Kap",
    CaseParts_InritUitweg = "CaseParts_Inrit/Uitweg",
    CaseParts_Milieu_Melding = "CaseParts_Milieu (melding)",
    CaseParts_Milieu_Neutraal = "CaseParts_Milieu (neutraal wijziging)",
    CaseParts_Milieu_Omgevingsvergunning = "CaseParts_Milieu (omgevingsvergunning beperkte milieutoets)",
    CaseParts_Milieu_Vergunning = "CaseParts_Milieu (vergunning)",
    CaseParts_Monument = "CaseParts_Monument",
    CaseParts_Reclame = "CaseParts_Reclame",
    CaseParts_RoerendeZaken = "CaseParts_Roerende zaken",
    CaseParts_Sloop = "CaseParts_Sloop"
  ))) %>%
  mutate(
    # 1 when any "Brandveilig" indicator is set, 0 when none is (NA propagates).
    CaseParts_Brandveilig = as.integer(
      CaseParts_Brandveilig_Melding == 1 | CaseParts_Brandveilig_Vergunning == 1
    ),
    # 1 when any "Milieu" indicator is set, 0 when none is (NA propagates).
    CaseParts_Milieu = as.integer(
      CaseParts_Milieu_Melding == 1 |
        CaseParts_Milieu_Neutraal == 1 |
        CaseParts_Milieu_Omgevingsvergunning == 1 |
        CaseParts_Milieu_Vergunning == 1
    )
  )

# Store the cleaned municipality 3 log as an RDS file.
saveRDS(bpi_15_3_cleaned, file = "data/bpi_15_3_cleaned.RDS")

### Municipality 4 ###

# Municipality 4: keep the ten event attributes used downstream under
# project-style names, derive activity/phase/weekday features, and expand the
# comma-separated case parts into one indicator column per part.
bpi_15_4_cleaned <- bpi_15_4 %>%
  # select() renames while selecting; output order follows argument order.
  select(
    CaseID = Case.ID,
    Activity = activityNameEN,
    CompleteTimestamp = Complete.Timestamp,
    Resource,
    CaseProcedure = caseProcedure,
    CaseStatus = caseStatus,
    CaseParts = parts,
    ActivityCode = concept.name,
    ResponsibleResource = Responsible_actor,
    MonitoringResource = monitoringResource
  ) %>%
  mutate(
    # A missing responsible actor is recorded as 0.
    ResponsibleResource = ifelse(is.na(ResponsibleResource), 0, ResponsibleResource),
    # First "<2 digits>_<token>_" match in the code, without the trailing "_".
    ActivityCodeAgg = sub("_$", "", str_extract(ActivityCode, "\\d{2}_([A-Z|\\d]+)_")),
    # First 2-3 digit run directly after an underscore, plus up to 6 more chars.
    ActivityOrder = str_extract(ActivityCode, "(?<=_)(\\d{2,3}.{0,6})"),
    # "phase" + first character of ActivityOrder; an NA order yields "phaseNA".
    Phase = paste0("phase", str_sub(ActivityOrder, 1, 1))
  ) %>%
  mutate(
    CompleteTimestamp = ymd_hms(CompleteTimestamp),
    CaseParts = as.character(CaseParts),
    across(
      c(CaseID, Resource, ResponsibleResource, MonitoringResource,
        ActivityCodeAgg, ActivityOrder, Phase),
      as.factor
    ),
    # Numeric weekday with the week starting on Monday (week_start = 1).
    Weekday = as.factor(wday(CompleteTimestamp, label = FALSE, week_start = 1))
  ) %>%
  # One "CaseParts_<part>" column per distinct part; NA gets no own column.
  dummy_columns(select_columns = "CaseParts", split = ",", ignore_na = TRUE)

# Municipality 4: give the generated case-part indicator columns
# identifier-friendly names (new name = generated name) and add two roll-up
# indicators. The rename fails if a listed generated column is absent.
# Only the "vergunning" Brandveilig column and three Milieu columns are
# referenced for this municipality.
bpi_15_4_cleaned <- bpi_15_4_cleaned %>%
  rename(all_of(c(
    CaseParts_Aanleg = "CaseParts_Aanleg (Uitvoeren werk of werkzaamheid)",
    CaseParts_Bouw = "CaseParts_Bouw",
    CaseParts_Brandveilig_Vergunning = "CaseParts_Brandveilig gebruik (vergunning)",
    CaseParts_Gebiedsbescherming = "CaseParts_Gebiedsbescherming",
    CaseParts_HandelenInStrijd = "CaseParts_Handelen in strijd met regels RO",
    CaseParts_Kap = "CaseParts_Kap",
    CaseParts_InritUitweg = "CaseParts_Inrit/Uitweg",
    CaseParts_Milieu_Neutraal = "CaseParts_Milieu (neutraal wijziging)",
    CaseParts_Milieu_Omgevingsvergunning = "CaseParts_Milieu (omgevingsvergunning beperkte milieutoets)",
    CaseParts_Milieu_Vergunning = "CaseParts_Milieu (vergunning)",
    CaseParts_Monument = "CaseParts_Monument",
    CaseParts_Reclame = "CaseParts_Reclame",
    CaseParts_Sloop = "CaseParts_Sloop"
  ))) %>%
  mutate(
    # 1 when the "Brandveilig" permit indicator is set, else 0 (NA propagates).
    CaseParts_Brandveilig = as.integer(CaseParts_Brandveilig_Vergunning == 1),
    # 1 when any "Milieu" indicator is set, 0 when none is (NA propagates).
    CaseParts_Milieu = as.integer(
      CaseParts_Milieu_Neutraal == 1 |
        CaseParts_Milieu_Omgevingsvergunning == 1 |
        CaseParts_Milieu_Vergunning == 1
    )
  )

# Store the cleaned municipality 4 log as an RDS file.
saveRDS(bpi_15_4_cleaned, file = "data/bpi_15_4_cleaned.RDS")

### Municipality 5 ###

# Municipality 5: keep the ten event attributes used downstream under
# project-style names, derive activity/phase/weekday features, and expand the
# comma-separated case parts into one indicator column per part.
bpi_15_5_cleaned <- bpi_15_5 %>%
  # select() renames while selecting; output order follows argument order.
  select(
    CaseID = Case.ID,
    Activity = activityNameEN,
    CompleteTimestamp = Complete.Timestamp,
    Resource,
    CaseProcedure = caseProcedure,
    CaseStatus = caseStatus,
    CaseParts = parts,
    ActivityCode = concept.name,
    ResponsibleResource = Responsible_actor,
    MonitoringResource = monitoringResource
  ) %>%
  mutate(
    # A missing responsible actor is recorded as 0.
    ResponsibleResource = ifelse(is.na(ResponsibleResource), 0, ResponsibleResource),
    # First "<2 digits>_<token>_" match in the code, without the trailing "_".
    ActivityCodeAgg = sub("_$", "", str_extract(ActivityCode, "\\d{2}_([A-Z|\\d]+)_")),
    # First 2-3 digit run directly after an underscore, plus up to 6 more chars.
    ActivityOrder = str_extract(ActivityCode, "(?<=_)(\\d{2,3}.{0,6})"),
    # "phase" + first character of ActivityOrder; an NA order yields "phaseNA".
    Phase = paste0("phase", str_sub(ActivityOrder, 1, 1))
  ) %>%
  mutate(
    CompleteTimestamp = ymd_hms(CompleteTimestamp),
    CaseParts = as.character(CaseParts),
    across(
      c(CaseID, Resource, ResponsibleResource, MonitoringResource,
        ActivityCodeAgg, ActivityOrder, Phase),
      as.factor
    ),
    # Numeric weekday with the week starting on Monday (week_start = 1).
    Weekday = as.factor(wday(CompleteTimestamp, label = FALSE, week_start = 1))
  ) %>%
  # One "CaseParts_<part>" column per distinct part; NA gets no own column.
  dummy_columns(select_columns = "CaseParts", split = ",", ignore_na = TRUE)

# Municipality 5: give the generated case-part indicator columns
# identifier-friendly names (new name = generated name) and add two roll-up
# indicators. The rename fails if a listed generated column is absent.
bpi_15_5_cleaned <- bpi_15_5_cleaned %>%
  rename(all_of(c(
    CaseParts_Aanleg = "CaseParts_Aanleg (Uitvoeren werk of werkzaamheid)",
    CaseParts_Bouw = "CaseParts_Bouw",
    CaseParts_Brandveilig_Melding = "CaseParts_Brandveilig gebruik (melding)",
    CaseParts_Brandveilig_Vergunning = "CaseParts_Brandveilig gebruik (vergunning)",
    CaseParts_FloraFauna = "CaseParts_Flora en Fauna",
    CaseParts_Gebiedsbescherming = "CaseParts_Gebiedsbescherming",
    CaseParts_HandelenInStrijd = "CaseParts_Handelen in strijd met regels RO",
    CaseParts_Integraal = "CaseParts_Integraal",
    CaseParts_Kap = "CaseParts_Kap",
    CaseParts_InritUitweg = "CaseParts_Inrit/Uitweg",
    CaseParts_Milieu_Melding = "CaseParts_Milieu (melding)",
    CaseParts_Milieu_Neutraal = "CaseParts_Milieu (neutraal wijziging)",
    CaseParts_Milieu_Omgevingsvergunning = "CaseParts_Milieu (omgevingsvergunning beperkte milieutoets)",
    CaseParts_Milieu_Vergunning = "CaseParts_Milieu (vergunning)",
    CaseParts_Monument = "CaseParts_Monument",
    CaseParts_RoerendeZaken = "CaseParts_Roerende zaken",
    CaseParts_Reclame = "CaseParts_Reclame",
    CaseParts_Sloop = "CaseParts_Sloop"
  ))) %>%
  mutate(
    # 1 when any "Brandveilig" indicator is set, 0 when none is (NA propagates).
    CaseParts_Brandveilig = as.integer(
      CaseParts_Brandveilig_Melding == 1 | CaseParts_Brandveilig_Vergunning == 1
    ),
    # 1 when any "Milieu" indicator is set, 0 when none is (NA propagates).
    CaseParts_Milieu = as.integer(
      CaseParts_Milieu_Melding == 1 |
        CaseParts_Milieu_Neutraal == 1 |
        CaseParts_Milieu_Omgevingsvergunning == 1 |
        CaseParts_Milieu_Vergunning == 1
    )
  )

# Drop every event belonging to case 4197976, then store the result.
# NOTE(review): the original comment gave the reason as 'Remove Case Status
# "T"'; the filter itself is on the case ID, not on CaseStatus - presumably
# this is the only such case, verify against the data.
bpi_15_5_cleaned <- filter(bpi_15_5_cleaned, CaseID != "4197976")

saveRDS(bpi_15_5_cleaned, file = "data/bpi_15_5_cleaned.RDS")