# --------------------------------------------------------
#  Bertolotti, Gavazza & Lanteri - Data Merging Algorithm
# --------------------------------------------------------

# Description: 
# This code executes the merging algorithm between the dominion and IHS datasets models,
# GenerationModelYear and Generation.

# Outputs: 
# 1) algorithm_dominion_IHS_model_list_output.csv
# 2) algorithm_dominion_IHS_model_year_output.csv

(WD <- getwd())
if (!is.null(WD)) setwd(WD)

# ----------------------------------------------------------------------------------------------------

## 0. Preliminaries -----
set.seed(1)
rm(list=ls())

# Installing packages
packages <- c("tidyverse","stringdist","tm","haven")

installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
  install.packages(packages[!installed_packages])
}

# Packages loading
invisible(lapply(packages, library, character.only = TRUE))

## 1. Loading data ----
dominion <- read_dta("d_dominion_model_list.dta")
IHS <- read_dta("d_IHS_model_list.dta")

## 3. Merging models ----

### 3.1 Preparation ----
####  Converting into uppercase the model_IHS string in IHS
IHS <- IHS %>% mutate(make_IHS_upper=toupper(make_IHS),
                      model_IHS_upper=toupper(model_IHS))

####  Creating dominion make+model and make+model_d2 combination
dominion <- dominion %>% mutate(make_model=paste0(make," ",model),
                                make_model_d2=paste0(make," ",model_d2))

####  Creating a vector of individual IHS makes (to reduce NAs in pmatch)
IHS_make <- levels(as.factor(IHS$make_IHS_upper))


### 3.2 Merging algorithms ----
# First match make, then match model. Use NA for cases with multiple solutions.
# Pmatch for brand, manual solutions for certain brands 
# For model, first pmatch, then agrep. If solution is not unique,
# reduce options with stringdist and repeat the process with this subset.

dominion$alg_model_IHS <- "" 
for (i in 1:nrow(dominion)){  
  # First, merge with the make
  position_make <- pmatch(dominion$make[i],IHS_make)
  subset_IHS <- IHS[toupper(IHS$make_IHS)==IHS_make[position_make],]
  
  # Assign subsets to the 6 brands that are not matched under pmatch
  if(dominion$make[i] %in% c("FISK", "JEP",  "MERC", "MERZ", "SMRT", "SUKI", "TOYT")){
    
    if(dominion$make[i]=="JEP"){
      subset_IHS <- IHS[toupper(IHS$make_IHS)=="JEEP",]
    } else if (dominion$make[i]=="MERC") {
      subset_IHS <- IHS[toupper(IHS$make_IHS)=="MERCURY",]
    } else if (dominion$make[i]=="MERZ") {
      # (Could be Mercedes-Benz or Mayback)  
      if(grepl("MAYBACH",dominion$make_model[i])){
        subset_IHS <- IHS[toupper(IHS$make_IHS)=="MAYBACH",]
      } else {
        subset_IHS <- IHS[toupper(IHS$make_IHS)=="MERCEDES-BENZ",]
      }
      
    } else if (dominion$make[i]=="SMRT") {
      subset_IHS <- IHS[toupper(IHS$make_IHS)=="SMART",]
    } else if (dominion$make[i]=="SUKI") {
      subset_IHS <- IHS[toupper(IHS$make_IHS)=="SUZUKI",]
    } else if (dominion$make[i]=="TOYT") {
      subset_IHS <- IHS[toupper(IHS$make_IHS)=="TOYOTA",]
    }
  }
  
  
  # Removing the make name to improve performance of pmatch and agrep  
  dom_model <- word(dominion$make_model[i],2,-1) 
  subset_IHS_model <- word(subset_IHS$model_IHS_upper,2,-1)  
  
  # Adding special case for BMW
  if(dominion$make[i]=="BMW"){
    subset_IHS_model <- removeWords(subset_IHS_model,"-SERIES")
  }
  
  # Adding special case for Mercedes-Benz
  if(dominion$make[i]=="MERZ"){
    subset_IHS_model <- removeWords(subset_IHS_model,"-CLASS")
  }
  
  # Second, attempt merge with the dominion$model variable
  position_model <- pmatch(dom_model,subset_IHS_model)
  if(!is.na(position_model)){
    dominion$alg_model_IHS[i] <- subset_IHS$model_IHS_upper[position_model]
  } else {
    pos_agrep_model_match <- agrep(dom_model,subset_IHS_model)  
    if(length(pos_agrep_model_match)==1){
      dominion$alg_model_IHS[i] <- subset_IHS$model_IHS_upper[pos_agrep_model_match]
    } else {
      # If pmatch and agrep fail to match then match with
      # model_d2
      dom_model_d2 <- word(dominion$make_model_d2[i],2,-1)
      
      position_model_d2 <- pmatch(dom_model_d2,subset_IHS_model)
      if(!is.na(position_model_d2)){
        dominion$alg_model_IHS[i] <- subset_IHS$model_IHS_upper[position_model_d2]
      } else {
        pos_agrep_model_match_d2 <- agrep(dom_model_d2,subset_IHS_model)  
        if(length(pos_agrep_model_match_d2)==1){
          dominion$alg_model_IHS[i] <- subset_IHS$model_IHS_upper[pos_agrep_model_match_d2]
        } else {
          # If agrep and pmatch fail with both categories,then reduce options with
          # stringdist
          distance_model <- stringdist(dom_model_d2,subset_IHS_model)
          model_match_options <- subset_IHS$model_IHS_upper[distance_model==min(distance_model)]
          if (length(model_match_options)==1){
            dominion$alg_model_IHS[i] <- model_match_options
          } else {
            position_model_d2 <- pmatch(dom_model_d2,model_match_options)
            if(!is.na(position_model_d2)){
              dominion$alg_model_IHS[i] <- model_match_options[position_model_d2]
            } else {
              pos_agrep_model_match_d2 <- agrep(dom_model_d2,model_match_options)  
              if(length(pos_agrep_model_match_d2)==1){
                dominion$alg_model_IHS[i] <- subset_IHS$model_IHS_upper[pos_agrep_model_match_d2]}
            }
          }
        }
      }
    }
  }
}


### 3.3 Addressing identified mismatches ----

# Creating flag variable for manual adjustments
dominion$flag_model_manual <- 0

# A. Changing to "" 128 options that were mismatched 
# Audi: ALLROAD, RS4,S6
# CADI: FLEETWOOD
# CHEV: 10, COLORADO,
# CHRY: AVENGER
# DODG: DAKOTA
# FORD: F150, F250, F350, F450, F550, F650,RANGER
# GMC: DENALI, SAVANA
# ISU: AMIGO, I280, I290, i370
# JEP: LIMITED
# KIA: SEPHIA
# MAZD: B4000
# MITS: RAIDER
# NISS: FRONTIER, KING CAB
# RAM: 1500, 2500, 3500, 4500, 5500, DAKOTA,
# TOYT: PRERUNNR, SCION, SOLARA, TUNDRA
dominion$alg_model_IHS[c(90,100,112,113,114,309,325,384,385,386,493,589,590,591,592,593,594,
                         595,596,720,721,722,723,724,725,726,727,728,729,730,731,732,733,734,
                         735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,
                       752,753,754,794,795,796,816,860,861,865,1014,1015,1021,1022,1023,1076,
                       1077,1078,1123,1228,1229,1272,1466,1467,1468,1478,1479,1480,1481,
                       1482,1485,1556, 1557, 1558, 1559,1560,1561,1562,1563,1564,1565,1566,
                       1567,1568,1569,1570,1571,1572,1573,1574,1575,1576,1577,1578,1579,
                       1580,1581,1582,1583,1584,1585,1586,1587,1727,1732,1734,1735,1736,
                       1737,1738,1743,1744,1753,1754,1755,1756,1757,1758)] <- ""

dominion$flag_model_manual[c(90,100,112,113,114,309,325,384,385,386,493,589,590,591,592,593,594,
                            595,596,720,721,722,723,724,725,726,727,728,729,730,731,732,733,734,
                            735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,
                            752,753,754,794,795,796,816,860,861,865,1014,1015,1021,1022,1023,1076,
                            1077,1078,1123,1228,1229,1272,1466,1467,1468,1478,1479,1480,1481,
                            1482,1485,1556, 1557, 1558, 1559,1560,1561,1562,1563,1564,1565,1566,
                            1567,1568,1569,1570,1571,1572,1573,1574,1575,1576,1577,1578,1579,
                            1580,1581,1582,1583,1584,1585,1586,1587,1727,1732,1734,1735,1736,
                            1737,1738,1743,1744,1753,1754,1755,1756,1757,1758)] <- 1

# B. Manually change
## BMW 128i and 135i in dominion to BMW 1-SERIES in IHS
dominion$alg_model_IHS[c(137:138)] <- "BMW 1-SERIES"
dominion$flag_model_manual[c(137:138)] <- 1

## BMW 325, 328, 330, and 335 in dominion to BMW 3-SERIES in IHS
dominion$alg_model_IHS[c(139:161,163)] <- "BMW 3-SERIES"
dominion$flag_model_manual[c(139:161,163)] <- 1

## BMW 335xi GT in dominion to BMW 3-SERIES GT in IHS
dominion$alg_model_IHS[c(162)] <- "BMW 3-SERIES GT" 
dominion$flag_model_manual[c(162)] <- 1

## BMW 525,528,530,535,540,545,550 in dominion to BMW 5-SERIES in IHS
dominion$alg_model_IHS[c(164:172,174,176,177,180:183,185)] <- "BMW 5-SERIES"
dominion$flag_model_manual[c(164:172,174,176,177,180:183,185)] <- 1

## BMW 535i GT, 535xi GT, 550i GT, and 550xi GT in dominion to BMW 5-SERIES GT in IHS
dominion$alg_model_IHS[c(173,175,178,179,184,186)] <- "BMW 5-SERIES GT"
dominion$flag_model_manual[c(173,175,178,179,184,186)] <- 1

## BMW 640, 645, and 650 in dominion to BMW 6-SERIES in IHS
dominion$alg_model_IHS[c(187,189:190)] <- "BMW 6-SERIES"
dominion$flag_model_manual[c(187,189:190)] <- 1

## BMW 640i GRAN C and 650i GRAN C in dominion to BMW 6-SERIES GRAN COUPE in IHS
dominion$alg_model_IHS[c(188,191)] <- "BMW 6-SERIES GRAN COUPE"
dominion$flag_model_manual[c(188,191)] <- 1

## BMW 740, 745, 750, and 760 in dominion to BMW 7-SERIES in IHS
dominion$alg_model_IHS[c(192:206)] <- "BMW 7-SERIES"
dominion$flag_model_manual[c(192:206)] <- 1

## CADI ESCALADE 2X, 4X, EX, HY, LU, AND SP in dominion to CADILLAC ESCALADE in IHS
dominion$alg_model_IHS[c(302:303,305:308)] <- "CADILLAC ESCALADE"
dominion$flag_model_manual[c(302:303,305:308)] <- 1

## CHEV EQUINOX 1LT, 2LT, LT, AND LTZ in dominion to CHEVROLET EQUINOX in IHS
dominion$alg_model_IHS[c(404,406,408:409)] <- "CHEVROLET EQUINOX"
dominion$flag_model_manual[c(404,406,408:409)] <- 1

## CHEV EQUINOX S10 4X2, 4X4, BLAZER, AND T10 BLAZER in dominion 
## to CHEVROLET BLAZER CLASSIC in IHS
dominion$alg_model_IHS[c(438:441,451)] <- "CHEVROLET BLAZER CLASSIC"
dominion$flag_model_manual[c(438:441,451)] <- 1

## CHRY PACIFICA, AW, FW, LT, AND TO in dominion to Chrysler Pacifica Wagon in IHS
dominion$alg_model_IHS[c(498:502)] <- "CHRYSLER PACIFICA WAGON"
dominion$flag_model_manual[c(498:502)] <- 1

## FERR 458 in dominion to FERRARI F8/488/458 in IHS
dominion$alg_model_IHS[c(655)] <- "FERRARI F8/488/458"
dominion$flag_model_manual[c(655)] <- 1

## FORD 500 in dominion to FORD FIVE HUNDRED in IHS
dominion$alg_model_IHS[c(675:677)] <- "FORD FIVE HUNDRED"
dominion$flag_model_manual[c(675:677)] <- 1

## FORD ESCORT GT AND ZX2 in dominion to FORD ESCORT (US) in IHS
dominion$alg_model_IHS[c(704:705)] <- "FORD ESCORT (US)"
dominion$flag_model_manual[c(704:705)] <- 1

## FORD EXPLORER 2X, 4X, LT, PO, AND XL in dominion to FORD EXPLORER in IHS
dominion$alg_model_IHS[c(715:719)] <- "FORD EXPLORER"
dominion$flag_model_manual[c(715:719)] <- 1

## FORD SHLBY GT500 in dominion to FORD MUSTANG in IHS
dominion$alg_model_IHS[c(797)] <- "FORD MUSTANG"
dominion$flag_model_manual[c(797)] <- 1

## FORD T-BIRD, 007, and 50AN in dominion to FORD THUNDERBIRD in IHS
dominion$alg_model_IHS[c(799:801)] <- "FORD THUNDERBIRD"
dominion$flag_model_manual[c(799:801)] <- 1

## FORD TAURUS X AW in dominion to FORD TAURUS X in IHS
dominion$alg_model_IHS[c(811)] <- "FORD TAURUS X"
dominion$flag_model_manual[c(811)] <- 1

## FORD W47 FOR# AW in dominion to FORD WINDSTAR in IHS
dominion$alg_model_IHS[c(812)] <- "FORD WINDSTAR"
dominion$flag_model_manual[c(812)] <- 1

## HOND ACCORD TOUR in dominion to HONDA ACCORD CROSSTOUR in IHS
dominion$alg_model_IHS[c(897)] <- "HONDA ACCORD"
dominion$flag_model_manual[c(897)] <- 1

## INFI EX35 AND EX37 in dominion to INFINITI EX in IHS
dominion$alg_model_IHS[c(993:994)] <- "INFINITI EX"
dominion$flag_model_manual[c(993:994)] <- 1

## INFI FX35, FX37, FX45 AND FX50 in dominion to INFINITI FX in IHS
dominion$alg_model_IHS[c(995:998)] <- "INFINITI FX"
dominion$flag_model_manual[c(995:998)] <- 1

## INFI G25, G35, AND G37 in dominion to INFINITI G in IHS
dominion$alg_model_IHS[c(999:1001)] <- "INFINITI G"
dominion$flag_model_manual[c(999:1001)] <- 1

## INFI JX35 AND JX35 AWD in dominion to INFINITI JX in IHS
dominion$alg_model_IHS[c(1003:1004)] <- "INFINITI JX"
dominion$flag_model_manual[c(1003:1004)] <- 1

## INFI M35, M35H, M37, M37X, M45 AND M56 in dominion to INFINITI M in IHS
dominion$alg_model_IHS[c(1005:1010)] <- "INFINITI M"
dominion$flag_model_manual[c(1005:1010)] <- 1

## JAGU SUPER V8 AND VDP in dominion to JAGUAR XJ in IHS
dominion$alg_model_IHS[c(1028:1029)] <- "JAGUAR XJ"
dominion$flag_model_manual[c(1028:1029)] <- 1

## JEP LAREDO, 4X2, 4X4, OVERLAND 4X2, OVERLAND 4X4, AND SRT-8 4X4 in dominion 
# to JEEP GRAND CHEROKEE in IHS
dominion$alg_model_IHS[c(1066:1068,1079:1080,1092)] <- "JEEP GRAND CHEROKEE"
dominion$flag_model_manual[c(1066:1068,1079:1080,1092)] <- 1

## JEP RUBICON AND SAHARA in dominion to JEEP WRANGLER in IHS
dominion$alg_model_IHS[c(1086:1091)] <- "JEEP WRANGLER"
dominion$flag_model_manual[c(1086:1091)] <- 1

## LAND RANGE ROVER in dominion to LAND ROVER RANGE ROVER in IHS
dominion$alg_model_IHS[c(1151)] <- "LAND ROVER RANGE ROVER"
dominion$flag_model_manual[c(1151)] <- 1

## MAZD MAZDA2, SPRT, AND TOUR in dominion to MAZDA 2 in IHS
dominion$alg_model_IHS[c(1240:1242)] <- "MAZDA 2"
dominion$flag_model_manual[c(1240:1242)] <- 1

## MAZD MAZDA3, SPRT, TOUR, GT, SV, 3I, 3S AND SPEED3 in dominion to MAZDA 3 in IHS
dominion$alg_model_IHS[c(1243:1249,1260)] <- "MAZDA 3"
dominion$flag_model_manual[c(1243:1249,1260)] <- 1

## MAZD MAZDA5, SPRT, TOUR, AND GT in dominion to MAZDA 5 in IHS
dominion$alg_model_IHS[c(1250:1253)] <- "MAZDA 5"
dominion$flag_model_manual[c(1250:1253)] <- 1

## MAZD MAZDA6, SPRT, TOUR, GT, 6I, 6S, AND SPEED6 in dominion to MAZDA 6 (US) in IHS
dominion$alg_model_IHS[c(1254:1259,1261)] <- "MAZDA 6 (US)"
dominion$flag_model_manual[c(1254:1259,1261)] <- 1

## MAZD MIATA MX-5 in dominion to MAZDA MX-5 in IHS
dominion$alg_model_IHS[c(1262)] <- "MAZDA MX-5"
dominion$flag_model_manual[c(1262)] <- 1

## MERC GD MARQUIS in dominion to MERCURY MARQUIS in IHS
dominion$alg_model_IHS[c(1273)] <- "MERCURY MARQUIS"
dominion$flag_model_manual[c(1273)] <- 1

## MERC SL550 in dominion to MERCEDES-BENZ SL in IHS
dominion$alg_model_IHS[c(1298)] <- "MERCEDES-BENZ SL"
dominion$flag_model_manual[c(1298)] <- 1

## MERZ C180K, C230, C240, C250, C280, C300, C32, C320, C350, C55, C63, AND CHEV C55 in dominion 
# to MERCEDES-BENZ C-CLASS in IHS
dominion$alg_model_IHS[c(361,1299:1317)] <- "MERCEDES-BENZ C-CLASS"
dominion$flag_model_manual[c(361,1299:1317)] <- 1

## MERZ CL550, CL500, CL550, CL600, CL63, CL65 in dominion to MERCEDES-BENZ CL in IHS
dominion$alg_model_IHS[c(1318:1326)] <- "MERCEDES-BENZ CL"
dominion$flag_model_manual[c(1318:1326)] <- 1

## MERZ CLK500C in dominion to MERCEDES-BENZ CLK in IHS
dominion$alg_model_IHS[c(1331)] <- "MERCEDES-BENZ CLK"
dominion$flag_model_manual[c(1331)] <- 1

## MERZ G500, G55, G550, AND G63 in dominion to MERCEDES-BENZ G-CLASS in IHS
dominion$alg_model_IHS[c(1357:1361)] <- "MERCEDES-BENZ G-CLASS"
dominion$flag_model_manual[c(1357:1361)] <- 1

## MERZ GL320, GL350, GL450, AND GL550 in dominion to MERCEDES-BENZ GL in IHS
dominion$alg_model_IHS[c(1362:1367)] <- "MERCEDES-BENZ GL"
dominion$flag_model_manual[c(1362:1367)] <- 1

## MERZ S350, S400, S430, S500, S55, S550, S600, S63, AND S65 in dominion 
# to MERCEDES-BENZ S-CLASS in IHS
dominion$alg_model_IHS[c(1392:1403)] <- "MERCEDES-BENZ S-CLASS"
dominion$flag_model_manual[c(1392:1403)] <- 1

## MERZ SL500R, SL55, SL550, SL55K, SL600, SL63, SL65, AND SL65K in dominion 
# to MERCEDES-BENZ SL in IHS
dominion$alg_model_IHS[c(1404:1412)] <- "MERCEDES-BENZ SL"
dominion$flag_model_manual[c(1404:1412)] <- 1

## MINI COOPER COUN in dominion to MINI COUNTRYMAN in IHS
dominion$alg_model_IHS[c(1426)] <- "MINI COUNTRYMAN"
dominion$flag_model_manual[c(1426)] <- 1

## MITS MONTERO SPT AND XLS in dominion to MITSUBISHI MONTERO in IHS
dominion$alg_model_IHS[c(1463:1464)] <- "MITSUBISHI MONTERO"
dominion$flag_model_manual[c(1463:1464)] <- 1

## PONT GR PRIX GT, GT2, GTP, GXP, AND SE1 in dominion to PONTIAC GRAND PRIX in IHS
dominion$alg_model_IHS[c(1517:1521)] <- "PONTIAC GRAND PRIX"
dominion$flag_model_manual[c(1517:1521)] <- 1

## SAAB 9-2 AERO in dominion to SAAB 9-2 in IHS
dominion$alg_model_IHS[c(1590)] <- "SAAB 9-2"
dominion$flag_model_manual[c(1590)] <- 1

## SATU L200, L300, LW200, AND LW300 in dominion to SATURN L-SERIES in IHS
dominion$alg_model_IHS[c(1619:1622)] <- "SATURN L-SERIES"
dominion$flag_model_manual[c(1619:1622)] <- 1

## SATU B9 TRIBECA in dominion to SUBARU TRIBECA in IHS
dominion$alg_model_IHS[c(1634)] <- "SUBARU TRIBECA"
dominion$flag_model_manual[c(1634)] <- 1

## TOYT PRIUS V in dominion to TOYOTA PRIUS ALPHA / + / V in IHS
dominion$alg_model_IHS[c(1730)] <- "TOYOTA PRIUS ALPHA / + / V"
dominion$flag_model_manual[c(1730)] <- 1

## TOYT SCION FR-S in dominion to TOYOTA GT86 in IHS
dominion$alg_model_IHS[c(1733)] <- "TOYOTA GT86"
dominion$flag_model_manual[c(1733)] <- 1

## VOLK GLI in dominion to VOLKSWAGEN GOLF in IHS
dominion$alg_model_IHS[c(1784)] <- "VOLKSWAGEN GOLF"
dominion$flag_model_manual[c(1784)] <- 1

## VOLK GLI, R32, AND RABBIT in dominion to VOLKSWAGEN GOLF in IHS
dominion$alg_model_IHS[c(1784,1829:1830)] <- "VOLKSWAGEN GOLF"
dominion$flag_model_manual[c(1784,1829:1830)] <- 1

## GMC ACADIA, DENA, SL, SLE, SLE1, SLE2, SLT1 AND SLT2 in dominion to GMC Acadia/-Limited in IHS
dominion$alg_model_IHS[c(848:855)] <- "GMC ACADIA/-LIMITED"
dominion$flag_model_manual[c(848:855)] <- 1

## LINC NAVIGATR L in dominion to Lincoln Navigator in IHS
dominion$alg_model_IHS[c(1209)] <- "LINCOLN NAVIGATOR"
dominion$flag_model_manual[c(1209)] <- 1

## BENT CONTINENTAL in dominion to Bentley Continental GT/GTC/SSp in IHS
dominion$alg_model_IHS[c(134)] <- "BENTLEY CONTINENTAL GT/GTC/SSP"
dominion$flag_model_manual[c(134)] <- 1

## LINC NAVIGATR 2X AND 4X in dominion to Lincoln Navigator in IHS
dominion$alg_model_IHS[c(1207:1208)] <- "LINCOLN NAVIGATOR L"
dominion$flag_model_manual[c(1207:1208)] <- 1

## SUKI GR VITARA in dominion to Suzuki Grand Vitara in IHS
dominion$alg_model_IHS[c(1685)] <- "SUZUKI GRAND VITARA"
dominion$flag_model_manual[c(1685)] <- 1

# C. Drop from sample (case VOLK COOPER)
dominion <- dominion[-1778,]

# D. Set IHS match in TOYT SCION cases to dominion model 
dominion$alg_model_IHS[c(1732:1738)] <- toupper(dominion$model[c(1732:1738)])
dominion$flag_model_manual[c(1732:1738)] <- 1

### 3.4 Final merge, organizing and selecting columns ----
dominion <- left_join(dominion %>% rename(model_IHS_upper=alg_model_IHS),IHS)
dominion <- dominion %>% select(1:3,8,9,7)


dominion <- dominion %>% mutate(make_IHS=ifelse(is.na(make_IHS),"",make_IHS),
                                model_IHS=ifelse(is.na(model_IHS),"",model_IHS))

# Correct the cases of SCION as make_IHS to the correct Toyota IHS 
dominion$make_IHS[c(1732:1738)] <- "Toyota"
dominion$model_IHS[c(1732)] <- "Scion"

# Correct the case of Saab 9-2 AERO
dominion$make_IHS[c(1590)] <- "Saab"
dominion$model_IHS[c(1590)] <- "Saab 9-2"
dominion$flag_model_manual[c(1590)] <- 1


### 3.5 Saving output ----
write.csv(dominion,file="algorithm_dominion_IHS_model_list_output.csv",row.names = FALSE)


# 4. Merging years ----

## 4.1. Loading data ----
dominion_year <- read_dta("d_dominion_model_year_list.dta")
IHS_generation <- read_dta("d_IHS_model_year_list.dta")

## 4.2 Preparation ----
dominion_year <- left_join(dominion_year,dominion)
dominion_year <- dominion_year %>% select(1,2,3,5,6,4,7)

# Dropping unfeasible cases 
## Volk Cooper
dominion_year <- dominion_year[-6956,]
## HOND PILOT LX model_year=2029
dominion_year <- dominion_year[-3745,]

# Creating empty variables
dominion_year$GenerationModelYear <- NA_integer_
dominion_year$Generation <- NA_integer_

## 4.3 Merging years ----

for(i in 1:nrow(dominion_year)){
  if(dominion_year$model_IHS[i]=="" | is.na(dominion_year$model_IHS[i])) {
    subset_dominion <- dominion_year[dominion_year$model_d2==dominion_year$model_d2[i],]
    dominion_year$GenerationModelYear[i] <- min(subset_dominion$model_year)
    dominion_year$Generation[i] <- 1
    next 
  }

    # Subseting IHS_generation and ordering by GenerationModelYear
  subset_IHS <- IHS_generation[IHS_generation$model_IHS==dominion_year$model_IHS[i],]
  subset_IHS <- subset_IHS[order(subset_IHS$GenerationModelYear),]
  
    # Check if there are rows in IHS prior to model_year
    if(any(subset_IHS$GenerationModelYear<=dominion_year$model_year[i])){
      
      if(nrow(subset_IHS)==1) {
        dominion_year$GenerationModelYear[i] <- subset_IHS$GenerationModelYear
        dominion_year$Generation[i] <- subset_IHS$Generation
       } else {
  
            for(j in 1:(nrow(subset_IHS)-1)){
                if(dominion_year$model_year[i]>=subset_IHS$GenerationModelYear[j] &
                    dominion_year$model_year[i]<subset_IHS$GenerationModelYear[j+1]){
                
                  dominion_year$GenerationModelYear[i] <- subset_IHS$GenerationModelYear[j]
                  dominion_year$Generation[i] <- subset_IHS$Generation[j]
                break
                } else {
                next 
                }
            }
          # if it did not find any match in first j-1 rows, assign row j
          if(is.na(dominion_year$GenerationModelYear[i])){
            j_last=nrow(subset_IHS)
            dominion_year$GenerationModelYear[i] <- subset_IHS$GenerationModelYear[j_last]
            dominion_year$Generation[i] <- subset_IHS$Generation[j_last]
          }
            }  
    } else {
      # If they are all greater, then take the closest one
      dominion_year$GenerationModelYear[i] <- subset_IHS$GenerationModelYear[1]
      dominion_year$Generation[i] <- subset_IHS$Generation[1]
    }
  }

### 4.4 Addressing identified year mismatches ----
dominion_year$flag_year_manual <- 0

## Changes to 1998
# MAZD MIATA MX-5 MODEL YEARS 2003 TO 2005
dominion_year$GenerationModelYear[c(4980:4982)] <- 1998
dominion_year$flag_year_manual[c(4980:4982)] <- 1

## Changes to 2003
# VOLK BEETLE MODEL YEARS 2003 TO 2005
# VOLK GTI MODEL YEARS 2003 TO 2006
# AUDI S4 AVANT, CABRIO AND QUATTRO MODEL YEARS 2004 TO 2005
# TOYT SCION xB MODEL YEARS 2003 TO 2006
dominion_year$GenerationModelYear[c(6921:6932,7002:7005,7013:7016,352,353,357,358,369,
                                    370,6782:6784)] <- 2003
dominion_year$flag_year_manual[c(6921:6932,7002:7005,7013:7016,352,353,357,358,369,
                                 370,6782:6784)] <- 1

## Changes to 2004
# MAZDA6, MAZDA6I, MAZDA6S, AND MAZDASPEED6 MODEL YEARS 2003 TO 2008
# PONTIAC VIBE FROM 2007 AND 2008
# LINC NAVIATR 2x and 4x MODEL YEARS 2003 TO 2006
dominion_year$GenerationModelYear[c(4949:4954,4965:4969,4971:4975,4978,4979,
                                    6024,6025,4762:4765,4773:4776)] <- 2004
dominion_year$flag_year_manual[c(4949:4954,4965:4969,4971:4975,4978,4979,
                                 6024,6025,4762:4765,4773:4776)] <- 1

## Changes to 2005
# CHEV EQUINOX, LS, LT, LTZ, AND SS MODEL YEARS 2005 TO 2009
# SUBA OUTBACK 2.5, 3.0, H6, AND LTD MODEL YEARS 2006 TO 2009 
# SAAB 9-2 AERO MODEL YEAR 2006 
dominion_year$GenerationModelYear[c(1445,1446,1459:1463,1468:1472,1474,1475,1480,1481,
                                  6479:6482,6488:6490,6496:6498,6500:6503,6226)] <- 2005
dominion_year$flag_year_manual[c(1445,1446,1459:1463,1468:1472,1474,1475,1480,1481,
                                 6479:6482,6488:6490,6496:6498,6500:6503,6226)] <- 1

## Changes to 2006
# AUDI S4 AVANT, CABRIO AND QUATTRO MODEL YEARS 2006 TO 2008
# MERC MOUNTNEER MODEL YEARS 2007 TO 2010
dominion_year$GenerationModelYear[c(569:571,354:356,359:361,371:373,5080:5085)] <- 2006
dominion_year$flag_year_manual[c(569:571,354:356,359:361,371:373,5080:5085)] <- 1

## Changes to 2007
# VOLVO C70 MODEL YEARS 2007 TO 2013
# LINC NAVIATR 2x and 4x MODEL YEARS 2007 TO 2011
dominion_year$GenerationModelYear[c(7169:7175,4766:4770,4777:4780)] <- 2007
dominion_year$flag_year_manual[c(7169:7175,4766:4770,4777:4780)] <- 1

## Changes to 2008
# PONTIAC G8 MODEL YEARS 2009
dominion_year$GenerationModelYear[c(5972,5974,5975)] <- 2008
dominion_year$flag_year_manual[c(5972,5974,5975)] <- 1

## Changes to 2009
# AUDI S4 CABRIO, PREMIUM, PRESTIGE, AND QUATTRO MODEL YEARS 2011 TO 2013
# PONTIAC VIBE MODEL YEARS 2009 TO 2010
dominion_year$GenerationModelYear[c(362:368,374,375,6026,6027,6032,6033,
                                    6038,6039)] <- 2009
dominion_year$flag_year_manual[c(362:368,374,375,6026,6027,6032,6033,
                                 6038,6039)] <- 1

## Changes to 2010
# CHEV EQUINOX MODEL YEARS 2010 TO 2013
# TOYT SCION AND SCION TC MODEL YEARS 2010 TO 2013
dominion_year$GenerationModelYear[c(1447:1458,1464:1467,1473,
                                    1476:1479,6764,6773:6776)] <- 2010
dominion_year$flag_year_manual[c(1447:1458,1464:1467,1473,
                                 1476:1479,6764,6773:6776)] <- 1

## Changes to 2011
# BUIC REGAL MODEL YEARS 2011 TO 2013
# GENE 1500 MODEL YEARS 2011
# HOND CIVIC MODEL YEARS 2012 TO 2013
# HYUN ACCENT MODEL YEARS 2011 TO 2013
# RAM 1500, 2500, 3500, AND DAKOTA MODEL YEARS 2011 TO 2013
# LINC NAVIGATR L MODEL YEARS 2011 TO 2013
dominion_year$GenerationModelYear[c(914:918,923:928,3103,3518,3528,3542,3543,
                                    3548,3549,3557,3565,3575,3576,3580,3590,
                                    3816,3821:3823,3831:3833,3837,3838,6161:6197,
                                    6211,6212,4788:4790)] <- 2011
dominion_year$flag_year_manual[c(914:918,923:928,3103,3518,3528,3542,3543,
                                 3548,3549,3557,3565,3575,3576,3580,3590,
                                 3816,3821:3823,3831:3833,3837,3838,6161:6197,
                                 6211,6212,4788:4790)] <- 1

## Changes to 2013
# CHRY AVENGER MODEL YEAR 2013
dominion_year$GenerationModelYear[c(1783)] <- 2013
dominion_year$flag_year_manual[c(1783)] <- 1

# 4.5 Creating a Custom Generation Numbering ---- 

# Assigning Generation 1 to the cases with no IHS match 
dominion_year$Custom_Generation[dominion_year$model_IHS==""] <- 1

# Creating a list of unique categories
categories <- dominion_year %>% group_by(make_IHS,model_IHS) %>% 
  summarize(min_GMY=min(GenerationModelYear)) 
categories <- categories[-1,]

for (i in 1:nrow(categories)) {
subset_category <- dominion_year %>% filter(make_IHS==categories$make_IHS[i] & 
                                              model_IHS==categories$model_IHS[i])

GMY_options <- subset_category$GenerationModelYear %>% as.factor() %>% levels() %>% 
                        as.numeric() %>% sort()

# Initialize while loop
j=1
while(!is.na(GMY_options[j])){
  dominion_year$Custom_Generation[dominion_year$make_IHS==categories$make_IHS[i] & 
                                    dominion_year$model_IHS==categories$model_IHS[i] &
                                    dominion_year$GenerationModelYear==GMY_options[j]] <- j
  j=j+1
}

}

# 4.6  Rearranging and Saving output ---- 
dominion_year <- dominion_year %>% select(1:6,8:9,11,7,10)
write.csv(dominion_year,file="algorithm_dominion_IHS_model_year_output.csv",row.names = FALSE)




