##•••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••
### Data analysis: Temporal, spatial and household socio-economic dynamics of typhoid: Towards forecasting outbreaks in Kasese district, Uganda
#••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••



# READ IN DATABASES S1 & S2
MOH_DB<- read.csv("MOH_NAT_DIS_DB.csv",sep = ",",header = T)

##***************************************************************
#### FORECASTING OUTBREAKS AT NATIONAL AND DISTRICT LEVEL
##***************************************************************
MOH_2016 <- read.csv("MOH_2016.csv",sep = ",",header = T)
MOH_2016_UG<-MOH_2016[,5,]
MOH_2016_KAS<-MOH_2016[,4,]

x <- read.csv("MOH_Kasese.csv",sep = ",",header = T)
bx <- x[,1:5]

FULL_MOH<-join(bx,MOH_2016, type="full")# USE PLYR LIBRARY
##••••••••••••••••••••••••••••••••••••••
### Case control data analysis:
#•••••••••••••••••••••••••••••••••••••••
## functions for univariable analysis
# function creates a table with OR, CI and p value. x required is summary(model)
mmt <- function(y){
  x <-summary(y)
  bla <-strsplit(as.character(x$call)[[2]],split='~', fixed=TRUE)[[1]][2]
  variable <-gsub(' ','',strsplit(bla,split='+', fixed=TRUE)[[1]][1])
  #OddsRatio <- round(exp(x$coefficients[,1][2:nrow(x$coefficients)]),2)
  OddsRatio <- round(exp(x$coefficients[,1]),2)
  LL <-round(exp(x$coefficients[,1] - 1.96* x$coefficients[,3]),2)
  UL <-round(exp(x$coefficients[,1] + 1.96* x$coefficients[,3]),2)
  pv <- round(x$coefficients[,5],3)
  #pvalue <-c(round(an1["Pr(>Chisq)"][2,],3),rep("",nrow(x$coefficients)-1))
  test <- data.frame(Variable=c(variable, 1:nrow(x$coefficients)), OR=c("",OddsRatio), LL=c("",LL), UL=c("",UL), pv=c(x$waldtest["pvalue"],pv), filter=c("yes", 1:nrow(x$coefficients)))
  test$Variable <- c(variable, rownames(test)[2:length(rownames(test))])
  #test$NAs <- c(na_count[variable], rep("",nrow(x$coefficients)-1))
  rownames(test) <- NULL
  return(test)
}


# C(Using Database S2 AND CASE CONTROL DATA SETS)
# read data in
#clg <-read_csv("cond_logreg_data.csv")
load("cond_logreg_data.RData")
# create matching variable
clg$match <- substr(clg$Questionnairenumber, 1,5)
# clean dataset to use in regression model
clg$Sex <- as.factor(as.character(clg$Sex))
clg$Levelofeducation <- as.factor(as.character(clg$Levelofeducation))
clg$Householdmembers_cut <- recode(as.factor(clg$Householdmembers!="1-5"), "TRUE"="More than 5", "FALSE"= "1-5")
clg$Doyoupreheatleftoverfood[is.na(clg$Doyoupreheatleftoverfood)] <- "No preheated food"  # change NAs to no left over food
clg$Washhandsaftertoiletorlatrine<- as.factor(clg$Washhandsaftertoiletorlatrine)
clg$Evenonurination<- as.factor(clg$Evenonurination) ### not likely to be independent from presvious question
clg$Soapandwatertowashhands<- as.factor(clg$Soapandwatertowashhands) ### not likely to be independent from presvious question
clg$Usetoiletpaper<- as.factor(clg$Usetoiletpaper) ### 
clg$Handlechildrenswaste<- as.factor(clg$Handlechildrenswaste) ### 
clg$Wherefoodisprepared<- as.factor(clg$Wherefoodisprepared) ### 
clg$Eatreadytoeatfoods<- as.factor(clg$Eatreadytoeatfoods) ### 
clg$Howdoyoupreparethevegetables<- as.factor(clg$Howdoyoupreparethevegetables) ###
clg$Howdoyoupreparethevegetables[clg$Howdoyoupreparethevegetables=="1"]<-"2"
clg$Sharethesamebowlwhileeatingfoodesp<- as.factor(clg$Sharethesamebowlwhileeatingfoodesp) ### 
clg$Doyouhaveanimals<- as.factor(clg$Doyouhaveanimals) ### 
clg$Feverishconditions<- as.factor(clg$Feverishconditions) ### 
clg$Diarrhoea<- as.factor(clg$Diarrhoea) ### 
clg$Constipation<- as.factor(clg$Constipation) ### 
clg$Malaise<- as.factor(clg$Malaise) ### 
clg$Abdominalpains<- as.factor(clg$Abdominalpains) ### 
clg$Swollenpainfulabdomen<- as.factor(clg$Swollenpainfulabdomen) ### 
clg$Vomiting<- as.factor(clg$Vomiting) ### 
clg$Painfulabdomennotswollen<- as.factor(clg$Painfulabdomennotswollen) ### 
clg$Blood<- as.factor(clg$Blood) ### 
clg$Stool<- as.factor(clg$Stool) ### 
clg$Urine<- as.factor(clg$Urine) ### 
clg$Saliva<- as.factor(clg$Saliva)
clg$Treatdrinkingwaterpractice<- as.factor(clg$Treatdrinkingwaterpractice)
clg$Dontshareutensilswiththevictim<- as.factor(clg$Dontshareutensilswiththevictim)
clg$Eatfoodwhilehot<- as.factor(clg$Eatfoodwhilehot)
clg$Dontsharethesamebowloffoodwiththevictim<- as.factor(clg$Dontsharethesamebowloffoodwiththevictim)###only on positive
clg$Noadvicegiven<- as.factor(clg$Noadvicegiven)
clg$Watersourcesharedbyanimals<- as.factor(clg$Watersourcesharedbyanimals)
clg$Humanscandirectlycontaminatewateratsource<- as.factor(clg$Humanscandirectlycontaminatewateratsource)
clg$Humansindirectcontactwithatwaterwhilefetching<- as.factor(clg$Humansindirectcontactwithatwaterwhilefetching)
clg$Anypitlatrinesclosetowatersource[clg$Anypitlatrinesclosetowatersource==""]<-"No"
clg$Anypitlatrinesclosetowatersource<- as.factor(clg$Anypitlatrinesclosetowatersource)
clg$Freeofalgae<- as.factor(clg$Freeofalgae)
clg$Sharedwithanimals<- as.factor(clg$Sharedwithanimals)
clg$Sharedwithanimals[clg$Sharedwithanimals==""]<-"No" 
clg$Hasanimaldroppingsorfaeces<- as.factor(clg$Hasanimaldroppingsorfaeces)
clg$Foodisstoredonthefloor<- as.factor(clg$Foodisstoredonthefloor)
clg$EvidenceofinfestationwithfliesFPA<- as.factor(clg$EvidenceofinfestationwithfliesFPA)
clg$Closetothepitlatrineortoilet[clg$Closetothepitlatrineortoilet==""]<-"No"
clg$Closetothepitlatrineortoilet<- as.factor(clg$Closetothepitlatrineortoilet)
clg$PitlatrineUnderuse[clg$PitlatrineUnderuse==""]<-"No"
clg$PitlatrineUnderuse<- as.factor(clg$PitlatrineUnderuse)
clg$Contaminationintheenvironment[is.na(clg$Contaminationintheenvironment)]<-"No"
clg$Contaminationintheenvironment<- as.factor(clg$Contaminationintheenvironment)
cld<- droplevels(clg)
clg$DrinkingWaterTx <- rep(NA, nrow(clg))
clg$DrinkingWaterTx[clg$Treatdrinkingwaterbeforestorage=="No"] <- "No"
clg$DrinkingWaterTx[clg$Boiling=="Yes"] <- "Boiling"
clg$DrinkingWaterTx[clg$Chlorination=="Yes"] <- "Chlorination"
clg$DrinkingWaterTx[clg$Solardisinfection=="Yes"] <- "Solardisinfection"
clg$DrinkingWaterTx[clg$Filtration=="Yes"] <- "Filtration"

####************************************************                
#### COMMENTS ON SOME COLUMS THAT CAN BE RECORDED OR REMOVED
####************************************************
### Treatdrinkingwaterbeforestorage = YES & NO ,if yes this is how they treat colunm (Boiling   Chlorination    Solardisinfection   Filtration)
### Can modify this how do treatwater response, Don' treat == NO from before, Boiling, Chlorinating, Solar and Filtration as response for Yes
### Lackoffirewoodtoboil,Lackofmoneytobuychlorinetablets,Itisnotinherenttotreatdrinkingwater,Believethewaterissafe,Havenotime,Feelsickwhentheytaketreatedwater,Tastelessafterboiling,Unavailabilityofchlorinetablets,Fearforchildrensickness
## these are follow up responses for those who do not treat water.. asking why they do not treat..(Consider droping them or recode including response "treat water")
## Accesstosafedrinkingwater & Accesstotheuntreateddrinkingwater. Redundant question... if someone responded yes to Tap water
## WHICH ANIMALS TO HAVE PET=Dog,Guineapigs &Cat, LIVESTOCK=(Pigs,Goats,Cows), POULTRY=(Chicken ,Ducks) 
# remove useless variables!
clg <- select(clg, -Dateofinterview,-Questionnairenumber, - UniqueKey, -Nameofparticipant,-Bonemarrow,
              -Subcounty, -Parish, -Village, -Parish.latitude, -Victimshouldnotpreparefooduntildeclaredfreeoftyphoidfever,
              -Parish.longitude, -Examplesofreadytoeatfoods, -Householdmembers,
              -Positioninthehousehold, -typeofwatersource1, -Typeofwatersource2, -Notes,
              -Watersourceinspected,-Watercollectionvessel, -Speciesoffaecalsample1,
              -Fromwheredoyouwashyourhand., -Whydontyouseekmedicalattention,
              -Designatedcupkept,-Wheredoyoustoreprepredorleftoverfood,
              -FS1collected, -SalmonellaFS1, -ShigellaFS1, -EColiFS1,
              -FS2collected, -Speciesoffaecalsample2, -SalmonellaFS2,
              -ShigellaFS2, -EColiFS2, -Contaminated.fecal.sample, -PresenceofseparatevesselforDW
              - KindofseparatevesseiforDW, -Presenceofdesignatedcup -Accesstosafedrinkingwater,
              -Accesstotheuntreateddrinkingwater, -Fromwheredoyouwashyourhand.,-Wateronlytowashhands,
              -Ashandwater, -Rubagainstthesurface, -Usemaizecobs, -Usenaturalanalcleansingleaves,
              -Usewaterforanal, -Usewaterandsaopforanal, -Managementofchildrensfecalwaste,
              -Howsoondoyoueatthefoodafterpreparation, -Doyoupreheatleftoverfood
              -Wherefoodisprepared, -Howsoondoyoueatthefoodafterpreparation,    
              -Wheredoyoustoreprepredorleftoverfood,-Doyouusuallyhaveleftovers, 
              -Howdoyoudealwithleftovers,-Utensilstodryafterwashing, -Utensilsinthekitchen, 
              -Arevegetablesusuallyapartofyourdiet, -Anyfamilymemberorsomeoneclosewithtyphoid, -Noneoftheabove,
              -Fever, -Diarrhoea1, -Headache, -Shivering, -Nausea, -Constipation1, -Jointormusclepain,
              -Defaultthemedicationgiven, -Anypractisestoavoidtransmission, -Didyoufollowtheadvicegiven,
              -Watermoving, -Watertreatedatsource, -Cleanontheoutside, -Hasacover, -Freeofholesandleakages,
              -Usedforstorageofdrinkingwater,-Drinkingwaterstoragevessel, -CleanontheoutsideDW, -FreeofalgaeDW, -HasacoverDW,
              -FreeofholesandleakagesDW, -Designatedcuptodrawthedrinkingwaterseen, -Hasadoororbarrier,-Utensilsarekeptonthefloor,
              -Leftoverfoodpresent, -HandwashingfacilitypresentFPA, -HWEvidenceofuseFPA, -HWFhasassoaporashFPA, -Humanfaecalmatterclosetothearea,
              -PitlatrinePresent, -Evidenceofproperuseofpitlatrine, -Holeiscovered, -Evidentflyinfestationatpitlatrine, -Magnitudeoftheinfestationatpitlatrine,
              -Handwashingfacilitypresentatpitlatrine,-HWEvidenceofuseattoilet, -Drinkingwatersample,-SalmonellaWS1,    -EColiWS1,  -ShigellaWS1,-Watersourcesamplecollected2,
              -Typeofwatersource2, -SalmonellaDW1,  -EColiDW1, -ShigellaDW1, -Seconddrinkingwatersamplecollected,   -Notes1, -SalmonellaDW2, -EColiDW2, -ShigellaDW2,
                            -OverallContaminationofDWwithSalmonella,    -Watersourcesamplecollected,    -typeofwatersource1,        -SalmonellaWS2, -EColiWS2,  -ShigellaWS2,   
                            -Contamination.of.water.at.the.source,  -FS1collected,  -Speciesoffaecalsample1,    -SalmonellaFS1  -ShigellaFS1,   -EColiFS1,  -FS2collected,  
                            -Speciesoffaecalsample2,    -SalmonellaFS2, -ShigellaFS2,   -EColiFS2,  -Contaminated.fecal.sample, -Householdmembers_cat, -Lackoffirewoodtoboil,-Lackofmoneytobuychlorinetablets,-Itisnotinherenttotreatdrinkingwater,-Believethewaterissafe,-Havenotime,-Feelsickwhentheytaketreatedwater,-Tastelessafterboiling,-Unavailabilityofchlorinetablets,-Fearforchildrensickness, -Timesofconfirmedtyphoidfever, - Swollenpainfulabdomen, -Urine, -Stool, -Blood, -TyphoidfeverconfirmedontheveryfirstvisittoHC, -Noadvicegiven, -Vomiting, -Saliva, -Eatfoodwhilehot, -Painfulabdomennotswollen,-Dontshareutensilswiththevictim,-Evenonurination, -Feverishconditions,-Diarrhoea, -Constipation, -Malaise, -Abdominalpains, -Howoftendoyouexperiencethesesymptoms, -Seekmedicalattentionwhenwithsymptoms, -Medicalresponseafteronsetofsymptoms)
clg<- droplevels.data.frame(clg)
# univariable analysis 
library(survival)
uniques<-data.frame(name=vector("character",0), levels=vector("character",0))
for( i in 1:ncol(clg)){
  a<-length(unique(clg[,i]))
  b <- names(clg)[i]
  new <- data.frame(name=b, levels=a)
  uniques <- rbind(uniques, new)
}
uniques1 <- arrange(uniques, levels) %>% filter(levels==1)

clg <- select(clg, -which(names(clg) %in% unique(uniques1$name)))

# example model
m1 <-clogit(CaseorControl ~ Levelofeducation + strata(match), data=clg)

test<-clg %>% 
      select(-CaseorControl,
              -match) %>%   
      map(~clogit(clg$CaseorControl ~ .x + strata(clg$match),method="approximate", data=clg)) 

result <- data.frame(Variable=vector("character",0),
                     OR=vector("character",0),
                     LL=vector("character",0),
                     UL=vector("character",0),
                     pv=vector("character",0),
                     filter=vector("character",0))

for(i in 1:length(test)){
  a<-mmt(test[[i]])
  a$Variable[1] <- names(test[i])
  result <- rbind(result, a)
}

## names of variables to consider in the model arranged smaller p-value first for forward variable selection
formodel <- arrange(filter(result, filter=="yes" & pv<0.15),pv)$Variable

# dataframe to use for the model
df_model <- select(clg, formodel, CaseorControl, match)
df_model <- select(df_model,-KindofseparatevesseiforDW)
df_model <- df_model %>% drop_na()

### Forward model selection
## consider each variable in the order it appears in "formodel"
## eg the first one is formodel[1]
# AIC 254.2861
m1 <-clogit(CaseorControl ~ Treatdrinkingwaterpractice + strata(match), data=df_model, method="approximate")
#AIC(m1)

# AIC 255.9174
m2 <-clogit(CaseorControl ~ Treatdrinkingwaterpractice + Boiling + strata(match), data=df_model, method="approximate")
#AIC(m2) 

## FINAL MODEL
# AIC 253.517 <- final model
m3 <-clogit(CaseorControl ~ Treatdrinkingwaterpractice + Soapandwatertowashhands + strata(match), data=df_model, method="approximate")
#AIC(m3)
## final table
select(mmt(m3), -filter)
##                        Variable   OR   LL    UL           pv
## 1    Treatdrinkingwaterpractice                 2.741585e-12
## 2 TreatdrinkingwaterpracticeYes 9.67 5.18 18.05 0.000000e+00
## 3    SoapandwatertowashhandsYes 1.66  0.9  3.05 1.040000e-01
# AIC 256.8196
m4 <-clogit(CaseorControl ~ Treatdrinkingwaterpractice + Soapandwatertowashhands + Levelofeducation + strata(match), data=df_model, method="approximate")
#AIC(m4)

# AIC 255.4725
m5 <-clogit(CaseorControl ~ Treatdrinkingwaterpractice + Soapandwatertowashhands + N3644years + strata(match), data=df_model, method="approximate")
#AIC(m5)

# AIC 255.0848
m6 <-clogit(CaseorControl ~ Treatdrinkingwaterpractice + Soapandwatertowashhands + Washhandsaftertoiletorlatrine + strata(match), data=df_model, method="approximate")
#AIC(m6)

# AIC 254.4313
m7 <-clogit(CaseorControl ~ Treatdrinkingwaterpractice + Soapandwatertowashhands + Above65years + strata(match), data=df_model, method="approximate")
#AIC(m7)

# AIC 254.5589
m8 <-clogit(CaseorControl ~ Treatdrinkingwaterpractice + Soapandwatertowashhands + Usepapers + strata(match), data=df_model, method="approximate")
#AIC(m8)

### backward elimination
# AIC 264.2622
mall <-clogit(CaseorControl ~  Treatdrinkingwaterpractice  +  Boiling +                      Soapandwatertowashhands + Levelofeducation + N3644years + Washhandsaftertoiletorlatrine + Above65years + Usepapers + strata(match), data=df_model, method="approximate")
AIC(mall)
## [1] 264.2622
# AIC 263.6646
m9 <-clogit(CaseorControl ~  Treatdrinkingwaterpractice  +  Boiling +                      Soapandwatertowashhands + Levelofeducation + N3644years + Washhandsaftertoiletorlatrine + Above65years + strata(match), data=df_model, method="approximate")
AIC(m9)
## [1] 263.6646
# AIC 262.2305
m10 <-clogit(CaseorControl ~  Treatdrinkingwaterpractice  +  Boiling +                      Soapandwatertowashhands + Levelofeducation + N3644years + Washhandsaftertoiletorlatrine + strata(match), data=df_model, method="approximate")
AIC(m10)
## [1] 262.2305
# AIC 260.477
m11 <-clogit(CaseorControl ~  Treatdrinkingwaterpractice  +  Boiling +                      Soapandwatertowashhands + Levelofeducation + N3644years + strata(match), data=df_model, method="approximate")
AIC(m11)
## [1] 260.477
# AIC 258.4978
m12 <-clogit(CaseorControl ~  Treatdrinkingwaterpractice  +  Boiling +                      Soapandwatertowashhands + Levelofeducation + strata(match), data=df_model, method="approximate")
AIC(m12)
## [1] 258.4978
# AIC 255.273
m13 <-clogit(CaseorControl ~  Treatdrinkingwaterpractice  +  Boiling +                      Soapandwatertowashhands + strata(match), data=df_model, method="approximate")
AIC(m13)
## [1] 255.273
# AIC 255.9174
m14 <-clogit(CaseorControl ~  Treatdrinkingwaterpractice  +  Boiling + strata(match), data=df_model, method="approximate")
AIC(m14)
## [1] 255.9174
# AIC 253.517 <- Final model
m15 <-clogit(CaseorControl ~  Treatdrinkingwaterpractice  +  Soapandwatertowashhands + strata(match), data=df_model, method="approximate")
AIC(m15)
## [1] 253.517
# AIC 312.4566
m16 <-clogit(CaseorControl ~ Soapandwatertowashhands + strata(match), data=df_model, method="approximate")
AIC(m16)
## [1] 312.4566
## this is me cheating and ignoring Soapandwatertowashhands variable to see what happens. Model still seems upside down
# AIC 313.4555
m1 <-clogit(CaseorControl ~ Boiling + strata(match), data=df_model, method="approximate")
#AIC(m1)

# AIC 311.5513
m2 <-clogit(CaseorControl ~ Boiling + Soapandwatertowashhands + strata(match), data=df_model, method="approximate")
#AIC(m2) 

# AIC 309.5377 
m3 <-clogit(CaseorControl ~ Boiling + Soapandwatertowashhands + Levelofeducation + strata(match), data=df_model, method="approximate")
#AIC(m3)

# AIC 309.2024
m4 <-clogit(CaseorControl ~ Boiling + Soapandwatertowashhands + Levelofeducation + N3644years + strata(match), data=df_model, method="approximate")
#AIC(m4)

# AIC 311.5176
m5 <-clogit(CaseorControl ~ Boiling + Soapandwatertowashhands + Levelofeducation + Washhandsaftertoiletorlatrine + strata(match), data=df_model, method="approximate")
#AIC(m5)

# AIC 309.4235
m6 <-clogit(CaseorControl ~ Boiling + Soapandwatertowashhands + Levelofeducation + Above65years + strata(match), data=df_model, method="approximate")
#AIC(m6)

# AIC 308.9699 <- final model if cheat
m7 <-clogit(CaseorControl ~ Boiling + Soapandwatertowashhands + Levelofeducation + Usepapers + strata(match), data=df_model, method="approximate")
#AIC(m7)

# AIC 254.5589
m8 <-clogit(CaseorControl ~ Treatdrinkingwaterpractice + Soapandwatertowashhands + Usepapers + strata(match), data=df_model, method="approximate")
#AIC(m8)


## model selection 
#library(MuMIn)
#mall <-clogit(CaseorControl ~ . + strata(match), data=df_model)

#options(na.action = "na.fail")
#all_models<-dredge(mall,evaluate = F)
#all_models_eval<-dredge(mall,evaluate = T)
#all_models_get <-get.models(all_models_eval, subset = T)
#########
# RC Analysis were missing values are included in the dataset and the full dataset is used
##########

#Get weather data for each week between 2012-2018
weather <- read.csv("kasese_weather_20180308.csv")
weather$TotalRain <- rowSums(weather[,c( "prec0", "prec3", "prec6", "prec9", "prec12", "prec15","prec18", "prec21")])
weather$MeanRain <- rowMeans(weather[,c( "prec0", "prec3", "prec6", "prec9", "prec12", "prec15","prec18", "prec21")], na.rm = TRUE)
weather$meantempC <- rowMeans(weather[,c("temp0", "temp3", "temp6", "temp9", "temp12", "temp15",  "temp18", "temp21")], na.rm = TRUE)
weather$meanhumidity <- rowMeans(weather[,c("hum0", "hum3", "hum6", "hum9", "hum12", "hum15","hum18", "hum21")], na.rm = TRUE)
weather$Week <- as.integer(week(ymd(weather$date)))
weather$Year <- year(ymd(weather$date))
weather$Month <- as.character((month(ymd(weather$date),label=TRUE,abbr=FALSE)))
weather$Month <- factor(weather$Month,
                    levels=c("January","February","March","April","May","June","July","August","September","October","November","December"))
weather$date <- as.Date(weather$date, "%Y-%m-%d")
weekdate <- aggregate(weather$date, list(weather$Week, weather$Year), min)
colnames(weekdate) <- c("Week", "Year", "StartDate")

weather <- merge(weather, weekdate, by.weather = c("Week", "Year"), by.weekdate = c("Week", "Year"), all.x=TRUE)

weather1 <- weather[,c("date", "Year", "Week", "Month", "maxtempC", "mintempC",  "meantempC",
                      "TotalRain", "MeanRain", "meanhumidity", "StartDate")]
weekly_weather <- 
  weather1 %>% 
      group_by(Year, Week, Month, StartDate) %>%
      summarise_at(vars(-date), funs(mean(., na.rm=TRUE)))

#Getting the missing weeks for 2013-2017
weekly_weather1317 <- weekly_weather[weekly_weather$Year >= 2013 & weekly_weather$Year <= 2017,]
FULL_MOH_na <- merge(weekly_weather1317, FULL_MOH, by = c("Year", "Week", "Month"), all.x = TRUE)

#Removing extra weeks in 2017 for which we don't have case data
FULL_MOH_na <- subset(FULL_MOH_na, !(FULL_MOH_na$Year == 2017 & FULL_MOH_na$Week > 23))
dim(FULL_MOH_na)
## [1] 279  12
#Everthing up until 2016
FULL_MOH_na_2016 <- subset(FULL_MOH_na, FULL_MOH_na$Year < 2017)

#2017 
FULL_MOH_na_2017 <- subset(FULL_MOH_na, FULL_MOH_na$Year == 2017)


#Adding 0 for cases when there was no outbreaks
FULL_MOH_na_2016$Cases.in.Kasese[which(FULL_MOH_na_2016$Week >=1 &  FULL_MOH_na_2016$Week <= 13 & FULL_MOH_na_2016$Year==2013)] <-  0
FULL_MOH_na_2016$Cases.in.Uganda[which(FULL_MOH_na_2016$Week >=1 &  FULL_MOH_na_2016$Week <= 13 & FULL_MOH_na_2016$Year==2013)] <-  0
FULL_MOH_na_2016$Cases.in.Kasese[which(FULL_MOH_na_2016$Week >=39 & FULL_MOH_na_2016$Year==2014)] <-  0
FULL_MOH_na_2016$Cases.in.Kasese[which(FULL_MOH_na_2016$Week <=12 & FULL_MOH_na_2016$Year==2015)] <-  0
FULL_MOH_na_2016$Cases.in.Uganda[which(FULL_MOH_na_2016$Week >=1 &  FULL_MOH_na_2016$Week <= 13 & FULL_MOH_na_2016$Year==2013)] <-  0
FULL_MOH_na_2016$Cases.in.Uganda[which(FULL_MOH_na_2016$Week >=39 & FULL_MOH_na_2016$Year==2014)] <-  0
FULL_MOH_na_2016$Cases.in.Uganda[which(FULL_MOH_na_2016$Week <=12 & FULL_MOH_na_2016$Year==2015)] <-  0

FULL_MOH_na_2016_rain <- melt(FULL_MOH_na_2016[,c("Year", "Week", "Month","StartDate","TotalRain", "MeanRain")], id=c("Year", "Week", "Month", "StartDate"))
FULL_MOH_na_2016_temp <- melt(FULL_MOH_na_2016[,c("Year", "Week", "Month", "StartDate","maxtempC", "mintempC",  "meantempC")], id=c("StartDate","Year", "Week", "Month"))
FULL_MOH_na_2016_hum <- melt(FULL_MOH_na_2016[,c("StartDate","Year", "Week", "Month", "meanhumidity")], id=c("StartDate","Year", "Week", "Month"))

#Time series using TS                       
FULL_MOH_kasese_na_ts <- ts(FULL_MOH_na_2016$Cases.in.Kasese, start=c(2013,1), end=c(2017,1), frequency=52 ,ts.eps = getOption("ts.eps"), is.ts(x))
  ts_plot_kasese <- plot(FULL_MOH_kasese_na_ts)

FULL_MOH_uganda_na_ts <- ts(FULL_MOH_na_2016$Cases.in.Uganda, start=c(2013,1), end=c(2017,1), frequency=52 ,ts.eps = getOption("ts.eps"), is.ts(x))
  ts_plot_uganda <- plot(FULL_MOH_uganda_na_ts)

FULL_MOH_temp_na_ts <- ts(FULL_MOH_na_2016$meantempC,  start=c(2013,1), end=c(2017,1), frequency=52 ,ts.eps = getOption("ts.eps"), is.ts(x))
  ts_plot_temp <-  plot(FULL_MOH_temp_na_ts)

FULL_MOH_hum_na_ts <- ts(FULL_MOH_na_2016$meanhumidity,  start=c(2013,1), end=c(2017,1), frequency=52 ,ts.eps = getOption("ts.eps"), is.ts(x))
  ts_plot_hum <- plot(FULL_MOH_hum_na_ts)

FULL_MOH_rain_na_ts <- ts(FULL_MOH_na_2016$TotalRain,  start=c(2013,1), end=c(2017,12), frequency=52 ,ts.eps = getOption("ts.eps"), is.ts(x))
  ts_plot_rain <- plot(FULL_MOH_rain_na_ts)

FULL_MOH_2017_ts <- ts(FULL_MOH_na_2017$Cases.in.Kasese,  start=c(2017,1), end=c(2017,12), frequency=52 ,ts.eps = getOption("ts.eps"), is.ts(x))
FULL_MOH_2017_UG_ts <- ts(FULL_MOH_na_2017$Cases.in.Uganda,  start=c(2017,1), end=c(2017,12), frequency=52 ,ts.eps = getOption("ts.eps"), is.ts(x))

# #Time series using xts
# #Time series using xts
# xts_kasese <- as.xts(FULL_MOH_na_2016$Cases.in.Kasese,  order.by=FULL_MOH_na_2016$StartDate, ts.eps = getOption("ts.eps"))
# xts_uganda <- as.xts(FULL_MOH_na_2016$Cases.in.Uganda,  order.by=FULL_MOH_na_2016$StartDate, ts.eps = getOption("ts.eps"))
# xts_temp <- as.xts(FULL_MOH_na_2016$meantempC,  order.by=FULL_MOH_na_2016$StartDate, ts.eps = getOption("ts.eps"))
# xts_rain <- as.xts(FULL_MOH_na_2016$MeanRain,  order.by=FULL_MOH_na_2016$StartDate, ts.eps = getOption("ts.eps"))
# xts_hum <- as.xts(FULL_MOH_na_2016$meanhumidity,  order.by=FULL_MOH_na_2016$StartDate, ts.eps = getOption("ts.eps"))
# 
# xts_kasese_2017 <- as.xts(FULL_MOH_na_2017$Cases.in.Kasese,  order.by=FULL_MOH_na_2017$StartDate, ts.eps = getOption("ts.eps"))

#===================================================
# PLOT FOR IMPUTE MISSING DATA FOR TIME SERIES time series
#===================================================
  
imp_kalman_kasese_na <- na.kalman(FULL_MOH_kasese_na_ts) ## Kasese
imp_ma_kasese_na <- na.ma(FULL_MOH_kasese_na_ts)
imp_mean_kasese_na <- na.mean(FULL_MOH_kasese_na_ts)
imp_random_kasese_na <- na.random(FULL_MOH_kasese_na_ts)
imp_locf_kasese_na <- na.locf(FULL_MOH_kasese_na_ts)
imp_interpolation_kasese_na <- na.interpolation(FULL_MOH_kasese_na_ts)


imp_kalman_Uganda_na <- na.kalman(FULL_MOH_uganda_na_ts)## Uganda
imp_ma_Uganda_na <- na.ma(FULL_MOH_uganda_na_ts)
imp_mean__Uganda_na <- na.mean(FULL_MOH_uganda_na_ts)
imp_random_Uganda_na <- na.random(FULL_MOH_uganda_na_ts)
imp_locf_Uganda_na <- na.locf(FULL_MOH_uganda_na_ts)
imp_interpolation_Uganda_na <- na.interpolation(FULL_MOH_uganda_na_ts)

imp_kalman_kasese_tx <- na.kalman(FULL_MOH_na_2016$Cases.in.Kasese)
imp_ma_kasese_na_tx <- na.ma(FULL_MOH_na_2016$Cases.in.Kasese)
imp_mean_kasese_na_tx <- na.mean(FULL_MOH_na_2016$Cases.in.Kasese)
imp_random_kasese_na_tx <- na.random(FULL_MOH_na_2016$Cases.in.Kasese)
imp_locf_kasese_na_tx <- na.locf(FULL_MOH_na_2016$Cases.in.Kasese)
imp_interpolation_kasese_na_tx <- na.interpolation(FULL_MOH_na_2016$Cases.in.Kasese)

imp_dataset_kasese <- data.frame(StartDate=FULL_MOH_na_2016$StartDate,
                          OriginalTS = FULL_MOH_na_2016$Cases.in.Kasese,
                          Kalman = imp_kalman_kasese_tx,
                          MA = imp_ma_kasese_na_tx,
                         # Mean = imp_mean_kasese_na_tx,
                         # Random = imp_random_kasese_na_tx,
                          LOCF = imp_locf_kasese_na_tx,
                          Interpolation = imp_interpolation_kasese_na_tx)
imp_dataset_kasese1 <- melt(imp_dataset_kasese, id="StartDate")
imp_dataset_kasese1$Week <- as.integer(week(ymd(imp_dataset_kasese1$StartDate)))
imp_dataset_kasese1$Year <- year(ymd(imp_dataset_kasese1$StartDate))
 
# plotNA.imputations(x.withNA = FULL_MOH_na_2016$Cases.in.Kasese, x.withImputations = na.seadec(FULL_MOH_na_2016$Cases.in.Kasese, "kalman"))
# plotNA.imputations(x.withNA = FULL_MOH_na_2016$Cases.in.Kasese, x.withImputations = na.seadec(FULL_MOH_na_2016$Cases.in.Kasese,"ma"))

imp_kalman_uganda_na_tx <- na.kalman(FULL_MOH_na_2016$Cases.in.Uganda)
imp_ma_uganda_na_tx <- na.ma(FULL_MOH_na_2016$Cases.in.Uganda)
imp_mean_uganda_na_tx <- na.mean(FULL_MOH_na_2016$Cases.in.Uganda)
imp_random_uganda_na_tx <- na.random(FULL_MOH_na_2016$Cases.in.Uganda)
imp_locf_uganda_na_tx <- na.locf(FULL_MOH_na_2016$Cases.in.Uganda)
imp_interpolation_uganda_na_tx <- na.interpolation(FULL_MOH_na_2016$Cases.in.Uganda)

imp_dataset_uganda <- data.frame(StartDate=FULL_MOH_na_2016$StartDate,
                          OriginalTS = FULL_MOH_na_2016$Cases.in.Uganda,
                          Kalman = imp_kalman_uganda_na_tx,
                          MA = imp_ma_uganda_na_tx,
                        #  Mean = imp_mean_uganda_na_tx,
                        #  Random = imp_random_uganda_na_tx,
                          LOCF = imp_locf_uganda_na_tx,
                          Interpolation = imp_interpolation_uganda_na_tx)
imp_dataset_uganda1 <- melt(imp_dataset_uganda, id="StartDate")
imp_dataset_uganda1$Week <- as.integer(week(ymd(imp_dataset_uganda1$StartDate)))
imp_dataset_uganda1$Year <- year(ymd(imp_dataset_uganda1$StartDate))
# 
# # Kasese_2017_ts <- ts(FULL_MOH_na_2017$Cases.in.Kasese, start=c(2017, 1), end=c(2017, 12), frequency=52,ts.eps = getOption("ts.eps"), is.ts(x))
# # plot(Kasese_2017_ts)
# 
# 
# #Plotting the data
imput_kasese_plot <- ggplot(imp_dataset_kasese1, aes(x=Week, y=value, col=variable)) +
                          geom_line(stat = "identity") +
                          facet_grid(.~Year) +
                          scale_y_continuous("Number of cases (Kasese)") + 
                          scale_color_discrete(name="",
                                              breaks=c("OriginalTS", "Kalman", "MA", "LOCF", "Interpolation"),
                                              labels=c("Original time series", "Kalman smoothing", "Weighted moving average",
                                                       "Last observation carried forward", "Interpolation")) +
                          scale_x_continuous("") + annotate("text", x = c(16,19), y = 28, label = "*") ### Rebecca please add these annotations for flooding reported for  (week 16 in 2013, 19 in 2014, 17 & 27 in 2015, 15 in 2016)
imput_uganda_plot <- ggplot(imp_dataset_uganda1, aes(x=Week, y=value, col=variable)) +
                          geom_line(stat = "identity") +
                          facet_grid(.~Year) +
                          scale_y_continuous("Number of cases (Uganda)") +
                          scale_color_discrete(name="",
                                              breaks=c("OriginalTS", "Kalman", "MA", "LOCF", "Interpolation"),
                                              labels=c("Original time series", "Kalman smoothing", "Weighted moving average",
                                                       "Last observation carried forward", "Interpolation")) +
                          scale_x_continuous("")
rain_plot <- ggplot(FULL_MOH_na_2016_rain, aes(x = Week, y = value, col=variable)) +
                  geom_line(stat = "identity") +
                  scale_y_continuous("Rainfall (mm)") +
                  facet_grid(.~Year) +
                  scale_color_discrete(name="",
                                       breaks=c("TotalRain", "MeanRain"),
                                        labels=c("Total rainfall", "Mean rainfall")) +
                  scale_x_continuous("")
temp_plot <- ggplot(FULL_MOH_na_2016_temp, aes(x = Week, y = value, col=variable)) +
                  geom_line(stat = "identity") +
                  scale_y_continuous(expression(paste("Temperature ",(degree*C)))) +
                  facet_grid(.~Year) +
                  scale_color_discrete(name="",
                                       breaks=c("maxtempC", "mintempC", "meantempC"),
                                        labels=c("Maximum temperature", "Minimum temperature", "Mean temperature")) +
                  scale_x_continuous("")
humidity_plot <- ggplot(FULL_MOH_na_2016_hum, aes(x = Week, y = value, col=variable)) +
                      geom_line(stat = "identity") +
                      scale_y_continuous("Humidity (%)") +
                      facet_grid(.~Year) +
                      scale_color_discrete(name="",
                                       breaks=c("meanhumidity"),
                                        labels=c("Mean humidity"))

# 
# 
#grid.arrange(imput_kasese_plot, imput_uganda_plot, rain_plot, temp_plot, humidity_plot, nrow = 5, common.legend = TRUE, legend="bottom")----
pdf("../../Figures/Imputed_data.pdf",width = 10,height = 14)
multiplot(imput_kasese_plot, imput_uganda_plot,rain_plot,temp_plot,humidity_plot, cols = 1)
dev.off()
## quartz_off_screen 
##                 2
ts_meanrain <- ts(FULL_MOH_na_2016$MeanRain)
ts_totalrain <- ts(FULL_MOH_na_2016$TotalRain)
ts_maxtemp <- ts(FULL_MOH_na_2016$maxtempC)
ts_mintemp <- ts(FULL_MOH_na_2016$mintempC)
ts_meantemp <- ts(FULL_MOH_na_2016$meantempC)
ts_meanhum <- ts(FULL_MOH_na_2016$meanhumidity)

####*******************************************
#### TS CROSS CORRELATION WITH WEATHER IN KASESE
####*******************************************

ccf(imp_kalman_kasese_na,FULL_MOH_rain_na_ts,type = "correlation",plot = TRUE) 
ccf(imp_kalman_kasese_na,FULL_MOH_rain_na_ts,type = "correlation",plot = TRUE) 

ccf(imp_kalman_kasese_na,FULL_MOH_temp_na_ts,type = "correlation",plot = TRUE) 

#ccf(imp_kalman_kasese_na,ts_mintemp,type = "correlation",plot = TRUE) 
#ccf(imp_kalman_kasese_na,ts_meantemp,type = "correlation",plot = TRUE) 
ccf(imp_kalman_kasese_na,FULL_MOH_hum_na_ts,type = "correlation",plot = TRUE) 

####*******************************************
#### SEASONALITY ANALYSIS UG AND KASESE
####*******************************************

myts_NAT12 <- ts(FULL_MOH_na_2016$Cases.in.Uganda, start=c(2013, 1), end=c(2017, 1), frequency=12,ts.eps = getOption("ts.eps"), is.ts(x))  ### NATIONAL AT
myts_DIS12 <- ts(FULL_MOH_na_2016$Cases.in.Kasese, start=c(2013, 1), end=c(2017, 1), frequency=12,ts.eps = getOption("ts.eps"), is.ts(x))  ### DISTRICT AT MOTHLY FREQUENCY
myts_RAIN12 <- ts(FULL_MOH_na_2016$TotalRain, start=c(2013, 1), end=c(2017, 1), frequency=12,ts.eps = getOption("ts.eps"), is.ts(x))  ### RAIN AT DISTRICT MONTHLY
myts_TEMP12 <- ts(FULL_MOH_na_2016$maxtempC, start=c(2013, 1), end=c(2017, 1), frequency=12,ts.eps = getOption("ts.eps"), is.ts(x))  ### TEMPERATURE AT DISTRICT MONTHLY
myts_HUM12<- ts(FULL_MOH_na_2016$meanhumidity, start=c(2013, 1), end=c(2017, 1), frequency=12,ts.eps = getOption("ts.eps"), is.ts(x))  ### HUMIDITY AT DISTRICT MONTHLY

# fit_NAT_1 <- ets(myts_NAT12)
# res <- cbind(Residuals = residuals(fit_NAT_1), 
#              Response.residuals = residuals(fit_NAT_1, type='response'))
# autoplot(res, facets=TRUE)
# 
# fit_DIS_1 <- ets(myts_DIS12)
# res <- cbind(Residuals = residuals(fit_DIS_1), 
#              Response.residuals = residuals(fit_DIS_1, type='response'))
# autoplot(res, facets=T)

imp_kalman_myts_NAT12 <- na.kalman(myts_NAT12) ## National impute season
imp_kalman_myts_DIS12 <- na.kalman(myts_DIS12) ## Kasese impute season

R5<-ggseasonplot(imp_kalman_myts_NAT12, polar=TRUE) + theme_bw() + labs(title= "Typhoid fever in Uganda ")
R1<-ggseasonplot(imp_kalman_myts_DIS12, polar=TRUE) + theme_bw() + labs(title= "Typhoid fever in Kasese District")
R2<-ggseasonplot(myts_RAIN12, polar=TRUE) + theme_bw() + labs(title= "Rainfall in Kasese District")
R3<-ggseasonplot(myts_TEMP12, polar=TRUE) + theme_bw() + labs(title= "Temperature in Kasese District")
R4<-ggseasonplot(myts_HUM12, polar=TRUE) + theme_bw() + labs(title= "Humidity in Kasese District")

multiplot(R1,R2,R3,R4, cols = 2)

pdf("../../Figures/Seasonality_Kasese.pdf",width = 8,height = 6)

multiplot(R1,R2,R3,R4, cols = 2)

dev.off()
## quartz_off_screen 
##                 2
####**********************
#### TS MODEL FITTING
####**********************

fit_kas<-auto.arima(imp_kalman_kasese_na) ### Kasese typhoid data TS fitting using kalman imputed data

checkresiduals(fit_kas)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)
## Q* = 73.2, df = 103, p-value = 0.9884
## 
## Model df: 1.   Total lags used: 104
fit_UG<-auto.arima(imp_kalman_Uganda_na) ### Kasese typhoid data TS fitting using kalman imputed data
checkresiduals(fit_UG)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,2)(0,0,1)[52]
## Q* = 97.095, df = 101, p-value = 0.5915
## 
## Model df: 3.   Total lags used: 104
####**********************
#### FORCASTING####
####**********************

Kas_for_17_intp <- forecast(object = imp_interpolation_kasese_na, h = 16, level=c(90,95))
Kas_for_17_kal <- forecast(object = imp_kalman_kasese_na, h = 16, level=c(90,95))
Kas_for_17_ma <- forecast(object = imp_ma_kasese_na, h = 16, level=c(90,95))

Ug_for_17_intp <- forecast(object = imp_interpolation_Uganda_na, h = 16, level=c(90,95))
Ug_for_17_kal <- forecast(object = imp_kalman_Uganda_na, h = 16, level=c(90,95))
Ug_for_17_ma <- forecast(object = imp_ma_Uganda_na, h = 16, level=c(90,95))


Plot_Kas_for_17<-autoplot(Kas_for_17_intp , series="interpolation 2013-2016") +
 theme_bw() + labs(title=" A Forecast of Typhoid cases in Kasese district  for the 1st  quarter of 2017", y= "Typhoid cases")  + 
  autolayer(Kas_for_17_kal, series="Forecast Kalman") + 
  autolayer(FULL_MOH_2017_ts, series="2017 DATA") +
  autolayer(fitted(Kas_for_17_ma), series="Forecast Moving Average") +
  autolayer(fitted(Kas_for_17_intp), series="Forecast Interpolation")+
  autolayer(fitted(Kas_for_17_kal), series="fitted") 

Plot_Ug_for_17<-autoplot(Ug_for_17_intp , series="interpolation 2013-2016") +
 theme_bw() + labs(title=" A Forecast of Typhoid cases in Uganda  for the 1st  quarter of 2017", y= "Typhoid cases")  + 
  autolayer(Ug_for_17_kal, series="Forecast Kalman") +
  autolayer(FULL_MOH_2017_UG_ts, series="2017 DATA") +
  autolayer(fitted(Ug_for_17_ma), series="Forecast Moving Average") +
  autolayer(fitted(Ug_for_17_intp), series="Forecast Interpolation")+
  autolayer(fitted(Ug_for_17_intp), series="fitted") 

multiplot(Plot_Ug_for_17,Plot_Kas_for_17, cols = 1)

pdf("../../Figures/Forecasting.pdf",width = 12,height = 6)

multiplot(Plot_Ug_for_17,Plot_Kas_for_17, cols = 1)

dev.off()
## quartz_off_screen 
##                 2
#http://sma.epfl.ch/~lbelzile/math342/pract_sup.html

#======
#Non-Linear regession using R_INLA
#=======
# hist(FULL_MOH_na_2016$Cases.in.Kasese)
# hist(FULL_MOH_na_2016$MeanRain)
# plot(FULL_MOH_na_2016$Cases.in.Kasese, FULL_MOH_na_2016$MeanRain)
# 
# #No seasonal affects
# formula <- Cases.in.Kasese ~ 1 + MeanRain
# model.linear_rain <- inla(formula, family="gaussian", data= FULL_MOH_na_2016, control.predictor = list(compute=TRUE))
# round(model.linear_rain$summary.fixed[,1:5],3) #Estimates of the marginal posterior mean
# plot(model.linear_rain$marginals.fixed$`(Intercept)`[,1], model.linear_rain$marginals.fixed$`(Intercept)`[,2]) #Posterior density plot for the intercept 
# plot(model.linear_rain$marginals.fixed$`MeanRain`[,1], model.linear_rain$marginals.fixed$`MeanRain`[,2]) #Posterior density plot for the slope  
# res.lin <- (FULL_MOH_na_2016$Cases.in.Kasese - model.linear_rain$summary.fitted.values$mean) / model.linear_rain$summary.fitted.values$sd #Standardized residuals - the model departs from the observations
#   plot(res.lin,ylim=c(-15,20),main="",xlab="Weeks",ylab=expression((y[i] - hat(mu[i]))/hat(sigma[i])),xaxt="n") #Could possibly be a pattern??
#   
#   
# 
# #Seaonal affects
# formula2 <- Cases.in.Kasese ~ 1 + MeanRain + Month
# model.linear_rain2 <- inla(formula2, family="gaussian", data= FULL_MOH_na_2016, control.predictor = list(compute=TRUE))
# res.lin2 <- (FULL_MOH_na_2016$Cases.in.Kasese - model.linear_rain2$summary.fitted.values$mean) / model.linear_rain2$summary.fitted.values$sd #Standardized residuals 
#     plot(res.lin2,ylim=c(-10,10),main="",xlab="Weeks",ylab=expression((y[i] - hat(mu[i]))/hat(sigma[i])),xaxt="n") #Could possibly be a pattern??
# 
# #Seasonal affects with a random walk
# formula3 <- Cases.in.Kasese ~ 1 + MeanRain + Month + f(row.names(FULL_MOH_na_2016), model="rw2", hyper = list(prec = list(prior="loggamma", param=c(1,0.01))))
# model.linear_rain3 <- inla(formula3, family="gaussian", data= FULL_MOH_na_2016, control.predictor = list(compute=TRUE))
# res.lin3 <- (FULL_MOH_na_2016$Cases.in.Kasese - model.linear_rain3$summary.fitted.values$mean) / model.linear_rain3$summary.fitted.values$sd #Standardized residuals 
#     plot(res.lin3,ylim=c(-10,10),main="",xlab="Weeks",ylab=expression((y[i] - hat(mu[i]))/hat(sigma[i])),xaxt="n")