# This is code to perform the analyses and figures for the 2019 IJGIS paper 
# "Spatio-temporal modeling of PM2.5 concentrations with missing data problem: 
#  a case study in Beijing, China"
#
# Codes were developed by Qiang Pu and Enki Yoo
# Created  09/02/2019

library(nlme)

# set your directory:
setwd(' ')

modelingset <- read.csv('modelingdataset.csv', header = T)
names(modelingset)

## -------------------------------------------------------------------------- ##
# Covariates used in the spatio-temporal LME model 
# (see the appendix in the paper for details)
#
# 1. The data set DTB_3K AOD was substituted with publically available data 
#     MOD/MYD04_3K in the current example.
# 2. The spatial prediction is conducted over a subset of study domain 
# 3. Consequently, the model fit/prediction results may be slightly different 
#   from the reports due to the changes made in the input dataset.


## Model inputs:
modelingset$f.DOY = factor(modelingset$f.DOY)
modelingset$f.region = factor(modelingset$f.region)
modelingset.c = modelingset[complete.cases(modelingset),]

#
# IPW based on yourdaily satellite AOD data: 
# HasAOD is binary 1-AOD retrieved, 0-AOD is missing;
# A simple logistic regression is used to retrieve probability 
# of retrieving AOD values.
#
# aodmissing <- glm(HasAOD ~scale.temp + log.BLH + log.prs + f.region + log.dem + log.Forest_2,
#                    data = obs.df4, 
#                    family  = binomial(link ="logit"))
# dailygrid$predpr <- predict(aodmissing,type=c("response")) # probability
# #create weights:
# dailygrid$ipw.weight <-ifelse(dailygrid$HasAOD==1,1/(dailygrid$predpr),
#                               1/(1-dailygrid$predpr))
# # then match the weights for each grid back to your modeling set;
# ipw.weight is the processed weights for this example.

#
frm.1 = formula(log.pm25 ~ log.AOD3K + log.BLH
                + scale.temp +log.hum  + log.wind
                + log.Forest_2 + log.build4  )
#
# With IPW:
lme.pm = lme(frm.1, random = list(~1+log.AOD3K|f.region, ~1+log.AOD3K|f.DOY),
             na.action=na.omit,
             control = lmeControl(opt = "optim"),
             weights = ~ipw.weight,
             data = modelingset.c)
summary(lme.pm)
#
fittedpm <- data.frame(exp(lme.pm$fitted))
# performance at model fitting : 
thisMSE <- sum((fittedpm[,3]-exp(modelingset.c[,7]))^2)/(nrow(fittedpm))
thisRMSE <- sqrt(thisMSE)
thisR2 <- 1 - (sum((exp(modelingset.c[,7])-fittedpm[,3])^2)/
                 sum((exp(modelingset.c[,7])-mean(exp(modelingset.c[,7])))^2))

###########  grid prediction LME:  ###########  
## read in Grid data:
dailygriddata <- read.csv('griddata.csv', header = T)
dailygriddata$f.DOY <- factor(dailygriddata$f.DOY)
dailygriddata$f.region <- factor(dailygriddata$f.region)
#
names(dailygriddata)
# all covariates along with lmepredict - predicted PM2.5 using LME

# avoid NA in covariates:
griddata <- dailygriddata[complete.cases(dailygriddata[,4]),]
gridprediction.lme <- data.frame(predict(lme.pm, 
                                         newdata = griddata,
                                         level=0:2))
# get grid prediction using the LME model
griddata$lmepredict = exp(gridprediction.lme$predict.f.DOY)
sum(is.na(dailygriddata$lmepredict)) #  1397 grid cell without prediction
#

##############################################################
########         Gap-filling suing  INLA - SPDE       ########       

# scripts below are adopted and revised from:
# Cameletti, M., Lindgren, F., Simpson, D. et al. https://doi.org/10.1007/s10182-012-0196-3
library(INLA)
library(fields)
library(abind)

# PM monitoring stations:
coordinates <- read.csv('stationLocations.csv', header=TRUE,sep=",") 
coordinates$Xkm <- coordinates$X/1000
coordinates$Ykm <- coordinates$Y/1000
##--- Borders of JJJ region (in km)
borders <- read.csv("border.csv",header=TRUE,sep=",") 
#
n_stations <- nrow(coordinates)
n_days <- length(unique(modelingset$date)) # 91days as example.
#
coordinates.all <- as.matrix(modelingset[modelingset$station,
                                              c('Xkm','Ykm')])
#
## Triangulation using borders
mesh =
  inla.mesh.create.helper(points=cbind(coordinates$Xkm,
                                       coordinates$Ykm),
                          points.domain=borders,
                          offset=c(10, 50),
                          max.edge=c(50, 1000),
                          min.angle=c(26, 21),
                          cutoff=0, 
                          plot.delay=NULL
  )
length(mesh$loc)
# plot(mesh, col='blue')
# points(borders,  col='blue',lwd=0.1, cex=0.1)

# Construct the SPDE object
spde = inla.spde2.matern(mesh=mesh ,alpha=2)
#-- Create A matrices for the estimation part --# 
A.est =
  inla.spde.make.A(mesh,
                   loc=coordinates.all,
                   group=modelingset$pgroup,
                   n.group=n_days)
# check non-zeros:
dim(A.est)
table(apply(A.est,1,nnzero) >0 ) 

# Observation structure for field prediction
i_day <- 3 # prediction for 2016-05-03 as an example, this is the # in the group

samplegrid <- dailygriddata[,c(1:3)]
samplegrid$Xkm <- samplegrid$X/1000
samplegrid$Ykm <- samplegrid$Y/1000

# griddata <- filter(dailygriddata, gridnum %in% samplegrid$gridnum)
A.pred =
  inla.spde.make.A(mesh, 
                   loc=as.matrix(samplegrid[,c('Xkm','Ykm')]),
                   group=i_day, 
                   n.group=n_days)

# disregard the warning here:
field.indices =
  inla.spde.make.index("field",
                       n.spde = spde$n.spde,# n.mesh=mesh$n,
                       n.group=n_days)

# create stack obj. for the spatio-temporal model
stack.est =
  inla.stack(data=list(logPM25=modelingset$log.mergeobslme),
             A=list(A.est, 1),
             effects=
               list(c(field.indices,list(Intercept=1)),
                    list(modelingset[,c(9:24)])),
             tag="est")

stack.pred =
  inla.stack(data=list(logPM25=NA),
             A=list(A.pred, 1),
             effects=
               list(c(field.indices,list(Intercept=1)),
                    list(dailygriddata[,c(5:21)])),
             tag="pred")

stack <- inla.stack(stack.est, stack.pred)

# Note: INLA-SPDE as a gap-filling method
formula <- (logPM25 ~ -1 + Intercept + log.BLH + log.Forest_2 + log.hum +log.roadlength+
              f(field, model=spde, group=field.group, control.group=list(model="ar1")))

## ################################
## Call INLA and get results   
## It may take a while ...
mod.mode =
  inla(formula,
       data=inla.stack.data(stack.est, spde=spde),
       family="gaussian",
       control.predictor=list(A=inla.stack.A(stack.est), compute=FALSE),
       control.compute=list(cpo=FALSE),
       keep=FALSE, verbose=TRUE,
       control.inla=list(reordering="metis"))

mod =
  inla(
    formula,
    data = inla.stack.data(stack, spde = spde),
    family = "gaussian",
    control.predictor = list(A = inla.stack.A(stack), compute = TRUE),
    control.compute = list(cpo = TRUE, dic = TRUE),
    control.mode = list(theta = mod.mode$mode$theta, restart = FALSE),
    keep = FALSE,
    verbose = TRUE,
    control.inla = list(reordering = "metis")
  )

##--- Results for fixed effects - covariate coeffs
beta = mod$summary.fixed[,"mean"]

# Grid results
index.pred = inla.stack.index(stack, "pred")$data
# posterior mean:
lp_grid_mean = matrix(mod$summary.linear.predictor[index.pred, "mean"], 3248, 1, byrow =
                        T) #### by row!!!!!!!

spdepred <- data.frame(exp(lp_grid_mean)) # grid prediction of PM2.5 concentration
summary(spdepred)

# Next - Finalize by combining both gap-filled and LME prediction: spdepred and griddata$lmepredict