# Prepare occurrence and climate data for SDM tuning and modeling

library(raster)
library(usdm)
library(rgeos)
library(blockCV)

source('scripts/SDM/maxentTuningFxn.R')

# change temporary directory so I can more actively manage it
tmpPath <- '~/maxentTemp'

if (!dir.exists(tmpPath)) {
	dir.create(tmpPath)
}

tempdir()
unixtools::set.tempdir(tmpPath)


climDir <- 'data/SDM/env/chelsa/current'
anthropoDir <- 'data/SDM/env/anthropo/mosaics'
landcoverDir <- 'data/SDM/env/earthEnvLandCover'

# read in species range
jayRange <- readOGR('utility/Cyanocitta_stelleri_shp/Cyanocitta_stelleri.shp')
jayRange <- gSimplify(jayRange, tol = 0.1, topologyPreserve=T)


blueRamp <- colorRampPalette(c('gray97', 'blue'))

# convenience function for printing list of predictors
varPrint <- function(set) {
	cat('\n')
	for (i in 1:length(set)) cat('\t', set[i], '\n')
	cat('\n')
}

# ----------------------------------------------------------------
# Processing occurrence records
## Occurrence filtering done in script prepOcc.R

# load occurrences
allOcc <- readRDS('data/SDM/occ5.rds')
allOcc <- allOcc[, c('decimalLongitude','decimalLatitude')]
colnames(allOcc) <- c('Longitude','Latitude')
# -----------------------------------------------------------------
# Process climatic predictors

# load climate grids
clim <- list.files(climDir, full.names=T)
clim <- clim[ - grep('tmin|tmax|tmean|precip|solrad|monthlyPET', clim)] # remove monthly rasters
clim <- clim[ - grep('summer|winter', clim, ignore.case=TRUE)]
clim <- clim[ - grep('monthCountByTemp10', clim)] # this is a discrete variable which we won't use
clim <- clim[ - grep('aridityIndexThornthwaite', clim)] # this is redundant with climatic moisture index
clim <- stack(clim)

names(clim)

# Coarsen from 1x1 km to 2x2 km
clim <- aggregate(clim, fact = 2)


# drop any records that fall outside of climate data
e <- extract(clim[[1]], allOcc)
allOcc <- allOcc[which(!is.na(e)),]





#####

# # load anthropogenic layers: distance from roads, distance from populated places, distance from urban environments, distance from protected areas
anthropo <- stack(list.files(anthropoDir, full.names=T))
anthropo <- resample(anthropo, clim[[1]])

# explore relationship between occurrences and these grids. The more important the bias, the more we would expect occurrences to be close to these particular features.
e <- extract(anthropo, allOcc)
e <- e[complete.cases(e),]
apply(e, 2, max)
apply(e, 2, median)

# Looking at the distance to features across all occurrences, in terms of both median and max distance from occurrence to feature, distance to roads is smallest.
# This indicates that distance to roads may be a good predictor of sampling bias. We will therefore use it when sampling pseudo-absences.
anthropoBias <- anthropo$roadsDistance

# We will limit the training region to the geographic range, buffered by 500km
## This is to better differentiate between environments that the species occupies, and environments that it could access but doesn't.
EAproj <- '+proj=aea +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs'
jayRangeEA <- spTransform(jayRange, CRS(EAproj))
hull <- gConvexHull(jayRangeEA)
bufferedHull <- gBuffer(hull, width=500000)
bufferedHull <- spTransform(bufferedHull, CRS('+proj=longlat +datum=WGS84'))

# create plot to show training region and occurrences
# png('trainingRegion_and_Occ.png', width=12, height=9.3, units='in', res=300, bg='transparent')
# 	plot(clim[[3]])
# 	plot(jayRange, add=TRUE, lwd=0.5)
# 	plot(bufferedHull, add=TRUE, lwd=0.5, lty=3)
# 	points(allOcc, cex=0.1, col='blue')
# dev.off()

rm(anthropo)
rm(jayRangeEA)
gc()

# generate pseudo-absences (properly samples according to different cell sizes with latitude)
# sample in proportion to distance to anthropogenic feature

# distance to roads is 0 at roads, but when sampling pseudo-absences, we will want greatest probability of sampling at roads,
# so we need to invert the distance and rescale from 0 to 1.
anthropoBias <- mask(anthropoBias, mask(clim[[1]], bufferedHull))
values(anthropoBias) <- rescale(values(anthropoBias), to = c(1,0))

set.seed(1) # for reproducibility

bg <- as.data.frame(sampleRast(anthropoBias, n=100000))
colnames(bg) <- c('Longitude','Latitude')

PA <- rbind(allOcc, bg)
PA <- SpatialPoints(PA, proj4string=CRS('+proj=longlat +datum=WGS84'))
PAbin <- c(rep(1, nrow(allOcc)), rep(0, nrow(bg)))

# for comparisons take the 'default' approach of sampling pseudo-absences randomly (but still proportionally to cell size)
bg2 <- as.data.frame(sampleRast(anthropoBias, n=100000, prob=FALSE))
colnames(bg2) <- c('Longitude','Latitude')

PA2 <- rbind(allOcc, bg2)
PA2 <- SpatialPoints(PA2, proj4string=CRS('+proj=longlat +datum=WGS84'))
PAbin2 <- c(rep(1, nrow(allOcc)), rep(0, nrow(bg2)))

# save the presence/pseudo-absences for reproducability
saveRDS(list(PA, PAbin), 'data/SDM/PA.rds')
saveRDS(list(PA2, PAbin2), 'data/SDM/PA2.rds')

PA <- readRDS('data/SDM/PA.rds')[[1]]
PAbin <- readRDS('data/SDM/PA.rds')[[2]]

# ran <- c(sample(which(PAbin == 1), 500), sample(which(PAbin == 0), 2000))
# PA <- PA[ran]
# PAbin <- PAbin[ran]


# swd <- extract(clim, PA)
# --------------------------------------------------------
# Variable reduction
## Step 1: reduce number of variables according to correlations and VIF
## This function will identify each pair of predictors with a correlation coefficient of 0.8 or greater, and drop the one that contributes the most VIF
varcor <- vifcor(clim, th = 0.8, maxobservations = 10000)
vifVar <- as.character(varcor@results$Variables)

varPrint(vifVar)
  
# result from vifcor() above
vifVar <- c("CHELSA_bioclim_02", "CHELSA_bioclim_07", "CHELSA_bioclim_14", "CHELSA_bioclim_15", "CHELSA_bioclim_18", "CHELSA_bioclim_19", "climaticMoistureIndex", "embergerQ", "minTempWarmest", "PETDriestQuarter", "PETseasonality", "PETWettestQuarter")

	
	
##########################################################
	# Generate blocks for cross-validation
	PAdf <- cbind.data.frame(bin = PAbin, coordinates(PA))
	PAdf <- sf::st_as_sf(PAdf, coords = c("Longitude", "Latitude"), crs = crs(PA))
	
	co1 <- ncf::correlog(allOcc[,1], allOcc[,2], raster::extract(clim[[1]], allOcc),increment = 1, resamp = 0, latlon = T, na.rm=TRUE)
	zeroSA <- co1$x.intercept # this is in km
	
	blocks <- spatialBlock(PAdf, rasterLayer = clim[[1]], theRange = zeroSA*1000, k=5, biomod2Format=F, showBlocks=FALSE)

	# if eval blocks from blockCV package, reorganize to structure used by ENMeval::get.block
	if (inherits(blocks, 'SpatialBlock')) {
		folds <- list(occ.grp = numeric(length(which(PAbin == 1))), bg.grp = numeric(length(which(PAbin == 0))))
		for (i in 1:max(blocks$foldID)) {
			blockPres <- intersect(blocks$folds[[i]][[2]], which(PAbin == 1))
			blockBG <- intersect(blocks$folds[[i]][[2]], which(PAbin == 0)) - length(which(PAbin == 1))
			folds[[1]][blockPres] <- i
			folds[[2]][blockBG] <- i
		}
	} else {
		folds <- blocks
	}
	
	nfolds <- max(folds[[1]])
	

	
# # 	
# plot blocks
# plot(clim[[1]])
# for (i in 1:max(blocks$foldID)) {
# 	blockPres <- intersect(blocks$folds[[i]][[2]], which(PAbin == 1))
# 	plot(PA[blockPres,], col=rainbow(max(blocks$foldID))[i], cex=0.1, add=TRUE)
# }

# ----------------------------------------------------------

# combine and save

swd <- as.data.frame(extract(clim, PA))

foldVec <- c(folds[[1]], folds[[2]])

PAdat <- cbind.data.frame(folds = foldVec, resp = PAbin, swd)

table(PAdat[,1], useNA='always')
table(PAdat[,2], useNA='always')

saveRDS(PAdat, 'data/SDM/PAdat.rds')





##########################################################
##########################################################
# INCLUDING LANDCOVER

# Process climatic predictors

# load climate grids
clim <- list.files(climDir, full.names=T)
clim <- clim[ - grep('tmin|tmax|tmean|precip|solrad|monthlyPET', clim)] # remove monthly rasters
clim <- clim[ - grep('summer|winter', clim, ignore.case=TRUE)]
clim <- clim[ - grep('monthCountByTemp10', clim)] # this is a discrete variable which we won't use
clim <- clim[ - grep('aridityIndexThornthwaite', clim)] # this is redundant with climatic moisture index
clim <- stack(clim)

names(clim)

# pdf('climVisual.pdf', onefile = TRUE, width=6, height=6)
# for (i in 1:nlayers(clim)) {
	# cat(i, ' ')
	# plot(clim[[i]])
	# title(main = names(clim)[i])
# }
# dev.off()

# Coarsen from 1x1 km to 2x2 km
clim <- aggregate(clim, fact = 2)


landcover <- list.files(landcoverDir, full.names = TRUE)
landcover <- landcover[order(as.numeric(gsub('(.+)(full_class_)(\\d?\\d)\\.tif$', '\\3', landcover)))]
landcover <- stack(landcover)

names(landcover) <- gsub('consensus_full_class_', 'landcover', names(landcover))

landcover <- resample(landcover, clim)

for (i in 1:nlayers(landcover)) {
	landcover[[i]][which(is.na(values(clim[[1]])))] <- NA
}

# landcover <- mask(landcover, clim[[1]])

clim <- addLayer(clim, landcover)

# # pdf('climVisual_withLandcover.pdf', onefile = TRUE, width=6, height=6)
# for (i in 1:nlayers(clim)) {
	# cat(i, ' ')
	# plot(clim[[i]], col = viridis(100))
	# title(main = names(clim)[i])
# }
# dev.off()

# don't include anthropogenic landcover states
clim <- dropLayer(clim, c('landcover7', 'landcover9', 'landcover12'))


PA <- readRDS('data/SDM/PA.rds')[[1]]
PAbin <- readRDS('data/SDM/PA.rds')[[2]]

# drop any records that fall outside of climate data
e <- extract(clim[[c(1,40)]], PA)
if (any(is.na(e))) {
	PAbin <- PAbin[which(!is.na(e))]
	PA <- PA[which(!is.na(e))]
}

# --------------------------------------------------------
# Variable reduction
## Step 1: reduce number of variables according to correlations and VIF
## This function will identify each pair of predictors with a correlation coefficient of 0.8 or greater, and drop the one that contributes the most VIF
varcor <- vifcor(clim, th = 0.8, maxobservations = 10000)
vifVar <- as.character(varcor@results$Variables)

varPrint(vifVar)

# # ---------- VIFs of the remained variables --------
               # Variables       VIF
# 1      CHELSA_bioclim_02 22.820716
# 2      CHELSA_bioclim_03 29.592327
# 3      CHELSA_bioclim_07 13.309928
# 4      CHELSA_bioclim_14  9.067582
# 5      CHELSA_bioclim_15  3.719125
# 6      CHELSA_bioclim_18  5.361470
# 7      CHELSA_bioclim_19  7.610226
# 8  climaticMoistureIndex  8.453469
# 9              embergerQ  5.656445
# 10        minTempWarmest  5.052112
# 11      PETDriestQuarter  6.141286
# 12        PETseasonality 10.369186
# 13     PETWettestQuarter  4.193647
# 14            landcover1  1.970363
# 15            landcover2  2.151615
# 16            landcover3  1.283686
# 17            landcover4  1.412077
# 18            landcover5  2.107784
# 19            landcover6  1.996113
# 20            landcover8  1.077280
# 21           landcover10  1.362634
# 22           landcover11  1.919582

vifVar <- c("CHELSA_bioclim_02", "CHELSA_bioclim_03", "CHELSA_bioclim_07", "CHELSA_bioclim_14", "CHELSA_bioclim_15", "CHELSA_bioclim_18", "CHELSA_bioclim_19", "climaticMoistureIndex", "embergerQ", "minTempWarmest", "PETDriestQuarter", "PETseasonality", "PETWettestQuarter", paste0("landcover", c(1,2,3,4,5,6,8,10,11)))

	
	
##########################################################
	# Generate blocks for cross-validation
	PAdf <- cbind.data.frame(bin = PAbin, coordinates(PA))
	PAdf <- sf::st_as_sf(PAdf, coords = c("Longitude", "Latitude"), crs = crs(PA))
	
	co1 <- ncf::correlog(allOcc[,1], allOcc[,2], raster::extract(clim[[1]], allOcc),increment = 1, resamp = 0, latlon = T, na.rm=TRUE)
	zeroSA <- co1$x.intercept # this is in km
	
	blocks <- spatialBlock(PAdf, rasterLayer = clim[[1]], theRange = zeroSA*1000, k=5, biomod2Format=F, showBlocks=FALSE)

	# if eval blocks from blockCV package, reorganize to structure used by ENMeval::get.block
	if (inherits(blocks, 'SpatialBlock')) {
		folds <- list(occ.grp = numeric(length(which(PAbin == 1))), bg.grp = numeric(length(which(PAbin == 0))))
		for (i in 1:max(blocks$foldID)) {
			blockPres <- intersect(blocks$folds[[i]][[2]], which(PAbin == 1))
			blockBG <- intersect(blocks$folds[[i]][[2]], which(PAbin == 0)) - length(which(PAbin == 1))
			folds[[1]][blockPres] <- i
			folds[[2]][blockBG] <- i
		}
	} else {
		folds <- blocks
	}
	
	nfolds <- max(folds[[1]])
	

# ----------------------------------------------------------

# combine and save

swd <- as.data.frame(extract(clim, PA))

foldVec <- c(folds[[1]], folds[[2]])

PAdat <- cbind.data.frame(folds = foldVec, resp = PAbin, swd)
dim(PAdat)

table(PAdat[,1], useNA='always')
table(PAdat[,2], useNA='always')

saveRDS(PAdat, 'data/SDM/PAdat_landcover.rds')




