# Prepare occurrence data

# apply the following filters:
# 	- intersect with Steller's Jay range, buffered by 1 degree
# 	- if occurrences include coordinateUncertaintyInMeters, drop those records with >= 5km uncertainty (records with no reported uncertainty are kept)
# 	- keep 1 occurrence per 2km grid cell, preferentially keeping preserved specimens over observational records.
# 	- apply proximity filter of 10 km to reduce occurrence clumping and spatial autocorrelation.


# Input paths (relative to the working directory the script is run from).
# gbifFile: CSV of GBIF occurrence records, read below with data.table::fread.
gbifFile <- 'data/SDM/gbif_26Apr2019.csv'
# climGrid: path to a climate raster (annual PET under a chelsa/ directory, judging by the path).
# NOTE: this variable is overwritten further down with the loaded raster object,
# which then serves as the template grid for the one-record-per-cell filter and as the map backdrop.
climGrid <- 'data/SDM/env/chelsa/current/annualPET.tif'

library(sf)
library(raster)
library(spThin)
library(enmSdm)

# Load the climate raster from its path and coarsen it by a factor of 2 along
# both axes in a single step. The coarsened grid is the template used below to
# keep one occurrence per cell (the "2km grid cell" filter in the header;
# presumably the native resolution is ~1 km -- TODO confirm).
climGrid <- raster::aggregate(raster(climGrid), fact = c(2, 2))

# read in species range
#
# The simplification tolerance (0.1) and the buffer distance (1) below are meant
# in degrees (header: "buffered by 1 degree"). With sf >= 1.0 the s2 spherical
# geometry engine is on by default, and st_simplify()/st_buffer() then interpret
# these numbers as METERS for longitude/latitude data, which would reduce the
# buffer to 1 m. Switch s2 off where the switch exists (older sf versions do not
# have it and already behave as intended) so distances are in CRS units.
# This assumes the range shapefile is in longitude/latitude, as implied by the
# occurrence points being given the same CRS further down.
# s2 is left off for the rest of the session, so later sf calls are planar too.
if ('sf_use_s2' %in% getNamespaceExports('sf')) {
	sf::sf_use_s2(FALSE)
}

jayRange <- st_read('utility/Cyanocitta_stelleri_shp/Cyanocitta_stelleri.shp')
# simplify the outline (tolerance 0.1 degree) to keep buffering and intersection cheap
jayRange <- st_simplify(jayRange, dTolerance = 0.1)
# buffer the simplified range outward by 1 degree
jayRangeBuff <- st_buffer(jayRange, dist = 1)

# remove holes, so that points falling in interior gaps of the buffered range are retained
jayRangeBuff <- nngeo::st_remove_holes(jayRangeBuff)


# Read the GBIF records as a plain data.frame and discard any row that lacks
# a longitude or a latitude.
occ <- data.table::fread(gbifFile, data.table = FALSE)
hasCoords <- !is.na(occ$decimalLongitude) & !is.na(occ$decimalLatitude)
occ <- occ[hasCoords, ]

# intersect with range, buffered by 1 degree
# The points are assigned the CRS of the range polygons so both layers match.
occSp <- st_as_sf(
	occ,
	coords = c('decimalLongitude', 'decimalLatitude'),
	crs = st_crs(jayRange)
)

# st_intersects() lists, for every point, the polygons it falls in;
# a point is kept when that list is non-empty.
insideRange <- lengths(st_intersects(occSp, jayRangeBuff)) > 0
occ2 <- occ[insideRange, ]

# renumber rows 1..n after subsetting
rownames(occ2) <- NULL

# Coordinate-uncertainty filter: records reporting an uncertainty of 5000 m or
# more are dropped; records with no reported uncertainty (NA) are kept.
uncertainty <- occ2$coordinateUncertaintyInMeters
# tally of records at/above the threshold, below it, and with no value
table(uncertainty >= 5000, useNA = 'always')
occ2 <- occ2[is.na(uncertainty) | uncertainty < 5000, ]


# Give every record a sort priority so specimen-based records can be preferred
# over observational ones (e.g. eBird) when only one record per cell is kept:
#   1 = basisOfRecord contains 'SPECIMEN'
#   2 = anything else
# Note the match is on the substring 'SPECIMEN', so any basisOfRecord value
# containing it gets priority 1, not only preserved specimens.
table(occ2$basisOfRecord)
isSpecimen <- grepl('SPECIMEN', occ2$basisOfRecord)
occ2$priority <- 2 - as.numeric(isSpecimen)
table(occ2$priority)

# filter with grid
## Assign every record the number of the climGrid cell that contains it.
occ2$cellnum <- cellFromXY(climGrid, occ2[,c('decimalLongitude','decimalLatitude')])

# Keep one record per cell, preferring specimen-based records (priority 1)
# over observational ones (priority 2).
# This is done with one stable sort plus duplicated() rather than by splitting
# into one data.frame per cell and rbind-ing thousands of one-row frames, which
# gets very slow for large inputs and returns NULL when there are no records.
# Records with no cell number (NA) are dropped.
inGrid <- occ2[!is.na(occ2$cellnum), ]
# order() leaves ties in their original order, so within a cell the first
# record of the best priority class wins.
inGrid <- inGrid[order(inGrid$cellnum, inGrid$priority), ]
occ3 <- inGrid[!duplicated(inGrid$cellnum), ]
# label rows by cell number
rownames(occ3) <- as.character(occ3$cellnum)

nrow(occ3)

# generate map showing which cells are associated with specimens vs observations
# Plot from occ3 (one retained record per grid cell), which carries the
# 'priority' column. The raw 'occ' table has no such column, so selecting on
# occ$priority matches nothing and draws no points.
# red = observational record kept for the cell, blue = specimen-based record kept.
plot(climGrid)
points(occ3[which(occ3$priority == 2),c('decimalLongitude','decimalLatitude')], cex=0.1, col='red', pch = 3)
points(occ3[which(occ3$priority == 1),c('decimalLongitude','decimalLatitude')], cex=0.1, col='blue', pch = 3)



# apply proximity filter of 10km, using enmSdm::geoThin
# because there are so many points, we can't do this all at once. Therefore we will split the data into
# k spatial blocks (k-means clusters of the coordinates) and apply the proximity filter to each block separately.

# kmeans() starts from randomly chosen centers; fix the seed so that the blocks,
# and therefore the thinned occurrence set that gets saved, are reproducible.
set.seed(1)

k <- 20
clusters <- kmeans(occ3[,c('decimalLongitude','decimalLatitude')], centers=k)
# store the block membership with the records
occ3$clusters <- clusters$cluster

# visual check of the spatial blocks, one colour per block
plot(climGrid)
blockCols <- rainbow(k)
for (i in seq_len(k)) {
	points(occ3[which(occ3$clusters == i), c('decimalLongitude','decimalLatitude')], cex=0.1, col=blockCols[i])
}

# number of records per block
table(clusters$cluster)

# thin each block independently (minDist = 10000, i.e. the 10 km filter), then stack the survivors
thinned <- lapply(seq_len(k), function(i) {
	message('\tCluster ', i)
	geoThin(occ3[which(occ3$clusters == i),], minDist = 10000, longLat = c('decimalLongitude','decimalLatitude'), verbose=TRUE)
})

occ4 <- do.call(rbind, thinned)

# Second thinning pass. The first pass thinned each block in isolation, so
# points on either side of a block boundary may still be closer than 10 km.
# Re-partition the survivors into fewer, larger blocks and thin again.
# NOTE(review): thinning is still done per block, so close pairs straddling
# the new boundaries are not checked either.
xyCols <- c('decimalLongitude','decimalLatitude')
k <- 5
clusters <- kmeans(occ4[, xyCols], centers = k)
blockId <- clusters$cluster

# visual check of the new blocks, one colour per block
plot(climGrid)
blockCols <- rainbow(k)
for (b in seq_len(k)) {
	points(occ4[blockId == b, xyCols], cex = 0.1, col = blockCols[b])
}

# number of records per block
table(clusters$cluster)

# thin every block with the same 10 km minimum distance, then stack the results
thinned2 <- lapply(seq_len(k), function(b) {
	message('\tCluster ', b)
	geoThin(occ4[blockId == b, ], minDist = 10000, longLat = xyCols, verbose = FALSE)
})

occ5 <- do.call(rbind, thinned2)

# Write a before/after map to PNG: the climate grid as backdrop, the simplified
# (unbuffered) range outline, all coordinate-complete GBIF records in blue and
# the final thinned records in red on top.
xyCols <- c('decimalLongitude','decimalLatitude')
png(filename = 'output/thinnedJayOcc.png', width = 5, height = 5, units = 'in', res = 300)
plot(climGrid)
plot(st_geometry(jayRange), add = TRUE)
points(occ[, xyCols], pch = 3, cex = 0.1, col = 'blue')
points(occ5[, xyCols], pch = 3, cex = 0.1, col = 'red')
dev.off()

# Save the final thinned occurrence table for downstream use.
saveRDS(occ5, file = 'data/SDM/occ5.rds')

