####################################################################################################################

# Study Title: Turning lances into shields: flower mantids stretch their 
#                  raptorial forelegs to avert and deflect predator attack
#
# Script Name: GBIF Species Distribution Data Retrieval and Plotting
#
# Description: This script retrieves occurrence data for a specified taxonomic group from
#   GBIF, cleans the downloaded data using CoordinateCleaner, exports the clean 
#   data to an Excel file, and plots the distribution points on a world map.
#   Due to potential differences in data retrieval times and updates on the GBIF 
#   database, results may vary slightly between downloads. Please refer to the 
#   specific date of data retrieval when citing the dataset.
#
#This script requires a valid GBIF account. Please ensure you have registered 
#   and update the account information (username, password, and email) accordingly.
# Reference: Adapted from https://mp.weixin.qq.com/s/WVS-IMP7lquid0SkAdjLvw by Wang Ying.
####################################################################################################################

# Clear the workspace and free memory
rm(list = ls())
gc()

#########################
# 1. Load Required Packages
#########################
library(rgbif)             # For retrieving data from GBIF
library(readr)             # For reading data
library(openxlsx)          # For exporting Excel files
library(CoordinateCleaner) # For cleaning data and checking coordinate issues
library(ggplot2)           # For plotting
library(maps)              # Provides world map data
library(ggrepel)           # For adding sample labels

#########################
# 2. Retrieve Occurrence Data from GBIF
#########################

# Step 1: Query species keys based on taxonomic requirements
# Search for the family Hymenopodidae
a <- name_lookup(query = "Hymenopodidae", 
                 rank = "family",
                 limit = 99999)
# Extract species keys from the query result
key <- a[["data"]][["key"]]

# Step 2: Download distribution data using the retrieved keys
res <- occ_download(
  pred_in("taxonKey", key),      # Use the keys obtained above
  pred_in("basisOfRecord", c('PRESERVED_SPECIMEN',  # Preserved specimens
                             'HUMAN_OBSERVATION',     # Human observations
                             'OBSERVATION',           # Observation records
                             'LIVING_SPECIMEN',       # Living specimens
                             'MACHINE_OBSERVATION',   # Machine observations
                             'LITERATURE')),          # Literature records
  # Uncomment the following line to limit by country if needed
  # pred_in("country",  c("CN","TW")), 
  # Uncomment and adjust the following line to restrict the search area
  # pred_within(gbif_bbox2wkt(minx=-90, miny=-160, maxx=90, maxy=-180)),
  pred("hasCoordinate", TRUE),   # Only select records with coordinates
  pred("hasGeospatialIssue", FALSE), 
  # Uncomment the following line to limit records by year if needed
  # pred_gte("year", 1990),
  format = "SIMPLE_CSV",         # Output format as CSV
  user = "YOUR_USERNAME",        # Replace with your GBIF username
  pwd = "YOUR_PASSWORD",                  # Replace with your GBIF password or access code
  email = "YOUR_EMAIL"           # Replace with your email address
)

# Submit the download request and wait until the status becomes SUCCEEDED
occ_download_meta(res)

# Step 3: Download and import the data locally
# Use overwrite=T if you wish to overwrite an existing file
z <- occ_download_get(res)
df <- occ_download_import(z)

#########################
# 3. Data Processing and Cleaning
#########################

# Check the column names and see the coordinate columns
colnames(df)
df$decimalLongitude  
df$decimalLatitude   

# Extract key fields: taxon, coordinates, etc.
df1 <- df[, c('taxonKey', 'speciesKey', 'species', 'scientificName',
              'verbatimScientificName', 'decimalLongitude', 'decimalLatitude')]
head(df1)
dim(df1)

# Remove duplicate records
df2 <- unique(df1)
dim(df2)
head(df2)

# Step 4: Data Cleaning
# Filter out records with coordinates equal to zero
df3 <- subset(df2, decimalLongitude != 0 & decimalLatitude != 0)
dim(df3)

# Use CoordinateCleaner to check for problematic coordinates
# (e.g., near capitals, country centroids, in the ocean, or near institutions)
problem_records <- clean_coordinates(df3, lon = "decimalLongitude", lat = "decimalLatitude",
                                     species = "species",
                                     tests = c("capitals", "centroids", "equal", "institutions", "zeros", "seas"))

# View summary of problematic records
summary(problem_records)

# Exclude all records with issues from the dataset
data_clean <- df3[which(problem_records$.summary == "TRUE"), ]
str(data_clean)

# Save the cleaned data as an Excel file
write.xlsx(data_clean, "hymenopodidae_data_clean.xlsx")

#########################
# 4. Plotting Species Distribution
#########################

# Read the processed data file 
# Here the example file is named "hymenopodidae_Mantidae.csv", modify as needed.
# Pre-adjust table format to match reference file format
data <- read.delim(file = "hymenopodidae_Mantidae.csv", header = TRUE, sep = ",")
names(data)
# Extract columns for longitude, latitude, and family-level classification
site <- data[, c(1:3)]
names(site)
str(site)

# Plot a base world map
p <- ggplot() +
  geom_polygon(data = map_data('world'), aes(x = long, y = lat, group = group), fill = 'gray70') +
  theme_bw() +
  theme(panel.grid.minor = element_blank()) +
  scale_x_continuous(breaks = c(-180, -120, -60, 0, 60, 120, 180), expand = c(0, 0),
                     labels = c('180°', '120°W', '60°W', '0', '60°E', '120°E', '180°')) +
  scale_y_continuous(breaks = c(-60, -30, 0, 30, 60), expand = c(0, 0),
                     labels = c('60°S', '30°S', '0', '30°N', '60°N')) +
  labs(x = 'Longitude', y = 'Latitude', color = 'Family')
p

# Add occurrence points to the map
p1 <- p +
  geom_point(data = site, aes(x = Longitude, y = Latitude, color = family), size = 2, shape = 16) +
  scale_color_manual(values = c('#01B250', '#3CA6CC')) +  # Set custom colors
  scale_color_discrete(breaks = c("Mantidae", "Hymenopodidae")) +  # Adjust legend order
  theme(legend.position = c(0.1, 0.3),
        legend.background = element_blank(),
        legend.key = element_rect(fill = NA))
p1  # Display the map; adjust point styles as needed

####################################################################################################################
# End of Script
####################################################################################################################
