#Code used to analyse and plot data for Allen et al. (2020)
# doi: 10.1098/rspb.2020.1125


########## Palaeorotate points ##########
#Rotated using PALEOMAP global plate models as follows:
#Wuchiapingian & Changhsingian -> Map #50 "Late Permian(Lopingian, 255.7Ma)"
#Induan -> Map #49 "Permo-Triassic boundary (251Ma)"
#Olenekian -> Map #48 "Early Triassic (Induan & Olenekian, 248.5Ma)"
#Anisian -> Map #47 "Middle Triassic (Anisian, 241.5Ma)"
#Ladinian -> Map #46 "Middle Triassic (Ladinian, 232.9Ma)"

#setwd("#####")

#Load packages
library(tidyverse)
library(sp)
library(rgdal)
library(rgeos)
library(maptools)

###Part 1. Convert modern lat-longs to a shape file
#Read in csv file with the present-day lat/long coordinates
dat <- read.csv("data/tetrapod_database_final.csv", stringsAsFactors = FALSE)
dat <- data.frame(dat)

#Lopingian
#Create data frame with collection numbers, longs, lats
LopingianFilter <- filter(dat, stage_assignment == "Wuchiapingian" | stage_assignment == "Changhsingian")
LopingianPoints <- LopingianFilter[c("collection_no", "lng", "lat")]

#Filter to unique points (one per collection)
LopingianPoints <- distinct(LopingianPoints, collection_no, .keep_all = T)
Lopingian_s2xy <- LopingianPoints[c("lng", "lat")]

#Label as coordinates
coordinates(Lopingian_s2xy) <- ~ lng + lat

#Convert to 'Spatial Points Data Frame'
Lopingian_shp <- SpatialPointsDataFrame(Lopingian_s2xy, LopingianPoints, proj4string = CRS("+proj=longlat +datum=WGS84"))

#Write shape file
writeOGR(Lopingian_shp, "data/Modern shapefiles", "Lopingian", driver="ESRI Shapefile")


#Induan
#Create data frame with collection numbers, longs, lats
InduanFilter <- filter(dat, stage_assignment == "Induan")
InduanPoints <- InduanFilter[c("collection_no", "lng", "lat")]

#Filter to unique points (one per collection)
InduanPoints <- distinct(InduanPoints, collection_no, .keep_all = T)
Induan_s2xy <- InduanPoints[c("lng", "lat")]

#Label as coordinates
coordinates(Induan_s2xy) <- ~ lng + lat

#Convert to 'Spatial Points Data Frame'
Induan_shp <- SpatialPointsDataFrame(Induan_s2xy, InduanPoints, proj4string = CRS("+proj=longlat +datum=WGS84"))

#Write shape file
writeOGR(Induan_shp, "data/Modern shapefiles", "Induan", driver="ESRI Shapefile")


#Olenekian
#Create data frame with collection numbers, longs, lats
OlenekianFilter <- filter(dat, stage_assignment == "Olenekian")
OlenekianPoints <- OlenekianFilter[c("collection_no", "lng", "lat")]

#Filter to unique points (one per collection)
OlenekianPoints <- distinct(OlenekianPoints, collection_no, .keep_all = T)
Olenekian_s2xy <- OlenekianPoints[c("lng", "lat")]

#Label as coordinates
coordinates(Olenekian_s2xy) <- ~ lng + lat

#Convert to 'Spatial Points Data Frame'
Olenekian_shp <- SpatialPointsDataFrame(Olenekian_s2xy, OlenekianPoints, proj4string = CRS("+proj=longlat +datum=WGS84"))

#Write shape file
writeOGR(Olenekian_shp, "data/Modern shapefiles", "Olenekian", driver="ESRI Shapefile")


#Anisian
#Create data frame with collection numbers, longs, lats
AnisianFilter <- filter(dat, stage_assignment == "Anisian")
AnisianPoints <- AnisianFilter[c("collection_no", "lng", "lat")]

#Filter to unique points (one per collection)
AnisianPoints <- distinct(AnisianPoints, collection_no, .keep_all = T)
Anisian_s2xy <- AnisianPoints[c("lng", "lat")]

#Label as coordinates
coordinates(Anisian_s2xy) <- ~ lng + lat

#Convert to 'Spatial Points Data Frame'
Anisian_shp <- SpatialPointsDataFrame(Anisian_s2xy, AnisianPoints, proj4string = CRS("+proj=longlat +datum=WGS84"))

#Write shape file
writeOGR(Anisian_shp, "data/Modern shapefiles", "Anisian", driver="ESRI Shapefile")


#Ladinian
#Create data frame with collection numbers, longs, lats
LadinianFilter <- filter(dat, stage_assignment == "Ladinian")
LadinianPoints <- LadinianFilter[c("collection_no", "lng", "lat")]

#Filter to unique points (one per collection)
LadinianPoints <- distinct(LadinianPoints, collection_no, .keep_all = T)
Ladinian_s2xy <- LadinianPoints[c("lng", "lat")]

#Label as coordinates
coordinates(Ladinian_s2xy) <- ~ lng + lat

#Convert to 'Spatial Points Data Frame'
Ladinian_shp <- SpatialPointsDataFrame(Ladinian_s2xy, LadinianPoints, proj4string = CRS("+proj=longlat +datum=WGS84"))

#Write shape file
writeOGR(Ladinian_shp, "data/Modern shapefiles", "Ladinian", driver="ESRI Shapefile")

### Rotate the points in GPlates ###

###Part 2. Convert shape file back to palaeo lat-longs
#Read shape file, convert to data frame, write as csv
dat <- read.csv("data/tetrapod_database_final.csv", stringsAsFactors = FALSE)
dat <- data.frame(dat)

#Lopingian
LopingianFilter <- filter(dat, stage_assignment == "Wuchiapingian" | stage_assignment == "Changhsingian")
LopingianPoints <- LopingianFilter[c("collection_no", "lng", "lat")]

#Filter to unique points (one per collection)
LopingianPoints <- distinct(LopingianPoints, collection_no, .keep_all = T)

Lopingian_shp <- readOGR("data/Lopingian/reconstructed_255.70Ma.shp")
Lopingian_dat <- data.frame(Lopingian_shp)
Lopingian_s2xy <- Lopingian_dat[c("coords.x1", "coords.x2")]
Lopingian_s2xy$collection_no <- LopingianPoints$collection_no


#Induan
InduanFilter <- filter(dat, stage_assignment == "Induan")
InduanPoints <- InduanFilter[c("collection_no", "lng", "lat")]

#Filter to unique points (one per collection)
InduanPoints <- distinct(InduanPoints, collection_no, .keep_all = T)

Induan_shp <- readOGR("data/Induan/reconstructed_251.00Ma.shp")
Induan_dat <- data.frame(Induan_shp)
Induan_s2xy <- Induan_dat[c("coords.x1", "coords.x2")]
Induan_s2xy$collection_no <- InduanPoints$collection_no


#Olenekian
OlenekianFilter <- filter(dat, stage_assignment == "Olenekian")
OlenekianPoints <- OlenekianFilter[c("collection_no", "lng", "lat")]

#Filter to unique points (one per collection)
OlenekianPoints <- distinct(OlenekianPoints, collection_no, .keep_all = T)

Olenekian_shp <- readOGR("data/Olenekian/reconstructed_248.50Ma.shp")
Olenekian_dat <- data.frame(Olenekian_shp)
Olenekian_s2xy <- Olenekian_dat[c("coords.x1", "coords.x2")]
Olenekian_s2xy$collection_no <- OlenekianPoints$collection_no


#Anisian
AnisianFilter <- filter(dat, stage_assignment == "Anisian")
AnisianPoints <- AnisianFilter[c("collection_no", "lng", "lat")]

#Filter to unique points (one per collection)
AnisianPoints <- distinct(AnisianPoints, collection_no, .keep_all = T)

Anisian_shp <- readOGR("data/Anisian/reconstructed_241.50Ma.shp")
Anisian_dat <- data.frame(Anisian_shp)
Anisian_s2xy <- Anisian_dat[c("coords.x1", "coords.x2")]
Anisian_s2xy$collection_no <- AnisianPoints$collection_no


#Ladinian
LadinianFilter <- filter(dat, stage_assignment == "Ladinian")
LadinianPoints <- LadinianFilter[c("collection_no", "lng", "lat")]

#Filter to unique points (one per collection)
LadinianPoints <- distinct(LadinianPoints, collection_no, .keep_all = T)

Ladinian_shp <- readOGR("data/Ladinian/reconstructed_232.90Ma.shp")
Ladinian_dat <- data.frame(Ladinian_shp)
Ladinian_s2xy <- Ladinian_dat[c("coords.x1", "coords.x2")]
Ladinian_s2xy$collection_no <- LadinianPoints$collection_no


#Compile
AllPoints <- rbind(Lopingian_s2xy, Induan_s2xy, Olenekian_s2xy, Anisian_s2xy, Ladinian_s2xy)

#Write a csv of points
#write.csv(AllPoints, "data/rotated_points.csv")

#Match points to collection numbers
new_db <- left_join(dat, AllPoints, by = "collection_no")

#Write a new csv with the paleolat & long added
write_csv(new_db, path = "data/tetrapod_database_final.csv")


########## Plot occurrences onto maps (Figure 1) ##########

#setwd("#####")

#Load packages
library(tidyverse)
library(jpeg)

#Create a vector giving the chronological order of stages
stages <- c("Wuchiapingian", "Changhsingian", "Induan", "Olenekian", "Anisian", "Ladinian",
            "Carnian")

#Create a vector of stage midpoints
midpoints <- c(256.6, 253, 251.6, 249.2, 244.6, 239.5, 232)

#Create a vector giving the chronological order of substages
substages <- c("Wuchiapingian", "Changhsingian", "Griesbachian", "Dienerian", "Smithian", "Spathian",
               "Aegean-Bithynian", "Pelsonian-Illyrian", "Fassanian", "Longobardian", "Julian",
               "Tuvalian")

#Create a vector of substage midpoints
submidpoints <- c(256.6, 253, 251.7, 251.4, 250.2, 248.2, 245.9, 243.3, 240.8, 238.3, 234.5, 229.5)

#Read in dataset
tetrapods <- read_csv("data/tetrapod_database_final.csv")
glimpse(tetrapods)

#Wrangling
#Check no incorrect spelling
#unique(tetrapods$stage_assignment)
#unique(tetrapods$substage_assignment)

#Remove synonymy repeats (combinations of the same collection no. AND accepted name)
tetrapods <- distinct(tetrapods, accepted_name, collection_no, .keep_all = T)


###Plot using Scotese palaeo-rasters, LP/ET/MT###
LopingianMap <- readJPEG("data/LP trace_shade.jpg")
LopingianPoints <- filter(tetrapods, stage_assignment == "Wuchiapingian"| stage_assignment == "Changhsingian")
ggplot(LopingianPoints, aes(x = Longitude, y = Latitude, group = pres_mode, colour = pres_mode)) + xlim(c(-180, 180)) + ylim(c(-90, 90)) +
  annotation_custom(grid::rasterGrob(LopingianMap, width=unit(0.9, "npc"), height=unit(0.9, "npc"), just = "centre")) +
  geom_segment(aes(x = -180, xend = 180, y = 0, yend = 0), linetype = "solid", colour = "black") +
  geom_point(size = 3) + scale_colour_manual(values = c("limegreen", "orange")) +
  guides(fill = F, colour = F) + theme_classic()

ETMap <- readJPEG("data/ET trace_shade.jpg")
ETPoints <- filter(tetrapods, stage_assignment == "Induan" | stage_assignment == "Olenekian")
ggplot(ETPoints, aes(x = Longitude, y = Latitude, group = interaction(pres_mode, taxon_environment), colour = interaction(pres_mode, taxon_environment))) + xlim(c(-180, 180)) + ylim(c(-90, 90)) +
  annotation_custom(grid::rasterGrob(ETMap, width=unit(0.9, "npc"), height=unit(0.9, "npc"), just = "centre")) +
  geom_segment(aes(x = -180, xend = 180, y = 0, yend = 0), linetype = "solid", colour = "black") +
  geom_point(size = 3) + scale_colour_manual(values = c("blue", "limegreen", "orange")) +
  guides(fill = F, colour = F) + theme_classic()

MTMap <- readJPEG("data/MT trace_shade.jpg")
MTPoints <- filter(tetrapods, stage_assignment == "Anisian" | stage_assignment == "Ladinian")
ggplot(MTPoints, aes(x = Longitude, y = Latitude, group = interaction(pres_mode, taxon_environment), colour = interaction(pres_mode, taxon_environment))) + xlim(c(-180, 180)) + ylim(c(-90, 90)) +
  annotation_custom(grid::rasterGrob(MTMap, width=unit(0.9, "npc"), height=unit(0.9, "npc"), just = "centre")) +
  geom_segment(aes(x = -180, xend = 180, y = 0, yend = 0), linetype = "solid", colour = "black") +
  geom_point(size = 3) + scale_colour_manual(values = c("blue", "limegreen", "orange")) + 
  guides(fill = F, colour = F) + theme_classic()

  
########## Raw diversity by latitude bin (Figure 1) ##########

#setwd("#####")

#Load packages
library(tidyverse)

#Create a vector giving the chronological order of stages
stages <- c("Wuchiapingian", "Changhsingian", "Induan", "Olenekian", "Anisian", "Ladinian",
            "Carnian")

#Create a vector of stage midpoints
midpoints <- c(256.6, 253, 251.6, 249.2, 244.6, 239.5, 232)


#Read in dataset
tetrapods <- read_csv("data/tetrapod_database_final.csv")
glimpse(tetrapods)

#Remember that there are only marine occurrences from the Olenekian onwards

#Wrangling
#Check no incorrect spelling
#unique(tetrapods$stage_assignment)
#unique(tetrapods$substage_assignment)

#Remove synonymy repeats (combinations of the same collection no. AND accepted name)
tetrapods <- distinct(tetrapods, accepted_name, collection_no, .keep_all = T)

#Filter to occurrences with stage assignment
tetrapods <- tetrapods %>% filter(!is.na(stage_assignment))


###Latitude line graphs of richness###
#Add column which indicates latitude bin
bins <- seq(from = -90, to = 90, by = 20)
labels <- seq(from = -80, to = 80, by = 20)
tetrapods_lat <- mutate(tetrapods, paleolat_code = cut(Latitude, breaks = bins, labels = labels))

#Choose stage(s) for filter
tetrapods_lat_stage <- tetrapods_lat %>% filter(stage_assignment == "Wuchiapingian")
#tetrapods_lat_stage <- tetrapods_lat %>% filter(stage_assignment == "Wuchiapingian" | stage_assignment == "Changhsingian")

#Filter unique taxa
#Create an empty dataset with PBDB column names to collect unique occurrences in
unique_by_bin <- tetrapods_lat_stage[FALSE,]

#Loop through each palaeolatitude bin
#For that bin, retain species occurrences, then retain unique taxa at gradually
#   higher taxonomic levels which are not already represented in that bin
for (i in 1:(length(labels))) {
  print(i)
  one_bin <- tetrapods_lat_stage %>% filter(paleolat_code == labels[i])
  for (j in 1:(nrow(one_bin))) {
    if (nrow(one_bin) != 0)
      if (one_bin$accepted_rank[j] == "species") unique_by_bin <- rbind(unique_by_bin, one_bin[j,]) else
        if (!is.na(one_bin$genus[j]))
         (if (one_bin$genus[j] %in% one_bin$genus[-j] == F) unique_by_bin <- rbind(unique_by_bin, one_bin[j,])) else
            if (!is.na(one_bin$family[j]))
              (if (one_bin$family[j] %in% one_bin$family[-j] == F) unique_by_bin <- rbind(unique_by_bin, one_bin[j,])) else
                if (!is.na(one_bin$order[j]))
                  (if (one_bin$order[j] %in% one_bin$order[-j] == F) unique_by_bin <- rbind(unique_by_bin, one_bin[j,]))
  }
}

#Remove repeats of species names, to get one occurrence per unique species per bin
unique_by_bin <- distinct(unique_by_bin, accepted_name, paleolat_code, .keep_all = T)

#Subset data as trace vs. marine vs. terrestrial
traces <- unique_by_bin %>% filter(pres_mode == "trace") %>% count(paleolat_code)
marine <- unique_by_bin %>% filter(pres_mode == "body") %>%
  filter(taxon_environment == "marine") %>% count(paleolat_code)
terrestrial <- unique_by_bin %>% filter(pres_mode == "body") %>%
  filter(taxon_environment == "terrestrial") %>% count(paleolat_code)

#Fill in gaps
for(i in 1:length(labels)) {
if((labels[i] %in% traces$paleolat_code) == FALSE) traces <- rbind(traces, c(labels[i], 0))
}
for(i in 1:length(labels)) {
  if((labels[i] %in% marine$paleolat_code) == FALSE) marine <- rbind(marine, c(labels[i], 0))
}
for(i in 1:length(labels)) {
  if((labels[i] %in% terrestrial$paleolat_code) == FALSE) terrestrial <- rbind(terrestrial, c(labels[i], 0))
}

#Reorder for ease of checking
traces <- arrange(traces, desc(paleolat_code))
marine <- arrange(marine, desc(paleolat_code))
terrestrial <- arrange(terrestrial, desc(paleolat_code))

#May need to rename columns to enable merge
#colnames(traces) <- c("paleolat_code","n")
#colnames(marine) <- c("paleolat_code","n")
#colnames(terrestrial) <- c("paleolat_code","n")

#Combine into single data frame for plotting
traces$type <- "Number of footprint occurrences"
marine$type <- "Number of marine occurrences"
terrestrial$type <- "Number of terrestrial occurrences"
lat_counts <- rbind(traces, marine, terrestrial)

#Convert to numbers for plotting
#terrestrial$paleolat_code <- as.numeric(as.character(terrestrial$paleolat_code))
lat_counts$paleolat_code <- as.numeric(as.character(lat_counts$paleolat_code))

#Plot multiple fossil types
ggplot(lat_counts, aes(x = paleolat_code, y = n, group = type, colour = type)) +
  geom_line(size = 1) + geom_point(size = 2) +
  labs(x = "Palaeolatitude", y = "Raw species richness count") + coord_flip() +
  scale_x_continuous(limits = c(-90, 90)) + geom_vline(aes(xintercept = 0), colour = "black") + 
  scale_colour_manual(values = c("orange", "blue", "limegreen")) +
  theme_classic()

#Plot one fossil type
ggplot(terrestrial, aes(x = paleolat_code, y = n, group = 1)) +
  geom_line(size = 1, colour = "limegreen") + geom_point(size = 2, colour = "limegreen") +
  labs(x = "Palaeolatitude", y = "Raw species richness count") + coord_flip() +
  scale_x_continuous(limits = c(-90, 90)) + geom_vline(aes(xintercept = 0), colour = "black") + 
  theme_classic()
  

########## Squares diversity by latitude graphs (Figure 1) ##########

#setwd("#####")

#Load packages
library(tidyverse)

#Read in dataset
tetrapods <- read_csv("data/tetrapod_database_final.csv")
glimpse(tetrapods)

#Remove synonymy repeats (combinations of the same collection no. AND accepted name)
tetrapods <- distinct(tetrapods, accepted_name, collection_no, .keep_all = T)

#Trim data without substage assignment or species-level identification
tetrapods_trim <- tetrapods %>% filter(!is.na(stage_assignment))

#Add column which indicates latitude bin
bins <- seq(from = -90, to = 90, by = 20)
labels <- seq(from = -80, to = 80, by = 20)
tetrapods_trim <- mutate(tetrapods_trim, paleolat_code = cut(Latitude, breaks = bins, labels = labels))


#Filter data to body fossils, choose environment/time interval
stage_tetrapods <- tetrapods_trim %>% filter(pres_mode == "body") %>% 
  filter(stage_assignment == "Olenekian")

#Filter unique taxa
#Create an empty dataset with PBDB column names to collect unique occurrences in
unique_by_bin <- stage_tetrapods[FALSE,]

#Loop through each palaeolatitude bin
#For that bin, retain species occurrences, then retain unique taxa at gradually
#   higher taxonomic levels which are not already represented in that bin
for (i in 1:(length(labels))) {
  print(i)
  one_bin <- stage_tetrapods %>% filter(paleolat_code == labels[i])
  for (j in 1:(nrow(one_bin))) {
    if (nrow(one_bin) != 0)
      if (one_bin$accepted_rank[j] == "species") unique_by_bin <- rbind(unique_by_bin, one_bin[j,]) else
        if (!is.na(one_bin$genus[j]))
          (if (one_bin$genus[j] %in% one_bin$genus[-j] == F) unique_by_bin <- rbind(unique_by_bin, one_bin[j,])) else
            if (!is.na(one_bin$family[j]))
              (if (one_bin$family[j] %in% one_bin$family[-j] == F) unique_by_bin <- rbind(unique_by_bin, one_bin[j,])) else
                if (!is.na(one_bin$order[j]))
                  (if (one_bin$order[j] %in% one_bin$order[-j] == F) unique_by_bin <- rbind(unique_by_bin, one_bin[j,]))
  }
}

##Terrestrial##
#Filter terrestrial fossils
terrestrial <- filter(unique_by_bin, taxon_environment == "terrestrial")

#Generate list of frequencies by stage
#Achieved by using and trimming the 'count' function in dplyr, across a loop of stage names
terres_lat_freq <- list()

for (i in 1:length(labels)) {
  t_list <- terrestrial %>% filter(paleolat_code == labels[i]) %>%
    count(., accepted_name) %>% arrange(desc(n)) %>% select(n)
  t_list <- unlist(t_list, use.names = F)
  terres_lat_freq[[i]] <- t_list
}
names(terres_lat_freq) <- labels
glimpse(terres_lat_freq)

#Estimate diversity using squares method (Alroy, 2018)
#Create an empty vector to store the diversity estimates
t_squares_list <- vector("numeric", length = 0)

#Loop through bins, estimating squares and appending to vector
for(i in 1:length(terres_lat_freq)) {
  t_freq_list <- terres_lat_freq[[i]]
  if(is.na(t_freq_list[1])){t_freq_list <- 0} #convert empty bin NA to 0
  if(t_freq_list[1] == 0){t_squares <- 0} else {
    t_sp_count <- length(t_freq_list)
    t_sing_count <- sum(t_freq_list == 1)
    t_ind_count <- sum(t_freq_list)
    t_sum_nsq <- sum(t_freq_list^2)
    t_squares <- t_sp_count + (((t_sing_count^2)*t_sum_nsq)/((t_ind_count^2) - (t_sing_count*t_sp_count)))
    if(t_squares == Inf){t_squares <- length(t_freq_list)} #too many singletons produces infinite estimate, corrects to observed abundance
  }
  t_squares_list <- append(t_squares_list, t_squares)
}

#Add bin labels to squares estimates
t_to_plot <- data.frame(labels, squares = t_squares_list)

##Marine##
#Filter marine fossils
marine <- filter(unique_by_bin, taxon_environment == "marine")

#Generate list of frequencies by stage
#Achieved by using and trimming the 'count' function in dplyr, across a loop of stage names
marine_lat_freq <- list()

for (i in 1:length(labels)) {
  m_list <- marine %>% filter(paleolat_code == labels[i]) %>%
    count(., accepted_name) %>% arrange(desc(n)) %>% select(n)
  m_list <- unlist(m_list, use.names = F)
  marine_lat_freq[[i]] <- m_list
}
names(marine_lat_freq) <- labels
glimpse(marine_lat_freq)

#Estimate diversity using squares method (Alroy, 2018)
m_squares_list <- vector("numeric", length = 0)

for(i in 1:length(marine_lat_freq)) {
  m_freq_list <- marine_lat_freq[[i]]
  if(is.na(m_freq_list[1])){m_freq_list <- 0}
  if(m_freq_list[1] == 0){m_squares <- 0} else {
    m_sp_count <- length(m_freq_list)
    m_sing_count <- sum(m_freq_list == 1)
    m_ind_count <- sum(m_freq_list)
    m_sum_nsq <- sum(m_freq_list^2)
    m_squares <- m_sp_count + (((m_sing_count^2)*m_sum_nsq)/((m_ind_count^2) - (m_sing_count*m_sp_count)))
    if(m_squares == Inf){m_squares <- length(m_freq_list)}
  }
  m_squares_list <- append(m_squares_list, m_squares)
}

m_to_plot <- data.frame(labels, squares = m_squares_list)

#Combine terrestrial and marine estimate dataframes for plotting
t_to_plot$type <- "Terrestrial"
m_to_plot$type <- "Marine"
all_to_plot <- rbind(t_to_plot, m_to_plot)

#Plot
ggplot(all_to_plot, aes(x = labels, y = squares, group = type, colour = type)) +
  geom_line(size = 1.5) + geom_point(size = 2) + labs(x = "Palaeolatitude", y = "Subsampled diversity") + 
  coord_flip() + scale_x_continuous(limits = c(-90, 90)) +
  scale_colour_manual(values = c("blue", "limegreen")) +
  geom_vline(aes(xintercept = 0), colour = "black", size = 0.7) +
  theme_classic()

  
########## SQS diversity by latitude graphs (Figures 1 & 2) ##########

#setwd("#####")

#Load packages
library(tidyverse)
library(iNEXT)

#Read in dataset
tetrapods <- read_csv("data/tetrapod_database_final.csv")
glimpse(tetrapods)

#Remove synonymy repeats (combinations of the same collection no. AND accepted name)
tetrapods <- distinct(tetrapods, accepted_name, collection_no, .keep_all = T)

#Trim data without stage assignment
tetrapods_trim <- tetrapods %>% filter(!is.na(stage_assignment))

#Trim to body fossils
tetrapods_trim <- filter(tetrapods_trim, pres_mode == "body")

#Add column which indicates latitude bin
bins <- seq(from = -90, to = 90, by = 20)
labels <- seq(from = -80, to = 80, by = 20)
tetrapods_trim <- mutate(tetrapods_trim, paleolat_code = cut(Latitude, breaks = bins, labels = labels))

#Filter data, choose taxon environment and stage
stage_tetrapods <- tetrapods_trim %>% filter(stage_assignment == "Ladinian") %>%
  filter(taxon_environment == "marine")

#Filter unique taxa
#Create an empty dataset with PBDB column names to collect unique occurrences in
unique_by_bin <- stage_tetrapods[FALSE,]

#Loop through each palaeolatitude bin
#For that bin, retain species occurrences, then retain unique taxa at gradually
#   higher taxonomic levels which are not already represented in that bin
for (i in 1:(length(labels))) {
  print(i)
  one_bin <- stage_tetrapods %>% filter(paleolat_code == labels[i])
  for (j in 1:(nrow(one_bin))) {
    if (nrow(one_bin) != 0)
      if (one_bin$accepted_rank[j] == "species") unique_by_bin <- rbind(unique_by_bin, one_bin[j,]) else
        if (!is.na(one_bin$genus[j]))
          (if (one_bin$genus[j] %in% one_bin$genus[-j] == F) unique_by_bin <- rbind(unique_by_bin, one_bin[j,])) else
            if (!is.na(one_bin$family[j]))
              (if (one_bin$family[j] %in% one_bin$family[-j] == F) unique_by_bin <- rbind(unique_by_bin, one_bin[j,])) else
                if (!is.na(one_bin$order[j]))
                  (if (one_bin$order[j] %in% one_bin$order[-j] == F) unique_by_bin <- rbind(unique_by_bin, one_bin[j,]))
  }
}

#Generate a table of sample sizes by latitude bin, ordered numerically, and remove NAs
stage_totals <- count(unique_by_bin, paleolat_code)
for(i in 1:length(labels)) {
  if((labels[i] %in% stage_totals$paleolat_code) == FALSE) stage_totals <- rbind(stage_totals, c(labels[i], 1))
}

#Reorder for ease of checking
stage_totals <- arrange(stage_totals, paleolat_code)
stage_totals_list <- pull(stage_totals, n)

#Generate list of frequencies by stage, starting with total value, as needed by iNEXT
#Achieved by using and trimming the 'count' function in dplyr, across a loop of stage names
lat_freq <- list()

for (i in 1:length(labels)) {
lat_list <- unique_by_bin %>% filter(paleolat_code == labels[i]) %>%
    count(., accepted_name) %>% arrange(desc(n)) %>% add_row(n = sum(.$n), .before = 1) %>%
    select(n)
lat_list <- unlist(lat_list, use.names = F)
    if(lat_list[1] < 3){lat_list <- NA} #if the list is shorter than 3 (2 species), make NA
lat_freq[[i]] <- lat_list
}
names(lat_freq) <- labels
glimpse(lat_freq)

#Filter out empty lists
lat_freq <- lat_freq[!is.na(lat_freq)]

###Code from Dunne et al. 2018###
#"1  Coverage-standardised diversity curve"

#Set quorum levels
quorum_levels <- round(seq(from = 0.3, to = 0.8, by = 0.1), 1)

#Estimate D using estimateD in iNEXT
estD_list <- list()

for(i in 1:length(quorum_levels)) {
  estD <- estimateD(lat_freq, datatype = "incidence_freq", base = "coverage", level = quorum_levels[i])
  estD <- filter(estD, order == 0)        #filter for species richness values
  estD$quorum_level <- quorum_levels[i]
  estD$reference_t <- stage_totals_list[(match(names(lat_freq), stage_totals$paleolat_code))]  #add sample sizes in extra column
  estD$midpoints <- names(lat_freq)
  estD_list[[i]] <- estD
}

#Bind the individual dataframes into one
estD_list <- bind_rows(estD_list)

#Remove values when t is more than three times the sample size
estD_list[which(estD_list$t >= 3 * estD_list$reference_t), c("qD", "qD.LCL", "qD.UCL")] <- rep(NA, 3) #no more than twice reference sample size

View(estD_list)

#Add empty bins as a plot point
for(i in 1:length(labels)){
  if((labels[i] %in% estD_list$site) == FALSE) estD_list <- rbind(estD_list, c(rep(NA, 5), 0, NA, NA, 0.4, NA, as.numeric(labels[i])))
}

#Make quorum level a factor
estD_list$midpoints <- as.numeric(as.character(estD_list$midpoints))
estD_list$quorum_level <- as.factor(estD_list$quorum_level)

#Used "chartreuse" and "dodgerblue" colour sets
colours <- c("dodgerblue1", "dodgerblue2", "dodgerblue3", "dodgerblue4")

#Plot quorum levels 0.4 - 0.7
ggplot(filter(estD_list, quorum_level %in% quorum_levels[2:5]), aes(x = midpoints, y = qD, ymin = qD.LCL, ymax = qD.UCL, group = quorum_level, colour = quorum_level)) +
  geom_line(size = 1.5) + geom_point(size = 2) + geom_linerange(size = 1) + labs(x = "Palaeolatitude", y = "Subsampled diversity", colour = "Quorum level") + 
  coord_flip() + scale_x_continuous(limits = c(-90, 90)) +
  geom_vline(aes(xintercept = 0), colour = "black", size = 0.7) +
  scale_colour_manual(values = colours) +
  theme_classic()
  
  
########## Wrangle collections and formations data (Figure S1) ##########

#setwd("#####")

#Load packages
library(tidyverse)

#Create a vector giving the chronological order of stages
stages <- c("Wuchiapingian", "Changhsingian", "Induan", "Olenekian", "Anisian", "Ladinian", "Carnian",
            "Norian")

#Create a vector of stage midpoints
midpoints <- c(256.6, 253, 251.6, 249.2, 244.6, 239.5, 232, 217.8)

#Create a vector giving the chronological order of substages
substages <- c("Wuchiapingian", "Changhsingian", "Griesbachian", "Dienerian", "Smithian", "Spathian",
               "Aegean-Bithynian", "Pelsonian-Illyrian", "Fassanian", "Longobardian", "Julian",
               "Tuvalian", "Lacian", "Alaunian", "Sevatian")

#Create a vector of substage midpoints
submidpoints <- c(256.6, 253, 251.7, 251.4, 250.2, 248.2, 245.9, 243.3, 240.8, 238.3, 234.5, 229.5, 221.3, 213.8, 210.3)

#Read in dataset
collections <- read_csv("data/P-T collections.csv")
View(collections)

#Retain only collections with substage dating
collsfilt <- collections %>% filter(is.na(late_interval)) %>% filter(early_interval %in% substages)

#Create table of collection numbers per substage
coll_counts <- count(collsfilt, early_interval)
coll_counts <- coll_counts[match(substages, coll_counts$early_interval),]
coll_counts$midpoints <- submidpoints
coll_counts <- drop_na(coll_counts, n)
write_csv(coll_counts, "data/P-T collection counts.csv")

#Create table of formation numbers per substage
form_counts <- collsfilt %>% filter(!is.na(formation)) %>% group_by(early_interval) %>%
  summarize(n = n_distinct(formation))
form_counts <- form_counts[match(substages, form_counts$early_interval),]
form_counts$midpoints <- submidpoints
form_counts <- drop_na(form_counts, n)
write_csv(form_counts, "data/P-T formation counts.csv")

########## Plot collections and formations data (Figure S1) ##########

#setwd("#####")

#Load packages
library(tidyverse)

#Read in dataset
tetrapods <- read_csv("data/tetrapod_database_ScotesePLL.csv")
glimpse(tetrapods)

#Wrangling
#Check no incorrect spelling
#unique(tetrapods$stage_assignment)
#unique(tetrapods$substage_assignment)

#Remove synonymy repeats (combinations of the same collection no. AND accepted name)
tetrapods <- distinct(tetrapods, accepted_name, collection_no, .keep_all = T)

#Remove occurrences without substage-level temporal resolution
tetrapods <- filter(tetrapods, !is.na(substage_assignment))

###Time line graph with collection and formation counts###
#Total number of tetrapod collections (filtered by unique collection numbers)
tp_collection_counts <- tetrapods %>% filter(!is.na(substage_assignment)) %>% filter(accepted_rank == "species") %>%
  distinct(collection_no, .keep_all = T) %>% count(substage_assignment)
tp_collection_counts <- tp_collection_counts[match(substages, tp_collection_counts$substage_assignment),]
tp_collection_counts$midpoints <- submidpoints
tp_collection_counts$n[is.na(tp_collection_counts$n)] <- 0

#Total number of formations (filtered by unique formation names)
tp_formation_counts <- tetrapods %>% filter(!is.na(substage_assignment)) %>% filter(accepted_rank == "species") %>%
  group_by(substage_assignment) %>% summarize(n = n_distinct(formation))
tp_formation_counts <- tp_formation_counts[match(substages, tp_formation_counts$substage_assignment),]
tp_formation_counts$midpoints <- submidpoints
tp_formation_counts$n[is.na(tp_formation_counts$n)] <- 0

#Read total P-T collections and formations summary tables
collection_counts <- read_csv("data/P-T collection counts.csv")
colnames(collection_counts)[1] <- "substage_assignment"
collection_counts <- collection_counts[match(substages, collection_counts$substage_assignment),]
formation_counts <- read_csv("data/P-T formation counts.csv")
colnames(formation_counts)[1] <- "substage_assignment"
formation_counts <- formation_counts[match(substages, formation_counts$substage_assignment),]

#Combine into a single data frame
tp_collection_counts$type <- "Tetrapod-bearing collections"
tp_formation_counts$type <- "Tetrapod-bearing formations"
collection_counts$type <- "Total collections"
formation_counts$type <- "Total formations"
data_counts <- rbind(tp_collection_counts, tp_formation_counts, collection_counts, formation_counts)

ggplot(data_counts, aes(x = midpoints, y = log(n), group = type, colour = type)) + geom_line(size = 2) +
  scale_x_reverse(limits = c(260, 227)) + scale_y_continuous (position = "right") +
  labs(x = "Ma", y = "log(Count)") +
  geom_vline(aes(xintercept = 259.8), colour = "grey") + #Start of Wuchiapingian
  geom_vline(aes(xintercept = 254.14), colour = "grey") + #Changhsingian
  geom_vline(aes(xintercept = 252.17), colour = "grey") + #Induan
  geom_vline(aes(xintercept = 252.17), linetype = "longdash", colour = "red") + #PT
  geom_vline(aes(xintercept = 251.2), colour = "grey") + #Olenekian
  geom_vline(aes(xintercept = 247.2), colour = "grey") + #Anisian
  geom_vline(aes(xintercept = 242), colour = "grey") + #Ladinian
  geom_vline(aes(xintercept = 237), colour = "grey") + #Carnian
  geom_vline(aes(xintercept = 227), colour = "grey") + #End of Carnian
  scale_colour_manual(values = c("cyan1", "red1", "cyan4", "red4")) + theme_classic() +
  theme(legend.title=element_blank())

  
########## Raw occurrences through time graph (Figure S1) ##########

#setwd("#####")

#Load packages
library(tidyverse)

#Create a vector giving the chronological order of stages
stages <- c("Wuchiapingian", "Changhsingian", "Induan", "Olenekian", "Anisian", "Ladinian",
            "Carnian")

#Create a vector of stage midpoints
midpoints <- c(256.6, 253, 251.6, 249.2, 244.6, 239.5, 232)

#Create a vector giving the chronological order of substages
substages <- c("Wuchiapingian", "Changhsingian", "Griesbachian", "Dienerian", "Smithian", "Spathian",
               "Aegean-Bithynian", "Pelsonian-Illyrian", "Fassanian", "Longobardian", "Julian",
               "Tuvalian")

#Create a vector of substage midpoints
submidpoints <- c(256.6, 253, 251.7, 251.4, 250.2, 248.2, 245.9, 243.3, 240.8, 238.3, 234.5, 229.5)

#Read in dataset
tetrapods <- read_csv("data/tetrapod_database_final.csv")
glimpse(tetrapods)

#Wrangling
#Check no incorrect spelling
#unique(tetrapods$stage_assignment)
#unique(tetrapods$substage_assignment)

#Remove synonymy repeats (combinations of the same collection no. AND accepted name)
tetrapods <- distinct(tetrapods, accepted_name, collection_no, .keep_all = T)

#Remove occurrences without substage-level temporal resolution
tetrapods <- filter(tetrapods, !is.na(substage_assignment))

###Time line graph comparing fossil types/ecology occurrences###
#Trace fossil species
trace_counts <- tetrapods %>% filter(pres_mode == "trace") %>%
  count(substage_assignment)
trace_counts <- trace_counts[match(substages, trace_counts$substage_assignment),]
trace_counts$midpoints <- submidpoints
trace_counts$n[is.na(trace_counts$n)] <- 0

#Terrestrial fossil species
terres_counts <- tetrapods %>% filter(pres_mode == "body") %>%
  filter(taxon_environment == "terrestrial") %>% count(substage_assignment)
terres_counts <- terres_counts[match(substages, terres_counts$substage_assignment),]
terres_counts$midpoints <- submidpoints
terres_counts$n[is.na(terres_counts$n)] <- 0

#Marine fossil species
marine_counts <- tetrapods %>% filter(pres_mode == "body") %>%
  filter(taxon_environment == "marine") %>% count(substage_assignment)
marine_counts <- marine_counts[match(substages, marine_counts$substage_assignment),]
marine_counts$midpoints <- submidpoints
marine_counts$n[is.na(marine_counts$n)] <- 0

#Combine into a single data frame
trace_counts$type <- "Number of ichnospecies"
terres_counts$type <- "Number of terrestrial species"
marine_counts$type <- "Number of marine species"
type_counts <- rbind(trace_counts, terres_counts, marine_counts)

#Plot with geom_line
ggplot(type_counts, aes(x = midpoints, y = n, group = type, colour = type)) +
  geom_line(size = 2) + scale_x_reverse(limits = c(260, 227)) + labs(x = "Ma", y = "Raw count") +
  geom_vline(aes(xintercept = 259.8), colour = "grey") + #Start of Wuchiapingian
  geom_vline(aes(xintercept = 254.14), colour = "grey") + #Changhsingian
  geom_vline(aes(xintercept = 252.17), colour = "grey") + #Induan
  geom_vline(aes(xintercept = 252.17), linetype = "longdash", colour = "red") + #PT
  geom_vline(aes(xintercept = 251.2), colour = "grey") + #Olenekian
  geom_vline(aes(xintercept = 247.2), colour = "grey") + #Anisian
  geom_vline(aes(xintercept = 242), colour = "grey") + #Ladinian
  geom_vline(aes(xintercept = 237), colour = "grey") + #Carnian
  geom_vline(aes(xintercept = 227), colour = "grey") + #End of Carnian
  scale_colour_manual(values = c("orange", "blue", "limegreen")) + theme_classic() +
  theme(legend.title=element_blank())


########## Squares diversity through time graph (Figure S1) ##########

#setwd("#####")

#Load packages
library(tidyverse)

#Create a vector giving the chronological order of substages
substages <- c("Wuchiapingian", "Changhsingian", "Griesbachian", "Smithian", "Spathian",
               "Aegean-Bithynian", "Pelsonian-Illyrian", "Fassanian", "Longobardian", "Julian",
               "Tuvalian")

#Create a vector of substage midpoints
submidpoints <- c(256.6, 253, 251.7, 250.2, 248.2, 245.9, 243.3, 240.8, 238.3, 234.5, 229.5)

#Add Dienerian if needed, 251.4

#Read in dataset
tetrapods <- read_csv("data/tetrapod_database_final.csv")
glimpse(tetrapods)

#Remove synonymy repeats (combinations of the same collection no. AND accepted name)
tetrapods <- distinct(tetrapods, accepted_name, collection_no, .keep_all = T)

#Trim data without substage assignment or species-level identification
tetrapods <- tetrapods %>% filter(!is.na(substage_assignment))

#Create an empty dataset with PBDB column names to collect unique occurrences in
unique_by_substage <- tetrapods[FALSE,]

#Loop through each substage
#For that substage, retain species occurrences, then retain unique taxa at gradually
#   higher taxonomic levels which are not already represented in that substage
for (i in 1:(length(substages))) {
  print(i)
  one_substage <- tetrapods %>% filter(substage_assignment == substages[i])
  for (j in 1:(nrow(one_substage))) {
    if (one_substage$accepted_rank[j] == "species") unique_by_substage <- rbind(unique_by_substage, one_substage[j,]) else
      if (!is.na(one_substage$genus[j]))
        (if (one_substage$genus[j] %in% one_substage$genus[-j] == F) unique_by_substage <- rbind(unique_by_substage, one_substage[j,])) else
          if (!is.na(one_substage$family[j]))
            (if (one_substage$family[j] %in% one_substage$family[-j] == F) unique_by_substage <- rbind(unique_by_substage, one_substage[j,])) else
              if (!is.na(one_substage$order[j]))
                (if (one_substage$order[j] %in% one_substage$order[-j] == F) unique_by_substage <- rbind(unique_by_substage, one_substage[j,]))
  }
}

###Squares - terrestrial###
#Filter terrestrial fossils
terrestrial <- unique_by_substage %>% filter(pres_mode == "body") %>%
  filter(taxon_environment == "terrestrial")

#Generate list of frequencies by substage
#Achieved by using and trimming the 'count' function in dplyr, across a loop of substage names
terres_substage_freq <- list()

for (k in 1:length(substages)) {
  t_list <- terrestrial %>% filter(substage_assignment == substages[k]) %>%
    count(., accepted_name) %>% arrange(desc(n)) %>% select(n)
  t_list <- unlist(t_list, use.names = F)
  terres_substage_freq[[k]] <- t_list
}
names(terres_substage_freq) <- substages
glimpse(terres_substage_freq)


#Estimate diversity using squares method (Alroy, 2018)
t_squares_list <- vector("numeric", length = 0)

for(i in 1:length(terres_substage_freq)) {
  t_freq_list <- terres_substage_freq[[i]]
  if(is.na(t_freq_list[1])){t_freq_list <- 0}
  if(t_freq_list[1] == 0){t_squares <- 0} else {
  t_sp_count <- length(t_freq_list)
  t_sing_count <- sum(t_freq_list == 1)
  t_ind_count <- sum(t_freq_list)
  t_sum_nsq <- sum(t_freq_list^2)
  t_squares <- t_sp_count + (((t_sing_count^2)*t_sum_nsq)/((t_ind_count^2) - (t_sing_count*t_sp_count)))
  if(t_squares == Inf){t_squares <- length(t_freq_list)}
  }
  t_squares_list <- append(t_squares_list, t_squares)
}

t_to_plot <- data.frame(substages, squares = t_squares_list, submidpoints)

###Squares - marine###
#Filter marine fossils
marine <- unique_by_substage %>% filter(pres_mode == "body") %>%
  filter(taxon_environment == "marine")

#Generate list of frequencies by substage
#Achieved by using and trimming the 'count' function in dplyr, across a loop of substage names
marine_substage_freq <- list()

for (i in 1:length(substages)) {
  m_list <- marine %>% filter(substage_assignment == substages[i]) %>%
    count(., accepted_name) %>% arrange(desc(n)) %>% select(n)
  m_list <- unlist(m_list, use.names = F)
  marine_substage_freq[[i]] <- m_list
}
names(marine_substage_freq) <- substages
glimpse(marine_substage_freq)


#Estimate diversity using squares method (Alroy, 2018)
m_squares_list <- vector("numeric", length = 0)

for(i in 1:length(marine_substage_freq)) {
  m_freq_list <- marine_substage_freq[[i]]
  if(is.na(m_freq_list[1])){m_freq_list <- 0}
  if(m_freq_list[1] == 0){m_squares <- 0} else {
  m_sp_count <- length(m_freq_list)
  m_sing_count <- sum(m_freq_list == 1)
  m_ind_count <- sum(m_freq_list)
  m_sum_nsq <- sum(m_freq_list^2)
  m_squares <- m_sp_count + (((m_sing_count^2)*m_sum_nsq)/((m_ind_count^2) - (m_sing_count*m_sp_count)))
  if(m_squares == Inf){m_squares <- length(m_freq_list)}
  }
  m_squares_list <- append(m_squares_list, m_squares)
}

m_to_plot <- data.frame(substages, squares = m_squares_list, submidpoints)

#Plot
t_to_plot$type <- "Terrestrial"
m_to_plot$type <- "Marine"
all_to_plot <- rbind(t_to_plot, m_to_plot)

ggplot(all_to_plot, aes(x = submidpoints, y = squares, group = type, colour = type)) +
  geom_line(size = 1) + geom_point() +
  scale_colour_manual(values = c("blue", "limegreen")) +
  scale_x_reverse(limits = c(260, 227)) + labs(x = "Ma", y = "Squares diversity") +
  geom_vline(aes(xintercept = 259.8), colour = "grey") + #Start of Wuchiapingian
  geom_vline(aes(xintercept = 254.14), colour = "grey") + #Changhsingian
  geom_vline(aes(xintercept = 252.17), colour = "grey") + #Induan
  geom_vline(aes(xintercept = 252.17), linetype = "longdash", colour = "red") + #PT
  geom_vline(aes(xintercept = 251.2), colour = "grey") + #Olenekian
  geom_vline(aes(xintercept = 247.2), colour = "grey") + #Anisian
  geom_vline(aes(xintercept = 242), colour = "grey") + #Ladinian
  geom_vline(aes(xintercept = 237), colour = "grey") + #Carnian
  geom_vline(aes(xintercept = 227), colour = "grey") + #End of Carnian
  theme_classic()

########## SQS diversity through time graph (Figure S1) ##########

#setwd("#####")

#Load packages
library(tidyverse)
library(iNEXT)

#Create a vector giving the chronological order of substages
substages <- c("Wuchiapingian", "Changhsingian", "Griesbachian", "Smithian", "Spathian",
               "Aegean-Bithynian", "Pelsonian-Illyrian", "Fassanian", "Longobardian", "Julian",
               "Tuvalian")

#Create a vector of substage midpoints
submidpoints <- c(256.6, 253, 251.7, 250.2, 248.2, 245.9, 243.3, 240.8, 238.3, 234.5, 229.5)

#Add Dienerian if needed, 251.4

#Read in dataset
tetrapods <- read_csv("data/tetrapod_database_final.csv")
glimpse(tetrapods)

#Remove synonymy repeats (combinations of the same collection no. AND accepted name)
tetrapods <- distinct(tetrapods, accepted_name, collection_no, .keep_all = T)

#Trim data without substage assignment or species-level identification
tetrapods <- tetrapods %>% filter(!is.na(substage_assignment))

#Create an empty dataset with PBDB column names to collect unique occurrences in
unique_by_substage <- tetrapods[FALSE,]

#Loop through each substage
#For that substage, retain species occurrences, then retain unique taxa at gradually
#   higher taxonomic levels which are not already represented in that substage
for (i in 1:(length(substages))) {
  print(i)
  one_substage <- tetrapods %>% filter(substage_assignment == substages[i])
  for (j in 1:(nrow(one_substage))) {
    if (one_substage$accepted_rank[j] == "species") unique_by_substage <- rbind(unique_by_substage, one_substage[j,]) else
      if (!is.na(one_substage$genus[j]))
        (if (one_substage$genus[j] %in% one_substage$genus[-j] == F) unique_by_substage <- rbind(unique_by_substage, one_substage[j,])) else
          if (!is.na(one_substage$family[j]))
            (if (one_substage$family[j] %in% one_substage$family[-j] == F) unique_by_substage <- rbind(unique_by_substage, one_substage[j,])) else
              if (!is.na(one_substage$order[j]))
                (if (one_substage$order[j] %in% one_substage$order[-j] == F) unique_by_substage <- rbind(unique_by_substage, one_substage[j,]))
  }
}

###SQS - terrestrial###
#Filter terrestrial fossils
terrestrial <- unique_by_substage %>% filter(pres_mode == "body") %>%
  filter(taxon_environment == "terrestrial")
#Generate a table of sample sizes by stage, ordered chronologically
terres_totals <- count(terrestrial, substage_assignment)
#terres_totals <- group_by(terrestrial, substage_assignment) %>% summarise(coll_count = n_distinct(collection_no))
terres_totals <- terres_totals[match(substages, terres_totals$substage_assignment),]

#Generate list of frequencies by substage, starting with total value, as needed by iNEXT
#Achieved by using and trimming the 'count' function in dplyr, across a loop of substage names
terres_substage_freq <- list()

for (i in 1:length(substages)) {
  t_list <- terrestrial %>% filter(substage_assignment == substages[i]) %>%
    count(., accepted_name) %>% arrange(desc(n)) %>% add_row(n = sum(.$n), .before = 1) %>%
    select(n)
  t_list <- unlist(t_list, use.names = F)
  if(t_list[1] < 3){t_list <- NA}
  terres_substage_freq[[i]] <- t_list
}
names(terres_substage_freq) <- substages
glimpse(terres_substage_freq)

#Filter out empty lists
terres_substage_freq <- terres_substage_freq[!is.na(terres_substage_freq)]

###Code from Dunne et al. 2018###
#"1  Coverage-standardised diversity curve"

#Set quorum levels
quorum_levels <- round(seq(from = 0.1, to = 0.9, by = 0.1), 1)

#Estimate D using estimateD in iNEXT
t.estD_list <- list()

for(i in 1:length(quorum_levels)) {
  t.estD <- estimateD(terres_substage_freq, datatype = "incidence_freq", base = "coverage", level = quorum_levels[i])
  t.estD <- filter(t.estD, order == 0)        #filter for species richness values
  t.estD$quorum_level <- quorum_levels[i]
  t.estD$reference_t <- terres_totals$n            #add sample sizes in extra column
  t.estD$midpoints <- submidpoints[match(names(terres_substage_freq), substages)]
  t.estD_list[[i]] <- t.estD
}

#Bind the individual dataframes into one
t.estD_list <- bind_rows(t.estD_list)

#Remove values when t is more than three times the sample size
t.estD_list[which(t.estD_list$t >= 3 * t.estD_list$reference_t), c("qD", "qD.LCL", "qD.UCL")] <- rep(NA, 3)

View(t.estD_list)

#Make quorum level a factor
t.estD_list$quorum_level <- as.factor(t.estD_list$quorum_level)

#Plot quorum levels 0.4 - 0.7
ggplot(filter(t.estD_list, quorum_level %in% quorum_levels[4:7]), aes(x = midpoints, y = qD, ymin = qD.LCL, ymax = qD.UCL, group = quorum_level, colour = quorum_level)) +
  geom_line(size = 1) + geom_point() + geom_linerange() +
  scale_colour_manual(values = c("chartreuse1", "chartreuse2", "chartreuse3", "chartreuse4")) +
  scale_x_reverse(limits = c(260, 227)) + labs(x = "Ma", y = "Subsampled diversity", colour = "Quorum level") +
  geom_vline(aes(xintercept = 259.8), colour = "grey") + #Start of Wuchiapingian
  geom_vline(aes(xintercept = 254.14), colour = "grey") + #Changhsingian
  geom_vline(aes(xintercept = 252.17), colour = "grey") + #Induan
  geom_vline(aes(xintercept = 252.17), linetype = "longdash", colour = "red") + #PT
  geom_vline(aes(xintercept = 251.2), colour = "grey") + #Olenekian
  geom_vline(aes(xintercept = 247.2), colour = "grey") + #Anisian
  geom_vline(aes(xintercept = 242), colour = "grey") + #Ladinian
  geom_vline(aes(xintercept = 237), colour = "grey") + #Carnian
  geom_vline(aes(xintercept = 227), colour = "grey") + #End of Carnian
  theme_classic()


###SQS - marine###
#Filter marine fossils
marine <- unique_by_substage %>% filter(pres_mode == "body") %>%
  filter(taxon_environment == "marine")
#Generate a table of sample sizes by stage, ordered chronologically
marine_totals <- count(marine, substage_assignment)
marine_totals <- marine_totals[match(substages, marine_totals$substage_assignment),]
marine_totals <- filter(marine_totals, !is.na(n))

#Generate list of frequencies by substage, starting with total value, as needed by iNEXT
#Achieved by using and trimming the 'count' function in dplyr, across a loop of substage names
marine_substage_freq <- list()

for (i in 1:length(substages)) {
  m_list <- marine %>% filter(substage_assignment == substages[i]) %>%
    count(., accepted_name) %>% arrange(desc(n)) %>% add_row(n = sum(.$n), .before = 1) %>%
    select(n)
  m_list <- unlist(m_list, use.names = F)
  if(m_list[1] < 3){m_list <- NA}
  marine_substage_freq[[i]] <- m_list
}
names(marine_substage_freq) <- substages
glimpse(marine_substage_freq)

#Filter out empty lists
marine_substage_freq <- marine_substage_freq[!is.na(marine_substage_freq)]

###Code from Dunne et al. 2018###
#"1  Coverage-standardised diversity curve"

#Set quorum levels
quorum_levels <- round(seq(from = 0.1, to = 0.9, by = 0.1), 1)

#Estimate D using estimateD in iNEXT
m.estD_list <- list()

for(i in 1:length(quorum_levels)) {
  m.estD <- estimateD(marine_substage_freq, datatype = "incidence_freq", base = "coverage", level = quorum_levels[i])
  m.estD <- filter(m.estD, order == 0)        #filter for species richness values
  m.estD$quorum_level <- quorum_levels[i]
  m.estD$reference_t <- marine_totals$n            #add sample sizes in extra column
  m.estD$midpoints <- submidpoints[match(names(marine_substage_freq), substages)]
  m.estD_list[[i]] <- m.estD
}

#Bind the individual dataframes into one
m.estD_list <- bind_rows(m.estD_list)

#Remove values when t is more than three times the sample size
m.estD_list[which(m.estD_list$t >= 3 * m.estD_list$reference_t), c("qD", "qD.LCL", "qD.UCL")] <- rep(NA, 3)

View(m.estD_list)

#Make quorum level a factor
m.estD_list$quorum_level <- as.factor(m.estD_list$quorum_level)

#Plot quorum levels 0.4 - 0.7
ggplot(filter(m.estD_list, quorum_level %in% quorum_levels[4:7]), aes(x = midpoints, y = qD, ymin = qD.LCL, ymax = qD.UCL, group = quorum_level, colour = quorum_level)) +
  geom_line(size = 1) + geom_point() + geom_linerange() +
  scale_colour_manual(values = c("dodgerblue1", "dodgerblue2", "dodgerblue3", "dodgerblue4")) +
  scale_x_reverse(limits = c(260, 227)) + scale_y_continuous (position = "right") +
  labs(x = "Ma", y = "Subsampled diversity", colour = "Quorum level") +
  geom_vline(aes(xintercept = 259.8), colour = "grey") + #Start of Wuchiapingian
  geom_vline(aes(xintercept = 254.14), colour = "grey") + #Changhsingian
  geom_vline(aes(xintercept = 252.17), colour = "grey") + #Induan
  geom_vline(aes(xintercept = 252.17), linetype = "longdash", colour = "red") + #PT
  geom_vline(aes(xintercept = 251.2), colour = "grey") + #Olenekian
  geom_vline(aes(xintercept = 247.2), colour = "grey") + #Anisian
  geom_vline(aes(xintercept = 242), colour = "grey") + #Ladinian
  geom_vline(aes(xintercept = 237), colour = "grey") + #Carnian
  geom_vline(aes(xintercept = 227), colour = "grey") + #End of Carnian
  theme_classic()
  
  
########## Rarefaction curves (Figure S3) ##########

#setwd("#####")

#Load packages
library(tidyverse)
library(iNEXT)

#Read in dataset
tetrapods <- read_csv("data/tetrapod_database_final.csv")
glimpse(tetrapods)

#Remove synonymy repeats (combinations of the same collection no. AND accepted name)
tetrapods <- distinct(tetrapods, accepted_name, collection_no, .keep_all = T)

#Trim data without stage assignment
tetrapods_trim <- tetrapods %>% filter(!is.na(stage_assignment))

#Trim to body fossils
tetrapods_trim <- filter(tetrapods_trim, pres_mode == "body")

#Add column which indicates latitude bin
bins <- seq(from = -90, to = 90, by = 20)
labels <- seq(from = -80, to = 80, by = 20)
tetrapods_trim <- mutate(tetrapods_trim, paleolat_code = cut(Latitude, breaks = bins, labels = labels))

#Filter data, choose stage(s) and environment
stage_tetrapods <- tetrapods_trim %>% filter(stage_assignment == "Anisian" | stage_assignment == "Ladinian") %>%
  filter(taxon_environment == "marine")

#Filter unique taxa
#Create an empty dataset with PBDB column names to collect unique occurrences in
unique_by_bin <- stage_tetrapods[FALSE,]

#Loop through each palaeolatitude bin
#For that bin, retain species occurrences, then retain unique taxa at gradually
#   higher taxonomic levels which are not already represented in that bin
for (i in 1:(length(labels))) {
  print(i)
  one_bin <- stage_tetrapods %>% filter(paleolat_code == labels[i])
  for (j in 1:(nrow(one_bin))) {
    if (nrow(one_bin) != 0)
      if (one_bin$accepted_rank[j] == "species") unique_by_bin <- rbind(unique_by_bin, one_bin[j,]) else
        if (!is.na(one_bin$genus[j]))
          (if (one_bin$genus[j] %in% one_bin$genus[-j] == F) unique_by_bin <- rbind(unique_by_bin, one_bin[j,])) else
            if (!is.na(one_bin$family[j]))
              (if (one_bin$family[j] %in% one_bin$family[-j] == F) unique_by_bin <- rbind(unique_by_bin, one_bin[j,])) else
                if (!is.na(one_bin$order[j]))
                  (if (one_bin$order[j] %in% one_bin$order[-j] == F) unique_by_bin <- rbind(unique_by_bin, one_bin[j,]))
  }
}

#Generate a table of sample sizes by latitude bin, ordered numerically, and remove NAs
stage_totals <- count(unique_by_bin, paleolat_code)
for(i in 1:length(labels)) {
  if((labels[i] %in% stage_totals$paleolat_code) == FALSE) stage_totals <- rbind(stage_totals, c(labels[i], 1))
}

#Reorder for ease of checking
stage_totals <- arrange(stage_totals, paleolat_code)
stage_totals_list <- pull(stage_totals, n)

#Generate list of frequencies by stage, starting with total value, as needed by iNEXT
#Achieved by using and trimming the 'count' function in dplyr, across a loop of stage names
lat_freq <- list()

for (i in 1:length(labels)) {
  lat_list <- stage_tetrapods %>% filter(paleolat_code == labels[i]) %>%
    count(., accepted_name) %>% arrange(desc(n)) %>% add_row(n = sum(.$n), .before = 1) %>%
    select(n)
  lat_list <- unlist(lat_list, use.names = F)
  if(lat_list[1] < 3){lat_list <- NA}
  lat_freq[[i]] <- lat_list
}
names(lat_freq) <- labels
glimpse(lat_freq)

#Filter out empty lists
lat_freq <- lat_freq[!is.na(lat_freq)]

#Run iNEXT function
inc.data <- iNEXT(lat_freq, q = 0, datatype = "incidence_freq", knots = 100)

#Extract rarefaction points
cov_rare <- inc.data$iNextEst

for(i in 1:length(cov_rare)) {
  cov_rare[[i]]$stage_int <- names(cov_rare)[i]
}

cov_rare <- do.call(rbind, cov_rare) %>% tbl_df() #convert to tibble for ease of plotting

#Plot
ggplot(data = cov_rare, aes(x = SC, y = qD, ymin = qD.LCL, ymax = qD.UCL, fill = stage_int, colour = stage_int, lty = method)) + 
  geom_line(size = 1) + 
  scale_linetype_manual(values=c("dotted", "solid", "longdash")) +
  labs(x = "Coverage", y = "Species richness") +
  theme_classic()
  
  
########## Bootstraps (Figure S4) ##########

#setwd("#####")

#Load packages
library(tidyverse)

#Read in dataset
tetrapods <- read_csv("data/tetrapod_database_final.csv")
glimpse(tetrapods)

#Remove synonymy repeats (combinations of the same collection no. AND accepted name)
tetrapods <- distinct(tetrapods, accepted_name, collection_no, .keep_all = T)

#Trim data without stage assignment
tetrapods_trim <- tetrapods %>% filter(!is.na(stage_assignment))

#Add column which indicates latitude bin
bins <- seq(from = -90, to = 90, by = 20)
labels <- seq(from = -80, to = 80, by = 20)
tetrapods_trim <- mutate(tetrapods_trim, paleolat_code = cut(Latitude, breaks = bins, labels = labels))

#Choose filters as desired
tetrapods_trim <- tetrapods_trim %>% filter(pres_mode == "body") %>% 
  filter(taxon_environment == "marine") %>% 
  filter(stage_assignment == "Induan" | stage_assignment == "Olenekian")


collections <- unique(tetrapods_trim$collection_no)

#Create frames to store sampled collection numbers and the matching curve for each iteration
sample_list <- data.frame()
gradients <- data.frame()

#Indicate number of bootstraps and reps required -> 250 for terrestrial, 30 for marine, 100 reps of each
bootstraps <- 30
reps <- 100

for (x in 1:reps){

  print(x)
  
  #Create subsampled bootstrap of 100 occurrences (with replacement)
  colls_to_sample <- base::sample(collections, bootstraps, replace = T)
  subset <- tetrapods_trim[FALSE,]
  for (k in 1:bootstraps){
  one_coll <- filter(tetrapods_trim, collection_no == colls_to_sample[k])
  subset <- rbind(subset, one_coll)
  }

  #Filter unique taxa
  #Create an empty dataset with PBDB column names to collect unique occurrences in
  unique_by_bin <- tetrapods_trim[FALSE,]

  #Loop through each collection
  #For that collection, retain species occurrences, then retain unique taxa at gradually
  #   higher taxonomic levels which are not already represented in that collection
  #   i.e. if there is an indeterminate dicynodont but no occurrences in that collection which
  #   are dicynodonts but more specifically identified, the occurrence is retained
  for (i in 1:(length(labels))) {
    one_bin <- subset %>% filter(paleolat_code == labels[i])
    for (j in 1:(nrow(one_bin))) {
      if (nrow(one_bin) != 0)
        if (one_bin$accepted_rank[j] == "species") unique_by_bin <- rbind(unique_by_bin, one_bin[j,]) else
          if (!is.na(one_bin$genus[j]))
            (if (one_bin$genus[j] %in% one_bin$genus[-j] == F) unique_by_bin <- rbind(unique_by_bin, one_bin[j,])) else
              if (!is.na(one_bin$family[j]))
                (if (one_bin$family[j] %in% one_bin$family[-j] == F) unique_by_bin <- rbind(unique_by_bin, one_bin[j,])) else
                  if (!is.na(one_bin$order[j]))
                    (if (one_bin$order[j] %in% one_bin$order[-j] == F) unique_by_bin <- rbind(unique_by_bin, one_bin[j,]))
    }
  }

  #Remove repeats of species names, to get one occurrence per unique species per substage
  unique_by_bin <- distinct(unique_by_bin, accepted_name, paleolat_code, .keep_all = T)

  #Generate counts
  raw_curve <- count(unique_by_bin, paleolat_code)

  #Fill in gaps
  for(m in 1:length(labels)) {
    if((labels[m] %in% raw_curve$paleolat_code) == FALSE) raw_curve <- rbind(raw_curve, c(labels[m], 0))
  }

  #Reorder for ease of checking
  raw_curve <- arrange(raw_curve, desc(paleolat_code))
  raw_curve$paleolat_code <- as.numeric(as.character(raw_curve$paleolat_code))

  #Add collection sample and curve for each iteration to a list
  sample_list <- rbind(sample_list, colls_to_sample)
  gradients <- rbind(gradients, raw_curve$n)
}

names(sample_list) <- 1:bootstraps
names(gradients) <- sort(labels, decreasing = T) #reorder labels to match order in output

write.csv(sample_list, file = "data/Bootstrap sample list.csv") #write csv of sampled collections
write.csv(gradients, file = "data/Bootstrap gradients.csv") #write csv of gradients produced for each bootstrap

#sample_list <- read_csv("data/Bootstrap sample list.csv")
#gradients <- read_csv("data/Bootstrap gradients.csv")
#gradients <- gradients[,-1]

#Generate summary statistics
means <- colMeans(gradients)
stand_devs <- apply(gradients, 2, sd)
error <- qnorm(0.95)*(stand_devs/sqrt(reps))
lower <- means - error
upper <- means + error
#Create data frame containing summary statistics ready for plotting
to_plot <- cbind(labels = sort(labels, decreasing = T), means, lower, upper)
to_plot <- data.frame(to_plot)

#Designate colour, limegreen or blue
colour <- "limegreen"

#Plot mean raw richness across bootstraps, with 95% confidence intervals
ggplot(to_plot, aes(x = labels, y = means, ymin = lower, ymax = upper)) +
  geom_line(size = 1, col = colour) + geom_point(size = 2, col = colour) + geom_linerange(col = colour) +
  labs(x = "Palaeolatitude", y = "Raw species richness count") + coord_flip() +
  scale_x_continuous(limits = c(-90, 90)) + geom_vline(aes(xintercept = 0), colour = "black") +
  scale_colour_manual(values = "green") +
  theme_classic()
  

########## Alpha diversity (Table S2) ##########

#setwd("#####")

#Load packages
library(tidyverse)

#Read in dataset
tetrapods <- read_csv("data/tetrapod_database_final.csv")
glimpse(tetrapods)

#Remove synonymy repeats (combinations of the same collection no. AND accepted name)
tetrapods <- distinct(tetrapods, accepted_name, collection_no, .keep_all = T)

#Trim data without stage assignment
tetrapods_trim <- tetrapods %>% filter(!is.na(stage_assignment))

#Add column which indicates latitude bin
bins <- seq(from = -90, to = 90, by = 20)
labels <- seq(from = -80, to = 80, by = 20)
tetrapods_trim <- mutate(tetrapods_trim, paleolat_code = cut(Latitude, breaks = bins, labels = labels))

#Filter unique taxa
#Create an empty dataset with PBDB column names to collect unique occurrences in
new_dataset <- tetrapods_trim[FALSE,]

#Create a vector of unique collection numbers
collections <- unique(tetrapods_trim$collection_no)

#Loop through each collection
#For that collection, retain species occurrences, then retain unique taxa at gradually
#   higher taxonomic levels which are not already represented in that collection
#   i.e. if there is an indeterminate dicynodont but no occurrences in that collection which
#   are dicynodonts but more specifically identified, the occurrence is retained
for (i in 1:(length(collections))) {
  print(i)
  one_coll <- tetrapods_trim %>% filter(collection_no == collections[i])
  for (j in 1:(nrow(one_coll))) {
    if (one_coll$accepted_rank[j] == "species") new_dataset <- rbind(new_dataset, one_coll[j,]) else
      if (!is.na(one_coll$genus[j]))
        (if (one_coll$genus[j] %in% one_coll$genus[-j] == F) new_dataset <- rbind(new_dataset, one_coll[j,])) else
          if (!is.na(one_coll$family[j]))
            (if (one_coll$family[j] %in% one_coll$family[-j] == F) new_dataset <- rbind(new_dataset, one_coll[j,])) else
              if (!is.na(one_coll$order[j]))
                (if (one_coll$order[j] %in% one_coll$order[-j] == F) new_dataset <- rbind(new_dataset, one_coll[j,]))
  }
}

#Filter as desired
subset <- new_dataset %>% filter(pres_mode == "body") %>% 
  filter(taxon_environment == "terrestrial") %>% filter(stage_assignment == "Wuchiapingian")

#Create string of frequencies per collection, and generate a mean across it (choose your palaeolatitude bin)
count_string <- subset %>% filter(paleolat_code == -60) %>% count(collection_no) %>% pull(n)
mean(count_string)


########## Grid cell occupation (Table S3) ##########

#setwd("#####")

#Load packages
library(tidyverse)
library(raster)

#Be wary that some function names are used in both packages - notation is explicit in places to 
#   reduce the chance of error

#Read in dataset
tetrapods <- read_csv("data/tetrapod_database_final.csv")
glimpse(tetrapods)

#Remove synonymy repeats (combinations of the same collection no. AND accepted name)
tetrapods <- distinct(tetrapods, accepted_name, collection_no, .keep_all = T)

#Trim data without stage assignment
tetrapods_trim <- tetrapods %>% filter(!is.na(stage_assignment))

#Add column which indicates latitude bin
bins <- seq(from = -90, to = 90, by = 20)
labels <- seq(from = -80, to = 80, by = 20)
tetrapods_trim <- mutate(tetrapods_trim, paleolat_code = cut(Latitude, breaks = bins, labels = labels))

#Choose filters as desired
tetrapods_trim <- tetrapods_trim %>% filter(pres_mode == "body") %>% 
  filter(taxon_environment == "terrestrial") %>% 
  filter(stage_assignment == "Induan" | stage_assignment == "Olenekian")

#Filter to unique collection numbers
tetrapods_trim <- distinct(tetrapods_trim, collection_no, .keep_all = T)

#Create grid raster with one degree cells (rather than equal area)
x_raster <- raster(ncol=360, nrow=180, xmn=-180, xmx=180, ymn=-90, ymx=90)
x_raster <- setValues(x_raster, c(1:ncell(x_raster)))

#Create empty string to add values to
cell_count <- vector(mode = "numeric", length = 0)

for (i in 1:length(labels)){
  #Filter to bin
  bin_colls <- tetrapods_trim %>% filter(paleolat_code == labels[i])
  
  #Pull coordinates into a spatial points data frame
  bin_colls <- dplyr::select(bin_colls, collection_no, Longitude, Latitude)
  if (nrow(bin_colls) > 0) 
    points <- SpatialPointsDataFrame(bin_colls[,c("Longitude", "Latitude")], bin_colls) else
      points <- NA

  #Calculate occupancy
  occupancy <- raster::extract(x_raster, points)
  if (nrow(bin_colls) > 0) 
    no_occ <- length(unique(occupancy)) else no_occ <- 0
  
  #Append to vector
  cell_count <- append(cell_count, no_occ)
}

#Label the cell count vector
rbind(labels, cell_count)
