# Scripts for main figures of Manaus and São Paulo seroprevalence paper
# Author: Lewis Buss (lewisbuss@usp.br)

# Packages 
library(tidyverse)
library(lubridate)
library(ggbeeswarm)

####################################################################################
#################### Main Figure 1 - Assay validation ##############################
# Validation samples for the Abbott assay
# Variable "type" gives description of each group
validation <- read.csv("data/validation_data.csv", stringsAsFactors = F)

# Make the two cutoffs of 1.4 (manufactuer) and 0.4 (reduced threshold)
validation <- validation %>% 
  mutate(result = ifelse(abbott_sc < 1.4, "Negative", "Positive"),
         result_alt = ifelse(abbott_sc >= 1.4, "Positive", ifelse(abbott_sc <1.4 & abbott_sc > 0.4, "Ind", "Negative"))) 

# Figure 1A
ggplot(validation) +
  ggbeeswarm::geom_quasirandom(aes(x = reorder(type, order), y = abbott_sc, color = result_alt),
                               size = 0.5) +
  geom_boxplot(aes(x = reorder(type, order), y = abbott_sc), 
               alpha = 0.5, 
               outlier.size = -1,
               width = 0.2) +
  scale_y_log10() +
  scale_color_manual(values = c("orange", "darkblue", "red")) +
  geom_hline(yintercept = 1.4, linetype = 2) +
  geom_hline(yintercept = 0.4, linetype = 2) +
  theme_classic() +
  labs(x = "Sample type",
       y = "Abbott SARS-CoV-2 IgG CLIA S/C\n(log scale)") +
  theme(legend.position = "none")

# Figure 1B
# Cohort of convalescent plasma donors with multiple blood draws
plasma_donors <- read.csv("data/convalescent_plasma_longitudinal.csv", stringsAsFactors = F)

# Figure 1B
ggplot(plasma_donors) +
  geom_point(aes(x = days_post_symptoms, y = abbott_sc)) +
  geom_line(aes(x = days_post_symptoms, y = abbott_sc, group = donor_id)) +
  labs(x = "Days after symptom onset",
       y = "Abbott SARS-CoV-2 IgG CIMA S/C") +
  theme_classic()

# Figure 1C
baseline <- plasma_donors %>% 
  filter(seropositive_at_baseline == "yes", followup_visit == "baseline") %>% 
  select(donor_id, first_sc = abbott_sc, first_timepoint = days_post_symptoms) %>% 
  ungroup()

follow_up <- plasma_donors %>% 
  filter(seropositive_at_baseline == "yes", followup_visit == "first follow-up") %>%
  select(donor_id, second_sc = abbott_sc, second_timepoint = days_post_symptoms) %>% 
  ungroup()

# Calculate the decay expressed in log base 2 units per day
gradients <- baseline %>% left_join(follow_up, by = "donor_id")
gradients <- gradients %>% 
  mutate(diff_time = second_timepoint - first_timepoint,
         gradient = log(second_sc, 2) - log(first_sc, 2),
         gradient = gradient/diff_time)

ggplot(gradients) +
  geom_histogram(aes(x = gradient*100),
                 color = "black",
                 fill = "white") +
  geom_vline(xintercept = mean(gradients$gradient)*100, linetype = 2, color = "red") +
  annotate("text", color = "red", x = -3, y = 9, label = "Mean half-life\n106 days") +
  scale_x_continuous(breaks = seq(-4, 3, 1)) +
  theme_classic() +
  labs(x = "Change in Abbott SARS-CoV-2 IgG\nCIMA S/C (log2 units) per 100 days",
       y = "Number of plasma donors")

###################################################################################
#################### Main figure 2 - Prevalence estimates and signal-to-cutoff ##########################

### Figures 2A and 2C

# Total daily deaths and daily mortality standardized to age-structure of Brazilian population, according to the date of death
stand_deaths <- read.csv("data/stand_deaths.csv", stringsAsFactors = F)
# Prevalence estimates with various corrections calculated with the 1.4 and 0.4 thresholds
prev <- read.csv("data/plot_df_prevalences.csv", stringsAsFactors = F)

# Y-axis limit for mortality and prevalence plots
scaler <- max(stand_deaths$brazil_rate)

# Change the location to plot Sao Paulo or Manaus
prev <- prev %>% filter(location == "Manaus") # Change here
stand_deaths <- stand_deaths %>% filter(location == "Manaus") # Change here

# Convert dates into date format
prev <- prev %>% mutate(min_dt = mdy(min_dt))
stand_deaths <- stand_deaths %>% mutate(DT_EVOLUCA = mdy(DT_EVOLUCA))

# Cumulative deaths
stand_deaths <- stand_deaths %>% 
  arrange(DT_EVOLUCA) %>% 
  mutate(cum_deaths = cumsum(total_deaths))

# These scalers are needed to rescale the cumulative deaths curve to the maximum 
# corrected prevalence value 
scaler1 <- max(stand_deaths$cum_deaths)
scaler2 <- max(prev$prev, na.rm = T)

# To offset/dodge points
prev <- prev %>% 
  mutate(dodge = case_when(
    type == "Age-sex re-weighted (1.4)" ~ -6,
    type == "Age-sex & sens/spec adjusted (1.4)" ~ -3,
    type == "Age-sex re-weighted (0.4)" ~ 0,
    type == "Age-sex & sens/spec adjusted (0.4)" ~ 3,
    type == "Seroreversion corrected (1.4)" ~ 6
  ))

ggplot() +
  geom_line(data = stand_deaths, aes(x = DT_EVOLUCA, y = (cum_deaths/scaler1)*scaler2*scaler)) +
  geom_col(data = stand_deaths, aes(x = DT_EVOLUCA, y = brazil_rate), alpha = 0.5) +
  geom_point(data = prev,
             aes(x = min_dt + mid_adjust + dodge, y = prev*scaler, color = type), 
             size = 2) +
  geom_errorbar(data = prev, 
                aes(x = min_dt+mid_adjust+dodge, 
                    ymin = ci_l*scaler, 
                    ymax = ci_u*scaler,
                    color = type), 
                width = 3) +
  scale_y_continuous(limits = c(0, scaler), sec.axis = sec_axis(~100*(./scaler), name = "SARS-CoV-2 anitbody prevalance (%)")) +
  scale_x_date(breaks = "months", date_labels = "%m/%y", limits = c(dmy("01-02-2020"), dmy("30-10-2020"))) +
  theme_classic() +
  theme(legend.position = "top", 
        axis.text.x = element_text(angle = 45, hjust = T)) +
  labs(x = "",
       y = "Age-sex standardized COVID-19\nmortality per 100,000 inhabitants")

#### Figures 2B and 2D - signal-to-cutoff readings
sero <- read.csv("data/sero_consolidated_weights.csv", stringsAsFactors = F)

sero <- sero %>% select(abbott_sc, location, donmo, min_dt, mid_adjust)

sero <- sero %>% mutate(sample_month = case_when(
  donmo == 2 ~ "February",
  donmo == 3 ~ "March",
  donmo == 4 ~ "April",
  donmo == 5 ~ "May",
  donmo == 6 ~ "June",
  donmo == 7 ~ "July",
  donmo == 8 ~ "August",
  donmo == 9 ~ "September",
  donmo == 10 ~ "October"
),
covid_resultado_alt = ifelse(abbott_sc < 0.4, "Neg", 
                             ifelse(abbott_sc >=0.4 & abbott_sc <1.4, "Ind", "Pos")))

# Dates
sero <- sero %>% mutate(min_dt = mdy(min_dt)) 

# Change location between São Paulo and Manaus to generate plots 2B and 2D
ggplot() +
  ggbeeswarm::geom_quasirandom(data = sero %>% filter(location == "Manaus"), # change here - Sao Paulo or Manaus
                               aes(x = min_dt + mid_adjust, y = abbott_sc, color = covid_resultado_alt),
                               alpha = 0.5, size = 0.05) +
  geom_boxplot(data = sero %>% filter(location == "Manaus", # change here - Sao Paulo or Manaus
                                      abbott_sc > 0.4),
               aes(x = min_dt + mid_adjust, y = abbott_sc, group = sample_month),
               alpha = 0.3, outlier.size = -1,
               width = 3) +
  geom_hline(yintercept = 1.4, linetype = 2) +
  geom_hline(yintercept = 0.4, linetype = 2) +
  scale_color_manual(values = c("orange", "darkblue", "lightpink")) +
  scale_y_log10() +
  scale_x_date(breaks = "months", date_labels = "%m/%y") +
  labs(x = "", y = "Abbott SARS-CoV-2 IgG\nCLIA S/CO (log scale)",
       color = "Result at manufactuer's threshold") +
  theme_classic() +
  theme(legend.position = "none")

###################################################################################
#################### Calculation of infection fatality ratios  ####################
# Demographic data - city level projections
demog <- read.csv("data/demography.csv", stringsAsFactors = F)
# SIVEP-Gripe - downloaded 19-10-2020
srag <- read.csv("data/INFLUD-19-10-2020.csv",
                 stringsAsFactors = F,
                 sep = ";")
# Prevalence dataframe
prev <- read.csv("data/plot_df_prevalences.csv", stringsAsFactors = F)

# Pop size by age groups for Manaus
mn <- demog %>% filter(region == "Manaus")
mn <- mn %>% mutate(age_brackets = cut(age_low,
                                       breaks = c(seq(-1, 80, 10), 100),
                                       labels = c(str_c(seq(0, 70, 10), seq(9, 79, 10), sep = " to "), "80+")))
mn <- mn %>% 
  group_by(age_brackets) %>% 
  summarise(population = sum(population))

# Seroprevalence in June, prior to significant waning
seroprev1 <- prev$prev[prev$location == "Manaus" & prev$type == "Age-sex & sens/spec adjusted (1.4)" & prev$donmo == 6]

# Seroprevalence in October after adjusting for wanning
seroprev2 <- prev$prev[prev$location == "Manaus" & prev$type == "Seroreversion corrected (1.4)" & prev$donmo == 10]

# Total infections in each age group assuming constant attack rate
mn <- mn %>% mutate(total_infections_june = population*seroprev1,
                    total_infections_oct = population*seroprev2)

# Deaths by age group Manaus
mn_deaths <- srag %>% 
  filter(EVOLUCAO == 2,  # Select deaths only
         CO_MUN_RES == 130260, # Select residents of Manaus only
         SG_UF_NOT == "AM")   

mn_deaths <- mn_deaths %>% 
  mutate(DT_NOTIFIC = dmy(DT_NOTIFIC), # date of notification
         DT_EVOLUCA = dmy(DT_EVOLUCA)) # date of death

# Make same age brackets
mn_deaths <- mn_deaths %>% 
  mutate(age = as.numeric(ifelse(TP_IDADE == 3, NU_IDADE_N, 0)), # When age recorded in months or days must be infant <1yr
         age_brackets = cut(age, 
                            c(seq(-1, 80, 10), 99), 
                            include.lowest = T,
                            labels = c(str_c(seq(0, 70, 10), seq(9, 79, 10), sep = " to "), "80+"))) 

# Select all deaths due to SARI including confirmed COVID-19 
mn_deaths_sari1 <- mn_deaths %>% 
  filter(CLASSI_FIN %in% c(4, 5) | is.na(CLASSI_FIN)) %>%  # Change here to CLASSI_FIN == 5 to use only confirmed COVID-19 deaths
  # This window spans from the first confirmed COVID-19 death up to the end of the sampling window 
  # in June
  filter(DT_EVOLUCA <= dmy("15-06-2020") & DT_EVOLUCA >= ymd("2020-03-27")) %>% 
  group_by(age_brackets) %>% 
  tally() %>%
  rename(deaths_june = n) %>% 
  filter(!is.na(age_brackets))

mn_deaths_sari2 <- mn_deaths %>% 
  filter(CLASSI_FIN %in% c(4, 5) | is.na(CLASSI_FIN)) %>% 
  # This window spans from the first confirmed COVID-19 death up to the end of the sampling window 
  # in October
  filter(DT_EVOLUCA <= ymd("2020-10-17") & DT_EVOLUCA >= ymd("2020-03-27")) %>% 
  group_by(age_brackets) %>% 
  tally() %>%
  rename(deaths_oct = n) %>% 
  filter(!is.na(age_brackets))

mn_deaths_sari <- mn_deaths_sari1 %>% left_join(mn_deaths_sari2)

rm(mn_deaths_sari1, mn_deaths_sari2)

# Put deaths and pop size together
mn <- mn %>% left_join(mn_deaths_sari, by = "age_brackets")

# caluculate IFR point estimate
mn <- mn %>% mutate(ifr_june = (deaths_june/total_infections_june)*100,
                    ifr_oct =  (deaths_oct/total_infections_oct)*100,
                    location = "Manaus")

# Global IFRs
sum(mn$deaths_june)/sum(mn$total_infections_june)

#### São Paulo
# Pop size by age groups for São Paulo
sp <- demog %>% filter(region == "Sao Paulo")
sp <- sp %>% mutate(age_brackets = cut(age_low,
                                       breaks = c(seq(-1, 80, 10), 100),
                                       labels = c(str_c(seq(0, 70, 10), seq(9, 79, 10), sep = " to "), "80+")))
sp <- sp %>% 
  group_by(age_brackets) %>% 
  summarise(population = sum(population))

# Seroprevalence in June, prior to significant waning
seroprev1 <- prev$prev[prev$location == "Sao Paulo" & prev$type == "Age-sex & sens/spec adjusted (1.4)" & prev$donmo == 6]

# Seroprevalence in October after adjusting for wanning
seroprev2 <- prev$prev[prev$location == "Sao Paulo" & prev$type == "Seroreversion corrected (1.4)" & prev$donmo == 10]


# Total infections in each age group assuming constant attack rate
sp <- sp %>% mutate(total_infections_june = population*seroprev1,
                    total_infections_oct = population*seroprev2)

# Deaths by age group Manaus
sp_deaths <- srag %>% 
  filter(EVOLUCAO == 2,  # Select deaths only
         CO_MUN_RES == 355030, # Select residents of SP only
         SG_UF_NOT == "SP")   

sp_deaths <- sp_deaths %>% 
  mutate(DT_NOTIFIC = dmy(DT_NOTIFIC), # date of notification
         DT_EVOLUCA = dmy(DT_EVOLUCA)) # date of death

sp_deaths <- sp_deaths %>% 
  mutate(age = as.numeric(ifelse(TP_IDADE == 3, NU_IDADE_N, 0)), # When age recorded in months or days must be infant <1yr
         age_brackets = cut(age, 
                            c(seq(-1, 80, 10), 99), # Five year age bands, going 0-4, 5-9, 10-14 etc.
                            include.lowest = T,
                            labels = c(str_c(seq(0, 70, 10), seq(9, 79, 10), sep = " to "), "80+"))) 

# Select all deaths due to SARI including confirmed COVID-19 
sp_deaths_sari1 <- sp_deaths %>% 
  filter(CLASSI_FIN %in% c(4, 5) | is.na(CLASSI_FIN)) %>%  # Toggle here to CLASSI_FIN == 5 for only confirmed COVID-19
  # This window spans from the first confirmed COVID-19 death up to the end of the sampling window 
  # in June
  filter(DT_EVOLUCA <= ymd("2020-06-20") & DT_EVOLUCA >= ymd("2020-03-13")) %>% 
  group_by(age_brackets) %>% 
  tally() %>%
  rename(deaths_june = n) %>% 
  filter(!is.na(age_brackets))

sp_deaths_sari2 <- sp_deaths %>% 
  filter(CLASSI_FIN %in% c(4, 5) | is.na(CLASSI_FIN)) %>% 
  # This window spans from the first confirmed COVID-19 death up to the end of the sampling window 
  # in June
  filter(DT_EVOLUCA <= ymd("2020-10-24") & DT_EVOLUCA >= ymd("2020-03-13")) %>% 
  group_by(age_brackets) %>% 
  tally() %>%
  rename(deaths_oct = n) %>% 
  filter(!is.na(age_brackets))

sp_deaths_sari <- sp_deaths_sari1 %>% left_join(sp_deaths_sari2)

rm(sp_deaths_sari1, sp_deaths_sari2)

# Put deaths and pop size together
sp <- sp %>% left_join(sp_deaths_sari, by = "age_brackets")

# caluculate IFR point estimate
sp <- sp %>% mutate(ifr_june = (deaths_june/total_infections_june)*100,
                    ifr_oct =  (deaths_oct/total_infections_oct)*100,
                    location = "São Paulo") 
# Global IFRs
sum(sp$deaths_june)/sum(sp$total_infections_june)

all <- rbind(
  sp %>% ungroup %>% select(age_brackets, ifr = ifr_june, location) %>% mutate(type = "June (before seroreversion)"),
  sp %>% ungroup %>% select(age_brackets, ifr = ifr_oct, location) %>% mutate(type = "October (seroreversion corrected)"),
  mn %>% ungroup %>% select(age_brackets, ifr = ifr_june, location) %>% mutate(type = "June (before seroreversion)"),
  mn %>% ungroup %>% select(age_brackets, ifr = ifr_oct,  location) %>% mutate(type = "October (seroreversion corrected)")
) 

verity <- read.csv("data/verity_ifr.csv")
verity1 <- verity %>% select(age_brackets, ifr) %>% mutate(location = "Mainland China", type = "June (before seroreversion)")
verity2 <- verity %>% select(age_brackets, ifr) %>% mutate(location = "Mainland China", type = "October (seroreversion corrected)")
all <- all %>% rbind(verity1, verity2)

ggplot(all) +
  geom_col(aes(x = age_brackets, y = ifr, fill = location), color = "black", position = "dodge") +
  labs(fill = "", x = "Age groups (years)", y = "Age specific infection\nfatality ratio (%)") +
  theme_classic() +
  facet_wrap(~ type) +
  theme(axis.text.x = element_text(angle = 45, hjust = T),
        legend.position = "top")

