# This supplementary material is hosted by Eurosurveillance as supporting information alongside the article "Epidemiological and clinical insights from SARS-CoV-2 RT-PCR crossing threshold values" on behalf of the authors, who remain responsible for the accuracy and appropriateness of the content. The same standards for ethics, copyright, attributions and permissions as for the article apply. Supplements are not edited by Eurosurveillance and the journal is not responsible for the maintenance of any links or email addresses provided therein.

# Remove every object, hidden ones included, from the current environment
rm(list = ls(all.names = TRUE))
# Keep a copy of the modifiable graphical parameters as they are at start-up
def.par <- par(no.readonly = TRUE)

#########################
##  Loading packages


library("tidyverse") ; library("dplyr")
library("lme4") ; library('emmeans')
library("lubridate")
library('boot') ; library('gam') ; library("zoo")
library('R0') ; library('EpiEstim') ; library('incidence')
library('gdata')
library("nlme")
library("ggpubr")
library("ggsci")       # for nice color palettes
library("glmnet")
library("nnet")
library("lmerTest")
library("tidyquant")
library("car")



#########################
##  Loading the data


# Read the data file with the read.csv2() conventions, drop its first
# column and convert the result to a tibble.
Df <- read.csv2(file = "DATA_cleaned_all.csv", header = TRUE) %>%
  dplyr::select(-1) %>%
  as_tibble()

#Df <- Df %>% group_by(id_patient) %>% mutate(replicat=row_number())

# Reference subset: rows with replicat == 1 and an age strictly between
# 0 and 90, before any filtering on the Ct value.
Dtemp <- Df %>%
  dplyr::filter(
    replicat == 1, # one value per patient
    age > 0,
    age < 90
  )

# Ct quantiles (min, 2.5%, median, 97.5%, max) before the Ct filter
quantile(Dtemp$Ct, probs = c(0, 0.025, 0.5, 0.975, 1))


# Analysis data set: same conditions plus a bounded Ct range
Df <- Df %>%
  dplyr::filter(
    replicat == 1, # one value per patient
    Ct > 10,       # remove Cts of 10 or less (zeros included)
    Ct < 45,       # remove Cts too large
    age > 0,
    age < 90
  )

# Ct quantiles after the Ct filter
quantile(Df$Ct, probs = c(0, 0.025, 0.5, 0.975, 1))

# Difference in row count between the Ct-filtered data and the reference
# subset (zero or negative)
nrow(Df) - nrow(Dtemp)


#############################################
##  cleaning the format of the variables


# Target gene as a factor, followed by the number of rows per gene
Df <- Df %>% dplyr::mutate(gene_cible = as.factor(gene_cible))
table(Df$gene_cible)

Df <- Df %>%
  dplyr::mutate(
    # PCR technique, with "Perkin" as the reference level
    technique_PCR = relevel(as.factor(technique_PCR), ref = "Perkin"),
    # Sample type: "VRB" is recoded as "OP", then unused levels are dropped
    nature_prelevement = drop.levels(
      replace(nature_prelevement, which(nature_prelevement == "VRB"), "OP")
    ),
    # Symptom details: the literal value "missing" is treated as NA (values
    # that are already NA stay NA); "symptomatic" is the reference level
    symptom_details = as.character(symptom_details),
    symptom_details = replace(
      symptom_details, which(symptom_details == "missing"), NA
    ),
    symptom_details = relevel(
      as.factor(drop.levels(symptom_details)),
      ref = "symptomatic"
    )
  )
table(Df$symptom_details)

# Qualitative test result, with "1" as the reference level
Df <- Df %>%
  dplyr::mutate(
    resultat_qualitatif = relevel(as.factor(resultat_qualitatif), ref = "1")
  )



# Centre and reduce the numerical covariates. as.numeric() turns the
# one-column matrix returned by scale() into a plain vector.
Df <- Df %>%
  dplyr::mutate(
    date_scaled = as.numeric(scale(as.Date(date_prelevement))),
    age_scaled = as.numeric(scale(as.numeric(age))),
    Rt_scaled = as.numeric(scale(as.numeric(Rt_noshift)))
  )

# Convert the categorical covariates to factors and choose the reference
# level against which the model contrasts are expressed.
Df <- Df %>%
  dplyr::mutate(
    resultat_qualitatif = relevel(as.factor(resultat_qualitatif), ref = "1"),
    lieu_prelevement = relevel(as.factor(lieu_prelevement), ref = "screening"),
    variable_controle_id = relevel(as.factor(variable_controle_id), ref = "0"),
    symptoms = relevel(as.factor(symptoms), ref = "moins_4_jours"),
    laboratoire_analyse = relevel(as.factor(laboratoire_analyse), ref = "LAB_1"),
    nature_prelevement = relevel(as.factor(nature_prelevement), ref = "NP"),
    gene_cible = relevel(as.factor(gene_cible), ref = "N"),
    technique_PCR = relevel(as.factor(technique_PCR), ref = "Perkin")
  )


# Counts of each symptom category per test result, on rows where both are
# known; freq is the share of each symptom category within a test result.
Df %>%
  drop_na(symptoms, resultat_qualitatif) %>%
  ungroup() %>%
  group_by(resultat_qualitatif, symptoms) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))



# Conversion factors between the scaled covariates and their natural units.

# Number of days covered by the sampling dates, then days per unit of
# date_scaled (both are difftime values expressed in days).
time_interval_total <- max(as.Date(Df$date_prelevement)) -
  min(as.Date(Df$date_prelevement))
time_interval_total / (max(Df$date_scaled) - min(Df$date_scaled))

# Age range, then age units per unit of age_scaled. Ages are plain numbers,
# not dates: converting them with as.Date() fails on R < 4.3 (no origin
# supplied) and yields a misleading "days" difftime on newer versions.
age_interval_total <- max(as.numeric(Df$age)) - min(as.numeric(Df$age))
age_interval_total / (max(Df$age_scaled) - min(Df$age_scaled))


#############################################
##  Linear models


# Linear model of Ct with an R(t) x date interaction and all the other
# covariates as additive effects, fitted on every row except those of the
# "ci" target gene.
modele_general <- lm(
  Ct ~ Rt_scaled * date_scaled + age_scaled + sexe +
    gene_cible + technique_PCR +
    laboratoire_analyse + symptoms +
    nature_prelevement + lieu_prelevement +
    resultat_qualitatif + variable_controle_id,
  data = dplyr::filter(Df, gene_cible != "ci")
)

# Same model, removing the negative tests (resultat_qualitatif equal to 0)
# from the data before fitting
modele_general_noneg <- lm(
  Ct ~ Rt_scaled * date_scaled + age_scaled + sexe +
    gene_cible + technique_PCR +
    laboratoire_analyse + symptoms +
    nature_prelevement + lieu_prelevement +
    resultat_qualitatif + variable_controle_id,
  data = dplyr::filter(Df, resultat_qualitatif != 0, gene_cible != "ci")
)

names(Df)

# Distribution of infos_date_symptomes among non-negative tests, and of
# symptoms among negative tests ("ci" target gene excluded in both cases)
dplyr::filter(Df, resultat_qualitatif != 0, gene_cible != "ci") %>%
  dplyr::select(infos_date_symptomes) %>%
  table()
dplyr::filter(Df, resultat_qualitatif == 0, gene_cible != "ci") %>%
  dplyr::select(symptoms) %>%
  table()

# Align the rows of the full-model coefficient table on those of the
# no-negative model, then take the difference of the rounded tables
line_ordering <- match(
  rownames(coef(summary(modele_general_noneg))),
  rownames(coef(summary(modele_general)))
)
model_differences <- round(coef(summary(modele_general)), 3)[line_ordering, ] -
  round(coef(summary(modele_general_noneg)), 3)


# Export the coefficient table of each model, rounded to 3 decimals,
# with the write.csv2() conventions.
modele_general %>%
  summary() %>%
  coef() %>%
  round(3) %>%
  write.csv2(file = "output_modele_general2.csv")

modele_general_noneg %>%
  summary() %>%
  coef() %>%
  round(3) %>%
  write.csv2(file = "modele_general_noneg.csv")


# Same model without Rt (to compare the two models).
# lm() silently drops rows with a missing covariate, so the model with R(t)
# ignores rows where Rt_scaled is NA. The same rows are excluded here:
# an AIC comparison is only meaningful between models fitted to the same
# observations. When Rt_scaled has no missing value this changes nothing.
modele_general_noR <- lm(
  Ct ~ date_scaled + age_scaled + sexe +
    gene_cible + technique_PCR +
    laboratoire_analyse +
    symptoms +
    nature_prelevement + lieu_prelevement +
    resultat_qualitatif + variable_controle_id,
  data = dplyr::filter(
    Df,
    gene_cible != "ci", age > 0, age < 90,
    !is.na(Rt_scaled)
  )
)



# Density of the residuals of the general model (fitted without the "ci"
# gene), with a dashed reference line at zero
plot(density(residuals(modele_general)))
abline(v = 0, col = 2, lty = 2)

# Compare the models with and without R(t)
AIC(modele_general, modele_general_noR)

# Coefficient tables of the models with and without negative tests
summary(modele_general)
summary(modele_general_noneg)


# Type II ANOVA of the general model
anova_typeII <- car::Anova(modele_general)
anova_typeII
summary(anova_typeII)


# Estimates (column 1) and p-values (column 4) of the general model, with
# the term names copied into a regular column for joining
p_values <- as.data.frame(coef(summary(modele_general))[, c(1, 4)])
variables <- row.names(p_values)
p_values$variables <- variables


summary(modele_general)$adj.r.squared


# filtering significant effects from the linear model

# Coefficients and default (Wald) confidence bounds, to 3 significant digits
coefficients <- as.data.frame(
  cbind(
    coef = signif(coef(modele_general), digits = 3),
    signif(confint.default(modele_general), digits = 3)
  )
)
# coefficients <- cbind(coef = signif(coef(modele_general), digits = 2), signif(confint(modele_general), digits = 2))
variables <- row.names(coefficients)
coefficients$variables <- variables

# Attach the p-values and flag the terms below the 0.013 * 0.05 threshold
# (threshold kept as is; its rationale is not stated in this script)
coefficients <- coefficients %>%
  left_join(p_values[, -1], by = "variables") %>%
  mutate(significant = (`Pr(>|t|)` <= 0.013 * 0.05))

# Laboratory terms with p <= 0.01
coefficients_labs <- coefficients %>%
  dplyr::filter(startsWith(variables, "laboratoire_"), `Pr(>|t|)` <= 0.01) %>%
  dplyr::select(c(4, 1:3, 5))

# Control-variable terms with p <= 0.05 (p-value column not kept)
coefficients_control <- coefficients %>%
  dplyr::filter(startsWith(variables, "variable_c"), `Pr(>|t|)` <= 0.05) %>%
  dplyr::select(c(4, 1:3))

# PCR-technique terms with p <= 0.01
coefficients_technique <- coefficients %>%
  dplyr::filter(startsWith(variables, "technique_"), `Pr(>|t|)` <= 0.01) %>%
  dplyr::select(c(4, 1:3, 5))



#table(dplyr::filter(Df,technique_PCR=="Genefinder")$partenaire_etude)

# coefficients_plot<-dplyr::filter(coefficients,
#                                 (substr(variables,1,10)!="(Intercept")&(substr(variables,1,10)!="technique_")&
#                                   (substr(variables,1,12)!="laboratoire_")&
#                                   (substr(variables,1,10)!="variable_c")&(significant==TRUE))

# Terms flagged as significant: name, estimate, bounds and p-value
coefficients_plot <- coefficients %>%
  dplyr::filter(significant == TRUE) %>%
  dplyr::select(c(4, 1:3, 5))

coefficients_plot # showing the significant coefficients only






####################################################################
### Focusing on positive tests from screening, can we predict R(t) ?
####################################################################

# Positive screening tests sampled from 1 July 2020 onwards
Du <- Df %>%
  dplyr::filter(
    resultat_qualitatif == 1,                  # test is positive
    Ct > 10,                                   # remove Cts of 10 or less
    Ct < 45,                                   # remove Cts of 45 or more
    # date_prelevement > "2020-05-11",         # post lock-down (disabled)
    as.Date(date_prelevement) >= "2020-07-01", # from 1 July 2020 onwards
    gene_cible != "ci",                        # remove internal control gene
    lieu_prelevement == "screening",           # from screening
    age > 5,                                   # not too young
    age < 81                                   # not too old
  )

dim(Du)

Du_original <- Du # save before further changes
# Du <- Du_original

# Centre and reduce the numerical variables within this subset.
# scale() returns a one-column matrix carrying centring attributes;
# as.numeric() stores plain vectors instead, as is done for Df, so that the
# columns of Du stay ordinary vectors in every downstream step.
# Note that `age` itself is overwritten by its scaled version.
Du$date_scaled <- as.numeric(scale(as.Date(Du$date_prelevement)))
Du$age <- as.numeric(scale(as.numeric(Du$age)))
Du$Rt_scaled <- as.numeric(scale(as.numeric(Du$Rt_noshift)))



## Drop the factor levels that no longer occur after filtering
Du$laboratoire_analyse <- drop.levels(as.factor(Du$laboratoire_analyse))
Du$technique_PCR <- drop.levels(as.factor(Du$technique_PCR))
Du$gene_cible <- drop.levels(as.factor(Du$gene_cible))
Du$nature_prelevement <- drop.levels(as.factor(Du$nature_prelevement))
Du$lieu_prelevement <- drop.levels(as.factor(Du$lieu_prelevement))



# Keep only the rows where every covariate of the models below is known
Du <- drop_na(Du, laboratoire_analyse, technique_PCR, gene_cible, age)
Du_saved <- Du

# Number of rows/columns and number of distinct patients
dim(Du)
length(unique(Du$id_patient))
length(unique(Du$id_patient))

## First create models of Ct that ignore R(t), to account for the other
## sources of Ct variation

# Fixed-effects version: age, target gene x PCR technique, and laboratory
modele_noR2 <- lm(
  Ct ~ age + gene_cible * technique_PCR + laboratoire_analyse,
  data = Du
)

# Mixed-effects version with a random intercept per laboratory; according
# to the original author gene_cible has to be abandoned here, which is not
# great
modele_noR <- lme(
  Ct ~ 1 + technique_PCR + age,
  random = ~ 1 | laboratoire_analyse,
  data = Du
)

# Density of the residuals of the mixed-effects model
plot(density(residuals(modele_noR)))

# ANOVA table of the fixed-effects model
anova(modele_noR2)

# Fit summary of the fixed-effects model (original author's note: about
# 22% of the variance is explained without R)
summary(modele_noR2)




### Creating new variables

# Residuals of the two R(t)-free models, stored next to the data
Du$Ct_residuals_raw <- residuals(modele_noR)   # mixed-effects model
Du$Ct_residuals2_raw <- residuals(modele_noR2) # fixed-effects model

# The interactive help lookup `?skewness` that used to sit here was removed:
# it has no place in a script meant to run unattended.
# NOTE(review): skewness() is not defined in this script; it has to come
# from one of the attached packages - confirm which one is intended.

# compute daily summary statistics: median and skewness of the raw Ct and
# of both sets of residuals, plus the median R(t), per sampling date
Du_daily <- Du %>%
  group_by(date_prelevement) %>%
  summarise(
    median_Ct_raw = median(Ct),
    median_Ct_residuals = median(Ct_residuals_raw),
    median_Rt = median(Rt_noshift),
    skew_Ct_raw = skewness(Ct),
    skew_Ct_residuals = skewness(Ct_residuals_raw),
    median_Ct_residuals2 = median(Ct_residuals2_raw),
    skew_Ct_residuals2 = skewness(Ct_residuals2_raw)
  )



# Right-aligned rolling means over 7 consecutive rows of the daily table
# (the first 6 rows are NA), for the lme residuals and then for the lm
# residuals (suffix 2)
Du_daily <- Du_daily %>%
  dplyr::mutate(
    Ct_residuals = zoo::rollmean(
      median_Ct_residuals, k = 7, fill = NA, align = "right"
    ),
    Ct_residuals_skew = zoo::rollmean(
      skew_Ct_residuals, k = 7, fill = NA, align = "right"
    ),
    Ct_residuals2 = zoo::rollmean(
      median_Ct_residuals2, k = 7, fill = NA, align = "right"
    ),
    Ct_residuals_skew2 = zoo::rollmean(
      skew_Ct_residuals2, k = 7, fill = NA, align = "right"
    )
  )
         

# studying correlations

# Complete daily rows sampled after 1 August 2020. The dates are compared
# as Date objects (as done elsewhere in this script) instead of relying on
# the alphabetical order of their text representation, and the subset is
# built once instead of once per ccf() argument.
Du_daily_aug <- drop_na(
  Du_daily[as.Date(Du_daily$date_prelevement) > as.Date("2020-08-01"), ]
)

# Cross-correlation between the smoothed Ct skewness and the median R(t)
ccf(Du_daily_aug$Ct_residuals_skew, Du_daily_aug$median_Rt, lag.max = 14)
# Same for the smoothed median Ct residuals, over the whole period
ccf(drop_na(Du_daily)$Ct_residuals, drop_na(Du_daily)$median_Rt, lag.max = 14)

# Printed cross-correlations for the lme residuals, then the lm residuals
ccf(Du_daily_aug$Ct_residuals, Du_daily_aug$median_Rt, lag.max = 20)[, 1]
ccf(Du_daily_aug$Ct_residuals2, Du_daily_aug$median_Rt, lag.max = 20)[, 1]
# correlation is better with modele_noR, i.e. mixed effects


# creating a shifted dataset: every row keeps its own Ct statistics but
# receives the median R(t) found `shifting` rows later in Du_daily, so the
# last `shifting` rows are dropped
N <- nrow(Du_daily)
shifting <- 7
Du_daily_shifted5 <- Du_daily[seq_len(N - shifting), ]
Du_daily_shifted5$median_Rt <- tail(Du_daily$median_Rt, -shifting)


# Du_daily_shifted5 <- Du_daily # uncomment to remove the shift

# Keep only the columns used afterwards
Du_daily_shifted5 <- Du_daily_shifted5 %>%
  dplyr::select(date_prelevement, median_Rt, Ct_residuals, Ct_residuals_skew)

# Time series after 1 August 2020 of the centred Ct median (divided by 6),
# the centred Ct skewness and R(t) - 1, with a dashed line at zero
ggplot(
  data = dplyr::filter(Du_daily_shifted5, date_prelevement > "2020-08-01"),
  mapping = aes(x = as.Date(date_prelevement))
) +
  geom_line(
    aes(y = (Ct_residuals - mean(Ct_residuals)) / 6, colour = "Ct median")
  ) +
  geom_line(
    aes(y = Ct_residuals_skew - mean(Ct_residuals_skew), colour = "Ct skew")
  ) +
  #  geom_line(aes(y=(Ct_residuals2-mean(Ct_residuals2))/6,col="Ct median 2"))+
  #  geom_line(aes(y=Ct_residuals_skew2-mean(Ct_residuals_skew2),col="Ct skew 2"))+
  geom_line(aes(y = median_Rt - 1, colour = "R(t)-1")) +
  geom_hline(yintercept = 0, linetype = 2, colour = "purple") +
  labs(
    x = "date",
    y = "écart par rapport à la moyenne globale",
    colour = ""
  ) +
  #  scale_color_discrete(limits=c("black","red","blue"),labels=c("R(t)","Ct médian", "Ct skew"))+
  scale_color_aaas() +
  scale_fill_aaas()



# how much do we explain in the end ?
# The sampling date enters the model as a numeric day count (linear trend).
# date_prelevement is never converted to a Date in this script, so using it
# directly makes lm() expand it into one dummy variable per day, which
# saturates the model and makes the fit statistics meaningless.
# NOTE(review): the fit uses Du_daily, not the lagged Du_daily_shifted5
# built earlier in this script - confirm which data set is intended.
model_daily5 <- lm(
  median_Rt ~ Ct_residuals + Ct_residuals_skew +
    as.numeric(as.Date(date_prelevement)),
  data = Du_daily
)
anova(model_daily5)
summary(model_daily5)



# Save every object of the workspace to disk
save.image(file = "study_noreplicate.RData")

