rm(list=ls(all=TRUE)) # clears variables1 
def.par<-par(no.readonly=TRUE) # default graphical parameters

Sys.setlocale("LC_TIME", "en_IN") #use English for dates


library('tidyverse')
library('nnet') # for multinomial model
library("summarytools") # for summary tables
library('mixdist') # for Weibull distribution
library("drc") # for dose response curve model

library("texreg") # conversion to TeX
library("Hmisc") # for summary tables
library("summarytools") # for summary tables
library("raster")
library("ggmap")
library("sf")
library("maps")
library("mapproj")
library("zoo")

library('ggsci')       # for nice color palettes


#### Reading the data

setwd("/home/samuel/Documents/Hote-parasite/COVID-2019/Ct/VOC/Omicron/")


data<-read.csv2(file="data_EID1.csv")
data<-data[,-1]
data$date<-as.Date(data$date)
data$date_labo<-as.Date(data$date_labo)

Df_stored<-data

max_date<-max(data$date_labo)
min_date<-min(data$date_labo)



#write.table(table(Df_stored$variant),file="table_variant")




##########################
#### Generic Plots



Df <- ungroup(data) %>% group_by(REGION) %>% dplyr::mutate(n_REGION=n())
Df <- Df %>% dplyr::mutate(region2=ifelse(n_REGION>1000,as.character(REGION),"other"))
Df$region2<-as.factor(Df$region2)
Df = within(Df, region2 <- relevel(region2,ref='Ile-de-France'))

#write.table(table(Df$region2),file="regions.csv")

# strain by region and
ggplot(dplyr::filter(drop_na(Df,region2,variant),
                     #date_labo>=max_date-28,
                     variant!="other?"))+
  geom_bar(aes(x=date_labo,fill=variant),position="fill")+
  labs(x="",y="proportion",fill="")+
  #  scale_y_continuous(trans = 'log10')+
  scale_fill_simpsons()+
  theme(axis.text.x=element_text(angle = -45, hjust = 0))+
  theme_classic()

ggsave(paste("proportions_",max_date,".pdf",sep=""),
       width = 10,height = 6,units = "cm")



# strain by region and
ggplot(dplyr::filter(drop_na(Df,region2,variant),
                     #date_labo>=max_date-28,
                     variant!="other?"))+
  geom_bar(aes(x=date_labo,fill=variant))+
  labs(x="",y="Number of tests",fill="")+
  facet_wrap(.~region2,ncol=3,scale="free_y")+
  #  scale_y_continuous(trans = 'log10')+
  scale_fill_simpsons()+
  theme(axis.text.x=element_text(angle = -45, hjust = 0))+
  theme_classic()

ggsave(paste("raw_numbers_per_region",max_date,".pdf",sep=""),
       width = 20,height = 16,units = "cm")






##############################
# multinomial model
#############################




Df <- ungroup(Df_stored) %>% group_by(REGION) %>% dplyr::mutate(n_REGION=n())


Df <- Df %>% dplyr::mutate(region2=ifelse(n_REGION>1000,as.character(REGION),"other"))
Df$region2<-as.factor(Df$region2)
Df = within(Df, region2 <- relevel(region2,ref='Normandie'))


Df$variant<-as.factor(as.character(Df$variant))
Df = within(Df, variant <- relevel(variant,ref='A0B0C1')) # reference is B.1.1.7


Df$COV1K_02<-as.factor(Df$COV1K_02)
Df = within(Df, COV1K_02 <- relevel(COV1K_02,ref='covid3')) # reference is the covid3 test (latest)
table(Df$COV1K_02)

#centering and scaling variables
Df$age_scale <- scale(Df$age)
Df$date_scale<-scale(as.numeric(Df$date_labo))


Df$location_sampling<-as.factor(Df$location_sampling)
Df = within(Df, location_sampling <- relevel(location_sampling,ref='non-hospital')) # reference is the covid3 test (latest)





data_multinom <- dplyr::filter(Df,
                               date_labo>=max_date-54,
                               date_labo<=max_date
                               #date_labo<=max_date-54,
                               #COV1K_02 %in% c("covid3","covti2")
)



data_multinom$variant<-droplevels(data_multinom$variant)




data_multinom$COV1K_02<-relevel(data_multinom$COV1K_02,ref="covid3")
data_multinom$COV1K_02<-droplevels(data_multinom$COV1K_02)
data_multinom$COV1K_02<-relevel(data_multinom$COV1K_02,ref="covid3")

data_multinom$location_sampling<-relevel(data_multinom$location_sampling,ref="non-hospital")
data_multinom$location_sampling<-as.factor(data_multinom$location_sampling)



library(nnet)

test <- multinom(variant ~ age_scale +location_sampling + COV1K_02 + date_scale:region2, data = data_multinom)



z <- summary(test)$coefficients/summary(test)$standard.errors
p <- (1 - pnorm(abs(z), 0, 1)) * 2
results_significant<-(p<=0.05)*exp(coef(test))
(p<=0.05)*p

t(round(results_significant,2))

ConfidInt<-signif(exp(confint(test)),2) # calculating the 95% CI for the RRR

write.csv2(cbind(t(round(results_significant,2))[,1],ConfidInt[,,1]),file=paste("multinomial_A0B0C0_",max_date,".csv",sep=""))
write.csv2(cbind(t(round(results_significant,2))[,2],ConfidInt[,,2]),file=paste("multinomial_A0B1C1_",max_date,".csv",sep=""))
write.csv2(cbind(t(round(results_significant,2))[,3],ConfidInt[,,3]),file=paste("multinomial_A1B0C0_",max_date,".csv",sep=""))
write.csv2(cbind(t(round(results_significant,2))[,4],ConfidInt[,,4]),file=paste("multinomial_A1B0C1_",max_date,".csv",sep=""))
write.csv2(cbind(t(round(results_significant,2))[,5],ConfidInt[,,5]),file=paste("multinomial_other_",max_date,".csv",sep=""))


write.csv2(t(round(results_significant,2)),file=paste("output_multinomial_model_",max_date,".csv",sep=""))




##########################################
### Ct differences
##########################################

library(car)
library("lme4") 
library('emmeans')
library(ggpubr)

data_Ct<-ungroup(data_multinom) %>% dplyr::filter(Ct>0,Ct<=28,date_labo>="2021-12-12")

data_Ct$variant<-as.character(data_Ct$variant)
data_Ct$variant = replace(data_Ct$variant,
                          which(data_Ct$variant %in% c("A1B0C0","A1B0C1","other?")),
                          "other")
data_Ct$variant<-as.factor(data_Ct$variant)
data_Ct$variant<-relevel(data_Ct$variant,ref="A0B0C1")

data_Ct$assay<-relevel(data_Ct$COV1K_02,ref="covid3") #only the Perkins assay

data_Ct = within(data_Ct, region2 <- relevel(region2,ref='Ile-de-France'))


mod0 = lm(Ct ~ age_scale + location_sampling + date_scale*region2,
          data=data_Ct)
mod1 = lm(Ct ~ age_scale + location_sampling + date_scale*region2 + variant,
          data=data_Ct)

anova(mod0,mod1)


plot_residuals<-ggplot(as.data.frame(residuals(mod1)))+
  geom_density(aes(residuals(mod1)),size=0.3)+
  theme_light()+
  labs(x="linear model Ct residual",y="density")


# ggsave(
#   filename="residuals_lm.pdf",
#   plot = plot_residuals,
#   width = 6,
#   height = 4,
#   units = c("cm"),
#   dpi = 300,
# )




summary(mod1)

Anova(mod1)

#write.csv2(Anova(mod1),file="anova_Ct.csv")

em1 = emmeans(mod1,~variant,opt.digits=F,digits=4)
em2 = emmeans(mod1,~location_sampling,opt.digits=F,digits=4)
#em1
paircomp = pairs(em1,opt.digits=F,digits=10)
#paircomp

pcomp <-summary(paircomp)

anova(mod0,mod1)


dpi = data_Ct %>% group_by(date_labo) %>% summarise(fi = sum(variant == "other")/length(variant))



n_fun <- function(x){
  return(data.frame(y = median(x), label = paste0("n=",length(x))))
}

n_fun2 <- function(x){
  return(data.frame(y = median(x), label = median(x)))
}

#dataMedian <- summarise(group_by(Df, variant), MD = median(Ct))
dataMedian<-cbind(as.character(summary(em1)[,1]),as.numeric(summary(em1)[,2]))
dataMedian<-as.data.frame(summary(em1)[,c(1,2)])


data_Ct<-as.data.frame(data_Ct)
data_Ct$variant<-as.factor(data_Ct$variant)
data_Ct$Ct<-as.numeric(as.character(data_Ct$Ct))

dim(data_Ct)

ggplot(data_Ct, aes(x = variant, y = Ct))+
  geom_violin(aes(fill = variant), trim = T) + 
  geom_boxplot(width = 0.2)+
  stat_compare_means(label = "p.signif", method = "t.test",
                     ref.group = "A0B0C1", hide.ns = TRUE) +
  stat_summary(aes(x=as.factor(variant),y=8),
               fun.data = n_fun,
               geom = "text",
               size=3
  )+
  geom_text(data = dataMedian, aes(variant, emmean, label = round(emmean,digits = 1)), 
            position = position_dodge(width = 0.8), size = 2, vjust = -2) +
  theme_pubr()+
  theme(legend.position = "none") +
  xlab("") +
  ylab("Ct") 
#scale_y_continuous(breaks = c(10, 20,30,40))


ggsave(
  filename="Ct_VOC_Dec18.pdf",
  plot = last_plot(),
  width = 9,
  height = 8,
  units = c("cm"),
  dpi = 300,
)








##########################################
### Transmission advantage caclulation
##########################################



## functions for transmission advantage


# serial interval distribution parameters
MEAN = 4.8 ; SD = 2.3
# Nishiura et al. (2020) IJID https://doi.org/10.1016/j.ijid.2020.02.060
WP.chosen = weibullpar(MEAN,SD)
# transform into Weibull shape and scale parameters

# serival interval distribution
is=function(x){
  dweibull(x,WP.chosen$shape,WP.chosen$scale)
}

# Euler-Lotka integrand
EL_integrand = function(x__,r__, R__) {
  signr = -1
  if(R__ < 1){R__ = 1/R__}
  exp(signr * r__ * x__)*R__*is(x__)
}

# Euler-Lotka integral
euler_lotka = function(r_,R_){
  integrate(EL_integrand,0,Inf,r__=r_,R__=R_)$value - 1
}

# Eulter-Lotka equation solution
# (estimation of exponential growth rate given R and the
# serial interval)
rate.calc = function(R){
  if(R == 1){res = 0}
  else {
    interv = c(0,100)
    res = uniroot(euler_lotka,interval=interv,R_=R)$root
    if(R < 1){res = -res}
  }
  return(res)
}

# the function to cancel in overcontagiosityf
overcontagiosityfoo = function(x,A,R_0){
  A - rate.calc(x) + rate.calc(R_0)
}

# final calculation of the overcontagiosity
# mean and associated 95-% confidence interval
# on the reproduction number scale
overcontagiosityf = function(modl,R_0_){
  Coefs = cbind(modl$coefficients,confint(modl))
  Coefs = -Coefs[1,]
  c(uniroot(overcontagiosityfoo,A=Coefs[1],R_0=R_0_,interval=c(.2,5))$root,
    uniroot(overcontagiosityfoo,A=Coefs[3],R_0=R_0_,interval=c(.2,5))$root,
    uniroot(overcontagiosityfoo,A=Coefs[2],R_0=R_0_,interval=c(.2,5))$root)/R_0_-1
}



# logistic growth function maker
lgfm = function(lgm,given.tau = F) {
  coefs = cbind(lgm$coefficients,confint(lgm))
  if(given.tau){coefs[1,] = coefs[1,1]}
  # set tau to its mean value
  lgf = function(x){
    # define logistic growth function
    c(1/(1+exp(coefs[1,1]*(x-coefs[2,1]))),
      1/(1+exp(coefs[1,2]*(x-coefs[2,2]))),
      1/(1+exp(coefs[1,3]*(x-coefs[2,3]))))
  }
  lgf = Vectorize(lgf)
  return(lgf)
}




### Transmission advantage of omicron vs. delta variants in France

Dt = Df_stored



#Dt<-Dt %>% dplyr::filter()
Dt<-Dt %>% dplyr::filter(date_labo>=max_date-21,date_labo<=max_date)


Dt <- ungroup(Dt) %>% group_by(departement) %>% dplyr::mutate(n_departement=n())

Dt$COV1K_02<-droplevels(Dt$COV1K_02)


# define relative time (in days)
Day0 = min(Dt$date_labo)
max(Dt$date_labo)

Dt$day<-Dt$date_labo-Day0
Dt$day<-as.numeric(Dt$day)


# delta against the alpha
Dt$souche_bin<-NA
Dt$souche_bin = replace(Dt$souche_bin,
                        which(Dt$variant == "A0B0C0"),
                        1)
Dt$souche_bin = replace(Dt$souche_bin,
                        which(Dt$variant == "A0B0C1"),
                        0)

dim(Dt)

Dt<-drop_na(Dt,souche_bin,day,age,REGION)



# GLM to generate residuals


nonhosp = T # remove hospital samples


if(nonhosp){
  Dt <- dplyr::filter(Dt,location_sampling == 'non-hospital')
  modr = glm(souche_bin~day+age+REGION+COV1K_02,
             data=Dt,family="binomial")
} else {
  modr = glm(souche_bin~day+age+location_sampling+REGION+COV1K_02,
             data=Dt,family="binomial")
}




# extracting residuals and fitted values
Dt$res1 = residuals(modr) ;
Dt$fit1 = fitted.values(modr)

# logistic growth fitting and display -------------------------------------



Df = Dt


# calculate daily souche frequencies


Df <- Df %>% group_by(day) %>%
  dplyr::summarise(ie = sum((souche_bin))/n(),
                   ir = sum(res1)/n(),
                   iF = sum(fit1)/n())


# logistic curve fitting
mL <- drm(iF ~ day, data = Df,
          fct = logistic(fixed = c(NA,0,1,NA,1)),
          type='continuous')

# overcontagiosity calculations
lgf = lgfm(mL,given.tau=T)

scv<-c(0,0,0)

scv = signif(overcontagiosityf(mL,1)*100,3)

scv


# plotting the output


# pdf(paste("omicron_delta_France_",max_date,"_fr.pdf",sep=""),
#     width=6,height=8,pointsize=20)

par(mfrow=c(1,1))

main.title = paste("Transmission advantage","\n +",scv[1]," [",scv[2],
                   " - ",scv[3],"]%",sep='')

# logistic curve plot
# logistic curve plot
time.window = seq(min(Df$day)-3,max(Df$day)+16,by=1)
Xlim = c(min(time.window),max(time.window))
ticks.at = seq(Xlim[1],Xlim[2],length.out = 10)

plot(Df$day,Df$iF,xlim=Xlim,ylim=c(0,1.000),axes=F,
     #     xlab='',ylab = 'beta/gamma/eta frequency vs. others',
     xlab='',ylab = 'frequency of A0B0C0 vs. A0B0C1 in France',
     pch=17,col=rgb(0,134,139,130,maxColorValue = 255))
par(new=T)
plot(time.window,lgf(time.window)[1,],
     type='l',xlim=Xlim,ylim=c(0,1.000),axes=F,
     col=c('darkblue','lightblue','lightblue'),lwd=2,
     lty=c(1,3,3),main=main.title,xlab='',
     ylab='')
par(new=T)
polygon(c(time.window,rev(time.window)),
        c(lgf(time.window)[2,],rev(lgf(time.window)[3,])),
        col = rgb(28,134,238,50,max=255),lty=0)
abline(v=as.Date("2020-07-01")-Day0,
       col=rgb(.8,.2,0,.5))
abline(h=0.5,lty=2)
axis(2,ylim=c(0,1.000))
axis(1,at = ticks.at,
     labels = format(Day0+ticks.at,"%d %b"))
par(new=F)

# dev.off()






############ Sliding window in France




Df2<-c()

Dt<-Df_stored


#Dt = dplyr::filter(Dt,(REGION=="Ile-de-France"),location_sampling == 'non-hospital')
Dt = dplyr::filter(Dt,location_sampling == 'non-hospital')




# delta against the alpha
Dt$souche_bin<-NA
Dt$souche_bin = replace(Dt$souche_bin,
                        which(Dt$variant == "A0B0C0"),
                        1)
Dt$souche_bin = replace(Dt$souche_bin,
                        which(Dt$variant == "A0B0C1"),
                        0)

Dt<-drop_na(Dt,souche_bin,age,REGION)


advantage_sliding_window<-c()

min_date<-min(Dt$date_labo)
max_date<-max(Dt$date_labo)

delta_tau<-21

total_time_steps<-max_date-min_date-delta_tau



for(istep in c(1:total_time_steps))
{
  
  skip_to_next <- FALSE
  
  the_min_date_here<-min_date+istep
  the_max_date_here<-the_min_date_here+delta_tau
  
  Df2 = dplyr::filter(Dt,
                      date_labo>as.Date(the_min_date_here),
                      date_labo<=as.Date(the_max_date_here))
  
  Df2$COV1K_02<-droplevels(Df2$COV1K_02)
  
  remove_assay_factor<-length(levels(Df2$COV1K_02))==1
  
  
  Day0 = min(Df2$date_labo)
  Df2$day<-Df2$date_labo-Day0
  Df2$day<-as.numeric(Df2$day)
  
  modr=c()
  if(remove_assay_factor)
  {  modr = glm(souche_bin~day+age+REGION,
                data=Df2,family="binomial")
  }else{
    modr = glm(souche_bin~day+age+REGION+COV1K_02,
               data=Df2,family="binomial")  
  }
  
  
  # extracting residuals and fitted values
  Df2$res1 = residuals(modr) ; 
  Df2$fit1 = fitted.values(modr)
  
  
  
  Df2 <- Df2 %>% group_by(day) %>%
    dplyr::summarise(ie = sum((souche_bin))/n(),
                     ir = sum(res1)/n(),
                     iF = sum(fit1)/n())
  
  
  # logistic curve fitting
  mL <- drm(iF ~ day, data = Df2,
            fct = logistic(fixed = c(NA,0,1,NA,1)),
            type='continuous')
  
  # overcontagiosity calculations
  lgf = lgfm(mL,given.tau=T)
  
  scv<-c(0,0,0)
  
  tryCatch(print(overcontagiosityf(mL,1)), error = function(e) { skip_to_next <<- TRUE})
  
  if(skip_to_next==FALSE)
  {
    scv = signif(overcontagiosityf(mL,1)*100,3)  
    
  }
  
  advantage_sliding_window<-rbind(advantage_sliding_window,c(as.numeric(the_min_date_here),scv))
  
  
}



advantage_sliding_window<-as.data.frame(advantage_sliding_window)
names(advantage_sliding_window)<-c("date","median","low95","up95")
advantage_sliding_window$date<-as.Date(advantage_sliding_window$date)

ggplot(filter(advantage_sliding_window,date>="2021-08-14"))+
  geom_point(aes(x=date,y=median))+
  #  geom_smooth(aes(x=date,y=median))+
  geom_ribbon(aes(x=date,ymin=low95,ymax=up95),alpha=0.3)+
  ylim(min=-40,max=120)+
  geom_hline(yintercept=0.5)+
  labs(x="interval origin (21 days duration)",y="transmission advantage (in %)")
#       ,title="transmission advantage of A0B0C0 tests over A0B0C1 in France")


# ggsave(
#   filename=paste("omicron_delta_France_sliding_",max_date,".pdf",sep=""),
#   plot = last_plot(),
#   width = 14,
#   height = 8,
#   units = c("cm")
# )

