## Fitting splines to SP4 data to estimate the means

## Start from a clean slate: remove every object returned by ls(),
## then run the garbage collector.
## NOTE(review): this also wipes the caller's objects if the script is sourced
## into an existing session -- confirm that is intended.
rm(list = ls())
gc()

#Loading required libraries
library(tidyverse)

### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### 
### Ben's thoughts on using de novo naive Tregs as precursors and on their dynamics

#In principle, de novo Treg should have a normalised donor fraction of 1, like CD4 SP precursors -
#the fact they do not is what tells us some peripheral naive Treg are recirculating back to the thymus. 
#We just assume the normDonor fraction of recirculated naive Treg is the same as that in the periphery.
#- so normalised donor fraction is 1, 
#- if you need raw donor fractions then it should be the same as CD4SP (or DP even)

#For Ki67, we cannot subset naive Treg into de novo and recirculated - only estimate the size of the split.
#However, I think our best approximation is to use Ki67 in total naive Treg in thymus. 
#As you can see, this is pretty flat over time, even though the estimated de novo fraction varies with age 
#- high in young getting lower in old - this does not seem to have an impact on Ki67 %.
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### 

# importing data ---------------------------------
### We consider thymic FoxP3 negative SP4 as the source for Tregs (naive Tregs) 
### for all purposes henceforth -- SP4 = FoxP3 negative SP4

## loading pre-processed data and munging
### different data frames because the number of viable observations differs between datasets
# thynaiveTregs_counts <- read_csv("data/Counts_Treg.csv") 
# thynaiveTregs_chimerism <- read_csv("data/Nfd_Treg.csv")
# thynaiveTregs_donorki <- read_csv("data/donorKi67_Treg.csv") 
# thynaiveTregs_hostki <- read_csv("data/hostKi67_Treg.csv") 

## De novo naive Treg data
## Hard-coded copy of the counts and the matching host ages (one entry per
## observation, 17 of each).
denovo_naiTreg <- c(
  25989.672, 27930.043, 35046.848, 9263.246, 6691.128, 11131.034,
  2196.362, 4922.436, 31071.719, 54796.639, 52640.411, 5642.878,
  2308.317, 2187.832, 10009.992, 19019.081, 5592.218
)

host_age <- c(
  76, 76, 76, 51, 67, 146, 65, 65, 69,
  54, 45, 178, 118, 66, 40, 40, 121
)

denovo_df <- data.frame(
  age.at.S1K = host_age,
  total_FoxP3_pos_SP4 = denovo_naiTreg
)

## NOTE(review): the data frame built above is replaced straight away by the
## version read from the spreadsheet below.
## Keep the ID/time/age/de novo columns, rename the de novo count column,
## and drop non-positive counts, incomplete rows and duplicated rows.
denovo_df <- readxl::read_excel(path = "data/denovo_naiTreg.xlsx", sheet = 4) %>%
  select(
    contains("mouse.ID"), contains("time"),
    contains("age"), contains("Denovo")
  ) %>%
  rename(total_FoxP3_pos_SP4 = TH.naiTregDeNovo) %>%
  filter(total_FoxP3_pos_SP4 > 0) %>%
  na.omit() %>%
  unique()



## Text-size and legend tweaks that individual plots add on top of the
## default theme
myTheme <- theme(
  axis.text = element_text(size = 20),
  axis.title = element_text(size = 20),
  plot.title = element_text(size = 20, hjust = 0.5),
  legend.text = element_text(size = 18),
  strip.text = element_text(size = 18, hjust = 0.5),
  legend.title = element_text(size = 18),
  legend.background = element_blank()
)

# make theme_bw() the default ggplot theme for the rest of the plots
theme_set(theme_bw())

## Axis-label formatter: turns numbers into plotmath expressions of the form
## mantissa %*% 10^exponent (a mantissa of exactly 1 is dropped).
##   l - numeric vector of values to label
## Returns an expression vector produced by parse().
fancy_scientific <- function(l) {
  # ordered (pattern, replacement) pairs applied one after another
  rewrites <- list(
    # quote the part before the exponent to keep all the digits
    c("^(.*)e", "'\\1'e"),
    # drop the '+' after the exponent marker, if present
    c("e\\+", "e"),
    # turn the 'e' into plotmath format
    c("e", "%*%10^"),
    # 1x10^ or 1.000x10^ -> 10^
    c("\\'1[\\.0]*\\'\\%\\*\\%", "")
  )
  # start from the scientific-notation strings and fold the rewrites over them
  labels <- Reduce(
    function(txt, rule) gsub(rule[1], rule[2], txt),
    rewrites,
    format(l, scientific = TRUE)
  )
  parse(text = labels)
}

## Minor breaks for log10 axes: 1..10 times each power of ten from 10^4 to 10^8
log10minorbreaks <- as.numeric(outer(1:10, 10^(4:8)))



### Ki67 proportions
### Sheet 7 feeds source_donorKi67 and sheet 8 feeds source_hostKi67. Both keep
### only the ID, time, age, DP1, SP4 and "Fox25" columns and drop incomplete
### and duplicated rows.
source_donorKi67 <- readxl::read_excel(path = "data/master_doc.xlsx", sheet = 7) %>%
  select(
    contains("mouse.ID"), contains("time"), contains("age"),
    contains("DP1"), contains("SP4"), contains("Fox25")
  ) %>%
  na.omit() %>%
  unique()

source_hostKi67 <- readxl::read_excel(path = "data/master_doc.xlsx", sheet = 8) %>%
  select(
    contains("mouse.ID"), contains("time"), contains("age"),
    contains("DP1"), contains("SP4"), contains("Fox25")
  ) %>%
  na.omit() %>%
  unique()

### Total counts and donor fractions for the source population
### (sheet 2 joined with source_donorKi67)
source_donor <- readxl::read_excel(path = "data/master_doc.xlsx", sheet = 2) %>%
  select(
    contains("mouse.ID"), contains("time"), contains("age"),
    contains("DP1"), contains("SP4"), contains("Fox25")
  ) %>%
  # attach the Ki67 columns; clashing names get "_counts" (this sheet)
  # and "_ki" (Ki67 table) suffixes
  left_join(
    source_donorKi67,
    by = c("mouse.ID", "time.post.BMT", "age.at.S1K", "age.at.BMT"),
    suffix = c("_counts", "_ki")
  ) %>%
  na.omit() %>%
  unique() %>%
  mutate(
    ### 1st and 4th quadrants of FOXP3 (x-axis) and CD25 (y-axis) Boolean gate,
    ### FoxP3- SP4 = Treg free SP4
    FoxP3_neg_SP4_counts = TH.Fox25Q1_counts + TH.Fox25Q4_counts,
    ### 2nd and 3rd quadrants of FOXP3 (x-axis) and CD25 (y-axis) Boolean gate,
    ### FoxP3+ SP4 = SP4 Tregs
    FoxP3_pos_SP4_counts = TH.Fox25Q2_counts + TH.Fox25Q3_counts,
    ### count-weighted Ki67 fraction across the two quadrants of each subset;
    ### the denominators are the summed counts defined just above
    FoxP3_neg_SP4_ki =
      (((TH.Fox25Q1_ki / 100) * TH.Fox25Q1_counts) +
        ((TH.Fox25Q4_ki / 100) * TH.Fox25Q4_counts)) / FoxP3_neg_SP4_counts,
    FoxP3_pos_SP4_ki =
      (((TH.Fox25Q2_ki / 100) * TH.Fox25Q2_counts) +
        ((TH.Fox25Q3_ki / 100) * TH.Fox25Q3_counts)) / FoxP3_pos_SP4_counts
  ) %>%
  # the per-quadrant columns are no longer needed
  select(-contains("Fox25"))

### Same processing for sheet 3, joined with source_hostKi67
source_host <- readxl::read_excel(path = "data/master_doc.xlsx", sheet = 3) %>%
  select(
    contains("mouse.ID"), contains("time"), contains("age"),
    contains("DP1"), contains("SP4"), contains("Fox25")
  ) %>%
  # attach the Ki67 columns; clashing names get "_counts" (this sheet)
  # and "_ki" (Ki67 table) suffixes
  left_join(
    source_hostKi67,
    by = c("mouse.ID", "time.post.BMT", "age.at.S1K", "age.at.BMT"),
    suffix = c("_counts", "_ki")
  ) %>%
  na.omit() %>%
  unique() %>%
  mutate(
    ### 1st and 4th quadrants of FOXP3 (x-axis) and CD25 (y-axis) Boolean gate,
    ### FoxP3- SP4 = Treg free SP4
    FoxP3_neg_SP4_counts = TH.Fox25Q1_counts + TH.Fox25Q4_counts,
    ### 2nd and 3rd quadrants of FOXP3 (x-axis) and CD25 (y-axis) Boolean gate,
    ### FoxP3+ SP4 = SP4 Tregs
    FoxP3_pos_SP4_counts = TH.Fox25Q2_counts + TH.Fox25Q3_counts,
    ### count-weighted Ki67 fraction across the two quadrants of each subset;
    ### the denominators are the summed counts defined just above
    FoxP3_neg_SP4_ki =
      (((TH.Fox25Q1_ki / 100) * TH.Fox25Q1_counts) +
        ((TH.Fox25Q4_ki / 100) * TH.Fox25Q4_counts)) / FoxP3_neg_SP4_counts,
    FoxP3_pos_SP4_ki =
      (((TH.Fox25Q2_ki / 100) * TH.Fox25Q2_counts) +
        ((TH.Fox25Q3_ki / 100) * TH.Fox25Q3_counts)) / FoxP3_pos_SP4_counts
  ) %>%
  # the per-quadrant columns are no longer needed
  select(-contains("Fox25"))

# Merge the host and donor tables on the mouse/time/age keys, then derive
# total counts (host + donor) and donor fractions (donor / total) per subset.
source_join <- full_join(
  source_host,
  source_donor,
  by = c("mouse.ID", "time.post.BMT", "age.at.S1K", "age.at.BMT"),
  suffix = c(".host", ".donor")
) %>%
  mutate(
    # total = donor + host
    total_DP1 = TH.DP1_counts.host + TH.DP1_counts.donor,
    total_SP4 = TH.SP4_counts.host + TH.SP4_counts.donor,
    total_FoxP3_neg_SP4 = FoxP3_neg_SP4_counts.host + FoxP3_neg_SP4_counts.donor,
    total_FoxP3_pos_SP4 = FoxP3_pos_SP4_counts.host + FoxP3_pos_SP4_counts.donor,
    ## fd = donor fraction
    fd_DP1 = TH.DP1_counts.donor / total_DP1,
    fd_SP4 = TH.SP4_counts.donor / total_SP4,
    fd_FoxP3_neg_SP4 = FoxP3_neg_SP4_counts.donor / total_FoxP3_neg_SP4,
    fd_FoxP3_pos_SP4 = FoxP3_pos_SP4_counts.donor / total_FoxP3_pos_SP4
  ) %>%
  # keep only the keys, totals and donor fractions
  select(-contains("counts"), -contains("ki")) %>%
  na.omit() %>%
  unique()


## Daily grids on which the fitted curves are evaluated
timeseq <- 40:450 # host age
dpt_seq <- 14:300 # days post BMT
  

## fd_fit
## Phenomenological donor-fraction curve: 0 while Time < 10, afterwards
## chiEst * (1 - exp(-qEst * (Time - 10))).
##   Time   - numeric vector of time points
##   chiEst - value the curve approaches as Time grows (for positive qEst)
##   qEst   - rate constant of the exponential term
chi_spline <- function(Time, chiEst, qEst) {
  elapsed <- Time - 10
  ifelse(elapsed < 0, 0, chiEst * (1 - exp(-qEst * elapsed)))
}

             
## Transformation helper. Despite its name this computes the arcsine
## square-root transform asin(sqrt(x)), not a logit.
##   x - numeric vector
logit_transf <- function(x) {
  asin(sqrt(x))
}
## Nonlinear least-squares fit of chi_spline to the FoxP3- SP4 donor fraction
## as a function of time post BMT; chiEst and qEst are the free parameters.
Thy_chi_nlm <- nls(
  (fd_FoxP3_neg_SP4) ~ (chi_spline(time.post.BMT, chiEst, qEst)),
  data = source_join,
  start = list(chiEst = 0.84, qEst = 0.01)
)
## fitted parameter values, in the order chiEst, qEst
Thy_chi_pars <- coef(Thy_chi_nlm)

# prediction
## Thy_chi_fit   - chi_spline evaluated with the nls estimates
## Thy_chi_fit_m - chi_spline evaluated with hand-picked values (0.8, 0.1)
Thy_chi_fit <- data.frame(dpt_seq, "y_sp" = chi_spline(dpt_seq, Thy_chi_pars[1], Thy_chi_pars[2]))
Thy_chi_fit_m <- data.frame(dpt_seq, "y_sp" = chi_spline(dpt_seq, 0.8, 0.1))
## Observed donor fraction in FoxP3- SP4 against days post BMT, overlaid with
## the nls curve (default thin line) and the hand-picked curve (colour 2, thick).
## The first line layer uses Thy_chi_fit; drawing Thy_chi_fit_m in both layers
## would leave the nls fit off the plot and hide the first layer behind the second.
ggplot() +
  geom_point(data = source_join, aes(x = age.at.S1K - age.at.BMT, y = fd_FoxP3_neg_SP4), size = 2.5) +
  geom_line(data = Thy_chi_fit, aes(x = dpt_seq, y = y_sp)) +
  geom_line(data = Thy_chi_fit_m, aes(x = dpt_seq, y = y_sp), col = 2, size = 1) +
  ylim(0, 1) + #scale_x_log10(limits=c(1, 350)) +
  # the plotted variable is fd_FoxP3_neg_SP4, so the title names FoxP3- cells
  labs(title = 'Donor fraction in FoxP3- SP4 cells', y = NULL, x = 'Days post BMT') +
  myTheme

## Counts fit
## Phenomenological curve for cell counts: exponential decay with host age,
## equal to 10^b0 at the reference age of 40.
##   age.at.S1K - numeric vector of host ages
##   b0         - log10 of the value at age 40
##   nu         - decay scale, in the same units as age.at.S1K
counts_spline <- function(age.at.S1K, b0, nu) {
  t0 <- 40 # reference age
  # alternative form kept for reference:
  # 10^b0 + (10^b1/(1+ ((Time-t0)/nu)^1))
  10^b0 * exp(-(age.at.S1K - t0) / nu)
}

## Fit counts_spline to the de novo counts on the log scale (both sides of the
## formula are log-transformed); b0 and nu are the free parameters.
thy_counts_nlm <- nls(
  log(total_FoxP3_pos_SP4) ~ log(counts_spline(age.at.S1K, b0, nu)),
  data = denovo_df,
  start = list(b0 = 5, nu = 200)
)
thy_counts_pars <- coef(thy_counts_nlm)

## curve from the nls estimates, and a second curve from hand-picked values
thycounts_fit <- data.frame(
  timeseq,
  y_sp = counts_spline(timeseq, thy_counts_pars[1], thy_counts_pars[2])
)
thycounts_fit_m <- data.frame(
  timeseq,
  y_sp = counts_spline(timeseq, 4.6, 160)
)

## De novo counts against host age with the hand-picked curve on a log10 y axis.
## NOTE(review): xlim starts at 80 although the curve grid starts at 40, so
## anything below 80 is not shown -- confirm this is intended.
ggplot() +
  #geom_point(data=source_join, aes(x=age.at.S1K, y=total_FoxP3_pos_SP4), col=4, size =2) +
  geom_point(
    data = denovo_df,
    aes(x = age.at.S1K, y = total_FoxP3_pos_SP4),
    col = 1, size = 2.5
  ) +
  geom_line(data = thycounts_fit_m, aes(x = timeseq, y = y_sp)) +
  #geom_line(data = thycounts_fit_m, aes(x = timeseq , y = y_sp), col=4, size =1) +
  scale_y_log10(
    limits = c(3e2, 3e5),
    labels = fancy_scientific,
    minor_breaks = log10minorbreaks
  ) +
  xlim(80, 450) +
  labs(title = 'Counts of FoxP3+ SP4 cells', y = NULL, x = 'Host age (days)') +
  myTheme


## Mean of the TH.SP4 column in the donor and in the host Ki67 tables
mean(source_donorKi67$TH.SP4)

mean(source_hostKi67$TH.SP4)

## phenomenological function
## Ki67 curve as a function of host age: b0 * (1 + exp(-eps_f * (age - t0))).
## NOTE: this definition is replaced by the three-parameter ki_spline defined
## right after it in this file.
##   age.at.S1K - numeric vector of host ages
##   b0         - value approached at large ages (for positive eps_f)
##   eps_f      - rate constant of the exponential term
ki_spline <- function(age.at.S1K, b0, eps_f) {
  # t0 has to be defined inside the function, otherwise every call fails with
  # "object 't0' not found". 40 mirrors the reference age in counts_spline.
  # NOTE(review): confirm 40 is the intended reference age here.
  t0 <- 40
  b0 * (1 + exp(-eps_f * (age.at.S1K - t0)))
}

## from here on timeseq is a daily grid from 0 to 300
timeseq <- 0:300
## Ki67 curve: b0 + b1 at Time = 0, falling towards b0 as Time grows;
## the drop is half complete at Time = eps_f.
##   Time  - numeric vector of time points
##   b0    - value approached at large Time
##   b1    - extra amount present at Time = 0
##   eps_f - time at which half of b1 has decayed
ki_spline <- function(Time, b0, b1, eps_f) {
  damping <- 1 + (Time / eps_f)^4
  b0 + (b1 / damping)
}


## Fit ki_spline to the donor TH.SP4 values (divided by 100) against
## age.at.S1K - age.at.BMT; b0, b1 and eps_f are the free parameters.
ki_nlm <- nls(
  (TH.SP4/100) ~ (ki_spline(age.at.S1K - age.at.BMT, b0, b1, eps_f)),
  data = source_donorKi67,
  start = list(b0 = 0.4, b1 = 0.6, eps_f = 60)
)
ki_pars <- coef(ki_nlm)

## curve from the nls estimates, and a second curve from hand-picked values
ki_fit <- data.frame(
  timeseq,
  y_sp = ki_spline(timeseq, ki_pars[1], ki_pars[2], ki_pars[3])
)
ki_fit_m <- data.frame(
  timeseq,
  y_sp = ki_spline(timeseq, 0.37, 0.63, 20)
)

## Donor TH.SP4 Ki67 values against days post BMT (age.at.S1K - age.at.BMT)
## with the hand-picked ki_spline curve; the curve is multiplied by 100 to
## match the scale of the TH.SP4 column.
ggplot() +
  geom_point(data = source_donorKi67, aes(x = age.at.S1K - age.at.BMT, y = TH.SP4), col = 4, size = 2) +
  #geom_line(data = ki_fit, aes(x = timeseq , y = y_sp*100), col=4, size =1) +
  geom_line(data = ki_fit_m, aes(x = timeseq, y = y_sp * 100), col = 4, size = 1) +
  scale_y_log10(limits = c(10, 100)) + xlim(0, 300) +
  # the x aesthetic is age.at.S1K - age.at.BMT, so the axis is days post BMT
  labs(title = 'Ki proportions in donor SP4 cells', y = NULL, x = 'Days post BMT')


## Mean of the host TH.SP4 Ki67 values (same scale as the TH.SP4 column)
naiTreg_host_ki <- source_hostKi67$TH.SP4 %>% mean()
## Host TH.SP4 Ki67 values against days post BMT (age.at.S1K - age.at.BMT),
## with their mean drawn as a horizontal line.
## NOTE(review): the title says FoxP3+ SP4 but the plotted column is TH.SP4 --
## confirm which population is meant.
ggplot() +
  geom_point(data = source_hostKi67, aes(x = age.at.S1K - age.at.BMT, y = TH.SP4), col = 2, size = 2) +
  #geom_line(data = ki_fit, aes(x = timeseq , y = y_sp*100), col=4, size =1) +
  geom_line(aes(x = timeseq, y = naiTreg_host_ki), col = 2, size = 1) +
  scale_y_log10(limits = c(10, 100)) + xlim(0, 300) +
  # data come from source_hostKi67 and the x aesthetic is days post BMT
  labs(title = 'Ki proportions in host FoxP3+ SP4 cells', y = NULL, x = 'Days post BMT')



## Host TH.SP4 Ki67 values as fractions (TH.SP4 / 100) against host age, with
## their mean drawn as a horizontal line across the displayed age range.
ggplot() +
  #geom_point(data=source_donorKi67, aes(x=age.at.S1K, y=TH.SP4/100), col=4, size =2) +
  geom_point(data = source_hostKi67, aes(x = age.at.S1K, y = TH.SP4 / 100), col = 2, size = 2) +
  # naiTreg_host_ki is on the TH.SP4 scale, so divide by 100 to match the
  # points and ylim(0, 1). The x values span the same host ages as xlim;
  # dpt_seq is a days-post-BMT grid and does not belong on this axis.
  geom_line(aes(x = seq(40, 450), y = naiTreg_host_ki / 100), col = 2, size = 1) +
  xlim(40, 450) + ylim(0, 1) +
  # the y values are Ki67 fractions, not counts
  labs(title = 'Ki67 fraction of thymic FoxP3 positive naive SP4 cells', y = NULL, x = 'Host age (days)')


## Close the current graphics device. dev.off() raises an error when only the
## null device (device 1) is present, e.g. when no plot has been drawn, so it
## is only called when a real device is open.
if (dev.cur() > 1L) {
  dev.off()
}

