######
# Aim: compute Ne and s of the S/O dataset and the 2 additional ones 
######

# ----- global variables, functions and packages ----- #
source("/Volumes/cluster/Claire/SO_PaperI/manuscript/scripts/global_variables.R")
source("/Volumes/cluster/Claire/SO_PaperI/manuscript/scripts/functions.R")

# ----- Load recombination maps for Dmel and Dsim
recomb <- fread(paste(path_recomb, "/02_recombination_map_moving-median_Dmel_501k_version6.txt", sep = ""), skip = 1, sep = "\t")
chr <- sapply(recomb$V1, function(x) strsplit(x, ":", fixed = T)[[1]][1])
start <- as.numeric(sapply(recomb$V1, function(x) strsplit(strsplit(x, ":", fixed = T)[[1]][2], "..", fixed = T)[[1]][1]))
end <- as.numeric(sapply(recomb$V1, function(x) strsplit(strsplit(x, ":", fixed = T)[[1]][2], "..", fixed = T)[[1]][2]))
rc <- recomb$V2
rmap_dmel <- data.frame(chr=chr, start=start,end=end,r=rc)

rmap_dsim <- fread(paste(path_recomb, "/07_recombination_map_moving-median_Dsim_501k.txt",sep = ""), 
skip = 1, sep = "\t", h = F)
rmap_dsim$chr <- sapply(rmap_dsim$V1, function(x) strsplit(x, ":",fixed=T)[[1]][1]) 
rmap_dsim$start <- as.numeric(sapply(rmap_dsim$V1, function(x) strsplit(strsplit(x, ":",fixed=T)[[1]][2], "..",fixed=T)[[1]][1]))
rmap_dsim$end <- as.numeric(sapply(rmap_dsim$V1, function(x) strsplit(strsplit(x, ":",fixed=T)[[1]][2], "..",fixed=T)[[1]][2]))
colnames(rmap_dsim)[2]<-"r"

split_cM_dmel <- c(quantile(rmap_dmel$r, c(0,.2,.4,.6,.8)),max(rmap_dmel$r+.1)  )
split_cM_dsim <- c(quantile(rmap_dsim$r, c(0,.2,.4,.6,.8)),max(rmap_dsim$r+.1))  


######################
##### this study #####
######################

# load frequencies and coverages
parents_all <- readRDS(paste(path_traj, "freq_cov_F20_29.RDS", sep = ""))    

# ----- Estimate Ne for the autosomes and X separately
ne_estimates <- NULL
for(chr in c("Autosomes", "X")){ 
  if(chr == "X"){index <- which(parents_all$CHROM == "X")}else{index <- which(parents_all$CHROM != "X")}
  for (r in 1:10){ 
    temp_fr <- unlist(subset(parents_all[index, ], select = paste("F20.R", r, ".freq.29", sep = "")));
    temp_cv <- as.numeric(as.character(unlist(subset(parents_all[index, ], select = paste("F20.R", r, ".count.29", sep = "")))));
    p0 <- rep(0.3, nb_SNPs_estimateNe)
    ne <- c()
    for(j in 1:nb_rounds_estimateNe){
      ind <- sample(x = 1:length(index), size = nb_SNPs_estimateNe)
      pt <-  temp_fr[ind]
      covt <- temp_cv[ind]
      cov0 <- covt
      ne <- c(ne, estimateNe(p0 = p0, pt = pt, cov0 = cov0, covt = covt, t = 20, 
                       ploidy = 2, truncAF = 0.05, method = "P.planI", poolSize = c(1500,1500), Ncensus = 1500))
    }  
    val <- sort(ne);
    ne_estimates <- rbind(ne_estimates, data.frame(chr = chr, replicate = r, ne = median(val),
    CI_down=val[qbinom(0.025, 100, 0.5)], CI_up=val[qbinom(1-0.025, 100, 0.5)+1]))
  }
}
saveRDS(ne_estimates, paste(path_estimates_29_F20, "Ne_estimates_SO_F20.RDS"))

# ----- Estimate s for the autosomes and X separately
ne_estimates <- readRDS(paste(path_estimates_29_F20, "Ne_estimates_SO_F20.RDS"))
s_estimates <- NULL
for(chr in c("2", "3", "X")){ 
  print(chr)
  idx <- which(parents_all$CHROM_ALL == chr); tmp <- parents_all[idx, ]
  info <- as.data.frame(subset(tmp, select = c("CHROM", "POS", "CHROM_ALL", "POS_ALL")))
  # work with pseudo count for freq=0 or freq=1
  for(r in 1:10){
    vf <- tmp[[paste("F20.R",r,".freq.29",sep="")]]; vc <- tmp[[paste("F20.R",r,".count.29",sep="")]]
    ind <- which(vf == 0); if(length(ind)>0){eval(parse(text = paste("tmp$F20.R",r, ".freq.29[ind] <- 1/(vc[ind]+1)", sep = "")))}
    ind <- which(vf == 1); if(length(ind)>0){eval(parse(text = paste("tmp$F20.R", r ,".freq.29[ind] <- (vc[ind]-1)/vc[ind]", sep = "")))}
  }
  for(r in 1:10){
    print(r)
    ne <- round(ne_estimates$ne[which(ne_estimates$replicate == r & ne_estimates$chr == "Autosomes")])
    if(chr == "X"){ne <- round(ne_estimates$ne[which(ne_estimates$replicate == r & ne_estimates$chr == "X")])}
    fr <- tmp[[paste("F20.R",r,".freq.29",sep="")]]
    mat <- t(matrix(cbind(rep(0.3, length(fr)), fr), ncol=2))
    s <- apply(mat, 2,function(x) estimateSH(x, t = c(0, 20), h = 0.5, Ne = ne, haploid = FALSE, N.pval=0,method="LLS")$s)
    info <- cbind(info, data.frame(s_0_20 = s))
  }
  colnames(info) <- c("CHROM", "POS", "CHROM_ALL", "POS_ALL", paste("s_0_20.R",1:10,sep=""))
  s_estimates <- rbind(s_estimates, info)
}
saveRDS(s_estimates, paste(path_estimates_29_F20, "s_estimates_SO_F20.RDS", sep = ""))



#############################
###### Kelly and Hughes #####
############################# 

data <- fread("/Volumes/cluster/Claire/SO_PaperI/data/KellyHuguestab.txt", dec=",",sep="\t")
Ncensus <- c(1277, 849, 1187)
names(Ncensus) <- c("A", "B", "C")

ne_estimates <- NULL
for(chr in c("Autosomes", "X")){ 
  if(chr == "X"){ind <- which(data$chrom == "Scf_X")}else{ind <- which(data$chrom != "Scf_X")}
  bla <- data[ind, ]
  for (r in  c("A", "B", "C")){ 
    cov0 <- unlist(subset(bla, select = paste(r, "0.tot", sep = "")));
    covt <- unlist(subset(bla, select = paste(r, "15.tot", sep = "")));
    p0 <- unlist(subset(bla, select = paste(r, "0.freq", sep = "")));
    pt <- unlist(subset(bla, select = paste(r, "15.freq", sep = "")));
    ne <- c()
    for(j in 1:nb_rounds_estimateNe){
      index <- sample(x = 1:length(ind), size = nb_SNPs_estimateNe)
      ne <- c(ne, estimateNe(p0 = p0[index], pt =  pt[index], cov0 = cov0[index], covt = covt[index], t = 15, 
                     ploidy = 2, truncAF = 0.05, method = "P.planI", Ncensus = Ncensus[r]))
    }  
    val <- sort(ne);
    ne_estimates <- rbind(ne_estimates, data.frame(chr = chr, replicate = r, ne = median(val),
    CI_down=val[qbinom(0.025, 100, 0.5)], CI_up=val[qbinom(1-0.025, 100, 0.5)+1]))
  }
}
saveRDS(ne_estimates, paste(path_estimates_29_F20, "Ne_estimates_KellyandHughes_F15.RDS"))


ne_estimates <- readRDS(paste(path_estimates_29_F20, "Ne_estimates_KellyandHughes_F15.RDS"))
s_estimates <- NULL
for(chr in c("Scf_2L", "Scf_2R", "Scf_3L", "Scf_3R", "Scf_X")){
  ind <- which(data$chrom == chr); tmp <- data[ind, ]
  print(chr)
  for(r in c("A", "B", "C")){ 
    print(r)
    # work with pseudo count for freq=0 or freq=1
    for(g in c(0, 15)){ 
      vf <- tmp[[paste(r, g, ".freq",sep="")]]; vc <- tmp[[paste(r, g, ".tot",sep="")]]
      ind <- which(vf == 0); if(length(ind)>0){eval(parse(text = paste("tmp$",r, g, ".freq[ind] <- 1/(vc[ind]+1)", sep = "")))}
      ind <- which(vf == 1); if(length(ind)>0){eval(parse(text = paste("tmp$",r, g, ".freq[ind] <- (vc[ind]-1)/vc[ind]", sep = "")))} 
    }

    ne <- round(ne_estimates$ne[which(ne_estimates$replicate == r & ne_estimates$chr == "Autosomes")])
    if(chr == "X"){ne <- round(ne_estimates$ne[which(ne_estimates$replicate == r & ne_estimates$chr == "X")])}
    mat <- as.matrix(as.data.frame(subset(tmp, select = paste(r, c(0,15), ".freq",sep = ""))))
    s <- apply(t(mat), 2,function(x) estimateSH(x, t = c(0,15), h =0.5,Ne=ne,haploid=F,N.pval=0,method="LLS")$s)
    temp <- cbind(subset(tmp, select = c("chrom", "pos")), data.frame(s_0_15 = s))
    temp$replicate=r
    s_estimates <- rbind(s_estimates,temp)
  }
}
saveRDS(s_estimates, paste(path_estimates_29_F20, "s_estimates_KellyandHughes_F15.rds", sep = ""))



#########################
###### Barghi et al #####
######################### 
gen <- rep(c(0,10,20),each=10)
repl <- rep(1:10, 3)  

for(chr in c("2L", "2R", "3L", "3R", "X")){
  # obtained on dryad
  sync <-  read.sync(paste("/Users/cburny/Downloads/810722097/", chr, ".sync", sep = ""), gen = gen, repl = repl, polarization = "rising")
  # Extract coverages
  al <- alleles(sync)
  covf <- poolSeq::coverage(sync, gen = gen, repl = repl); col_cov <- colnames(covf)
  covf <- cbind(covf, splitLocusID(rownames(covf)))
  # Extract frequences
  freqf <- af(sync, gen = gen, repl = repl); col_freq <- colnames(freqf)
  freqf <- cbind(freqf, splitLocusID(rownames(freqf)))
  tmp <- merge(freqf, covf, by = c("chr", "pos"), all.x = T)
  tmp <- merge(al, tmp, by = c("chr", "pos"))
  ind <- which(is.na(tmp$rising))
  if(length(ind)>0){ 
    tmp <- tmp[-ind, ]
  }
  saveRDS(tmp,paste(path_estimates_29_F20, "Barghi_freq_cov_0_10_20_chr", chr, ".rds", sep = ""))
}

data <- NULL
for(chr in c("2L", "2R", "3L", "3R", "X")){
  data<-rbind(data, readRDS(paste(path_estimates_29_F20, "Barghi_freq_cov_0_10_20_chr", chr, ".rds", sep = "")))
}
ne_estimates <- NULL
for(chr in c("Autosomes", "X")){ 
  if(chr == "X"){ind <- which(data$chr == "X")}else{ind <- which(data$chr != "X")}
  bla <- data[ind, ]
  for (r in 1:10){ 
    cov0 <- unlist(subset(bla, select = paste("F0.R",r, ".cov", sep = "")));
    covt <- unlist(subset(bla, select = paste("F20.R",r, ".cov", sep = "")));
    p0 <- unlist(subset(bla, select = paste("F0.R",r, ".freq", sep = "")));
    pt <- unlist(subset(bla, select = paste("F20.R",r, ".freq", sep = "")));
    ne <- c()
    for(j in 1:nb_rounds_estimateNe){
      index <- sample(x = 1:length(ind), size = nb_SNPs_estimateNe)
      ne <- c(ne, estimateNe(p0 = p0[index], pt =  pt[index], cov0 = cov0[index], covt = covt[index], t = 20, 
                     ploidy = 2, truncAF = 0.05, method = "P.planI", Ncensus = 1000))
    }  
    val <- sort(ne);
    ne_estimates <- rbind(ne_estimates,data.frame(chr = chr, replicate = r, ne = median(val),
    CI_down=val[qbinom(0.025, 100, 0.5)], CI_up=val[qbinom(1-0.025, 100, 0.5)+1]))
  }
}
saveRDS(ne_estimates, paste(path_estimates_29_F20, "Ne_estimates_Barghietal_F20.RDS"))


ne_estimates <- readRDS(paste(path_estimates_29_F20, "Ne_estimates_Barghietal_F20.RDS"))
s_estimates <- NULL
for(chr in c("2L", "2R", "3L", "3R", "X")){
  print(chr)
  tmp <- readRDS(paste(path_estimates_29_F20, "Barghi_freq_cov_0_10_20_chr", chr, ".rds", sep = ""))
  # work with pseudo count for freq=0 or freq=1
  for(r in 1:10){
    for(g in c(0, 10,20)){
      vf <- tmp[[paste("F",g,".R",r,".freq",sep="")]]; vc <- tmp[[paste("F",g,".R",r,".cov",sep="")]]
      ind <- which(vf == 0); if(length(ind)>0){eval(parse(text = paste("tmp$F",g,".R",r, ".freq[ind] <- 1/(vc[ind]+1)", sep = "")))}
      ind <- which(vf == 1); if(length(ind)>0){eval(parse(text = paste("tmp$F",g,".R", r ,".freq[ind] <- (vc[ind]-1)/vc[ind]", sep = "")))} 
    } 
  }
  res <- subset(tmp, select = c("chr", "pos"))
  for(r in 1:10){ 
    print(r)
    mat <- as.matrix(as.data.frame(subset(tmp, select = paste("F", c(0,10,20), ".R",r,".freq",sep = ""))))
    s <- apply(t(mat), 2,function(x) estimateSH(x, t = c(0,10,20), h =0.5,Ne=NA,haploid=F,N.pval=0,method="LLS")$s)
    res <- cbind(res, data.frame(s_0_20 = s))
  }
  s_estimates <- rbind(s_estimates,res)
}
colnames(s_estimates) <- c("chrom", "pos", paste("s_0_20.R",1:10,sep=""))
saveRDS(s_estimates, paste(path_traj, "s_estimates_Barghi_F20.rds", sep = ""))


##### Mean Ne
ne_so <- readRDS(paste(path_estimates_29_F20, "Ne_estimates_SO_F20.RDS"))
print(mean(ne_so$ne[ne_so$chr == "Autosomes"])) #55.4754
print(mean(ne_so$ne[ne_so$chr == "X"])) #20.36408

ne_b <- readRDS(paste(path_estimates_29_F20, "Ne_estimates_Barghietal_F20.RDS"))
print(mean(ne_b$ne[ne_b$chr == "Autosomes"])) #228.3092
print(mean(ne_b$ne[ne_b$chr == "X"])) #202.8547

ne_k <- readRDS(paste(path_estimates_29_F20, "Ne_estimates_KellyandHughes_F15.RDS"))
print(mean(ne_k$ne[ne_k$chr == "Autosomes"])) #378.8373
print(mean(ne_k$ne[ne_k$chr == "X"])) # 391.1897


#######################################################
##### Jacknnife for each replicate per chromosome #####
#######################################################

res_samore <- readRDS( paste(path_estimates_29_F20, "s_estimates_SO_F20.RDS", sep = ""))
res_barghi <- readRDS( paste(path_traj, "s_estimates_Barghi_F20.rds", sep = ""))
res_kelly <- readRDS(paste(path_estimates_29_F20, "s_estimates_KellyandHughes_F15.rds", sep = ""))

theta <- function(x){median(x)}
estimates <- NULL
min <- 10000
for(chr in c("2", "3", "X")){
  print(chr)
  chrs <- chr
  if(chr == "2"){chrs <- c("2L","2R")}
  if(chr == "3"){chrs <- c("3L","3R")}
  idx <- which(res_barghi$chrom %in% chrs); tmp_barghi <- na.omit(res_barghi[idx, ]); nb <- dim(tmp_barghi)[1]
  idx <- which(res_samore$CHROM  %in% chrs); tmp_samore <- na.omit(res_samore[idx, ]); nso <- dim(tmp_samore)[1]
  
  for(repl in 1:10){
    print(repl)
    ind_b <- sample(1:nb, min); 
    x <- abs(tmp_barghi[[paste("s_0_20.R", repl, sep = "")]][ind_b])  
    results_barghi <- jackknife(x, theta)  
    m <- median(x) 
    se <- results_barghi$jack.se
    estimates <- rbind(estimates, data.frame(dataset = "Barghi et al, 2019", chr = chr, replicate = repl,
    median = m, jack.median=m- results_barghi$jack.bias,jack.bias = results_barghi$jack.bias, jack.se = se, 
    tCI.down = m- results_barghi$jack.bias-qt(.975, df=min-1)*se, tCI.up = m- results_barghi$jack.bias+qt(.975, df=min-1)*se))
  }  
       
  for(repl in c("A", "B", "C")){
    print(repl)
    idx <- which(res_kelly$replicate == repl & res_kelly$chrom %in% paste("Scf_", chrs, sep = "")); 
    s <- na.omit(abs(res_kelly[["s_0_15"]][idx]))
    ind_k <- sample(1:length(s), min); 
    x <- s[ind_k]
    results_kelly <- jackknife(x, theta)  
    m <- median(x) 
    se <- results_kelly$jack.se
    estimates <- rbind(estimates, data.frame(dataset = "Kelly and Hughes, 2019", chr = chr, replicate = repl,
    median = m, jack.median=m- results_kelly$jack.bias,jack.bias = results_kelly$jack.bias, jack.se = se, 
    tCI.down = m- results_kelly$jack.bias-qt(.975, df=min-1)*se, tCI.up = m- results_kelly$jack.bias+qt(.975, df=min-1)*se))
  }                        
  for(repl in 1:10){
    ind_so <- sample(1:nso, min); 
    x <- abs(tmp_samore[[paste("s_0_20.R", repl, sep = "")]][ind_so])  
    results_so <- jackknife(x, theta)  
    m <- median(x) 
    se <- results_so$jack.se
    estimates <- rbind(estimates, data.frame(dataset = "This study", chr = chr, replicate = repl,
    median = m, jack.median=m- results_so$jack.bias,jack.bias = results_so$jack.bias, jack.se = se, 
    tCI.down = m- results_so$jack.bias-qt(.975, df=min-1)*se, tCI.up = m- results_so$jack.bias+qt(.975, df=min-1)*se))
  } 
}
saveRDS(estimates, paste(path_estimates_29_F20, "abs_s_median_jack_alldatasets.RDS", sep = ""))
