######
# Aim: read MimicrEE2 neutral simulations
######

# ----- global variables, functions and packages ----- #
source("/Volumes/cluster/Claire/SO_PaperI/manuscript/scripts/global_variables.R")
source("/Volumes/cluster/Claire/SO_PaperI/manuscript/scripts/functions.R")

# --- read simulated sync file and downsample
# Sampling design of the simulated data. `g` and `r` hold one entry per
# replicate x time-point combination: `g` cycles through the generations
# within each replicate and `r` repeats each replicate index once per
# time point. They are later passed as the `gen` / `repl` labels when the
# sync file is read.
repl <- seq_len(10)                        # replicates sequenced
gen <- c(0, 1, 10, 20)                     # generations sequenced for null
nb_repl <- length(repl)
nb_tp <- length(gen)                       # number of time points sequenced
g <- rep(gen, times = nb_repl)
r <- rep(seq_len(nb_repl), each = nb_tp)

# windows boundaries
pos_w_all <- read.table(
  paste0(path_list, "pos_window_GW_parents_nooverlap.txt"),
  sep = "\t", header = TRUE
)
wsmooth <- 250 # smoothing width used to pick rows of pos_w_all (column `wsmooth`)

# Parental table: one "chr<chr>_parental.txt" file per entry of `chrom`
# (defined outside this file), stacked row-wise in the order of `chrom`.
parent <- do.call(
  rbind,
  lapply(chrom, function(chr) {
    fread(paste0(path_list, "chr", chr, "_parental.txt"), sep = "\t", header = TRUE)
  })
)

# Table stored in freq_cov_F20_29.RDS; the columns read from it in this script
# are CHROM, POS, CHROM_ALL, POS_ALL and F20.R<i>.count.29.
freq_29 <- readRDS(paste0(path_traj, "freq_cov_F20_29.RDS"))

# uncompress simulated file
# Only decompress when the uncompressed sync file is not there yet, so that the
# script can be re-run after a first successful pass.
# NOTE(review): assumes gunzip() is R.utils::gunzip, whose defaults refuse to
# overwrite an existing output file and delete the .gz afterwards - confirm.
sync_gz_path <- paste0(path_neutral_simu_29_F20, "/MassX_30_70_targets0_run1_s0.sync.gz")
sync_unzipped_path <- sub("\\.gz$", "", sync_gz_path)
if (!file.exists(sync_unzipped_path)) {
  gunzip(sync_gz_path)
}

# define coverage average for coverage noise
# For every site on 2L, 2R, 3L, 3R and X, average the F20.R<i>.count.29 columns
# of freq_29 over the replicates; the rounded mean of these per-site averages
# is the Poisson rate used further down to draw sequencing depths.
# The per-site result is deliberately NOT stored in a variable called `sum`,
# and the divisor follows the number of count columns instead of a literal 10.
count_cols <- paste0("F20.R", 1:10, ".count.29")
freq_29_arms <- freq_29[freq_29$CHROM %in% c("2L", "2R", "3L", "3R", "X"), ]
mean_depth_per_site <- apply(
  subset(freq_29_arms, select = count_cols), 1,
  function(x) sum(as.numeric(x)) / length(count_cols)
)
poisson <- round(mean(mean_depth_per_site)) #122

# Read sync file
# Columns are labelled with the generation (`g`) and replicate (`r`) vectors;
# allele frequencies are polarized on the minor allele.
simulated_sync_path <- paste0(path_neutral_simu_29_F20, "/MassX_30_70_targets0_run1_s0.sync")
sync <- read.sync(
  file = simulated_sync_path,
  gen = g,
  repl = r,
  polarization = "minor"
)

# Extract alleles info
al <- alleles(sync) # allele table; the minor allele is the one whose frequency is computed
al_major <- al$major
al_minor <- al$minor

# Extract frequencies (one column per generation x replicate combination)
freq <- af(sync, gen = g, repl = r)

# sample total depth per site from a Poisson and rewrite it in a new sync file
# Step 1: one Poisson draw (rate `poisson`) per cell of the frequency table.
nb_SNPs <- nrow(freq)
nb_freq_cols <- ncol(freq)
cov_sampled <- matrix(
  rpois(nb_SNPs * nb_freq_cols, lambda = poisson),
  nrow = nb_SNPs,
  ncol = nb_freq_cols
)

# Step 2: minor-allele count for every cell, obtained by calling do_binom()
# on each (depth, frequency) pair.
# NOTE(review): do_binom() is defined outside this file; presumably a binomial
# draw of size = depth and prob = frequency - confirm.
ct_minor_tmp <- mapply(do_binom, cov_sampled, freq)
ct_minor <- matrix(ct_minor_tmp, nrow = nb_SNPs, ncol = nb_freq_cols, byrow = FALSE)

# Step 3: the remaining reads are attributed to the other (major) allele.
ct_other <- cov_sampled - ct_minor
  
# Encode the sampled counts as sync count strings "A:T:C:G:N:del", one column
# of `cd` per generation x replicate combination. The N and del fields are
# always 0; the minor-allele count is written first and the major-allele count
# second, so the major count wins if both alleles map to the same base.
sync_bases <- c("A", "T", "C", "G")          # field order inside a count string
minor_field <- match(al_minor, sync_bases)
major_field <- match(al_major, sync_bases)
minor_sites <- which(!is.na(minor_field))    # sites whose minor allele is A/T/C/G
major_sites <- which(!is.na(major_field))    # sites whose major allele is A/T/C/G
minor_cells <- cbind(minor_sites, minor_field[minor_sites])
major_cells <- cbind(major_sites, major_field[major_sites])

nb_sync_cols <- nb_repl * nb_tp
cd <- matrix(NA_character_, nrow = nb_SNPs, ncol = nb_sync_cols)
for (col in seq_len(nb_sync_cols)) {
  # site x field count matrix for this sample (columns: A, T, C, G, N, del)
  field_counts <- matrix(0, nrow = nb_SNPs, ncol = 6)
  field_counts[minor_cells] <- ct_minor[minor_sites, col]
  field_counts[major_cells] <- ct_other[major_sites, col]
  cd[, col] <- paste(
    field_counts[, 1], field_counts[, 2], field_counts[, 3],
    field_counts[, 4], field_counts[, 5], field_counts[, 6],
    sep = ":"
  )
}

# Write chr / pos / ref followed by the count strings, tab-separated and
# without header, to the downsampled sync file.
write.table(
  cbind(subset(al, select = c("chr", "pos", "ref")), cd),
  paste0(path_neutral_simu_29_F20, "/MassX_30_70_targets0_run1_s0_downsampled.sync"),
  sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE
)
      
# read downsampled sync file
# The file has no header: the first three columns are renamed, the remaining
# ones (one per generation x replicate combination) keep fread's default names.
downsampled_sync_path <- paste0(
  path_neutral_simu_29_F20,
  "/MassX_30_70_targets0_run1_s0_downsampled.sync"
)
sync <- fread(downsampled_sync_path, sep = "\t", header = FALSE)
colnames(sync)[1:3] <- c("CHROM", "POS", "REF")
 
# Build, for every sequenced generation, the per-replicate depth and the
# frequency of the parental `or` allele at each site, then join the
# generations into one wide table and write it to disk.

# Split a vector of sync count strings ("A:T:C:G:N:del") into a numeric matrix
# with one row per string and one column per field. Stops if any string does
# not have exactly `n_fields` fields.
split_sync_counts <- function(x, n_fields = 6L) {
  fields <- strsplit(as.character(x), ":", fixed = TRUE)
  stopifnot(all(lengths(fields) == n_fields))
  matrix(as.numeric(unlist(fields)), ncol = n_fields, byrow = TRUE)
}

# Column positions (in `sync`) and labels of each time point, derived from the
# sampling design instead of being listed by hand; +3 skips CHROM/POS/REF.
idx <- lapply(gen, function(tp) which(g == tp) + 3)
lbl <- as.character(gen)
for (j in seq_along(gen)) {
  print(j)
  tmp <- merge(
    subset(sync, select = c("CHROM", "POS", "REF", colnames(sync)[idx[[j]]])),
    subset(parent, select = c("CHROM", "POS", "or"))
  )
  pos <- subset(tmp, select = c("CHROM", "POS", "or"))
  tmp_cov <- subset(tmp, select = setdiff(colnames(tmp), c("CHROM", "POS", "REF", "or")))

  # one site x field count matrix per replicate column, parsed once
  field_counts <- lapply(tmp_cov, split_sync_counts)

  # total depth = sum of all fields of the count string
  depth <- do.call(cbind, lapply(field_counts, rowSums))

  # NOTE(review): `code` comes from the sourced globals; it is used here as a
  # lookup from the `or` allele to the numeric position of that allele inside
  # a count string - confirm it is a one-row numeric matrix with allele names
  # as column names.
  p <- code[, pos$or]
  or_cells <- cbind(seq_along(p), unname(p))
  or_count <- do.call(cbind, lapply(field_counts, function(m) m[or_cells]))

  or_freq <- or_count / depth
  colnames(or_freq) <- paste0("F", lbl[j], ".R", repl, ".freq")
  colnames(depth) <- paste0("F", lbl[j], ".R", repl, ".cov")
  if (j == 1) {
    all <- cbind(pos, depth, or_freq)
  } else {
    all <- merge(all, cbind(pos, depth, or_freq), by = c("CHROM", "POS", "or"))
  }
}
write.table(
  all,
  paste0(path_neutral_simu_29_F20, "/MassX_30_70_targets0_run1_s0_downsampled.txt"),
  sep = "\t", col.names = TRUE, quote = FALSE, row.names = FALSE
)

# ----- smooth the downsampled freq
# Reload the per-site table written above.
downsampled_table_path <- paste0(
  path_neutral_simu_29_F20,
  "/MassX_30_70_targets0_run1_s0_downsampled.txt"
)
all <- fread(downsampled_table_path, sep = "\t", header = TRUE)

# Windows defined for the current smoothing width, split by coordinate unit.
has_current_width <- pos_w_all$wsmooth == wsmooth
pos_w_all_cM <- pos_w_all[which(pos_w_all$unit == "cM" & has_current_width), ]
pos_w_all_bp <- pos_w_all[which(pos_w_all$unit == "bp" & has_current_width), ]
# both data frames are ordered the same (the code below relies on this and does not check it)

# Attach the CHROM_ALL / POS_ALL coordinates stored in freq_29 to every site.
all <- merge(
  all,
  subset(freq_29, select = c("CHROM", "POS", "CHROM_ALL", "POS_ALL")),
  by = c("CHROM", "POS")
)
# Smooth the F20 frequencies of every replicate in windows, one chromosome at a
# time, and save the stacked result (columns: pos, cM, F20.R<i>, chr).
# - The replicate loop uses its own index so that the global replicate-label
#   vector `r` is left untouched, and it follows `repl` rather than a literal
#   1:10.
# - Per-chromosome tables are collected in a list and bound once at the end.
chr_to_smooth <- c("2", "3", "X")
smoothed_by_chr <- vector("list", length(chr_to_smooth))
for (i in seq_along(chr_to_smooth)) {
  chr <- chr_to_smooth[i]
  print(chr)

  # sites of this chromosome, sorted by position
  mat_data <- all[all$CHROM_ALL == chr, ]
  mat_data <- mat_data[order(mat_data$POS_ALL), ]
  x <- mat_data$POS_ALL

  # window boundaries (bp) and window midpoints (bp and cM) for this chromosome
  pos_w <- pos_w_all_bp[pos_w_all_bp$chr == chr, ]
  cM_w <- pos_w_all_cM[pos_w_all_cM$chr == chr, ]

  sm <- data.frame(pos = pos_w$pos_mean, cM = cM_w$pos_mean)
  for (rep_id in repl) {
    fr <- mat_data[[paste0("F20.R", rep_id, ".freq")]]
    # NOTE(review): do_smooth() is defined outside this file; it is given the
    # window left/right bounds, the site frequencies and the site positions,
    # and is expected to return one value per window - confirm.
    smooth <- do_smooth(pos_w$pos_left, pos_w$pos_right, fr, x)
    sm <- cbind(sm, smooth)
  }
  sm$chr <- chr
  colnames(sm) <- c("pos", "cM", paste0("F20.R", repl), "chr")
  smoothed_by_chr[[i]] <- sm
}
smoothed_data <- do.call(rbind, smoothed_by_chr)
saveRDS(
  smoothed_data,
  paste0(
    path_neutral_simu_29_F20,
    "/smoothed_w=", wsmooth, "_MassX_30_70_targets0_run1_s0_downsampled.RDS"
  )
)