######
# Aim: read MimicrEE2 simulations with selection
######

# ----- global variables, functions and packages ----- #
source("/Volumes/cluster/Claire/SO_PaperI/manuscript/scripts/global_variables.R")
source("/Volumes/cluster/Claire/SO_PaperI/manuscript/scripts/functions.R")

# --- sequencing design of the simulated sync file
chr <- "X"
repl <- 1:10 # replicates sequenced
gen <- c(0, 1, 10, 20, 60) # generations sequenced for null
nb_repl <- length(repl)
nb_tp <- length(gen) # number of time points sequenced
# one entry per sample column: generations cycle fastest, the replicate
# index changes every nb_tp entries
g <- rep(gen, times = nb_repl)
r <- rep(seq_len(nb_repl), each = nb_tp)

# windows boundaries (table of windows and the smoothing window size used
# to select rows from it)
pos_w_all <- read.table(paste0(path_list, "pos_window_GW_parents_nooverlap.txt"),
                        sep = "\t", header = TRUE)
wsmooth <- 250

# parental table and empirical frequency/coverage object
parent <- fread(paste0(path_list, "chrX_parental.txt"), sep = "\t", header = TRUE)
freq_29 <- readRDS(paste0(path_traj, "freq_cov_F20_29.RDS"))

# run
i <- 1
suffix <- "1target"

# Divisor turning the extracted allele counts into frequencies.
# NOTE(review): presumably the number of chromosomes per simulated
# population -- confirm against the simulation settings.
n_chrom_simu <- 3000

sync_file <- paste0(path_simu_29_F20, "/", suffix, "_run", i, "_v2.sync")

# uncompress simulated file; skipped when the uncompressed file is already
# present so that the script can be re-run
# (NOTE(review): gunzip is presumably R.utils::gunzip, which removes the
# .gz by default -- confirm)
if (!file.exists(sync_file)) {
  gunzip(paste0(sync_file, ".gz"))
}

# Read sync file: three annotation columns followed by one column per sample
sync <- fread(sync_file, sep = "\t", header = FALSE)
colnames(sync)[1:3] <- c("CHROM", "POS", "REF")

# Column positions (offset by the 3 annotation columns) and labels of each
# sequenced generation, derived from `gen` rather than hard-coded
idx <- lapply(gen, function(tp) which(g == tp) + 3)
lbl <- as.character(gen)

for (j in seq_along(gen)) {
  print(j)
  # keep the samples of generation j and attach the parental "or" column
  tmp <- merge(subset(sync, select = c("CHROM", "POS", "REF", colnames(sync)[idx[[j]]])),
               subset(parent, select = c("CHROM", "POS", "or")))
  pos <- subset(tmp, select = c("CHROM", "POS", "or"))
  tmp_cov <- subset(tmp, select = setdiff(colnames(tmp), c("CHROM", "POS", "REF", "or")))
  # index of the ":"-separated field to read for each row, looked up in `code`
  p <- code[, pos$or]
  # parse one sample column at a time: split every entry on ":" and keep the
  # p[k]-th field of row k (result: rows x samples numeric matrix)
  freq <- do.call(cbind, lapply(tmp_cov, function(sample_col) {
    fields <- strsplit(as.character(sample_col), ":", fixed = TRUE)
    as.numeric(vapply(seq_along(fields),
                      function(k) fields[[k]][p[k]],
                      character(1)))
  }))
  test <- freq / n_chrom_simu
  colnames(test) <- paste0("F", lbl[j], ".R", repl, ".freq")
  if (j == 1) {
    all <- cbind(pos, test)
  } else {
    all <- merge(all, cbind(pos, test), by = c("CHROM", "POS", "or"))
  }
}
write.table(all, paste0(path_simu_29_F20, "/", suffix, "_run", i, "_v2.txt"),
            sep = "\t", col.names = TRUE, quote = FALSE, row.names = FALSE)

# ----- smooth freq
all <- fread(paste0(path_simu_29_F20, "/", suffix, "_run", i, "_v2.txt"),
             sep = "\t", header = TRUE)
pos_w_all_cM <- pos_w_all[which(pos_w_all$unit == "cM" & pos_w_all$wsmooth == wsmooth), ]
pos_w_all_bp <- pos_w_all[which(pos_w_all$unit == "bp" & pos_w_all$wsmooth == wsmooth), ]
# both data frames are ordered the same
all <- merge(all, subset(freq_29, select = c("CHROM", "POS", "CHROM_ALL", "POS_ALL")),
             by = c("CHROM", "POS"))
mat_data <- all[all$CHROM_ALL == chr, ]
index <- order(mat_data$POS_ALL)
mat_data <- mat_data[index, ]
pos_w <- pos_w_all_bp[pos_w_all_bp$chr == chr, ]
tmp_pos_left <- pos_w$pos_left
tmp_pos_right <- pos_w$pos_right
tmp_pos_mid <- pos_w$pos_mean
cM_w <- pos_w_all_cM[pos_w_all_cM$chr == chr, ]
tmp_cM_mid <- cM_w$pos_mean
# the bp and cM window tables are combined row by row below, so they must
# describe the same number of windows
stopifnot(length(tmp_pos_mid) == length(tmp_cM_mid))
x <- mat_data$POS_ALL

# One smoothed column per generation x replicate, generation-major order.
# The loop index is deliberately not named `g` or `r`, which hold the
# per-column design vectors used when reading the sync file.
smooth_names <- paste0("F", rep(gen, each = length(repl)),
                       ".R", rep(repl, times = length(gen)))
smooth_cols <- vector("list", length(smooth_names))
for (k in seq_along(smooth_names)) {
  fr <- mat_data[[paste0(smooth_names[k], ".freq")]]
  smooth_cols[[k]] <- do_smooth(tmp_pos_left, tmp_pos_right, fr, x)
}
sm <- cbind(data.frame(pos = tmp_pos_mid, cM = tmp_cM_mid),
            do.call(cbind, smooth_cols))
sm$chr <- chr
colnames(sm) <- c("pos", "cM", smooth_names, "chr")
smoothed_data <- sm
saveRDS(smoothed_data,
        paste0(path_simu_29_F20, "/smoothed_w=", wsmooth, "_", suffix, "_run", i, "_v2.RDS"))


# ---- Fit between empirical data and simulated data
# For each simulated scenario (1, 2, 3, 4 and 6 targets) the per-window mean
# of the F20 columns across replicates is compared with the empirical one:
# sum of squared errors and a paired t-test, p-values BH-adjusted.

# F20 columns averaged across replicates
f20_cols <- paste0("F20.R", repl)

# Read the smoothed simulation saved for a given number of targets.
read_smoothed_target <- function(n_target) {
  readRDS(paste0(path_simu_29_F20, "/smoothed_w=", wsmooth, "_", n_target,
                 "target_run", i, "_v2.RDS"))
}

# Row-wise mean of the F20 replicate columns of a smoothed table.
mean_f20 <- function(smoothed) {
  rowMeans(subset(smoothed, select = f20_cols))
}

# Sum of squared differences between a simulated and the empirical mean;
# both vectors must have one value per window, in the same order.
sse_to_empirical <- function(sim_mean, emp_mean) {
  stopifnot(length(sim_mean) == length(emp_mean))
  sum((sim_mean - emp_mean)^2)
}

# p-value of the paired t-test between a simulated and the empirical mean.
paired_p_value <- function(sim_mean, emp_mean) {
  stopifnot(length(sim_mean) == length(emp_mean))
  t.test(sim_mean, emp_mean, paired = TRUE)$p.value
}

empirical <- readRDS(paste0(path_traj, "/smoothed_w=", wsmooth, "_29_F20.RDS"))
empirical <- empirical[empirical$chr == chr, ]
target1 <- read_smoothed_target(1)
target2 <- read_smoothed_target(2)
target3 <- read_smoothed_target(3)
target4 <- read_smoothed_target(4)
target6 <- read_smoothed_target(6)
mean1 <- mean_f20(target1)
mean2 <- mean_f20(target2)
mean3 <- mean_f20(target3)
mean4 <- mean_f20(target4)
mean6 <- mean_f20(target6)
meane <- mean_f20(empirical)

# values in the trailing comments are those recorded by the original author
sse1 <- sse_to_empirical(mean1, meane) #25.41169
sse2 <- sse_to_empirical(mean2, meane) #2.471938
sse3 <- sse_to_empirical(mean3, meane) #18.51964
sse4 <- sse_to_empirical(mean4, meane) #0.7290368
sse6 <- sse_to_empirical(mean6, meane) #0.3495659

d1 <- paired_p_value(mean1, meane)
d2 <- paired_p_value(mean2, meane)
d3 <- paired_p_value(mean3, meane)
d4 <- paired_p_value(mean4, meane)
d6 <- paired_p_value(mean6, meane)
p <- p.adjust(c(d1, d2, d3, d4, d6), "BH")
#4.160172e-87 5.385501e-05 4.505434e-84 6.973662e-03 2.055998e-01
