######
# Aim: perform analysis
######

# ----- global variables, functions and packages ----- #
source("/Volumes/cluster/Claire/SO_PaperI/manuscript/scripts/global_variables.R")
source("/Volumes/cluster/Claire/SO_PaperI/manuscript/scripts/functions.R")

# Analysis-wide settings.
# NOTE(review): nb_start_or / nb_start_sam are not used in the visible part of
# this script; presumably consumed by the sourced helpers or later code.
nb_start_or <- 100
nb_start_sam <- 200

# Smoothing window size: selects rows of the window table through its
# `wsmooth` column and is embedded in the input/output file names below.
wsmooth <- 500

# RGB components (3 x 3 matrix from col2rgb) of the first three of five magma
# colours, one column per chromosome.
color_chromosomes <- col2rgb(magma(5)[1:3])
colnames(color_chromosomes) <- c("2", "3", "X")

color_baseline <- rgb(114 / 255, 25 / 255, 14 / 255)

# ---- smooth AF
# Read the window table and the allele-frequency table, then keep only the
# windows built for the current `wsmooth`, once in bp and once in cM.
pos_w_all <- read.table(paste0(path_list, "pos_window_GW_parents_nooverlap.txt"), sep = "\t", header = TRUE)
freq_29 <- readRDS(paste0(path_traj, "freq_cov_F20_29.RDS"))
is_current_wsmooth <- pos_w_all$wsmooth == wsmooth
pos_w_all_cM <- pos_w_all[which(pos_w_all$unit == "cM" & is_current_wsmooth), ]
pos_w_all_bp <- pos_w_all[which(pos_w_all$unit == "bp" & is_current_wsmooth), ]
# both data frames are ordered the same 
#> table(pos_w_all_bp$chr)
#  2   3   X 
#762 844 256 
#> mean(pos_w_all_bp$size[pos_w_all_bp$chr!="X"])/1000
#[1] 67.78318
#> mean(pos_w_all_bp$size[pos_w_all_bp$chr=="X"])/1000
#[1] 90.46782

# Build one table with columns pos, cM, F20.R1..F20.R10, chr by stacking the
# per-chromosome results.
replicate_cols <- paste0("F20.R", 1:10)
smoothed_data <- NULL
for (chr in c("2", "3", "X")) {
  print(chr)

  # Frequencies of this chromosome, sorted by position.
  mat_data <- freq_29[freq_29$CHROM_ALL == chr, ]
  index <- order(mat_data$POS_ALL)
  mat_data <- mat_data[index, ]
  x <- mat_data$POS_ALL

  # Window boundaries (bp) and window midpoints (bp and cM) of this chromosome.
  pos_w <- pos_w_all_bp[pos_w_all_bp$chr == chr, ]
  cM_w <- pos_w_all_cM[pos_w_all_cM$chr == chr, ]

  # Chromosome "4" would be kept unsmoothed with cM = 0; with the chromosome
  # list above this branch is never taken.
  keep_raw <- chr == "4"
  if (keep_raw) {
    sm <- data.frame(pos = x, cM = rep(0, length(x)))
  } else {
    sm <- data.frame(pos = pos_w$pos_mean, cM = cM_w$pos_mean)
  }

  # One column per replicate. do_smooth() is defined outside this view
  # (presumably in the sourced functions.R); it receives the window
  # left/right bounds, the replicate's frequencies and their positions.
  for (r in 1:10) {
    fr <- mat_data[[paste0("F20.R", r, ".freq.29")]]
    smooth <- if (keep_raw) fr else do_smooth(pos_w$pos_left, pos_w$pos_right, fr, x)
    sm <- cbind(sm, smooth)
  }

  sm$chr <- chr
  colnames(sm) <- c("pos", "cM", replicate_cols, "chr")
  smoothed_data <- rbind(smoothed_data, sm)
}
saveRDS(smoothed_data, paste0(path_traj, "/smoothed_w=", wsmooth, "_29_F20.RDS"))


# ---- Load smoothed neutral and empirical datasets
# Observed table written by the smoothing step, and the neutral-simulation
# table stored under the same window size.
smoothed_obs_29 <- readRDS(paste0(path_traj, "/smoothed_w=", wsmooth, "_29_F20.RDS"))
smoothed_neutral_29 <- readRDS(
  paste0(path_neutral_simu_29_F20, "/smoothed_w=", wsmooth, "_MassX_30_70_targets0_run1_s0_downsampled.RDS")
)

# Sort both tables by chromosome, then by position within chromosome.
ind <- with(smoothed_obs_29, order(chr, pos))
smoothed_obs_29 <- smoothed_obs_29[ind, ]
ind <- with(smoothed_neutral_29, order(chr, pos))
smoothed_neutral_29 <- smoothed_neutral_29[ind, ]

# Reload the full window table (all units and window sizes).
pos_w_all <- read.table(paste0(path_list, "pos_window_GW_parents_nooverlap.txt"), sep = "\t", header = TRUE)


# --- Remove the last window of each chromosome, which contains <wsmooth SNPs
# The "last window" is the last row of each chromosome in the table's current
# row order, so both tables must already be sorted by chr and pos.

# Row indices of the final row of every chromosome in `smoothed`, named by
# chromosome. Chromosomes absent from the table contribute no index.
last_window_rows <- function(smoothed, chromosomes = c("2", "3", "X")) {
  rows <- lapply(chromosomes, function(chr) tail(which(smoothed$chr == chr), 1))
  names(rows) <- chromosomes
  unlist(rows)
}

# Drop `rows` from `smoothed`. An empty index leaves the table untouched:
# a bare `smoothed[-integer(0), ]` would silently drop every row.
drop_rows <- function(smoothed, rows) {
  if (length(rows) == 0) {
    return(smoothed)
  }
  smoothed[-rows, , drop = FALSE]
}

# The indices are computed separately for each table: reusing the observed
# indices on the neutral table is only correct when both tables have exactly
# the same number of rows per chromosome.
to_remove <- last_window_rows(smoothed_obs_29)
smoothed_obs_29 <- drop_rows(smoothed_obs_29, to_remove)
smoothed_neutral_29 <- drop_rows(smoothed_neutral_29, last_window_rows(smoothed_neutral_29))


# --- median O AF at F29
# One row per (replicate, chromosome) holding the median smoothed frequency,
# ordered by replicate first and chromosome second.
# The result is stored in a variable called `median`; the call median(...)
# below still resolves to the function because R skips non-function bindings
# when looking up a name in call position.
median_rows <- list()
for (r in 1:10) {
  replicate_col <- paste0("F20.R", r)
  for (chr in c("2", "3", "X")) {
    tmp <- smoothed_obs_29[smoothed_obs_29$chr == chr, ]
    median_rows[[length(median_rows) + 1]] <- data.frame(
      chr = chr,
      repl = r,
      median = median(tmp[[replicate_col]])
    )
  }
}
median <- do.call(rbind, median_rows)
print(summary(median))


# ---- MDS
# Distance matrix between all samples
# Samples are the 10 replicates x {observed, neutral} x {2, 3, X}, named
# "F20.R<r>.<obs|neut>.<chr>". The distance between two samples is
# sqrt(2 * (1 - rho)), with rho the Spearman correlation of their smoothed
# frequencies. The diagonal stays at 0.
sample_names <- unlist(lapply(c("2", "3", "X"), function(chr) {
  c(paste0("F20.R", 1:10, ".obs.", chr), paste0("F20.R", 1:10, ".neut.", chr))
}))
mat <- matrix(0, nrow = length(sample_names), ncol = length(sample_names),
              dimnames = list(sample_names, sample_names))
nm <- rownames(mat)

# Parse every sample name once instead of inside the double loop.
name_parts <- strsplit(nm, ".", fixed = TRUE)
sample_col <- vapply(name_parts, function(p) paste0("F20.", p[2]), character(1))
sample_type <- vapply(name_parts, function(p) p[3], character(1))
sample_chr <- vapply(name_parts, function(p) tail(p, 1), character(1))

# Each unordered pair is computed once, with i < j. When the two chromosomes
# have different numbers of windows the longer one is randomly subsampled, and
# the seed (1991 for i, 1992 for j) depends on which side is longer, so the
# orientation matters. The former full i != j loop computed both orientations
# and kept the later one, which is exactly this i < j orientation; the
# resulting matrix is therefore unchanged.
for (j in seq_along(nm)[-1]) {
  samplej <- if (sample_type[j] == "obs") smoothed_obs_29 else smoothed_neutral_29
  rows_j <- which(samplej$chr == sample_chr[j])
  for (i in seq_len(j - 1)) {
    samplei <- if (sample_type[i] == "obs") smoothed_obs_29 else smoothed_neutral_29
    index_i <- which(samplei$chr == sample_chr[i])
    index_j <- rows_j
    nbi <- length(index_i)
    nbj <- length(index_j)
    # Equalise lengths by subsampling the longer chromosome. The sampled rows
    # are in random order, so cross-chromosome pairs compare unrelated windows.
    if (nbi > nbj) {
      set.seed(1991)
      index_i <- sample(index_i, nbj)
    } else if (nbj > nbi) {
      set.seed(1992)
      index_j <- sample(index_j, nbi)
    }
    n <- samplei[[sample_col[i]]][index_i]
    e <- samplej[[sample_col[j]]][index_j]
    mat[i, j] <- sqrt(2 * (1 - cor(n, e, method = "spearman")))
    mat[j, i] <- mat[i, j]
  }
}
saveRDS(mat, paste0(path_estimates_29_F20, "/pw_spearman_mat_29_F20_w=", wsmooth, ".RDS"))
  
mat <- readRDS(paste0(path_estimates_29_F20, "/pw_spearman_mat_29_F20_w=", wsmooth, ".RDS"))

# Per chromosome: convert the within-chromosome distances back to Spearman
# correlations (rho = 1 - d^2 / 2) and test whether their mean differs from 0,
# separately for neutral and empirical replicate pairs. The p-values are
# collected in pn (neutral) and pe (empirical).
pn <- c()
pe <- c()
for (chr in c("2", "3", "X")) {
  print(chr)
  chr_samples <- c(paste0("F20.R", 1:10, ".obs.", chr), paste0("F20.R", 1:10, ".neut.", chr))
  temp <- mat[chr_samples, chr_samples]
  names <- colnames(temp)

  # Neutral samples carry ".neut" in their name; the rest are empirical.
  col_neutral <- which(grepl(".neut", names, fixed = TRUE))
  col_empirical <- setdiff(seq_along(names), col_neutral)
  mat_neutral <- temp[col_neutral, col_neutral]
  mat_empirical <- temp[col_empirical, col_empirical]

  # Upper triangle = each replicate pair once (45 values for 10 replicates).
  corrn <- -(mat_neutral[upper.tri(mat_neutral)]^2 / 2 - 1)
  corre <- -(mat_empirical[upper.tri(mat_empirical)]^2 / 2 - 1)

  # check if the correlation is null per major arm for neutral and empirical data
  test_neutral <- t.test(corrn)
  test_empirical <- t.test(corre)
  print(test_neutral)
  print(test_empirical)
  pn <- c(pn, test_neutral$p.value)
  pe <- c(pe, test_empirical$p.value)
  print("#######################")
}

# ----- wsmooth=500
#> p.adjust(pn, "BH") # tests from neutral data
#[1] 0.936657 0.936657 0.936657
#> p.adjust(pe, "BH") # tests from empirical data
#[1] 1.106596e-65 1.204087e-54 6.273178e-67
  
#[1] "2"
#	One Sample t-test # neutral data
#data:  corrn
#t = -0.81019, df = 44, p-value = 0.4222
#alternative hypothesis: true mean is not equal to 0
#95 percent confidence interval:
# -0.10804982  0.04608616
#sample estimates:
#  mean of x 
#-0.03098183 

#	One Sample t-test #empirical data
#data:  corre
#t = 190.89, df = 44, p-value < 2.2e-16
#alternative hypothesis: true mean is not equal to 0
#95 percent confidence interval:
# 0.8808511 0.8996491
#sample estimates:
#mean of x 
#0.8902501 

#[1] "3"
#	One Sample t-test
#data:  corrn
#t = 0.23961, df = 44, p-value = 0.8117
#alternative hypothesis: true mean is not equal to 0
#95 percent confidence interval:
# -0.06848001  0.08696036
#sample estimates:
#  mean of x 
#0.009240175 

#	One Sample t-test
#data:  corre
#t = 106.02, df = 44, p-value < 2.2e-16
#alternative hypothesis: true mean is not equal to 0
#95 percent confidence interval:
# 0.7816699 0.8119647
#sample estimates:
#mean of x 
#0.7968173 

#[1] "X"
#	One Sample t-test
#data:  corrn
#t = -0.079928, df = 44, p-value = 0.9367
#alternative hypothesis: true mean is not equal to 0
#95 percent confidence interval:
# -0.1143339  0.1056110
#sample estimates:
#   mean of x 
#-0.004361426 

#	One Sample t-test
#data:  corre
#t = 207.01, df = 44, p-value < 2.2e-16
#alternative hypothesis: true mean is not equal to 0
#95 percent confidence interval:
# 0.9132444 0.9312011
#sample estimates:
#mean of x 
#0.9222227 


# ---- Autocorrelation 
# For every chromosome and replicate: find the first index of the sample
# autocorrelation function that falls below the bound qnorm(.975) / sqrt(n),
# then measure the physical distance spanned by that many windows, stepping
# along the chromosome in non-overlapping jumps of `loss` windows.
# Output: one row per (replicate, chromosome) with loss, mean_distance and
# median_distance (same units as `pos`).
autocorr_rows <- list()
for (chr in c("2", "3", "X")) {
  idx <- which(smoothed_obs_29$chr == chr)
  mat_data <- smoothed_obs_29[idx, ]
  mat_data <- mat_data[order(mat_data$pos), ]
  pos <- mat_data[["pos"]]
  for (i in paste0("F20.R", 1:10)) {
    fr <- mat_data[[i]]
    aacf <- acf(fr, lag.max = 175, plot = FALSE)
    below <- which(as.vector(aacf$acf) < qnorm(.975) / sqrt(length(fr)))
    if (length(below) == 0) {
      # Previously min(integer(0)) gave Inf and the loop bound failed with an
      # unrelated-looking error; fail with an explicit message instead.
      stop("autocorrelation of ", i, " on chromosome ", chr,
           " never drops below the significance bound within 175 lags",
           call. = FALSE)
    }
    # NOTE(review): acf() returns lag 0 as its first element, so this index is
    # (lag + 1), not the lag itself. Kept as is to preserve the recorded
    # results -- confirm whether `loss - 1` was intended.
    loss <- min(below)

    # Start rows 1, 1 + loss, 1 + 2 * loss, ... for which the row `loss`
    # windows further still exists. This yields the same distances as stepping
    # row by row and discarding the out-of-range (NA) differences.
    starts <- if (length(fr) > loss) seq(1, length(fr) - loss, by = loss) else integer(0)
    d <- pos[starts + loss] - pos[starts]

    autocorr_rows[[length(autocorr_rows) + 1]] <- data.frame(
      sample = i,
      chr = chr,
      loss = loss,
      mean_distance = mean(d, na.rm = TRUE),
      median_distance = median(d, na.rm = TRUE)
    )
  }
}
df <- do.call(rbind, autocorr_rows)
saveRDS(df, paste0(path_estimates_29_F20, "/autocorrelation_smoothed_w=", wsmooth, "_F20.RDS"))

df <- readRDS(paste0(path_estimates_29_F20, "/autocorrelation_smoothed_w=", wsmooth, "_F20.RDS"))
# --- wsmooth=250
#> median(df$median_distance[which(df$chr == "2")]/1000000)
#[1] 4.544216
#> median(df$median_distance[which(df$chr == "3")]/1000000)
#[1] 3.578521
#> median(df$median_distance[which(df$chr == "X")]/1000000)
#[1] 6.146693


# ---- AFC 
# Per window: median across the 10 replicates of (smoothed frequency - 0.3).
# NOTE(review): 0.3 is presumably the starting allele frequency -- confirm.
delta <- smoothed_obs_29
replicate_afc <- as.matrix(subset(delta, select = paste0("F20.R", 1:10))) - 0.3
# median() is wrapped in a closure so the name is resolved in call position
# (a data frame called `median` is created earlier in this script).
d <- apply(replicate_afc, 1, function(x) median(x))
delta <- cbind(delta, d)
delta <- delta[which(delta$chr != "4"), ]

# One-sided test per chromosome: is the proportion of windows with a positive
# median change greater than 0.5?
pv <- vapply(c("2", "3", "X"), function(chr) {
  d_chr <- delta$d[which(delta$chr == chr)]
  prop.test(length(which(d_chr > 0)), length(d_chr), p = 0.5, alternative = "greater")$p.value
}, numeric(1), USE.NAMES = FALSE)

# --- wsmooth=250
#> p.adjust(pv, "BH")
#[1]  2.906465e-98 1.993135e-120  2.990554e-19
#>  median(delta$d[which(delta$chr=="2")])
#[1] 0.1462772
#>  median(delta$d[which(delta$chr=="3")])
#[1] 0.1534728
#>  median(delta$d[which(delta$chr=="X")])
#[1] 0.294186


# ---- CV across replicates 
# Per window: mean, standard deviation and coefficient of variation (sd / mean)
# of the smoothed frequency over the 10 replicates, added as columns.
replicate_freqs <- as.matrix(smoothed_obs_29[, paste0("F20.R", 1:10), drop = FALSE])
smoothed_obs_29$mean <- apply(replicate_freqs, 1, mean)
smoothed_obs_29$sd <- apply(replicate_freqs, 1, sd)
smoothed_obs_29$CV <- with(smoothed_obs_29, sd / mean)
# --- wsmooth = 250
#> summary(smoothed_obs_29$CV[which(smoothed_obs_29$chr=="2")])
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#0.04295 0.08253 0.10296 0.10989 0.12851 0.22662 
#> summary(smoothed_obs_29$CV[which(smoothed_obs_29$chr=="3")])
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#0.03453 0.07867 0.10719 0.11982 0.15583 0.32189 
#> summary(smoothed_obs_29$CV[which(smoothed_obs_29$chr=="X")])
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#0.02006 0.09625 0.13823 0.15323 0.21380 0.35350 

