######
# Aim: filter SNPs based on depth and QUAL filters
######

# ----- global variables, functions and packages ----- #
source("/Volumes/cluster/Claire/SO_PaperI/manuscript/scripts/global_variables.R")
source("/Volumes/cluster/Claire/SO_PaperI/manuscript/scripts/functions.R")

# --- quality control plots to filter for depth
# For every chromosome in `chrom`, read the VCF and collect one row per site:
# the fixed fields CHROM, POS, REF, ALT, QUAL plus GT, DP, RO and AO of the
# two sample columns S_000 (prefix "sam_") and O_000 (prefix "or_").
# The combined table is saved as chrall_DP_GT.RDS.

# Order of the colon-separated per-sample fields
# (FORMAT "GT:DP:AD:RO:QR:AO:QA:GL", see the example record below).
gt_fields <- c("GT", "DP", "AD", "RO", "QR", "AO", "QA", "GL")

# Example record (fixed fields, FORMAT and the two sample columns):
#  CHROM       POS        ID       REF       ALT      QUAL    FILTER      INFO 
#    "4"   "10999"        NA       "G"       "A" "37.0743"    "PASS"        NA 
#                                FORMAT 
#             "GT:DP:AD:RO:QR:AO:QA:GL" 
#                                 S_000 
#"1/1:2:0,2:0:0:2:73:-5.51298,-0.60206,0" 
#                                 O_000 
#                     ".:.:.:.:.:.:.:." 

# Split one sample column of vcf@gt into a data frame holding the GT, DP, RO
# and AO fields, with column names prefixed by `prefix`.
#   gt     : character vector, one "GT:DP:AD:RO:QR:AO:QA:GL" string per site
#   prefix : column-name prefix, e.g. "sam_" or "or_"
#   fields : names of the colon-separated fields, in order
# DP, AO and RO are converted to numeric; missing values written as "." become
# NA (as.numeric() warns about the coercion).
parse_gt_column <- function(gt, prefix, fields = gt_fields) {
  # vapply() stops with an explicit error if a record does not have exactly
  # length(fields) fields, and still returns a matrix for empty input.
  pieces <- vapply(
    strsplit(gt, ":", fixed = TRUE),
    function(parts) parts,
    character(length(fields)),
    USE.NAMES = FALSE
  )
  # stringsAsFactors = FALSE keeps the columns as character on every R
  # version, so as.numeric() below converts the values, not factor codes.
  out <- as.data.frame(t(pieces), stringsAsFactors = FALSE)
  colnames(out) <- paste0(prefix, fields)
  for (count_col in paste0(prefix, c("DP", "AO", "RO"))) {
    out[[count_col]] <- as.numeric(out[[count_col]])
  }
  out[, paste0(prefix, c("GT", "DP", "RO", "AO")), drop = FALSE]
}

# Collect the per-chromosome tables in a list and bind them once at the end;
# calling rbind() inside the loop copies the growing table on every iteration.
info_list <- list()
for (chr in chrom) {
  print(chr)
  vcf <- read.vcfR(paste0(path_vcf_proc, "chr", chr, "_5bp_bisnp.vcf"), verbose = FALSE)

  fixed_cols <- c("CHROM", "POS", "REF", "ALT", "QUAL")
  info <- as.data.frame(vcf@fix[, fixed_cols, drop = FALSE], stringsAsFactors = FALSE)
  info$POS <- as.numeric(info$POS)
  info$QUAL <- as.numeric(info$QUAL)

  sam <- parse_gt_column(vcf@gt[, "S_000"], "sam_")
  or <- parse_gt_column(vcf@gt[, "O_000"], "or_")

  info <- cbind(info, sam, or)
  rownames(info) <- NULL
  info_list[[length(info_list) + 1L]] <- info
}
info_all <- do.call(rbind, info_list)

# Total observation count (alternate + reference) per sample, and their mean.
info_all$sam_AO_RO <- info_all$sam_AO + info_all$sam_RO
info_all$or_AO_RO <- info_all$or_AO + info_all$or_RO
info_all$mean_AO_RO <- (info_all$sam_AO_RO + info_all$or_AO_RO) / 2
saveRDS(info_all, paste0(path_data_count, "chrall_DP_GT.RDS"))

# ----- remove NA 
# Reload the combined table and drop sites where the mean AO+RO is NA or where
# either sample has an AO+RO of zero.
info_all <- readRDS(paste0(path_data_count, "chrall_DP_GT.RDS")) #791064
unusable <- is.na(info_all$mean_AO_RO) | info_all$sam_AO_RO == 0 | info_all$or_AO_RO == 0 #4286
# Filter with a logical mask instead of info_all[-which(...), ]: when nothing
# matches, which() returns integer(0) and negative indexing with it would drop
# EVERY row. `%in% TRUE` treats an NA condition as "not matched", exactly as
# which() does.
info_all <- info_all[!(unusable %in% TRUE), ] #786778

# ---- create coverage and QUAL filters
# Reference quantiles of AO+RO on the NA-free table (kept for the record):
#quantile(info_all$sam_AO_RO, c(0.01,0.99))
# 1% 99% 
# 11  67 
#quantile(info_all$or_AO_RO, c(0.01,0.99))
# 1% 99% 
# 11  69 
#quantile(info_all$mean_AO_RO, c(0.01, 0.99))
#  1%  99% 
#14.5 63.0 

# Keep sites whose AO+RO lies within [11, 68] in BOTH samples.
sam_depth <- info_all$sam_AO_RO
or_depth <- info_all$or_AO_RO
sam_in_range <- sam_depth >= 11 & sam_depth <= 68
or_in_range <- or_depth >= 11 & or_depth <= 68
ind <- which(sam_in_range & or_in_range)
info_all <- info_all[ind, ] #760906 

# 1% quantile of QUAL among sites with QUAL >= 1 (basis for the QUAL cut-off).
qual_above_one <- info_all$QUAL[info_all$QUAL >= 1]
quantile(qual_above_one, 0.01)
#      1% 
#412.7786 

# QC figure: QUAL histogram (dashed line at the QUAL cut-off) next to the
# histogram of the average AO+RO.
qual_min <- 413
png(paste0(path_data_count, "QUAL_averageAORO.png"))
par(mfrow = c(1, 2))
hist(
  info_all$QUAL,
  breaks = seq(0, 82100, by = 50),
  xlim = c(0, 4500),
  main = "",
  xlab = "QUAL"
)
abline(v = qual_min, lwd = 2, lty = 2)
hist(
  info_all$mean_AO_RO,
  breaks = seq(0, 1700, by = 10),
  xlim = c(0, 150),
  main = "",
  xlab = "Average AO+RO"
)
dev.off()

# Keep sites at or above the QUAL cut-off and save the filtered table.
ind <- which(info_all$QUAL >= qual_min)
info_all <- info_all[ind, ] #684065
saveRDS(info_all, paste0(path_data_count, "chrall_DP_GT_filtered.RDS"))

# --- select parental SNPs per chromosome
# From the depth/QUAL-filtered table, keep sites where one sample is
# homozygous reference ("0/0") and the other homozygous alternate ("1/1"),
# with alternate-allele frequencies consistent with those genotypes
# (<= thresh for the "0/0" sample, >= 1 - thresh for the "1/1" sample).
# `thresh` is not defined in this script; presumably it comes from
# global_variables.R -- TODO confirm.
# Outputs: one chr<chr>_parental.txt per chromosome, chrall_parental.txt,
# counts_snps.txt, and the vectors deltaX / deltaauto of distances between
# consecutive parental SNPs.
info_all <- readRDS(paste0(path_data_count, "chrall_DP_GT_filtered.RDS"))

# Per-chromosome results are collected in lists and bound once after the loop.
parent_list <- list(); count_list <- list(); deltaauto <- c()
for (chr in chrom) {
  print(chr)
  data <- info_all[info_all$CHROM == chr, ]

  # frequency of alternate allele in each parental strain
  data$fr_alt_sam <- data$sam_AO / data$sam_AO_RO
  data$fr_alt_or <- data$or_AO / data$or_AO_RO

  sam_ref_or_alt <- data$sam_GT == "0/0" & data$or_GT == "1/1" &
    data$fr_alt_sam <= thresh & data$fr_alt_or >= (1 - thresh)
  or_ref_sam_alt <- data$or_GT == "0/0" & data$sam_GT == "1/1" &
    data$fr_alt_or <= thresh & data$fr_alt_sam >= (1 - thresh)
  idx <- c(which(sam_ref_or_alt), which(or_ref_sam_alt))
  # Alternative selections that were tried (genotype only / strictly fixed):
  #idx <- c(which(data$sam_GT == "0/0" & data$or_GT == "1/1"),  which(data$or_GT == "0/0" & data$sam_GT == "1/1")) 
  #idx <- c(which(data$sam_GT == "0/0" & data$or_GT == "1/1" & data$fr_alt_sam == 0 & data$fr_alt_or==1), 
  #which(data$or_GT == "0/0" & data$sam_GT == "1/1"& data$fr_alt_or== 0 & data$fr_alt_sam==1))  

  parent <- data[idx, ]
  # Allele carried by each sample: REF by default, ALT where its GT is "1/1".
  sam_is_alt <- which(parent$sam_GT == "1/1")
  or_is_alt <- which(parent$or_GT == "1/1")
  parent$sam <- parent$REF
  parent$sam[sam_is_alt] <- parent$ALT[sam_is_alt]
  parent$or <- parent$REF
  parent$or[or_is_alt] <- parent$ALT[or_is_alt]
  parent <- parent[order(parent$POS), ]
  write.table(parent, paste0(path_list, "chr", chr, "_parental.txt"),
              sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)

  # Genotype-combination counts for this chromosome.
  homozygous_or_missing <- c("0/0", ".", "1/1")
  count_list[[length(count_list) + 1L]] <- data.frame(
    chr = chr,
    processed = nrow(data),
    same_00_geno_Sam_Or = sum(data$sam_GT == "0/0" & data$or_GT == "0/0", na.rm = TRUE),
    same_11_geno_Sam_Or = sum(data$sam_GT == "1/1" & data$or_GT == "1/1", na.rm = TRUE),
    different_01_geno_Sam_Or_only =
      sum(data$sam_GT == "0/1" & data$or_GT %in% homozygous_or_missing, na.rm = TRUE) +
      sum(data$or_GT == "0/1" & data$sam_GT %in% homozygous_or_missing, na.rm = TRUE),
    same_01_geno_Sam_Or_only = sum(data$sam_GT == "0/1" & data$or_GT == "0/1", na.rm = TRUE),
    final = nrow(parent)
  )

  # Distance in bp between consecutive parental SNPs. diff() returns an empty
  # vector for fewer than two SNPs, whereas POS[2:n] - POS[1:(n - 1)] indexes
  # backwards (2:1, 1:0) and yields spurious values in that case.
  snp_gaps <- diff(parent$POS)
  if (chr == "X") {
    deltaX <- snp_gaps
  }
  if (chr %in% c("2L", "2R", "3L", "3R")) {
    deltaauto <- c(deltaauto, snp_gaps)
  }

  parent_list[[length(parent_list) + 1L]] <- parent
}
parents <- do.call(rbind, parent_list)
sum_count <- do.call(rbind, count_list)
write.table(parents, paste0(path_list, "chrall_parental.txt"),
            sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
write.table(sum_count, paste0(path_list, "counts_snps.txt"),
            sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)

# ---- average distance in bp between consecutive SNPs
#> mean(deltaX)
#[1] 362.905
#> mean(deltaauto)
#[1] 270.8511

# number of parental SNPs per arm (1st number)
#   2L    2R    3L    3R     4     X 
#100315  89955 107133 103777     72  63818 

# ----- Header of parental file:
#   CHROM  POS REF ALT    QUAL sam_GT sam_DP sam_RO sam_AO or_GT or_DP or_RO
# 1    2L 5390   T   A 1623.70    1/1     51      0     51   0/0    50    50
# 2    2L 5762   T   C 1431.94    1/1     45      0     45   0/0    50    50
# 3    2L 5904   C   A 1079.20    1/1     33      0     33   0/0    57    57
# 4    2L 5933   A   T 1107.15    1/1     35      0     35   0/0    48    48
# 5    2L 5974   C   T 1280.49    1/1     41      0     41   0/0    41    41
# 6    2L 5992   C   T 1299.33    1/1     40      0     40   0/0    36    36
#   or_AO sam_AO_RO or_AO_RO mean_AO_RO fr_alt_sam fr_alt_or sam or
# 1     0        51       50       50.5          1         0   A  T
# 2     0        45       50       47.5          1         0   C  T
# 3     0        33       57       45.0          1         0   A  C
# 4     0        35       48       41.5          1         0   T  A
# 5     0        41       41       41.0          1         0   T  C
# 6     0        40       36       38.0          1         0   T  C
# 6:          0         1   G  C