## Count the SNPs and crossovers (COs) inferred by the HMM state inference for
## each sample, so that outlier samples can be identified.
## Also generates -postvi-co.tsv, which contains only the CO positions from the
## HMM output; these positions should be filtered later.

## Date Oct/19/2020
## Author Ruqian Lyu


suppressPackageStartupMessages({
  library(dplyr)
  library(tidyr)
  library(ggplot2)
  library(doParallel)
  library(foreach)
})

## Evaluate every trailing command-line argument as R code in the current
## environment. The rest of the script reads variables such as `chr` and
## `threads`, so the arguments are presumably assignments that define them
## (TODO confirm the exact form against the caller).
## NOTE(security): eval(parse()) executes whatever text is passed on the
## command line, so this script must only be launched with trusted arguments.
args <- commandArgs(trailingOnly = TRUE)
for (i in seq_along(args)) {
  eval(parse(text = args[[i]]))
}

## Echo every run parameter to stdout, in a fixed order, so the values used
## for this run are recorded in the log.
for (param_name in c("sample_meta_file", "chr", "threads", "outTSV",
                     "badSampleChr", "out_png", "filePath", "bin_size",
                     "bcfResult")) {
  print(get(param_name))
}
## `threads` is the number of parallel workers; `bin_size` is the number of
## SNPs per bin used by the noisy-bin computation further down.
ncluster <- as.numeric(threads)
bin_size <- as.numeric(bin_size)

## Fail early with a clear message: a non-numeric value becomes NA above, and
## an NA or non-positive bin size would otherwise silently produce meaningless
## bins instead of an error.
if (length(ncluster) != 1 || is.na(ncluster) || ncluster < 1) {
  stop("`threads` must be a single number >= 1", call. = FALSE)
}
if (length(bin_size) != 1 || is.na(bin_size) || bin_size <= 0) {
  stop("`bin_size` must be a single positive number", call. = FALSE)
}

## Start the workers and register them as the backend used by %dopar%.
cl <- makeCluster(ncluster)
registerDoParallel(cl)

## Read the sample sheet (a table with a header row) and keep each value of
## its `sample_name` column once.
sampleNames <- unique(
  read.table(sample_meta_file,
             header = TRUE,
             stringsAsFactors = FALSE)$sample_name
)
## Summarise one HMM output table.
##
## tsv_path: path to a table with a header row that has at least the columns
##           `state` and `GT`.
## bin_size: number of consecutive rows (SNPs) per bin.
##
## Returns a named integer vector:
##   nSNP      - number of rows in the table
##   nCO       - number of rows whose `state` differs from the previous row
##   nNoisyBin - number of rows that fall in a bin whose fraction of
##               GT == "1/1" (count divided by bin_size) lies strictly
##               between 0.3 and 0.8
## NOTE(review): despite its name, nNoisyBin counts rows (SNPs) inside noisy
## bins, not the bins themselves; this is kept as is so downstream thresholds
## keep their meaning - confirm whether a per-bin count was intended.
## NOTE(review): binID is as.integer(posID / bin_size) with posID starting at
## 1, so the first bin holds bin_size - 1 rows; kept unchanged.
count_hmm_stats <- function(tsv_path, bin_size) {
  cell_tsv <- read.table(file = tsv_path,
                         stringsAsFactors = FALSE,
                         header = TRUE)

  ## The first row has no predecessor (NA comparison) and is not counted.
  n_co <- sum(dplyr::lag(cell_tsv$state) != cell_tsv$state, na.rm = TRUE)

  ## seq_len() keeps this working for a table with zero rows.
  binned <- cell_tsv %>%
    dplyr::mutate(posID = seq_len(dplyr::n()),
                  binID = as.integer(posID / bin_size)) %>%
    dplyr::group_by(binID) %>%
    ## Divide by the configured bin size rather than a hard-coded 100 so the
    ## ratio stays a fraction when bin_size is not 100.
    dplyr::mutate(fvbGTRatio = sum(GT == "1/1") / bin_size) %>%
    dplyr::ungroup()

  c(nSNP = nrow(cell_tsv),
    nCO = n_co,
    nNoisyBin = sum(binned$fvbGTRatio > 0.3 & binned$fvbGTRatio < 0.8))
}

## For every sample build one comma-separated string
## "nSNP,nCO,nNoisyBin,nAfterFilterSNP,nAfterFilterCO,nAfterFilterNoisyBin",
## named by the sample. The helper above is referenced in the loop body, so
## foreach ships it to the workers together with the other globals used here.
count_co_cSNP_chr_cell <- foreach(s_name = sampleNames, .combine = c,
                                  .packages = c("dplyr")) %dopar% {
  coFile <- paste0(filePath, s_name, "/", s_name, "_", chr, bcfResult,
                   "_dp2_postvi.tsv")
  coFilterFile <- paste0(filePath, s_name, "/", s_name, "_", chr, bcfResult,
                         "_dp2_postvi-filtered.tsv")

  ## A missing HMM output is reported as -1 for all three metrics.
  raw_stats <- if (file.exists(coFile)) {
    count_hmm_stats(coFile, bin_size)
  } else {
    c(nSNP = -1L, nCO = -1L, nNoisyBin = -1L)
  }

  ## Without a filtered table the unfiltered metrics are reused.
  filtered_stats <- if (file.exists(coFilterFile)) {
    count_hmm_stats(coFilterFile, bin_size)
  } else {
    raw_stats
  }

  re <- paste(c(raw_stats, filtered_stats), collapse = ",")
  names(re) <- s_name
  re
}

## All parallel work is done; shut the workers down instead of leaving them
## running until the R session ends.
stopCluster(cl)

## Unpack the per-sample "a,b,c,d,e,f" strings into one numeric vector per
## metric. The strings are split once and each field position is pulled out
## across all samples.
stat_fields <- strsplit(count_co_cSNP_chr_cell, ",")
field_as_number <- function(idx) {
  as.numeric(vapply(stat_fields, `[[`, character(1), idx))
}

nSNP <- field_as_number(1)
nCO <- field_as_number(2)
nNoisyBin <- field_as_number(3)
nAfterFilterSNP <- field_as_number(4)
nAfterFilterCO <- field_as_number(5)
nAfterFilterNoisyBin <- field_as_number(6)

## Sample identifiers are carried as the names of the result vector.
Sid <- names(count_co_cSNP_chr_cell)

## One row per sample; column order matches the written TSV.
sampleQC <- data.frame(nSNP, nCO, nNoisyBin, Sid,
                       nAfterFilterSNP, nAfterFilterCO, nAfterFilterNoisyBin,
                       stringsAsFactors = FALSE)
## Per-sample QC bar chart: the crossover count after filtering is drawn
## upwards and the noisy-bin count downwards (negated). Bars are coloured by
## whether the sample has more than 50 noisy bins or at most 1000 SNPs after
## filtering.
## The plot is stored in a variable and handed to ggsave() explicitly. A bare
## ggplot expression at top level is auto-printed when run with Rscript, which
## opens the default graphics device as a side effect, and ggsave() would then
## depend on the implicit "last plot".
qc_plot <- ggplot(data = sampleQC) +
  geom_bar(
    mapping = aes(x = Sid, y = nAfterFilterCO,
                  fill = (nAfterFilterNoisyBin > 50 | nAfterFilterSNP <= 1000)),
    stat = "identity"
  ) +
  geom_bar(
    mapping = aes(x = Sid, y = -nAfterFilterNoisyBin,
                  fill = (nAfterFilterNoisyBin > 50 | nAfterFilterSNP <= 1000)),
    stat = "identity"
  ) +
  theme_bw() +
  ylab("nAfterFilterCO and nAfterFilterNoisyBin (FVBGTratio in every 100SNP in 0.3-0.8)")

## `filename` is spelled out instead of relying on partial matching of `file`.
ggsave(filename = out_png, plot = qc_plot, dpi = 70, width = 12, height = 6)

## Write the QC metrics of every sample.
write.table(sampleQC, file = outTSV, row.names = FALSE, col.names = TRUE)

## Write only the samples flagged as bad for this chromosome: more than 50
## noisy bins after filtering, at most 500 SNPs after filtering, or more than
## 5 crossovers before filtering.
## The first term previously tested nAfterFilterSNP > 50, which together with
## nAfterFilterSNP <= 500 is true for every sample, so all samples were
## written. It now tests the noisy-bin count, matching the flag used for the
## QC plot.
is_bad_sample <- sampleQC$nAfterFilterNoisyBin > 50 |
  sampleQC$nAfterFilterSNP <= 500 |
  sampleQC$nCO > 5

write.table(sampleQC[is_bad_sample, ],
            file = badSampleChr,
            row.names = FALSE, col.names = TRUE)