## count CO for list of SNPs
## across cells for each sample

## generate snp_gr
## and only for good cells

## date OCT/19/2020

suppressPackageStartupMessages({
  library(dplyr)
  library(ggplot2)
  library(doParallel)
  library(foreach)
  library(GenomicRanges)
  library(GenomeInfoDb)
  library(IRanges)
})

## Command-line arguments are given as R assignments, one per argument
## (name=value). Each argument is evaluated as R code so that it defines
## a variable of that name for the rest of the script.
## NOTE(review): eval(parse()) executes arbitrary R code taken from the
## command line. It is kept because it is this script's calling convention;
## only run the script with trusted arguments.
args <- commandArgs(trailingOnly = TRUE)
for (arg_expr in args) {
  eval(parse(text = arg_expr))
}

## Every setting listed here is read further down in the script. Report all
## missing ones at once instead of failing on the first undefined object.
required_args <- c("threads", "chr", "filePath", "fvbRateFile",
                   "sample_meta_file", "coCountRDS", "png_dir", "bcfResult",
                   "binsize", "refgenome", "cobinCountRDS")
script_env <- environment()
is_defined <- vapply(required_args, exists, logical(1), envir = script_env)
missing_args <- required_args[!is_defined]
if (length(missing_args) > 0) {
  stop("Missing command-line argument(s): ",
       paste(missing_args, collapse = ", "),
       call. = FALSE)
}

## Echo the run configuration, one value per print() call, in a fixed order.
for (setting in c("threads", "chr", "filePath", "fvbRateFile")) {
  print(get(setting))
}
# print(badSampleChr)
for (setting in c("sample_meta_file", "coCountRDS", "png_dir", "bcfResult",
                  "binsize", "refgenome", "cobinCountRDS")) {
  print(get(setting))
}

## The bin size is used as a number further down; coerce it once here.
binsize <- as.numeric(binsize)

## binsize 

## Number of parallel workers, taken from the `threads` setting.
ncluster <- as.numeric(threads)

## Create the worker cluster and register it as the backend for %dopar%.
cl <- parallel::makeCluster(ncluster)
doParallel::registerDoParallel(cl = cl)

## Sample sheet: keep each value of its `sample_name` column once.
sample_meta <- read.table(sample_meta_file,
                          header = TRUE,
                          stringsAsFactors = FALSE)
sampleNames <- unique(sample_meta$sample_name)

## fvbRateFile supplies the SNP identifiers used to build the SNP GRanges.
snp_ids <- read.table(file = fvbRateFile,
                      header = TRUE,
                      stringsAsFactors = FALSE)$SNP

## No bad-cell filtering is applied at the moment.
#badCells <- read.table(badSampleChr,stringsAsFactors = F,header = T)

## The position is the second "_"-separated field of each SNP identifier.
snps_pos <- as.numeric(
  vapply(strsplit(snp_ids, "_"),
         function(id_fields) id_fields[[2]],
         character(1))
)

## One width-1 range per SNP; every SNP is placed on `chr`.
## NOTE(review): this assumes fvbRateFile only lists SNPs on `chr` - confirm.
snp_gr <- GenomicRanges::GRanges(
  seqnames = chr,
  ranges = IRanges::IRanges(start = snps_pos, width = 1)
)

## cut genome into equal sized bins


## Chromosome table for `refgenome`, fetched from UCSC.
## NOTE(review): recent GenomeInfoDb releases appear to favour
## getChromInfoFromUCSC() over this function - confirm before upgrading.
chrom_info <- GenomeInfoDb::fetchExtendedChromInfoFromUCSC(refgenome)
## Drop every sequence whose UCSC name contains "_".
chrom_info <- chrom_info[grep("_", chrom_info$UCSC_seqlevel, invert = TRUE), ]

## Make the naming style match snp_gr: if its sequence names carry no "chr"
## prefix, strip "chr" from the UCSC names as well.
if (!grepl("chr", as.character(seqnames(snp_gr)[1]))) {
  chrom_info$UCSC_seqlevel <- gsub("chr", "", chrom_info$UCSC_seqlevel)
}

## Keep only the sequences that actually occur in snp_gr.
snp_seqlevels <- GenomeInfoDb::seqlevels(snp_gr)
chrom_info <- chrom_info[chrom_info$UCSC_seqlevel %in% snp_seqlevels, ]

## Without a match the genome object below would be empty and the binning
## step would only fail after all the parallel work has been done.
if (nrow(chrom_info) == 0) {
  stop("Sequence name(s) ", paste(snp_seqlevels, collapse = ", "),
       " not found in the UCSC chromosome table for '", refgenome, "'",
       call. = FALSE)
}

## Named vector of sequence lengths, then one full-length range per sequence.
seq_length <- chrom_info$UCSC_seqlength
names(seq_length) <- chrom_info$UCSC_seqlevel

dna_mm10_gr <- GenomicRanges::GRanges(
  seqnames = Rle(names(seq_length)),
  ranges = IRanges(1, end = seq_length, names = names(seq_length)),
  seqlengths = seq_length)
GenomeInfoDb::genome(dna_mm10_gr) <- refgenome
#dna_mm10_gr



## For every sample (in parallel): read its per-SNP state table, save a
## diagnostic plot, obtain its crossover (CO) intervals, and spread each CO
## interval over the SNPs it overlaps.
## Each iteration returns a one-column data.frame named after the sample,
## with one row per SNP in snp_gr (row names "<chr>_<pos>"). A value is the
## share of a CO interval assigned to the stretch that ends at that SNP,
## and 0 where no CO interval applies.
crossover_counts <-
  foreach(fromSid = sampleNames,
          .packages = c("GenomicRanges", "IRanges", "dplyr", "ggplot2")) %dopar% {
    ## Common prefix of all per-sample, per-chromosome input/output files.
    sample_prefix <- paste0(filePath, fromSid, "/", fromSid, "_", chr, bcfResult)
    vi_tsv <- paste0(sample_prefix, "_dp2_postvi.tsv")
    vi_tsv_filtered <- paste0(sample_prefix, "_dp2_postvi-filtered.tsv")

    stopifnot(file.exists(vi_tsv))

    ## Prefer the filtered state table when it exists.
    vi_input <- if (file.exists(vi_tsv_filtered)) vi_tsv_filtered else vi_tsv
    vi_df <- read.table(file = vi_input, header = TRUE, stringsAsFactors = FALSE)

    coOnly <- paste0(sample_prefix, "_dp2_postvi-co-only.tsv")
    coOnly_filtered <- paste0(sample_prefix, "_dp2_postvi-filtered-co-only.tsv")
    vi_png_filtered <- paste0(png_dir, fromSid, "-", chr, bcfResult,
                              "_dp2_postvi-filtered.png")

    ## State along the chromosome, coloured by ALT_ratio. The plot object is
    ## passed to ggsave() explicitly rather than relying on last_plot().
    state_plot <- ggplot(data = vi_df) +
      geom_point(mapping = aes(x = Pos, y = state, color = ALT_ratio),
                 size = 0.3) +
      scale_color_continuous(type = "viridis") +
      theme_classic() +
      xlab(chr) +
      ylab(paste0(fromSid, " State"))
    ggsave(vi_png_filtered, plot = state_plot, dpi = 90, width = 14, height = 4)

    if (file.exists(coOnly_filtered)) {
      co_df <- read.table(file = coOnly_filtered, header = TRUE,
                          stringsAsFactors = FALSE)
    } else {
      ## A CO is a row whose state differs from the previous row's state;
      ## Prev is the previous row's position. The first row has no
      ## predecessor (NA) and is dropped by filter().
      co_df <- vi_df %>%
        mutate(CO = (dplyr::lag(state) != state),
               Prev = dplyr::lag(Pos)) %>%
        filter(CO)
      write.table(co_df, file = coOnly,
                  row.names = FALSE, col.names = TRUE, quote = FALSE)
    }

    ## Result template: one row per SNP, no crossover assigned yet.
    re_df <- data.frame(chr = as.character(GenomicRanges::seqnames(snp_gr)),
                        Pos = IRanges::start(GenomicRanges::ranges(snp_gr)),
                        crossovers = 0)

    ## co_df has zero rows when no CO was detected for this sample.
    if (nrow(co_df) != 0) {
      ## One range per CO, from the previous position to the position where
      ## the state changed.
      co_gr <- GenomicRanges::GRanges(
        seqnames = chr,
        ranges = IRanges::IRanges(start = co_df$Prev, end = co_df$Pos),
        coid = paste0("id", seq_len(nrow(co_df))))

      ## The "snp_gr." / "co_gr." column prefixes used below come from the
      ## names of the two objects passed here, so do not rename them.
      mapped_marker_state <- IRanges::mergeByOverlaps(snp_gr, co_gr)
      mapped_marker_state <- as.data.frame(mapped_marker_state)

      ## Within each CO interval, a SNP receives the distance to the previous
      ## SNP in that interval divided by the interval length (width - 1).
      mapped_marker_state <- mapped_marker_state %>%
        dplyr::group_by(snp_gr.seqnames, coid) %>%
        mutate(snp_gr.prev = dplyr::lag(snp_gr.start,
                                        default = dplyr::first(snp_gr.start))) %>%
        mutate(len_prop = (snp_gr.start - snp_gr.prev) /
                 (unique(co_gr.width) - 1)) %>%
        dplyr::ungroup()

      ## Drop the first SNP of every interval (its own "previous" SNP, so its
      ## share is 0). Such a SNP can also be the last SNP of the preceding
      ## interval, and its zero row would otherwise overwrite that value.
      mapped_marker_state <-
        mapped_marker_state[mapped_marker_state$snp_gr.start !=
                              mapped_marker_state$snp_gr.prev, ]
      re_df$crossovers[match(mapped_marker_state$snp_gr.start,
                             re_df$Pos)] <- mapped_marker_state$len_prop
    }

    ## Name the value column after the sample and key the rows by SNP.
    colnames(re_df) <- c(colnames(re_df)[1:2], fromSid)
    rownames(re_df) <- paste0(as.character(re_df$chr), "_", re_df$Pos)
    re_df[, 3, drop = FALSE]
  }

## The parallel section is finished; shut the worker processes down.
parallel::stopCluster(cl)


## Column-bind the per-sample results: rows are SNPs, columns are samples.
final_df <- do.call(cbind, crossover_counts)

## Row names are "<seqname>_<position>"; split them once and reuse the parts.
marker_id_parts <- strsplit(rownames(final_df), "_")
gr <- data.frame(
  seqnames = sapply(marker_id_parts, `[[`, 1),
  end = as.numeric(sapply(marker_id_parts, `[[`, 2))
)
## Each interval starts at the previous SNP of the same sequence; the first
## SNP of a sequence starts at its own position.
gr <- dplyr::group_by(gr, seqnames)
gr <- dplyr::mutate(gr,
                    start = dplyr::lag(end, default = dplyr::first(end)))

## Ranges run from the previous SNP up to one base before the current SNP,
## with the per-sample values attached as metadata columns.
co_gr <- GenomicRanges::GRanges(
  seqnames = gr$seqnames,
  ranges = IRanges(start = gr$start,
                   end = gr$end - 1)
)
GenomicRanges::mcols(co_gr) <- final_df

## Order sequence levels and ranges, then discard the zero-width ranges
## (the first SNP of each sequence produces one).
co_gr <- GenomicRanges::sort(GenomeInfoDb::sortSeqlevels(co_gr))
co_gr <- co_gr[IRanges::width(ranges(co_gr)) != 0, ]

saveRDS(co_gr, file = coCountRDS)


####--- binned CO counts ----- ####
## Turn each range's value into a per-bp value by dividing every sample
## column by the range width. The matrix is divided by the width vector
## directly (recycled down each column), which keeps the
## ranges-by-samples shape even when co_gr holds a single range;
## apply(..., 2, f) would collapse that case to a plain vector.
GenomicRanges::mcols(co_gr) <-
  as.matrix(GenomicRanges::mcols(co_gr)) / GenomicRanges::width(co_gr)

## Tile the reference sequence(s) into bins of roughly `binsize` bp.
tilewidth <- binsize
tiles <- GenomicRanges::tileGenome(seqinfo(dna_mm10_gr), tilewidth = tilewidth)
binned_dna_mm10_gr <- unlist(tiles)

## For each sample column: build a per-base weighted coverage from the
## per-bp values, average it within every bin, and multiply by the bin
## width to turn the average back into a per-bin total.
bin_dist <- lapply(colnames(mcols(co_gr)), function(group_col) {
  dist_rle <- GenomicRanges::coverage(co_gr, weight = mcols(co_gr)[, group_col])
  dist_bined <- binnedAverage(binned_dna_mm10_gr, dist_rle,
                              "dist_bin_ave")

  dist_bined$dist_bin_ave * width(dist_bined)
})

## One metadata column per sample on the bins, named as in co_gr.
mcols(binned_dna_mm10_gr) <- do.call(cbind, bin_dist)
colnames(mcols(binned_dna_mm10_gr)) <- colnames(mcols(co_gr))
saveRDS(binned_dna_mm10_gr, file = cobinCountRDS)
