## date 2021-08-02
## 2022-Mar-17
## Find bin states across cells for each sample to determine whether there is segregation bias
#.libPaths("/mnt/beegfs/mccarthy/scratch/general/rlyu/Software/R/4.1.0/")
suppressPackageStartupMessages({
  library(comapr)
  library(GenomeInfoDb)
  library(foreach)
  library(doParallel)
  })
## Register the doParallel backend that the %dopar% loops below run on,
## requesting 8 workers.
#cl <- makeCluster(6)
registerDoParallel(cores = 8)

## Target width (bp) of the genome bins.
bin_width <- 1e7

## Chromosome names and sizes of the mm10 assembly as reported by UCSC.
chrom_info <- GenomeInfoDb::getChromInfoFromUCSC("mm10")
## Drop every sequence whose name contains "_" (intended to keep chr1-chrM
## only).
chrom_info <- chrom_info[grep("_", chrom_info$chrom, invert = TRUE), ]

seq_length <- chrom_info$size
names(seq_length) <- chrom_info$chrom

## Select the 19 autosomes by name instead of by position, so the result does
## not depend on the row order of the UCSC table; fail loudly if one is
## missing.
autosome_names <- paste0("chr", seq_len(19))
stopifnot(all(autosome_names %in% names(seq_length)))
seq_length_auto <- seq_length[autosome_names]

## One range per autosome, spanning the whole chromosome; mainly a carrier for
## the seqinfo/seqlengths used below and by the per-cell code.
dna_mm10_gr_auto <- GenomicRanges::GRanges(
  seqnames = Rle(names(seq_length_auto)),
  ranges = IRanges(1, end = seq_length_auto, names = names(seq_length_auto)),
  seqlengths = seq_length_auto
)

## Tile the autosomes and flatten the tiles into a single GRanges of bins.
## NOTE(review): tileGenome() is called with its default
## cut.last.tile.in.chrom = FALSE, so a tile may straddle a chromosome boundary
## and then contributes one (shorter) range per chromosome after unlist().
## Confirm this is the intended binning before changing it, since downstream
## outputs depend on the bin coordinates.
tiles <- GenomicRanges::tileGenome(
  seqinfo(dna_mm10_gr_auto),
  tilewidth = bin_width
)
binned_dna_mm10_gr <- unlist(tiles)
## Earlier input file, kept for reference:
#goodbarcodes <- readRDS(file  = "/mnt/mcfiles/rlyu/Projects/rejy_2020_single-sperm-co-calling/output/outputR/analysisRDS/allSamples.setting4.rds")
## Crossover count object for all samples. The code below reads its `barcodes`
## and `sampleGroup` fields and passes it to getCellCORange() as `co_count`.
counts_rds <- "/mnt/mcfiles/rlyu/Projects/rejy_2020_single-sperm-co-calling/output/outputR/analysisRDS/countsAll-settings4.3-scCNV-CO-counts_07-mar-2022.rds"
goodbarcodes <- readRDS(file = counts_rds)

## ---------------------------------------------------------------------------
## Haplotype state of every genome bin, per cell, per chromosome, per sample.
##
## For each sample and each autosome (chr1-chr19):
##   1. read the per-SNP state matrix (`*_vi.mtx`), the SNP annotation
##      (`*_snpAnnot.txt`) and the sample's barcode list (`*_min50k.txt`);
##   2. for each selected cell, split the chromosome into crossover intervals
##      ("inCO", from getCellCORange()) and the gaps between them ("nonCO"),
##      and call each nonCO segment "s1" or "s2" by majority vote over the SNP
##      states (1 vs 2) that fall inside it;
##   3. transfer the segment calls onto the bins: a bin gets a state only when
##      it overlaps exactly one segment, otherwise NA (inCO segments are NA);
##   4. save the bins, with one metadata column per cell, to
##      output/outputR/analysisRDS/<sample>_<chr>bin_state_gr-mar_2022.rds.
##
## The state matrix is assumed to have one row per line of the SNP annotation
## file and one column per line of the barcode file, in the same order.
## ---------------------------------------------------------------------------

## Each batch pairs an input directory with the samples stored under it. Both
## batches go through exactly the same processing, so they share one loop.
batches <- list(
  list(
    filePath = "/mnt/mcfiles/rlyu/Projects/rejy_2020_single-sperm-co-calling/output/secondBatch_mar2022_50k//",
    samples = c("WC_CNV_42", "WC_CNV_43", "WC_CNV_44", "WC_CNV_53")
  ),
  list(
    filePath = "/mnt/mcfiles/rlyu/Projects/rejy_2020_single-sperm-co-calling/output/firstBatch_march2022_50k/",
    samples = c("WC_522", "WC_526")
  )
)

for (batch in batches) {
  filePath <- batch$filePath

  for (sampleName in batch$samples) {
    sampleDir <- paste0(filePath, sampleName, "/")

    ## Cells of this sample that are present in the crossover count object.
    sampleBCs <- goodbarcodes$barcodes[goodbarcodes$sampleGroup == sampleName]
    ## Barcode list used to locate a cell's column in the state matrix.
    barcodes <- read.table(file = paste0(sampleDir, sampleName, "_min50k.txt"))

    for (chrName in paste0("chr", seq_len(19))) {
      chrPrefix <- paste0(sampleDir, sampleName, "_", chrName)

      vi_state <- Matrix::readMM(file = paste0(chrPrefix, "_vi.mtx"))
      snp_pos <- read.table(
        file = paste0(chrPrefix, "_snpAnnot.txt"),
        header = TRUE
      )
      ## The per-cell code indexes matrix rows by SNP index, so the two inputs
      ## must describe the same SNPs.
      if (nrow(vi_state) != nrow(snp_pos)) {
        stop(
          "SNP state matrix and SNP annotation differ in length for ",
          sampleName, " ", chrName,
          call. = FALSE
        )
      }

      snp_gr <- GenomicRanges::GRanges(
        seqnames = chrName,
        ranges = IRanges(snp_pos$POS, width = 1)
      )
      chr_bin_gr <- subset(binned_dna_mm10_gr, seqnames == chrName)

      ## One single-column data.frame per cell, column-bound across cells.
      chr_bin_state_bcs <- foreach(
        bc = sampleBCs,
        .combine = "cbind",
        .packages = c("GenomicRanges", "comapr")
      ) %dopar% {
        ## Crossover intervals of this cell plus the gaps between them, so
        ## that the segments cover every chromosome end to end.
        seg_gr <- getCellCORange(co_count = goodbarcodes, cellBarcode = bc)
        seqlengths(seg_gr) <- seqlengths(dna_mm10_gr_auto)
        non_COintervals <- subset(GenomicRanges::gaps(seg_gr), strand == "*")
        mcols(seg_gr) <- data.frame(type = "inCO")
        mcols(non_COintervals) <- data.frame(type = "nonCO")

        whole_gr <- sort(c(seg_gr, non_COintervals))
        chr_seg_gr <- subset(whole_gr, seqnames == chrName)

        ## Column of this cell in the state matrix.
        bc_col <- match(bc, barcodes$V1)
        if (is.na(bc_col)) {
          stop(
            "Barcode ", bc, " not found in the barcode file of ", sampleName,
            call. = FALSE
          )
        }
        cell_state <- vi_state[, bc_col]

        ## Pair every SNP with the segment it falls in through the overlap
        ## indices themselves, rather than assuming the hits enumerate all
        ## SNPs once and in file order.
        snp_hits <- findOverlaps(chr_seg_gr, snp_gr)
        hit_seg <- queryHits(snp_hits)
        hit_state <- cell_state[subjectHits(snp_hits)]

        ## Majority vote per segment. Ties, and segments without any SNP in
        ## state 1 or 2, stay NA instead of defaulting to "s2", which would
        ## bias the haplotype counts towards one state.
        n_seg <- length(chr_seg_gr)
        n_s1 <- tabulate(hit_seg[hit_state == 1], nbins = n_seg)
        n_s2 <- tabulate(hit_seg[hit_state == 2], nbins = n_seg)
        seg_call <- rep(NA_character_, n_seg)
        seg_call[n_s1 > n_s2] <- "s1"
        seg_call[n_s2 > n_s1] <- "s2"

        chr_seg_gr$seg_state <- seg_call
        ## Crossover intervals have no defined haplotype state.
        chr_seg_gr$seg_state[chr_seg_gr$type == "inCO"] <- NA

        ## A bin inherits a state only when it lies within a single segment;
        ## bins touching several segments stay NA. The vector always has one
        ## entry per bin, which the mcols() assignment below requires.
        bin_hits <- findOverlaps(chr_bin_gr, chr_seg_gr)
        n_bins <- length(chr_bin_gr)
        hits_per_bin <- tabulate(queryHits(bin_hits), nbins = n_bins)
        unique_hits <- bin_hits[hits_per_bin[queryHits(bin_hits)] == 1L]
        bin_state <- rep(NA_character_, n_bins)
        bin_state[queryHits(unique_hits)] <-
          chr_seg_gr$seg_state[subjectHits(unique_hits)]

        ## Name the column after the barcode (set afterwards so the barcode
        ## text is kept verbatim).
        rt <- data.frame(bc = bin_state)
        colnames(rt) <- bc
        rt
      }

      mcols(chr_bin_gr) <- chr_bin_state_bcs

      saveRDS(
        chr_bin_gr,
        file = paste0(
          "output/outputR/analysisRDS/",
          sampleName, "_",
          chrName, "bin_state_gr-mar_2022.rds"
        )
      )
    }
  }
}
