## date 2021-08-02
## Find bin states across cells for each sample to determine whether
## there is segregation bias
.libPaths("/mnt/beegfs/mccarthy/scratch/general/rlyu/Software/R/4.1.0/")
suppressPackageStartupMessages({
  library(comapr)
  library(GenomeInfoDb)
  library(foreach)
  library(doParallel)
})
#cl <- makeCluster(6)
## Register a parallel backend with 6 workers for the %dopar% loops below.
registerDoParallel(6)

## Width (in bp) of the bins tiled along each autosome.
bin_width <- 1e7

## Chromosome names and sizes for the mm10 assembly.
chrom_info <- GenomeInfoDb::getChromInfoFromUCSC("mm10")
## Drop every sequence whose name contains "_" (original note: this keeps
## only chr1-M).
chrom_info <- chrom_info[grep("_", chrom_info$chrom, invert = TRUE), ]

seq_length <- chrom_info$size
names(seq_length) <- chrom_info$chrom

## Pick the 19 autosomes by name instead of by position, so the selection
## does not silently depend on the row order of `chrom_info`.
autosomes <- paste0("chr", seq_len(19))
stopifnot(all(autosomes %in% names(seq_length)))
seq_length_auto <- seq_length[autosomes]

## One full-length range per autosome; it is used below to derive the
## tiling and as the source of seqlengths for per-sample segment ranges.
dna_mm10_gr_auto <- GenomicRanges::GRanges(
  seqnames = Rle(names(seq_length_auto)),
  ranges = IRanges(1, end = seq_length_auto, names = names(seq_length_auto)),
  seqlengths = seq_length_auto)

## Tile the autosomes into bins of (about) `bin_width` bp and flatten the
## resulting list into a single GRanges.
tiles <- GenomicRanges::tileGenome(seqinfo(dna_mm10_gr_auto),
                                   tilewidth = bin_width)
binned_dna_mm10_gr <- unlist(tiles)

## Per-sample crossover counts; the loops below read its column names and
## its `batch` / `Sid` columns.
co_count <- readRDS(file = "output/outputR/analysisRDS/all_rse_count_07-20.rds")

## Compute, for one sample and one chromosome, three per-bin vectors:
##   state  - "s1"/"s2" call of the single segment covering the bin
##   sum_s1 - number of SNPs with state 1 in that segment
##   sum_s2 - number of SNPs with state 2 in that segment
## A bin gets NA when it overlaps more than one segment (or none), when its
## segment is labelled "inCO", or when its segment contains no SNPs.
##
## Arguments:
##   sid           column name of `co_count` identifying the sample
##   chrName       chromosome name, e.g. "chr1"
##   chr_bin_gr    GRanges of the bins on `chrName`
##   co_count      object read from all_rse_count_07-20.rds; must provide
##                 `batch` and `Sid` and be accepted by getCellCORange()
##   chrom_lengths named seqlengths assigned to the sample's segment ranges
## Returns a list with elements `state`, `sum_s1`, `sum_s2`, each of length
## length(chr_bin_gr). Every input is passed explicitly so the function can
## run unchanged inside a parallel worker.
summarise_sample_bins <- function(sid, chrName, chr_bin_gr, co_count,
                                  chrom_lengths) {
  paths <- c("agrf" = "output/bulkDNAseq_agrf/chr_cos/bcftools/",
             "bgi20" = "output/bulkDNAseq_BGI/F20FTSAPHT0350_MUSyfqR/chr_cos/bcftools/",
             "bgi21" = "output/bulkDNAseq_BC1F1BGI2021/chr_cos/bcftools/",
             "bgi202b" = "output/bulkDNAseq_BGI/F21FTSAPHT0641_MOUobtfR/chr_cos/bcftools/")
  ## as.character() guards against `batch` being a factor: indexing `paths`
  ## with a factor would use its integer codes, not its labels.
  batch <- as.character(co_count$batch[co_count$Sid == sid])
  stopifnot(length(batch) == 1, batch %in% names(paths))
  filePath <- paths[batch]
  sampleName <- gsub("sid", "", sid)
  print(sampleName)

  vi_state <- read.table(file = paste0(filePath, sampleName,
                                       "/", sampleName,
                                       "_", chrName, "_bcf_dp2_postvi.tsv"),
                         header = TRUE)
  stopifnot(nrow(vi_state) > 0,
            all(c("Pos", "state") %in% colnames(vi_state)))
  snp_gr <- GenomicRanges::GRanges(seqnames = chrName,
                                   ranges = IRanges(vi_state$Pos, width = 1))

  ## Ranges returned by getCellCORange() are labelled "inCO"; the gaps
  ## between them (up to the chromosome ends) are labelled "nonCO".
  ## NOTE(review): presumably these are the sample's crossover intervals -
  ## confirm against comapr::getCellCORange.
  seg_gr <- getCellCORange(co_count = co_count, cellBarcode = sid)
  seqlengths(seg_gr) <- chrom_lengths
  non_COintervals <- subset(GenomicRanges::gaps(seg_gr), strand == "*")
  mcols(seg_gr) <- data.frame(type = "inCO")
  mcols(non_COintervals) <- data.frame(type = "nonCO")
  whole_gr <- sort(c(seg_gr, non_COintervals))
  chr_seg_gr <- subset(whole_gr, seqnames == chrName)

  ## Pair each SNP with the segment it falls in. States are looked up via
  ## the subject index of each hit, so the pairing stays correct even if
  ## the SNP table is unsorted or a SNP overlaps zero or several segments.
  snp_hits <- findOverlaps(chr_seg_gr, snp_gr)
  snp_by_seg <- data.frame(snp_state = vi_state$state[subjectHits(snp_hits)],
                           seg_id = queryHits(snp_hits))
  seg_summary <- dplyr::summarise(
    dplyr::group_by(snp_by_seg, seg_id),
    ## ties are called "s2", as state 1 must be strictly more frequent
    seg_state = ifelse(sum(snp_state == 1) > sum(snp_state == 2),
                       "s1", "s2"),
    sum_s1 = sum(snp_state == 1),
    sum_s2 = sum(snp_state == 2))

  ## Expand the summary to one value per segment; segments without SNPs
  ## and "inCO" segments are NA.
  n_seg <- length(chr_seg_gr)
  in_co <- chr_seg_gr$type == "inCO"
  seg_state <- rep(NA_character_, n_seg)
  seg_sum_s1 <- rep(NA_integer_, n_seg)
  seg_sum_s2 <- rep(NA_integer_, n_seg)
  seg_state[seg_summary$seg_id] <- seg_summary$seg_state
  seg_sum_s1[seg_summary$seg_id] <- seg_summary$sum_s1
  seg_sum_s2[seg_summary$seg_id] <- seg_summary$sum_s2
  seg_state[in_co] <- NA
  seg_sum_s1[in_co] <- NA
  seg_sum_s2[in_co] <- NA

  ## For every bin, the index of the one segment that overlaps it, or NA if
  ## the bin overlaps several segments (or none). This always yields exactly
  ## one entry per bin, in bin order.
  bin_hits <- findOverlaps(chr_bin_gr, chr_seg_gr)
  bin_idx <- queryHits(bin_hits)
  seg_idx <- subjectHits(bin_hits)
  n_bins <- length(chr_bin_gr)
  single_hit <- tabulate(bin_idx, nbins = n_bins)[bin_idx] == 1L
  seg_for_bin <- rep(NA_integer_, n_bins)
  seg_for_bin[bin_idx[single_hit]] <- seg_idx[single_hit]

  list(state = seg_state[seg_for_bin],
       sum_s1 = seg_sum_s1[seg_for_bin],
       sum_s2 = seg_sum_s2[seg_for_bin])
}

## File-name suffix of the RDS written for each per-bin quantity.
bin_output_suffix <- c(state = "bin_state_gr.rds",
                       sum_s1 = "bin_s1sum_gr.rds",
                       sum_s2 = "bin_s2sum_gr.rds")

chrom_lengths <- seqlengths(dna_mm10_gr_auto)

## For every autosome: summarise all samples in parallel (each input file is
## read once), then save one GRanges per quantity whose metadata columns are
## the samples.
for (chrName in paste0("chr", seq_len(19))) {
  chr_bin_gr <- subset(binned_dna_mm10_gr, seqnames == chrName)

  per_sample <- foreach(sid = colnames(co_count),
                        .packages = c("GenomicRanges", "dplyr", "comapr")) %dopar% {
    summarise_sample_bins(sid = sid, chrName = chrName,
                          chr_bin_gr = chr_bin_gr, co_count = co_count,
                          chrom_lengths = chrom_lengths)
  }
  names(per_sample) <- colnames(co_count)

  for (value_name in names(bin_output_suffix)) {
    ## bins x samples table; check.names = FALSE keeps sample ids verbatim
    bin_values <- data.frame(lapply(per_sample, `[[`, value_name),
                             check.names = FALSE)
    colnames(bin_values) <- names(per_sample)

    value_gr <- chr_bin_gr
    mcols(value_gr) <- bin_values
    saveRDS(value_gr,
            file = paste0("output/outputR/analysisRDS/bc1f1_", chrName,
                          bin_output_suffix[[value_name]]))
  }
}

