Getting species annotation for intergenic mapping reads

require(GenomicRanges)
Loading required package: GenomicRanges
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap,
    parApply, parCapply, parLapply, parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames, dirname, do.call,
    duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
    Map, mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply, union, unique, unsplit,
    which.max, which.min

Loading required package: S4Vectors

Attaching package: ‘S4Vectors’

The following objects are masked from ‘package:dplyr’:

    first, rename

The following object is masked from ‘package:tidyr’:

    expand

The following objects are masked from ‘package:base’:

    expand.grid, I, unname

Loading required package: IRanges

Attaching package: ‘IRanges’

The following objects are masked from ‘package:dplyr’:

    collapse, desc, slice

The following object is masked from ‘package:purrr’:

    reduce

Loading required package: GenomeInfoDb
require(GenomicFeatures)
Loading required package: GenomicFeatures
Loading required package: AnnotationDbi
Loading required package: Biobase
Welcome to Bioconductor

    Vignettes contain introductory material; view with 'browseVignettes()'. To cite
    Bioconductor, see 'citation("Biobase")', and for packages 'citation("pkgname")'.


Attaching package: ‘AnnotationDbi’

The following object is masked from ‘package:dplyr’:

    select
require(GenomicAlignments)
Loading required package: GenomicAlignments
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘matrixStats’

The following objects are masked from ‘package:Biobase’:

    anyMissing, rowMedians

The following object is masked from ‘package:dplyr’:

    count


Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse, colCounts, colCummaxs,
    colCummins, colCumprods, colCumsums, colDiffs, colIQRDiffs, colIQRs, colLogSumExps,
    colMadDiffs, colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats, colProds,
    colQuantiles, colRanges, colRanks, colSdDiffs, colSds, colSums2, colTabulates,
    colVarDiffs, colVars, colWeightedMads, colWeightedMeans, colWeightedMedians,
    colWeightedSds, colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods, rowCumsums, rowDiffs,
    rowIQRDiffs, rowIQRs, rowLogSumExps, rowMadDiffs, rowMads, rowMaxs, rowMeans2,
    rowMedians, rowMins, rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
    rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars, rowWeightedMads,
    rowWeightedMeans, rowWeightedMedians, rowWeightedSds, rowWeightedVars

The following object is masked from ‘package:Biobase’:

    rowMedians

Loading required package: Biostrings
Loading required package: XVector

Attaching package: ‘XVector’

The following object is masked from ‘package:purrr’:

    compact


Attaching package: ‘Biostrings’

The following object is masked from ‘package:base’:

    strsplit

Loading required package: Rsamtools

Attaching package: ‘GenomicAlignments’

The following object is masked from ‘package:dplyr’:

    last
require(AnnotationDbi)

library(methods)
library(data.table)
data.table 1.14.0 using 28 threads (see ?getDTthreads).  Latest news: r-datatable.com

Attaching package: ‘data.table’

The following objects are masked from ‘package:GenomicAlignments’:

    first, last, second

The following object is masked from ‘package:SummarizedExperiment’:

    shift

The following object is masked from ‘package:GenomicRanges’:

    shift

The following object is masked from ‘package:IRanges’:

    shift

The following objects are masked from ‘package:S4Vectors’:

    first, second

The following objects are masked from ‘package:dplyr’:

    between, first, last

The following object is masked from ‘package:purrr’:

    transpose
library(yaml)

source("/data/share/htp/prime-seq_Paper/Fig_gdna_priming/analysis/count_intergenic/count_intergenic_functions.R")

outdir<-"/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/"

gtf<-"/data/share/htp/prime-seq_Paper/genomes/HsapMmus/gencode.v35.vM25.primary_assembly.annotation.gtf"
  
  
  txdb <- suppressWarnings(suppressMessages(GenomicFeatures::makeTxDbFromGFF(file=gtf, format="gtf")))

  
  ## Make Gene-range GR-object
  se <- suppressMessages(
    AnnotationDbi::select(txdb, keys(txdb, "GENEID"),
                              columns=c("GENEID","TXCHROM","TXSTART","TXEND","TXSTRAND"),
                              keytype="GENEID") %>%
    dplyr::group_by(GENEID,TXCHROM,TXSTRAND)  %>%
    dplyr::mutate( txstart =ifelse(TXSTART<TXEND,min(TXSTART),min(TXEND)),
                   txend  =ifelse(TXSTART<TXEND,max(TXEND),min(TXSTART) ) ) %>%
    dplyr::select(GENEID,TXCHROM,TXSTRAND,txstart,txend)  %>% unique()
    )

  gr.gene<-GenomicRanges::GRanges(seqnames = se$TXCHROM,
                                  ranges =  IRanges(start= se$txstart,
                                                    end=  se$txend,
                                                    names=se$GENEID),
                                  strand =  se$TXSTRAND,
                                  gid    =  se$GENEID)

  ### Get non-overlapping Introns/Exons
  intron<-GenomicFeatures::intronsByTranscript(txdb, use.names=T)
  exon<-GenomicFeatures::exonsBy(txdb, by="tx",use.names=T)

  intron.exon.red <- c( GenomicRanges::reduce(unlist(intron),ignore.strand=T), GenomicRanges::reduce(unlist(exon),ignore.strand=T) )
  intron.exon.dis <- GenomicRanges::disjoin(intron.exon.red, ignore.strand=T)
  intron.only<-GenomicRanges::setdiff(intron.exon.dis, unlist(exon) ,ignore.strand=T)
  intergenic.only<-gaps(intron.exon.dis)
  
  ol.in<-GenomicRanges::findOverlaps(intron.only, gr.gene, select="arbitrary")
  ol.ex<-GenomicRanges::findOverlaps(unlist(exon), gr.gene, select="arbitrary")
  ol.ig<-GenomicRanges::findOverlaps(intergenic.only, gr.gene, select="arbitrary")

  intron.saf<-data.frame(GeneID= names(gr.gene)[ol.in],
                         Chr   = seqnames(intron.only),
                         Start = start(intron.only),
                         End     =   end(intron.only),stringsAsFactors = F)
  exon.saf<-data.frame(GeneID= names(gr.gene)[ol.ex],
                       Chr   = seqnames(unlist(exon)),
                       Start = start(unlist(exon)),
                       End   =   end(unlist(exon)),
                       Strand =  strand(unlist(exon)),stringsAsFactors = F)

  intergenic.saf<-data.frame(GeneID= seqnames(intergenic.only),
                       Chr   = seqnames(intergenic.only),
                       Start = start(intergenic.only),
                       End   =   end(intergenic.only),
                       Strand = rep("+",times=length(intergenic.only)),
                       stringsAsFactors = F)
  
  intergenic.saf<-rbind(intergenic.saf,data.frame(GeneID= seqnames(intergenic.only),
                       Chr   = seqnames(intergenic.only),
                       Start = start(intergenic.only),
                       End   =   end(intergenic.only),
                       Strand = rep("-",times=length(intergenic.only)),
                       stringsAsFactors = F))
  
  intron.saf<-dplyr::left_join(intron.saf,unique(exon.saf[,c("GeneID","Strand")]),by=c("GeneID"))
  

  

  saf <- list(introns=unique(intron.saf),exons=unique(exon.saf),intergenic=unique(intergenic.saf))
  
saveRDS(saf, file="/data/share/htp/prime-seq_Paper/Fig_gdna_priming/analysis/count_intergenic/inex_intergenic.saf.RDS")
paste0(outdir,project[i])
[1] "/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep1"

#get Intergenic reads from zUMIs-stats2.R
gtf2<-"/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep1/Bulk_opt_gDNA_priming_rep1.final_annot.gtf"
user_seq<- getUserSeq(gtf2) 
spec_seq<- getSpecies(gtf2)

####
bcb<-c("/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep1/zUMIs_output/Bulk_opt_gDNA_priming_rep1kept_barcodes_binned.txt",
       "/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep2/zUMIs_output/Bulk_opt_gDNA_priming_rep2kept_barcodes_binned.txt",
       "/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep3/zUMIs_output/Bulk_opt_gDNA_priming_rep3kept_barcodes_binned.txt")

BCstats<-c("/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep1/Bulk_opt_gDNA_priming_rep1.BCstats.txt",
           "/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep2/Bulk_opt_gDNA_priming_rep2.BCstats.txt",
           "/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep3/Bulk_opt_gDNA_priming_rep3.BCstats.txt")

kbc<-c("/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep1/zUMIs_output/Bulk_opt_gDNA_priming_rep1kept_barcodes.txt",
       "/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep2/zUMIs_output/Bulk_opt_gDNA_priming_rep2kept_barcodes.txt",
       "/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep3/zUMIs_output/Bulk_opt_gDNA_priming_rep3kept_barcodes.txt")

for (i in 1:3){
  bccount<-fread(bcb[i])

bccount<-splitRG(bccount=bccount, mem= 75,read_layout = "SE")

binmap <- BCbin(bccount_file =BCstats[i],
                  bc_detected  = bccount,
                nReadsperCell = 100,
                nthread = 10,
                BarcodeBinning = 2)

# samouts <- prep_samtools(featfiles = ffiles,
#                          bccount   = bccount,
#                          cores     = 10,
#                          project = project[1],
#                          BarcodeBinning = 2,
#                          outdir=outdir)


bc<-data.table::fread(kbc[i],select = 1, header = T)

featfile_vector <- c(paste0(outdir,project[i],"/Bulk_opt_gDNA_priming_",project[i],".filtered.tagged.Aligned.out.bam.ex.featureCounts.bam"),
                       paste0(outdir,project[i],"/Bulk_opt_gDNA_priming_",project[i],".filtered.tagged.Aligned.out.bam.in.featureCounts.bam"),
                       paste0(outdir,project[i],"/Bulk_opt_gDNA_priming_",project[i],".filtered.tagged.Aligned.out.bam.inter.featureCounts.bam"))

typeCount <- sumstatBAM( featfiles = featfile_vector,
                         cores = 10,
                         outdir= outdir,
                         user_seq = user_seq,
                         bc = bc,
                         outfile = paste0(outdir,project[i],".bc.READcounts.rds"),
                         BCstats = BCstats[i],
                         nReadsperCell = 100,
                         mem_limit = 75,
                         bccount = bccount,
                         project = project[i],
                         BarcodeBinning = 2,
                         mem = 75,
                         read_layout = "SE",
                         spec_seq = spec_seq,
                         binmap=binmap)

tc<-data.frame(typeCount)
tc$type<-factor(tc$type, levels=rev(c("Exon","Intron","Intergenic_mouse","Intergenic_human","Intergenic","Ambiguity","Unmapped","User")))
write.table(tc,file = paste(outdir,project[i],".readspercell.txt",sep=""),sep="\t",row.names = F,col.names = T)
}
---
title: "Count Intergenic per species"
output: html_notebook
---


Getting species annotation for intergenic mapping reads

```{r}
require(GenomicRanges)
require(GenomicFeatures)
require(GenomicAlignments)
require(AnnotationDbi)

library(methods)
library(data.table)
library(yaml)

source("/data/share/htp/prime-seq_Paper/Fig_gdna_priming/analysis/count_intergenic/count_intergenic_functions.R")

outdir<-"/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/"

gtf<-"/data/share/htp/prime-seq_Paper/genomes/HsapMmus/gencode.v35.vM25.primary_assembly.annotation.gtf"
  
  
  txdb <- suppressWarnings(suppressMessages(GenomicFeatures::makeTxDbFromGFF(file=gtf, format="gtf")))

  
  ## Make Gene-range GR-object
  se <- suppressMessages(
    AnnotationDbi::select(txdb, keys(txdb, "GENEID"),
                              columns=c("GENEID","TXCHROM","TXSTART","TXEND","TXSTRAND"),
                              keytype="GENEID") %>%
    dplyr::group_by(GENEID,TXCHROM,TXSTRAND)  %>%
    dplyr::mutate( txstart =ifelse(TXSTART<TXEND,min(TXSTART),min(TXEND)),
                   txend  =ifelse(TXSTART<TXEND,max(TXEND),min(TXSTART) ) ) %>%
    dplyr::select(GENEID,TXCHROM,TXSTRAND,txstart,txend)  %>% unique()
    )

  gr.gene<-GenomicRanges::GRanges(seqnames = se$TXCHROM,
                                  ranges =  IRanges(start= se$txstart,
                                                    end=  se$txend,
                                                    names=se$GENEID),
                                  strand =  se$TXSTRAND,
                                  gid    =  se$GENEID)

  ### Get non-overlapping Introns/Exons
  intron<-GenomicFeatures::intronsByTranscript(txdb, use.names=T)
  exon<-GenomicFeatures::exonsBy(txdb, by="tx",use.names=T)

  intron.exon.red <- c( GenomicRanges::reduce(unlist(intron),ignore.strand=T), GenomicRanges::reduce(unlist(exon),ignore.strand=T) )
  intron.exon.dis <- GenomicRanges::disjoin(intron.exon.red, ignore.strand=T)
  intron.only<-GenomicRanges::setdiff(intron.exon.dis, unlist(exon) ,ignore.strand=T)
  intergenic.only<-gaps(intron.exon.dis)
  
  ol.in<-GenomicRanges::findOverlaps(intron.only, gr.gene, select="arbitrary")
  ol.ex<-GenomicRanges::findOverlaps(unlist(exon), gr.gene, select="arbitrary")
  ol.ig<-GenomicRanges::findOverlaps(intergenic.only, gr.gene, select="arbitrary")

  intron.saf<-data.frame(GeneID= names(gr.gene)[ol.in],
                         Chr   = seqnames(intron.only),
                         Start = start(intron.only),
                         End	 =   end(intron.only),stringsAsFactors = F)
  exon.saf<-data.frame(GeneID= names(gr.gene)[ol.ex],
                       Chr   = seqnames(unlist(exon)),
                       Start = start(unlist(exon)),
                       End	 =   end(unlist(exon)),
                       Strand =  strand(unlist(exon)),stringsAsFactors = F)

  intergenic.saf<-data.frame(GeneID= seqnames(intergenic.only),
                       Chr   = seqnames(intergenic.only),
                       Start = start(intergenic.only),
                       End	 =   end(intergenic.only),
                       Strand = rep("+",times=length(intergenic.only)),
                       stringsAsFactors = F)
  
  intergenic.saf<-rbind(intergenic.saf,data.frame(GeneID= seqnames(intergenic.only),
                       Chr   = seqnames(intergenic.only),
                       Start = start(intergenic.only),
                       End	 =   end(intergenic.only),
                       Strand = rep("-",times=length(intergenic.only)),
                       stringsAsFactors = F))
  
  intron.saf<-dplyr::left_join(intron.saf,unique(exon.saf[,c("GeneID","Strand")]),by=c("GeneID"))
  

  

  saf <- list(introns=unique(intron.saf),exons=unique(exon.saf),intergenic=unique(intergenic.saf))
  
saveRDS(saf, file="/data/share/htp/prime-seq_Paper/Fig_gdna_priming/analysis/count_intergenic/inex_intergenic.saf.RDS")
```


```{r}

saf<-readRDS(file="/data/share/htp/prime-seq_Paper/Fig_gdna_priming/analysis/count_intergenic/inex_intergenic.saf.RDS")
### Ok now let's try to count 

project<- c("rep1","rep2","rep3")

abamfile<-c(paste0("/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/",project[1],"_noMulti/Bulk_opt_gDNA_priming_",project[1],".filtered.tagged.Aligned.out.bam"),
           paste0("/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/",project[2],"_noMulti/Bulk_opt_gDNA_priming_",project[2],".filtered.tagged.Aligned.out.bam"),
           paste0("/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/",project[3],"_noMulti/Bulk_opt_gDNA_priming_",project[3],".filtered.tagged.Aligned.out.bam")
            )  

for (i in 1:3) {
fnex<-.runFeatureCount(abamfile[i],
                       saf=saf$exons,
                       strand=1,
                       type="ex",
                       primaryOnly = F,
                       cpu = 10,
                       mem = 75,
                       outdir=paste0(outdir,project[i]))
ffiles<-fnex

fnin  <-.runFeatureCount(abamfile[i],
                           saf=saf$introns,
                           strand=1,
                           type="in",
                           primaryOnly = F,
                           cpu = 10,
                           mem = 75,
                         outdir=paste0(outdir,project[i]))

ffiles<-c(ffiles,fnin)

fninter  <-.runFeatureCount(abamfile[i],
                           saf=saf$intergenic,
                           strand=1,
                           type="inter",
                           primaryOnly = F,
                           cpu = 10,
                           mem = 75,
                       outdir=paste0(outdir,project[i]))

ffiles<-c(ffiles,fninter)

system(paste0("for i in ",paste(ffiles,collapse=" ")," ; do samtools sort -n -O 'BAM' -@ ",round(10/2,0)," -m 15G -o $i $i.tmp & done ; wait"))
system(paste("rm",paste0(ffiles,".tmp",collapse=" ")))
}


```

```{r}

#get Intergenic reads from zUMIs-stats2.R
gtf2<-"/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep1/Bulk_opt_gDNA_priming_rep1.final_annot.gtf"
user_seq<- getUserSeq(gtf2) 
spec_seq<- getSpecies(gtf2)

####
bcb<-c("/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep1/zUMIs_output/Bulk_opt_gDNA_priming_rep1kept_barcodes_binned.txt",
       "/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep2/zUMIs_output/Bulk_opt_gDNA_priming_rep2kept_barcodes_binned.txt",
       "/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep3/zUMIs_output/Bulk_opt_gDNA_priming_rep3kept_barcodes_binned.txt")

BCstats<-c("/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep1/Bulk_opt_gDNA_priming_rep1.BCstats.txt",
           "/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep2/Bulk_opt_gDNA_priming_rep2.BCstats.txt",
           "/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep3/Bulk_opt_gDNA_priming_rep3.BCstats.txt")

kbc<-c("/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep1/zUMIs_output/Bulk_opt_gDNA_priming_rep1kept_barcodes.txt",
       "/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep2/zUMIs_output/Bulk_opt_gDNA_priming_rep2kept_barcodes.txt",
       "/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep3/zUMIs_output/Bulk_opt_gDNA_priming_rep3kept_barcodes.txt")

for (i in 1:3){
  bccount<-fread(bcb[i])

bccount<-splitRG(bccount=bccount, mem= 75,read_layout = "SE")

binmap <- BCbin(bccount_file =BCstats[i],
                  bc_detected  = bccount,
                nReadsperCell = 100,
                nthread = 10,
                BarcodeBinning = 2)

# samouts <- prep_samtools(featfiles = ffiles,
#                          bccount   = bccount,
#                          cores     = 10,
#                          project = project[1],
#                          BarcodeBinning = 2,
#                          outdir=outdir)


bc<-data.table::fread(kbc[i],select = 1, header = T)

featfile_vector <- c(paste0(outdir,project[i],"/Bulk_opt_gDNA_priming_",project[i],".filtered.tagged.Aligned.out.bam.ex.featureCounts.bam"),
                       paste0(outdir,project[i],"/Bulk_opt_gDNA_priming_",project[i],".filtered.tagged.Aligned.out.bam.in.featureCounts.bam"),
                       paste0(outdir,project[i],"/Bulk_opt_gDNA_priming_",project[i],".filtered.tagged.Aligned.out.bam.inter.featureCounts.bam"))

typeCount <- sumstatBAM( featfiles = featfile_vector,
                         cores = 10,
                         outdir= outdir,
                         user_seq = user_seq,
                         bc = bc,
                         outfile = paste0(outdir,project[i],".bc.READcounts.rds"),
                         BCstats = BCstats[i],
                         nReadsperCell = 100,
                         mem_limit = 75,
                         bccount = bccount,
                         project = project[i],
                         BarcodeBinning = 2,
                         mem = 75,
                         read_layout = "SE",
                         spec_seq = spec_seq,
                         binmap=binmap)

tc<-data.frame(typeCount)
tc$type<-factor(tc$type, levels=rev(c("Exon","Intron","Intergenic_mouse","Intergenic_human","Intergenic","Ambiguity","Unmapped","User")))
write.table(tc,file = paste(outdir,project[i],".readspercell.txt",sep=""),sep="\t",row.names = F,col.names = T)
}

```