Getting species annotation for intergenic mapping reads
require(GenomicRanges)
Loading required package: GenomicRanges
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel
Attaching package: ‘BiocGenerics’
The following objects are masked from ‘package:parallel’:
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap,
parApply, parCapply, parLapply, parLapplyLB, parRapply, parSapply, parSapplyLB
The following objects are masked from ‘package:dplyr’:
combine, intersect, setdiff, union
The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
The following objects are masked from ‘package:base’:
anyDuplicated, append, as.data.frame, basename, cbind, colnames, dirname, do.call,
duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
Map, mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply, union, unique, unsplit,
which.max, which.min
Loading required package: S4Vectors
Attaching package: ‘S4Vectors’
The following objects are masked from ‘package:dplyr’:
first, rename
The following object is masked from ‘package:tidyr’:
expand
The following objects are masked from ‘package:base’:
expand.grid, I, unname
Loading required package: IRanges
Attaching package: ‘IRanges’
The following objects are masked from ‘package:dplyr’:
collapse, desc, slice
The following object is masked from ‘package:purrr’:
reduce
Loading required package: GenomeInfoDb
require(GenomicFeatures)
Loading required package: GenomicFeatures
Loading required package: AnnotationDbi
Loading required package: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with 'browseVignettes()'. To cite
Bioconductor, see 'citation("Biobase")', and for packages 'citation("pkgname")'.
Attaching package: ‘AnnotationDbi’
The following object is masked from ‘package:dplyr’:
select
require(GenomicAlignments)
Loading required package: GenomicAlignments
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats
Attaching package: ‘matrixStats’
The following objects are masked from ‘package:Biobase’:
anyMissing, rowMedians
The following object is masked from ‘package:dplyr’:
count
Attaching package: ‘MatrixGenerics’
The following objects are masked from ‘package:matrixStats’:
colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse, colCounts, colCummaxs,
colCummins, colCumprods, colCumsums, colDiffs, colIQRDiffs, colIQRs, colLogSumExps,
colMadDiffs, colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats, colProds,
colQuantiles, colRanges, colRanks, colSdDiffs, colSds, colSums2, colTabulates,
colVarDiffs, colVars, colWeightedMads, colWeightedMeans, colWeightedMedians,
colWeightedSds, colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods, rowCumsums, rowDiffs,
rowIQRDiffs, rowIQRs, rowLogSumExps, rowMadDiffs, rowMads, rowMaxs, rowMeans2,
rowMedians, rowMins, rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars, rowWeightedMads,
rowWeightedMeans, rowWeightedMedians, rowWeightedSds, rowWeightedVars
The following object is masked from ‘package:Biobase’:
rowMedians
Loading required package: Biostrings
Loading required package: XVector
Attaching package: ‘XVector’
The following object is masked from ‘package:purrr’:
compact
Attaching package: ‘Biostrings’
The following object is masked from ‘package:base’:
strsplit
Loading required package: Rsamtools
Attaching package: ‘GenomicAlignments’
The following object is masked from ‘package:dplyr’:
last
require(AnnotationDbi)
library(methods)
library(data.table)
data.table 1.14.0 using 28 threads (see ?getDTthreads). Latest news: r-datatable.com
Attaching package: ‘data.table’
The following objects are masked from ‘package:GenomicAlignments’:
first, last, second
The following object is masked from ‘package:SummarizedExperiment’:
shift
The following object is masked from ‘package:GenomicRanges’:
shift
The following object is masked from ‘package:IRanges’:
shift
The following objects are masked from ‘package:S4Vectors’:
first, second
The following objects are masked from ‘package:dplyr’:
between, first, last
The following object is masked from ‘package:purrr’:
transpose
library(yaml)
source("/data/share/htp/prime-seq_Paper/Fig_gdna_priming/analysis/count_intergenic/count_intergenic_functions.R")
outdir<-"/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/"
gtf<-"/data/share/htp/prime-seq_Paper/genomes/HsapMmus/gencode.v35.vM25.primary_assembly.annotation.gtf"
txdb <- suppressWarnings(suppressMessages(GenomicFeatures::makeTxDbFromGFF(file=gtf, format="gtf")))
## Make Gene-range GR-object
se <- suppressMessages(
AnnotationDbi::select(txdb, keys(txdb, "GENEID"),
columns=c("GENEID","TXCHROM","TXSTART","TXEND","TXSTRAND"),
keytype="GENEID") %>%
dplyr::group_by(GENEID,TXCHROM,TXSTRAND) %>%
dplyr::mutate( txstart =ifelse(TXSTART<TXEND,min(TXSTART),min(TXEND)),
txend =ifelse(TXSTART<TXEND,max(TXEND),min(TXSTART) ) ) %>%
dplyr::select(GENEID,TXCHROM,TXSTRAND,txstart,txend) %>% unique()
)
gr.gene<-GenomicRanges::GRanges(seqnames = se$TXCHROM,
ranges = IRanges(start= se$txstart,
end= se$txend,
names=se$GENEID),
strand = se$TXSTRAND,
gid = se$GENEID)
### Get non-overlapping Introns/Exons
intron<-GenomicFeatures::intronsByTranscript(txdb, use.names=T)
exon<-GenomicFeatures::exonsBy(txdb, by="tx",use.names=T)
intron.exon.red <- c( GenomicRanges::reduce(unlist(intron),ignore.strand=T), GenomicRanges::reduce(unlist(exon),ignore.strand=T) )
intron.exon.dis <- GenomicRanges::disjoin(intron.exon.red, ignore.strand=T)
intron.only<-GenomicRanges::setdiff(intron.exon.dis, unlist(exon) ,ignore.strand=T)
intergenic.only<-gaps(intron.exon.dis)
ol.in<-GenomicRanges::findOverlaps(intron.only, gr.gene, select="arbitrary")
ol.ex<-GenomicRanges::findOverlaps(unlist(exon), gr.gene, select="arbitrary")
ol.ig<-GenomicRanges::findOverlaps(intergenic.only, gr.gene, select="arbitrary")
intron.saf<-data.frame(GeneID= names(gr.gene)[ol.in],
Chr = seqnames(intron.only),
Start = start(intron.only),
End = end(intron.only),stringsAsFactors = F)
exon.saf<-data.frame(GeneID= names(gr.gene)[ol.ex],
Chr = seqnames(unlist(exon)),
Start = start(unlist(exon)),
End = end(unlist(exon)),
Strand = strand(unlist(exon)),stringsAsFactors = F)
intergenic.saf<-data.frame(GeneID= seqnames(intergenic.only),
Chr = seqnames(intergenic.only),
Start = start(intergenic.only),
End = end(intergenic.only),
Strand = rep("+",times=length(intergenic.only)),
stringsAsFactors = F)
intergenic.saf<-rbind(intergenic.saf,data.frame(GeneID= seqnames(intergenic.only),
Chr = seqnames(intergenic.only),
Start = start(intergenic.only),
End = end(intergenic.only),
Strand = rep("-",times=length(intergenic.only)),
stringsAsFactors = F))
intron.saf<-dplyr::left_join(intron.saf,unique(exon.saf[,c("GeneID","Strand")]),by=c("GeneID"))
saf <- list(introns=unique(intron.saf),exons=unique(exon.saf),intergenic=unique(intergenic.saf))
saveRDS(saf, file="/data/share/htp/prime-seq_Paper/Fig_gdna_priming/analysis/count_intergenic/inex_intergenic.saf.RDS")
paste0(outdir,project[i])
[1] "/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep1"
#get Intergenic reads from zUMIs-stats2.R
gtf2<-"/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep1/Bulk_opt_gDNA_priming_rep1.final_annot.gtf"
user_seq<- getUserSeq(gtf2)
spec_seq<- getSpecies(gtf2)
####
bcb<-c("/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep1/zUMIs_output/Bulk_opt_gDNA_priming_rep1kept_barcodes_binned.txt",
"/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep2/zUMIs_output/Bulk_opt_gDNA_priming_rep2kept_barcodes_binned.txt",
"/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep3/zUMIs_output/Bulk_opt_gDNA_priming_rep3kept_barcodes_binned.txt")
BCstats<-c("/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep1/Bulk_opt_gDNA_priming_rep1.BCstats.txt",
"/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep2/Bulk_opt_gDNA_priming_rep2.BCstats.txt",
"/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep3/Bulk_opt_gDNA_priming_rep3.BCstats.txt")
kbc<-c("/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep1/zUMIs_output/Bulk_opt_gDNA_priming_rep1kept_barcodes.txt",
"/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep2/zUMIs_output/Bulk_opt_gDNA_priming_rep2kept_barcodes.txt",
"/data/share/htp/prime-seq_Paper/Fig_gdna_priming/zUMIs/rep3/zUMIs_output/Bulk_opt_gDNA_priming_rep3kept_barcodes.txt")
for (i in 1:3){
bccount<-fread(bcb[i])
bccount<-splitRG(bccount=bccount, mem= 75,read_layout = "SE")
binmap <- BCbin(bccount_file =BCstats[i],
bc_detected = bccount,
nReadsperCell = 100,
nthread = 10,
BarcodeBinning = 2)
# samouts <- prep_samtools(featfiles = ffiles,
# bccount = bccount,
# cores = 10,
# project = project[1],
# BarcodeBinning = 2,
# outdir=outdir)
bc<-data.table::fread(kbc[i],select = 1, header = T)
featfile_vector <- c(paste0(outdir,project[i],"/Bulk_opt_gDNA_priming_",project[i],".filtered.tagged.Aligned.out.bam.ex.featureCounts.bam"),
paste0(outdir,project[i],"/Bulk_opt_gDNA_priming_",project[i],".filtered.tagged.Aligned.out.bam.in.featureCounts.bam"),
paste0(outdir,project[i],"/Bulk_opt_gDNA_priming_",project[i],".filtered.tagged.Aligned.out.bam.inter.featureCounts.bam"))
typeCount <- sumstatBAM( featfiles = featfile_vector,
cores = 10,
outdir= outdir,
user_seq = user_seq,
bc = bc,
outfile = paste0(outdir,project[i],".bc.READcounts.rds"),
BCstats = BCstats[i],
nReadsperCell = 100,
mem_limit = 75,
bccount = bccount,
project = project[i],
BarcodeBinning = 2,
mem = 75,
read_layout = "SE",
spec_seq = spec_seq,
binmap=binmap)
tc<-data.frame(typeCount)
tc$type<-factor(tc$type, levels=rev(c("Exon","Intron","Intergenic_mouse","Intergenic_human","Intergenic","Ambiguity","Unmapped","User")))
write.table(tc,file = paste(outdir,project[i],".readspercell.txt",sep=""),sep="\t",row.names = F,col.names = T)
}