Purpose:
Intron Expression in HEK cells
Load the following packages:
library(tidyverse)
library(cowplot)
library(ggsci)
library(ggbeeswarm)
library(ComplexUpset)
library(Gviz)
Load functions:
source("/data/share/htp/prime-seq_Paper/Scripts/custom_functions.R")
theme_pub <- theme_bw() + theme(plot.title = element_text(hjust = 0.5, size=18, face="bold"),
axis.text = element_text(colour="black", size=14),
axis.title=element_text(size=16,face="bold"),
legend.text=element_text(size=14),
legend.position="right",
axis.line.x = element_line(colour = "black"),
axis.line.y = element_line(colour = "black"),
strip.background=element_blank(),
strip.text=element_text(size=16))
theme_set(theme_pub)
fig_path<-here::here()
data_path<-str_split(string = fig_path,pattern = "/analysis",simplify = T)[,1]
## Feature Colours
feat_cols<-c("#F08C4B", "#E5DFCC", "#556f44", "#9dc183", "#4D8C57", "#3A8DB5", "#256EA0","dodgerblue4","#256EA0")
names(feat_cols)<-c("Ambiguity","Intergenic","Ribosomal","Mitochondrial","lncRNA","Intron","Coding","Intragenic","Exon")
inex_colors <- c("#ff5154","#91a6ff","#550527")
names(inex_colors)<-c("Intron","Exon","Both")
Load annotation:
## annotation
gtype_human <-getbiotype("hsapiens_gene_ensembl",species="human")
ensembl <- useMart("ensembl", dataset ="hsapiens_gene_ensembl", host="uswest.ensembl.org")
biotype <- getBM(attributes=c("ensembl_gene_id","ensembl_gene_id_version","gene_biotype","external_gene_name","chromosome_name"),
mart=ensembl )
Load data:
inf <-
read.csv(
paste0(fig_path, "/sample_info.csv"),
header = T,
stringsAsFactors = F
) %>%
mutate(Condition = case_when(
Condition == "Incubation + ProtK" ~ "Magnetic Beads",
TRUE ~ Condition
)) %>%
filter(Celltype == "HEK")
inf$Sample <- as.character(inf$Sample)
counts_hek <-
readRDS(
paste0(
data_path,
"/zUMIs/HEK/zUMIs_output/expression/Bulk_opt_lysis_test_2_HEK.dgecounts.rds"
)
)
# Split into exon, intron and Exon+ intron matrices.
# Filter samples and remove Geneversion numbers
counts_hek_ex <-
counts_hek$umicount$exon$all %>% as.matrix() %>% remove_Geneversion()
counts_hek_ex <- counts_hek_ex[, inf$BC]
counts_hek_inex <-
counts_hek$umicount$inex$all %>% as.matrix() %>% remove_Geneversion()
counts_hek_inex <- counts_hek_inex[, inf$BC]
counts_hek_in <-
counts_hek$umicount$intron$all %>% as.matrix() %>% remove_Geneversion()
counts_hek_in <- counts_hek_in[, inf$BC]
## make exon only / intron only matrix, remove genes detected in both
counts_hek_ex_o <-
counts_hek_ex[!(rownames(counts_hek_ex) %in% rownames(counts_hek_in)), ]
counts_hek_in_o <-
counts_hek_in[!(rownames(counts_hek_in) %in% rownames(counts_hek_ex)), ]
counts_hek_inex_o <-
counts_hek_inex[!(rownames(counts_hek_inex) %in% rownames(counts_hek_ex_o)), ]
counts_hek_inex_o <-
counts_hek_inex_o[!(rownames(counts_hek_inex_o) %in% rownames(counts_hek_in_o)), ]
counts_hek_in_o <- counts_hek_in_o[rowSums(counts_hek_in_o) > 0, ]
counts_hek_ex_o <- counts_hek_ex_o[rowSums(counts_hek_ex_o) > 0, ]
counts_hek_inex_o <- counts_hek_inex_o[rowSums(counts_hek_inex_o) > 0, ]
cond <- c("Magnetic Beads", "Column")
for (i in cond) {
sub_inf <- inf %>%
filter(Condition %in% i)
intron_only_genes <-
rownames(counts_hek_in_o[whichgenes_reproducible(counts_hek_in_o[,sub_inf$BC],
exprcutoff = 5,
reproducecutoff = 0.1), sub_inf$BC])
exon_only_genes <-
rownames(counts_hek_ex_o[whichgenes_reproducible(counts_hek_ex_o[,sub_inf$BC],
exprcutoff = 5,
reproducecutoff = 0.1), sub_inf$BC])
inex_genes <-
rownames(counts_hek_inex_o[whichgenes_reproducible(counts_hek_inex_o[,sub_inf$BC],
exprcutoff = 5,
reproducecutoff = 0.1), sub_inf$BC])
inex_genes <-
inex_genes[order(rowSums(counts_hek_inex_o[whichgenes_reproducible(counts_hek_inex_o[,sub_inf$BC],
exprcutoff = 5,
reproducecutoff = 0.1),
sub_inf$BC]),
decreasing = T)] # sort by highest overall count
gene_type <-
data.frame(
ENSEMBL = c(exon_only_genes, intron_only_genes, inex_genes),
detection = c(
rep("exon", times = length(exon_only_genes)),
rep("intron", times = length(intron_only_genes)),
rep("both", times = length(inex_genes))
)
) %>%
left_join(biotype, by = c("ENSEMBL" = "ensembl_gene_id")) %>%
mutate(
biotype2 = case_when(
grepl(gene_biotype, pattern = "*pseudogene") ~ "pseudogene",
grepl(gene_biotype, pattern = "IG*") ~ "protein coding",
grepl(gene_biotype, pattern = "TR*") ~ "protein coding",
grepl(gene_biotype, pattern = "MT*") ~ "other",
grepl(gene_biotype, pattern = "^s") ~ "small RNA",
grepl(gene_biotype, pattern = "ribozyme") ~ "other",
gene_biotype == "misc_RNA" ~ "other",
gene_biotype == "protein_coding" ~ "protein coding",
T ~ gene_biotype
),
cond = i
) %>%
filter(!(is.na(biotype2)))
gene_type$biotype2 <-
factor(gene_type$biotype2, levels = rev(
c(
"protein coding",
"lncRNA",
"pseudogene",
"rRNA",
"small RNA",
"miRNA",
"not_annotated",
"other"
)
))
if (!(exists("gene_type_sum"))) {
gene_type_sum <- gene_type
} else{
gene_type_sum <- bind_rows(gene_type_sum, gene_type)
}
}
ggplot() +
geom_bar(data = gene_type_sum,
aes(x = cond, fill = biotype2),
#position = position_fill(),
na.rm = T) +
ggsci::scale_fill_npg() +
theme(legend.position = "right") +
labs(fill = "",
x = "",
y = "detected Genes") +
facet_grid(~detection)+
theme(axis.text.x = element_text(angle=45,hjust=1))

NA
NA
No difference in composition based on extraction method. We continue with the Bead isolation Condition from here on.
inf<-inf %>%
filter(Condition=="Magnetic Beads")
gene_type<-gene_type_sum %>%
filter(cond=="Magnetic Beads")
counts_hek_inex<-counts_hek_inex[,inf$BC]
counts_hek_ex_o<-counts_hek_inex[subset(gene_type,detection=="exon")$ENSEMBL,]
counts_hek_in_o<-counts_hek_inex[subset(gene_type,detection=="intron")$ENSEMBL,]
counts_hek_inex_o<-counts_hek_inex[subset(gene_type,!(detection%in%c("intron","exon")))$ENSEMBL,]
##
counts_hek_in <- counts_hek_in[whichgenes_reproducible(counts_hek_in[,inf$BC],
exprcutoff = 2,
reproducecutoff = 0.1), inf$BC]
counts_hek_ex <- counts_hek_ex[whichgenes_reproducible(counts_hek_ex[,inf$BC],
exprcutoff = 2,
reproducecutoff = 0.1), inf$BC]
upset plot of intersections
average_counts<-data.frame(ENSEMBL=rownames(counts_hek_inex),
mean_cnt=rowMeans(counts_hek_inex)
)
intron_genes<-rownames(counts_hek_in) %>%
unique()
exon_genes<-rownames(counts_hek_ex)%>%
unique()
upset_df<-data.frame(ENSEMBL=c(exon_genes,intron_genes),
detection=c(rep("exon",
times=length(exon_genes)),
rep("intron",
times=length(intron_genes))
)) %>%
left_join(biotype,by=c("ENSEMBL"="ensembl_gene_id")) %>%
mutate(biotype2=case_when(grepl(gene_biotype,pattern="*pseudogene")~ "pseudogene",
grepl(gene_biotype,pattern="IG*")~ "protein coding",
grepl(gene_biotype,pattern="TR*")~ "protein coding",
grepl(gene_biotype,pattern="MT*")~ "other",
grepl(gene_biotype,pattern="^s")~ "small RNA",
grepl(gene_biotype,pattern="ribozyme")~ "other",
gene_biotype=="misc_RNA"~ "other",
gene_biotype=="protein_coding"~ "protein coding",
T~gene_biotype)) %>%
filter(!is.na(biotype2)) %>%
inner_join(average_counts)
Joining, by = "ENSEMBL"
upset_df$biotype2<-factor(upset_df$biotype2,levels=rev(c("protein coding","lncRNA","pseudogene","rRNA","small RNA","miRNA","other")))
upset_df<-upset_df %>%
mutate(value=TRUE) %>%
pivot_wider(names_from = detection,values_from = value,values_fill = FALSE) %>%
left_join(gene_type)
Joining, by = c("ENSEMBL", "ensembl_gene_id_version", "gene_biotype", "external_gene_name", "chromosome_name", "biotype2")
categories<-c("exon","intron")
upset<-upset(upset_df,
categories,
name="Gene detection",
base_annotations=list(
'Intersection size'=intersection_size(
counts=FALSE,
mapping=aes(fill=biotype2)
)+scale_fill_npg()
+labs(fill="")
),
annotations = list(
'nUMI'=(
ggplot(mapping=aes(y=mean_cnt,fill=biotype2))
+geom_boxplot(position="dodge",outlier.shape = NA,show.legend = F)
+scale_y_log10()
+scale_fill_npg()
)))
upset
ggsave(upset,
device = "pdf",
path = fig_path,
width = 150,
height=180,
units = "mm",
filename = "upset_plot.pdf"
)

Summary stats
make_summary<-function(mat,type){
data.frame(BC=colnames(mat),
umi=colSums(mat),
gene=colSums(mat>0),
type=type)
}
summary<-rbind(make_summary(counts_hek_ex_o,"Exon"),
make_summary(counts_hek_in_o,"Intron"),
make_summary(counts_hek_inex_o,"Both"))
summary$exclusive<-"yes"
ggplot(summary,aes(y=umi,x=type,col=type))+
geom_boxplot()+
geom_beeswarm()+
scale_y_log10()+
scale_colour_manual(values=inex_colors,limits=force)

ggplot(summary,aes(y=gene,x=type,col=type))+
geom_boxplot()+
geom_beeswarm()+
scale_y_log10()+
scale_colour_manual(values=inex_colors,limits=force)

summary2<-rbind(make_summary(counts_hek_ex,"Exon"),
make_summary(counts_hek_in,"Intron"),
make_summary(counts_hek_inex,"Both"))
summary2$exclusive<-"no"
ggplot(summary2,aes(y=umi,x=type,col=type))+
geom_boxplot()+
geom_beeswarm()+
scale_y_log10()+
ggtitle("")+
scale_colour_manual(values=inex_colors,limits=force)

ggplot(summary2,aes(y=gene,x=type,col=type))+
geom_boxplot()+
geom_beeswarm()+
scale_y_log10()+
scale_colour_manual(values=inex_colors,limits=force)

summary3<-rbind(summary,summary2) #%>%
# pivot_longer(cols=c(umi,gene),names_to = "type2" ,values_to = "count")
ggplot(summary3,aes(type,y=gene,group=paste0(exclusive,type),col=exclusive))+
geom_boxplot()+
geom_beeswarm(dodge.width = 0.80)+
scale_y_log10()

ggplot(summary3,aes(type,y=umi,group=paste0(exclusive,type),col=exclusive))+
geom_boxplot()+
geom_beeswarm(dodge.width = 0.80)+
scale_y_log10()

NA
NA
NA
inex_genes<-rownames(counts_hek_inex)
# subset for genes expressed in both
exon<-counts_hek_ex[rownames(counts_hek_ex)%in%inex_genes,]
exon<-exon[whichgenes_reproducible(exon,5,reproducecutoff = 0.2),]
exon<-exon[,]
intron<-counts_hek_in[rownames(counts_hek_in)%in%inex_genes,]
intron<-intron[whichgenes_reproducible(intron,5,reproducecutoff = 0.2),]
inex_lib<-data.frame(sample=names(colSums(counts_hek_inex)),
LibSize=colSums(counts_hek_inex))
## normalize to UMI per million
upm<-function(umi_counts){
umi_counts%>%
as.data.frame() %>%
rownames_to_column(var="GeneID") %>%
pivot_longer( cols = 2:ncol(.),
names_to = "sample",
values_to = "cnt") %>%
left_join(inex_lib) %>%
group_by(sample) %>%
mutate(upm = cnt / LibSize *1e6 ) %>%
dplyr::select(-c(LibSize,cnt))%>%
pivot_wider(names_from = sample,
values_from = upm) %>%
column_to_rownames(var = "GeneID") %>%
as.matrix()
}
exon_upm<-upm(exon)
Joining, by = "sample"
exon_upm<-log2(exon_upm+1)
head(exon_upm)
AAAACT ATATAG GTTTAT TGTTTA GCTAGA CCCACG CGGTGG GCTCGC AAAGTT ATCAAA TAAAGT TTAATC
ENSG00000000003 6.777043 6.894788 6.881346 6.944080 6.935219 7.1598135 6.991783 7.197138 6.375255 6.389947 6.430394 6.690962
ENSG00000000419 5.572084 5.541948 5.722610 5.331175 4.769669 5.2074211 5.363794 5.931007 4.515383 4.605971 4.806156 4.524692
ENSG00000000457 2.626398 1.880397 3.364346 2.689834 2.859309 1.8860013 2.270964 3.792812 2.097778 1.704817 2.677023 3.132752
ENSG00000000460 3.928917 4.865845 4.638661 4.511567 4.896307 4.2459487 4.331438 4.886397 3.285104 3.706273 3.267919 3.802377
ENSG00000000971 0.000000 0.000000 0.000000 0.000000 0.000000 0.9250104 1.542618 0.000000 1.065916 0.000000 1.055938 1.198606
ENSG00000001036 6.173004 6.197858 6.236245 6.332796 6.385227 6.2907196 5.914327 6.120693 5.680913 5.598644 5.599290 5.722660
GGGATT CCCCGT CGTGGG GGAGCC
ENSG00000000003 6.469344 6.334377 6.876656 6.983402
ENSG00000000419 4.328676 4.681841 4.684511 4.822240
ENSG00000000457 1.086215 1.943430 2.179717 3.335826
ENSG00000000460 3.147655 3.928934 4.026753 3.959295
ENSG00000000971 0.000000 0.000000 0.000000 0.000000
ENSG00000001036 5.836887 5.879522 6.340348 5.888000
intron_upm<-upm(intron)
Joining, by = "sample"
intron_upm<-log2(intron_upm+1)
head(intron_upm)
AAAACT ATATAG GTTTAT TGTTTA GCTAGA CCCACG CGGTGG GCTCGC AAAGTT ATCAAA TAAAGT TTAATC
ENSG00000000003 0.000000 1.227052 1.870535 0.000000 1.170781 0.9250104 0.000000 0.000000 2.425995 0.810066 1.659075 1.198606
ENSG00000000419 2.900825 3.707959 3.695444 4.023849 3.088823 3.9429054 3.641473 3.258980 2.693206 2.649165 2.677023 3.802377
ENSG00000000457 2.900825 0.000000 1.870535 2.422748 2.586247 2.4577228 2.752681 2.747467 1.672198 2.464603 3.096513 0.000000
ENSG00000000460 4.256398 4.651676 4.475957 4.205365 4.393785 4.8509811 4.582480 4.181589 4.444698 3.936642 3.685914 4.603667
ENSG00000001036 1.842944 0.000000 0.000000 1.063833 1.170781 0.0000000 0.000000 1.280269 1.065916 0.810066 1.055938 1.198606
ENSG00000001084 4.350893 4.095129 4.475957 4.820789 4.393785 4.8053408 4.892314 3.635841 3.577125 3.529332 4.426421 4.048073
GGGATT CCCCGT CGTGGG GGAGCC
ENSG00000000003 0.000000 0.962534 1.122263 0.000000
ENSG00000000419 3.473587 4.016111 3.674553 2.691105
ENSG00000000457 2.127464 2.521967 2.783333 2.213726
ENSG00000000460 4.481563 3.836148 4.026753 4.393207
ENSG00000001036 2.127464 0.000000 0.000000 1.495367
ENSG00000001084 4.481563 3.736982 4.309621 3.574847
df<-data.frame(ENSEMBL=rownames(exon_upm),exon=rowMeans(exon_upm)) %>%
inner_join(data.frame(ENSEMBL=rownames(intron_upm),intron=rowMeans(intron_upm))) %>%
inner_join(gene_type) %>%
filter(biotype2%in%c("protein coding")) %>%
unique()
Joining, by = "ENSEMBL"
Joining, by = "ENSEMBL"
smc_inex<-ggplot(df) + aes(x=exon, y=intron)+
stat_density2d(geom="tile", aes(fill=..density..^0.25, alpha=1), contour=FALSE,show.legend = F) +
geom_point(size=0.5,show.legend = F) +
stat_density2d(geom="tile", aes(fill=..density..^0.25, alpha=ifelse(..density..^0.25<0.15,0,1)), contour=FALSE,show.legend = F) +
scale_fill_gradientn(colours = colorRampPalette(c("white", blues9,"black"))(256))+
theme(panel.grid = element_blank())+
geom_smooth(aes(exon,intron),method="lm",col="grey70")+
labs(x="Log Mean expression Exon",y="Log Mean expression Intron")
smc_inex
`geom_smooth()` using formula 'y ~ x'

hist.inex<-df %>%
rename(Exon=exon,
Intron=intron) %>%
pivot_longer(cols = c("Exon","Intron")) %>%
ggplot()+
geom_histogram(aes(value,fill=name),alpha=0.8,colour="grey50",bins = 50)+
scale_fill_manual(values = inex_colors,limits=force)+
labs(x="Log Mean Expression",
fill="")+
theme(legend.position = c(0.8,0.5),
legend.background = element_blank())
hist.inex
ggsave(smc_inex,
device = "pdf",
path = fig_path,
width = 100,
height=100,
units = "mm",
filename = "scatter_plot.pdf"
)
`geom_smooth()` using formula 'y ~ x'
ggsave(hist.inex,
device = "pdf",
path = fig_path,
width = 100,
height=100,
units = "mm",
filename = "exp_dist_plot.pdf"
)

ranked exon vs. intron expression
rank_mat<-function(count_mat){
count_mat%>%
as.data.frame() %>%
rownames_to_column(var="GeneID") %>%
pivot_longer( cols = 2:ncol(.),
names_to = "sample",
values_to = "cnt") %>%
arrange(GeneID,cnt) %>%
group_by(sample) %>%
mutate(gene_rank = rank(cnt)) %>%
dplyr::select(-cnt) %>%
pivot_wider(names_from = sample,
values_from = gene_rank) %>%
ungroup() %>%
column_to_rownames(var = "GeneID") %>%
as.matrix()
}
exon_rank<-rank_mat(exon)
intron_rank<-rank_mat(intron)
df_rank<-data.frame(ENSEMBL=rownames(exon_rank),
exon_rank=rowMeans(exon_rank)) %>%
inner_join(data.frame(ENSEMBL=rownames(intron_rank),
intron_rank=rowMeans(intron_rank))) %>%
inner_join(df) %>%
filter(biotype2%in%c("protein coding","lncRNA")) %>%
mutate(norm_rank_exon=exon_rank/max(exon_rank),
norm_rank_intron=intron_rank/max(intron_rank))
Joining, by = "ENSEMBL"
Joining, by = "ENSEMBL"
ggplot(df_rank)+
geom_density2d_filled(aes(exon_rank,intron_rank),)+
geom_smooth(aes(exon_rank,intron_rank),method="lm")+
facet_wrap(~biotype2,nrow=2,scales="free")
`geom_smooth()` using formula 'y ~ x'

ggplot(df_rank)+
ggpointdensity::geom_pointdensity(aes(norm_rank_exon,norm_rank_intron),size=0.2)+
geom_smooth(aes(norm_rank_exon,norm_rank_intron),method="lm")+
facet_wrap(~biotype2,nrow=2,scales="free")
`geom_smooth()` using formula 'y ~ x'

NA
Intron vs. Exon mapping
1: subset .bam file for barcodes 2: run get3distance_v3_SP_v2.R for intron and for exon separately 3: combine outputs and plot
bam_path<- "/data/share/htp/prime-seq_Paper/Fig_beads_columns/zUMIs/HEK/zUMIs_output/demultiplexed/"
bam_files<-list.files(path = bam_path,pattern = ".bam$")
bcs<-inf$XC
bam_files<-bam_files[str_split(bam_files,pattern = "[.]",simplify = T)[,2] %in% bcs]
get per bam file geneBody coverage
# library(GenomicRanges, quietly=T, warn.conflicts = F)
# library(GenomicFeatures, quietly=T, warn.conflicts = F)
# library(GenomicAlignments, quietly=T, warn.conflicts = F)
#
# ## Inputs:
# #bam_files #with corresponding .bai
# #out #"name base for plot and outtable"
# counts_hek_inex<-counts_hek$umicount$inex$all %>% as.matrix()
# counts_hek_inex<-counts_hek_inex[,inf$BC]
#
# inex_genes<-rownames(counts_hek_inex)
#
# inex_genes<-inex_genes[order(rowSums(counts_hek_inex),decreasing = T)] # sort by highest overall count
#
# genes <- inex_genes[1:2000] #if you want to subset genes, list of ENSG ids
# write.table(genes,paste0(bam_path,"inex.genes.txt"),quote = F,row.names = F,col.names = F)
# genes<-paste0(bam_path,"inex.genes.txt")
#
# #ftype" #"intron or exon"
#
# gtf<- "/data/share/htp/prime-seq_Paper/genomes/Hsap/gencode.v35.primary_assembly.annotation.gtf" #gtf file containing the transcript info"
# txdb <- makeTxDbFromGFF(gtf, format="gtf")
#
# # restrict to canonical chromosomes
#
# canonical_chr<-seqlevels(txdb)[1:25]
#
#
# tmp<-transcriptsBy(txdb, by="gene")
# gene2transcript<-lapply( tmp, function(x){ mcols(x)$tx_name })
#
# saveDb(txdb, file="/data/share/htp/prime-seq_Paper/genomes/Hsap/gencode.v35.primary_assembly.sqlite")
# save(gene2transcript,file="/data/share/htp/prime-seq_Paper/genomes/Hsap/gene2transcript_canonical.Rdata" )
#
# txdb<-"/data/share/htp/prime-seq_Paper/genomes/Hsap/gencode.v35.primary_assembly.sqlite"
# g2t<-"/data/share/htp/prime-seq_Paper/genomes/Hsap/gene2transcript_canonical.Rdata"
#
# maxTdist <- 10000 #"max dist. to 3' end counted"
# minTlength <- 0 #"minimal length of transcripts considered"
# max_exon_length<- 5000 # max length of exons per gene
# ngenes<- 2000 #max number of genes to use
#
# # i="HEK.ACTGAGCGAAAACT.demx.bam"
# # j="intron"
#
# # index
# for (i in bam_files){
# if (!(file.exists(paste0(bam_path,i,".bai"))))
#
# # index bam file
# system(paste0("samtools index ",bam_path,i))
#
# }
#
#
# ## start slurm jobs
# for (i in bam_files){
# for (j in c("exon","intron")){
# out<- str_split(i,pattern = "[.]",simplify = T)[,2]
#
#
# system(
# paste0(
# "sbatch -J ",
# out,
# " --wrap '/opt/bin/Rscript",
# " /data/share/common/scripts/get3distance_v3_SP_v2_LW_v1.R",
# " --bamfile ",
# paste0(bam_path, i),
# " --txdb ",
# txdb,
# " --g2tmapping ",
# g2t,
# " --out ",
# paste0(out,"_", j),
# " --minTlength ",
# minTlength,
# " --max_exon_length ",
# max_exon_length,
# " --ftype ",
# j,
# " --plot F ",
# " --canonical T ",
# " --genes ",
# genes,
# " --ngenes ",
# ngenes,
# "'"
# )
# )
#
# }}
#
same for intron only genes
## start slurm jobs
for (i in bam_files){
out<- str_split(i,pattern = "[.]",simplify = T)[,2]
system(
paste0(
"sbatch -J ",
out,
" --wrap '/opt/bin/Rscript",
" /data/share/common/scripts/get3distance_v3_SP_v2_LW_v1.R",
" --bamfile ",
paste0(bam_path, i),
" --txdb ",
txdb,
" --g2tmapping ",
g2t,
" --out ",
paste0(out,"_", "intron_only"),
" --minTlength ",
minTlength,
" --max_exon_length ",
max_exon_length,
" --ftype ",
"intron",
" --plot F ",
" --canonical T ",
" --genes ",
genes,
" --ngenes ",
ngenes,
"'"
)
)
}
Submitted batch job 203096
Submitted batch job 203097
Submitted batch job 203098
Submitted batch job 203099
Submitted batch job 203100
Submitted batch job 203101
Submitted batch job 203102
Submitted batch job 203103
Submitted batch job 203104
Submitted batch job 203105
Submitted batch job 203106
Submitted batch job 203107
Submitted batch job 203108
Submitted batch job 203109
Submitted batch job 203110
Submitted batch job 203111
ggsave(p_enrich,
device = "pdf",
path = fig_path,
width = 180,
height=100,
units = "mm",
filename = "enrich_plot.pdf"
)
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
check bam coverage for genes with high intron and exon expression
top_genes_inex<-df_rank %>%
filter(!(is.na(external_gene_name))) %>%
filter(!(is.na(chromosome_name))) %>%
filter(exon_rank<19500 & intron_rank> 15500) %>%
arrange(desc(exon_rank)) %>%
unique() %>%
slice_max(exon_rank,n=20)
colnames(exon)<-paste0(colnames(exon),"_Exon")
colnames(intron)<-paste0(colnames(intron),"_Intron")
full_mat<-list(rownames_to_column(as.data.frame(exon),var="Gene_ID"),rownames_to_column(as.data.frame(intron),var="Gene_ID")) %>%
plyr::join_all() %>%
column_to_rownames(var="Gene_ID") %>%
as.data.frame() %>%
as.matrix()
Joining by: Gene_ID
full_mat[is.na(full_mat)]<-0
inex_mat<-full_mat[,17:32]+full_mat[,1:16]
colnames(inex_mat)<-paste0(str_split(colnames(inex_mat),pattern = "_",simplify = T)[,1],"_Both")
full_mat <-list(rownames_to_column(as.data.frame(inex_mat),var="Gene_ID"),rownames_to_column(as.data.frame(full_mat),var="Gene_ID")) %>%
plyr::join_all() %>%
column_to_rownames(var="Gene_ID") %>%
as.data.frame() %>%
as.matrix()
Joining by: Gene_ID
top_genes_mat<-full_mat[top_genes_inex$ENSEMBL,] %>%
t() %>%
as.data.frame() %>%
rownames_to_column(var="Sample") %>%
mutate(type=str_split(Sample,pattern = "_",simplify = T)[,2])
top_genes_mat%>%
dplyr::select(c(type,top_genes_inex$ENSEMBL[1])) %>%
rename("Ensembl"=top_genes_inex$ENSEMBL[1]) %>%
ggplot(aes(y=Ensembl,x=type,fill=type))+
geom_boxplot()+
coord_flip()+
facet_grid(type~.,scale="free")+
theme(axis.text.y = element_blank(),
axis.ticks.y= element_blank())+
labs(x="",y=paste0("UMIs (",top_genes_inex$external_gene_name[1],")"))+
scale_fill_manual(values=inex_colors)

top_genes_mat$type<-factor(top_genes_mat$type,levels=rev(c("Intron","Exon","Both")))
plot coverage for most highly expressed exons and introns

Export plot for ENAH
Plot_Coverage(PRange=PRange,
annot.colors=annot.colors,
b.inf=b.inf,
gene.models=gene.models,
figure.name=figure.name1,
gen=gen,
type=type,
pdf=T,
title=top_genes_inex$external_gene_name[i],
extend.range=2e4,
ymax=400
)
null device
1
intron_only_geneinfo<-upset_df[which(upset_df$ENSEMBL%in%intron_only_genes),] %>%
filter(!duplicated(ENSEMBL)) %>%
column_to_rownames(var = "ENSEMBL")
for (i in intron_only_genes[1:20]){
PRange <- gtf %>%
as.tibble %>%
mutate(ENSG=str_split_fixed(gene_id,
pattern = "[.]",
n = 2)[, 1]) %>%
filter(ENSG %in% i) %>%
group_by(seqnames, strand, gene_id) %>%
summarize(start = min(start), end = max(end)) %>%
plyranges::as_granges()
print(width(PRange))
figure.name1 = paste0("coveragePlots/Intron_only_mapping_",i,".pdf")
Plot_Coverage(PRange=PRange,
annot.colors=annot.colors,
b.inf=b.inf,
gene.models=gene.models,
figure.name=figure.name1,
gen=gen,
type=type,
pdf=T,
title=intron_only_geneinfo[i,"external_gene_name"],
extend.range=width(PRange)*0.5,
ymax=800
)
}
---
title: "Intron Expression - HEK"
output: html_notebook
---
## Purpose: 
Intron Expression in HEK cells

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
```


###  Load the following packages:

```{r packages}
library(tidyverse)
library(cowplot)
library(ggsci)
library(ggbeeswarm)
library(ComplexUpset)
library(Gviz)

```

###  Load functions:

```{r}
source("/data/share/htp/prime-seq_Paper/Scripts/custom_functions.R")

theme_pub <- theme_bw() + theme(plot.title = element_text(hjust = 0.5, size=18, face="bold"),
                                     axis.text = element_text(colour="black", size=14), 
                                     axis.title=element_text(size=16,face="bold"), 
                                     legend.text=element_text(size=14),
                                     legend.position="right",
                                     axis.line.x = element_line(colour = "black"), 
                                     axis.line.y = element_line(colour = "black"),
                                     strip.background=element_blank(), 
                                     strip.text=element_text(size=16))  

theme_set(theme_pub)


fig_path<-here::here()
data_path<-str_split(string = fig_path,pattern = "/analysis",simplify = T)[,1]
## Feature Colours

feat_cols<-c("#F08C4B", "#E5DFCC", "#556f44", "#9dc183", "#4D8C57", "#3A8DB5", "#256EA0","dodgerblue4","#256EA0")

names(feat_cols)<-c("Ambiguity","Intergenic","Ribosomal","Mitochondrial","lncRNA","Intron","Coding","Intragenic","Exon")

inex_colors <- c("#ff5154","#91a6ff","#550527","#BC7238")
names(inex_colors)<-c("Intron","Exon","Both","Intron only")

```

###  Load annotation:

```{r}
## annotation
gtype_human <-getbiotype("hsapiens_gene_ensembl",species="human")

ensembl <- useMart("ensembl", dataset ="hsapiens_gene_ensembl", host="uswest.ensembl.org") 

biotype <- getBM(attributes=c("ensembl_gene_id","ensembl_gene_id_version","gene_biotype","external_gene_name","chromosome_name"),
                 mart=ensembl )

```

###  Load data:

```{r}

inf <-
  read.csv(
    paste0(fig_path, "/sample_info.csv"),
    header = T,
    stringsAsFactors = F
  ) %>%
  mutate(Condition = case_when(
    Condition == "Incubation + ProtK" ~ "Magnetic Beads",
    TRUE ~ Condition
  )) %>%
  filter(Celltype == "HEK")

inf$Sample <- as.character(inf$Sample)


counts_hek <-
  readRDS(
    paste0(
      data_path,
      "/zUMIs/HEK/zUMIs_output/expression/Bulk_opt_lysis_test_2_HEK.dgecounts.rds"
    )
  )

# Split into exon, intron and Exon+ intron matrices.
# Filter samples and remove Geneversion numbers

counts_hek_ex <-
  counts_hek$umicount$exon$all %>% as.matrix() %>% remove_Geneversion()
counts_hek_ex <- counts_hek_ex[, inf$BC]
counts_hek_inex <-
  counts_hek$umicount$inex$all %>% as.matrix() %>% remove_Geneversion()
counts_hek_inex <- counts_hek_inex[, inf$BC]
counts_hek_in <-
  counts_hek$umicount$intron$all  %>% as.matrix() %>% remove_Geneversion()
counts_hek_in <- counts_hek_in[, inf$BC]

## make exon only / intron only matrix, remove genes detected in both
counts_hek_ex_o <-
  counts_hek_ex[!(rownames(counts_hek_ex) %in% rownames(counts_hek_in)), ]
counts_hek_in_o <-
  counts_hek_in[!(rownames(counts_hek_in) %in% rownames(counts_hek_ex)), ]
counts_hek_inex_o <-
  counts_hek_inex[!(rownames(counts_hek_inex) %in% rownames(counts_hek_ex_o)), ]
counts_hek_inex_o <-
  counts_hek_inex_o[!(rownames(counts_hek_inex_o) %in% rownames(counts_hek_in_o)), ]

counts_hek_in_o <- counts_hek_in_o[rowSums(counts_hek_in_o) > 0, ]
counts_hek_ex_o <- counts_hek_ex_o[rowSums(counts_hek_ex_o) > 0, ]
counts_hek_inex_o <- counts_hek_inex_o[rowSums(counts_hek_inex_o) > 0, ]

cond <- c("Magnetic Beads", "Column")


for (i in cond) {
  
  sub_inf <- inf %>%
    filter(Condition %in% i)
  
  intron_only_genes <-
    rownames(counts_hek_in_o[whichgenes_reproducible(counts_hek_in_o[,sub_inf$BC],
                                                     exprcutoff = 5,
                                                     reproducecutoff = 0.1), sub_inf$BC])
  
  exon_only_genes <-
    rownames(counts_hek_ex_o[whichgenes_reproducible(counts_hek_ex_o[,sub_inf$BC],
                                                     exprcutoff = 5,
                                                     reproducecutoff = 0.1), sub_inf$BC])
  
  inex_genes <-
    rownames(counts_hek_inex_o[whichgenes_reproducible(counts_hek_inex_o[,sub_inf$BC],
                                                       exprcutoff = 5,
                                                       reproducecutoff = 0.1), sub_inf$BC])
  
  inex_genes <-
    inex_genes[order(rowSums(counts_hek_inex_o[whichgenes_reproducible(counts_hek_inex_o[,sub_inf$BC],
                                                                       exprcutoff = 5,
                                                                       reproducecutoff = 0.1),
                                               sub_inf$BC]),
                     decreasing = T)] # sort by highest overall count
  
  gene_type <-
    data.frame(
      ENSEMBL = c(exon_only_genes, intron_only_genes, inex_genes),
      detection = c(
        rep("exon", times = length(exon_only_genes)),
        rep("intron", times = length(intron_only_genes)),
        rep("both", times = length(inex_genes))
      )
    ) %>%
    left_join(biotype, by = c("ENSEMBL" = "ensembl_gene_id")) %>%
    mutate(
      biotype2 = case_when(
        grepl(gene_biotype, pattern = "*pseudogene") ~ "pseudogene",
        grepl(gene_biotype, pattern = "IG*") ~ "protein coding",
        grepl(gene_biotype, pattern = "TR*") ~ "protein coding",
        grepl(gene_biotype, pattern = "MT*") ~ "other",
        grepl(gene_biotype, pattern = "^s") ~ "small RNA",
        grepl(gene_biotype, pattern = "ribozyme") ~ "other",
        gene_biotype == "misc_RNA" ~ "other",
        gene_biotype == "protein_coding" ~ "protein coding",
        T ~ gene_biotype
      ),
      cond = i
    ) %>%
    filter(!(is.na(biotype2)))
  
  gene_type$biotype2 <-
    factor(gene_type$biotype2, levels = rev(
      c(
        "protein coding",
        "lncRNA",
        "pseudogene",
        "rRNA",
        "small RNA",
        "miRNA",
        "not_annotated",
        "other"
      )
    ))
  
  if (!(exists("gene_type_sum"))) {
    gene_type_sum <- gene_type
    
  } else{
    gene_type_sum <- bind_rows(gene_type_sum, gene_type)
  }
  
}

ggplot() +
  geom_bar(data = gene_type_sum,
           aes(x = cond, fill = biotype2),
           #position = position_fill(),
           na.rm = T) +
  ggsci::scale_fill_npg() +
  theme(legend.position = "right") +
  labs(fill = "",
       x = "",
       y = "detected Genes") +
  facet_grid(~detection)+
  theme(axis.text.x = element_text(angle=45,hjust=1))


```

No difference in composition based on extraction method. We continue with the Bead isolation Condition from here on.

```{r filter for beads only}

inf<-inf %>% 
  filter(Condition=="Magnetic Beads")

gene_type<-gene_type_sum %>% 
  filter(cond=="Magnetic Beads")

counts_hek_inex<-counts_hek_inex[,inf$BC]

counts_hek_ex_o<-counts_hek_inex[subset(gene_type,detection=="exon")$ENSEMBL,]

counts_hek_in_o<-counts_hek_inex[subset(gene_type,detection=="intron")$ENSEMBL,]

counts_hek_inex_o<-counts_hek_inex[subset(gene_type,!(detection%in%c("intron","exon")))$ENSEMBL,]

##

counts_hek_in <- counts_hek_in[whichgenes_reproducible(counts_hek_in[,inf$BC],
                                                     exprcutoff = 2,
                                                     reproducecutoff = 0.1), inf$BC]
counts_hek_ex <- counts_hek_ex[whichgenes_reproducible(counts_hek_ex[,inf$BC],
                                                     exprcutoff = 2,
                                                     reproducecutoff = 0.1), inf$BC]

```


## upset plot of intersections
```{r}


average_counts<-data.frame(ENSEMBL=rownames(counts_hek_inex),
                           mean_cnt=rowMeans(counts_hek_inex)
                           )

intron_genes<-rownames(counts_hek_in) %>% 
  unique()

exon_genes<-rownames(counts_hek_ex)%>% 
  unique()

upset_df<-data.frame(ENSEMBL=c(exon_genes,intron_genes),
                      detection=c(rep("exon",
                                      times=length(exon_genes)),
                             rep("intron",
                                 times=length(intron_genes))
                             )) %>% 
  left_join(biotype,by=c("ENSEMBL"="ensembl_gene_id")) %>% 
  mutate(biotype2=case_when(grepl(gene_biotype,pattern="*pseudogene")~ "pseudogene",
                            grepl(gene_biotype,pattern="IG*")~ "protein coding",
                            grepl(gene_biotype,pattern="TR*")~ "protein coding",
                            grepl(gene_biotype,pattern="MT*")~ "other",
                            grepl(gene_biotype,pattern="^s")~ "small RNA",
                            grepl(gene_biotype,pattern="ribozyme")~ "other",
                           gene_biotype=="misc_RNA"~ "other",
                           gene_biotype=="protein_coding"~ "protein coding",
                           T~gene_biotype)) %>% 
  filter(!is.na(biotype2)) %>% 
  inner_join(average_counts)

upset_df$biotype2<-factor(upset_df$biotype2,levels=rev(c("protein coding","lncRNA","pseudogene","rRNA","small RNA","miRNA","other")))


upset_df<-upset_df %>% 
  mutate(value=TRUE) %>% 
  pivot_wider(names_from = detection,values_from = value,values_fill = FALSE) %>% 
  left_join(gene_type)

categories<-c("exon","intron")
 
upset<-upset(upset_df,
      categories,
      name="Gene detection",
      base_annotations=list(
        'Intersection size'=intersection_size(
            counts=FALSE,
            mapping=aes(fill=biotype2)
            
        )+scale_fill_npg()
        +labs(fill="")
      ),
      annotations = list(
        'nUMI'=(
            ggplot(mapping=aes(y=mean_cnt,fill=biotype2))
            +geom_boxplot(position="dodge",outlier.shape = NA,show.legend = F)
            +scale_y_log10()
            +scale_fill_npg()
      )))

upset


ggsave(upset,
       device = "pdf",
       path = fig_path,
       width = 150,
       height=180,
       units = "mm",
       filename = "upset_plot.pdf"
       )
```


## Summary stats
```{r}

make_summary<-function(mat,type){
  data.frame(BC=colnames(mat),
             umi=colSums(mat),
             gene=colSums(mat>0),
             type=type)
}



summary<-rbind(make_summary(counts_hek_ex_o,"Exon"),
      make_summary(counts_hek_in_o,"Intron"),
      make_summary(counts_hek_inex_o,"Both"))

summary$exclusive<-"yes"

ggplot(summary,aes(y=umi,x=type,col=type))+
  geom_boxplot()+
  geom_beeswarm()+
  scale_y_log10()+
  scale_colour_manual(values=inex_colors,limits=force)

ggplot(summary,aes(y=gene,x=type,col=type))+
  geom_boxplot()+
  geom_beeswarm()+
  scale_y_log10()+
  scale_colour_manual(values=inex_colors,limits=force)


summary2<-rbind(make_summary(counts_hek_ex,"Exon"),
      make_summary(counts_hek_in,"Intron"),
      make_summary(counts_hek_inex,"Both"))

summary2$exclusive<-"no"

ggplot(summary2,aes(y=umi,x=type,col=type))+
  geom_boxplot()+
  geom_beeswarm()+
  scale_y_log10()+
  ggtitle("")+
  scale_colour_manual(values=inex_colors,limits=force)

ggplot(summary2,aes(y=gene,x=type,col=type))+
  geom_boxplot()+
  geom_beeswarm()+
  scale_y_log10()+
  scale_colour_manual(values=inex_colors,limits=force)

summary3<-rbind(summary,summary2) #%>% 
 # pivot_longer(cols=c(umi,gene),names_to = "type2" ,values_to = "count") 


ggplot(summary3,aes(type,y=gene,group=paste0(exclusive,type),col=exclusive))+
  geom_boxplot()+
  geom_beeswarm(dodge.width = 0.80)+
  scale_y_log10()

ggplot(summary3,aes(type,y=umi,group=paste0(exclusive,type),col=exclusive))+
  geom_boxplot()+
  geom_beeswarm(dodge.width = 0.80)+
  scale_y_log10()



```


```{r}
inex_genes<-rownames(counts_hek_inex)

# subset for genes expressed in both
exon<-counts_hek_ex[rownames(counts_hek_ex)%in%inex_genes,] 
exon<-exon[whichgenes_reproducible(exon,5,reproducecutoff = 0.2),] 
exon<-exon[,] 

intron<-counts_hek_in[rownames(counts_hek_in)%in%inex_genes,]
intron<-intron[whichgenes_reproducible(intron,5,reproducecutoff = 0.2),] 

inex_lib<-data.frame(sample=names(colSums(counts_hek_inex)),
           LibSize=colSums(counts_hek_inex))

## normalize to UMI per million

upm<-function(umi_counts){ 
  umi_counts%>%
  as.data.frame() %>%
  rownames_to_column(var="GeneID") %>%
  pivot_longer( cols = 2:ncol(.),
                names_to = "sample",
                values_to = "cnt") %>%
  left_join(inex_lib) %>% 
  group_by(sample) %>%
  mutate(upm = cnt / LibSize *1e6 ) %>%
  dplyr::select(-c(LibSize,cnt))%>%
  pivot_wider(names_from = sample,
              values_from = upm) %>%
  column_to_rownames(var = "GeneID") %>%
  as.matrix()
}

exon_upm<-upm(exon)
exon_upm<-log2(exon_upm+1)
head(exon_upm)

intron_upm<-upm(intron)
intron_upm<-log2(intron_upm+1)
head(intron_upm)

df<-data.frame(ENSEMBL=rownames(exon_upm),exon=rowMeans(exon_upm)) %>% 
  inner_join(data.frame(ENSEMBL=rownames(intron_upm),intron=rowMeans(intron_upm))) %>% 
  inner_join(gene_type) %>% 
  filter(biotype2%in%c("protein coding")) %>% 
  unique()
  



smc_inex<-ggplot(df) + aes(x=exon, y=intron)+ 
  stat_density2d(geom="tile", aes(fill=..density..^0.25, alpha=1), contour=FALSE,show.legend = F) + 
  geom_point(size=0.5,show.legend = F) +
  stat_density2d(geom="tile", aes(fill=..density..^0.25,     alpha=ifelse(..density..^0.25<0.15,0,1)), contour=FALSE,show.legend = F) + 
  scale_fill_gradientn(colours = colorRampPalette(c("white", blues9,"black"))(256))+
  theme(panel.grid = element_blank())+
  geom_smooth(aes(exon,intron),method="lm",col="grey70")+
  labs(x="Log Mean expression Exon",y="Log Mean expression Intron")

smc_inex

hist.inex<-df %>%  
  rename(Exon=exon,
         Intron=intron) %>% 
  pivot_longer(cols = c("Exon","Intron")) %>% 
  ggplot()+
  geom_histogram(aes(value,fill=name),alpha=0.8,colour="grey50",bins = 50)+
  scale_fill_manual(values = inex_colors,limits=force)+
  labs(x="Log Mean Expression",
       fill="")+
  theme(legend.position = c(0.8,0.5),
        legend.background = element_blank())


hist.inex

ggsave(smc_inex,
       device = "pdf",
       path = fig_path,
       width = 100,
       height=100,
       units = "mm",
       filename = "scatter_plot.pdf"
       )

ggsave(hist.inex,
       device = "pdf",
       path = fig_path,
       width = 100,
       height=100,
       units = "mm",
       filename = "exp_dist_plot.pdf"
       )
```
## ranked exon vs. intron expression

```{r}

rank_mat<-function(count_mat){ 
  count_mat%>%
  as.data.frame() %>%
  rownames_to_column(var="GeneID") %>%
  pivot_longer( cols = 2:ncol(.),
                names_to = "sample",
                values_to = "cnt") %>%
  arrange(GeneID,cnt) %>% 
  group_by(sample) %>%
  mutate(gene_rank = rank(cnt)) %>%
    dplyr::select(-cnt) %>% 
  pivot_wider(names_from = sample,
              values_from = gene_rank) %>% 
  ungroup() %>% 
  column_to_rownames(var = "GeneID") %>%
  as.matrix()
    
}

exon_rank<-rank_mat(exon)

intron_rank<-rank_mat(intron)

df_rank<-data.frame(ENSEMBL=rownames(exon_rank),
                    exon_rank=rowMeans(exon_rank)) %>% 
  inner_join(data.frame(ENSEMBL=rownames(intron_rank),
                        intron_rank=rowMeans(intron_rank))) %>% 
  inner_join(df) %>% 
  filter(biotype2%in%c("protein coding","lncRNA")) %>% 
  mutate(norm_rank_exon=exon_rank/max(exon_rank),
         norm_rank_intron=intron_rank/max(intron_rank))
  
  ggplot(df_rank)+
  geom_density2d_filled(aes(exon_rank,intron_rank),)+
  geom_smooth(aes(exon_rank,intron_rank),method="lm")+
  facet_wrap(~biotype2,nrow=2,scales="free")
  
    ggplot(df_rank)+
  ggpointdensity::geom_pointdensity(aes(norm_rank_exon,norm_rank_intron),size=0.2)+
  geom_smooth(aes(norm_rank_exon,norm_rank_intron),method="lm")+
  facet_wrap(~biotype2,nrow=2,scales="free")


    
```

# Intron vs. Exon mapping


1: subset .bam file for barcodes
2: run get3distance_v3_SP_v2.R for intron and for exon separately
3: combine outputs and plot

```{r}

bam_path<- "/data/share/htp/prime-seq_Paper/Fig_beads_columns/zUMIs/HEK/zUMIs_output/demultiplexed/"

bam_files<-list.files(path = bam_path,pattern = ".bam$")

bcs<-inf$XC

bam_files<-bam_files[str_split(bam_files,pattern = "[.]",simplify = T)[,2] %in% bcs]


```

# get per bam file geneBody coverage

```{r}
# library(GenomicRanges, quietly=T, warn.conflicts = F)
# library(GenomicFeatures, quietly=T, warn.conflicts = F)
# library(GenomicAlignments, quietly=T, warn.conflicts = F)
# 
# ## Inputs:
# #bam_files #with corresponding .bai
# #out #"name base for plot and outtable"
# counts_hek_inex<-counts_hek$umicount$inex$all %>% as.matrix()
# counts_hek_inex<-counts_hek_inex[,inf$BC]
# 
# inex_genes<-rownames(counts_hek_inex)
# 
# inex_genes<-inex_genes[order(rowSums(counts_hek_inex),decreasing = T)] # sort by highest overall count
# 
# genes <- inex_genes[1:2000] #if you want to subset genes, list of ENSG ids
# write.table(genes,paste0(bam_path,"inex.genes.txt"),quote = F,row.names = F,col.names = F)
# genes<-paste0(bam_path,"inex.genes.txt")
# 
# #ftype" #"intron or exon"
# 
# gtf_file<- "/data/share/htp/prime-seq_Paper/genomes/Hsap/gencode.v35.primary_assembly.annotation.gtf" #gtf file containing the transcript info"
# txdb <- makeTxDbFromGFF(gtf_file, format="gtf")
# 
# # restrict to canonical chromosomes
# 
# canonical_chr<-seqlevels(txdb)[1:25]
# 
# 
# tmp<-transcriptsBy(txdb, by="gene")
# gene2transcript<-lapply( tmp, function(x){  mcols(x)$tx_name })
# 
# saveDb(txdb, file="/data/share/htp/prime-seq_Paper/genomes/Hsap/gencode.v35.primary_assembly.sqlite")
# save(gene2transcript,file="/data/share/htp/prime-seq_Paper/genomes/Hsap/gene2transcript_canonical.Rdata" )
# 
# txdb<-"/data/share/htp/prime-seq_Paper/genomes/Hsap/gencode.v35.primary_assembly.sqlite"
# g2t<-"/data/share/htp/prime-seq_Paper/genomes/Hsap/gene2transcript_canonical.Rdata"
# 
# maxTdist <- 10000 #"max dist. to 3' end counted"
# minTlength <- 0  #"minimal length of transcripts considered"
# max_exon_length<- 5000 # max length of exons per gene
# ngenes<- 2000 #max number of genes to use
# 
# # i="HEK.ACTGAGCGAAAACT.demx.bam"
# # j="intron"
# 
# # index
#  for (i in bam_files){
#    if (!(file.exists(paste0(bam_path,i,".bai"))))
# 
#    # index bam file
#   system(paste0("samtools index ",bam_path,i))
# 
#  }
# 
# 
# ## start slurm jobs
#  for (i in bam_files){
#    for (j in c("exon","intron")){
#   out<- str_split(i,pattern = "[.]",simplify = T)[,2]
# 
# 
#   system(
#     paste0(
#       "sbatch -J ",
#       out,
#       " --wrap '/opt/bin/Rscript",
#       " /data/share/common/scripts/get3distance_v3_SP_v2_LW_v1.R",
#       " --bamfile ",
#       paste0(bam_path, i),
#       " --txdb ",
#       txdb,
#       " --g2tmapping ",
#       g2t,
#       " --out ",
#       paste0(out,"_", j),
#       " --minTlength ",
#       minTlength,
#       " --max_exon_length ",
#       max_exon_length,
#       " --ftype ",
#       j,
#       " --plot F ",
#       " --canonical T ",
#       " --genes ",
#       genes,
#       " --ngenes ",
#       ngenes,
#       "'"
#     )
#   )
# 
# }}
# 


```

## same for intron only genes
```{r}

counts_hek_in_o<-counts_hek_in_o[,inf$BC]

intron_only_genes <-unique(rownames(counts_hek_in_o))

intron_only_genes<-intron_only_genes[order(rowSums(counts_hek_in_o[intron_only_genes,]),decreasing = T)] # sort by highest overall count

genes <- intron_only_genes[1:2000] #if you want to subset genes, list of ENSG ids
write.table(genes,paste0(bam_path,"intron_only.genes.txt"),quote = F,row.names = F,col.names = F)
genes<-paste0(bam_path,"intron_only.genes.txt")

gtf_file<- "/data/share/htp/prime-seq_Paper/genomes/Hsap/gencode.v35.primary_assembly.annotation.gtf" #gtf file containing the transcript info"
txdb<-"/data/share/htp/prime-seq_Paper/genomes/Hsap/gencode.v35.primary_assembly.sqlite"
g2t<-"/data/share/htp/prime-seq_Paper/genomes/Hsap/gene2transcript_canonical.Rdata"

maxTdist <- 10000 #"max dist. to 3' end counted"
minTlength <- 0  #"minimal length of transcripts considered"
max_exon_length<- 5000 # max length of exons per gene
ngenes<- 2000 #max number of genes to use

# i="HEK.ACTGAGCGAAAACT.demx.bam"


## start slurm jobs
 for (i in bam_files){
  out<- str_split(i,pattern = "[.]",simplify = T)[,2]


  system(
    paste0(
      "sbatch -J ",
      out,
      " --wrap '/opt/bin/Rscript",
      " /data/share/common/scripts/get3distance_v3_SP_v2_LW_v1.R",
      " --bamfile ",
      paste0(bam_path, i),
      " --txdb ",
      txdb,
      " --g2tmapping ",
      g2t,
      " --out ",
      paste0(out,"_", "intron_only"),
      " --minTlength ",
      minTlength,
      " --max_exon_length ",
      max_exon_length,
      " --ftype ",
      "intron",
      " --plot F ",
      " --canonical T ",
      " --genes ",
      genes,
      " --ngenes ",
      ngenes,
      "'"
    )
  )

}



```


```{r}

tables<-list.files(paste0(fig_path,"/genebody_coverage/"),pattern="on.txt")
tables<-c(tables,list.files(paste0(fig_path,"/genebody_coverage/"),pattern="intron_only.txt"))



for (i in tables){
  if(!(exists("combined"))){
    combined<-read_delim(paste0(fig_path,"/genebody_coverage/",i),
                         delim = "\t",
                         show_col_types = FALSE) %>% 
      mutate(BC=str_split(i,pattern=c("_"),simplify = T)[,1],
             type=str_split(str_split_fixed(i,pattern=c("_"),n = 2)[,2],pattern="[.]",simplify=T)[,1])
    
  }else{
    combined<-read_delim(paste0(fig_path,"/genebody_coverage/",i),delim = "\t",
                         show_col_types = FALSE) %>% 
       mutate(BC=str_split(i,pattern=c("_"),simplify = T)[,1],
             type=str_split(str_split_fixed(i,pattern=c("_"),n = 2)[,2],pattern="[.]",simplify=T)[,1]) %>% 
      bind_rows(combined)
  }
  
}


combined_sum<-combined %>% 
  group_by(distance_to_3_end,type) %>% 
  summarize(total_count=sum(read_number)) %>%
  group_by(type) %>% 
  filter(total_count>0) %>% 
  filter(distance_to_3_end<5000) %>% 
  mutate(scaled_count=(total_count/sum(total_count))*1e6) %>% 
  mutate(type=case_when(type=="exon"~"Exon",
                        type=="intron"~"Intron",
                        T~ "Intron only"))

ggplot(combined_sum, aes(x=distance_to_3_end, y=total_count,col=type)) + 
  geom_smooth()+
  #scale_y_log10() +
  scale_x_reverse() +
  theme_minimal()+
    facet_wrap(~type,scales="free")+
  scale_colour_manual(values=inex_colors,limits=force)
  
p_enrich<-ggplot(combined_sum, aes(x=distance_to_3_end, y=scaled_count,col=type)) + 
  geom_smooth()+
  #scale_y_log10() +
  scale_x_reverse(limits=c(5000,0)) +
  labs(x= "Distance to 3 prime end" ,
       y= "Counts per position per million",
       colour="")+
  scale_colour_manual(values=inex_colors,limits=force)+
  theme(legend.position = c(0.3,0.7),
        legend.background = element_blank())
  

p_enrich

ggsave(p_enrich,
       device = "pdf",
       path = fig_path,
       width = 180,
       height=100,
       units = "mm",
       filename = "enrich_plot.pdf"
       )

```
### check bam coverage for genes with high intron and exon expression

```{r}
top_genes_inex<-df_rank %>% 
  filter(!(is.na(external_gene_name))) %>% 
  filter(!(is.na(chromosome_name))) %>% 
  filter(exon_rank<19500 & intron_rank> 15500) %>% 
  arrange(desc(exon_rank)) %>% 
  unique() %>% 
  slice_max(exon_rank,n=20)

colnames(exon)<-paste0(colnames(exon),"_Exon")
colnames(intron)<-paste0(colnames(intron),"_Intron")


full_mat<-list(rownames_to_column(as.data.frame(exon),var="Gene_ID"),rownames_to_column(as.data.frame(intron),var="Gene_ID")) %>% 
  plyr::join_all() %>% 
  column_to_rownames(var="Gene_ID") %>% 
  as.data.frame() %>% 
  as.matrix() 

full_mat[is.na(full_mat)]<-0

inex_mat<-full_mat[,17:32]+full_mat[,1:16]
colnames(inex_mat)<-paste0(str_split(colnames(inex_mat),pattern = "_",simplify = T)[,1],"_Both")

full_mat <-list(rownames_to_column(as.data.frame(inex_mat),var="Gene_ID"),rownames_to_column(as.data.frame(full_mat),var="Gene_ID")) %>% 
  plyr::join_all() %>% 
  column_to_rownames(var="Gene_ID") %>% 
  as.data.frame() %>% 
  as.matrix() 


top_genes_mat<-full_mat[top_genes_inex$ENSEMBL,] %>% 
  t() %>% 
  as.data.frame() %>% 
  rownames_to_column(var="Sample") %>% 
  mutate(type=str_split(Sample,pattern = "_",simplify = T)[,2])

top_genes_mat%>% 
  dplyr::select(c(type,top_genes_inex$ENSEMBL[1])) %>% 
  rename("Ensembl"=top_genes_inex$ENSEMBL[1]) %>% 
  ggplot(aes(y=Ensembl,x=type,fill=type))+
  geom_boxplot()+
  coord_flip()+
  facet_grid(type~.,scale="free")+
  theme(axis.text.y = element_blank(),
        axis.ticks.y= element_blank())+
  labs(x="",y=paste0("UMIs (",top_genes_inex$external_gene_name[1],")"))+
  scale_fill_manual(values=inex_colors)
  
top_genes_mat$type<-factor(top_genes_mat$type,levels=rev(c("Intron","Exon","Both")))

```

# plot coverage for most highly expressed exons and introns

```{r}

source("/data/home/wange/ATAC/Coverage_Plot_functions_V1.R")

gtf_file<-"/data/share/htp/prime-seq_Paper/genomes/Hsap/gencode.v35.primary_assembly.annotation.gtf"

gtf<-rtracklayer::import.gff2(gtf_file)
# 
# GV_gene_mod<-makeGviz_geneModel_from_gencode(gtf)
# 
# saveRDS(GV_gene_mod,"/data/share/htp/prime-seq_Paper/genomes/Hsap/gencode.v35.primary_assembly.annotation.gtf.genemodels_Geneviz.rds")

GV_gene_mod<-readRDS("/data/share/htp/prime-seq_Paper/genomes/Hsap/gencode.v35.primary_assembly.annotation.gtf.genemodels_Geneviz.rds")


annot.colors <- list("introns"="#ff5154","exons"="#91a6ff")


bam_files<-"/data/share/htp/prime-seq_Paper/Fig_beads_columns/zUMIs/HEK/Bulk_opt_lysis_test_2_HEK.filtered.Aligned.GeneTagged.sorted.bam"

inex<-rev(c("introns","exons"))
b.inf <-data.frame(bamfile=c(paste(gsub(x = bam_files,pattern=".bam",replacement = ""),inex,"bam",sep=".")),cond1=inex,cond2=rep("HEK",times=2))
gene.models<- GV_gene_mod # genemodels from makeGviz_geneModel_from_gencode
gen="hg38" # genome
type="RNA" # sequencing type RNA-seq or ATAC-seq


for (i in 1:20){
# plotting range 
  PRange <- gtf %>%
    as.tibble %>% 
    mutate(ENSG=str_split_fixed(gene_id,
                    pattern = "[.]",
                    n = 2)[, 1]) %>% 
    filter(ENSG %in% top_genes_inex$ENSEMBL[i]) %>% 
    group_by(seqnames, strand, gene_id) %>%
    summarize(start = min(start), end = max(end)) %>%
    plyranges::as_granges()

print(top_genes_inex$external_gene_name[i])
print(top_genes_inex$chromosome_name[i])
print(length(PRange)>0)
if(length(PRange)>0){

figure.name1 = paste0("coveragePlots/Intron_Exon_mapping_",top_genes_inex$ensembl_gene_id_version[i],".pdf")



Plot_Coverage(PRange=PRange,
              annot.colors=annot.colors,
              b.inf=b.inf,
              gene.models=gene.models,
              figure.name=figure.name1,
              gen=gen,
              type=type,
              pdf=T,
              title=top_genes_inex$external_gene_name[i],
              extend.range=2e4,
              ymax=(sum(top_genes_mat[top_genes_mat$type=="Exon",top_genes_inex$ENSEMBL[i]])/4)## set y axis for coverage plot to sum of exon counts
              )


}
}



for (i in 1:20){

figure.name2 = paste0("coveragePlots/Intron_Exon_expression_",top_genes_inex$ensembl_gene_id_version[i],".pdf")
p.exp<-top_genes_mat %>% 
  dplyr::select(c(type,top_genes_inex$ENSEMBL[i])) %>% 
  rename("Ensembl"=top_genes_inex$ENSEMBL[i]) %>% 
  filter(type!="Both")%>% 
  ggplot(aes(y=Ensembl,x=type,col=type))+
  geom_boxplot(show.legend = F)+
  ggbeeswarm::geom_beeswarm(aes(col=type),show.legend = F)+
  facet_wrap(~type,scale="free_x",nrow = 2)+
  theme(axis.text.x = element_blank(),
        axis.ticks.x= element_blank())+
  labs(x="",y=paste0("UMIs (",top_genes_inex$external_gene_name[i],")"))+
  scale_color_manual(values=inex_colors,limits=force) 

ggsave(plot = p.exp,device = "pdf",filename = figure.name2,width=2,height=4,scale=1.5)
}

```

## Export plot for ENAH

```{r}

i=8
 PRange <- gtf %>%
    as.tibble %>% 
    mutate(ENSG=str_split_fixed(gene_id,
                    pattern = "[.]",
                    n = 2)[, 1]) %>% 
    filter(ENSG %in% top_genes_inex$ENSEMBL[i]) %>% 
    group_by(seqnames, strand, gene_id) %>%
    summarize(start = min(start), end = max(end)) %>%
    plyranges::as_granges()

 
figure.name1 = paste0("coveragePlots/Intron_Exon_mapping_",top_genes_inex$ensembl_gene_id_version[i],".pdf")
 
Plot_Coverage(PRange=PRange,
              annot.colors=annot.colors,
              b.inf=b.inf,
              gene.models=gene.models,
              figure.name=figure.name1,
              gen=gen,
              type=type,
              pdf=T,
              title=top_genes_inex$external_gene_name[i],
              extend.range=2e4,
              ymax=400
              )

```

```{r}


intron_only_geneinfo<-upset_df[which(upset_df$ENSEMBL%in%intron_only_genes),] %>% 
  filter(!duplicated(ENSEMBL)) %>% 
  column_to_rownames(var = "ENSEMBL")


for (i in intron_only_genes[1:20]){
  
 PRange <- gtf %>%
    as.tibble %>% 
    mutate(ENSG=str_split_fixed(gene_id,
                    pattern = "[.]",
                    n = 2)[, 1]) %>% 
    filter(ENSG %in% i) %>% 
    group_by(seqnames, strand, gene_id) %>%
    summarize(start = min(start), end = max(end)) %>%
    plyranges::as_granges()
print(width(PRange))
 
figure.name1 = paste0("coveragePlots/Intron_only_mapping_",i,".pdf")
 
Plot_Coverage(PRange=PRange,
              annot.colors=annot.colors,
              b.inf=b.inf,
              gene.models=gene.models,
              figure.name=figure.name1,
              gen=gen,
              type=type,
              pdf=T,
              title=intron_only_geneinfo[i,"external_gene_name"],
              extend.range=width(PRange)*0.5,
              ymax=800
              )
}

```

