Number of exons lost during liftover
#gpfs/gibbs/project/noonan/jy656/XSanno_fix/gencode.v43.basic.genetype.txt has gene_id, transcript_id, gene_type, gene_name, transcript_id information generated from /vast/palmer/scratch/noonan/ap2549/XSAnno/native_gtf/gencode.v43.basic.annotation.bed
cat("The number of exons in the original gencode.v43.bed file:", nrow(read.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/gencode.v43.basic.genetype.txt")))
## The number of exons in the original gencode.v43.bed file: 853536
cat("\nThe number of exons after reciprocal liftover:", nrow(read.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/hg38.hg38_panTro6.liftOver.info.txt")))
##
## The number of exons after reciprocal liftover: 801870
before_liftover_txt <- read.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/gencode.v43.basic.genetype.txt") %>%
set_colnames(c("gene_id", "transcript_id", "gene_type", "gene_name", "transcript_name", "exon_id"))
after_liftover_bed <-
read.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/hg38.hg38_panTro6.liftOver.info.txt") %>%
set_colnames(c("seqnames", "start", "end", "gene_id", "transcript_id", "gene_name", "transcript_name", "exon_id", "extra", "strand"))
Numbers after BLAT
after_BLAT_bed <- read.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/blatFiltered.hg38TopanTro6.hg38.info.txt", sep = "\t") %>%
set_colnames(c("seqnames", "start", "end", "gene_id", "transcript_id", "gene_name", "transcript_name", "exon_id", "extra", "strand"))
cat("The number of exons after BLAT:", nrow(after_BLAT_bed))
## The number of exons after BLAT: 707722
n_after_BLAT <-
#incorporating (left_joining) the liftover bedfile info column with gene_type
#left_joined if gene_id, transcript_id, gene_name, transcript_name, exon_id matched
left_join(after_BLAT_bed, before_liftover_txt,
by = c("gene_id", "transcript_id", "gene_name", "transcript_name", "exon_id")) %>%
select(gene_name, gene_type) %>% #selecting only gene_name and gene_type
distinct() %>%
group_by(gene_type) %>% #group by gene_type
summarize(n = n()) #then count the number of gene type
Numbers after final DE
after_DE_bed <- read.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/simFiltered.hg38TopanTro6.hg38.info.txt") %>%
set_colnames(c("seqnames", "start", "end", "gene_id", "transcript_id", "gene_name", "transcript_name", "exon_id", "1", "strand"))
cat("The number of exons after DE:", nrow(after_DE_bed))
## The number of exons after DE: 702411
Breakdown by gene_types
n_after_DE <-
#incorporating (left_joining) the liftover bedfile info column with gene_type
#left_joined if gene_id, transcript_id, gene_name, transcript_name, exon_id matched
left_join(after_DE_bed, before_liftover_txt,
by = c("gene_id", "transcript_id", "gene_name", "transcript_name", "exon_id")) %>%
select(gene_name, gene_type) %>% #selecting only gene_name and gene_type
distinct() %>%
group_by(gene_type) %>% #group by gene_type
summarize(n = n()) #then count the number of gene type
full_join(n_before_liftover, n_after_liftover,
by = c("gene_type"), #full_joining by gene_type
suffix = c(".before_liftover", ".after_liftover")) %>%
full_join(n_after_BLAT, by = c("gene_type")) %>%
full_join(n_after_DE, by = c("gene_type"), suffix = c(".after_BLAT", ".after_DE")) %>%
replace(is.na(.), 0) %>%
print(n = 40)
## # A tibble: 40 × 5
## gene_type n.before_liftover n.after_liftover n.after_BLAT n.after_DE
## <chr> <int> <int> <int> <int>
## 1 IG_C_gene 14 14 6 6
## 2 IG_C_pseudogene 9 6 4 4
## 3 IG_D_gene 37 5 0 0
## 4 IG_J_gene 18 8 4 4
## 5 IG_J_pseudogene 3 0 0 0
## 6 IG_V_gene 145 74 43 43
## 7 IG_V_pseudogene 187 99 63 62
## 8 IG_pseudogene 1 0 0 0
## 9 Mt_rRNA 2 2 0 0
## 10 Mt_tRNA 22 22 0 0
## 11 TEC 1054 884 746 744
## 12 TR_C_gene 6 6 6 6
## 13 TR_D_gene 4 4 0 0
## 14 TR_J_gene 79 78 60 60
## 15 TR_J_pseudogene 4 4 2 2
## 16 TR_V_gene 107 104 86 86
## 17 TR_V_pseudogene 33 31 23 23
## 18 artifact 19 0 0 0
## 19 lncRNA 18842 17557 16329 16314
## 20 miRNA 1877 1714 1458 1458
## 21 misc_RNA 1279 1163 1000 999
## 22 processed_pseudog… 10141 8764 7001 6994
## 23 protein_coding 20013 19288 18643 18619
## 24 pseudogene 15 15 6 6
## 25 rRNA 40 16 9 9
## 26 rRNA_pseudogene 491 454 378 377
## 27 ribozyme 8 8 7 7
## 28 sRNA 5 3 3 3
## 29 scRNA 1 0 0 0
## 30 scaRNA 48 45 40 38
## 31 snRNA 1837 1631 1295 1290
## 32 snoRNA 792 740 613 613
## 33 transcribed_proce… 512 434 347 341
## 34 transcribed_unita… 155 153 146 146
## 35 transcribed_unpro… 958 694 404 402
## 36 translated_proces… 2 1 0 0
## 37 translated_unproc… 3 1 0 0
## 38 unitary_pseudogene 99 84 77 77
## 39 unprocessed_pseud… 2606 1869 1198 1198
## 40 vault_RNA 1 1 1 1
full_join(n_before_liftover, n_after_liftover,
by = c("gene_type"), #full_joining by gene_type
suffix = c(".before_liftover", ".after_liftover")) %>%
full_join(n_after_BLAT, by = c("gene_type")) %>%
full_join(n_after_DE, by = c("gene_type"), suffix = c(".after_BLAT", ".after_DE")) %>%
replace(is.na(.), 0) %>%
write.csv("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/n_breakdwon_by_gene_type.csv", quote = FALSE)
cat("The number of exons in the original gencode.v43.bed file:", nrow(read.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/gencode.v43.basic.genetype.txt")))
## The number of exons in the original gencode.v43.bed file: 853536
cat("The number of genes in the original gencode.v43.bed file:", length(unique(before_liftover_txt$gene_name)))
## The number of genes in the original gencode.v43.bed file: 61217
cat("The number of exons after reciprocal liftover:", nrow(read.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/hg38.hg38_panTro6.liftOver.info.txt")))
## The number of exons after reciprocal liftover: 801870
cat("The number of genes after reciprocal liftover:", length(unique(after_liftover_bed$gene_name)))
## The number of genes after reciprocal liftover: 55789
cat("The number of exons after BLAT:", nrow(after_BLAT_bed))
## The number of exons after BLAT: 707722
cat("The number of genes after BLAT:", length(unique(after_BLAT_bed$gene_name)))
## The number of genes after BLAT: 49879
cat("The number of exons after DE:", nrow(after_DE_bed))
## The number of exons after DE: 702411
cat("The number of genes after DE:", length(unique(after_DE_bed$gene_name)))
## The number of genes after DE: 49813
Comparison with XSanno publication (hg19topanTro3)
XSanno.pub.hg19.genes <- read.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/XSanno.pub/Gencode.v10.fullTransExon.hg19TopanTro3.hg19.info.txt")
after_DE_bed <- after_DE_bed %>% rowwise() %>% mutate(gene_id_fix = unlist(strsplit(as.character(gene_id), split = "[.]"))[1])
cat("The number of genes in XSanno publication:", length(unique(XSanno.pub.hg19.genes$V4)))
## The number of genes in XSanno publication: 37469
cat("The number of common genes between XSanno publication and new XSanno:", length(intersect(XSanno.pub.hg19.genes$V4, after_DE_bed$gene_id_fix) %>% unique()))
## The number of common genes between XSanno publication and new XSanno: 33979
cat("The number of genes only in XSanno publication:", length(setdiff(XSanno.pub.hg19.genes$V4, after_DE_bed$gene_id_fix) %>% unique()))
## The number of genes only in XSanno publication: 3490
cat("The number of genes only in new XSanno:", length(setdiff(after_DE_bed$gene_id_fix, XSanno.pub.hg19.genes$V4) %>% unique()))
## The number of genes only in new XSanno: 16787
anti_join(XSanno.pub.hg19.genes, after_DE_bed, by = c("V4" = "gene_id_fix"))$V5 %>%
unique() %>%
write.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/XSanno.pub.hg19.genes.only.genes.txt", quote=FALSE, row.names = FALSE, col.names = FALSE)
setdiff(after_BLAT_bed$gene_name, after_DE_bed$gene_name) %>% intersect(anti_join(XSanno.pub.hg19.genes, after_DE_bed, by = c("V4" = "gene_id_fix"))$V5) %>%
unique() %>%
write.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/XSanno.pub.hg19.genes.only.genes.by.DEstep.txt", quote=FALSE, row.names = FALSE, col.names = FALSE)
setdiff(after_liftover_bed$gene_name, after_BLAT_bed$gene_name) %>% unique() %>%
write.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/genes.filtered.after.BLATstep.txt", quote=FALSE, row.names = FALSE, col.names = FALSE)
setdiff(before_liftover_txt$gene_name, after_liftover_bed$gene_name) %>%
unique() %>%
write.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/genes.filtered.after.liftoverstep.txt", quote=FALSE, row.names = FALSE, col.names = FALSE)
Check on DEgenes
XSanno.DEresults <- read.table("/vast/palmer/scratch/noonan/ap2549/XSAnno/Sim_testing/stringtie/simDESeq_blat.exon.results.txt", header = TRUE)
XSanno.DE.ID <- filter(XSanno.DEresults, padj < 0.01)$ID
XSanno.DEresults.count <- read.table("/vast/palmer/scratch/noonan/ap2549/XSAnno/Sim_testing/stringtie/exon_count_matrix_hs_pt.txt", header = TRUE) %>% pivot_longer(-ID) %>% mutate(species = ifelse(grepl("hs", name), "human", "chimp")) %>% group_by(ID, species) %>% summarize(mean = mean(value)) %>% mutate(DE = (ID %in% XSanno.DE.ID))
## `summarise()` has grouped output by 'ID'. You can override using the `.groups`
## argument.
XSanno.DEresults.count %>% filter(DE) %>% pivot_wider(values_from = mean, names_from = species) %>% mutate(ratio = chimp/human) %>% ggplot(aes(x = ratio)) + geom_histogram() + scale_x_continuous(lim = c(0,10))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1787 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

XSanno.DEresults %>% ggplot(aes(x = log2FoldChange, y = -log10(padj))) + geom_point()
## Warning: Removed 1025 rows containing missing values (`geom_point()`).

XSanno.DEresults %>% ggplot(aes(x = baseMean, y = log2FoldChange)) + geom_point()
