Number of exons lost during liftover

#gpfs/gibbs/project/noonan/jy656/XSanno_fix/gencode.v43.basic.genetype.txt has gene_id, transcript_id, gene_type, gene_name, transcript_id information generated from /vast/palmer/scratch/noonan/ap2549/XSAnno/native_gtf/gencode.v43.basic.annotation.bed

cat("The number of exons in the original gencode.v43.bed file:", nrow(read.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/gencode.v43.basic.genetype.txt")))
## The number of exons in the original gencode.v43.bed file: 853536
cat("\nThe number of exons after reciprocal liftover:", nrow(read.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/hg38.hg38_panTro6.liftOver.info.txt")))
## 
## The number of exons after reciprocal liftover: 801870
before_liftover_txt <- read.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/gencode.v43.basic.genetype.txt") %>% 
  set_colnames(c("gene_id", "transcript_id", "gene_type", "gene_name", "transcript_name", "exon_id"))

after_liftover_bed <- 
  read.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/hg38.hg38_panTro6.liftOver.info.txt") %>% 
  set_colnames(c("seqnames", "start", "end", "gene_id", "transcript_id", "gene_name", "transcript_name", "exon_id", "extra", "strand"))

Numbers after BLAT

after_BLAT_bed <- read.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/blatFiltered.hg38TopanTro6.hg38.info.txt", sep = "\t") %>%
  set_colnames(c("seqnames", "start", "end", "gene_id", "transcript_id", "gene_name", "transcript_name", "exon_id", "extra", "strand"))

cat("The number of exons after BLAT:", nrow(after_BLAT_bed))
## The number of exons after BLAT: 707722
n_after_BLAT <- 
  #incorporating (left_joining) the liftover bedfile info column with gene_type 
  #left_joined if gene_id, transcript_id, gene_name, transcript_name, exon_id matched
  left_join(after_BLAT_bed, before_liftover_txt,
  by = c("gene_id", "transcript_id", "gene_name", "transcript_name", "exon_id")) %>% 
  select(gene_name, gene_type) %>% #selecting only gene_name and gene_type
  distinct() %>% 
  group_by(gene_type) %>% #group by gene_type
  summarize(n = n()) #then count the number of gene type

Numbers after final DE

after_DE_bed <- read.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/simFiltered.hg38TopanTro6.hg38.info.txt") %>%
  set_colnames(c("seqnames", "start", "end", "gene_id", "transcript_id", "gene_name", "transcript_name", "exon_id", "1", "strand"))

cat("The number of exons after DE:", nrow(after_DE_bed))
## The number of exons after DE: 702411

Breakdown by gene_types

n_after_DE <- 
  #incorporating (left_joining) the liftover bedfile info column with gene_type 
  #left_joined if gene_id, transcript_id, gene_name, transcript_name, exon_id matched
  left_join(after_DE_bed, before_liftover_txt,
  by = c("gene_id", "transcript_id", "gene_name", "transcript_name", "exon_id")) %>% 
  select(gene_name, gene_type) %>% #selecting only gene_name and gene_type
  distinct() %>% 
  group_by(gene_type) %>% #group by gene_type
  summarize(n = n()) #then count the number of gene type

full_join(n_before_liftover, n_after_liftover, 
          by = c("gene_type"), #full_joining by gene_type
          suffix = c(".before_liftover", ".after_liftover")) %>% 
    full_join(n_after_BLAT, by = c("gene_type")) %>%
    full_join(n_after_DE, by = c("gene_type"), suffix = c(".after_BLAT", ".after_DE")) %>%
    replace(is.na(.), 0) %>% 
    print(n = 40)
## # A tibble: 40 × 5
##    gene_type          n.before_liftover n.after_liftover n.after_BLAT n.after_DE
##    <chr>                          <int>            <int>        <int>      <int>
##  1 IG_C_gene                         14               14            6          6
##  2 IG_C_pseudogene                    9                6            4          4
##  3 IG_D_gene                         37                5            0          0
##  4 IG_J_gene                         18                8            4          4
##  5 IG_J_pseudogene                    3                0            0          0
##  6 IG_V_gene                        145               74           43         43
##  7 IG_V_pseudogene                  187               99           63         62
##  8 IG_pseudogene                      1                0            0          0
##  9 Mt_rRNA                            2                2            0          0
## 10 Mt_tRNA                           22               22            0          0
## 11 TEC                             1054              884          746        744
## 12 TR_C_gene                          6                6            6          6
## 13 TR_D_gene                          4                4            0          0
## 14 TR_J_gene                         79               78           60         60
## 15 TR_J_pseudogene                    4                4            2          2
## 16 TR_V_gene                        107              104           86         86
## 17 TR_V_pseudogene                   33               31           23         23
## 18 artifact                          19                0            0          0
## 19 lncRNA                         18842            17557        16329      16314
## 20 miRNA                           1877             1714         1458       1458
## 21 misc_RNA                        1279             1163         1000        999
## 22 processed_pseudog…             10141             8764         7001       6994
## 23 protein_coding                 20013            19288        18643      18619
## 24 pseudogene                        15               15            6          6
## 25 rRNA                              40               16            9          9
## 26 rRNA_pseudogene                  491              454          378        377
## 27 ribozyme                           8                8            7          7
## 28 sRNA                               5                3            3          3
## 29 scRNA                              1                0            0          0
## 30 scaRNA                            48               45           40         38
## 31 snRNA                           1837             1631         1295       1290
## 32 snoRNA                           792              740          613        613
## 33 transcribed_proce…               512              434          347        341
## 34 transcribed_unita…               155              153          146        146
## 35 transcribed_unpro…               958              694          404        402
## 36 translated_proces…                 2                1            0          0
## 37 translated_unproc…                 3                1            0          0
## 38 unitary_pseudogene                99               84           77         77
## 39 unprocessed_pseud…              2606             1869         1198       1198
## 40 vault_RNA                          1                1            1          1
full_join(n_before_liftover, n_after_liftover, 
          by = c("gene_type"), #full_joining by gene_type
          suffix = c(".before_liftover", ".after_liftover")) %>% 
    full_join(n_after_BLAT, by = c("gene_type")) %>%
    full_join(n_after_DE, by = c("gene_type"), suffix = c(".after_BLAT", ".after_DE")) %>%
    replace(is.na(.), 0) %>% 
  write.csv("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/n_breakdwon_by_gene_type.csv", quote = FALSE)
cat("The number of exons in the original gencode.v43.bed file:", nrow(read.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/gencode.v43.basic.genetype.txt")))
## The number of exons in the original gencode.v43.bed file: 853536
cat("The number of genes in the original gencode.v43.bed file:", length(unique(before_liftover_txt$gene_name)))
## The number of genes in the original gencode.v43.bed file: 61217
cat("The number of exons after reciprocal liftover:", nrow(read.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/hg38.hg38_panTro6.liftOver.info.txt")))
## The number of exons after reciprocal liftover: 801870
cat("The number of genes after reciprocal liftover:", length(unique(after_liftover_bed$gene_name)))
## The number of genes after reciprocal liftover: 55789
cat("The number of exons after BLAT:", nrow(after_BLAT_bed))
## The number of exons after BLAT: 707722
cat("The number of genes after BLAT:", length(unique(after_BLAT_bed$gene_name)))
## The number of genes after BLAT: 49879
cat("The number of exons after DE:", nrow(after_DE_bed))
## The number of exons after DE: 702411
cat("The number of genes after DE:", length(unique(after_DE_bed$gene_name)))
## The number of genes after DE: 49813

Comparison with XSanno publication (hg19topanTro3)

XSanno.pub.hg19.genes <- read.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/XSanno.pub/Gencode.v10.fullTransExon.hg19TopanTro3.hg19.info.txt")
after_DE_bed <- after_DE_bed %>% rowwise() %>% mutate(gene_id_fix = unlist(strsplit(as.character(gene_id), split = "[.]"))[1])

cat("The number of genes in XSanno publication:", length(unique(XSanno.pub.hg19.genes$V4)))
## The number of genes in XSanno publication: 37469
cat("The number of common genes between XSanno publication and new XSanno:", length(intersect(XSanno.pub.hg19.genes$V4, after_DE_bed$gene_id_fix) %>% unique()))
## The number of common genes between XSanno publication and new XSanno: 33979
cat("The number of genes only in XSanno publication:", length(setdiff(XSanno.pub.hg19.genes$V4, after_DE_bed$gene_id_fix) %>% unique()))
## The number of genes only in XSanno publication: 3490
cat("The number of genes only in new XSanno:", length(setdiff(after_DE_bed$gene_id_fix, XSanno.pub.hg19.genes$V4) %>% unique()))
## The number of genes only in new XSanno: 16787
anti_join(XSanno.pub.hg19.genes, after_DE_bed, by = c("V4" = "gene_id_fix"))$V5 %>% 
  unique() %>%
  write.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/XSanno.pub.hg19.genes.only.genes.txt", quote=FALSE, row.names = FALSE, col.names = FALSE)

setdiff(after_BLAT_bed$gene_name, after_DE_bed$gene_name) %>% intersect(anti_join(XSanno.pub.hg19.genes, after_DE_bed, by = c("V4" = "gene_id_fix"))$V5) %>%
  unique() %>%
  write.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/XSanno.pub.hg19.genes.only.genes.by.DEstep.txt", quote=FALSE, row.names = FALSE, col.names = FALSE)

setdiff(after_liftover_bed$gene_name, after_BLAT_bed$gene_name) %>% unique() %>%
    write.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/genes.filtered.after.BLATstep.txt", quote=FALSE, row.names = FALSE, col.names = FALSE)

setdiff(before_liftover_txt$gene_name, after_liftover_bed$gene_name) %>%
  unique() %>%
    write.table("/gpfs/gibbs/project/noonan/jy656/XSanno_fix/genes.filtered.after.liftoverstep.txt", quote=FALSE, row.names = FALSE, col.names = FALSE)

Check on DEgenes

XSanno.DEresults <- read.table("/vast/palmer/scratch/noonan/ap2549/XSAnno/Sim_testing/stringtie/simDESeq_blat.exon.results.txt", header = TRUE)
XSanno.DE.ID <- filter(XSanno.DEresults, padj < 0.01)$ID

XSanno.DEresults.count <- read.table("/vast/palmer/scratch/noonan/ap2549/XSAnno/Sim_testing/stringtie/exon_count_matrix_hs_pt.txt", header = TRUE) %>% pivot_longer(-ID) %>% mutate(species = ifelse(grepl("hs", name), "human", "chimp")) %>% group_by(ID, species) %>% summarize(mean = mean(value)) %>% mutate(DE = (ID %in% XSanno.DE.ID))
## `summarise()` has grouped output by 'ID'. You can override using the `.groups`
## argument.
XSanno.DEresults.count %>% filter(DE) %>% pivot_wider(values_from = mean, names_from = species) %>% mutate(ratio = chimp/human) %>% ggplot(aes(x = ratio)) + geom_histogram() + scale_x_continuous(lim = c(0,10))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1787 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

XSanno.DEresults %>% ggplot(aes(x = log2FoldChange, y = -log10(padj))) + geom_point()
## Warning: Removed 1025 rows containing missing values (`geom_point()`).

XSanno.DEresults %>% ggplot(aes(x = baseMean, y = log2FoldChange)) + geom_point()