For issues and questions, please get in touch via arpoetsch@gmail.com
The preprint that is associated with this script can be found at: https://www.biorxiv.org/content/10.1101/2023.07.19.549677v1
A tutorial on how to use GROVER as a foundation model can be found at:
https://doi.org/10.5281/zenodo.8373159 The pretrained
model can be found at: https://doi.org/10.5281/zenodo.8373117 The data for the
tokenised genome are at: https://doi.org/10.5281/zenodo.8373053
The models and performance measurements were done in python and the code can be found in the corresponding document. For visualisation, data were imported into R. We provide the outputs that we used for visualisation directly to make things easier for reproduction. Some figures were recoloured and/or combined with Adobe Illustrator, so some figures may look a bit different from the publication.
Tokenisation was performed in Python. Visualisations are done in R with token clowds. The pickle file is converted into .rdat with
library(reticulate)
source_python(“src/pickle_reader.py”)
dat <- read_pickle_file(“data/all_vocabs_wg.pkl”)
# load tokens
load("data/all_vocabs_wg.rdat")
# make word clouds for cycle 1,2,3,23,38,121,356
c1 <- dat[[1]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)
c1 <- dat[[2]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)
c1 <- dat[[3]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)
c1 <- dat[[4]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)
c1 <- dat[[24]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)
c1 <- dat[[39]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)
c1 <- dat[[122]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)
c1 <- dat[[357]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)
Performance metrices are imported from python.
perf <- read.csv("data/performance.csv", header = TRUE)
colnames(perf) <- c("2-mers", "3-mers","4-mers","5-mers","6-mers")
rownames(perf) <- c(100*c(1:10),1000*c(2:5))
perf$cycle <- rownames(perf)
perf$cycle <- factor(perf$cycle,levels=perf$cycle)
melted <- melt(perf, id="cycle")
colnames(melted)<-c("cycle","kmer","accuracy")
ggplot(data=melted, aes(x=cycle,y=accuracy,colour=kmer, fill=kmer, group=kmer))+
geom_point(size=3)+
geom_smooth()+
geom_segment(aes(x=10.5,xend=10.5,y=accuracy,yend=accuracy+0.5), linewidth=5, colour="white")+
facet_grid(facet="kmer",rows=5, scales="free")+
scale_colour_manual(values=viridis(n=5))+
scale_fill_manual(values=viridis(n=5))+
theme(strip.background = element_rect(fill = "white"))+
theme(axis.text.x = element_text(angle = 90))+
theme(legend.position="none")+
NULL
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
acc <- read.csv("data/Models_Accuracy_kmers.csv")
acc$Model <- factor(acc$Model, levels=c("GROVER","4mer","5mer","6mer"))
ggplot(acc, aes(x=Model, y=Accuracy, fill=Model))+
geom_bar(stat="identity")+
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none")+
scale_fill_manual(values=modelcols[c(1,2,4,6)])+
scale_y_continuous(name= "accuracy [%]")+
facet_wrap(~Prediction, scales="free_y", ncol=1)+
NULL
acc <- read.csv("data/top_k_accuracy.csv")
ggplot(acc[1:60,], aes(x=k,y=accuracy))+
geom_bar(stat="identity")+
NULL
perp <- read.csv("data/perplexity_GROVER_chr21.csv")
perp4 <- read.csv("data/perplexity_4mer_chr21.csv")
perp5 <- read.csv("data/perplexity_5mer_chr21.csv")
perp6 <- read.csv("data/perplexity_6mer_chr21.csv")
perp$performance <- perp$performance /601
perp4$performance <- perp4$performance /(4^4)
perp5$performance <- perp5$performance /(4^5)
perp6$performance <- perp6$performance /(4^6)
datf <- data.frame(model=c("GROVER","4mer","5mer","6mer"),
perplexity =c(min(perp$performance),min(perp4$performance),min(perp5$performance),min(perp6$performance)))
datf$model <- factor(datf$model, levels=datf$model)
ggplot(datf, aes(x=model,y=perplexity, fill=model))+
geom_bar(stat="identity")+
scale_fill_manual(values=modelcols[c(1,2,4,6)])+
NULL
load("data/all_vocabs_wg.rdat")
vocab600 <- dat[["600"]]
vocab600 <- t(data.frame(vocab600))
colnames(vocab600)<- "frequency"
vocab600 <- data.frame(vocab600)
vocab600$token <- rownames(vocab600)
Hseq <- getSeq(Hsapiens)
Hseq <- Hseq[names(Hseq) %in% paste0("chr",c(1:22,"X","Y"))]
kmers4 <- colSums(oligonucleotideFrequency(Hseq, 4))
kmers5 <- colSums(oligonucleotideFrequency(Hseq, 5))
kmers6 <- colSums(oligonucleotideFrequency(Hseq, 6))
kmers4 <- data.frame(kmers4)
kmers4$token <- rownames(kmers4)
colnames(kmers4)<- c("frequency","token")
kmers5 <- data.frame(kmers5)
kmers5$token <- rownames(kmers5)
colnames(kmers5)<- c("frequency","token")
kmers6 <- data.frame(kmers6)
kmers6$token <- rownames(kmers6)
colnames(kmers6)<- c("frequency","token")
Embeddings for the GROVER vocabulary from the BERT infrastructure and Word2Vec, UMAP and PCA.
#GROVER
Gemb <- get(load("data/vocab_embedding.rdat"))
rownames(Gemb) <- read.table("data/vocab.txt")[,1]
#W2V
GWemb <- get(load("data/iter600_w2v.rdat"))
rownames(GWemb) <- read.table("data/iter600_w2v_vocab.txt")[,1]
Gemb <- Gemb[-c(1:7,9),]
Gum <- umap(Gemb)
Gpca <- prcomp(Gemb)
GWum <- umap(GWemb)
GWpca <- prcomp(GWemb)
Collection of data in dataframes
df <- data.frame(Gum$layout)
df$token <- rownames(df)
colnames(df)<- c("G_UMAP1","G_UMAP2","token")
dfGW <- data.frame(GWum$layout)
dfGW$token <- rownames(dfGW)
colnames(dfGW) <- c("G_W2V_UMAP1","G_W2V_UMAP2","token")
df <- merge(df,dfGW,id="token")
df2 <- data.frame(Gpca$x)[,1:6]
df2$token <- rownames(df2)
colnames(df2) <- c("GROVER_PC1","GROVER_PC2","GROVER_PC3","GROVER_PC4","GROVER_PC5","GROVER_PC6","token")
df <- merge(df,df2,id="token")
df2 <- data.frame(GWpca$x)[,1:6]
df2$token <- rownames(df2)
colnames(df2) <- c("GROVER_W2V_PC1","GROVER_W2V_PC2","GROVER_W2V_PC3","GROVER_W2V_PC4","GROVER_W2V_PC5","GROVER_W2V_PC6","token")
df <- merge(df,df2,id="token")
df <- merge(df,vocab600, id="token")
df$length <- factor(nchar(as.character(df$token)))
df$C <- sapply(as.character(df$token),str_count, pattern=c("C"))/nchar(as.character(df$token))
df$G <- sapply(as.character(df$token),str_count, pattern=c("G"))/nchar(as.character(df$token))
df$A <- sapply(as.character(df$token),str_count, pattern=c("A"))/nchar(as.character(df$token))
df$T <- sapply(as.character(df$token),str_count, pattern=c("T"))/nchar(as.character(df$token))
df$GC <- df$C + df$G
df$AG <- df$A + df$G
df$AC <- df$A + df$C
hex=jet(nrow(df)) #colour selection
df$W2V_hex <- hex[rank(df$GROVER_W2V_PC1)]
w2v_tokencol <- df[,c("token","W2V_hex","GROVER_W2V_PC1")]
Compare frequency distribution between vocabularies
freqdf <- rbind(data.frame(model="GROVER",frequency=df$frequency),
data.frame(model= "4mer",frequency=kmers4$frequency/4),
data.frame(model= "5mer",frequency=kmers5$frequency/5),
data.frame(model= "6mer",frequency=kmers6$frequency/6)
)
freqdf$model <- factor(freqdf$model,levels=c("GROVER","4mer","5mer","6mer"))
ggplot(freqdf, aes(x = frequency, fill = model)) +
geom_density() + # Boxplot without outliers
scale_x_log10() + # Logarithmic y-axis scale
labs(x = "frequency", y = "density") + # Axis labels
scale_fill_manual(values=modelcols[c(1,2,4,6)])+
facet_grid(facet="model", rows=4)+
theme(strip.background = element_rect(fill = "white"))+
NULL
dftmp <- aggregate(frequency~length, df[,c("frequency","length")],sum)
dftmp$length <- as.numeric(as.character(dftmp$length))
levels(dftmp$length) <- 1:16
dftmp <- rbind(dftmp,data.frame(length=c(13:15),frequency=0))
ggplot(dftmp, aes(x=length,y=frequency, fill=length))+
geom_bar(stat="identity")+
scale_fill_manual(values=mycolors)+
scale_x_discrete("token length")+
scale_y_continuous("frequency")+
theme_cowplot()+
NULL
tab <- table(df$length)
tab["13"] <- 0
tab["14"] <- 0
tab["15"] <- 0
dfvoc <- data.frame(t(tab[c(1:12,14:16,13)]))[,2:3]
colnames(dfvoc) <- c("length","vocabulary")
dfvoc$frequency <- 4^c(1:16)
dfvoc$proportion <- dfvoc$vocabulary/dfvoc$frequency
#dfvoc$length<- factor(as.character(dfvoc$length),levels(dfvoc$length))
ggplot(dfvoc, aes(y=vocabulary,x=frequency, colour=length))+
geom_point(size=5)+
scale_colour_manual(values=mycolors)+
scale_x_log10("k-mer frequency")+
scale_y_log10("k-mers represented")+
theme_cowplot()+
NULL
## Warning: Transformation introduced infinite values in continuous y-axis
ggplot(dfvoc, aes(x=length,y=proportion, fill=length))+
geom_bar(stat="identity")+
scale_fill_manual(values=mycolors)+
scale_x_discrete("token length")+
scale_y_continuous("proportion of k-mers represented")+
theme_cowplot()+
NULL
repres_seq <- c(
sum(str_count(df$token, "A")),
sum(str_count(df$token, "C")),
sum(str_count(df$token, "G")),
sum(str_count(df$token, "T")))
genome_freq <- colSums(oligonucleotideFrequency(Hseq, 1))
first_letter_seq <- c(
sum(str_count(substr(df$token,1,1), "A")),
sum(str_count(substr(df$token,1,1), "C")),
sum(str_count(substr(df$token,1,1), "G")),
sum(str_count(substr(df$token,1,1), "T")))
freq_df <- data.frame(genome_frequency=genome_freq/sum(genome_freq),
representation = repres_seq/sum( repres_seq),
first_letter=first_letter_seq/sum(first_letter_seq))
freq_df$nucl <- factor(c("A","C","G","T"), levels=c("A","C","G","T"))
p1 <- ggplot(freq_df, aes(x = "", y = genome_frequency, fill = nucl)) +
geom_bar(stat = "identity", width = 1) +
scale_fill_manual(values=nucl_colours)+
coord_polar("y", start = 0, direction = -1) +
geom_text(aes(label = paste0(nucl,"\n",round(genome_frequency * 100, 2), "%")),
position = position_stack(vjust = 0.5))+
theme_void() +
labs(fill = "Genome Frequency")+
NULL
p2 <- ggplot(freq_df, aes(x = "", y = representation, fill = nucl)) +
geom_bar(stat = "identity", width = 1) +
scale_fill_manual(values=nucl_colours)+
coord_polar("y", start = 0, direction = -1) +
geom_text(aes(label = paste0(nucl,"\n",round(representation * 100, 2), "%")),
position = position_stack(vjust = 0.5))+
theme_void() +
labs(fill = "Representation")+
NULL
p3 <- ggplot(freq_df, aes(x = "", y = first_letter, fill = nucl)) +
geom_bar(stat = "identity", width = 1) +
scale_fill_manual(values=nucl_colours)+
coord_polar("y", start = 0, direction = -1) +
geom_text(aes(label = paste0(nucl,"\n",round(first_letter * 100, 2), "%")),
position = position_stack(vjust = 0.5))+
theme_void() +
labs(fill = "First Letter")+
NULL
p1+p2+p3
tokmetr <- read.csv("data/metrics_per_token.csv")
tokdf <- cbind(tokmetr[c(8,10:609),2:4])
colnames(tokdf) <- c("token","GROVER_AUC","GROVER_Acc")
df <- merge(df, tokdf, id="token")
ggplot(df, aes(y=GROVER_AUC-0.5,x=factor(length), fill=factor(length)))+
geom_boxplot() +
scale_fill_manual(values=mycolors)+
NULL
ggplot(df, aes(y=GROVER_Acc,x=factor(length), fill=factor(length)))+
geom_boxplot() +
scale_fill_manual(values=mycolors)+
NULL
UMAPs and PCAs of the average embeddings of the vocabulary. Putting the Principal Components in relation to genome biology requires integration with additional data.
Geigs <- Gpca$sdev^2
Gvarvec <- Geigs/sum(Geigs)
GWeigs <- GWpca$sdev^2
GWvarvec <- GWeigs/sum(GWeigs)
MEV <- c(GROVER=Gvarvec[1],GROVER_W2V=GWvarvec[1])
MEV <- data.frame(MEV)
MEV$model=rownames(MEV)
MEV$model <- factor(MEV$model,levels=MEV$model)
ggplot(MEV[1:2,],aes(x=model,y=100*MEV))+
geom_bar(stat="identity", fill="slateblue")+
scale_y_continuous("Maximum Explained Variance [%]")+
theme_cowplot()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
NULL
Plotted are both the PCA and also the colour bar for the legend.
ggplot(df, aes(x=GROVER_W2V_PC1, y=GROVER_W2V_PC2, colour=token))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
scale_x_continuous(paste0("Word2Vec PC1 (",round(100*GWvarvec[1],2),"%)"))+
scale_y_continuous(paste0("Word2Vec PC2 (",round(100*GWvarvec[2],2),"%)"))+
#geom_label_repel(aes(label = token ))+
theme_cowplot()+
theme(legend.position = "none")+
NULL
image(matrix(1:601, nrow = 1), col = hex, axes = FALSE, xlab = "", ylab = "")
ggplot(df, aes(x=GROVER_W2V_PC1, y=GC, colour=token))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
scale_x_continuous(paste0("Word2Vec PC1 (",round(100*GWvarvec[1],2),"%)"))+
scale_y_continuous(paste0("GC content"))+
#geom_label_repel(aes(label = token ))+
theme_cowplot()+
theme(legend.position = "none")+
NULL
cor.test(df$GROVER_W2V_PC1,df$GC, method="spearman")
## Warning in cor.test.default(df$GROVER_W2V_PC1, df$GC, method = "spearman"):
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: df$GROVER_W2V_PC1 and df$GC
## S = 69672108, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.9256972
ggplot(df, aes(G_W2V_UMAP1, y=G_W2V_UMAP2, colour=token))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
theme_cowplot()+
theme(legend.position = "none")+
NULL
ggplot(df, aes(x=GROVER_PC1, y=GROVER_PC2, colour=token))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
scale_x_continuous(name=paste("PC1 (",round(100*Gvarvec[1],2),"%)",sep=""))+
scale_y_continuous(name=paste("PC2 (",round(100*Gvarvec[2],2),"%)",sep=""))+
theme_cowplot()+
theme(legend.position = "none")+
NULL
ggplot(df, aes(x=G_UMAP1, y=G_UMAP2, colour=token))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
theme_cowplot()+
theme(legend.position = "none")+
NULL
For Figure 4G and later figures, the proportion of tokens associated with features of genome biology is calculated.
dummydf <- data.frame(Feature= sort(c("Promoter","5' UTR","3' UTR","1st Exon","Other Exon","1st Intron","Other Intron","Downstream (<=300)","Distal Intergenic")))
if(!file.exists("objects/230413_gene_annotation.rdat")){
annotdf <- data.frame(
row.names=dummydf$Feature)
for (i in c(1:length(df$token))){
tok <- df$token[i]
cat(tok,"\n")
bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
if(length(bed_ss)>50000){
bed_ss <- bed_ss[sample(length(bed_ss),50000)]
}
seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
seqinfo(bed_ss)<- seqinfo(Hsapiens)
start(bed_ss) <- start(bed_ss)+1 #readjust start
bed_ss <- sort(bed_ss)
names(bed_ss)<- 1:length(bed_ss)
peakAnno <- annotatePeak(bed_ss, tssRegion=c(-1000, 1000),
TxDb=txdb, annoDb="org.Hs.eg.db")
tmp <- merge(dummydf,peakAnno@annoStat,by="Feature",all=TRUE)
annotdf <- cbind(annotdf,tmp[,2])
}
colnames(annotdf) <- df$token
annotdf$feature <- factor(rownames(annotdf),levels=rev(c("Promoter","5' UTR","3' UTR","1st Exon","Other Exon","1st Intron","Other Intron","Downstream (<=300)","Distal Intergenic")))
annotdf[is.na(annotdf)]<- 0
melted <- melt(annotdf)
save(annotdf,file="objects/230413_gene_annotation.rdat")
}
genes <- genes(txdb)
## 403 genes were dropped because they have exons located on both strands
## of the same reference sequence or on more than one reference sequence,
## so cannot be represented by a single genomic range.
## Use 'single.strand.genes.only=FALSE' to get all the genes in a
## GRangesList object, or use suppressMessages() to suppress this message.
genes_p <- genes[strand(genes) == "+"]
genes_m <- genes[strand(genes) == "-"]
strand(genes_m) <- "+"
genedf <- data.frame(
row.names=c("gene_p","gene_m","intragenic"))
genedf$feature <- c("gene_p","gene_m","intragenic")
if(!file.exists("objects/230413_gene_orientation_annotation.rdat")){
genedummy <- genedf
for (i in 1:length(df$token)){
cat(i,"\n")
tok <- df$token[i]
cat(tok,"\n")
bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
if(length(bed_ss)>50000){
bed_ss <- bed_ss[sample(length(bed_ss),50000)]
}
seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
seqinfo(bed_ss)<- seqinfo(Hsapiens)
start(bed_ss) <- start(bed_ss)+1 #readjust start
bed_ss <- sort(bed_ss)
bed_ss_rev <- bed_ss
names(bed_ss)<- 1:length(bed_ss)
peakAnno_p <- bed_ss[bed_ss %over% genes_p]
peakAnno_m <- bed_ss[bed_ss %over% genes_m]
peakAnno_o <- bed_ss[!bed_ss %over% c(peakAnno_p,peakAnno_m)]
genedf <- cbind(genedf,c(length(peakAnno_p),length(peakAnno_m),length(peakAnno_o)))
}
colnames(genedf) <- c("class",df$token)
genedf[,2:602] <- genedf[,2:602]/colSums(genedf[,2:602])
genedf2 <- genedf[,c("class",df$token)]
save(genedf2,file="230413_gene_orientation_annotation.rdat")
}
genes <- cds(txdb)
genes_p <- genes[strand(genes) == "+"]
genes_m <- genes[strand(genes) == "-"]
strand(genes_m) <- "+"
genedf <- data.frame(
row.names=c("cds_p","cds_m","other"))
genedf$feature <- c("cds_p","cds_m","other")
if(!file.exists("objects/230413_cds_orientation_annotation.rdat")){
for (i in 1:length(df$token)){
cat(i,"\n")
tok <- df$token[i]
cat(tok,"\n")
bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
if(length(bed_ss)>50000){
bed_ss <- bed_ss[sample(length(bed_ss),50000)]
}
seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
seqinfo(bed_ss)<- seqinfo(Hsapiens)
start(bed_ss) <- start(bed_ss)+1 #readjust start
bed_ss <- sort(bed_ss)
bed_ss_rev <- bed_ss
names(bed_ss)<- 1:length(bed_ss)
peakAnno_p <- bed_ss[bed_ss %over% genes_p]
peakAnno_m <- bed_ss[bed_ss %over% genes_m]
peakAnno_o <- bed_ss[!bed_ss %over% c(peakAnno_p,peakAnno_m)]
genedf <- cbind(genedf,c(length(peakAnno_p),length(peakAnno_m),length(peakAnno_o)))
}
colnames(genedf) <- c(df$token)
genedf <- cbind (class=rownames(genedf),genedf)
genedf[,2:602] <- genedf[,2:602]/colSums(genedf[,2:602])
genedf2 <- genedf[,c("class",df$token)]
save(genedf2,file="objects/230413_cds_orientation_annotation.rdat")
}
HMM <- read.table("data/GM12878_ChromHMM.bed")
HMM <- HMM[,c(2,3,4,5,10)]
colnames(HMM) <- c("seqnames","start","end","ID","colour")
HMM <- GRanges(HMM)
HMMdf <- data.frame(
row.names=names(table(HMM$ID)))
HMMdf$ID <- names(table(HMM$ID))
HMMdf <- unique(merge(HMMdf,elementMetadata(HMM)[,c("ID","colour")], id="ID", all.x=TRUE, all.y=FALSE))
HMMdf$ID <- factor(HMMdf$ID, levels=sort(HMMdf$ID)[c(1,8:15,2:7)])
HMMcol <- HMMdf$colour[c(1,8:15,2:7)]
names(HMMcol)[c(1,8:15,2:7)]<- as.character(HMMdf$ID)
HMMdummy <- HMMdf
if(!file.exists("objects/230413_HMM_annotation.rdat")){
for (i in c(1:length(df$token))){
cat(i,"\n")
tok <- df$token[i]
cat(tok,"\n")
bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
if(length(bed_ss)>50000){
bed_ss <- bed_ss[sample(length(bed_ss),50000)]
}
seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
seqinfo(bed_ss)<- seqinfo(Hsapiens)
start(bed_ss) <- start(bed_ss)+1 #readjust start
bed_ss <- sort(bed_ss)
names(bed_ss)<- 1:length(bed_ss)
peakAnno <- mergeByOverlaps(bed_ss, HMM)
tmp <- data.frame(table(peakAnno$ID)/length(bed_ss))
tmp <- merge(HMMdummy,tmp,by.x="ID",by.y="Var1",all=TRUE)
HMMdf <- cbind(HMMdf,tmp[,3])
}
colnames(HMMdf) <- c("ID","colour",df$token)
HHMdf <- HHMdf[c(1,8:15,2:7),]
HHMdf$ID <- factor(HHMdf$ID,levels=HHMdf$ID)
HHMdf$colour <-c("#FF0000","#FF6969","#CF0BC6", "#FACA00", "#FACA01", "#FFFC04", "#FFFC05", "#0ABEFE", "#00B050", "#00B051", "#99FF66","#7F7F7F", "#F5F5F5", "#F5F5F6", "#F5F5F7")
save(HMMdf,file="objects/230413_HMM_annotation.rdat")
}
if(!file.exists("objects/230413_rep_annotation.rdat")){
repdb <- get(load("data/repdb_UCSC.Rdata"))
repdf <- data.frame(
row.names=names(table(repdb$repClass)))
repdf$repClass <- names(table(repdb$repClass))
repdfdummy <- repdf
repdfdummy$repClass <- paste0(repdfdummy$repClass,"_p")
repdfdummy2 <- repdf
repdfdummy2$repClass <- paste0(repdfdummy2$repClass,"_m")
repdfdummy <- rbind(repdfdummy,repdfdummy2)
repdf <- repdfdummy
for (i in 1:length(df$token)){
cat(i,"\n")
tok <- df$token[i]
cat(tok,"\n")
bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
if(length(bed_ss)>50000){
bed_ss <- bed_ss[sample(length(bed_ss),50000)]
}
seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
seqinfo(bed_ss)<- seqinfo(Hsapiens)
start(bed_ss) <- start(bed_ss)+1 #readjust start
bed_ss <- sort(bed_ss)
bed_ss_rev <- bed_ss
strand(bed_ss_rev) <- "-"
names(bed_ss)<- 1:length(bed_ss)
peakAnno_p <- mergeByOverlaps(bed_ss, repdb[which(strand(repdb)=="+")])
peakAnno_m <- mergeByOverlaps(bed_ss_rev, repdb[which(strand(repdb)=="-")])
tmp_p <- data.frame(table(peakAnno_p$repClass)/length(bed_ss))
tmp_m <- data.frame(table(peakAnno_m$repClass)/length(bed_ss))
tmp_p$Var1 <- paste0(tmp_p$Var,"_p")
tmp_m$Var1 <- paste0(tmp_m$Var,"_m")
tmp <- rbind(tmp_p,tmp_m)
repdf <- cbind(repdf,tmp[,2])
}
colnames(repdf) <- c("repClass",df$token)
repdf2 <- rbind(c(colSums(repdf[c(17:21,13,1:2),-1])),
c(colSums(repdf[21+c(17:21,13,1:2),-1])),
c(colSums(repdf[c(15:16),-1])),
c(colSums(repdf[21+c(15:16),-1])),
c(colSums(repdf[c(14),-1])),
c(colSums(repdf[21+c(14),-1])),
c(colSums(repdf[c(12),-1])),
c(colSums(repdf[21+c(12),-1])),
c(colSums(repdf[c(11),-1])),
c(colSums(repdf[21+c(11),-1])),
c(colSums(repdf[c(6:7),-1])),
c(colSums(repdf[21+c(6:7),-1])),
c(colSums(repdf[c(5),-1])),
c(colSums(repdf[21+c(5),-1])),
c(colSums(repdf[c(4,3),-1])),
c(colSums(repdf[21+c(4,3),-1]))
)
repdf2 <- data.frame(repClass=c("other_p","other_m","SINE_p","SINE_m","Simple_p","Simple_m","Satellite_p","Satellite_m","rRNA_p","rRNA_m","LTR_p","LTR_m","Low_complexity_p","Low_complexity_m","LINE_p","LINE_m"),repdf2)
colnames(repdf2) <- c("repClass",df$token)
save(repdf2,file="objects/230413_rep_annotation.rdat")
}
if(!file.exists("objects/230413_rep_strand_annotation.rdat") &! file.exists("objects/230413_rep_timing_annotation.rdat")){
repltime <- read.table("data/GSE131417_K562_reptime_segmentation_recolored.bed")
replstrand <- rtracklayer::import("data/OK-seq_K562_BR1.bam.rfd.bw")
seqlevels(replstrand, pruning.mode="coarse") <- paste0("chr",c(1:22,"X","Y"))
seqinfo(replstrand)<- seqinfo(Hsapiens)
colnames(repltime)[1:4] <- c("seqnames","start","end","timing")
repltime <- GRanges(repltime)
seqlevels(repltime, pruning.mode="coarse") <- paste0("chr",c(1:22,"X","Y"))
seqinfo(repltime)<- seqinfo(Hsapiens)
early <- repltime[repltime$timing == "early"]
mid_early <- repltime[repltime$timing == "mid-early"]
mid_late <- repltime[repltime$timing == "mid-late"]
late <- repltime[repltime$timing == "late"]
plusrepl <- reduce(replstrand[replstrand$score>0])
minusrepl <- reduce(replstrand[replstrand$score<0])
neut<- reduce(replstrand[replstrand$score==0])
genedf <- data.frame(
row.names=c("early","mid_early","mid_late","late"))
genedf$feature <- c("early","mid_early","mid_late","late")
stranddf <- data.frame(
row.names=c("plus","minus","neutral"))
stranddf$feature <- c("plus","minus","neutral")
for (i in 1:length(df$token)){
cat(i,"\n")
tok <- df$token[i]
cat(tok,"\n")
bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
seqinfo(bed_ss)<- seqinfo(Hsapiens)
if(length(bed_ss)>50000){
bed_ss <- bed_ss[sample(length(bed_ss),50000)]
}
start(bed_ss) <- start(bed_ss)+1 #readjust start
bed_ss <- sort(bed_ss)
names(bed_ss)<- 1:length(bed_ss)
peakAnno_early <- bed_ss[bed_ss %over% early]
peakAnno_mid_early <- bed_ss[bed_ss %over% mid_early]
peakAnno_mid_late <- bed_ss[bed_ss %over% mid_late]
peakAnno_late <- bed_ss[bed_ss %over% late]
peakAnno_plus <- bed_ss[bed_ss %over% plusrepl]
peakAnno_minus <- bed_ss[bed_ss %over% minusrepl]
peakAnno_neut <- bed_ss[bed_ss %over% neut]
genedf <- cbind(genedf,c(length(peakAnno_early),length(peakAnno_mid_early),length(peakAnno_mid_late),length(peakAnno_late)))
stranddf <- cbind(stranddf,c(length(peakAnno_plus),length(peakAnno_minus),length(peakAnno_neut)))
}
colnames(genedf) <- c("class",df$token)
colnames(stranddf) <- c("class",df$token)
genedf[,2:602] <- genedf[,2:602]/colSums(genedf[,2:602])
stranddf[,2:602] <- t(t(stranddf[,2:602])/colSums(stranddf[,2:602]))
genedf2 <- genedf[,c("class",df$token[Ghm1])]
genedf2$class <- factor(genedf2$class,levels=genedf2$class)
stranddf2 <- stranddf[,c("class",df$token[Ghm1])]
save(genedf2,file="objects/230413_rep_timing_annotation.rdat")
save(stranddf2,file="objects/230413_rep_strand_annotation.rdat")
}
annot <- get(load("objects/230413_gene_annotation.rdat"))
annot <- data.frame(t(annot[,-602]))
annot$token <- rownames(annot)
df <- merge(df,annot, by="token")
annot <- get(load("objects/230413_gene_orientation_annotation.rdat"))
rownames(annot) <- annot$class
annot <- data.frame(annot[,-c(1)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)
df <- merge(df,annot, by="token", all=TRUE)
annot <- get(load("objects/230413_cds_orientation_annotation.rdat"))
annot <- data.frame(t(annot[,-1]))
annot$token <- rownames(annot)
colnames(annot)<- c("cds_p","cds_m","cds_other","token")
df <- merge(df,annot, by="token", all=TRUE)
annot <- get(load("objects/230413_HMM_annotation.rdat"))
rownames(annot) <- annot$ID
annot <- data.frame(annot[,-c(1:2)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)
df <- merge(df,annot, by="token", all=TRUE)
annot <- get(load("objects/230413_rep_annotation.rdat"))
rownames(annot) <- annot$repClass
annot <- data.frame(annot[,-c(1)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)
df <- merge(df,annot, by="token", all=TRUE)
annot <- get(load("objects/230413_rep_timing_annotation.rdat"))
rownames(annot) <- annot$class
annot <- data.frame(annot[,-c(1)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)
df <- merge(df,annot, by="token", all=TRUE)
annot <- get(load("objects/230413_rep_strand_annotation.rdat"))
rownames(annot) <- annot$class
annot <- data.frame(annot[,-c(1)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)
df <- merge(df,annot, by="token", all=TRUE)
save(df, file="objects/GROVER_annotation_df.rdat")
df$X1st.Exon_residuals <- resid(loess(df$X1st.Exon ~ df$GC, span = 1))
df$X1st.Intron_residuals <- resid(loess(df$X1st.Intron ~ df$GC, span = 1))
df$X3..UTR_residuals <- resid(loess(df$X3..UTR ~ df$GC, span = 1))
df$X5..UTR_residuals <- resid(loess(df$X5..UTR ~ df$GC, span = 1))
df$Distal.Intergenic_residuals <- resid(loess(df$Distal.Intergenic ~ df$GC, span = 1))
df$Downstream....300._residuals <- resid(loess(df$Downstream....300. ~ df$GC, span = 1))
df$Other.Exon_residuals <- resid(loess(df$Other.Exon ~ df$GC, span = 1))
df$Other.Intron_residuals <- resid(loess(df$Other.Intron ~ df$GC, span = 1))
df$Promoter_residuals <- resid(loess(df$Promoter ~ df$GC, span = 1))
df$X1_Active_Promoter_residuals <- resid(loess(df$X1_Active_Promoter ~ df$GC, span = 1))
df$X2_Weak_Promoter_residuals <- resid(loess(df$X2_Weak_Promoter ~ df$GC, span = 1))
df$X3_Poised_Promoter_residuals <- resid(loess(df$X3_Poised_Promoter ~ df$GC, span = 1))
df$X4_Strong_Enhancer_residuals <- resid(loess(df$X4_Strong_Enhancer ~ df$GC, span = 1))
df$X5_Strong_Enhancer_residuals <- resid(loess(df$X5_Strong_Enhancer ~ df$GC, span = 1))
df$X6_Weak_Enhancer_residuals <- resid(loess(df$X6_Weak_Enhancer ~ df$GC, span = 1))
df$X7_Weak_Enhancer_residuals <- resid(loess(df$X7_Weak_Enhancer ~ df$GC, span = 1))
df$X8_Insulator_residuals <- resid(loess(df$X8_Insulator ~ df$GC, span = 1))
df$X9_Txn_Transition_residuals <- resid(loess(df$X9_Txn_Transition ~ df$GC, span = 1))
df$X10_Txn_Elongation_residuals <- resid(loess(df$X10_Txn_Elongation ~ df$GC, span = 1))
df$X11_Weak_Txn_residuals <- resid(loess(df$X11_Weak_Txn ~ df$GC, span = 1))
df$X12_Repressed_residuals <- resid(loess(df$X12_Repressed ~ df$GC, span = 1))
df$X13_Heterochrom.lo_residuals <- resid(loess(df$X13_Heterochrom.lo ~ df$GC, span = 1))
df$X14_Repetitive.CNV_residuals <- resid(loess(df$X14_Repetitive.CNV ~ df$GC, span = 1))
df$X15_Repetitive.CNV_residuals <- resid(loess(df$X15_Repetitive.CNV ~ df$GC, span = 1))
df$gene_p_residuals <- resid(loess(df$gene_p ~ df$GC, span = 1))
df$gene_m_residuals <- resid(loess(df$gene_m ~ df$GC, span = 1))
df$intragenic_residuals <- resid(loess(df$intragenic ~ df$GC, span = 1))
df$early_residuals <- resid(loess(df$early ~ df$GC, span = 1))
df$mid_early_residuals <- resid(loess(df$mid_early ~ df$GC, span = 1))
df$mid_late_residuals <- resid(loess(df$mid_late ~ df$GC, span = 1))
df$late_residuals <- resid(loess(df$late ~ df$GC, span = 1))
df$cds_p_residuals <- resid(loess(df$cds_p ~ df$GC, span = 1))
df$cds_m_residuals <- resid(loess(df$cds_m ~ df$GC, span = 1))
df$cds_other_residuals <- resid(loess(df$cds_other ~ df$GC, span = 1))
# fix class for token length to numeric
df$length <- as.numeric(as.character(df$length))
tmp <- data.frame(Gpca$x)[,1:100]
tmp$token <- rownames(tmp)
rownames(df)<- df$token
df <- df %>% arrange(df,token)
tmp <- tmp[df$token,]
cors <- data.frame(nrow=20)
for(i in c(colnames(df)[c(18,19,24:26,91,86,85,83,89,84,90,88,87,107:108,114:115,92:106,60:75,110:113,80:81)])){
cors_tmp <- cor(tmp[,1:20],df[i],method="spearman")
cors <- cbind(cors,cors_tmp)
}
## Warning in cor(tmp[, 1:20], df[i], method = "spearman"): the standard deviation
## is zero
## Warning in cor(tmp[, 1:20], df[i], method = "spearman"): the standard deviation
## is zero
tmp <- data.frame(GWpca$x)[,1:100]
tmp$token <- rownames(tmp)
rownames(df)<- df$token
df <- df %>% arrange(df,token)
tmp <- tmp[df$token,]
corsW <- data.frame(nrow=20)
for(i in c(colnames(df)[c(18,19,24:26,91,86,85,83,89,84,90,88,87,107:108,114:115,92:106,60:75,110:113,80:81)])){
cors_tmp <- cor(tmp[,1:20],df[i],method="spearman")
corsW <- cbind(corsW,cors_tmp)
}
## Warning in cor(tmp[, 1:20], df[i], method = "spearman"): the standard deviation
## is zero
## Warning in cor(tmp[, 1:20], df[i], method = "spearman"): the standard deviation
## is zero
varexplplot(GWvarvec)
ph <- pheatmap(t(corsW[,-c(1,40,48)]),
breaks = seq(-1, 1, length.out = 100),
cluster_rows = FALSE,
cluster_cols = FALSE,
gaps_row = c(5,14,16,18,33,47,51),
color = colorRampPalette(c("navy", "white", "firebrick3"))(100)
)
print(ph)
varexplplot(Gvarvec)
ph <- pheatmap(t(cors[,-c(1,40,48)]),
breaks = seq(-1, 1, length.out = 100),
cluster_rows = FALSE,
cluster_cols = FALSE,
gaps_row = c(5,14,16,18,33,47,51),
color = colorRampPalette(c("navy", "white", "firebrick3"))(100)
)
print(ph)
ggplot(df, aes(x=GROVER_PC1, y=log10(frequency), colour=token))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
scale_x_continuous(name=paste("PC1 (",round(100*Gvarvec[1],2),"%)",sep=""))+
scale_y_continuous(name=paste("frequency (log10)",sep=""))+
theme_cowplot()+
theme(legend.position = "none")+
NULL
cor.test(df$GROVER_PC1,log10(df$frequency), method="spearman")
##
## Spearman's rank correlation rho
##
## data: df$GROVER_PC1 and log10(df$frequency)
## S = 4327716, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8803844
ggplot(df, aes(x=GROVER_PC2, y=GC, colour=token))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
scale_x_continuous(name=paste("PC2 (",round(100*Gvarvec[2],2),"%)",sep=""))+
scale_y_continuous(name=paste("GC [%]",sep=""))+
theme_cowplot()+
theme(legend.position = "none")+
NULL
cor.test(df$GROVER_PC2,df$GC, method="spearman")
## Warning in cor.test.default(df$GROVER_PC2, df$GC, method = "spearman"): Cannot
## compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: df$GROVER_PC2 and df$GC
## S = 70998078, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.9623462
ggplot(df, aes(x=GROVER_PC3, y=AG, colour=W2V_hex))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
scale_x_continuous(name=paste("PC3 (",round(100*Gvarvec[3],2),"%)",sep=""))+
scale_y_continuous(name=paste("AG [%]",sep=""))+
theme_cowplot()+
theme(legend.position = "none")+
NULL
cor.test(df$GROVER_PC3,df$AG,method="spearman")
## Warning in cor.test.default(df$GROVER_PC3, df$AG, method = "spearman"): Cannot
## compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: df$GROVER_PC3 and df$AG
## S = 2083581, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.942411
df$length <- as.numeric(df$length)
ggplot(df, aes(x=GROVER_PC4, y=length, colour=W2V_hex))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
#geom_label_repel(aes(label = token ))+
scale_x_continuous(name=paste("PC4 (",round(100*Gvarvec[4],2),"%)",sep=""))+
scale_y_continuous(name=paste("token length",sep=""))+
theme_cowplot()+
theme(legend.position = "none")+
NULL
cor.test(df$GROVER_PC4,df$length,method="spearman")
## Warning in cor.test.default(df$GROVER_PC4, df$length, method = "spearman"):
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: df$GROVER_PC4 and df$length
## S = 22032923, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.3910226
ggplot(df, aes(x=GROVER_PC5, y=A+C, colour=W2V_hex))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
scale_x_continuous(name=paste("PC5 (",round(100*Gvarvec[5],2),"%)",sep=""))+
scale_y_continuous(name=paste("AC",sep=""))+
theme_cowplot()+
theme(legend.position = "none")+
NULL
cor.test(df$GROVER_PC5,df$A+df$C,method="spearman")
## Warning in cor.test.default(df$GROVER_PC5, df$A + df$C, method = "spearman"):
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: df$GROVER_PC5 and df$A + df$C
## S = 65480318, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.8098385
ggplot(df, aes(x=GROVER_PC6, y=length, colour=W2V_hex))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
scale_x_continuous(name=paste("PC6 (",round(100*Gvarvec[6],2),"%)",sep=""))+
scale_y_continuous(name=paste("token length",sep=""))+
theme_cowplot()+
theme(legend.position = "none")+
NULL
cor.test(df$GROVER_PC6,df$length,method="spearman")
## Warning in cor.test.default(df$GROVER_PC6, df$length, method = "spearman"):
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: df$GROVER_PC6 and df$length
## S = 20756012, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.4263157
Clustering and annotation were performed separately in that the order after clustering was obtained and then used for the associated bar graphs. The results were then combined with Adobe Illustrator.
Gdist<- dist(Gemb,method = "euclidean")
Gdist <- as.matrix(Gdist)
Gdf <- data.frame(Gdist)
df$logfreq <- log10(df$frequency)
df$length <- as.factor(df$length)
annot <- df[,c("GROVER_W2V_PC1","GC","AG","length","logfreq")]
rownames(annot) <- df$token
annotcol <- list(length=mycolors,GROVER_W2V_PC1=hex, logfreq=viridis(100, option ="B",direction=-1))
breaks <- c(0, seq(min(Gdf[Gdf != 0]), max(Gdf), length.out = 99))
ph <- pheatmap(Gdf,
cluster_rows=TRUE,
cluster_cols=TRUE,
annotation_col=annot,
annotation_colors = annotcol,
annotation_row = NULL,
annotation_row_names = FALSE,
annotation_col_names = FALSE,
color= c("grey",viridis(100,direction=1)),
show_rownames = F,
show_colnames = F,
breaks = breaks)
Ghm1 <- ph$tree_col[["order"]]
load("objects/230413_rep_annotation.rdat")
melted <- melt(repdf2, id="repClass")
rep_cols <- c(brewer.pal(n = 8, name = "Dark2"),brewer.pal(n = 8, name = "Set2"))
repcol <- rep_cols[c(1,9,2,10,3,11,4,12,5,13,6,14,7,15,8,16)]
ggplot(melted,aes(x=variable,y=value,fill=repClass))+
geom_bar(stat="identity", width = 1)+
theme_cowplot()+
scale_fill_manual(values=repcol)+
coord_flip()+
NULL
HHMdf <- get(load("objects/230413_HMM_annotation.rdat"))
HHMcol <- c("#FF0000","#FF6969","#CF0BC6", "#FACA00", "#FACA01", "#FFFC04", "#FFFC05", "#0ABEFE", "#00B050", "#00B051", "#99FF66","#7F7F7F", "#F5F5F5", "#F5F5F6", "#F5F5F7")
HHMdf <- data.frame(HHMdf)
HHMdf[,3:603] <- t(t(HHMdf[,3:603])/colSums(HHMdf[,3:603]))
melted <- melt(HHMdf[,-2], id=c("ID"))
ggplot(melted,aes(x=variable,y=value,fill=ID))+
geom_bar(stat="identity", width = 1)+
theme_cowplot()+
scale_fill_manual(values=HHMcol)+
coord_flip()+
NULL
Clustering and annotation were performed separately in that the order after clustering was obtained and then used for the associated bar graphs. The results were then combined with Adobe Illustrator.
self_sim <- read.csv("data/self_similarity_per_token.csv")
ss <- self_sim[,3:14]
df <- merge(df, self_sim, id="token")
rownames(ss) <- self_sim$token
df$logfreq <- log10(df$frequency)
annot <- df[,c("logfreq","length","GROVER_W2V_PC1","GROVER_AUC")[4:1]]
annot$length <- factor(annot$length)
rownames(annot) <- df$token
annotcol <- list(length=mycolors,GROVER_W2V_PC1=hex, logfreq=viridis(100, option ="B",direction=-1))
phss <- pheatmap(ss,
cluster_cols=FALSE,
annotation_row = annot,
annotation_colors = annotcol,
show_rownames = F,
color= viridis(100,direction=1))
sshm <- phss$tree_row[["order"]]
token_order <- rownames(ss)[sshm]
HHMdf <- get(load("objects/230413_HMM_annotation.rdat"))
HHMdf <- data.frame(HHMdf)
HHMdf[,3:603] <- t(t(HHMdf[,3:603])/colSums(HHMdf[,3:603]))
HHMdf <- HHMdf[,c("ID",token_order)]
melted <- melt(HHMdf[,-2], id=c("ID"))
ggplot(melted,aes(x=variable,y=value,fill=ID))+
geom_bar(stat="identity", width = 1)+
theme_cowplot()+
scale_fill_manual(values=HHMcol)+
coord_flip()+
NULL
load("objects/230413_rep_annotation.rdat")
repdf2 <- repdf2[,c("repClass",token_order)]
melted <- melt(repdf2, id="repClass")
rep_cols <- c(brewer.pal(n = 8, name = "Dark2"),brewer.pal(n = 8, name = "Set2"))
repcol <- rep_cols[c(1,9,2,10,3,11,4,12,5,13,6,14,7,15,8,16)]
ggplot(melted,aes(x=variable,y=value,fill=repClass))+
geom_bar(stat="identity", width = 1)+
theme_cowplot()+
scale_fill_manual(values=repcol)+
coord_flip()+
NULL
emb <- read.table("data/umap_pca_50_cls_tokens.bed")
colnames(emb)<- c("chromosome","start","end","x","y")
emb$start <- emb$start+1
GR <- GRanges(emb)
seqinfo(GR)<- seqinfo(Hsapiens)
#annotate to repeats
repdb <- get(load("data/repdb_UCSC.Rdata"))
seqlevels(repdb)<- seqlevels(Hsapiens)
seqinfo(repdb)<- seqinfo(Hsapiens)
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 14 out-of-bound ranges located on sequences
## chrUn_gl000211, chrUn_gl000217, chrUn_gl000220, chrUn_gl000227,
## chrUn_gl000230, chrUn_gl000232, chrUn_gl000233, chrUn_gl000235,
## chrUn_gl000236, chrUn_gl000244, chrUn_gl000246, chrUn_gl000247,
## chr1_gl000191_random, and chr19_gl000208_random. Note that ranges
## located on a sequence whose length is unknown (NA) or on a circular
## sequence are not considered out-of-bound (use seqlengths() and
## isCircular() to get the lengths and circularity flags of the underlying
## sequences). You can use trim() to trim these ranges. See
## ?`trim,GenomicRanges-method` for more information.
rep_ss <- repdb[repdb$repClass %in% c("LINE", "LINE?") & strand(repdb) == "-"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[1], alpha=0.4)+
scale_x_continuous(limits=c(-4.411,20.5))+
scale_y_continuous(limits=c(-7.69,16.12))+
ggtitle("LINE-")+
NULL
rep_ss <- repdb[repdb$repClass %in% c("LINE", "LINE?") & strand(repdb) == "+"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[9], alpha=0.4)+
scale_x_continuous(limits=c(-4.411,20.5))+
scale_y_continuous(limits=c(-7.69,16.12))+
ggtitle("LINE+")+
NULL
rep_ss <- repdb[repdb$repClass %in% c("Low_complexity")]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[2], alpha=0.4)+
scale_x_continuous(limits=c(-4.411,20.5))+
scale_y_continuous(limits=c(-7.69,16.12))+
ggtitle("Low_complexity")+
NULL
rep_ss <- repdb[repdb$repClass %in% c("LTR","LTR?") & strand(repdb) == "-"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[3], alpha=0.4)+
scale_x_continuous(limits=c(-4.411,20.5))+
scale_y_continuous(limits=c(-7.69,16.12))+
ggtitle("LTR-")+
NULL
rep_ss <- repdb[repdb$repClass %in% c("LTR","LTR?") & strand(repdb) == "+"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[11], alpha=0.4)+
scale_x_continuous(limits=c(-4.411,20.5))+
scale_y_continuous(limits=c(-7.69,16.12))+
ggtitle("LTR+")+
NULL
rep_ss <- repdb[repdb$repClass %in% c("other","DNA","DNA?","RC","RNA","scRNA","snRNA","srpRNA","tRNA","Unknown","Unknown?") & strand(repdb) == "-"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[4], alpha=0.4)+
scale_x_continuous(limits=c(-4.411,20.5))+
scale_y_continuous(limits=c(-7.69,16.12))+
ggtitle("other-")+
NULL
rep_ss <- repdb[repdb$repClass %in% c("other","DNA","DNA?","RC","RNA","scRNA","snRNA","srpRNA","tRNA","Unknown","Unknown?") & strand(repdb) == "+"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[12], alpha=0.4)+
scale_x_continuous(limits=c(-4.411,20.5))+
scale_y_continuous(limits=c(-7.69,16.12))+
ggtitle("other+")+
NULL
rep_ss <- repdb[repdb$repClass %in% c("rRNA") & strand(repdb) == "-"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[5], alpha=0.4)+
scale_x_continuous(limits=c(-4.411,20.5))+
scale_y_continuous(limits=c(-7.69,16.12))+
ggtitle("rRNA-")+
NULL
rep_ss <- repdb[repdb$repClass %in% c("rRNA") & strand(repdb) == "+"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[13], alpha=0.4)+
scale_x_continuous(limits=c(-4.411,20.5))+
scale_y_continuous(limits=c(-7.69,16.12))+
ggtitle("rRNA+")+
NULL
rep_ss <- repdb[repdb$repClass %in% c("Satellite") & strand(repdb) == "-"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[6], alpha=0.4)+
scale_x_continuous(limits=c(-4.411,20.5))+
scale_y_continuous(limits=c(-7.69,16.12))+
ggtitle("Satellite-")+
NULL
rep_ss <- repdb[repdb$repClass %in% c("Satellite") & strand(repdb) == "+"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[14], alpha=0.4)+
scale_x_continuous(limits=c(-4.411,20.5))+
scale_y_continuous(limits=c(-7.69,16.12))+
ggtitle("Satellite+")+
NULL
rep_ss <- repdb[repdb$repClass %in% c("Simple_repeat") ]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[15], alpha=0.4)+
scale_x_continuous(limits=c(-4.411,20.5))+
scale_y_continuous(limits=c(-7.69,16.12))+
ggtitle("Simple")+
NULL
rep_ss <- repdb[repdb$repClass %in% c("SINE", "SINE?") & strand(repdb) == "-"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[8], alpha=0.4)+
scale_x_continuous(limits=c(-4.411,20.5))+
scale_y_continuous(limits=c(-7.69,16.12))+
ggtitle("SINE-")+
NULL
rep_ss <- repdb[repdb$repClass %in% c("SINE","SINE?") & strand(repdb) == "+"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[16], alpha=0.4)+
scale_x_continuous(limits=c(-4.411,20.5))+
scale_y_continuous(limits=c(-7.69,16.12))+
ggtitle("SINE+")+
NULL
HHMcol[13:15] <- "black"
for(i in c(1:15)){
GR2 <- HMM[HMM$ID == levels(HHMdf$ID)[i]]
dat <- GR[GR %over% GR2]
dat <- data.frame(elementMetadata(dat))
p1 <- ggplot(dat, aes(x=x, y=y))+
geom_point(colour=HHMcol[i], alpha=0.4)+
scale_x_continuous(limits=c(-4.411,20.5))+
scale_y_continuous(limits=c(-7.69,16.12))+
ggtitle(levels(HHMdf$ID)[i])+
NULL
print(p1)
}
Figures 7A,C,E are directly genome browser tracks from the IGV browser for illustration.
metr <- read.csv("data/fine_tuning_metrics.csv")
prom300 <- metr[c(5:11,16:18),2:7]
melted <- melt(prom300[4:10,], id=c("Dataset","Model"))
melted$Dataset <- factor(melted$Dataset,levels=c("BP shuffle","4mer-shuffle","5mer-shuffle","6mer-shuffle"))
ggplot(melted, aes(x=interaction(Dataset,Model), group=Dataset, y=value, fill=interaction(Dataset, Model)))+
geom_bar(stat="identity")+
scale_y_continuous(limits=c(0,1))+
scale_fill_manual(values=modelcols)+
facet_wrap(~variable,nrow=1)+
theme(axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
NULL
promscan <- metr[19:22,3:7]
melted <- melt(promscan)
## Using Model as id variables
ggplot(melted, aes(x=Model, y=value, fill=Model))+
geom_bar(stat="identity")+
scale_fill_manual(values=modelcols[c(1,2,4,6)])+
facet_wrap(~variable,nrow=1)+
theme(axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
NULL
CTCF <- metr[23:26,3:7]
melted <- melt(CTCF)
## Using Model as id variables
ggplot(melted, aes(x=Model, y=value, fill=Model))+
geom_bar(stat="identity")+
scale_fill_manual(values=modelcols[c(1,2,4,6)])+
facet_wrap(~variable,nrow=1)+
theme(axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
NULL
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 grid stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] clusterProfiler_4.6.0
## [2] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [3] GenomicFeatures_1.50.4
## [4] AnnotationDbi_1.60.0
## [5] Biobase_2.58.0
## [6] BSgenome.Hsapiens.UCSC.hg19_1.4.3
## [7] BSgenome_1.66.3
## [8] rtracklayer_1.58.0
## [9] Biostrings_2.66.0
## [10] XVector_0.38.0
## [11] GenomicRanges_1.50.2
## [12] GenomeInfoDb_1.34.9
## [13] IRanges_2.32.0
## [14] S4Vectors_0.36.1
## [15] BiocGenerics_0.44.0
## [16] wordcloud2_0.2.2
## [17] lsa_0.73.3
## [18] SnowballC_0.7.1
## [19] umap_0.2.10.0
## [20] ggrepel_0.9.3
## [21] ggfortify_0.4.15
## [22] migest_2.0.3
## [23] viridis_0.6.2
## [24] viridisLite_0.4.1
## [25] pheatmap_1.0.12
## [26] gridExtra_2.3
## [27] cowplot_1.1.1
## [28] reshape2_1.4.4
## [29] RColorBrewer_1.1-3
## [30] colorspace_2.1-0
## [31] pals_1.7
## [32] lubridate_1.9.2
## [33] forcats_1.0.0
## [34] stringr_1.5.0
## [35] dplyr_1.1.0
## [36] purrr_1.0.1
## [37] readr_2.1.4
## [38] tidyr_1.3.0
## [39] tibble_3.1.8
## [40] tidyverse_2.0.0
## [41] ggpubr_0.6.0
## [42] ggplot2_3.4.1
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.3 reticulate_1.28
## [3] tidyselect_1.2.0 RSQLite_2.3.0
## [5] htmlwidgets_1.6.2 BiocParallel_1.32.5
## [7] scatterpie_0.1.8 munsell_0.5.0
## [9] codetools_0.2-19 withr_2.5.0
## [11] GOSemSim_2.24.0 filelock_1.0.2
## [13] highr_0.10 knitr_1.43
## [15] rstudioapi_0.14 ggsignif_0.6.4
## [17] DOSE_3.24.2 labeling_0.4.2
## [19] MatrixGenerics_1.10.0 GenomeInfoDbData_1.2.9
## [21] polyclip_1.10-4 bit64_4.0.5
## [23] farver_2.1.1 downloader_0.4
## [25] treeio_1.22.0 vctrs_0.6.3
## [27] generics_0.1.3 gson_0.0.9
## [29] xfun_0.39 timechange_0.2.0
## [31] BiocFileCache_2.6.1 R6_2.5.1
## [33] graphlayouts_0.8.4 bitops_1.0-7
## [35] cachem_1.0.8 fgsea_1.24.0
## [37] gridGraphics_0.5-1 DelayedArray_0.24.0
## [39] assertthat_0.2.1 BiocIO_1.8.0
## [41] scales_1.2.1 ggraph_2.1.0
## [43] enrichplot_1.18.3 gtable_0.3.1
## [45] tidygraph_1.2.3 rlang_1.1.1
## [47] splines_4.2.1 lazyeval_0.2.2
## [49] rstatix_0.7.2 dichromat_2.0-0.1
## [51] broom_1.0.3 yaml_2.3.7
## [53] abind_1.4-5 backports_1.4.1
## [55] qvalue_2.30.0 tools_4.2.1
## [57] ggplotify_0.1.0 ellipsis_0.3.2
## [59] jquerylib_0.1.4 Rcpp_1.0.10
## [61] plyr_1.8.8 progress_1.2.2
## [63] zlibbioc_1.44.0 RCurl_1.98-1.10
## [65] prettyunits_1.1.1 openssl_2.0.5
## [67] SummarizedExperiment_1.28.0 magrittr_2.0.3
## [69] data.table_1.14.8 RSpectra_0.16-1
## [71] matrixStats_0.63.0 hms_1.1.2
## [73] patchwork_1.1.2 evaluate_0.21
## [75] HDO.db_0.99.1 XML_3.99-0.13
## [77] compiler_4.2.1 biomaRt_2.54.0
## [79] maps_3.4.1 shadowtext_0.1.2
## [81] crayon_1.5.2 htmltools_0.5.5
## [83] mgcv_1.8-41 ggfun_0.0.9
## [85] tzdb_0.3.0 aplot_0.1.9
## [87] DBI_1.1.3 tweenr_2.0.2
## [89] dbplyr_2.3.0 MASS_7.3-58.2
## [91] rappdirs_0.3.3 Matrix_1.5-3
## [93] car_3.1-1 cli_3.6.1
## [95] parallel_4.2.1 igraph_1.4.0
## [97] pkgconfig_2.0.3 GenomicAlignments_1.34.0
## [99] xml2_1.3.3 ggtree_3.6.2
## [101] bslib_0.5.0 yulab.utils_0.0.6
## [103] digest_0.6.31 rmarkdown_2.25
## [105] fastmatch_1.1-3 tidytree_0.4.2
## [107] restfulr_0.0.15 curl_5.0.0
## [109] Rsamtools_2.14.0 rjson_0.2.21
## [111] nlme_3.1-162 lifecycle_1.0.3
## [113] jsonlite_1.8.5 carData_3.0-5
## [115] mapproj_1.2.11 askpass_1.1
## [117] fansi_1.0.4 pillar_1.8.1
## [119] lattice_0.20-45 KEGGREST_1.38.0
## [121] fastmap_1.1.1 httr_1.4.5
## [123] GO.db_3.16.0 glue_1.6.2
## [125] png_0.1-8 bit_4.0.5
## [127] ggforce_0.4.1 stringi_1.7.12
## [129] sass_0.4.6 blob_1.2.3
## [131] memoise_2.0.1 ape_5.7