For issues and questions, please get in touch via arpoetsch@gmail.com
The paper that is associated with this script can be found at: Sanabria,
M., Hirsch, J., Joubert, P.M. et al. DNA language model GROVER learns
sequence context in the human genome. Nat Mach Intell (2024). https://doi.org/10.1038/s42256-024-00872-0 A tutorial on
how to use GROVER as a foundation model can be found at: https://doi.org/10.5281/zenodo.8373158. The pretrained
model can be found at: https://doi.org/10.5281/zenodo.8373116 and https://huggingface.co/PoetschLab/GROVER
The data for the tokenised genome are at: https://doi.org/10.5281/zenodo.8373052
The models and performance measurements were done in python and the code can be found in the corresponding document. For visualisation, data were imported into R. We provide the outputs that we used for visualisation directly to make things easier for reproduction. Some figures were recoloured and/or combined with Adobe Illustrator, so some figures may look a bit different from the publication.
Tokenisation was performed in Python. Visualisations are done in R with token clowds. The pickle file is converted into .rdat with
library(reticulate)
source_python(“src/pickle_reader.py”)
dat <- read_pickle_file(“data/all_vocabs_wg.pkl”)
# load tokens
load("data/all_vocabs_wg.rdat")
# make word clouds for cycle 1,2,3,23,38,121,356
c1 <- dat[[1]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)
wc["BPE_cycle"] <- 1
wctot <- wc
c1 <- dat[[2]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)
wc["BPE_cycle"] <- 2
wctot <- rbind(wctot,wc)
c1 <- dat[[3]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)
wc["BPE_cycle"] <- 3
wctot <- rbind(wctot,wc)
c1 <- dat[[4]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)
wc["BPE_cycle"] <- 4
wctot <- rbind(wctot,wc)
c1 <- dat[[24]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)
wc["BPE_cycle"] <- 24
wctot <- rbind(wctot,wc)
c1 <- dat[[39]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)
wc["BPE_cycle"] <- 39
wctot <- rbind(wctot,wc)
c1 <- dat[[122]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)
wc["BPE_cycle"] <- 122
wctot <- rbind(wctot,wc)
c1 <- dat[[357]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)
wc["BPE_cycle"] <- 357
wctot <- rbind(wctot,wc)
write_csv(wctot, file="fig1A.csv")
Performance metrices are imported from python.
perf <- read.csv("data/performance.csv", header = TRUE)
colnames(perf) <- c("2-mers", "3-mers","4-mers","5-mers","6-mers")
rownames(perf) <- c(100*c(1:10),1000*c(2:5))
perf$cycle <- rownames(perf)
perf$cycle <- factor(perf$cycle,levels=perf$cycle)
melted <- melt(perf, id="cycle")
colnames(melted)<-c("cycle","kmer","accuracy")
ggplot(data=melted, aes(x=cycle,y=accuracy,colour=kmer, fill=kmer, group=kmer))+
geom_point(size=3)+
geom_smooth()+
geom_segment(aes(x=10.5,xend=10.5,y=accuracy,yend=accuracy+0.5), linewidth=5, colour="white")+
facet_grid(facet="kmer",rows=5, scales="free")+
scale_colour_manual(values=viridis(n=5))+
scale_fill_manual(values=viridis(n=5))+
theme(strip.background = element_rect(fill = "white"))+
theme(axis.text.x = element_text(angle = 90))+
theme(legend.position="none")+
NULL
## Warning: The `facets` argument of `facet_grid()` is deprecated as of ggplot2 2.2.0.
## ℹ Please use the `rows` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
write_csv(perf,"fig2A.csv")
dat <- read.csv("data/240221_accuracy.csv")
dat[,2:6] <- apply(dat[,2:6],2,as.numeric)
dat$model <- factor(dat$model, levels=dat$model[c(2,4,5,6,7,1,3,8:13)])
dat_hs <- dat[c(1:13),]
write_csv(dat_hs,"fig2BC.csv")
melted <- melt(dat_hs, id="model")
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
ggtitle("DNA language models")+
scale_fill_manual(values=c(cols1,colsIDF))+
theme_cowplot()+
facet_wrap(~task, nrow=5, scales="free_y")+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1
dat <- read.csv("data/240221_precision.csv")
dat[,2:6] <- apply(dat[,2:6],2,as.numeric)
dat$model <- factor(dat$model, levels=dat$model[c(2,4,5,6,7,1,3,8:13)])
dat_hs <- dat[c(1:13),]
melted <- melt(dat_hs, id="model")
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
ggtitle("DNA language models")+
scale_fill_manual(values=c(cols1,colsIDF))+
theme_cowplot()+
facet_wrap(~task, nrow=5, scales="free_y")+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1
dat <- read.csv("data/240221_recall.csv")
dat[,2:6] <- apply(dat[,2:6],2,as.numeric)
dat$model <- factor(dat$model, levels=dat$model[c(2,4,5,6,7,1,3,8:13)])
dat_hs <- dat[c(1:13),]
melted <- melt(dat_hs, id="model")
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
ggtitle("DNA language models")+
scale_fill_manual(values=c(cols1,colsIDF))+
theme_cowplot()+
facet_wrap(~task, nrow=5, scales="free_y")+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1
dat <- read.csv("data/240221_f1.csv")
dat[,2:6] <- apply(dat[,2:6],2,as.numeric)
dat$model <- factor(dat$model, levels=dat$model[c(2,4,5,6,7,1,3,8:13)])
dat_hs <- dat[c(1:13),]
melted <- melt(dat_hs, id="model")
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
ggtitle("DNA language models")+
scale_fill_manual(values=c(cols1,colsIDF))+
theme_cowplot()+
facet_wrap(~task, nrow=5, scales="free_y")+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1
dat <- read.csv("data/240221_matthews_correlation.csv")
dat[,2:6] <- apply(dat[,2:6],2,as.numeric)
dat$model <- factor(dat$model, levels=dat$model[c(2,4,5,6,7,1,3,8:13)])
dat_hs <- dat[c(1:13),]
melted <- melt(dat_hs, id="model")
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
ggtitle("DNA language models")+
scale_fill_manual(values=c(cols1,colsIDF))+
theme_cowplot()+
facet_wrap(~task, nrow=5, scales="free_y")+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1
acc <- read.csv("data/top_k_accuracy.csv")
ggplot(acc[1:60,], aes(x=k,y=accuracy))+
geom_bar(stat="identity")+
NULL
perp <- read.csv("data/perplexity_GROVER_chr21.csv")
perp4 <- read.csv("data/perplexity_4mer_chr21.csv")
perp5 <- read.csv("data/perplexity_5mer_chr21.csv")
perp6 <- read.csv("data/perplexity_6mer_chr21.csv")
perp$performance <- perp$performance /601
perp4$performance <- perp4$performance /(4^4)
perp5$performance <- perp5$performance /(4^5)
perp6$performance <- perp6$performance /(4^6)
datf <- data.frame(model=c("GROVER","4mer","5mer","6mer"),
perplexity =c(min(perp$performance),min(perp4$performance),min(perp5$performance),min(perp6$performance)))
datf$model <- factor(datf$model, levels=datf$model)
ggplot(datf, aes(x=model,y=perplexity, fill=model))+
geom_bar(stat="identity")+
scale_fill_manual(values=modelcols[c(1,2,4,6)])+
NULL
load("data/all_vocabs_wg.rdat")
vocab600 <- dat[["600"]]
vocab600 <- t(data.frame(vocab600))
colnames(vocab600)<- "frequency"
vocab600 <- data.frame(vocab600)
vocab600$token <- rownames(vocab600)
Hseq <- getSeq(Hsapiens)
Hseq <- Hseq[names(Hseq) %in% paste0("chr",c(1:22,"X","Y"))]
kmers4 <- colSums(oligonucleotideFrequency(Hseq, 4))
kmers5 <- colSums(oligonucleotideFrequency(Hseq, 5))
kmers6 <- colSums(oligonucleotideFrequency(Hseq, 6))
kmers4 <- data.frame(kmers4)
kmers4$token <- rownames(kmers4)
colnames(kmers4)<- c("frequency","token")
kmers5 <- data.frame(kmers5)
kmers5$token <- rownames(kmers5)
colnames(kmers5)<- c("frequency","token")
kmers6 <- data.frame(kmers6)
kmers6$token <- rownames(kmers6)
colnames(kmers6)<- c("frequency","token")
Embeddings for the GROVER vocabulary from the BERT infrastructure and Word2Vec, UMAP and PCA.
#GROVER
Gemb <- get(load("data/vocab_embedding.rdat"))
rownames(Gemb) <- read.table("data/vocab.txt")[,1]
#W2V
GWemb <- get(load("data/iter600_w2v.rdat"))
rownames(GWemb) <- read.table("data/iter600_w2v_vocab.txt")[,1]
Gemb <- Gemb[-c(1:7,9),]
Gum <- umap(Gemb)
Gpca <- prcomp(Gemb)
GWum <- umap(GWemb)
GWpca <- prcomp(GWemb)
Collection of data in dataframes
df <- data.frame(Gum$layout)
df$token <- rownames(df)
colnames(df)<- c("G_UMAP1","G_UMAP2","token")
dfGW <- data.frame(GWum$layout)
dfGW$token <- rownames(dfGW)
colnames(dfGW) <- c("G_W2V_UMAP1","G_W2V_UMAP2","token")
df <- merge(df,dfGW,id="token")
df2 <- data.frame(Gpca$x)[,1:6]
df2$token <- rownames(df2)
colnames(df2) <- c("GROVER_PC1","GROVER_PC2","GROVER_PC3","GROVER_PC4","GROVER_PC5","GROVER_PC6","token")
df <- merge(df,df2,id="token")
df2 <- data.frame(GWpca$x)[,1:6]
df2$token <- rownames(df2)
colnames(df2) <- c("GROVER_W2V_PC1","GROVER_W2V_PC2","GROVER_W2V_PC3","GROVER_W2V_PC4","GROVER_W2V_PC5","GROVER_W2V_PC6","token")
df <- merge(df,df2,id="token")
df <- merge(df,vocab600, id="token")
df$length <- factor(nchar(as.character(df$token)))
df$C <- sapply(as.character(df$token),str_count, pattern=c("C"))/nchar(as.character(df$token))
df$G <- sapply(as.character(df$token),str_count, pattern=c("G"))/nchar(as.character(df$token))
df$A <- sapply(as.character(df$token),str_count, pattern=c("A"))/nchar(as.character(df$token))
df$T <- sapply(as.character(df$token),str_count, pattern=c("T"))/nchar(as.character(df$token))
df$GC <- df$C + df$G
df$AG <- df$A + df$G
df$AC <- df$A + df$C
hex=jet(nrow(df)) #colour selection
df$W2V_hex <- hex[rank(df$GROVER_W2V_PC1)]
w2v_tokencol <- df[,c("token","W2V_hex","GROVER_W2V_PC1")]
Compare frequency distribution between vocabularies
freqdf <- rbind(data.frame(model="GROVER",frequency=df$frequency),
data.frame(model= "4mer",frequency=kmers4$frequency/4),
data.frame(model= "5mer",frequency=kmers5$frequency/5),
data.frame(model= "6mer",frequency=kmers6$frequency/6)
)
freqdf$model <- factor(freqdf$model,levels=c("GROVER","4mer","5mer","6mer"))
write_csv(freqdf, "fig3a.csv")
ggplot(freqdf, aes(x = frequency, fill = model)) +
geom_density() + # Boxplot without outliers
scale_x_log10() + # Logarithmic y-axis scale
labs(x = "frequency", y = "density") + # Axis labels
scale_fill_manual(values=modelcols[c(1,2,4,6)])+
facet_grid(facet="model", rows=4)+
theme(strip.background = element_rect(fill = "white"))+
NULL
dftmp <- aggregate(frequency~length, df[,c("frequency","length")],sum)
dftmp$length <- as.numeric(as.character(dftmp$length))
levels(dftmp$length) <- 1:15
dftmp <- rbind(dftmp,data.frame(length=c(13:16),frequency=0))
## Warning in `[<-.factor`(`*tmp*`, ri, value = structure(c(1, 2, 3, 4, 5, :
## invalid factor level, NA generated
## Warning in `[<-.factor`(`*tmp*`, ri, value = structure(c(1, 2, 3, 4, 5, :
## invalid factor level, NA generated
write_csv(dftmp, "fig3b.csv")
ggplot(dftmp, aes(x=length,y=frequency, fill=length))+
geom_bar(stat="identity")+
scale_fill_manual(values=mycolors)+
scale_x_discrete("token length")+
scale_y_continuous("frequency")+
theme_cowplot()+
NULL
tab <- table(df$length)
tab["13"] <- 0
tab["14"] <- 0
tab["15"] <- 0
dfvoc <- data.frame(t(tab[c(1:12,14:16,13)]))[,2:3]
colnames(dfvoc) <- c("length","vocabulary")
dfvoc$frequency <- 4^c(1:16)
dfvoc$proportion <- dfvoc$vocabulary/dfvoc$frequency
#dfvoc$length<- factor(as.character(dfvoc$length),levels(dfvoc$length))
write_csv(dfvoc, "fig3cd.csv")
ggplot(dfvoc, aes(y=vocabulary,x=frequency, colour=length))+
geom_point(size=5)+
scale_colour_manual(values=mycolors)+
scale_x_log10("k-mer frequency")+
scale_y_log10("k-mers represented")+
theme_cowplot()+
NULL
## Warning in scale_y_log10("k-mers represented"): log-10 transformation
## introduced infinite values.
ggplot(dfvoc, aes(x=length,y=proportion, fill=length))+
geom_bar(stat="identity")+
scale_fill_manual(values=mycolors)+
scale_x_discrete("token length")+
scale_y_continuous("proportion of k-mers represented")+
theme_cowplot()+
NULL
repres_seq <- c(
sum(str_count(df$token, "A")),
sum(str_count(df$token, "C")),
sum(str_count(df$token, "G")),
sum(str_count(df$token, "T")))
genome_freq <- colSums(oligonucleotideFrequency(Hseq, 1))
first_letter_seq <- c(
sum(str_count(substr(df$token,1,1), "A")),
sum(str_count(substr(df$token,1,1), "C")),
sum(str_count(substr(df$token,1,1), "G")),
sum(str_count(substr(df$token,1,1), "T")))
freq_df <- data.frame(genome_frequency=genome_freq/sum(genome_freq),
representation = repres_seq/sum( repres_seq),
first_letter=first_letter_seq/sum(first_letter_seq))
freq_df$nucl <- factor(c("A","C","G","T"), levels=c("A","C","G","T"))
write_csv(freq_df, "fig3e.csv")
p1 <- ggplot(freq_df, aes(x = "", y = genome_frequency, fill = nucl)) +
geom_bar(stat = "identity", width = 1) +
scale_fill_manual(values=nucl_colours)+
coord_polar("y", start = 0, direction = -1) +
geom_text(aes(label = paste0(nucl,"\n",round(genome_frequency * 100, 2), "%")),
position = position_stack(vjust = 0.5))+
theme_void() +
labs(fill = "Genome Frequency")+
NULL
p2 <- ggplot(freq_df, aes(x = "", y = representation, fill = nucl)) +
geom_bar(stat = "identity", width = 1) +
scale_fill_manual(values=nucl_colours)+
coord_polar("y", start = 0, direction = -1) +
geom_text(aes(label = paste0(nucl,"\n",round(representation * 100, 2), "%")),
position = position_stack(vjust = 0.5))+
theme_void() +
labs(fill = "Representation")+
NULL
p3 <- ggplot(freq_df, aes(x = "", y = first_letter, fill = nucl)) +
geom_bar(stat = "identity", width = 1) +
scale_fill_manual(values=nucl_colours)+
coord_polar("y", start = 0, direction = -1) +
geom_text(aes(label = paste0(nucl,"\n",round(first_letter * 100, 2), "%")),
position = position_stack(vjust = 0.5))+
theme_void() +
labs(fill = "First Letter")+
NULL
p1+p2+p3
tokmetr <- read.csv("data/metrics_per_token.csv")
tokdf <- cbind(tokmetr[c(8,10:609),2:4])
colnames(tokdf) <- c("token","GROVER_AUC","GROVER_Acc")
df <- merge(df, tokdf, id="token")
write_csv(df, "fig3f.csv")
ggplot(df, aes(y=GROVER_AUC-0.5,x=factor(length), fill=factor(length)))+
geom_boxplot() +
scale_fill_manual(values=mycolors)+
NULL
ggplot(df, aes(y=GROVER_Acc,x=factor(length), fill=factor(length)))+
geom_boxplot() +
scale_fill_manual(values=mycolors)+
NULL
UMAPs and PCAs of the average embeddings of the vocabulary. Putting the Principal Components in relation to genome biology requires integration with additional data.
Geigs <- Gpca$sdev^2
Gvarvec <- Geigs/sum(Geigs)
GWeigs <- GWpca$sdev^2
GWvarvec <- GWeigs/sum(GWeigs)
write_csv(data.frame(GWvarvec), file="fig4b2.csv")
MEV <- c(GROVER=Gvarvec[1],GROVER_W2V=GWvarvec[1])
MEV <- data.frame(MEV)
MEV$model=rownames(MEV)
MEV$model <- factor(MEV$model,levels=MEV$model)
write_csv(MEV, "fig4a.csv")
ggplot(MEV[1:2,],aes(x=model,y=100*MEV))+
geom_bar(stat="identity", fill="slateblue")+
scale_y_continuous("Maximum Explained Variance [%]")+
theme_cowplot()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
NULL
Plotted are both the PCA and also the colour bar for the legend.
ggplot(df, aes(x=GROVER_W2V_PC1, y=GROVER_W2V_PC2, colour=token))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
scale_x_continuous(paste0("Word2Vec PC1 (",round(100*GWvarvec[1],2),"%)"))+
scale_y_continuous(paste0("Word2Vec PC2 (",round(100*GWvarvec[2],2),"%)"))+
#geom_label_repel(aes(label = token ))+
theme_cowplot()+
theme(legend.position = "none")+
NULL
image(matrix(1:601, nrow = 1), col = hex, axes = FALSE, xlab = "", ylab = "")
ggplot(df, aes(x=GROVER_W2V_PC1, y=GC, colour=token))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
scale_x_continuous(paste0("Word2Vec PC1 (",round(100*GWvarvec[1],2),"%)"))+
scale_y_continuous(paste0("GC content"))+
#geom_label_repel(aes(label = token ))+
theme_cowplot()+
theme(legend.position = "none")+
NULL
cor.test(df$GROVER_W2V_PC1,df$GC, method="spearman")
## Warning in cor.test.default(df$GROVER_W2V_PC1, df$GC, method = "spearman"):
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: df$GROVER_W2V_PC1 and df$GC
## S = 2688292, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.9256972
ggplot(df, aes(G_W2V_UMAP1, y=G_W2V_UMAP2, colour=token))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
theme_cowplot()+
theme(legend.position = "none")+
NULL
ggplot(df, aes(x=GROVER_PC1, y=GROVER_PC2, colour=token))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
scale_x_continuous(name=paste("PC1 (",round(100*Gvarvec[1],2),"%)",sep=""))+
scale_y_continuous(name=paste("PC2 (",round(100*Gvarvec[2],2),"%)",sep=""))+
theme_cowplot()+
theme(legend.position = "none")+
NULL
ggplot(df, aes(x=G_UMAP1, y=G_UMAP2, colour=token))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
theme_cowplot()+
theme(legend.position = "none")+
NULL
For Figure 4G and later figures, the proportion of tokens associated with features of genome biology is calculated.
dummydf <- data.frame(Feature= sort(c("Promoter","5' UTR","3' UTR","1st Exon","Other Exon","1st Intron","Other Intron","Downstream (<=300)","Distal Intergenic")))
if(!file.exists("objects/230413_gene_annotation.rdat")){
annotdf <- data.frame(
row.names=dummydf$Feature)
for (i in c(1:length(df$token))){
tok <- df$token[i]
cat(tok,"\n")
bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
if(length(bed_ss)>50000){
bed_ss <- bed_ss[sample(length(bed_ss),50000)]
}
seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
seqinfo(bed_ss)<- seqinfo(Hsapiens)
start(bed_ss) <- start(bed_ss)+1 #readjust start
bed_ss <- sort(bed_ss)
names(bed_ss)<- 1:length(bed_ss)
peakAnno <- annotatePeak(bed_ss, tssRegion=c(-1000, 1000),
TxDb=txdb, annoDb="org.Hs.eg.db")
tmp <- merge(dummydf,peakAnno@annoStat,by="Feature",all=TRUE)
annotdf <- cbind(annotdf,tmp[,2])
}
colnames(annotdf) <- df$token
annotdf$feature <- factor(rownames(annotdf),levels=rev(c("Promoter","5' UTR","3' UTR","1st Exon","Other Exon","1st Intron","Other Intron","Downstream (<=300)","Distal Intergenic")))
annotdf[is.na(annotdf)]<- 0
melted <- melt(annotdf)
save(annotdf,file="objects/230413_gene_annotation.rdat")
}
genes <- genes(txdb)
## 403 genes were dropped because they have exons located on both strands
## of the same reference sequence or on more than one reference sequence,
## so cannot be represented by a single genomic range.
## Use 'single.strand.genes.only=FALSE' to get all the genes in a
## GRangesList object, or use suppressMessages() to suppress this message.
genes_p <- genes[strand(genes) == "+"]
genes_m <- genes[strand(genes) == "-"]
strand(genes_m) <- "+"
genedf <- data.frame(
row.names=c("gene_p","gene_m","intragenic"))
genedf$feature <- c("gene_p","gene_m","intragenic")
if(!file.exists("objects/230413_gene_orientation_annotation.rdat")){
genedummy <- genedf
for (i in 1:length(df$token)){
cat(i,"\n")
tok <- df$token[i]
cat(tok,"\n")
bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
if(length(bed_ss)>50000){
bed_ss <- bed_ss[sample(length(bed_ss),50000)]
}
seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
seqinfo(bed_ss)<- seqinfo(Hsapiens)
start(bed_ss) <- start(bed_ss)+1 #readjust start
bed_ss <- sort(bed_ss)
bed_ss_rev <- bed_ss
names(bed_ss)<- 1:length(bed_ss)
peakAnno_p <- bed_ss[bed_ss %over% genes_p]
peakAnno_m <- bed_ss[bed_ss %over% genes_m]
peakAnno_o <- bed_ss[!bed_ss %over% c(peakAnno_p,peakAnno_m)]
genedf <- cbind(genedf,c(length(peakAnno_p),length(peakAnno_m),length(peakAnno_o)))
}
colnames(genedf) <- c("class",df$token)
genedf[,2:602] <- genedf[,2:602]/colSums(genedf[,2:602])
genedf2 <- genedf[,c("class",df$token)]
save(genedf2,file="230413_gene_orientation_annotation.rdat")
}
genes <- cds(txdb)
genes_p <- genes[strand(genes) == "+"]
genes_m <- genes[strand(genes) == "-"]
strand(genes_m) <- "+"
genedf <- data.frame(
row.names=c("cds_p","cds_m","other"))
genedf$feature <- c("cds_p","cds_m","other")
if(!file.exists("objects/230413_cds_orientation_annotation.rdat")){
for (i in 1:length(df$token)){
cat(i,"\n")
tok <- df$token[i]
cat(tok,"\n")
bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
if(length(bed_ss)>50000){
bed_ss <- bed_ss[sample(length(bed_ss),50000)]
}
seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
seqinfo(bed_ss)<- seqinfo(Hsapiens)
start(bed_ss) <- start(bed_ss)+1 #readjust start
bed_ss <- sort(bed_ss)
bed_ss_rev <- bed_ss
names(bed_ss)<- 1:length(bed_ss)
peakAnno_p <- bed_ss[bed_ss %over% genes_p]
peakAnno_m <- bed_ss[bed_ss %over% genes_m]
peakAnno_o <- bed_ss[!bed_ss %over% c(peakAnno_p,peakAnno_m)]
genedf <- cbind(genedf,c(length(peakAnno_p),length(peakAnno_m),length(peakAnno_o)))
}
colnames(genedf) <- c(df$token)
genedf <- cbind (class=rownames(genedf),genedf)
genedf[,2:602] <- genedf[,2:602]/colSums(genedf[,2:602])
genedf2 <- genedf[,c("class",df$token)]
save(genedf2,file="objects/230413_cds_orientation_annotation.rdat")
}
HMM <- read.table("data/GM12878_ChromHMM.bed")
HMM <- HMM[,c(2,3,4,5,10)]
colnames(HMM) <- c("seqnames","start","end","ID","colour")
HMM <- GRanges(HMM)
HMMdf <- data.frame(
row.names=names(table(HMM$ID)))
HMMdf$ID <- names(table(HMM$ID))
HMMdf <- unique(merge(HMMdf,elementMetadata(HMM)[,c("ID","colour")], id="ID", all.x=TRUE, all.y=FALSE))
HMMdf$ID <- factor(HMMdf$ID, levels=sort(HMMdf$ID)[c(1,8:15,2:7)])
HMMcol <- HMMdf$colour[c(1,8:15,2:7)]
names(HMMcol)[c(1,8:15,2:7)]<- as.character(HMMdf$ID)
HMMdummy <- HMMdf
if(!file.exists("objects/230413_HMM_annotation.rdat")){
for (i in c(1:length(df$token))){
cat(i,"\n")
tok <- df$token[i]
cat(tok,"\n")
bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
if(length(bed_ss)>50000){
bed_ss <- bed_ss[sample(length(bed_ss),50000)]
}
seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
seqinfo(bed_ss)<- seqinfo(Hsapiens)
start(bed_ss) <- start(bed_ss)+1 #readjust start
bed_ss <- sort(bed_ss)
names(bed_ss)<- 1:length(bed_ss)
peakAnno <- mergeByOverlaps(bed_ss, HMM)
tmp <- data.frame(table(peakAnno$ID)/length(bed_ss))
tmp <- merge(HMMdummy,tmp,by.x="ID",by.y="Var1",all=TRUE)
HMMdf <- cbind(HMMdf,tmp[,3])
}
colnames(HMMdf) <- c("ID","colour",df$token)
HHMdf <- HHMdf[c(1,8:15,2:7),]
HHMdf$ID <- factor(HHMdf$ID,levels=HHMdf$ID)
HHMdf$colour <-c("#FF0000","#FF6969","#CF0BC6", "#FACA00", "#FACA01", "#FFFC04", "#FFFC05", "#0ABEFE", "#00B050", "#00B051", "#99FF66","#7F7F7F", "#F5F5F5", "#F5F5F6", "#F5F5F7")
save(HMMdf,file="objects/230413_HMM_annotation.rdat")
}
if(!file.exists("objects/230413_rep_annotation.rdat")){
repdb <- get(load("data/repdb_UCSC.Rdata"))
repdf <- data.frame(
row.names=names(table(repdb$repClass)))
repdf$repClass <- names(table(repdb$repClass))
repdfdummy <- repdf
repdfdummy$repClass <- paste0(repdfdummy$repClass,"_p")
repdfdummy2 <- repdf
repdfdummy2$repClass <- paste0(repdfdummy2$repClass,"_m")
repdfdummy <- rbind(repdfdummy,repdfdummy2)
repdf <- repdfdummy
for (i in 1:length(df$token)){
cat(i,"\n")
tok <- df$token[i]
cat(tok,"\n")
bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
if(length(bed_ss)>50000){
bed_ss <- bed_ss[sample(length(bed_ss),50000)]
}
seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
seqinfo(bed_ss)<- seqinfo(Hsapiens)
start(bed_ss) <- start(bed_ss)+1 #readjust start
bed_ss <- sort(bed_ss)
bed_ss_rev <- bed_ss
strand(bed_ss_rev) <- "-"
names(bed_ss)<- 1:length(bed_ss)
peakAnno_p <- mergeByOverlaps(bed_ss, repdb[which(strand(repdb)=="+")])
peakAnno_m <- mergeByOverlaps(bed_ss_rev, repdb[which(strand(repdb)=="-")])
tmp_p <- data.frame(table(peakAnno_p$repClass)/length(bed_ss))
tmp_m <- data.frame(table(peakAnno_m$repClass)/length(bed_ss))
tmp_p$Var1 <- paste0(tmp_p$Var,"_p")
tmp_m$Var1 <- paste0(tmp_m$Var,"_m")
tmp <- rbind(tmp_p,tmp_m)
repdf <- cbind(repdf,tmp[,2])
}
colnames(repdf) <- c("repClass",df$token)
repdf2 <- rbind(c(colSums(repdf[c(17:21,13,1:2),-1])),
c(colSums(repdf[21+c(17:21,13,1:2),-1])),
c(colSums(repdf[c(15:16),-1])),
c(colSums(repdf[21+c(15:16),-1])),
c(colSums(repdf[c(14),-1])),
c(colSums(repdf[21+c(14),-1])),
c(colSums(repdf[c(12),-1])),
c(colSums(repdf[21+c(12),-1])),
c(colSums(repdf[c(11),-1])),
c(colSums(repdf[21+c(11),-1])),
c(colSums(repdf[c(6:7),-1])),
c(colSums(repdf[21+c(6:7),-1])),
c(colSums(repdf[c(5),-1])),
c(colSums(repdf[21+c(5),-1])),
c(colSums(repdf[c(4,3),-1])),
c(colSums(repdf[21+c(4,3),-1]))
)
repdf2 <- data.frame(repClass=c("other_p","other_m","SINE_p","SINE_m","Simple_p","Simple_m","Satellite_p","Satellite_m","rRNA_p","rRNA_m","LTR_p","LTR_m","Low_complexity_p","Low_complexity_m","LINE_p","LINE_m"),repdf2)
colnames(repdf2) <- c("repClass",df$token)
save(repdf2,file="objects/230413_rep_annotation.rdat")
}
if(!file.exists("objects/230413_rep_strand_annotation.rdat") &! file.exists("objects/230413_rep_timing_annotation.rdat")){
repltime <- read.table("data/GSE131417_K562_reptime_segmentation_recolored.bed")
replstrand <- rtracklayer::import("data/OK-seq_K562_BR1.bam.rfd.bw")
seqlevels(replstrand, pruning.mode="coarse") <- paste0("chr",c(1:22,"X","Y"))
seqinfo(replstrand)<- seqinfo(Hsapiens)
colnames(repltime)[1:4] <- c("seqnames","start","end","timing")
repltime <- GRanges(repltime)
seqlevels(repltime, pruning.mode="coarse") <- paste0("chr",c(1:22,"X","Y"))
seqinfo(repltime)<- seqinfo(Hsapiens)
early <- repltime[repltime$timing == "early"]
mid_early <- repltime[repltime$timing == "mid-early"]
mid_late <- repltime[repltime$timing == "mid-late"]
late <- repltime[repltime$timing == "late"]
plusrepl <- reduce(replstrand[replstrand$score>0])
minusrepl <- reduce(replstrand[replstrand$score<0])
neut<- reduce(replstrand[replstrand$score==0])
genedf <- data.frame(
row.names=c("early","mid_early","mid_late","late"))
genedf$feature <- c("early","mid_early","mid_late","late")
stranddf <- data.frame(
row.names=c("plus","minus","neutral"))
stranddf$feature <- c("plus","minus","neutral")
for (i in 1:length(df$token)){
cat(i,"\n")
tok <- df$token[i]
cat(tok,"\n")
bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
seqinfo(bed_ss)<- seqinfo(Hsapiens)
if(length(bed_ss)>50000){
bed_ss <- bed_ss[sample(length(bed_ss),50000)]
}
start(bed_ss) <- start(bed_ss)+1 #readjust start
bed_ss <- sort(bed_ss)
names(bed_ss)<- 1:length(bed_ss)
peakAnno_early <- bed_ss[bed_ss %over% early]
peakAnno_mid_early <- bed_ss[bed_ss %over% mid_early]
peakAnno_mid_late <- bed_ss[bed_ss %over% mid_late]
peakAnno_late <- bed_ss[bed_ss %over% late]
peakAnno_plus <- bed_ss[bed_ss %over% plusrepl]
peakAnno_minus <- bed_ss[bed_ss %over% minusrepl]
peakAnno_neut <- bed_ss[bed_ss %over% neut]
genedf <- cbind(genedf,c(length(peakAnno_early),length(peakAnno_mid_early),length(peakAnno_mid_late),length(peakAnno_late)))
stranddf <- cbind(stranddf,c(length(peakAnno_plus),length(peakAnno_minus),length(peakAnno_neut)))
}
colnames(genedf) <- c("class",df$token)
colnames(stranddf) <- c("class",df$token)
genedf[,2:602] <- genedf[,2:602]/colSums(genedf[,2:602])
stranddf[,2:602] <- t(t(stranddf[,2:602])/colSums(stranddf[,2:602]))
genedf2 <- genedf[,c("class",df$token[Ghm1])]
genedf2$class <- factor(genedf2$class,levels=genedf2$class)
stranddf2 <- stranddf[,c("class",df$token[Ghm1])]
save(genedf2,file="objects/230413_rep_timing_annotation.rdat")
save(stranddf2,file="objects/230413_rep_strand_annotation.rdat")
}
annot <- get(load("objects/230413_gene_annotation.rdat"))
annot <- data.frame(t(annot[,-602]))
annot$token <- rownames(annot)
df <- merge(df,annot, by="token")
annot <- get(load("objects/230413_gene_orientation_annotation.rdat"))
rownames(annot) <- annot$class
annot <- data.frame(annot[,-c(1)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)
df <- merge(df,annot, by="token", all=TRUE)
annot <- get(load("objects/230413_cds_orientation_annotation.rdat"))
annot <- data.frame(t(annot[,-1]))
annot$token <- rownames(annot)
colnames(annot)<- c("cds_p","cds_m","cds_other","token")
df <- merge(df,annot, by="token", all=TRUE)
annot <- get(load("objects/230413_HMM_annotation.rdat"))
rownames(annot) <- annot$ID
annot <- data.frame(annot[,-c(1:2)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)
df <- merge(df,annot, by="token", all=TRUE)
annot <- get(load("objects/230413_rep_annotation.rdat"))
rownames(annot) <- annot$repClass
annot <- data.frame(annot[,-c(1)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)
df <- merge(df,annot, by="token", all=TRUE)
annot <- get(load("objects/230413_rep_timing_annotation.rdat"))
rownames(annot) <- annot$class
annot <- data.frame(annot[,-c(1)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)
df <- merge(df,annot, by="token", all=TRUE)
annot <- get(load("objects/230413_rep_strand_annotation.rdat"))
rownames(annot) <- annot$class
annot <- data.frame(annot[,-c(1)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)
df <- merge(df,annot, by="token", all=TRUE)
save(df, file="objects/GROVER_annotation_df.rdat")
df$X1st.Exon_residuals <- resid(loess(df$X1st.Exon ~ df$GC, span = 1))
df$X1st.Intron_residuals <- resid(loess(df$X1st.Intron ~ df$GC, span = 1))
df$X3..UTR_residuals <- resid(loess(df$X3..UTR ~ df$GC, span = 1))
df$X5..UTR_residuals <- resid(loess(df$X5..UTR ~ df$GC, span = 1))
df$Distal.Intergenic_residuals <- resid(loess(df$Distal.Intergenic ~ df$GC, span = 1))
df$Downstream....300._residuals <- resid(loess(df$Downstream....300. ~ df$GC, span = 1))
df$Other.Exon_residuals <- resid(loess(df$Other.Exon ~ df$GC, span = 1))
df$Other.Intron_residuals <- resid(loess(df$Other.Intron ~ df$GC, span = 1))
df$Promoter_residuals <- resid(loess(df$Promoter ~ df$GC, span = 1))
df$X1_Active_Promoter_residuals <- resid(loess(df$X1_Active_Promoter ~ df$GC, span = 1))
df$X2_Weak_Promoter_residuals <- resid(loess(df$X2_Weak_Promoter ~ df$GC, span = 1))
df$X3_Poised_Promoter_residuals <- resid(loess(df$X3_Poised_Promoter ~ df$GC, span = 1))
df$X4_Strong_Enhancer_residuals <- resid(loess(df$X4_Strong_Enhancer ~ df$GC, span = 1))
df$X5_Strong_Enhancer_residuals <- resid(loess(df$X5_Strong_Enhancer ~ df$GC, span = 1))
df$X6_Weak_Enhancer_residuals <- resid(loess(df$X6_Weak_Enhancer ~ df$GC, span = 1))
df$X7_Weak_Enhancer_residuals <- resid(loess(df$X7_Weak_Enhancer ~ df$GC, span = 1))
df$X8_Insulator_residuals <- resid(loess(df$X8_Insulator ~ df$GC, span = 1))
df$X9_Txn_Transition_residuals <- resid(loess(df$X9_Txn_Transition ~ df$GC, span = 1))
df$X10_Txn_Elongation_residuals <- resid(loess(df$X10_Txn_Elongation ~ df$GC, span = 1))
df$X11_Weak_Txn_residuals <- resid(loess(df$X11_Weak_Txn ~ df$GC, span = 1))
df$X12_Repressed_residuals <- resid(loess(df$X12_Repressed ~ df$GC, span = 1))
df$X13_Heterochrom.lo_residuals <- resid(loess(df$X13_Heterochrom.lo ~ df$GC, span = 1))
df$X14_Repetitive.CNV_residuals <- resid(loess(df$X14_Repetitive.CNV ~ df$GC, span = 1))
df$X15_Repetitive.CNV_residuals <- resid(loess(df$X15_Repetitive.CNV ~ df$GC, span = 1))
df$gene_p_residuals <- resid(loess(df$gene_p ~ df$GC, span = 1))
df$gene_m_residuals <- resid(loess(df$gene_m ~ df$GC, span = 1))
df$intragenic_residuals <- resid(loess(df$intragenic ~ df$GC, span = 1))
df$early_residuals <- resid(loess(df$early ~ df$GC, span = 1))
df$mid_early_residuals <- resid(loess(df$mid_early ~ df$GC, span = 1))
df$mid_late_residuals <- resid(loess(df$mid_late ~ df$GC, span = 1))
df$late_residuals <- resid(loess(df$late ~ df$GC, span = 1))
df$cds_p_residuals <- resid(loess(df$cds_p ~ df$GC, span = 1))
df$cds_m_residuals <- resid(loess(df$cds_m ~ df$GC, span = 1))
df$cds_other_residuals <- resid(loess(df$cds_other ~ df$GC, span = 1))
# fix class for token length to numeric
df$length <- as.numeric(as.character(df$length))
tmp <- data.frame(Gpca$x)[,1:100]
tmp$token <- rownames(tmp)
rownames(df)<- df$token
df <- df %>% arrange(df,token)
tmp <- tmp[df$token,]
cors <- data.frame(nrow=20)
for(i in c(colnames(df)[c(18,19,24:26,91,86,85,83,89,84,90,88,87,107:108,114:115,92:106,60:75,110:113,80:81)])){
cors_tmp <- cor(tmp[,1:20],df[i],method="spearman")
cors <- cbind(cors,cors_tmp)
}
## Warning in cor(tmp[, 1:20], df[i], method = "spearman"): the standard deviation
## is zero
## Warning in cor(tmp[, 1:20], df[i], method = "spearman"): the standard deviation
## is zero
tmp <- data.frame(GWpca$x)[,1:100]
tmp$token <- rownames(tmp)
rownames(df)<- df$token
df <- df %>% arrange(df,token)
tmp <- tmp[df$token,]
corsW <- data.frame(nrow=20)
for(i in c(colnames(df)[c(18,19,24:26,91,86,85,83,89,84,90,88,87,107:108,114:115,92:106,60:75,110:113,80:81)])){
cors_tmp <- cor(tmp[,1:20],df[i],method="spearman")
corsW <- cbind(corsW,cors_tmp)
}
## Warning in cor(tmp[, 1:20], df[i], method = "spearman"): the standard deviation
## is zero
## Warning in cor(tmp[, 1:20], df[i], method = "spearman"): the standard deviation
## is zero
varexplplot(GWvarvec)
tab <- corsW[,-c(1,40,48)]
tab$varexpl <- GWvarvec[1:20]
write.csv(tab, file="fig4g.csv")
ph <- pheatmap(t(corsW[,-c(1,40,48)]),
breaks = seq(-1, 1, length.out = 100),
cluster_rows = FALSE,
cluster_cols = FALSE,
gaps_row = c(5,14,16,18,33,47,51),
color = colorRampPalette(c("navy", "white", "firebrick3"))(100)
)
print(ph)
varexplplot(Gvarvec)
tab <- cors[,-c(1,40,48)]
tab$varexpl <- Gvarvec[1:20]
write.csv(tab, file="fig4h.csv")
ph <- pheatmap(t(cors[,-c(1,40,48)]),
breaks = seq(-1, 1, length.out = 100),
cluster_rows = FALSE,
cluster_cols = FALSE,
gaps_row = c(5,14,16,18,33,47,51),
color = colorRampPalette(c("navy", "white", "firebrick3"))(100)
)
print(ph)
ggplot(df, aes(x=GROVER_PC1, y=log10(frequency), colour=token))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
scale_x_continuous(name=paste("PC1 (",round(100*Gvarvec[1],2),"%)",sep=""))+
scale_y_continuous(name=paste("frequency (log10)",sep=""))+
theme_cowplot()+
theme(legend.position = "none")+
NULL
cor.test(df$GROVER_PC1,log10(df$frequency), method="spearman")
##
## Spearman's rank correlation rho
##
## data: df$GROVER_PC1 and log10(df$frequency)
## S = 4327716, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8803844
ggplot(df, aes(x=GROVER_PC2, y=GC, colour=token))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
scale_x_continuous(name=paste("PC2 (",round(100*Gvarvec[2],2),"%)",sep=""))+
scale_y_continuous(name=paste("GC [%]",sep=""))+
theme_cowplot()+
theme(legend.position = "none")+
NULL
cor.test(df$GROVER_PC2,df$GC, method="spearman")
## Warning in cor.test.default(df$GROVER_PC2, df$GC, method = "spearman"): Cannot
## compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: df$GROVER_PC2 and df$GC
## S = 70998078, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.9623462
ggplot(df, aes(x=GROVER_PC3, y=AG, colour=W2V_hex))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
scale_x_continuous(name=paste("PC3 (",round(100*Gvarvec[3],2),"%)",sep=""))+
scale_y_continuous(name=paste("AG [%]",sep=""))+
theme_cowplot()+
theme(legend.position = "none")+
NULL
cor.test(df$GROVER_PC3,df$AG,method="spearman")
## Warning in cor.test.default(df$GROVER_PC3, df$AG, method = "spearman"): Cannot
## compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: df$GROVER_PC3 and df$AG
## S = 2083581, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.942411
df$length <- as.numeric(df$length)
ggplot(df, aes(x=GROVER_PC4, y=length, colour=W2V_hex))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
#geom_label_repel(aes(label = token ))+
scale_x_continuous(name=paste("PC4 (",round(100*Gvarvec[4],2),"%)",sep=""))+
scale_y_continuous(name=paste("token length",sep=""))+
theme_cowplot()+
theme(legend.position = "none")+
NULL
cor.test(df$GROVER_PC4,df$length,method="spearman")
## Warning in cor.test.default(df$GROVER_PC4, df$length, method = "spearman"):
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: df$GROVER_PC4 and df$length
## S = 22032923, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.3910226
ggplot(df, aes(x=GROVER_PC5, y=A+C, colour=W2V_hex))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
scale_x_continuous(name=paste("PC5 (",round(100*Gvarvec[5],2),"%)",sep=""))+
scale_y_continuous(name=paste("AC",sep=""))+
theme_cowplot()+
theme(legend.position = "none")+
NULL
cor.test(df$GROVER_PC5,df$A+df$C,method="spearman")
## Warning in cor.test.default(df$GROVER_PC5, df$A + df$C, method = "spearman"):
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: df$GROVER_PC5 and df$A + df$C
## S = 65480318, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.8098385
ggplot(df, aes(x=GROVER_PC6, y=length, colour=W2V_hex))+
geom_point()+
scale_colour_manual(values=df$W2V_hex)+
scale_x_continuous(name=paste("PC6 (",round(100*Gvarvec[6],2),"%)",sep=""))+
scale_y_continuous(name=paste("token length",sep=""))+
theme_cowplot()+
theme(legend.position = "none")+
NULL
cor.test(df$GROVER_PC6,df$length,method="spearman")
## Warning in cor.test.default(df$GROVER_PC6, df$length, method = "spearman"):
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: df$GROVER_PC6 and df$length
## S = 20756012, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.4263157
Average GROVER token embeddings cluster by token characteristics and
annotation
Clustering and annotation were performed separately in that the order
after clustering was obtained and then used for the associated bar
graphs. The results were then combined with Adobe Illustrator.
Gdist<- dist(Gemb,method = "euclidean")
Gdist <- as.matrix(Gdist)
Gdf <- data.frame(Gdist)
df$logfreq <- log10(df$frequency)
df$length <- as.factor(df$length)
annot <- df[,c("GROVER_W2V_PC1","GC","AG","length","logfreq")]
rownames(annot) <- df$token
annotcol <- list(length=mycolors,GROVER_W2V_PC1=hex, logfreq=viridis(100, option ="B",direction=-1))
breaks <- c(0, seq(min(Gdf[Gdf != 0]), max(Gdf), length.out = 99))
write.csv(Gdf, file="figS2_hm.csv")
write.csv(annot, file="figS2_annot.csv")
ph <- pheatmap(Gdf,
cluster_rows=TRUE,
cluster_cols=TRUE,
annotation_col=annot,
annotation_colors = annotcol,
annotation_row = NULL,
annotation_row_names = FALSE,
annotation_col_names = FALSE,
color= c("grey",viridis(100,direction=1)),
show_rownames = F,
show_colnames = F,
breaks = breaks)
Ghm1 <- ph$tree_col[["order"]]
load("objects/230413_rep_annotation.rdat")
melted <- melt(repdf2, id="repClass")
write.csv(melted, file="figS2_repeats.csv")
rep_cols <- c(brewer.pal(n = 8, name = "Dark2"),brewer.pal(n = 8, name = "Set2"))
repcol <- rep_cols[c(1,9,2,10,3,11,4,12,5,13,6,14,7,15,8,16)]
ggplot(melted,aes(x=variable,y=value,fill=repClass))+
geom_bar(stat="identity", width = 1)+
theme_cowplot()+
scale_fill_manual(values=repcol)+
coord_flip()+
NULL
HHMdf <- get(load("objects/230413_HMM_annotation.rdat"))
HHMcol <- c("#FF0000","#FF6969","#CF0BC6", "#FACA00", "#FACA01", "#FFFC04", "#FFFC05", "#0ABEFE", "#00B050", "#00B051", "#99FF66","#7F7F7F", "#F5F5F5", "#F5F5F6", "#F5F5F7")
HHMdf <- data.frame(HHMdf)
HHMdf[,3:603] <- t(t(HHMdf[,3:603])/colSums(HHMdf[,3:603]))
melted <- melt(HHMdf[,-2], id=c("ID"))
write.csv(melted, file="figS2_chromatin.csv")
ggplot(melted,aes(x=variable,y=value,fill=ID))+
geom_bar(stat="identity", width = 1)+
theme_cowplot()+
scale_fill_manual(values=HHMcol)+
coord_flip()+
NULL
Clustering and annotation were performed separately in that the order after clustering was obtained and then used for the associated bar graphs. The results were then combined with Adobe Illustrator.
self_sim <- read.csv("data/self_similarity_per_token.csv")
ss <- self_sim[,3:14]
df <- merge(df, self_sim, id="token")
rownames(ss) <- self_sim$token
df$logfreq <- log10(df$frequency)
annot <- df[,c("logfreq","length","GROVER_W2V_PC1","GROVER_AUC")[4:1]]
annot$length <- factor(annot$length)
rownames(annot) <- df$token
annotcol <- list(length=mycolors,GROVER_W2V_PC1=hex, logfreq=viridis(100, option ="B",direction=-1))
write.csv(annot, file="fig5A_annot.csv")
write.csv(ss, file="fig5A_heatmap.csv")
phss <- pheatmap(ss,
cluster_cols=FALSE,
annotation_row = annot,
annotation_colors = annotcol,
show_rownames = F,
color= viridis(100,direction=1))
sshm <- phss$tree_row[["order"]]
token_order <- rownames(ss)[sshm]
HHMdf <- get(load("objects/230413_HMM_annotation.rdat"))
HHMdf <- data.frame(HHMdf)
HHMdf[,3:603] <- t(t(HHMdf[,3:603])/colSums(HHMdf[,3:603]))
HHMdf <- HHMdf[,c("ID",token_order)]
melted <- melt(HHMdf[,-2], id=c("ID"))
ggplot(melted,aes(x=variable,y=value,fill=ID))+
geom_bar(stat="identity", width = 1)+
theme_cowplot()+
scale_fill_manual(values=HHMcol)+
coord_flip()+
NULL
load("objects/230413_rep_annotation.rdat")
repdf2 <- repdf2[,c("repClass",token_order)]
melted <- melt(repdf2, id="repClass")
rep_cols <- c(brewer.pal(n = 8, name = "Dark2"),brewer.pal(n = 8, name = "Set2"))
repcol <- rep_cols[c(1,9,2,10,3,11,4,12,5,13,6,14,7,15,8,16)]
ggplot(melted,aes(x=variable,y=value,fill=repClass))+
geom_bar(stat="identity", width = 1)+
theme_cowplot()+
scale_fill_manual(values=repcol)+
coord_flip()+
NULL
emb <- read.table("data/umap/cls_tokens_n_neighbors_200_min_dist_0.1_metric_euclidean.bed")
colnames(emb)<- c("chromosome","start","end","x","y")
emb$start <- emb$start+1 # the ranges are from python and thus 0 based
emb$end <- emb$end+1
GR <- GRanges(emb)
seqinfo(GR)<- seqinfo(Hsapiens)
dat <- data.frame(elementMetadata(GR))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour="black", alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
ggtitle("all")+
NULL
## Warning: Removed 995 rows containing missing values or values outside the scale range
## (`geom_point()`).
#annotate to repeats
repdb <- get(load("/Users/anna/projects/NLP/GROVER/data/repdb_UCSC.Rdata"))
seqlevels(repdb)<- seqlevels(Hsapiens)
seqinfo(repdb)<- seqinfo(Hsapiens)
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 14 out-of-bound ranges located on sequences
## chrUn_gl000211, chrUn_gl000217, chrUn_gl000220, chrUn_gl000227,
## chrUn_gl000230, chrUn_gl000232, chrUn_gl000233, chrUn_gl000235,
## chrUn_gl000236, chrUn_gl000244, chrUn_gl000246, chrUn_gl000247,
## chr1_gl000191_random, and chr19_gl000208_random. Note that ranges
## located on a sequence whose length is unknown (NA) or on a circular
## sequence are not considered out-of-bound (use seqlengths() and
## isCircular() to get the lengths and circularity flags of the underlying
## sequences). You can use trim() to trim these ranges. See
## ?`trim,GenomicRanges-method` for more information.
rep_cols <- c(brewer.pal(n = 8, name = "Dark2"),brewer.pal(n = 8, name = "Set2"))
repcol <- rep_cols[c(1,9,2,10,3,11,4,12,5,13,6,14,7,15,8,16)]
rep_ss <- repdb[repdb$repClass %in% c("LINE", "LINE?") & strand(repdb) == "-"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[1], alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
ggtitle("LINE-")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank()
)+
NULL
## Warning: Removed 19 rows containing missing values or values outside the scale range
## (`geom_point()`).
rep_ss <- repdb[repdb$repClass %in% c("LINE", "LINE?") & strand(repdb) == "+"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[9], alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
ggtitle("LINE+")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank()
)+
NULL
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
rep_ss <- repdb[repdb$repClass %in% c("Low_complexity")]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[2], alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
ggtitle("Low_complexity")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank()
)+
NULL
## Warning: Removed 51 rows containing missing values or values outside the scale range
## (`geom_point()`).
rep_ss <- repdb[repdb$repClass %in% c("LTR","LTR?") & strand(repdb) == "-"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[3], alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
ggtitle("LTR-")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank()
)+
NULL
## Warning: Removed 116 rows containing missing values or values outside the scale range
## (`geom_point()`).
rep_ss <- repdb[repdb$repClass %in% c("LTR","LTR?") & strand(repdb) == "+"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[11], alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
ggtitle("LTR+")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank()
)+
NULL
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
rep_ss <- repdb[repdb$repClass %in% c("other","DNA","DNA?","RC","RNA","scRNA","snRNA","srpRNA","tRNA","Unknown","Unknown?") & strand(repdb) == "-"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[4], alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
ggtitle("other-")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank()
)+
NULL
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
rep_ss <- repdb[repdb$repClass %in% c("other","DNA","DNA?","RC","RNA","scRNA","snRNA","srpRNA","tRNA","Unknown","Unknown?") & strand(repdb) == "+"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[12], alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
ggtitle("other+")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank()
)+
NULL
rep_ss <- repdb[repdb$repClass %in% c("rRNA") & strand(repdb) == "-"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[5], alpha=1, size=0.1)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
ggtitle("rRNA-")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank()
)+
NULL
rep_ss <- repdb[repdb$repClass %in% c("rRNA") & strand(repdb) == "+"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[13], alpha=1, size=0.1)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
ggtitle("rRNA+")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank()
)+
NULL
rep_ss <- repdb[repdb$repClass %in% c("Satellite") & strand(repdb) == "-"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[6], alpha=1, size=0.1)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
ggtitle("Satellite-")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank()
)+
NULL
## Warning: Removed 285 rows containing missing values or values outside the scale range
## (`geom_point()`).
rep_ss <- repdb[repdb$repClass %in% c("Satellite") & strand(repdb) == "+"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[14], alpha=1, size=0.1)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
ggtitle("Satellite+")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank()
)+
NULL
## Warning: Removed 525 rows containing missing values or values outside the scale range
## (`geom_point()`).
rep_ss <- repdb[repdb$repClass %in% c("Simple_repeat") ]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[15], alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
ggtitle("Simple")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank()
)+
NULL
## Warning: Removed 150 rows containing missing values or values outside the scale range
## (`geom_point()`).
rep_ss <- repdb[repdb$repClass %in% c("SINE", "SINE?") & strand(repdb) == "-"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[8], alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
ggtitle("SINE-")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank()
)+
NULL
## Warning: Removed 32 rows containing missing values or values outside the scale range
## (`geom_point()`).
rep_ss <- repdb[repdb$repClass %in% c("SINE","SINE?") & strand(repdb) == "+"]
dat <- GR[GR %over% rep_ss]
dat <- data.frame(elementMetadata(dat))
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[16], alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
ggtitle("SINE+")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank()
)+
NULL
## Warning: Removed 40 rows containing missing values or values outside the scale range
## (`geom_point()`).
Highlight knowledge of strand specificity
rep_m <- repdb[repdb$repClass %in% c("SINE", "SINE?") & strand(repdb) == "-"]
dat_m <- GR[GR %over% rep_m]
rep_p <- repdb[repdb$repClass %in% c("SINE","SINE?") & strand(repdb) == "+"]
dat_p <- GR[GR %over% rep_p]
dat_m2 <- dat_m[!dat_m %over% dat_p]
dat_p2 <- dat_p[!dat_p %over% dat_m]
dat1 <- data.frame(elementMetadata(dat_m2))
dat1$ori <- "m"
ggplot(dat, aes(x=x, y=y))+
geom_point(colour=rep_cols[8], alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
ggtitle("SINE-")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank()
)+
NULL
## Warning: Removed 40 rows containing missing values or values outside the scale range
## (`geom_point()`).
dat2 <- data.frame(elementMetadata(dat_p2))
dat2$ori <- "p"
dat <- rbind (dat1,dat2)
ggplot(dat, aes(x=x, y=y, colour=ori))+
geom_point(alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
scale_colour_manual(values=c("green","magenta"))+
ggtitle("SINE orientation")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank(),
legend.position = "none"
)+
NULL
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
Line orientation
rep_m <- repdb[repdb$repClass %in% c("LINE", "LINE?") & strand(repdb) == "-"]
dat_m <- GR[GR %over% rep_m]
rep_p <- repdb[repdb$repClass %in% c("LINE","LINE?") & strand(repdb) == "+"]
dat_p <- GR[GR %over% rep_p]
dat_m2 <- dat_m[!dat_m %over% dat_p]
dat_p2 <- dat_p[!dat_p %over% dat_m]
dat1 <- data.frame(elementMetadata(dat_m2))
dat1$ori <- "m"
dat2 <- data.frame(elementMetadata(dat_p2))
dat2$ori <- "p"
dat <- rbind (dat1,dat2)
ggplot(dat, aes(x=x, y=y, colour=ori))+
geom_point(alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
scale_colour_manual(values=c("green","magenta"))+
ggtitle("LINE orientation")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank(),
legend.position = "none"
)+
NULL
## Warning: Removed 47 rows containing missing values or values outside the scale range
## (`geom_point()`).
Full length and new Line orientation
rep_m <- repdb[repdb$repFamily %in% c("L1") & strand(repdb) == "-" & width(repdb) > 5700 & width(repdb) < 6300]
dat_m <- GR[GR %over% rep_m]
rep_p <- repdb[repdb$repFamily %in% c("L1") & strand(repdb) == "+" & width(repdb) > 5700 & width(repdb) < 6300]
dat_p <- GR[GR %over% rep_p]
dat_m2 <- dat_m[!dat_m %over% dat_p]
dat_p2 <- dat_p[!dat_p %over% dat_m]
dat1 <- data.frame(elementMetadata(dat_m2))
dat1$ori <- "m"
dat2 <- data.frame(elementMetadata(dat_p2))
dat2$ori <- "p"
dat <- rbind (dat1,dat2)
ggplot(dat, aes(x=x, y=y, colour=ori))+
geom_point(alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
scale_colour_manual(values=c("green","magenta"))+
ggtitle("LINE orientation")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank(),
legend.position = "none"
)+
NULL
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).
Full length and new Alu orientation
rep_m <- repdb[repdb$repFamily %in% c("Alu") & strand(repdb) == "-" & width(repdb) > 270 & width(repdb) < 330]
dat_m <- GR[GR %over% rep_m]
rep_p <- repdb[repdb$repFamily %in% c("Alu") & strand(repdb) == "+" & width(repdb) > 270 & width(repdb) < 330]
dat_p <- GR[GR %over% rep_p]
dat_m2 <- dat_m[!dat_m %over% dat_p]
dat_p2 <- dat_p[!dat_p %over% dat_m]
dat1 <- data.frame(elementMetadata(dat_m2))
dat1$ori <- "m"
dat2 <- data.frame(elementMetadata(dat_p2))
dat2$ori <- "p"
dat <- rbind (dat1,dat2)
ggplot(dat, aes(x=x, y=y, colour=ori))+
geom_point(alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
scale_colour_manual(values=c("green","magenta"))+
ggtitle("SINE orientation")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank(),
legend.position = "none"
)+
NULL
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
LTR orientation
rep_m <- repdb[repdb$repClass %in% c("LTR","LTR?") & strand(repdb) == "-"]
dat_m <- GR[GR %over% rep_m]
rep_p <- repdb[repdb$repClass %in% c("LTR","LTR?") & strand(repdb) == "+"]
dat_p <- GR[GR %over% rep_p]
dat_m2 <- dat_m[!dat_m %over% dat_p]
dat_p2 <- dat_p[!dat_p %over% dat_m]
dat1 <- data.frame(elementMetadata(dat_m2))
dat1$ori <- "m"
dat2 <- data.frame(elementMetadata(dat_p2))
dat2$ori <- "p"
dat <- rbind (dat1,dat2)
ggplot(dat, aes(x=x, y=y, colour=ori))+
geom_point(alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
scale_colour_manual(values=c("green","magenta"))+
ggtitle("LTR orientation")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank(),
legend.position = "none"
)+
NULL
## Warning: Removed 113 rows containing missing values or values outside the scale range
## (`geom_point()`).
LTR > 1000 orientation
rep_m <- repdb[repdb$repClass %in% c("LTR") & strand(repdb) == "-" & width(repdb) > 1000]
dat_m <- GR[GR %over% rep_m]
rep_p <- repdb[repdb$repClass %in% c("LTR") & strand(repdb) == "+" & width(repdb) > 1000]
dat_p <- GR[GR %over% rep_p]
dat_m2 <- dat_m[!dat_m %over% dat_p]
dat_p2 <- dat_p[!dat_p %over% dat_m]
dat1 <- data.frame(elementMetadata(dat_m2))
dat1$ori <- "m"
dat2 <- data.frame(elementMetadata(dat_p2))
dat2$ori <- "p"
dat <- rbind (dat1,dat2)
ggplot(dat, aes(x=x, y=y, colour=ori))+
geom_point(alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
scale_colour_manual(values=c("green","magenta"))+
ggtitle("LTR orientation >1000")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank(),
legend.position = "none"
)+
NULL
## Warning: Removed 106 rows containing missing values or values outside the scale range
## (`geom_point()`).
HMM <- read.table("data/GM12878_ChromHMM.bed")
HMM <- HMM[,c(2,3,4,5,10)]
colnames(HMM) <- c("seqnames","start","end","ID","colour")
HMM <- GRanges(HMM)
HHMdf <- get(load("../GROVER/objects/230413_HMM_annotation.rdat"))
HHMcol <- c("#FF0000","#FF6969","#CF0BC6", "#FACA00", "#FACA01", "#FFFC04", "#FFFC05", "#0ABEFE", "#00B050", "#00B051", "#99FF66","#7F7F7F", "#F5F5F5", "#F5F5F6", "#F5F5F7")
HHMdf <- data.frame(HHMdf)
HHMcol[13:15] <- "black"
for(i in c(1:15)){
GR2 <- HMM[HMM$ID == levels(HHMdf$ID)[i]]
dat <- GR[GR %over% GR2]
dat <- data.frame(elementMetadata(dat))
p1 <- ggplot(dat, aes(x=x, y=y))+
geom_point(colour=HHMcol[i], alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
ggtitle(levels(HHMdf$ID)[i])+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank()
)+
NULL
}
for(i in c(1:15)){
GR2 <- HMM[HMM$ID == levels(HHMdf$ID)[i]]
dat <- GR[GR %over% GR2]
dat <- data.frame(elementMetadata(dat))
p1 <- ggplot(dat, aes(x=x, y=y))+
geom_point(colour=HHMcol[i], alpha=1, size=0.1)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
ggtitle(levels(HHMdf$ID)[i])+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank()
)+
NULL
}
replication timing
repltime <- read.table("data/GSE131417_K562_reptime_segmentation_recolored.bed")
colnames(repltime)[1:4] <- c("seqnames","start","end","timing")
repltime <- GRanges(repltime)
seqlevels(repltime, pruning.mode="coarse") <- paste0("chr",c(1:22,"X","Y"))
seqinfo(repltime)<- seqinfo(Hsapiens)
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 24 out-of-bound ranges located on sequences
## chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10, chr11,
## chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21,
## chr22, chrX, and chrY. Note that ranges located on a sequence whose
## length is unknown (NA) or on a circular sequence are not considered
## out-of-bound (use seqlengths() and isCircular() to get the lengths and
## circularity flags of the underlying sequences). You can use trim() to
## trim these ranges. See ?`trim,GenomicRanges-method` for more
## information.
early <- repltime[repltime$timing == "early"]
mid_early <- repltime[repltime$timing == "mid-early"]
mid_late <- repltime[repltime$timing == "mid-late"]
late <- repltime[repltime$timing == "late"]
dat_m <- GR[GR %over% early]
dat_p <- GR[GR %over% late]
dat_m2 <- dat_m[!dat_m %over% dat_p]
dat_p2 <- dat_p[!dat_p %over% dat_m]
dat1 <- data.frame(elementMetadata(dat_m2))
dat1$ori <- "m"
dat2 <- data.frame(elementMetadata(dat_p2))
dat2$ori <- "p"
dat <- rbind (dat1,dat2)
ggplot(dat, aes(x=x, y=y, colour=ori))+
geom_point(alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
scale_colour_manual(values=c("yellow","blue"))+
ggtitle("replication timing")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank(),
legend.position = "none"
)+
NULL
## Warning: Removed 403 rows containing missing values or values outside the scale range
## (`geom_point()`).
replication strand
replstrand <- rtracklayer::import("data/OK-seq_K562_BR1.bam.rfd.bw")
seqlevels(replstrand, pruning.mode="coarse") <- paste0("chr",c(1:22,"X","Y"))
seqinfo(replstrand)<- seqinfo(Hsapiens)
plusrepl <- reduce(replstrand[replstrand$score>0])
minusrepl <- reduce(replstrand[replstrand$score<0])
neut<- reduce(replstrand[replstrand$score==0])
dat_m <- GR[GR %over% minusrepl]
dat_p <- GR[GR %over% plusrepl]
dat_m2 <- dat_m[!dat_m %over% dat_p]
dat_p2 <- dat_p[!dat_p %over% dat_m]
dat1 <- data.frame(elementMetadata(dat_m2))
dat1$ori <- "m"
dat2 <- data.frame(elementMetadata(dat_p2))
dat2$ori <- "p"
dat <- rbind (dat1,dat2)
ggplot(dat, aes(x=x, y=y, colour=ori))+
geom_point(alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
scale_colour_manual(values=c("green","magenta"))+
ggtitle("replication strand")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank(),
legend.position = "none"
)+
NULL
## Warning: Removed 463 rows containing missing values or values outside the scale range
## (`geom_point()`).
transcription strand
genes <- genes(txdb)
## 403 genes were dropped because they have exons located on both strands
## of the same reference sequence or on more than one reference sequence,
## so cannot be represented by a single genomic range.
## Use 'single.strand.genes.only=FALSE' to get all the genes in a
## GRangesList object, or use suppressMessages() to suppress this message.
genes_p <- genes[strand(genes) == "+"]
genes_m <- genes[strand(genes) == "-"]
strand(genes_m) <- "+"
dat_m <- GR[GR %over% genes_m]
dat_p <- GR[GR %over% genes_p]
dat_m2 <- dat_m[!dat_m %over% dat_p]
dat_p2 <- dat_p[!dat_p %over% dat_m]
dat1 <- data.frame(elementMetadata(dat_m2))
dat1$ori <- "m"
dat2 <- data.frame(elementMetadata(dat_p2))
dat2$ori <- "p"
dat <- rbind (dat1,dat2)
ggplot(dat, aes(x=x, y=y, colour=ori))+
geom_point(alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
scale_colour_manual(values=c("green","magenta"))+
ggtitle("replication strand")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank(),
legend.position = "none"
)+
NULL
## Warning: Removed 41 rows containing missing values or values outside the scale range
## (`geom_point()`).
Coding sequence strand
cds <- cds(txdb)
cds_p <- cds[strand(cds) == "+"]
cds_m <- cds[strand(cds) == "-"]
strand(cds_m) <- "+"
dat_m <- GR[GR %over% cds_m]
dat_p <- GR[GR %over% cds_p]
dat_m2 <- dat_m[!dat_m %over% dat_p]
dat_p2 <- dat_p[!dat_p %over% dat_m]
dat1 <- data.frame(elementMetadata(dat_m2))
dat1$ori <- "m"
dat2 <- data.frame(elementMetadata(dat_p2))
dat2$ori <- "p"
dat <- rbind (dat1,dat2)
ggplot(dat, aes(x=x, y=y, colour=ori))+
geom_point(alpha=0.05, size=0.01)+
scale_x_continuous(limits=c(-8,6.5))+
scale_y_continuous(limits=c(-15,8))+
scale_colour_manual(values=c("green","magenta"))+
ggtitle("coding sequence strand ")+
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank(),
legend.position = "none"
)+
NULL
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
Figures 6A,D,G are directly genome browser tracks from the IGV browser for illustration.
dat <- read.table("data/Prom300.tsv", header=TRUE)
dat$model <- factor(dat$model, levels=dat$model[c(10,9,2:4,5:8,11,1,12:18)])
dat_hs <- dat[c(1:4,9:11),]
dat_nc <- dat[c(12:17),]
melted <- melt(dat_hs[,c(2,6)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("DNA language models")+
scale_fill_manual(values=cols1)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
melted <- melt(dat_nc[,c(2,6)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p3 <- ggplot(melted,aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("context free")+
scale_fill_manual(values=colsIDF)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1+p3+plot_layout(guides = 'collect')
# shuffling
dat_hs <- dat[c(1:4,9:11),]
dat_hs$model <- factor(c("GROVER","b4mer","b5mer","b6mer","k4mer","k5mer","k6mer"), levels=c("k4mer","k5mer","k6mer","b4mer","b5mer","b6mer","GROVER"))
melted <- melt(dat_hs[,c(2,6)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("DNA language models")+
scale_fill_manual(values=shuffle_cols)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1
# shuffling with MCC
dat_hs <- dat[c(1:4,9:11),]
dat_hs$model <- factor(c("GROVER","b4mer","b5mer","b6mer","k4mer","k5mer","k6mer"), levels=c("k4mer","k5mer","k6mer","b4mer","b5mer","b6mer","GROVER"))
melted <- melt(dat_hs[,c(2,6)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("DNA language models")+
scale_fill_manual(values=shuffle_cols)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1
dat_hs <- dat[c(1:4,9:11),]
dat_nc <- dat[c(12:17),]
melted <- melt(dat_hs[,c(2,3)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("DNA language models")+
scale_fill_manual(values=cols2)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
melted <- melt(dat_nc[,c(2,3)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("context free")+
scale_fill_manual(values=colsIDF)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1+p3+plot_layout(guides = 'collect')
ggsave("Prom300_precision.pdf",width=10,height=5)
# shuffling
dat_hs <- dat[c(1:4,9:11),]
dat_hs$model <- factor(c("GROVER","b4mer","b5mer","b6mer","k4mer","k5mer","k6mer"), levels=c("k4mer","k5mer","k6mer","b4mer","b5mer","b6mer","GROVER"))
melted <- melt(dat_hs[,c(2,3)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("DNA language models")+
scale_fill_manual(values=shuffle_cols)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1
ggsave("Prom300_precision_shuffling.pdf",width=5,height=5)
dat_hs <- dat[c(1:4,9:11),]
dat_nc <- dat[c(12:17),]
melted <- melt(dat_hs[,c(2,4)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("DNA language models")+
scale_fill_manual(values=cols2)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
melted <- melt(dat_nc[,c(2,4)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("context free")+
scale_fill_manual(values=colsIDF)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1+p3+plot_layout(guides = 'collect')
ggsave("Prom300_recall.pdf",width=10,height=5)
# shuffling
dat_hs <- dat[c(1:4,19:21),]
dat_hs$model <- factor(c("GROVER","b4mer","b5mer","b6mer","k4mer","k5mer","k6mer"), levels=c("k4mer","k5mer","k6mer","b4mer","b5mer","b6mer","GROVER"))
melted <- melt(dat_hs[,c(2,5)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("DNA language models")+
theme_cowplot()+
scale_fill_manual(values=shuffle_cols)+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1
ggsave("Prom300_recall_shuffling.pdf",width=5,height=5)
dat_hs <- dat[c(1:4,9:11),]
dat_nc <- dat[c(12:17),]
melted <- melt(dat_hs[,c(2,5)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("DNA language models")+
scale_fill_manual(values=cols2)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
melted <- melt(dat_nc[,c(2,5)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("context free")+
scale_fill_manual(values=colsIDF)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1+p3+plot_layout(guides = 'collect')
ggsave("Prom300_f1.pdf",width=10,height=5)
# shuffling
dat_hs <- dat[c(1:4,19:21),]
dat_hs$model <- factor(c("GROVER","b4mer","b5mer","b6mer","k4mer","k5mer","k6mer"), levels=c("k4mer","k5mer","k6mer","b4mer","b5mer","b6mer","GROVER"))
melted <- melt(dat_hs[,c(2,5)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("DNA language models")+
scale_fill_manual(values=shuffle_cols)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1
ggsave("Prom300_f1_shuffling.pdf",width=5,height=5)
dat <- read.table("data/PromScan.tsv", header=TRUE)
dat$model <- factor(dat$model, levels=dat$model[c(5,2:4,1,7:12)])
dat_hs <- dat[c(1:5),]
dat_nc <- dat[c(7:12),]
melted <- melt(dat_hs[,c(2,6)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("DNA language models")+
scale_fill_manual(values=cols1a)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
melted <- melt(dat_nc[,c(2,6)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("context free")+
scale_fill_manual(values=colsIDF)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1+p3+plot_layout(guides = 'collect')
melted <- melt(dat_hs[,c(2,3)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("DNA language models")+
scale_fill_manual(values=cols1a)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
melted <- melt(dat_nc[,c(2,3)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("context free")+
scale_fill_manual(values=colsIDF)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1+p3+plot_layout(guides = 'collect')
melted <- melt(dat_hs[,c(2,4)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("DNA language models")+
scale_fill_manual(values=cols1a)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
melted <- melt(dat_nc[,c(2,4)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("context free")+
scale_fill_manual(values=colsIDF)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1+p3+plot_layout(guides = 'collect')
melted <- melt(dat_hs[,c(2,5)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("DNA language models")+
scale_fill_manual(values=cols1a)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
melted <- melt(dat_nc[,c(2,5)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("context free")+
scale_fill_manual(values=colsIDF)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1+p3+plot_layout(guides = 'collect')
dat <- read.table("data/CTCF.tsv", header=TRUE)
dat$model <- factor(dat$model, levels=dat$model[c(6,5,2:4,7,1,8:13)])
dat_hs <- dat[c(1:7),]
dat_nc <- dat[c(8:13),]
melted <- melt(dat_hs[,c(2,6)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("DNA language models")+
scale_fill_manual(values=cols1)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
melted <- melt(dat_nc[,c(2,6)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("context free")+
scale_fill_manual(values=colsIDF)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1+p3+plot_layout(guides = 'collect')
melted <- melt(dat_hs[,c(2,3)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("DNA language models")+
scale_fill_manual(values=cols1)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
melted <- melt(dat_nc[,c(2,3)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("context free")+
scale_fill_manual(values=colsIDF)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1+p3+plot_layout(guides = 'collect')
melted <- melt(dat_hs[,c(2,4)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("DNA language models")+
scale_fill_manual(values=cols1)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
melted <- melt(dat_nc[,c(2,4)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("context free")+
scale_fill_manual(values=colsIDF)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1+p3+plot_layout(guides = 'collect')
melted <- melt(dat_hs[,c(2,5)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("DNA language models")+
scale_fill_manual(values=cols1)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
melted <- melt(dat_nc[,c(2,5)])
## Using model as id variables
colnames(melted) <- c("Model","task","value")
p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("context free")+
scale_fill_manual(values=colsIDF)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1+p3+plot_layout(guides = 'collect')
dat <- read.csv("data/InstaDeep/matthews_correlation.csv")
dat$Model <- factor(dat$Model, levels=dat$Model)
dat_hs <- dat[c(2,4,5,9),]
dat_ms <- dat[c(1,3,6,7,8),]
dat_nc <- dat[c(10:15),]
melted <- melt(dat_hs)
## Using Model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("human")+
scale_fill_manual(values=human_cols2)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
melted <- melt(dat_ms)
## Using Model as id variables
colnames(melted) <- c("Model","task","value")
p2<- ggplot(melted, aes(x=task, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("multi species")+
scale_fill_manual(values=human_mscols2)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
melted <- melt(dat_nc)
## Using Model as id variables
colnames(melted) <- c("Model","task","value")
p3 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("context free")+
scale_fill_manual(values=colsIDF)+
theme_cowplot()+
theme(axis.text.x=element_text(angle = 90,hjust = 1))+
NULL
p1+p2+p3+plot_layout(guides = 'collect')
dat <- read.csv("data/GUE/matthews_correlation.csv")
dat$Model <- factor(dat$Model, levels=dat$Model)
dat_hs <- dat[c(1,2,7),]
dat_ms <- dat[c(3:6),]
dat_nc <- dat[c(8:13),]
dat_tfs <- dat_hs[,1:6]
dat_tfs_ms <- dat_ms[,1:6]
dat_tfs_nc <- dat_nc[,1:6]
dat_cp <- dat_hs[,c(1,7:9)]
dat_cp_ms <- dat_ms[,c(1,7:9)]
dat_cp_nc <- dat_nc[,c(1,7:9)]
dat_p300 <- dat_hs[,c(1,10:12)]
dat_p300_ms <- dat_ms[,c(1,10:12)]
dat_p300_nc <- dat_nc[,c(1,10:12)]
dat_sp <- dat_hs[,c(1,13)]
dat_sp_ms <- dat_ms[,c(1,13)]
dat_sp_nc <- dat_nc[,c(1,13)]
## TFs
melted <- melt(dat_tfs)
## Using Model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,80))+
scale_fill_manual(values=human_ms_cols[1:3])+
ggtitle("human")+
theme_cowplot()+
NULL
melted <- melt(dat_tfs_ms)
## Using Model as id variables
colnames(melted) <- c("Model","task","value")
p2<- ggplot(melted, aes(x=task, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,80))+
ggtitle("multi species")+
scale_fill_manual(values=human_ms_cols[4:7])+
theme_cowplot()+
NULL
melted <- melt(dat_tfs_nc)
## Using Model as id variables
colnames(melted) <- c("Model","task","value")
p3 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,80))+
scale_fill_manual(values=colsIDF)+
ggtitle("context free")+
theme_cowplot()+
NULL
p1+p2+p3+plot_layout(guides = 'collect')
## Core promoter
melted <- melt(dat_cp)
## Using Model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,80))+
ggtitle("human")+
scale_fill_manual(values=human_ms_cols[1:3])+
theme_cowplot()+
NULL
melted <- melt(dat_cp_ms)
## Using Model as id variables
colnames(melted) <- c("Model","task","value")
p2<- ggplot(melted, aes(x=task, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,80))+
ggtitle("multi species")+
scale_fill_manual(values=human_ms_cols[4:7])+
theme_cowplot()+
NULL
melted <- melt(dat_cp_nc)
## Using Model as id variables
colnames(melted) <- c("Model","task","value")
p3 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,80))+
ggtitle("context free")+
scale_fill_manual(values=colsIDF)+
theme_cowplot()+
NULL
p1+p2+p3+plot_layout(guides = 'collect')
## p300
melted <- melt(dat_p300)
## Using Model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("human")+
scale_fill_manual(values=human_ms_cols[1:3])+
theme_cowplot()+
NULL
melted <- melt(dat_p300_ms)
## Using Model as id variables
colnames(melted) <- c("Model","task","value")
p2<- ggplot(melted, aes(x=task, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("multi species")+
scale_fill_manual(values=human_ms_cols[4:7])+
theme_cowplot()+
NULL
melted <- melt(dat_p300_nc)
## Using Model as id variables
colnames(melted) <- c("Model","task","value")
p3 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("context free")+
scale_fill_manual(values=colsIDF)+
theme_cowplot()+
NULL
p1+p2+p3+plot_layout(guides = 'collect')
## sp
melted <- melt(dat_sp)
## Using Model as id variables
colnames(melted) <- c("Model","task","value")
p1 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("human")+
scale_fill_manual(values=human_ms_cols[1:3])+
theme_cowplot()+
NULL
melted <- melt(dat_sp_ms)
## Using Model as id variables
colnames(melted) <- c("Model","task","value")
p2<- ggplot(melted, aes(x=task, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("multi species")+
scale_fill_manual(values=human_ms_cols[4:7])+
theme_cowplot()+
NULL
melted <- melt(dat_sp_nc)
## Using Model as id variables
colnames(melted) <- c("Model","task","value")
p3 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
geom_bar(stat="identity", position="dodge")+
scale_y_continuous(limits=c(0,100))+
ggtitle("context free")+
scale_fill_manual(values=colsIDF)+
theme_cowplot()+
NULL
p1+p2+p3+plot_layout(guides = 'collect')
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Berlin
## tzcode source: internal
##
## attached base packages:
## [1] stats4 grid stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] clusterProfiler_4.8.3
## [2] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [3] GenomicFeatures_1.52.2
## [4] AnnotationDbi_1.62.2
## [5] Biobase_2.60.0
## [6] BSgenome.Hsapiens.UCSC.hg19_1.4.3
## [7] BSgenome_1.68.0
## [8] rtracklayer_1.60.1
## [9] Biostrings_2.68.1
## [10] XVector_0.40.0
## [11] GenomicRanges_1.52.1
## [12] GenomeInfoDb_1.36.4
## [13] IRanges_2.34.1
## [14] S4Vectors_0.38.2
## [15] BiocGenerics_0.46.0
## [16] patchwork_1.2.0
## [17] wordcloud2_0.2.1
## [18] lsa_0.73.3
## [19] SnowballC_0.7.1
## [20] umap_0.2.10.0
## [21] ggrepel_0.9.5
## [22] ggfortify_0.4.17
## [23] migest_2.0.4
## [24] viridis_0.6.5
## [25] viridisLite_0.4.2
## [26] pheatmap_1.0.12
## [27] gridExtra_2.3
## [28] cowplot_1.1.3
## [29] reshape2_1.4.4
## [30] RColorBrewer_1.1-3
## [31] colorspace_2.1-0
## [32] pals_1.8
## [33] lubridate_1.9.3
## [34] forcats_1.0.0
## [35] stringr_1.5.1
## [36] dplyr_1.1.4
## [37] purrr_1.0.2
## [38] readr_2.1.5
## [39] tidyr_1.3.1
## [40] tibble_3.2.1
## [41] tidyverse_2.0.0
## [42] ggpubr_0.6.0
## [43] ggplot2_3.5.1
##
## loaded via a namespace (and not attached):
## [1] splines_4.3.1 BiocIO_1.10.0
## [3] ggplotify_0.1.2 bitops_1.0-7
## [5] filelock_1.0.3 polyclip_1.10-6
## [7] XML_3.99-0.16.1 lifecycle_1.0.4
## [9] rstatix_0.7.2 vroom_1.6.5
## [11] MASS_7.3-60.0.1 lattice_0.22-6
## [13] backports_1.4.1 magrittr_2.0.3
## [15] sass_0.4.9 rmarkdown_2.27
## [17] jquerylib_0.1.4 yaml_2.3.8
## [19] askpass_1.2.0 reticulate_1.36.1
## [21] mapproj_1.2.11 DBI_1.2.2
## [23] maps_3.4.2 abind_1.4-5
## [25] zlibbioc_1.46.0 ggraph_2.2.1
## [27] RCurl_1.98-1.14 yulab.utils_0.1.4
## [29] tweenr_2.0.3 rappdirs_0.3.3
## [31] GenomeInfoDbData_1.2.10 enrichplot_1.20.3
## [33] tidytree_0.4.6 RSpectra_0.16-1
## [35] codetools_0.2-20 DelayedArray_0.26.7
## [37] ggforce_0.4.2 DOSE_3.26.2
## [39] xml2_1.3.6 tidyselect_1.2.1
## [41] aplot_0.2.2 farver_2.1.2
## [43] matrixStats_1.3.0 BiocFileCache_2.8.0
## [45] GenomicAlignments_1.36.0 jsonlite_1.8.8
## [47] tidygraph_1.3.1 systemfonts_1.1.0
## [49] tools_4.3.1 progress_1.2.3
## [51] ragg_1.3.2 treeio_1.24.3
## [53] Rcpp_1.0.12 glue_1.7.0
## [55] mgcv_1.9-1 xfun_0.44
## [57] qvalue_2.32.0 MatrixGenerics_1.12.3
## [59] withr_3.0.0 fastmap_1.2.0
## [61] fansi_1.0.6 openssl_2.2.0
## [63] digest_0.6.35 gridGraphics_0.5-1
## [65] timechange_0.3.0 R6_2.5.1
## [67] textshaping_0.3.7 GO.db_3.17.0
## [69] dichromat_2.0-0.1 biomaRt_2.56.1
## [71] RSQLite_2.3.6 utf8_1.2.4
## [73] generics_0.1.3 data.table_1.15.4
## [75] graphlayouts_1.1.1 prettyunits_1.2.0
## [77] httr_1.4.7 htmlwidgets_1.6.4
## [79] S4Arrays_1.0.6 scatterpie_0.2.2
## [81] pkgconfig_2.0.3 gtable_0.3.5
## [83] blob_1.2.4 shadowtext_0.1.3
## [85] htmltools_0.5.8.1 carData_3.0-5
## [87] fgsea_1.26.0 scales_1.3.0
## [89] png_0.1-8 ggfun_0.1.4
## [91] knitr_1.45 rstudioapi_0.16.0
## [93] tzdb_0.4.0 rjson_0.2.21
## [95] nlme_3.1-164 curl_5.2.1
## [97] cachem_1.1.0 parallel_4.3.1
## [99] HDO.db_0.99.1 restfulr_0.0.15
## [101] pillar_1.9.0 vctrs_0.6.5
## [103] car_3.1-2 dbplyr_2.5.0
## [105] evaluate_0.23 cli_3.6.2
## [107] compiler_4.3.1 Rsamtools_2.16.0
## [109] rlang_1.1.3 crayon_1.5.2
## [111] ggsignif_0.6.4 labeling_0.4.3
## [113] fs_1.6.4 plyr_1.8.9
## [115] stringi_1.8.4 BiocParallel_1.34.2
## [117] munsell_0.5.1 lazyeval_0.2.2
## [119] GOSemSim_2.26.1 Matrix_1.6-5
## [121] hms_1.1.3 bit64_4.0.5
## [123] KEGGREST_1.40.1 highr_0.10
## [125] SummarizedExperiment_1.30.2 igraph_2.0.3
## [127] broom_1.0.6 memoise_2.0.1
## [129] bslib_0.7.0 ggtree_3.8.2
## [131] fastmatch_1.1-4 bit_4.0.5
## [133] downloader_0.4 gson_0.1.0
## [135] ape_5.8