The human genome’s vocabulary as proposed by the DNA language model GROVER

For issues and questions, please get in touch via arpoetsch@gmail.com
The paper that is associated with this script can be found at: Sanabria, M., Hirsch, J., Joubert, P.M. et al. DNA language model GROVER learns sequence context in the human genome. Nat Mach Intell (2024). https://doi.org/10.1038/s42256-024-00872-0 A tutorial on how to use GROVER as a foundation model can be found at: https://doi.org/10.5281/zenodo.8373158. The pretrained model can be found at: https://doi.org/10.5281/zenodo.8373116 and https://huggingface.co/PoetschLab/GROVER
The data for the tokenised genome are at: https://doi.org/10.5281/zenodo.8373052

Introduction

The models and performance measurements were done in python and the code can be found in the corresponding document. For visualisation, data were imported into R. We provide the outputs that we used for visualisation directly to make things easier for reproduction. Some figures were recoloured and/or combined with Adobe Illustrator, so some figures may look a bit different from the publication.

Figure 1

DNA byte-pair tokenisation and model architecture

Tokenisation was performed in Python. Visualisations are done in R with token clowds. The pickle file is converted into .rdat with

library(reticulate)
source_python(“src/pickle_reader.py”)
dat <- read_pickle_file(“data/all_vocabs_wg.pkl”)

# load tokens
load("data/all_vocabs_wg.rdat")

# make word clouds for cycle 1,2,3,23,38,121,356
c1 <- dat[[1]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)

wc["BPE_cycle"] <- 1
wctot <- wc

c1 <- dat[[2]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)

wc["BPE_cycle"] <- 2

wctot <- rbind(wctot,wc)

c1 <- dat[[3]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)

wc["BPE_cycle"] <- 3

wctot <- rbind(wctot,wc)

c1 <- dat[[4]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)

wc["BPE_cycle"] <- 4

wctot <- rbind(wctot,wc)

c1 <- dat[[24]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)

wc["BPE_cycle"] <- 24

wctot <- rbind(wctot,wc)

c1 <- dat[[39]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)

wc["BPE_cycle"] <- 39

wctot <- rbind(wctot,wc)

c1 <- dat[[122]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)

wc["BPE_cycle"] <- 122

wctot <- rbind(wctot,wc)

c1 <- dat[[357]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)

wc["BPE_cycle"] <- 357

wctot <- rbind(wctot,wc)

write_csv(wctot, file="fig1A.csv")

Figure 2

Performance based selection of the vocabulary identifies 600 cycles of Byte-Pair Tokenization as optimal.

Performance metrices are imported from python.

Figure 2A

perf <- read.csv("data/performance.csv", header = TRUE)
colnames(perf) <- c("2-mers", "3-mers","4-mers","5-mers","6-mers")
rownames(perf) <- c(100*c(1:10),1000*c(2:5)) 

perf$cycle <- rownames(perf)
perf$cycle <- factor(perf$cycle,levels=perf$cycle)
melted <- melt(perf, id="cycle")
colnames(melted)<-c("cycle","kmer","accuracy") 

ggplot(data=melted, aes(x=cycle,y=accuracy,colour=kmer, fill=kmer, group=kmer))+
  geom_point(size=3)+
  geom_smooth()+
  geom_segment(aes(x=10.5,xend=10.5,y=accuracy,yend=accuracy+0.5), linewidth=5, colour="white")+
  facet_grid(facet="kmer",rows=5, scales="free")+
  scale_colour_manual(values=viridis(n=5))+
  scale_fill_manual(values=viridis(n=5))+
  theme(strip.background = element_rect(fill = "white"))+
  theme(axis.text.x = element_text(angle = 90))+
  theme(legend.position="none")+
  NULL

## Warning: The `facets` argument of `facet_grid()` is deprecated as of ggplot2 2.2.0.
## ℹ Please use the `rows` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

write_csv(perf,"fig2A.csv")

Figure 2B & C

dat <- read.csv("data/240221_accuracy.csv")

dat[,2:6] <- apply(dat[,2:6],2,as.numeric)
dat$model <- factor(dat$model, levels=dat$model[c(2,4,5,6,7,1,3,8:13)])

dat_hs <- dat[c(1:13),]

write_csv(dat_hs,"fig2BC.csv")

melted <- melt(dat_hs, id="model")
colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  ggtitle("DNA language models")+
  scale_fill_manual(values=c(cols1,colsIDF))+
  theme_cowplot()+
  facet_wrap(~task, nrow=5, scales="free_y")+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL
p1

Figure S1

dat <- read.csv("data/240221_precision.csv")

dat[,2:6] <- apply(dat[,2:6],2,as.numeric)
dat$model <- factor(dat$model, levels=dat$model[c(2,4,5,6,7,1,3,8:13)])

dat_hs <- dat[c(1:13),]


melted <- melt(dat_hs, id="model")
colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  ggtitle("DNA language models")+
  scale_fill_manual(values=c(cols1,colsIDF))+
  theme_cowplot()+
  facet_wrap(~task, nrow=5, scales="free_y")+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL
p1

dat <- read.csv("data/240221_recall.csv")

dat[,2:6] <- apply(dat[,2:6],2,as.numeric)
dat$model <- factor(dat$model, levels=dat$model[c(2,4,5,6,7,1,3,8:13)])

dat_hs <- dat[c(1:13),]

melted <- melt(dat_hs, id="model")
colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  ggtitle("DNA language models")+
  scale_fill_manual(values=c(cols1,colsIDF))+
  theme_cowplot()+
  facet_wrap(~task, nrow=5, scales="free_y")+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL
p1

dat <- read.csv("data/240221_f1.csv")


dat[,2:6] <- apply(dat[,2:6],2,as.numeric)
dat$model <- factor(dat$model, levels=dat$model[c(2,4,5,6,7,1,3,8:13)])

dat_hs <- dat[c(1:13),]


melted <- melt(dat_hs, id="model")
colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  ggtitle("DNA language models")+
  scale_fill_manual(values=c(cols1,colsIDF))+
  theme_cowplot()+
  facet_wrap(~task, nrow=5, scales="free_y")+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL
p1

dat <- read.csv("data/240221_matthews_correlation.csv")
dat[,2:6] <- apply(dat[,2:6],2,as.numeric)
dat$model <- factor(dat$model, levels=dat$model[c(2,4,5,6,7,1,3,8:13)])

dat_hs <- dat[c(1:13),]


melted <- melt(dat_hs, id="model")
colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  ggtitle("DNA language models")+
  scale_fill_manual(values=c(cols1,colsIDF))+
  theme_cowplot()+
  facet_wrap(~task, nrow=5, scales="free_y")+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL
p1

Figure 2D

acc <- read.csv("data/top_k_accuracy.csv")
ggplot(acc[1:60,], aes(x=k,y=accuracy))+
  geom_bar(stat="identity")+
  NULL

Figure 2E

perp <- read.csv("data/perplexity_GROVER_chr21.csv")
perp4 <- read.csv("data/perplexity_4mer_chr21.csv")
perp5 <- read.csv("data/perplexity_5mer_chr21.csv")
perp6 <- read.csv("data/perplexity_6mer_chr21.csv")

perp$performance <- perp$performance /601
perp4$performance <- perp4$performance /(4^4)
perp5$performance <- perp5$performance /(4^5)
perp6$performance <- perp6$performance /(4^6)

datf <- data.frame(model=c("GROVER","4mer","5mer","6mer"),
              perplexity =c(min(perp$performance),min(perp4$performance),min(perp5$performance),min(perp6$performance)))

datf$model <- factor(datf$model, levels=datf$model)

ggplot(datf, aes(x=model,y=perplexity, fill=model))+
  geom_bar(stat="identity")+
  scale_fill_manual(values=modelcols[c(1,2,4,6)])+
  NULL

Data preparation for characterisation of vocabulary

load("data/all_vocabs_wg.rdat")
vocab600 <- dat[["600"]]
vocab600 <- t(data.frame(vocab600))
colnames(vocab600)<- "frequency"
vocab600 <- data.frame(vocab600)
vocab600$token <- rownames(vocab600)

Hseq <- getSeq(Hsapiens)
Hseq <- Hseq[names(Hseq) %in% paste0("chr",c(1:22,"X","Y"))]

kmers4 <- colSums(oligonucleotideFrequency(Hseq, 4))
kmers5 <- colSums(oligonucleotideFrequency(Hseq, 5))
kmers6 <- colSums(oligonucleotideFrequency(Hseq, 6))

kmers4 <- data.frame(kmers4)
kmers4$token <- rownames(kmers4)
colnames(kmers4)<- c("frequency","token")
kmers5 <- data.frame(kmers5)
kmers5$token <- rownames(kmers5)
colnames(kmers5)<- c("frequency","token")
kmers6 <- data.frame(kmers6)
kmers6$token <- rownames(kmers6)
colnames(kmers6)<- c("frequency","token")

Load embeddings

Embeddings for the GROVER vocabulary from the BERT infrastructure and Word2Vec, UMAP and PCA.

#GROVER
Gemb <- get(load("data/vocab_embedding.rdat"))
rownames(Gemb) <- read.table("data/vocab.txt")[,1]

#W2V
GWemb <-  get(load("data/iter600_w2v.rdat"))
rownames(GWemb) <-  read.table("data/iter600_w2v_vocab.txt")[,1]

Gemb <- Gemb[-c(1:7,9),]
Gum <- umap(Gemb)
Gpca <- prcomp(Gemb)
GWum <- umap(GWemb)
GWpca <- prcomp(GWemb)

Collection of data in dataframes

df <- data.frame(Gum$layout)
df$token <- rownames(df)
colnames(df)<- c("G_UMAP1","G_UMAP2","token")

dfGW <- data.frame(GWum$layout)
dfGW$token <- rownames(dfGW)
colnames(dfGW) <- c("G_W2V_UMAP1","G_W2V_UMAP2","token")
df <- merge(df,dfGW,id="token")

df2 <- data.frame(Gpca$x)[,1:6]
df2$token <- rownames(df2)
colnames(df2) <- c("GROVER_PC1","GROVER_PC2","GROVER_PC3","GROVER_PC4","GROVER_PC5","GROVER_PC6","token")
df <- merge(df,df2,id="token")

df2 <- data.frame(GWpca$x)[,1:6]
df2$token <- rownames(df2)
colnames(df2) <- c("GROVER_W2V_PC1","GROVER_W2V_PC2","GROVER_W2V_PC3","GROVER_W2V_PC4","GROVER_W2V_PC5","GROVER_W2V_PC6","token")
df <- merge(df,df2,id="token")
df <- merge(df,vocab600, id="token")
df$length <- factor(nchar(as.character(df$token)))

df$C <- sapply(as.character(df$token),str_count, pattern=c("C"))/nchar(as.character(df$token))
df$G <- sapply(as.character(df$token),str_count, pattern=c("G"))/nchar(as.character(df$token))
df$A <- sapply(as.character(df$token),str_count, pattern=c("A"))/nchar(as.character(df$token))
df$T <- sapply(as.character(df$token),str_count, pattern=c("T"))/nchar(as.character(df$token))
df$GC <- df$C + df$G
df$AG <- df$A + df$G
df$AC <- df$A + df$C

hex=jet(nrow(df)) #colour selection
df$W2V_hex <- hex[rank(df$GROVER_W2V_PC1)]
w2v_tokencol <- df[,c("token","W2V_hex","GROVER_W2V_PC1")]

Figure 3

The frequency balanced GROVER vocabulary shows differential learning performance by token length

Figure 3A

Compare frequency distribution between vocabularies

freqdf <- rbind(data.frame(model="GROVER",frequency=df$frequency),
                data.frame(model= "4mer",frequency=kmers4$frequency/4),
                 data.frame(model= "5mer",frequency=kmers5$frequency/5),
                 data.frame(model= "6mer",frequency=kmers6$frequency/6)
                )

freqdf$model <- factor(freqdf$model,levels=c("GROVER","4mer","5mer","6mer"))

write_csv(freqdf, "fig3a.csv")

ggplot(freqdf, aes(x = frequency, fill = model)) +
  geom_density() +  # Boxplot without outliers
  scale_x_log10() +  # Logarithmic y-axis scale
  labs(x = "frequency", y = "density") +  # Axis labels
  scale_fill_manual(values=modelcols[c(1,2,4,6)])+
  facet_grid(facet="model", rows=4)+
  theme(strip.background = element_rect(fill = "white"))+
  NULL

Figure 3B

dftmp <- aggregate(frequency~length, df[,c("frequency","length")],sum)
dftmp$length <- as.numeric(as.character(dftmp$length))
levels(dftmp$length) <- 1:15
dftmp <- rbind(dftmp,data.frame(length=c(13:16),frequency=0))

## Warning in `[<-.factor`(`*tmp*`, ri, value = structure(c(1, 2, 3, 4, 5, :
## invalid factor level, NA generated

## Warning in `[<-.factor`(`*tmp*`, ri, value = structure(c(1, 2, 3, 4, 5, :
## invalid factor level, NA generated

write_csv(dftmp, "fig3b.csv")

ggplot(dftmp, aes(x=length,y=frequency, fill=length))+
  geom_bar(stat="identity")+
  scale_fill_manual(values=mycolors)+
  scale_x_discrete("token length")+
  scale_y_continuous("frequency")+
  theme_cowplot()+
  NULL

Figure 3C

tab <- table(df$length)
tab["13"] <- 0
tab["14"] <- 0
tab["15"] <- 0

dfvoc <- data.frame(t(tab[c(1:12,14:16,13)]))[,2:3]
colnames(dfvoc) <- c("length","vocabulary")
dfvoc$frequency <- 4^c(1:16)
dfvoc$proportion <- dfvoc$vocabulary/dfvoc$frequency
#dfvoc$length<- factor(as.character(dfvoc$length),levels(dfvoc$length))

write_csv(dfvoc, "fig3cd.csv")

ggplot(dfvoc, aes(y=vocabulary,x=frequency, colour=length))+
  geom_point(size=5)+
  scale_colour_manual(values=mycolors)+
  scale_x_log10("k-mer frequency")+
  scale_y_log10("k-mers represented")+
  theme_cowplot()+
  NULL

## Warning in scale_y_log10("k-mers represented"): log-10 transformation
## introduced infinite values.

Figure 3D

ggplot(dfvoc, aes(x=length,y=proportion, fill=length))+
  geom_bar(stat="identity")+
  scale_fill_manual(values=mycolors)+
  scale_x_discrete("token length")+
  scale_y_continuous("proportion of k-mers represented")+
  theme_cowplot()+
  NULL

Figure 3E

repres_seq <- c(
sum(str_count(df$token, "A")),
sum(str_count(df$token, "C")),
sum(str_count(df$token, "G")),
sum(str_count(df$token, "T")))

genome_freq <- colSums(oligonucleotideFrequency(Hseq, 1))

first_letter_seq <- c(
sum(str_count(substr(df$token,1,1), "A")),
sum(str_count(substr(df$token,1,1), "C")),
sum(str_count(substr(df$token,1,1), "G")),
sum(str_count(substr(df$token,1,1), "T")))

freq_df <- data.frame(genome_frequency=genome_freq/sum(genome_freq),
                      representation = repres_seq/sum( repres_seq),
                      first_letter=first_letter_seq/sum(first_letter_seq))
freq_df$nucl <- factor(c("A","C","G","T"), levels=c("A","C","G","T"))

write_csv(freq_df, "fig3e.csv")

p1 <- ggplot(freq_df, aes(x = "", y = genome_frequency, fill = nucl)) +
  geom_bar(stat = "identity", width = 1) +
  scale_fill_manual(values=nucl_colours)+
  coord_polar("y", start = 0, direction = -1) +
  geom_text(aes(label = paste0(nucl,"\n",round(genome_frequency * 100, 2), "%")),
            position = position_stack(vjust = 0.5))+
  theme_void() +
  labs(fill = "Genome Frequency")+
  NULL

p2 <- ggplot(freq_df, aes(x = "", y = representation, fill = nucl)) +
  geom_bar(stat = "identity", width = 1) +
  scale_fill_manual(values=nucl_colours)+
  coord_polar("y", start = 0, direction = -1) +
  geom_text(aes(label = paste0(nucl,"\n",round(representation * 100, 2), "%")),
            position = position_stack(vjust = 0.5))+
  theme_void() +
  labs(fill = "Representation")+
  NULL

p3 <- ggplot(freq_df, aes(x = "", y = first_letter, fill = nucl)) +
  geom_bar(stat = "identity", width = 1) +
  scale_fill_manual(values=nucl_colours)+
  coord_polar("y", start = 0, direction = -1) +
  geom_text(aes(label = paste0(nucl,"\n",round(first_letter * 100, 2), "%")),
            position = position_stack(vjust = 0.5))+
  theme_void() +
  labs(fill = "First Letter")+
  NULL


p1+p2+p3

Figure 3F

tokmetr <- read.csv("data/metrics_per_token.csv")
tokdf <- cbind(tokmetr[c(8,10:609),2:4])
colnames(tokdf) <- c("token","GROVER_AUC","GROVER_Acc")

df <- merge(df, tokdf, id="token")

write_csv(df, "fig3f.csv")

ggplot(df, aes(y=GROVER_AUC-0.5,x=factor(length), fill=factor(length)))+
         geom_boxplot() +
  scale_fill_manual(values=mycolors)+
  NULL

Figure 3G

ggplot(df, aes(y=GROVER_Acc,x=factor(length), fill=factor(length)))+
         geom_boxplot() +
  scale_fill_manual(values=mycolors)+
  NULL

Figure 4

UMAPs and PCAs of the average embeddings of the vocabulary. Putting the Principal Components in relation to genome biology requires integration with additional data.

Figure 4A

Geigs <- Gpca$sdev^2
Gvarvec <- Geigs/sum(Geigs)
GWeigs <- GWpca$sdev^2
GWvarvec <- GWeigs/sum(GWeigs)

write_csv(data.frame(GWvarvec), file="fig4b2.csv")

MEV <- c(GROVER=Gvarvec[1],GROVER_W2V=GWvarvec[1])
MEV <- data.frame(MEV)
MEV$model=rownames(MEV)
MEV$model <- factor(MEV$model,levels=MEV$model) 

write_csv(MEV, "fig4a.csv")

ggplot(MEV[1:2,],aes(x=model,y=100*MEV))+
    geom_bar(stat="identity", fill="slateblue")+
    scale_y_continuous("Maximum Explained Variance [%]")+
    theme_cowplot()+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
    NULL

Figure 4B

Plotted are both the PCA and also the colour bar for the legend.

ggplot(df, aes(x=GROVER_W2V_PC1, y=GROVER_W2V_PC2, colour=token))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  scale_x_continuous(paste0("Word2Vec PC1 (",round(100*GWvarvec[1],2),"%)"))+
  scale_y_continuous(paste0("Word2Vec PC2 (",round(100*GWvarvec[2],2),"%)"))+
  #geom_label_repel(aes(label = token ))+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

image(matrix(1:601, nrow = 1), col = hex,  axes = FALSE, xlab = "", ylab = "")

Figure 4C

ggplot(df, aes(x=GROVER_W2V_PC1, y=GC, colour=token))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  scale_x_continuous(paste0("Word2Vec PC1 (",round(100*GWvarvec[1],2),"%)"))+
  scale_y_continuous(paste0("GC content"))+
  #geom_label_repel(aes(label = token ))+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

cor.test(df$GROVER_W2V_PC1,df$GC, method="spearman")

## Warning in cor.test.default(df$GROVER_W2V_PC1, df$GC, method = "spearman"):
## Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  df$GROVER_W2V_PC1 and df$GC
## S = 2688292, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.9256972

Figure 4D

ggplot(df, aes(G_W2V_UMAP1, y=G_W2V_UMAP2, colour=token))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

Figure 4E

ggplot(df, aes(x=GROVER_PC1, y=GROVER_PC2, colour=token))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  scale_x_continuous(name=paste("PC1 (",round(100*Gvarvec[1],2),"%)",sep=""))+
  scale_y_continuous(name=paste("PC2 (",round(100*Gvarvec[2],2),"%)",sep=""))+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

Figure 4F

ggplot(df, aes(x=G_UMAP1, y=G_UMAP2, colour=token))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

Annotation of Tokens

For Figure 4G and later figures, the proportion of tokens associated with features of genome biology is calculated.

Gene element annotation

dummydf <- data.frame(Feature= sort(c("Promoter","5' UTR","3' UTR","1st Exon","Other Exon","1st Intron","Other Intron","Downstream (<=300)","Distal Intergenic")))


if(!file.exists("objects/230413_gene_annotation.rdat")){

annotdf <- data.frame(
row.names=dummydf$Feature)

for (i in c(1:length(df$token))){
  tok <- df$token[i]
  cat(tok,"\n")
  bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
  if(length(bed_ss)>50000){
    bed_ss <- bed_ss[sample(length(bed_ss),50000)]
  }
  seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
  seqinfo(bed_ss)<- seqinfo(Hsapiens)
  start(bed_ss) <- start(bed_ss)+1 #readjust start
  bed_ss <- sort(bed_ss)
  names(bed_ss)<- 1:length(bed_ss)
  
  peakAnno <- annotatePeak(bed_ss, tssRegion=c(-1000, 1000),
                         TxDb=txdb, annoDb="org.Hs.eg.db")
  tmp <- merge(dummydf,peakAnno@annoStat,by="Feature",all=TRUE)
  annotdf <- cbind(annotdf,tmp[,2])
}

colnames(annotdf) <- df$token

annotdf$feature <- factor(rownames(annotdf),levels=rev(c("Promoter","5' UTR","3' UTR","1st Exon","Other Exon","1st Intron","Other Intron","Downstream (<=300)","Distal Intergenic")))
annotdf[is.na(annotdf)]<- 0
melted <- melt(annotdf)

save(annotdf,file="objects/230413_gene_annotation.rdat")
}

Annotation to genes with orientation

genes <- genes(txdb)

##   403 genes were dropped because they have exons located on both strands
##   of the same reference sequence or on more than one reference sequence,
##   so cannot be represented by a single genomic range.
##   Use 'single.strand.genes.only=FALSE' to get all the genes in a
##   GRangesList object, or use suppressMessages() to suppress this message.

genes_p <- genes[strand(genes) == "+"]
genes_m <- genes[strand(genes) == "-"]
strand(genes_m) <- "+"

genedf <- data.frame(
row.names=c("gene_p","gene_m","intragenic"))
genedf$feature <- c("gene_p","gene_m","intragenic")

if(!file.exists("objects/230413_gene_orientation_annotation.rdat")){
genedummy <- genedf

for (i in 1:length(df$token)){
  cat(i,"\n")
  tok <- df$token[i]
  cat(tok,"\n")
  bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
  if(length(bed_ss)>50000){
    bed_ss <- bed_ss[sample(length(bed_ss),50000)]
  }
  seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
  seqinfo(bed_ss)<- seqinfo(Hsapiens)
  start(bed_ss) <- start(bed_ss)+1 #readjust start
  bed_ss <- sort(bed_ss)
  bed_ss_rev <- bed_ss
  names(bed_ss)<- 1:length(bed_ss)
  peakAnno_p <- bed_ss[bed_ss %over% genes_p]
  peakAnno_m <- bed_ss[bed_ss %over% genes_m]
  peakAnno_o <- bed_ss[!bed_ss %over% c(peakAnno_p,peakAnno_m)]
  
  genedf <- cbind(genedf,c(length(peakAnno_p),length(peakAnno_m),length(peakAnno_o)))
}
colnames(genedf) <- c("class",df$token)

genedf[,2:602] <- genedf[,2:602]/colSums(genedf[,2:602])

genedf2 <- genedf[,c("class",df$token)]

save(genedf2,file="230413_gene_orientation_annotation.rdat")
}

Annotation to coding sequence with orientation

genes <- cds(txdb)
genes_p <- genes[strand(genes) == "+"]
genes_m <- genes[strand(genes) == "-"]
strand(genes_m) <- "+"

genedf <- data.frame(
row.names=c("cds_p","cds_m","other"))
genedf$feature <- c("cds_p","cds_m","other")

if(!file.exists("objects/230413_cds_orientation_annotation.rdat")){

for (i in 1:length(df$token)){
  cat(i,"\n")
  tok <- df$token[i]
  cat(tok,"\n")
  bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
  if(length(bed_ss)>50000){
    bed_ss <- bed_ss[sample(length(bed_ss),50000)]
  }
  seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
  seqinfo(bed_ss)<- seqinfo(Hsapiens)
  start(bed_ss) <- start(bed_ss)+1 #readjust start
  bed_ss <- sort(bed_ss)
  bed_ss_rev <- bed_ss
  names(bed_ss)<- 1:length(bed_ss)
  peakAnno_p <- bed_ss[bed_ss %over% genes_p]
  peakAnno_m <- bed_ss[bed_ss %over% genes_m]
  peakAnno_o <- bed_ss[!bed_ss %over% c(peakAnno_p,peakAnno_m)]
  
  genedf <- cbind(genedf,c(length(peakAnno_p),length(peakAnno_m),length(peakAnno_o)))
}
colnames(genedf) <- c(df$token)
genedf <- cbind (class=rownames(genedf),genedf)
genedf[,2:602] <- genedf[,2:602]/colSums(genedf[,2:602])

genedf2 <- genedf[,c("class",df$token)]

save(genedf2,file="objects/230413_cds_orientation_annotation.rdat")

}

Annotation to Chromatin Colours

HMM <- read.table("data/GM12878_ChromHMM.bed")
HMM <- HMM[,c(2,3,4,5,10)]
colnames(HMM) <- c("seqnames","start","end","ID","colour")
HMM <- GRanges(HMM)

HMMdf <- data.frame(
row.names=names(table(HMM$ID)))

HMMdf$ID <- names(table(HMM$ID))
HMMdf <- unique(merge(HMMdf,elementMetadata(HMM)[,c("ID","colour")], id="ID", all.x=TRUE, all.y=FALSE))
HMMdf$ID <- factor(HMMdf$ID, levels=sort(HMMdf$ID)[c(1,8:15,2:7)])
HMMcol <- HMMdf$colour[c(1,8:15,2:7)]
names(HMMcol)[c(1,8:15,2:7)]<- as.character(HMMdf$ID) 
HMMdummy <- HMMdf

if(!file.exists("objects/230413_HMM_annotation.rdat")){

for (i in c(1:length(df$token))){
  cat(i,"\n")
  tok <- df$token[i]
  cat(tok,"\n")
  bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
  if(length(bed_ss)>50000){
    bed_ss <- bed_ss[sample(length(bed_ss),50000)]
  }
  seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
  seqinfo(bed_ss)<- seqinfo(Hsapiens)
  start(bed_ss) <- start(bed_ss)+1 #readjust start
  bed_ss <- sort(bed_ss)
  names(bed_ss)<- 1:length(bed_ss)
  peakAnno <- mergeByOverlaps(bed_ss, HMM)
  
  tmp <- data.frame(table(peakAnno$ID)/length(bed_ss))
  
  tmp <- merge(HMMdummy,tmp,by.x="ID",by.y="Var1",all=TRUE)
  HMMdf <- cbind(HMMdf,tmp[,3])
}
colnames(HMMdf) <- c("ID","colour",df$token)

HHMdf <- HHMdf[c(1,8:15,2:7),]
HHMdf$ID <- factor(HHMdf$ID,levels=HHMdf$ID)
HHMdf$colour <-c("#FF0000","#FF6969","#CF0BC6", "#FACA00", "#FACA01", "#FFFC04", "#FFFC05", "#0ABEFE", "#00B050", "#00B051", "#99FF66","#7F7F7F", "#F5F5F5", "#F5F5F6", "#F5F5F7")

 save(HMMdf,file="objects/230413_HMM_annotation.rdat")
}

Annotation Repeats

if(!file.exists("objects/230413_rep_annotation.rdat")){
  
  repdb <- get(load("data/repdb_UCSC.Rdata"))
  repdf <- data.frame(
  row.names=names(table(repdb$repClass)))
  repdf$repClass <- names(table(repdb$repClass))
  
  repdfdummy <- repdf
  repdfdummy$repClass <- paste0(repdfdummy$repClass,"_p")
  repdfdummy2 <- repdf
  repdfdummy2$repClass <- paste0(repdfdummy2$repClass,"_m")
  repdfdummy <- rbind(repdfdummy,repdfdummy2)
  repdf <- repdfdummy
  
for (i in 1:length(df$token)){
  cat(i,"\n")
  tok <- df$token[i]
  cat(tok,"\n")
  bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
  if(length(bed_ss)>50000){
    bed_ss <- bed_ss[sample(length(bed_ss),50000)]
  }
  seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
  seqinfo(bed_ss)<- seqinfo(Hsapiens)
  start(bed_ss) <- start(bed_ss)+1 #readjust start
  bed_ss <- sort(bed_ss)
  bed_ss_rev <- bed_ss
  strand(bed_ss_rev) <- "-"
  names(bed_ss)<- 1:length(bed_ss)
  peakAnno_p <- mergeByOverlaps(bed_ss, repdb[which(strand(repdb)=="+")])
  peakAnno_m <- mergeByOverlaps(bed_ss_rev, repdb[which(strand(repdb)=="-")])
  
  tmp_p <- data.frame(table(peakAnno_p$repClass)/length(bed_ss))
  tmp_m <- data.frame(table(peakAnno_m$repClass)/length(bed_ss))
  tmp_p$Var1 <- paste0(tmp_p$Var,"_p")
  tmp_m$Var1 <- paste0(tmp_m$Var,"_m")
  
  tmp <- rbind(tmp_p,tmp_m)
  repdf <- cbind(repdf,tmp[,2])
}
colnames(repdf) <- c("repClass",df$token)

repdf2 <- rbind(c(colSums(repdf[c(17:21,13,1:2),-1])),
                c(colSums(repdf[21+c(17:21,13,1:2),-1])),
                c(colSums(repdf[c(15:16),-1])),
                c(colSums(repdf[21+c(15:16),-1])),
                c(colSums(repdf[c(14),-1])),
                c(colSums(repdf[21+c(14),-1])),
                c(colSums(repdf[c(12),-1])),
                c(colSums(repdf[21+c(12),-1])),
                c(colSums(repdf[c(11),-1])),
                c(colSums(repdf[21+c(11),-1])),
                c(colSums(repdf[c(6:7),-1])),
                c(colSums(repdf[21+c(6:7),-1])),
                c(colSums(repdf[c(5),-1])),
                c(colSums(repdf[21+c(5),-1])),
                c(colSums(repdf[c(4,3),-1])),
                c(colSums(repdf[21+c(4,3),-1]))
                  )

repdf2 <- data.frame(repClass=c("other_p","other_m","SINE_p","SINE_m","Simple_p","Simple_m","Satellite_p","Satellite_m","rRNA_p","rRNA_m","LTR_p","LTR_m","Low_complexity_p","Low_complexity_m","LINE_p","LINE_m"),repdf2)


colnames(repdf2) <- c("repClass",df$token)
save(repdf2,file="objects/230413_rep_annotation.rdat")
}

Annotation Replication timing and strand

if(!file.exists("objects/230413_rep_strand_annotation.rdat") &! file.exists("objects/230413_rep_timing_annotation.rdat")){

  repltime <- read.table("data/GSE131417_K562_reptime_segmentation_recolored.bed")
replstrand <- rtracklayer::import("data/OK-seq_K562_BR1.bam.rfd.bw")

seqlevels(replstrand, pruning.mode="coarse") <- paste0("chr",c(1:22,"X","Y"))
seqinfo(replstrand)<- seqinfo(Hsapiens)

colnames(repltime)[1:4] <- c("seqnames","start","end","timing")
repltime <- GRanges(repltime)
seqlevels(repltime, pruning.mode="coarse") <- paste0("chr",c(1:22,"X","Y"))
seqinfo(repltime)<- seqinfo(Hsapiens)
early <- repltime[repltime$timing == "early"]
mid_early <- repltime[repltime$timing == "mid-early"]
mid_late <- repltime[repltime$timing == "mid-late"]
late <- repltime[repltime$timing == "late"]

plusrepl <- reduce(replstrand[replstrand$score>0])
minusrepl <- reduce(replstrand[replstrand$score<0])
neut<- reduce(replstrand[replstrand$score==0])

genedf <- data.frame(
row.names=c("early","mid_early","mid_late","late"))
genedf$feature <- c("early","mid_early","mid_late","late")

stranddf <- data.frame(
row.names=c("plus","minus","neutral"))
stranddf$feature <- c("plus","minus","neutral")
  
for (i in 1:length(df$token)){
  cat(i,"\n")
  tok <- df$token[i]
  cat(tok,"\n")
  bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
  seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
  seqinfo(bed_ss)<- seqinfo(Hsapiens)
  if(length(bed_ss)>50000){
    bed_ss <- bed_ss[sample(length(bed_ss),50000)]
  }
  
  start(bed_ss) <- start(bed_ss)+1 #readjust start
  bed_ss <- sort(bed_ss)
  names(bed_ss)<- 1:length(bed_ss)
  peakAnno_early <- bed_ss[bed_ss %over% early]
  peakAnno_mid_early <- bed_ss[bed_ss %over% mid_early]
  peakAnno_mid_late <- bed_ss[bed_ss %over% mid_late]
  peakAnno_late <- bed_ss[bed_ss %over% late]
  peakAnno_plus <- bed_ss[bed_ss %over% plusrepl]
  peakAnno_minus <- bed_ss[bed_ss %over% minusrepl]
  peakAnno_neut <- bed_ss[bed_ss %over% neut]
  
  genedf <- cbind(genedf,c(length(peakAnno_early),length(peakAnno_mid_early),length(peakAnno_mid_late),length(peakAnno_late)))
  stranddf <- cbind(stranddf,c(length(peakAnno_plus),length(peakAnno_minus),length(peakAnno_neut)))
}
colnames(genedf) <- c("class",df$token)
colnames(stranddf) <- c("class",df$token)

genedf[,2:602] <- genedf[,2:602]/colSums(genedf[,2:602])
stranddf[,2:602] <- t(t(stranddf[,2:602])/colSums(stranddf[,2:602]))

genedf2 <- genedf[,c("class",df$token[Ghm1])]
genedf2$class <- factor(genedf2$class,levels=genedf2$class)
stranddf2 <- stranddf[,c("class",df$token[Ghm1])]

save(genedf2,file="objects/230413_rep_timing_annotation.rdat")
save(stranddf2,file="objects/230413_rep_strand_annotation.rdat")
}

Integration of annotation into df

annot <- get(load("objects/230413_gene_annotation.rdat"))
annot <- data.frame(t(annot[,-602]))
annot$token <- rownames(annot)
df <- merge(df,annot, by="token")

annot <- get(load("objects/230413_gene_orientation_annotation.rdat"))
rownames(annot) <- annot$class
annot <- data.frame(annot[,-c(1)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)
df <- merge(df,annot, by="token", all=TRUE)

annot <- get(load("objects/230413_cds_orientation_annotation.rdat"))
annot <- data.frame(t(annot[,-1]))
annot$token <- rownames(annot)
colnames(annot)<-  c("cds_p","cds_m","cds_other","token")

df <- merge(df,annot, by="token", all=TRUE)

annot <- get(load("objects/230413_HMM_annotation.rdat"))
rownames(annot) <- annot$ID
annot <- data.frame(annot[,-c(1:2)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)

df <- merge(df,annot, by="token", all=TRUE)

annot <- get(load("objects/230413_rep_annotation.rdat"))
rownames(annot) <- annot$repClass
annot <- data.frame(annot[,-c(1)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)

df <- merge(df,annot, by="token", all=TRUE)

annot <- get(load("objects/230413_rep_timing_annotation.rdat"))
rownames(annot) <- annot$class
annot <- data.frame(annot[,-c(1)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)

df <- merge(df,annot, by="token", all=TRUE)

annot <- get(load("objects/230413_rep_strand_annotation.rdat"))
rownames(annot) <- annot$class
annot <- data.frame(annot[,-c(1)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)

df <- merge(df,annot, by="token", all=TRUE)

save(df, file="objects/GROVER_annotation_df.rdat")

Regression with GC content

   df$X1st.Exon_residuals <- resid(loess(df$X1st.Exon ~ df$GC, span = 1))
   df$X1st.Intron_residuals <- resid(loess(df$X1st.Intron ~ df$GC, span = 1))
   df$X3..UTR_residuals <- resid(loess(df$X3..UTR ~ df$GC, span = 1))
   df$X5..UTR_residuals <- resid(loess(df$X5..UTR ~ df$GC, span = 1))
   df$Distal.Intergenic_residuals <- resid(loess(df$Distal.Intergenic ~ df$GC, span = 1))
   df$Downstream....300._residuals <- resid(loess(df$Downstream....300. ~ df$GC, span = 1))
   df$Other.Exon_residuals <- resid(loess(df$Other.Exon ~ df$GC, span = 1))
   df$Other.Intron_residuals <- resid(loess(df$Other.Intron ~ df$GC, span = 1))
   df$Promoter_residuals <- resid(loess(df$Promoter ~ df$GC, span = 1))
   df$X1_Active_Promoter_residuals <- resid(loess(df$X1_Active_Promoter ~ df$GC, span = 1))
   df$X2_Weak_Promoter_residuals <- resid(loess(df$X2_Weak_Promoter ~ df$GC, span = 1))
   df$X3_Poised_Promoter_residuals <- resid(loess(df$X3_Poised_Promoter ~ df$GC, span = 1))
   df$X4_Strong_Enhancer_residuals <- resid(loess(df$X4_Strong_Enhancer ~ df$GC, span = 1))
   df$X5_Strong_Enhancer_residuals <- resid(loess(df$X5_Strong_Enhancer ~ df$GC, span = 1))
   df$X6_Weak_Enhancer_residuals <- resid(loess(df$X6_Weak_Enhancer ~ df$GC, span = 1))
   df$X7_Weak_Enhancer_residuals <- resid(loess(df$X7_Weak_Enhancer ~ df$GC, span = 1))
   df$X8_Insulator_residuals <- resid(loess(df$X8_Insulator ~ df$GC, span = 1))
   df$X9_Txn_Transition_residuals <- resid(loess(df$X9_Txn_Transition ~ df$GC, span = 1))
   df$X10_Txn_Elongation_residuals <- resid(loess(df$X10_Txn_Elongation ~ df$GC, span = 1))
   df$X11_Weak_Txn_residuals <- resid(loess(df$X11_Weak_Txn ~ df$GC, span = 1))
   df$X12_Repressed_residuals <- resid(loess(df$X12_Repressed ~ df$GC, span = 1))
   df$X13_Heterochrom.lo_residuals <- resid(loess(df$X13_Heterochrom.lo ~ df$GC, span = 1))
   df$X14_Repetitive.CNV_residuals <- resid(loess(df$X14_Repetitive.CNV ~ df$GC, span = 1))
   df$X15_Repetitive.CNV_residuals <- resid(loess(df$X15_Repetitive.CNV ~ df$GC, span = 1))
   
   df$gene_p_residuals <- resid(loess(df$gene_p ~ df$GC, span = 1))
   df$gene_m_residuals <- resid(loess(df$gene_m ~ df$GC, span = 1))
   df$intragenic_residuals <- resid(loess(df$intragenic ~ df$GC, span = 1))
   
   df$early_residuals <- resid(loess(df$early ~ df$GC, span = 1))
   df$mid_early_residuals <- resid(loess(df$mid_early ~ df$GC, span = 1))
   df$mid_late_residuals <- resid(loess(df$mid_late ~ df$GC, span = 1))
   df$late_residuals <- resid(loess(df$late ~ df$GC, span = 1))
   df$cds_p_residuals <- resid(loess(df$cds_p ~ df$GC, span = 1))
   df$cds_m_residuals <- resid(loess(df$cds_m ~ df$GC, span = 1))
   df$cds_other_residuals <- resid(loess(df$cds_other ~ df$GC, span = 1))

Correlation of PCs with annotation

# fix class for token length to numeric
df$length <- as.numeric(as.character(df$length))

tmp <- data.frame(Gpca$x)[,1:100]
tmp$token <- rownames(tmp)
rownames(df)<- df$token
df <- df %>% arrange(df,token)
tmp <- tmp[df$token,]

cors <- data.frame(nrow=20)
for(i in c(colnames(df)[c(18,19,24:26,91,86,85,83,89,84,90,88,87,107:108,114:115,92:106,60:75,110:113,80:81)])){
  cors_tmp <- cor(tmp[,1:20],df[i],method="spearman")
  cors <- cbind(cors,cors_tmp)
}

## Warning in cor(tmp[, 1:20], df[i], method = "spearman"): the standard deviation
## is zero

## Warning in cor(tmp[, 1:20], df[i], method = "spearman"): the standard deviation
## is zero

tmp <- data.frame(GWpca$x)[,1:100]
tmp$token <- rownames(tmp)
rownames(df)<- df$token
df <- df %>% arrange(df,token)
tmp <- tmp[df$token,]

corsW <- data.frame(nrow=20)
for(i in c(colnames(df)[c(18,19,24:26,91,86,85,83,89,84,90,88,87,107:108,114:115,92:106,60:75,110:113,80:81)])){
  cors_tmp <- cor(tmp[,1:20],df[i],method="spearman")
  corsW <- cbind(corsW,cors_tmp)
}

## Warning in cor(tmp[, 1:20], df[i], method = "spearman"): the standard deviation
## is zero

## Warning in cor(tmp[, 1:20], df[i], method = "spearman"): the standard deviation
## is zero

Figure 4G & H

varexplplot(GWvarvec)

tab <- corsW[,-c(1,40,48)]
tab$varexpl <- GWvarvec[1:20]

write.csv(tab, file="fig4g.csv")

ph <- pheatmap(t(corsW[,-c(1,40,48)]),
         breaks = seq(-1, 1, length.out = 100),
         cluster_rows = FALSE,
         cluster_cols = FALSE,
         gaps_row = c(5,14,16,18,33,47,51),
         color = colorRampPalette(c("navy", "white", "firebrick3"))(100)
         )

print(ph)

varexplplot(Gvarvec)

tab <- cors[,-c(1,40,48)]
tab$varexpl <- Gvarvec[1:20]

write.csv(tab, file="fig4h.csv")

ph <- pheatmap(t(cors[,-c(1,40,48)]),
         breaks = seq(-1, 1, length.out = 100),
         cluster_rows = FALSE,
         cluster_cols = FALSE,
         gaps_row = c(5,14,16,18,33,47,51),
         color = colorRampPalette(c("navy", "white", "firebrick3"))(100)
         )

  print(ph)

Figure 4I

ggplot(df, aes(x=GROVER_PC1, y=log10(frequency), colour=token))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  scale_x_continuous(name=paste("PC1 (",round(100*Gvarvec[1],2),"%)",sep=""))+
  scale_y_continuous(name=paste("frequency (log10)",sep=""))+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

cor.test(df$GROVER_PC1,log10(df$frequency), method="spearman")

## 
##  Spearman's rank correlation rho
## 
## data:  df$GROVER_PC1 and log10(df$frequency)
## S = 4327716, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8803844

Figure 4J

ggplot(df, aes(x=GROVER_PC2, y=GC, colour=token))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  scale_x_continuous(name=paste("PC2 (",round(100*Gvarvec[2],2),"%)",sep=""))+
  scale_y_continuous(name=paste("GC [%]",sep=""))+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

cor.test(df$GROVER_PC2,df$GC, method="spearman")

## Warning in cor.test.default(df$GROVER_PC2, df$GC, method = "spearman"): Cannot
## compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  df$GROVER_PC2 and df$GC
## S = 70998078, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.9623462

Figure 4K

ggplot(df, aes(x=GROVER_PC3, y=AG, colour=W2V_hex))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  scale_x_continuous(name=paste("PC3 (",round(100*Gvarvec[3],2),"%)",sep=""))+
  scale_y_continuous(name=paste("AG [%]",sep=""))+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

cor.test(df$GROVER_PC3,df$AG,method="spearman")

## Warning in cor.test.default(df$GROVER_PC3, df$AG, method = "spearman"): Cannot
## compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  df$GROVER_PC3 and df$AG
## S = 2083581, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##      rho 
## 0.942411

Figure 4L

df$length <- as.numeric(df$length)
ggplot(df, aes(x=GROVER_PC4, y=length, colour=W2V_hex))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  #geom_label_repel(aes(label = token ))+
  scale_x_continuous(name=paste("PC4 (",round(100*Gvarvec[4],2),"%)",sep=""))+
  scale_y_continuous(name=paste("token length",sep=""))+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

cor.test(df$GROVER_PC4,df$length,method="spearman")

## Warning in cor.test.default(df$GROVER_PC4, df$length, method = "spearman"):
## Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  df$GROVER_PC4 and df$length
## S = 22032923, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.3910226

Figure 4M

ggplot(df, aes(x=GROVER_PC5, y=A+C, colour=W2V_hex))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  scale_x_continuous(name=paste("PC5 (",round(100*Gvarvec[5],2),"%)",sep=""))+
  scale_y_continuous(name=paste("AC",sep=""))+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

cor.test(df$GROVER_PC5,df$A+df$C,method="spearman")

## Warning in cor.test.default(df$GROVER_PC5, df$A + df$C, method = "spearman"):
## Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  df$GROVER_PC5 and df$A + df$C
## S = 65480318, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.8098385

Figure 4N

ggplot(df, aes(x=GROVER_PC6, y=length, colour=W2V_hex))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  scale_x_continuous(name=paste("PC6 (",round(100*Gvarvec[6],2),"%)",sep=""))+
  scale_y_continuous(name=paste("token length",sep=""))+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

cor.test(df$GROVER_PC6,df$length,method="spearman")

## Warning in cor.test.default(df$GROVER_PC6, df$length, method = "spearman"):
## Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  df$GROVER_PC6 and df$length
## S = 20756012, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.4263157

Figure S2

Average GROVER token embeddings cluster by token characteristics and annotation
Clustering and annotation were performed separately in that the order after clustering was obtained and then used for the associated bar graphs. The results were then combined with Adobe Illustrator.

Gdist<- dist(Gemb,method = "euclidean")
Gdist <- as.matrix(Gdist)
Gdf <- data.frame(Gdist)
df$logfreq <- log10(df$frequency)
df$length <- as.factor(df$length)

annot <- df[,c("GROVER_W2V_PC1","GC","AG","length","logfreq")]
rownames(annot) <- df$token
annotcol <- list(length=mycolors,GROVER_W2V_PC1=hex, logfreq=viridis(100, option ="B",direction=-1)) 
breaks <- c(0, seq(min(Gdf[Gdf != 0]), max(Gdf), length.out = 99))

write.csv(Gdf, file="figS2_hm.csv")
write.csv(annot, file="figS2_annot.csv")

ph <- pheatmap(Gdf, 
         cluster_rows=TRUE,
         cluster_cols=TRUE,
         annotation_col=annot,
         annotation_colors = annotcol,
         annotation_row = NULL,
         annotation_row_names = FALSE,
         annotation_col_names = FALSE,
         color= c("grey",viridis(100,direction=1)),
         show_rownames = F,
         show_colnames = F,
         breaks = breaks)

Ghm1 <- ph$tree_col[["order"]]

load("objects/230413_rep_annotation.rdat")
melted <- melt(repdf2, id="repClass")

write.csv(melted, file="figS2_repeats.csv")

rep_cols <- c(brewer.pal(n = 8, name = "Dark2"),brewer.pal(n = 8, name = "Set2"))
repcol <- rep_cols[c(1,9,2,10,3,11,4,12,5,13,6,14,7,15,8,16)]

ggplot(melted,aes(x=variable,y=value,fill=repClass))+
  geom_bar(stat="identity", width = 1)+
  theme_cowplot()+
  scale_fill_manual(values=repcol)+
 coord_flip()+
  NULL

HHMdf <- get(load("objects/230413_HMM_annotation.rdat"))
HHMcol <- c("#FF0000","#FF6969","#CF0BC6", "#FACA00", "#FACA01", "#FFFC04", "#FFFC05", "#0ABEFE", "#00B050", "#00B051", "#99FF66","#7F7F7F", "#F5F5F5", "#F5F5F6", "#F5F5F7")
HHMdf <- data.frame(HHMdf)
 
HHMdf[,3:603] <- t(t(HHMdf[,3:603])/colSums(HHMdf[,3:603]))

melted <- melt(HHMdf[,-2], id=c("ID"))

write.csv(melted, file="figS2_chromatin.csv")

ggplot(melted,aes(x=variable,y=value,fill=ID))+
  geom_bar(stat="identity", width = 1)+
  theme_cowplot()+
  scale_fill_manual(values=HHMcol)+
  coord_flip()+
  NULL

Figure 5

GROVER learns token context and genome annotation

Clustering and annotation were performed separately in that the order after clustering was obtained and then used for the associated bar graphs. The results were then combined with Adobe Illustrator.

Figure 5A

self_sim <- read.csv("data/self_similarity_per_token.csv")

ss <- self_sim[,3:14]

df <- merge(df, self_sim, id="token")
rownames(ss) <- self_sim$token

df$logfreq <- log10(df$frequency)

annot <- df[,c("logfreq","length","GROVER_W2V_PC1","GROVER_AUC")[4:1]]
annot$length <- factor(annot$length)
rownames(annot) <- df$token

annotcol <- list(length=mycolors,GROVER_W2V_PC1=hex, logfreq=viridis(100, option ="B",direction=-1)) 

write.csv(annot, file="fig5A_annot.csv")
write.csv(ss, file="fig5A_heatmap.csv")

phss <-  pheatmap(ss,
         cluster_cols=FALSE,
         annotation_row = annot,
         annotation_colors = annotcol,
         show_rownames = F,
         color= viridis(100,direction=1))

sshm <- phss$tree_row[["order"]]

token_order <- rownames(ss)[sshm]

HHMdf <- get(load("objects/230413_HMM_annotation.rdat"))
HHMdf <- data.frame(HHMdf)
HHMdf[,3:603] <- t(t(HHMdf[,3:603])/colSums(HHMdf[,3:603]))
HHMdf <- HHMdf[,c("ID",token_order)]
melted <- melt(HHMdf[,-2], id=c("ID"))

ggplot(melted,aes(x=variable,y=value,fill=ID))+
  geom_bar(stat="identity", width = 1)+
  theme_cowplot()+
  scale_fill_manual(values=HHMcol)+
 coord_flip()+
  NULL

load("objects/230413_rep_annotation.rdat")

repdf2 <- repdf2[,c("repClass",token_order)]
melted <- melt(repdf2, id="repClass")

rep_cols <- c(brewer.pal(n = 8, name = "Dark2"),brewer.pal(n = 8, name = "Set2"))
repcol <- rep_cols[c(1,9,2,10,3,11,4,12,5,13,6,14,7,15,8,16)]

ggplot(melted,aes(x=variable,y=value,fill=repClass))+
  geom_bar(stat="identity", width = 1)+
  theme_cowplot()+
  scale_fill_manual(values=repcol)+
  coord_flip()+
  NULL

Figure 5B-D

emb <- read.table("data/umap/cls_tokens_n_neighbors_200_min_dist_0.1_metric_euclidean.bed")
colnames(emb)<- c("chromosome","start","end","x","y")
emb$start <- emb$start+1 # the ranges are from python and thus 0 based
emb$end <- emb$end+1
GR <- GRanges(emb)
seqinfo(GR)<- seqinfo(Hsapiens)

dat <- data.frame(elementMetadata(GR))

  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour="black", alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  ggtitle("all")+
  NULL

## Warning: Removed 995 rows containing missing values or values outside the scale range
## (`geom_point()`).

#annotate to repeats
  repdb <- get(load("/Users/anna/projects/NLP/GROVER/data/repdb_UCSC.Rdata"))
  seqlevels(repdb)<- seqlevels(Hsapiens)
  seqinfo(repdb)<- seqinfo(Hsapiens)

## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 14 out-of-bound ranges located on sequences
##   chrUn_gl000211, chrUn_gl000217, chrUn_gl000220, chrUn_gl000227,
##   chrUn_gl000230, chrUn_gl000232, chrUn_gl000233, chrUn_gl000235,
##   chrUn_gl000236, chrUn_gl000244, chrUn_gl000246, chrUn_gl000247,
##   chr1_gl000191_random, and chr19_gl000208_random. Note that ranges
##   located on a sequence whose length is unknown (NA) or on a circular
##   sequence are not considered out-of-bound (use seqlengths() and
##   isCircular() to get the lengths and circularity flags of the underlying
##   sequences). You can use trim() to trim these ranges. See
##   ?`trim,GenomicRanges-method` for more information.

  rep_cols <- c(brewer.pal(n = 8, name = "Dark2"),brewer.pal(n = 8, name = "Set2"))
  repcol <- rep_cols[c(1,9,2,10,3,11,4,12,5,13,6,14,7,15,8,16)]

  rep_ss <- repdb[repdb$repClass %in% c("LINE", "LINE?") & strand(repdb) == "-"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))

  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[1], alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  ggtitle("LINE-")+
  theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank()
        )+
  NULL

## Warning: Removed 19 rows containing missing values or values outside the scale range
## (`geom_point()`).

  rep_ss <- repdb[repdb$repClass %in% c("LINE", "LINE?") & strand(repdb) == "+"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[9], alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  ggtitle("LINE+")+
  theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank()
        )+
  NULL

## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).

  rep_ss <- repdb[repdb$repClass %in% c("Low_complexity")]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[2], alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  ggtitle("Low_complexity")+
   theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank()
        )+
  NULL

## Warning: Removed 51 rows containing missing values or values outside the scale range
## (`geom_point()`).

  rep_ss <- repdb[repdb$repClass %in% c("LTR","LTR?") & strand(repdb) == "-"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[3],  alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  ggtitle("LTR-")+
   theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank()
        )+
  NULL

## Warning: Removed 116 rows containing missing values or values outside the scale range
## (`geom_point()`).

  rep_ss <- repdb[repdb$repClass %in% c("LTR","LTR?") & strand(repdb) == "+"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[11], alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  ggtitle("LTR+")+
   theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank()
        )+
  NULL

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

  rep_ss <- repdb[repdb$repClass %in% c("other","DNA","DNA?","RC","RNA","scRNA","snRNA","srpRNA","tRNA","Unknown","Unknown?") & strand(repdb) == "-"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[4], alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  ggtitle("other-")+
   theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank()
        )+
  NULL

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

    rep_ss <- repdb[repdb$repClass %in% c("other","DNA","DNA?","RC","RNA","scRNA","snRNA","srpRNA","tRNA","Unknown","Unknown?") & strand(repdb) == "+"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[12], alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  ggtitle("other+")+
   theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank()
        )+
  NULL

    rep_ss <- repdb[repdb$repClass %in% c("rRNA") & strand(repdb) == "-"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[5], alpha=1, size=0.1)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  ggtitle("rRNA-")+
   theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank()
        )+
  NULL

    rep_ss <- repdb[repdb$repClass %in% c("rRNA") & strand(repdb) == "+"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[13], alpha=1, size=0.1)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  ggtitle("rRNA+")+
   theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank()
        )+
  NULL

     rep_ss <- repdb[repdb$repClass %in% c("Satellite") & strand(repdb) == "-"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[6], alpha=1, size=0.1)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  ggtitle("Satellite-")+
   theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank()
        )+
  NULL

## Warning: Removed 285 rows containing missing values or values outside the scale range
## (`geom_point()`).

    rep_ss <- repdb[repdb$repClass %in% c("Satellite") & strand(repdb) == "+"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[14], alpha=1, size=0.1)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  ggtitle("Satellite+")+
   theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank()
        )+
  NULL

## Warning: Removed 525 rows containing missing values or values outside the scale range
## (`geom_point()`).

      rep_ss <- repdb[repdb$repClass %in% c("Simple_repeat") ]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[15], alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  ggtitle("Simple")+
   theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank()
        )+
  NULL

## Warning: Removed 150 rows containing missing values or values outside the scale range
## (`geom_point()`).

  rep_ss <- repdb[repdb$repClass %in% c("SINE", "SINE?") & strand(repdb) == "-"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[8], alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  ggtitle("SINE-")+
     theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank()
        )+
  NULL

## Warning: Removed 32 rows containing missing values or values outside the scale range
## (`geom_point()`).

  rep_ss <- repdb[repdb$repClass %in% c("SINE","SINE?") & strand(repdb) == "+"]
  
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[16], alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  ggtitle("SINE+")+
    theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank()
        )+
  NULL

## Warning: Removed 40 rows containing missing values or values outside the scale range
## (`geom_point()`).

Highlight knowledge of strand specificity

 rep_m <- repdb[repdb$repClass %in% c("SINE", "SINE?") & strand(repdb) == "-"]
  dat_m <- GR[GR %over% rep_m]
  rep_p <- repdb[repdb$repClass %in% c("SINE","SINE?") & strand(repdb) == "+"]
  dat_p <- GR[GR %over% rep_p]
  
  dat_m2 <- dat_m[!dat_m %over% dat_p]
  dat_p2 <- dat_p[!dat_p %over% dat_m]
  
  dat1 <- data.frame(elementMetadata(dat_m2))
  dat1$ori <- "m"
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[8], alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  ggtitle("SINE-")+
     theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank()
        )+
  NULL

## Warning: Removed 40 rows containing missing values or values outside the scale range
## (`geom_point()`).

  dat2 <- data.frame(elementMetadata(dat_p2))
  dat2$ori <- "p"
  
  dat <- rbind (dat1,dat2)
  
  ggplot(dat, aes(x=x, y=y, colour=ori))+
  geom_point(alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  scale_colour_manual(values=c("green","magenta"))+
  ggtitle("SINE orientation")+
     theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank(),
        legend.position = "none"
        )+
  NULL

## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).

Line orientation

 rep_m <- repdb[repdb$repClass %in% c("LINE", "LINE?") & strand(repdb) == "-"]
  dat_m <- GR[GR %over% rep_m]
  rep_p <- repdb[repdb$repClass %in% c("LINE","LINE?") & strand(repdb) == "+"]
  dat_p <- GR[GR %over% rep_p]
  
  dat_m2 <- dat_m[!dat_m %over% dat_p]
  dat_p2 <- dat_p[!dat_p %over% dat_m]
  
  dat1 <- data.frame(elementMetadata(dat_m2))
  dat1$ori <- "m"

  dat2 <- data.frame(elementMetadata(dat_p2))
  dat2$ori <- "p"
  
  dat <- rbind (dat1,dat2)
  
  ggplot(dat, aes(x=x, y=y, colour=ori))+
  geom_point(alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  scale_colour_manual(values=c("green","magenta"))+
  ggtitle("LINE orientation")+
     theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank(),
        legend.position = "none"
        )+
  NULL

## Warning: Removed 47 rows containing missing values or values outside the scale range
## (`geom_point()`).

Full length and new Line orientation

 rep_m <- repdb[repdb$repFamily %in% c("L1") & strand(repdb) == "-" & width(repdb) > 5700 & width(repdb) < 6300]
  dat_m <- GR[GR %over% rep_m]
  rep_p <- repdb[repdb$repFamily %in% c("L1") & strand(repdb) == "+" & width(repdb) > 5700 & width(repdb) < 6300]
  dat_p <- GR[GR %over% rep_p]
  
  dat_m2 <- dat_m[!dat_m %over% dat_p]
  dat_p2 <- dat_p[!dat_p %over% dat_m]
  
  dat1 <- data.frame(elementMetadata(dat_m2))
  dat1$ori <- "m"

  dat2 <- data.frame(elementMetadata(dat_p2))
  dat2$ori <- "p"
  
  dat <- rbind (dat1,dat2)
  
  ggplot(dat, aes(x=x, y=y, colour=ori))+
  geom_point(alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  scale_colour_manual(values=c("green","magenta"))+
  ggtitle("LINE orientation")+
     theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank(),
        legend.position = "none"
        )+
  NULL

## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).

Full length and new Alu orientation

 rep_m <- repdb[repdb$repFamily %in% c("Alu") & strand(repdb) == "-" & width(repdb) > 270 & width(repdb) < 330]
  dat_m <- GR[GR %over% rep_m]
  rep_p <- repdb[repdb$repFamily %in% c("Alu") & strand(repdb) == "+" & width(repdb) > 270 & width(repdb) < 330]
  dat_p <- GR[GR %over% rep_p]
  
  dat_m2 <- dat_m[!dat_m %over% dat_p]
  dat_p2 <- dat_p[!dat_p %over% dat_m]
  
  dat1 <- data.frame(elementMetadata(dat_m2))
  dat1$ori <- "m"

  dat2 <- data.frame(elementMetadata(dat_p2))
  dat2$ori <- "p"
  
  dat <- rbind (dat1,dat2)
  
  ggplot(dat, aes(x=x, y=y, colour=ori))+
  geom_point(alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  scale_colour_manual(values=c("green","magenta"))+
  ggtitle("SINE orientation")+
     theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank(),
        legend.position = "none"
        )+
  NULL

## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).

LTR orientation

 rep_m <- repdb[repdb$repClass %in% c("LTR","LTR?") & strand(repdb) == "-"]
  dat_m <- GR[GR %over% rep_m]
  rep_p <- repdb[repdb$repClass %in% c("LTR","LTR?") & strand(repdb) == "+"]
  dat_p <- GR[GR %over% rep_p]
  
  dat_m2 <- dat_m[!dat_m %over% dat_p]
  dat_p2 <- dat_p[!dat_p %over% dat_m]
  
  dat1 <- data.frame(elementMetadata(dat_m2))
  dat1$ori <- "m"

  dat2 <- data.frame(elementMetadata(dat_p2))
  dat2$ori <- "p"
  
  dat <- rbind (dat1,dat2)
  
  ggplot(dat, aes(x=x, y=y, colour=ori))+
  geom_point(alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  scale_colour_manual(values=c("green","magenta"))+
  ggtitle("LTR orientation")+
     theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank(),
        legend.position = "none"
        )+
  NULL

## Warning: Removed 113 rows containing missing values or values outside the scale range
## (`geom_point()`).

LTR > 1000 orientation

 rep_m <- repdb[repdb$repClass %in% c("LTR") & strand(repdb) == "-" & width(repdb) > 1000]
  dat_m <- GR[GR %over% rep_m]
  rep_p <- repdb[repdb$repClass %in% c("LTR") & strand(repdb) == "+" & width(repdb) > 1000]
  dat_p <- GR[GR %over% rep_p]
  
  dat_m2 <- dat_m[!dat_m %over% dat_p]
  dat_p2 <- dat_p[!dat_p %over% dat_m]
  
  dat1 <- data.frame(elementMetadata(dat_m2))
  dat1$ori <- "m"

  dat2 <- data.frame(elementMetadata(dat_p2))
  dat2$ori <- "p"
  
  dat <- rbind (dat1,dat2)
  
  ggplot(dat, aes(x=x, y=y, colour=ori))+
  geom_point(alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  scale_colour_manual(values=c("green","magenta"))+
  ggtitle("LTR orientation >1000")+
     theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank(),
        legend.position = "none"
        )+
  NULL

## Warning: Removed 106 rows containing missing values or values outside the scale range
## (`geom_point()`).

Figure 6C

HMM <- read.table("data/GM12878_ChromHMM.bed")
HMM <- HMM[,c(2,3,4,5,10)]
colnames(HMM) <- c("seqnames","start","end","ID","colour")
HMM <- GRanges(HMM)

HHMdf <- get(load("../GROVER/objects/230413_HMM_annotation.rdat"))
HHMcol <- c("#FF0000","#FF6969","#CF0BC6", "#FACA00", "#FACA01", "#FFFC04", "#FFFC05", "#0ABEFE", "#00B050", "#00B051", "#99FF66","#7F7F7F", "#F5F5F5", "#F5F5F6", "#F5F5F7")
HHMdf <- data.frame(HHMdf)

HHMcol[13:15] <- "black"

for(i in c(1:15)){
  GR2 <- HMM[HMM$ID == levels(HHMdf$ID)[i]]
  dat <- GR[GR %over% GR2]
  dat <- data.frame(elementMetadata(dat))
  
  p1 <- ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=HHMcol[i], alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  ggtitle(levels(HHMdf$ID)[i])+
     theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank()
        )+
  NULL
}


for(i in c(1:15)){
  GR2 <- HMM[HMM$ID == levels(HHMdf$ID)[i]]
  dat <- GR[GR %over% GR2]
  dat <- data.frame(elementMetadata(dat))
  
  p1 <- ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=HHMcol[i], alpha=1, size=0.1)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  ggtitle(levels(HHMdf$ID)[i])+
     theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank()
        )+
  NULL
}

replication timing

repltime <- read.table("data/GSE131417_K562_reptime_segmentation_recolored.bed")
colnames(repltime)[1:4] <- c("seqnames","start","end","timing")
repltime <- GRanges(repltime)
seqlevels(repltime, pruning.mode="coarse") <- paste0("chr",c(1:22,"X","Y"))
seqinfo(repltime)<- seqinfo(Hsapiens)

## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 24 out-of-bound ranges located on sequences
##   chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10, chr11,
##   chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21,
##   chr22, chrX, and chrY. Note that ranges located on a sequence whose
##   length is unknown (NA) or on a circular sequence are not considered
##   out-of-bound (use seqlengths() and isCircular() to get the lengths and
##   circularity flags of the underlying sequences). You can use trim() to
##   trim these ranges. See ?`trim,GenomicRanges-method` for more
##   information.

early <- repltime[repltime$timing == "early"]
mid_early <- repltime[repltime$timing == "mid-early"]
mid_late <- repltime[repltime$timing == "mid-late"]
late <- repltime[repltime$timing == "late"]


  dat_m <- GR[GR %over% early]
  dat_p <- GR[GR %over% late]
  
  dat_m2 <- dat_m[!dat_m %over% dat_p]
  dat_p2 <- dat_p[!dat_p %over% dat_m]
  
  dat1 <- data.frame(elementMetadata(dat_m2))
  dat1$ori <- "m"

  dat2 <- data.frame(elementMetadata(dat_p2))
  dat2$ori <- "p"
  
  dat <- rbind (dat1,dat2)
  
  ggplot(dat, aes(x=x, y=y, colour=ori))+
  geom_point(alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  scale_colour_manual(values=c("yellow","blue"))+
  ggtitle("replication timing")+
     theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank(),
        legend.position = "none"
        )+
  NULL

## Warning: Removed 403 rows containing missing values or values outside the scale range
## (`geom_point()`).

replication strand

replstrand <- rtracklayer::import("data/OK-seq_K562_BR1.bam.rfd.bw")

seqlevels(replstrand, pruning.mode="coarse") <- paste0("chr",c(1:22,"X","Y"))
seqinfo(replstrand)<- seqinfo(Hsapiens)

plusrepl <- reduce(replstrand[replstrand$score>0])
minusrepl <- reduce(replstrand[replstrand$score<0])
neut<- reduce(replstrand[replstrand$score==0])

  dat_m <- GR[GR %over% minusrepl]
  dat_p <- GR[GR %over% plusrepl]
  
  dat_m2 <- dat_m[!dat_m %over% dat_p]
  dat_p2 <- dat_p[!dat_p %over% dat_m]
  
  dat1 <- data.frame(elementMetadata(dat_m2))
  dat1$ori <- "m"

  dat2 <- data.frame(elementMetadata(dat_p2))
  dat2$ori <- "p"
  
  dat <- rbind (dat1,dat2)
  
  ggplot(dat, aes(x=x, y=y, colour=ori))+
  geom_point(alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  scale_colour_manual(values=c("green","magenta"))+
  ggtitle("replication strand")+
     theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank(),
        legend.position = "none"
        )+
  NULL

## Warning: Removed 463 rows containing missing values or values outside the scale range
## (`geom_point()`).

transcription strand

genes <- genes(txdb)

##   403 genes were dropped because they have exons located on both strands
##   of the same reference sequence or on more than one reference sequence,
##   so cannot be represented by a single genomic range.
##   Use 'single.strand.genes.only=FALSE' to get all the genes in a
##   GRangesList object, or use suppressMessages() to suppress this message.

genes_p <- genes[strand(genes) == "+"]
genes_m <- genes[strand(genes) == "-"]
strand(genes_m) <- "+"

  dat_m <- GR[GR %over% genes_m]
  dat_p <- GR[GR %over% genes_p]
  
  dat_m2 <- dat_m[!dat_m %over% dat_p]
  dat_p2 <- dat_p[!dat_p %over% dat_m]
  
  dat1 <- data.frame(elementMetadata(dat_m2))
  dat1$ori <- "m"

  dat2 <- data.frame(elementMetadata(dat_p2))
  dat2$ori <- "p"
  
  dat <- rbind (dat1,dat2)
  
  ggplot(dat, aes(x=x, y=y, colour=ori))+
  geom_point(alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  scale_colour_manual(values=c("green","magenta"))+
  ggtitle("replication strand")+
     theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank(),
        legend.position = "none"
        )+
  NULL

## Warning: Removed 41 rows containing missing values or values outside the scale range
## (`geom_point()`).

Coding sequence strand

cds <- cds(txdb)
cds_p <- cds[strand(cds) == "+"]
cds_m <- cds[strand(cds) == "-"]
strand(cds_m) <- "+"

  dat_m <- GR[GR %over% cds_m]
  dat_p <- GR[GR %over% cds_p]
  
  dat_m2 <- dat_m[!dat_m %over% dat_p]
  dat_p2 <- dat_p[!dat_p %over% dat_m]
  
  dat1 <- data.frame(elementMetadata(dat_m2))
  dat1$ori <- "m"

  dat2 <- data.frame(elementMetadata(dat_p2))
  dat2$ori <- "p"
  
  dat <- rbind (dat1,dat2)
  
  ggplot(dat, aes(x=x, y=y, colour=ori))+
  geom_point(alpha=0.05, size=0.01)+
  scale_x_continuous(limits=c(-8,6.5))+
  scale_y_continuous(limits=c(-15,8))+
  scale_colour_manual(values=c("green","magenta"))+
  ggtitle("coding sequence strand ")+
     theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        axis.line = element_blank(),
        legend.position = "none"
        )+
  NULL

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

Figure 6

GROVER outperforms models with fixed-size k-mers for biological fine-tuning tasks

Figure 6A,D,G

Figures 6A,D,G are directly genome browser tracks from the IGV browser for illustration.

Figure 6B & C

dat <- read.table("data/Prom300.tsv", header=TRUE)

dat$model <- factor(dat$model, levels=dat$model[c(10,9,2:4,5:8,11,1,12:18)])

dat_hs <- dat[c(1:4,9:11),]
dat_nc <- dat[c(12:17),]

melted <- melt(dat_hs[,c(2,6)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("DNA language models")+
  scale_fill_manual(values=cols1)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

melted <- melt(dat_nc[,c(2,6)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p3 <- ggplot(melted,aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("context free")+
  scale_fill_manual(values=colsIDF)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

p1+p3+plot_layout(guides = 'collect')

# shuffling

dat_hs <- dat[c(1:4,9:11),]
dat_hs$model <- factor(c("GROVER","b4mer","b5mer","b6mer","k4mer","k5mer","k6mer"), levels=c("k4mer","k5mer","k6mer","b4mer","b5mer","b6mer","GROVER"))


melted <- melt(dat_hs[,c(2,6)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("DNA language models")+
  scale_fill_manual(values=shuffle_cols)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL
p1

Figure S3

# shuffling with MCC
dat_hs <- dat[c(1:4,9:11),]
dat_hs$model <- factor(c("GROVER","b4mer","b5mer","b6mer","k4mer","k5mer","k6mer"), levels=c("k4mer","k5mer","k6mer","b4mer","b5mer","b6mer","GROVER"))


melted <- melt(dat_hs[,c(2,6)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("DNA language models")+
  scale_fill_manual(values=shuffle_cols)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL
p1

dat_hs <- dat[c(1:4,9:11),]
dat_nc <- dat[c(12:17),]

melted <- melt(dat_hs[,c(2,3)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("DNA language models")+
  scale_fill_manual(values=cols2)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

melted <- melt(dat_nc[,c(2,3)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("context free")+
  scale_fill_manual(values=colsIDF)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

p1+p3+plot_layout(guides = 'collect')

ggsave("Prom300_precision.pdf",width=10,height=5)

# shuffling

dat_hs <- dat[c(1:4,9:11),]
dat_hs$model <- factor(c("GROVER","b4mer","b5mer","b6mer","k4mer","k5mer","k6mer"), levels=c("k4mer","k5mer","k6mer","b4mer","b5mer","b6mer","GROVER"))


melted <- melt(dat_hs[,c(2,3)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("DNA language models")+
  scale_fill_manual(values=shuffle_cols)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL
p1

ggsave("Prom300_precision_shuffling.pdf",width=5,height=5)


dat_hs <- dat[c(1:4,9:11),]
dat_nc <- dat[c(12:17),]

melted <- melt(dat_hs[,c(2,4)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("DNA language models")+
  scale_fill_manual(values=cols2)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

melted <- melt(dat_nc[,c(2,4)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("context free")+
  scale_fill_manual(values=colsIDF)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

p1+p3+plot_layout(guides = 'collect')

ggsave("Prom300_recall.pdf",width=10,height=5)

# shuffling

dat_hs <- dat[c(1:4,19:21),]
dat_hs$model <- factor(c("GROVER","b4mer","b5mer","b6mer","k4mer","k5mer","k6mer"), levels=c("k4mer","k5mer","k6mer","b4mer","b5mer","b6mer","GROVER"))


melted <- melt(dat_hs[,c(2,5)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("DNA language models")+
  theme_cowplot()+
  scale_fill_manual(values=shuffle_cols)+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL
p1

ggsave("Prom300_recall_shuffling.pdf",width=5,height=5)


dat_hs <- dat[c(1:4,9:11),]
dat_nc <- dat[c(12:17),]

melted <- melt(dat_hs[,c(2,5)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("DNA language models")+
  scale_fill_manual(values=cols2)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

melted <- melt(dat_nc[,c(2,5)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("context free")+
  scale_fill_manual(values=colsIDF)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

p1+p3+plot_layout(guides = 'collect')

ggsave("Prom300_f1.pdf",width=10,height=5)

# shuffling

dat_hs <- dat[c(1:4,19:21),]
dat_hs$model <- factor(c("GROVER","b4mer","b5mer","b6mer","k4mer","k5mer","k6mer"), levels=c("k4mer","k5mer","k6mer","b4mer","b5mer","b6mer","GROVER"))


melted <- melt(dat_hs[,c(2,5)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("DNA language models")+
  scale_fill_manual(values=shuffle_cols)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL
p1

ggsave("Prom300_f1_shuffling.pdf",width=5,height=5)

Figure 6E&F

dat <- read.table("data/PromScan.tsv", header=TRUE)

dat$model <- factor(dat$model, levels=dat$model[c(5,2:4,1,7:12)])

dat_hs <- dat[c(1:5),]
dat_nc <- dat[c(7:12),]

melted <- melt(dat_hs[,c(2,6)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("DNA language models")+
  scale_fill_manual(values=cols1a)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

melted <- melt(dat_nc[,c(2,6)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("context free")+
  scale_fill_manual(values=colsIDF)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

p1+p3+plot_layout(guides = 'collect')

Figure S4

melted <- melt(dat_hs[,c(2,3)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("DNA language models")+
  scale_fill_manual(values=cols1a)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

melted <- melt(dat_nc[,c(2,3)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("context free")+
  scale_fill_manual(values=colsIDF)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

p1+p3+plot_layout(guides = 'collect')

melted <- melt(dat_hs[,c(2,4)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("DNA language models")+
  scale_fill_manual(values=cols1a)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

melted <- melt(dat_nc[,c(2,4)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("context free")+
  scale_fill_manual(values=colsIDF)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

p1+p3+plot_layout(guides = 'collect')

melted <- melt(dat_hs[,c(2,5)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("DNA language models")+
  scale_fill_manual(values=cols1a)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

melted <- melt(dat_nc[,c(2,5)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("context free")+
  scale_fill_manual(values=colsIDF)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

p1+p3+plot_layout(guides = 'collect')

Figure 6H&I

dat <- read.table("data/CTCF.tsv", header=TRUE)

dat$model <- factor(dat$model, levels=dat$model[c(6,5,2:4,7,1,8:13)])

dat_hs <- dat[c(1:7),]
dat_nc <- dat[c(8:13),]

melted <- melt(dat_hs[,c(2,6)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("DNA language models")+
  scale_fill_manual(values=cols1)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

melted <- melt(dat_nc[,c(2,6)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("context free")+
  scale_fill_manual(values=colsIDF)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

p1+p3+plot_layout(guides = 'collect')

Figure S5

melted <- melt(dat_hs[,c(2,3)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("DNA language models")+
  scale_fill_manual(values=cols1)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

melted <- melt(dat_nc[,c(2,3)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("context free")+
  scale_fill_manual(values=colsIDF)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

p1+p3+plot_layout(guides = 'collect')

melted <- melt(dat_hs[,c(2,4)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("DNA language models")+
  scale_fill_manual(values=cols1)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

melted <- melt(dat_nc[,c(2,4)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("context free")+
  scale_fill_manual(values=colsIDF)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

p1+p3+plot_layout(guides = 'collect')

melted <- melt(dat_hs[,c(2,5)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("DNA language models")+
  scale_fill_manual(values=cols1)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

melted <- melt(dat_nc[,c(2,5)])

## Using model as id variables

colnames(melted) <- c("Model","task","value")

p3 <- ggplot(melted, aes(x=Model, y=value*100,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("context free")+
  scale_fill_manual(values=colsIDF)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

p1+p3+plot_layout(guides = 'collect')

Figure 6J

dat <- read.csv("data/InstaDeep/matthews_correlation.csv")
dat$Model <- factor(dat$Model, levels=dat$Model)

dat_hs <- dat[c(2,4,5,9),]
dat_ms <- dat[c(1,3,6,7,8),]
dat_nc <- dat[c(10:15),]

melted <- melt(dat_hs)

## Using Model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("human")+
  scale_fill_manual(values=human_cols2)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

melted <- melt(dat_ms)

## Using Model as id variables

colnames(melted) <- c("Model","task","value")

p2<- ggplot(melted, aes(x=task, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("multi species")+
  scale_fill_manual(values=human_mscols2)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

melted <- melt(dat_nc)

## Using Model as id variables

colnames(melted) <- c("Model","task","value")

p3 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("context free")+
  scale_fill_manual(values=colsIDF)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle = 90,hjust = 1))+
  NULL

p1+p2+p3+plot_layout(guides = 'collect')

Figure S6

dat <- read.csv("data/GUE/matthews_correlation.csv")

dat$Model <- factor(dat$Model, levels=dat$Model)

dat_hs <- dat[c(1,2,7),]
dat_ms <- dat[c(3:6),]
dat_nc <- dat[c(8:13),]


dat_tfs <- dat_hs[,1:6]
dat_tfs_ms <- dat_ms[,1:6]
dat_tfs_nc <- dat_nc[,1:6]

dat_cp <- dat_hs[,c(1,7:9)]
dat_cp_ms <- dat_ms[,c(1,7:9)]
dat_cp_nc <- dat_nc[,c(1,7:9)]

dat_p300 <- dat_hs[,c(1,10:12)]
dat_p300_ms <- dat_ms[,c(1,10:12)]
dat_p300_nc <- dat_nc[,c(1,10:12)]

dat_sp <- dat_hs[,c(1,13)]
dat_sp_ms <- dat_ms[,c(1,13)]
dat_sp_nc <- dat_nc[,c(1,13)]


## TFs
melted <- melt(dat_tfs)

## Using Model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,80))+
  scale_fill_manual(values=human_ms_cols[1:3])+
  ggtitle("human")+
  theme_cowplot()+
  NULL

melted <- melt(dat_tfs_ms)

## Using Model as id variables

colnames(melted) <- c("Model","task","value")

p2<- ggplot(melted, aes(x=task, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,80))+
  ggtitle("multi species")+
  scale_fill_manual(values=human_ms_cols[4:7])+
  theme_cowplot()+
  NULL

melted <- melt(dat_tfs_nc)

## Using Model as id variables

colnames(melted) <- c("Model","task","value")

p3 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,80))+
  scale_fill_manual(values=colsIDF)+
  ggtitle("context free")+
  theme_cowplot()+
  NULL

p1+p2+p3+plot_layout(guides = 'collect')

## Core promoter

melted <- melt(dat_cp)

## Using Model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,80))+
  ggtitle("human")+
  scale_fill_manual(values=human_ms_cols[1:3])+
  theme_cowplot()+
  NULL

melted <- melt(dat_cp_ms)

## Using Model as id variables

colnames(melted) <- c("Model","task","value")

p2<- ggplot(melted, aes(x=task, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,80))+
  ggtitle("multi species")+
  scale_fill_manual(values=human_ms_cols[4:7])+
  theme_cowplot()+
  NULL

melted <- melt(dat_cp_nc)

## Using Model as id variables

colnames(melted) <- c("Model","task","value")

p3 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,80))+
  ggtitle("context free")+
  scale_fill_manual(values=colsIDF)+
  theme_cowplot()+
  NULL

p1+p2+p3+plot_layout(guides = 'collect')

## p300

melted <- melt(dat_p300)

## Using Model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("human")+
  scale_fill_manual(values=human_ms_cols[1:3])+
  theme_cowplot()+
  NULL

melted <- melt(dat_p300_ms)

## Using Model as id variables

colnames(melted) <- c("Model","task","value")

p2<- ggplot(melted, aes(x=task, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("multi species")+
  scale_fill_manual(values=human_ms_cols[4:7])+
  theme_cowplot()+
  NULL

melted <- melt(dat_p300_nc)

## Using Model as id variables

colnames(melted) <- c("Model","task","value")

p3 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("context free")+
  scale_fill_manual(values=colsIDF)+
  theme_cowplot()+
  NULL

p1+p2+p3+plot_layout(guides = 'collect')

## sp

melted <- melt(dat_sp)

## Using Model as id variables

colnames(melted) <- c("Model","task","value")

p1 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("human")+
  scale_fill_manual(values=human_ms_cols[1:3])+
  theme_cowplot()+
  NULL

melted <- melt(dat_sp_ms)

## Using Model as id variables

colnames(melted) <- c("Model","task","value")

p2<- ggplot(melted, aes(x=task, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("multi species")+
  scale_fill_manual(values=human_ms_cols[4:7])+
  theme_cowplot()+
  NULL

melted <- melt(dat_sp_nc)

## Using Model as id variables

colnames(melted) <- c("Model","task","value")

p3 <- ggplot(melted, aes(x=task, y=value,fill=Model))+
  geom_bar(stat="identity", position="dodge")+
  scale_y_continuous(limits=c(0,100))+
  ggtitle("context free")+
  scale_fill_manual(values=colsIDF)+
  theme_cowplot()+
  NULL

p1+p2+p3+plot_layout(guides = 'collect')

Session Info

sessionInfo()

## R version 4.3.1 (2023-06-16)
## Platform: x86_64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Berlin
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] clusterProfiler_4.8.3                  
##  [2] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
##  [3] GenomicFeatures_1.52.2                 
##  [4] AnnotationDbi_1.62.2                   
##  [5] Biobase_2.60.0                         
##  [6] BSgenome.Hsapiens.UCSC.hg19_1.4.3      
##  [7] BSgenome_1.68.0                        
##  [8] rtracklayer_1.60.1                     
##  [9] Biostrings_2.68.1                      
## [10] XVector_0.40.0                         
## [11] GenomicRanges_1.52.1                   
## [12] GenomeInfoDb_1.36.4                    
## [13] IRanges_2.34.1                         
## [14] S4Vectors_0.38.2                       
## [15] BiocGenerics_0.46.0                    
## [16] patchwork_1.2.0                        
## [17] wordcloud2_0.2.1                       
## [18] lsa_0.73.3                             
## [19] SnowballC_0.7.1                        
## [20] umap_0.2.10.0                          
## [21] ggrepel_0.9.5                          
## [22] ggfortify_0.4.17                       
## [23] migest_2.0.4                           
## [24] viridis_0.6.5                          
## [25] viridisLite_0.4.2                      
## [26] pheatmap_1.0.12                        
## [27] gridExtra_2.3                          
## [28] cowplot_1.1.3                          
## [29] reshape2_1.4.4                         
## [30] RColorBrewer_1.1-3                     
## [31] colorspace_2.1-0                       
## [32] pals_1.8                               
## [33] lubridate_1.9.3                        
## [34] forcats_1.0.0                          
## [35] stringr_1.5.1                          
## [36] dplyr_1.1.4                            
## [37] purrr_1.0.2                            
## [38] readr_2.1.5                            
## [39] tidyr_1.3.1                            
## [40] tibble_3.2.1                           
## [41] tidyverse_2.0.0                        
## [42] ggpubr_0.6.0                           
## [43] ggplot2_3.5.1                          
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.3.1               BiocIO_1.10.0              
##   [3] ggplotify_0.1.2             bitops_1.0-7               
##   [5] filelock_1.0.3              polyclip_1.10-6            
##   [7] XML_3.99-0.16.1             lifecycle_1.0.4            
##   [9] rstatix_0.7.2               vroom_1.6.5                
##  [11] MASS_7.3-60.0.1             lattice_0.22-6             
##  [13] backports_1.4.1             magrittr_2.0.3             
##  [15] sass_0.4.9                  rmarkdown_2.27             
##  [17] jquerylib_0.1.4             yaml_2.3.8                 
##  [19] askpass_1.2.0               reticulate_1.36.1          
##  [21] mapproj_1.2.11              DBI_1.2.2                  
##  [23] maps_3.4.2                  abind_1.4-5                
##  [25] zlibbioc_1.46.0             ggraph_2.2.1               
##  [27] RCurl_1.98-1.14             yulab.utils_0.1.4          
##  [29] tweenr_2.0.3                rappdirs_0.3.3             
##  [31] GenomeInfoDbData_1.2.10     enrichplot_1.20.3          
##  [33] tidytree_0.4.6              RSpectra_0.16-1            
##  [35] codetools_0.2-20            DelayedArray_0.26.7        
##  [37] ggforce_0.4.2               DOSE_3.26.2                
##  [39] xml2_1.3.6                  tidyselect_1.2.1           
##  [41] aplot_0.2.2                 farver_2.1.2               
##  [43] matrixStats_1.3.0           BiocFileCache_2.8.0        
##  [45] GenomicAlignments_1.36.0    jsonlite_1.8.8             
##  [47] tidygraph_1.3.1             systemfonts_1.1.0          
##  [49] tools_4.3.1                 progress_1.2.3             
##  [51] ragg_1.3.2                  treeio_1.24.3              
##  [53] Rcpp_1.0.12                 glue_1.7.0                 
##  [55] mgcv_1.9-1                  xfun_0.44                  
##  [57] qvalue_2.32.0               MatrixGenerics_1.12.3      
##  [59] withr_3.0.0                 fastmap_1.2.0              
##  [61] fansi_1.0.6                 openssl_2.2.0              
##  [63] digest_0.6.35               gridGraphics_0.5-1         
##  [65] timechange_0.3.0            R6_2.5.1                   
##  [67] textshaping_0.3.7           GO.db_3.17.0               
##  [69] dichromat_2.0-0.1           biomaRt_2.56.1             
##  [71] RSQLite_2.3.6               utf8_1.2.4                 
##  [73] generics_0.1.3              data.table_1.15.4          
##  [75] graphlayouts_1.1.1          prettyunits_1.2.0          
##  [77] httr_1.4.7                  htmlwidgets_1.6.4          
##  [79] S4Arrays_1.0.6              scatterpie_0.2.2           
##  [81] pkgconfig_2.0.3             gtable_0.3.5               
##  [83] blob_1.2.4                  shadowtext_0.1.3           
##  [85] htmltools_0.5.8.1           carData_3.0-5              
##  [87] fgsea_1.26.0                scales_1.3.0               
##  [89] png_0.1-8                   ggfun_0.1.4                
##  [91] knitr_1.45                  rstudioapi_0.16.0          
##  [93] tzdb_0.4.0                  rjson_0.2.21               
##  [95] nlme_3.1-164                curl_5.2.1                 
##  [97] cachem_1.1.0                parallel_4.3.1             
##  [99] HDO.db_0.99.1               restfulr_0.0.15            
## [101] pillar_1.9.0                vctrs_0.6.5                
## [103] car_3.1-2                   dbplyr_2.5.0               
## [105] evaluate_0.23               cli_3.6.2                  
## [107] compiler_4.3.1              Rsamtools_2.16.0           
## [109] rlang_1.1.3                 crayon_1.5.2               
## [111] ggsignif_0.6.4              labeling_0.4.3             
## [113] fs_1.6.4                    plyr_1.8.9                 
## [115] stringi_1.8.4               BiocParallel_1.34.2        
## [117] munsell_0.5.1               lazyeval_0.2.2             
## [119] GOSemSim_2.26.1             Matrix_1.6-5               
## [121] hms_1.1.3                   bit64_4.0.5                
## [123] KEGGREST_1.40.1             highr_0.10                 
## [125] SummarizedExperiment_1.30.2 igraph_2.0.3               
## [127] broom_1.0.6                 memoise_2.0.1              
## [129] bslib_0.7.0                 ggtree_3.8.2               
## [131] fastmatch_1.1-4             bit_4.0.5                  
## [133] downloader_0.4              gson_0.1.0                 
## [135] ape_5.8