The human genome’s vocabulary as proposed by the DNA language model GROVER

For issues and questions, please get in touch via arpoetsch@gmail.com The preprint that is associated with this script can be found at: https://www.biorxiv.org/content/10.1101/2023.07.19.549677v1
A tutorial on how to use GROVER as a foundation model can be found at: https://doi.org/10.5281/zenodo.8373159 The pretrained model can be found at: https://doi.org/10.5281/zenodo.8373117 The data for the tokenised genome are at: https://doi.org/10.5281/zenodo.8373053

Introduction

The models and performance measurements were done in python and the code can be found in the corresponding document. For visualisation, data were imported into R. We provide the outputs that we used for visualisation directly to make things easier for reproduction. Some figures were recoloured and/or combined with Adobe Illustrator, so some figures may look a bit different from the publication.

Figure 1

DNA byte-pair tokenisation and model architecture

Tokenisation was performed in Python. Visualisations are done in R with token clowds. The pickle file is converted into .rdat with

library(reticulate)
source_python(“src/pickle_reader.py”)
dat <- read_pickle_file(“data/all_vocabs_wg.pkl”)

# load tokens
load("data/all_vocabs_wg.rdat")

# make word clouds for cycle 1,2,3,23,38,121,356
c1 <- dat[[1]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)

c1 <- dat[[2]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)

c1 <- dat[[3]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)

c1 <- dat[[4]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)

c1 <- dat[[24]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)

c1 <- dat[[39]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)

c1 <- dat[[122]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)

c1 <- dat[[357]]
wc <- data.frame(word=names(c1),freq=unlist(c1))
colvec <- mycolors[nchar(wc$word)]
names(colvec) <- NULL
wordcloud2(data=wc, fontFamily = "Courier New", color=colvec)

Figure 2

Performance based selection of the vocabulary identifies 600 cycles of Byte-Pair Tokenization as optimal.

Performance metrices are imported from python.

Figure 2A

perf <- read.csv("data/performance.csv", header = TRUE)
colnames(perf) <- c("2-mers", "3-mers","4-mers","5-mers","6-mers")
rownames(perf) <- c(100*c(1:10),1000*c(2:5)) 

perf$cycle <- rownames(perf)
perf$cycle <- factor(perf$cycle,levels=perf$cycle)
melted <- melt(perf, id="cycle")
colnames(melted)<-c("cycle","kmer","accuracy") 

ggplot(data=melted, aes(x=cycle,y=accuracy,colour=kmer, fill=kmer, group=kmer))+
  geom_point(size=3)+
  geom_smooth()+
  geom_segment(aes(x=10.5,xend=10.5,y=accuracy,yend=accuracy+0.5), linewidth=5, colour="white")+
  facet_grid(facet="kmer",rows=5, scales="free")+
  scale_colour_manual(values=viridis(n=5))+
  scale_fill_manual(values=viridis(n=5))+
  theme(strip.background = element_rect(fill = "white"))+
  theme(axis.text.x = element_text(angle = 90))+
  theme(legend.position="none")+
  NULL

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Figure 2B

acc <- read.csv("data/Models_Accuracy_kmers.csv")

acc$Model <- factor(acc$Model, levels=c("GROVER","4mer","5mer","6mer"))

ggplot(acc, aes(x=Model, y=Accuracy, fill=Model))+
  geom_bar(stat="identity")+
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")+
  scale_fill_manual(values=modelcols[c(1,2,4,6)])+
  scale_y_continuous(name= "accuracy [%]")+
  facet_wrap(~Prediction, scales="free_y", ncol=1)+
  NULL

Figure 2C

acc <- read.csv("data/top_k_accuracy.csv")
ggplot(acc[1:60,], aes(x=k,y=accuracy))+
  geom_bar(stat="identity")+
  NULL

Figure 2D

perp <- read.csv("data/perplexity_GROVER_chr21.csv")
perp4 <- read.csv("data/perplexity_4mer_chr21.csv")
perp5 <- read.csv("data/perplexity_5mer_chr21.csv")
perp6 <- read.csv("data/perplexity_6mer_chr21.csv")

perp$performance <- perp$performance /601
perp4$performance <- perp4$performance /(4^4)
perp5$performance <- perp5$performance /(4^5)
perp6$performance <- perp6$performance /(4^6)

datf <- data.frame(model=c("GROVER","4mer","5mer","6mer"),
              perplexity =c(min(perp$performance),min(perp4$performance),min(perp5$performance),min(perp6$performance)))

datf$model <- factor(datf$model, levels=datf$model)

ggplot(datf, aes(x=model,y=perplexity, fill=model))+
  geom_bar(stat="identity")+
  scale_fill_manual(values=modelcols[c(1,2,4,6)])+
  NULL

Data preparation for characterisation of vocabulary

load("data/all_vocabs_wg.rdat")
vocab600 <- dat[["600"]]
vocab600 <- t(data.frame(vocab600))
colnames(vocab600)<- "frequency"
vocab600 <- data.frame(vocab600)
vocab600$token <- rownames(vocab600)

Hseq <- getSeq(Hsapiens)
Hseq <- Hseq[names(Hseq) %in% paste0("chr",c(1:22,"X","Y"))]

kmers4 <- colSums(oligonucleotideFrequency(Hseq, 4))
kmers5 <- colSums(oligonucleotideFrequency(Hseq, 5))
kmers6 <- colSums(oligonucleotideFrequency(Hseq, 6))

kmers4 <- data.frame(kmers4)
kmers4$token <- rownames(kmers4)
colnames(kmers4)<- c("frequency","token")
kmers5 <- data.frame(kmers5)
kmers5$token <- rownames(kmers5)
colnames(kmers5)<- c("frequency","token")
kmers6 <- data.frame(kmers6)
kmers6$token <- rownames(kmers6)
colnames(kmers6)<- c("frequency","token")

Load embeddings

Embeddings for the GROVER vocabulary from the BERT infrastructure and Word2Vec, UMAP and PCA.

#GROVER
Gemb <- get(load("data/vocab_embedding.rdat"))
rownames(Gemb) <- read.table("data/vocab.txt")[,1]

#W2V
GWemb <-  get(load("data/iter600_w2v.rdat"))
rownames(GWemb) <-  read.table("data/iter600_w2v_vocab.txt")[,1]

Gemb <- Gemb[-c(1:7,9),]
Gum <- umap(Gemb)
Gpca <- prcomp(Gemb)
GWum <- umap(GWemb)
GWpca <- prcomp(GWemb)

Collection of data in dataframes

df <- data.frame(Gum$layout)
df$token <- rownames(df)
colnames(df)<- c("G_UMAP1","G_UMAP2","token")

dfGW <- data.frame(GWum$layout)
dfGW$token <- rownames(dfGW)
colnames(dfGW) <- c("G_W2V_UMAP1","G_W2V_UMAP2","token")
df <- merge(df,dfGW,id="token")

df2 <- data.frame(Gpca$x)[,1:6]
df2$token <- rownames(df2)
colnames(df2) <- c("GROVER_PC1","GROVER_PC2","GROVER_PC3","GROVER_PC4","GROVER_PC5","GROVER_PC6","token")
df <- merge(df,df2,id="token")

df2 <- data.frame(GWpca$x)[,1:6]
df2$token <- rownames(df2)
colnames(df2) <- c("GROVER_W2V_PC1","GROVER_W2V_PC2","GROVER_W2V_PC3","GROVER_W2V_PC4","GROVER_W2V_PC5","GROVER_W2V_PC6","token")
df <- merge(df,df2,id="token")
df <- merge(df,vocab600, id="token")
df$length <- factor(nchar(as.character(df$token)))

df$C <- sapply(as.character(df$token),str_count, pattern=c("C"))/nchar(as.character(df$token))
df$G <- sapply(as.character(df$token),str_count, pattern=c("G"))/nchar(as.character(df$token))
df$A <- sapply(as.character(df$token),str_count, pattern=c("A"))/nchar(as.character(df$token))
df$T <- sapply(as.character(df$token),str_count, pattern=c("T"))/nchar(as.character(df$token))
df$GC <- df$C + df$G
df$AG <- df$A + df$G
df$AC <- df$A + df$C

hex=jet(nrow(df)) #colour selection
df$W2V_hex <- hex[rank(df$GROVER_W2V_PC1)]
w2v_tokencol <- df[,c("token","W2V_hex","GROVER_W2V_PC1")]

Figure 3

The frequency balanced GROVER vocabulary shows differential learning performance by token length

Figure 3A

Compare frequency distribution between vocabularies

freqdf <- rbind(data.frame(model="GROVER",frequency=df$frequency),
                data.frame(model= "4mer",frequency=kmers4$frequency/4),
                 data.frame(model= "5mer",frequency=kmers5$frequency/5),
                 data.frame(model= "6mer",frequency=kmers6$frequency/6)
                )

freqdf$model <- factor(freqdf$model,levels=c("GROVER","4mer","5mer","6mer"))

ggplot(freqdf, aes(x = frequency, fill = model)) +
  geom_density() +  # Boxplot without outliers
  scale_x_log10() +  # Logarithmic y-axis scale
  labs(x = "frequency", y = "density") +  # Axis labels
  scale_fill_manual(values=modelcols[c(1,2,4,6)])+
  facet_grid(facet="model", rows=4)+
  theme(strip.background = element_rect(fill = "white"))+
  NULL

Figure 3B

dftmp <- aggregate(frequency~length, df[,c("frequency","length")],sum)
dftmp$length <- as.numeric(as.character(dftmp$length))
levels(dftmp$length) <- 1:16
dftmp <- rbind(dftmp,data.frame(length=c(13:15),frequency=0))

ggplot(dftmp, aes(x=length,y=frequency, fill=length))+
  geom_bar(stat="identity")+
  scale_fill_manual(values=mycolors)+
  scale_x_discrete("token length")+
  scale_y_continuous("frequency")+
  theme_cowplot()+
  NULL

Figure 3C

tab <- table(df$length)
tab["13"] <- 0
tab["14"] <- 0
tab["15"] <- 0

dfvoc <- data.frame(t(tab[c(1:12,14:16,13)]))[,2:3]
colnames(dfvoc) <- c("length","vocabulary")
dfvoc$frequency <- 4^c(1:16)
dfvoc$proportion <- dfvoc$vocabulary/dfvoc$frequency
#dfvoc$length<- factor(as.character(dfvoc$length),levels(dfvoc$length))


ggplot(dfvoc, aes(y=vocabulary,x=frequency, colour=length))+
  geom_point(size=5)+
  scale_colour_manual(values=mycolors)+
  scale_x_log10("k-mer frequency")+
  scale_y_log10("k-mers represented")+
  theme_cowplot()+
  NULL

## Warning: Transformation introduced infinite values in continuous y-axis

Figure 3D

ggplot(dfvoc, aes(x=length,y=proportion, fill=length))+
  geom_bar(stat="identity")+
  scale_fill_manual(values=mycolors)+
  scale_x_discrete("token length")+
  scale_y_continuous("proportion of k-mers represented")+
  theme_cowplot()+
  NULL

Figure 3E

repres_seq <- c(
sum(str_count(df$token, "A")),
sum(str_count(df$token, "C")),
sum(str_count(df$token, "G")),
sum(str_count(df$token, "T")))

genome_freq <- colSums(oligonucleotideFrequency(Hseq, 1))

first_letter_seq <- c(
sum(str_count(substr(df$token,1,1), "A")),
sum(str_count(substr(df$token,1,1), "C")),
sum(str_count(substr(df$token,1,1), "G")),
sum(str_count(substr(df$token,1,1), "T")))

freq_df <- data.frame(genome_frequency=genome_freq/sum(genome_freq),
                      representation = repres_seq/sum( repres_seq),
                      first_letter=first_letter_seq/sum(first_letter_seq))
freq_df$nucl <- factor(c("A","C","G","T"), levels=c("A","C","G","T"))


p1 <- ggplot(freq_df, aes(x = "", y = genome_frequency, fill = nucl)) +
  geom_bar(stat = "identity", width = 1) +
  scale_fill_manual(values=nucl_colours)+
  coord_polar("y", start = 0, direction = -1) +
  geom_text(aes(label = paste0(nucl,"\n",round(genome_frequency * 100, 2), "%")),
            position = position_stack(vjust = 0.5))+
  theme_void() +
  labs(fill = "Genome Frequency")+
  NULL

p2 <- ggplot(freq_df, aes(x = "", y = representation, fill = nucl)) +
  geom_bar(stat = "identity", width = 1) +
  scale_fill_manual(values=nucl_colours)+
  coord_polar("y", start = 0, direction = -1) +
  geom_text(aes(label = paste0(nucl,"\n",round(representation * 100, 2), "%")),
            position = position_stack(vjust = 0.5))+
  theme_void() +
  labs(fill = "Representation")+
  NULL

p3 <- ggplot(freq_df, aes(x = "", y = first_letter, fill = nucl)) +
  geom_bar(stat = "identity", width = 1) +
  scale_fill_manual(values=nucl_colours)+
  coord_polar("y", start = 0, direction = -1) +
  geom_text(aes(label = paste0(nucl,"\n",round(first_letter * 100, 2), "%")),
            position = position_stack(vjust = 0.5))+
  theme_void() +
  labs(fill = "First Letter")+
  NULL


p1+p2+p3

Figure 3F

tokmetr <- read.csv("data/metrics_per_token.csv")
tokdf <- cbind(tokmetr[c(8,10:609),2:4])
colnames(tokdf) <- c("token","GROVER_AUC","GROVER_Acc")

df <- merge(df, tokdf, id="token")

ggplot(df, aes(y=GROVER_AUC-0.5,x=factor(length), fill=factor(length)))+
         geom_boxplot() +
  scale_fill_manual(values=mycolors)+
  NULL

Figure 3G

ggplot(df, aes(y=GROVER_Acc,x=factor(length), fill=factor(length)))+
         geom_boxplot() +
  scale_fill_manual(values=mycolors)+
  NULL

Figure 4

UMAPs and PCAs of the average embeddings of the vocabulary. Putting the Principal Components in relation to genome biology requires integration with additional data.

Figure 4A

Geigs <- Gpca$sdev^2
Gvarvec <- Geigs/sum(Geigs)
GWeigs <- GWpca$sdev^2
GWvarvec <- GWeigs/sum(GWeigs)

MEV <- c(GROVER=Gvarvec[1],GROVER_W2V=GWvarvec[1])
MEV <- data.frame(MEV)
MEV$model=rownames(MEV)
MEV$model <- factor(MEV$model,levels=MEV$model) 

ggplot(MEV[1:2,],aes(x=model,y=100*MEV))+
    geom_bar(stat="identity", fill="slateblue")+
    scale_y_continuous("Maximum Explained Variance [%]")+
    theme_cowplot()+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
    NULL

Figure 4B

Plotted are both the PCA and also the colour bar for the legend.

ggplot(df, aes(x=GROVER_W2V_PC1, y=GROVER_W2V_PC2, colour=token))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  scale_x_continuous(paste0("Word2Vec PC1 (",round(100*GWvarvec[1],2),"%)"))+
  scale_y_continuous(paste0("Word2Vec PC2 (",round(100*GWvarvec[2],2),"%)"))+
  #geom_label_repel(aes(label = token ))+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

image(matrix(1:601, nrow = 1), col = hex,  axes = FALSE, xlab = "", ylab = "")

Figure 4C

ggplot(df, aes(x=GROVER_W2V_PC1, y=GC, colour=token))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  scale_x_continuous(paste0("Word2Vec PC1 (",round(100*GWvarvec[1],2),"%)"))+
  scale_y_continuous(paste0("GC content"))+
  #geom_label_repel(aes(label = token ))+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

cor.test(df$GROVER_W2V_PC1,df$GC, method="spearman")

## Warning in cor.test.default(df$GROVER_W2V_PC1, df$GC, method = "spearman"):
## Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  df$GROVER_W2V_PC1 and df$GC
## S = 69672108, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.9256972

Figure 4D

ggplot(df, aes(G_W2V_UMAP1, y=G_W2V_UMAP2, colour=token))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

Figure 4E

ggplot(df, aes(x=GROVER_PC1, y=GROVER_PC2, colour=token))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  scale_x_continuous(name=paste("PC1 (",round(100*Gvarvec[1],2),"%)",sep=""))+
  scale_y_continuous(name=paste("PC2 (",round(100*Gvarvec[2],2),"%)",sep=""))+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

Figure 4F

ggplot(df, aes(x=G_UMAP1, y=G_UMAP2, colour=token))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

Annotation of Tokens

For Figure 4G and later figures, the proportion of tokens associated with features of genome biology is calculated.

Gene element annotation

dummydf <- data.frame(Feature= sort(c("Promoter","5' UTR","3' UTR","1st Exon","Other Exon","1st Intron","Other Intron","Downstream (<=300)","Distal Intergenic")))


if(!file.exists("objects/230413_gene_annotation.rdat")){

annotdf <- data.frame(
row.names=dummydf$Feature)

for (i in c(1:length(df$token))){
  tok <- df$token[i]
  cat(tok,"\n")
  bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
  if(length(bed_ss)>50000){
    bed_ss <- bed_ss[sample(length(bed_ss),50000)]
  }
  seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
  seqinfo(bed_ss)<- seqinfo(Hsapiens)
  start(bed_ss) <- start(bed_ss)+1 #readjust start
  bed_ss <- sort(bed_ss)
  names(bed_ss)<- 1:length(bed_ss)
  
  peakAnno <- annotatePeak(bed_ss, tssRegion=c(-1000, 1000),
                         TxDb=txdb, annoDb="org.Hs.eg.db")
  tmp <- merge(dummydf,peakAnno@annoStat,by="Feature",all=TRUE)
  annotdf <- cbind(annotdf,tmp[,2])
}

colnames(annotdf) <- df$token

annotdf$feature <- factor(rownames(annotdf),levels=rev(c("Promoter","5' UTR","3' UTR","1st Exon","Other Exon","1st Intron","Other Intron","Downstream (<=300)","Distal Intergenic")))
annotdf[is.na(annotdf)]<- 0
melted <- melt(annotdf)

save(annotdf,file="objects/230413_gene_annotation.rdat")
}

Annotation to genes with orientation

genes <- genes(txdb)

##   403 genes were dropped because they have exons located on both strands
##   of the same reference sequence or on more than one reference sequence,
##   so cannot be represented by a single genomic range.
##   Use 'single.strand.genes.only=FALSE' to get all the genes in a
##   GRangesList object, or use suppressMessages() to suppress this message.

genes_p <- genes[strand(genes) == "+"]
genes_m <- genes[strand(genes) == "-"]
strand(genes_m) <- "+"

genedf <- data.frame(
row.names=c("gene_p","gene_m","intragenic"))
genedf$feature <- c("gene_p","gene_m","intragenic")

if(!file.exists("objects/230413_gene_orientation_annotation.rdat")){
genedummy <- genedf

for (i in 1:length(df$token)){
  cat(i,"\n")
  tok <- df$token[i]
  cat(tok,"\n")
  bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
  if(length(bed_ss)>50000){
    bed_ss <- bed_ss[sample(length(bed_ss),50000)]
  }
  seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
  seqinfo(bed_ss)<- seqinfo(Hsapiens)
  start(bed_ss) <- start(bed_ss)+1 #readjust start
  bed_ss <- sort(bed_ss)
  bed_ss_rev <- bed_ss
  names(bed_ss)<- 1:length(bed_ss)
  peakAnno_p <- bed_ss[bed_ss %over% genes_p]
  peakAnno_m <- bed_ss[bed_ss %over% genes_m]
  peakAnno_o <- bed_ss[!bed_ss %over% c(peakAnno_p,peakAnno_m)]
  
  genedf <- cbind(genedf,c(length(peakAnno_p),length(peakAnno_m),length(peakAnno_o)))
}
colnames(genedf) <- c("class",df$token)

genedf[,2:602] <- genedf[,2:602]/colSums(genedf[,2:602])

genedf2 <- genedf[,c("class",df$token)]

save(genedf2,file="230413_gene_orientation_annotation.rdat")
}

Annotation to coding sequence with orientation

genes <- cds(txdb)
genes_p <- genes[strand(genes) == "+"]
genes_m <- genes[strand(genes) == "-"]
strand(genes_m) <- "+"

genedf <- data.frame(
row.names=c("cds_p","cds_m","other"))
genedf$feature <- c("cds_p","cds_m","other")

if(!file.exists("objects/230413_cds_orientation_annotation.rdat")){

for (i in 1:length(df$token)){
  cat(i,"\n")
  tok <- df$token[i]
  cat(tok,"\n")
  bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
  if(length(bed_ss)>50000){
    bed_ss <- bed_ss[sample(length(bed_ss),50000)]
  }
  seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
  seqinfo(bed_ss)<- seqinfo(Hsapiens)
  start(bed_ss) <- start(bed_ss)+1 #readjust start
  bed_ss <- sort(bed_ss)
  bed_ss_rev <- bed_ss
  names(bed_ss)<- 1:length(bed_ss)
  peakAnno_p <- bed_ss[bed_ss %over% genes_p]
  peakAnno_m <- bed_ss[bed_ss %over% genes_m]
  peakAnno_o <- bed_ss[!bed_ss %over% c(peakAnno_p,peakAnno_m)]
  
  genedf <- cbind(genedf,c(length(peakAnno_p),length(peakAnno_m),length(peakAnno_o)))
}
colnames(genedf) <- c(df$token)
genedf <- cbind (class=rownames(genedf),genedf)
genedf[,2:602] <- genedf[,2:602]/colSums(genedf[,2:602])

genedf2 <- genedf[,c("class",df$token)]

save(genedf2,file="objects/230413_cds_orientation_annotation.rdat")

}

Annotation to Chromatin Colours

HMM <- read.table("data/GM12878_ChromHMM.bed")
HMM <- HMM[,c(2,3,4,5,10)]
colnames(HMM) <- c("seqnames","start","end","ID","colour")
HMM <- GRanges(HMM)

HMMdf <- data.frame(
row.names=names(table(HMM$ID)))

HMMdf$ID <- names(table(HMM$ID))
HMMdf <- unique(merge(HMMdf,elementMetadata(HMM)[,c("ID","colour")], id="ID", all.x=TRUE, all.y=FALSE))
HMMdf$ID <- factor(HMMdf$ID, levels=sort(HMMdf$ID)[c(1,8:15,2:7)])
HMMcol <- HMMdf$colour[c(1,8:15,2:7)]
names(HMMcol)[c(1,8:15,2:7)]<- as.character(HMMdf$ID) 
HMMdummy <- HMMdf

if(!file.exists("objects/230413_HMM_annotation.rdat")){

for (i in c(1:length(df$token))){
  cat(i,"\n")
  tok <- df$token[i]
  cat(tok,"\n")
  bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
  if(length(bed_ss)>50000){
    bed_ss <- bed_ss[sample(length(bed_ss),50000)]
  }
  seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
  seqinfo(bed_ss)<- seqinfo(Hsapiens)
  start(bed_ss) <- start(bed_ss)+1 #readjust start
  bed_ss <- sort(bed_ss)
  names(bed_ss)<- 1:length(bed_ss)
  peakAnno <- mergeByOverlaps(bed_ss, HMM)
  
  tmp <- data.frame(table(peakAnno$ID)/length(bed_ss))
  
  tmp <- merge(HMMdummy,tmp,by.x="ID",by.y="Var1",all=TRUE)
  HMMdf <- cbind(HMMdf,tmp[,3])
}
colnames(HMMdf) <- c("ID","colour",df$token)

HHMdf <- HHMdf[c(1,8:15,2:7),]
HHMdf$ID <- factor(HHMdf$ID,levels=HHMdf$ID)
HHMdf$colour <-c("#FF0000","#FF6969","#CF0BC6", "#FACA00", "#FACA01", "#FFFC04", "#FFFC05", "#0ABEFE", "#00B050", "#00B051", "#99FF66","#7F7F7F", "#F5F5F5", "#F5F5F6", "#F5F5F7")

 save(HMMdf,file="objects/230413_HMM_annotation.rdat")
}

Annotation Repeats

if(!file.exists("objects/230413_rep_annotation.rdat")){
  
  repdb <- get(load("data/repdb_UCSC.Rdata"))
  repdf <- data.frame(
  row.names=names(table(repdb$repClass)))
  repdf$repClass <- names(table(repdb$repClass))
  
  repdfdummy <- repdf
  repdfdummy$repClass <- paste0(repdfdummy$repClass,"_p")
  repdfdummy2 <- repdf
  repdfdummy2$repClass <- paste0(repdfdummy2$repClass,"_m")
  repdfdummy <- rbind(repdfdummy,repdfdummy2)
  repdf <- repdfdummy
  
for (i in 1:length(df$token)){
  cat(i,"\n")
  tok <- df$token[i]
  cat(tok,"\n")
  bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
  if(length(bed_ss)>50000){
    bed_ss <- bed_ss[sample(length(bed_ss),50000)]
  }
  seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
  seqinfo(bed_ss)<- seqinfo(Hsapiens)
  start(bed_ss) <- start(bed_ss)+1 #readjust start
  bed_ss <- sort(bed_ss)
  bed_ss_rev <- bed_ss
  strand(bed_ss_rev) <- "-"
  names(bed_ss)<- 1:length(bed_ss)
  peakAnno_p <- mergeByOverlaps(bed_ss, repdb[which(strand(repdb)=="+")])
  peakAnno_m <- mergeByOverlaps(bed_ss_rev, repdb[which(strand(repdb)=="-")])
  
  tmp_p <- data.frame(table(peakAnno_p$repClass)/length(bed_ss))
  tmp_m <- data.frame(table(peakAnno_m$repClass)/length(bed_ss))
  tmp_p$Var1 <- paste0(tmp_p$Var,"_p")
  tmp_m$Var1 <- paste0(tmp_m$Var,"_m")
  
  tmp <- rbind(tmp_p,tmp_m)
  repdf <- cbind(repdf,tmp[,2])
}
colnames(repdf) <- c("repClass",df$token)

repdf2 <- rbind(c(colSums(repdf[c(17:21,13,1:2),-1])),
                c(colSums(repdf[21+c(17:21,13,1:2),-1])),
                c(colSums(repdf[c(15:16),-1])),
                c(colSums(repdf[21+c(15:16),-1])),
                c(colSums(repdf[c(14),-1])),
                c(colSums(repdf[21+c(14),-1])),
                c(colSums(repdf[c(12),-1])),
                c(colSums(repdf[21+c(12),-1])),
                c(colSums(repdf[c(11),-1])),
                c(colSums(repdf[21+c(11),-1])),
                c(colSums(repdf[c(6:7),-1])),
                c(colSums(repdf[21+c(6:7),-1])),
                c(colSums(repdf[c(5),-1])),
                c(colSums(repdf[21+c(5),-1])),
                c(colSums(repdf[c(4,3),-1])),
                c(colSums(repdf[21+c(4,3),-1]))
                  )

repdf2 <- data.frame(repClass=c("other_p","other_m","SINE_p","SINE_m","Simple_p","Simple_m","Satellite_p","Satellite_m","rRNA_p","rRNA_m","LTR_p","LTR_m","Low_complexity_p","Low_complexity_m","LINE_p","LINE_m"),repdf2)


colnames(repdf2) <- c("repClass",df$token)
save(repdf2,file="objects/230413_rep_annotation.rdat")
}

Annotation Replication timing and strand

if(!file.exists("objects/230413_rep_strand_annotation.rdat") &! file.exists("objects/230413_rep_timing_annotation.rdat")){

  repltime <- read.table("data/GSE131417_K562_reptime_segmentation_recolored.bed")
replstrand <- rtracklayer::import("data/OK-seq_K562_BR1.bam.rfd.bw")

seqlevels(replstrand, pruning.mode="coarse") <- paste0("chr",c(1:22,"X","Y"))
seqinfo(replstrand)<- seqinfo(Hsapiens)

colnames(repltime)[1:4] <- c("seqnames","start","end","timing")
repltime <- GRanges(repltime)
seqlevels(repltime, pruning.mode="coarse") <- paste0("chr",c(1:22,"X","Y"))
seqinfo(repltime)<- seqinfo(Hsapiens)
early <- repltime[repltime$timing == "early"]
mid_early <- repltime[repltime$timing == "mid-early"]
mid_late <- repltime[repltime$timing == "mid-late"]
late <- repltime[repltime$timing == "late"]

plusrepl <- reduce(replstrand[replstrand$score>0])
minusrepl <- reduce(replstrand[replstrand$score<0])
neut<- reduce(replstrand[replstrand$score==0])

genedf <- data.frame(
row.names=c("early","mid_early","mid_late","late"))
genedf$feature <- c("early","mid_early","mid_late","late")

stranddf <- data.frame(
row.names=c("plus","minus","neutral"))
stranddf$feature <- c("plus","minus","neutral")
  
for (i in 1:length(df$token)){
  cat(i,"\n")
  tok <- df$token[i]
  cat(tok,"\n")
  bed_ss <- get(load(paste0("objects/GR_by_token/GROVER600_",tok,".rdat")))
  seqlevels(bed_ss) <- paste0("chr",c(1:22,"X","Y"))
  seqinfo(bed_ss)<- seqinfo(Hsapiens)
  if(length(bed_ss)>50000){
    bed_ss <- bed_ss[sample(length(bed_ss),50000)]
  }
  
  start(bed_ss) <- start(bed_ss)+1 #readjust start
  bed_ss <- sort(bed_ss)
  names(bed_ss)<- 1:length(bed_ss)
  peakAnno_early <- bed_ss[bed_ss %over% early]
  peakAnno_mid_early <- bed_ss[bed_ss %over% mid_early]
  peakAnno_mid_late <- bed_ss[bed_ss %over% mid_late]
  peakAnno_late <- bed_ss[bed_ss %over% late]
  peakAnno_plus <- bed_ss[bed_ss %over% plusrepl]
  peakAnno_minus <- bed_ss[bed_ss %over% minusrepl]
  peakAnno_neut <- bed_ss[bed_ss %over% neut]
  
  genedf <- cbind(genedf,c(length(peakAnno_early),length(peakAnno_mid_early),length(peakAnno_mid_late),length(peakAnno_late)))
  stranddf <- cbind(stranddf,c(length(peakAnno_plus),length(peakAnno_minus),length(peakAnno_neut)))
}
colnames(genedf) <- c("class",df$token)
colnames(stranddf) <- c("class",df$token)

genedf[,2:602] <- genedf[,2:602]/colSums(genedf[,2:602])
stranddf[,2:602] <- t(t(stranddf[,2:602])/colSums(stranddf[,2:602]))

genedf2 <- genedf[,c("class",df$token[Ghm1])]
genedf2$class <- factor(genedf2$class,levels=genedf2$class)
stranddf2 <- stranddf[,c("class",df$token[Ghm1])]

save(genedf2,file="objects/230413_rep_timing_annotation.rdat")
save(stranddf2,file="objects/230413_rep_strand_annotation.rdat")
}

Integration of annotation into df

annot <- get(load("objects/230413_gene_annotation.rdat"))
annot <- data.frame(t(annot[,-602]))
annot$token <- rownames(annot)
df <- merge(df,annot, by="token")

annot <- get(load("objects/230413_gene_orientation_annotation.rdat"))
rownames(annot) <- annot$class
annot <- data.frame(annot[,-c(1)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)
df <- merge(df,annot, by="token", all=TRUE)

annot <- get(load("objects/230413_cds_orientation_annotation.rdat"))
annot <- data.frame(t(annot[,-1]))
annot$token <- rownames(annot)
colnames(annot)<-  c("cds_p","cds_m","cds_other","token")

df <- merge(df,annot, by="token", all=TRUE)

annot <- get(load("objects/230413_HMM_annotation.rdat"))
rownames(annot) <- annot$ID
annot <- data.frame(annot[,-c(1:2)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)

df <- merge(df,annot, by="token", all=TRUE)

annot <- get(load("objects/230413_rep_annotation.rdat"))
rownames(annot) <- annot$repClass
annot <- data.frame(annot[,-c(1)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)

df <- merge(df,annot, by="token", all=TRUE)

annot <- get(load("objects/230413_rep_timing_annotation.rdat"))
rownames(annot) <- annot$class
annot <- data.frame(annot[,-c(1)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)

df <- merge(df,annot, by="token", all=TRUE)

annot <- get(load("objects/230413_rep_strand_annotation.rdat"))
rownames(annot) <- annot$class
annot <- data.frame(annot[,-c(1)])
annot <- data.frame(t(annot))
annot$token <- rownames(annot)

df <- merge(df,annot, by="token", all=TRUE)

save(df, file="objects/GROVER_annotation_df.rdat")

Regression with GC content

   df$X1st.Exon_residuals <- resid(loess(df$X1st.Exon ~ df$GC, span = 1))
   df$X1st.Intron_residuals <- resid(loess(df$X1st.Intron ~ df$GC, span = 1))
   df$X3..UTR_residuals <- resid(loess(df$X3..UTR ~ df$GC, span = 1))
   df$X5..UTR_residuals <- resid(loess(df$X5..UTR ~ df$GC, span = 1))
   df$Distal.Intergenic_residuals <- resid(loess(df$Distal.Intergenic ~ df$GC, span = 1))
   df$Downstream....300._residuals <- resid(loess(df$Downstream....300. ~ df$GC, span = 1))
   df$Other.Exon_residuals <- resid(loess(df$Other.Exon ~ df$GC, span = 1))
   df$Other.Intron_residuals <- resid(loess(df$Other.Intron ~ df$GC, span = 1))
   df$Promoter_residuals <- resid(loess(df$Promoter ~ df$GC, span = 1))
   df$X1_Active_Promoter_residuals <- resid(loess(df$X1_Active_Promoter ~ df$GC, span = 1))
   df$X2_Weak_Promoter_residuals <- resid(loess(df$X2_Weak_Promoter ~ df$GC, span = 1))
   df$X3_Poised_Promoter_residuals <- resid(loess(df$X3_Poised_Promoter ~ df$GC, span = 1))
   df$X4_Strong_Enhancer_residuals <- resid(loess(df$X4_Strong_Enhancer ~ df$GC, span = 1))
   df$X5_Strong_Enhancer_residuals <- resid(loess(df$X5_Strong_Enhancer ~ df$GC, span = 1))
   df$X6_Weak_Enhancer_residuals <- resid(loess(df$X6_Weak_Enhancer ~ df$GC, span = 1))
   df$X7_Weak_Enhancer_residuals <- resid(loess(df$X7_Weak_Enhancer ~ df$GC, span = 1))
   df$X8_Insulator_residuals <- resid(loess(df$X8_Insulator ~ df$GC, span = 1))
   df$X9_Txn_Transition_residuals <- resid(loess(df$X9_Txn_Transition ~ df$GC, span = 1))
   df$X10_Txn_Elongation_residuals <- resid(loess(df$X10_Txn_Elongation ~ df$GC, span = 1))
   df$X11_Weak_Txn_residuals <- resid(loess(df$X11_Weak_Txn ~ df$GC, span = 1))
   df$X12_Repressed_residuals <- resid(loess(df$X12_Repressed ~ df$GC, span = 1))
   df$X13_Heterochrom.lo_residuals <- resid(loess(df$X13_Heterochrom.lo ~ df$GC, span = 1))
   df$X14_Repetitive.CNV_residuals <- resid(loess(df$X14_Repetitive.CNV ~ df$GC, span = 1))
   df$X15_Repetitive.CNV_residuals <- resid(loess(df$X15_Repetitive.CNV ~ df$GC, span = 1))
   
   df$gene_p_residuals <- resid(loess(df$gene_p ~ df$GC, span = 1))
   df$gene_m_residuals <- resid(loess(df$gene_m ~ df$GC, span = 1))
   df$intragenic_residuals <- resid(loess(df$intragenic ~ df$GC, span = 1))
   
   df$early_residuals <- resid(loess(df$early ~ df$GC, span = 1))
   df$mid_early_residuals <- resid(loess(df$mid_early ~ df$GC, span = 1))
   df$mid_late_residuals <- resid(loess(df$mid_late ~ df$GC, span = 1))
   df$late_residuals <- resid(loess(df$late ~ df$GC, span = 1))
   df$cds_p_residuals <- resid(loess(df$cds_p ~ df$GC, span = 1))
   df$cds_m_residuals <- resid(loess(df$cds_m ~ df$GC, span = 1))
   df$cds_other_residuals <- resid(loess(df$cds_other ~ df$GC, span = 1))

Correlation of PCs with annotation

# fix class for token length to numeric
df$length <- as.numeric(as.character(df$length))

tmp <- data.frame(Gpca$x)[,1:100]
tmp$token <- rownames(tmp)
rownames(df)<- df$token
df <- df %>% arrange(df,token)
tmp <- tmp[df$token,]

cors <- data.frame(nrow=20)
for(i in c(colnames(df)[c(18,19,24:26,91,86,85,83,89,84,90,88,87,107:108,114:115,92:106,60:75,110:113,80:81)])){
  cors_tmp <- cor(tmp[,1:20],df[i],method="spearman")
  cors <- cbind(cors,cors_tmp)
}

## Warning in cor(tmp[, 1:20], df[i], method = "spearman"): the standard deviation
## is zero

## Warning in cor(tmp[, 1:20], df[i], method = "spearman"): the standard deviation
## is zero

tmp <- data.frame(GWpca$x)[,1:100]
tmp$token <- rownames(tmp)
rownames(df)<- df$token
df <- df %>% arrange(df,token)
tmp <- tmp[df$token,]

corsW <- data.frame(nrow=20)
for(i in c(colnames(df)[c(18,19,24:26,91,86,85,83,89,84,90,88,87,107:108,114:115,92:106,60:75,110:113,80:81)])){
  cors_tmp <- cor(tmp[,1:20],df[i],method="spearman")
  corsW <- cbind(corsW,cors_tmp)
}

## Warning in cor(tmp[, 1:20], df[i], method = "spearman"): the standard deviation
## is zero

## Warning in cor(tmp[, 1:20], df[i], method = "spearman"): the standard deviation
## is zero

Figure 4G & H

varexplplot(GWvarvec)

ph <- pheatmap(t(corsW[,-c(1,40,48)]),
         breaks = seq(-1, 1, length.out = 100),
         cluster_rows = FALSE,
         cluster_cols = FALSE,
         gaps_row = c(5,14,16,18,33,47,51),
         color = colorRampPalette(c("navy", "white", "firebrick3"))(100)
         )

print(ph)

varexplplot(Gvarvec)

ph <- pheatmap(t(cors[,-c(1,40,48)]),
         breaks = seq(-1, 1, length.out = 100),
         cluster_rows = FALSE,
         cluster_cols = FALSE,
         gaps_row = c(5,14,16,18,33,47,51),
         color = colorRampPalette(c("navy", "white", "firebrick3"))(100)
         )

  print(ph)

Figure 4I

ggplot(df, aes(x=GROVER_PC1, y=log10(frequency), colour=token))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  scale_x_continuous(name=paste("PC1 (",round(100*Gvarvec[1],2),"%)",sep=""))+
  scale_y_continuous(name=paste("frequency (log10)",sep=""))+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

cor.test(df$GROVER_PC1,log10(df$frequency), method="spearman")

## 
##  Spearman's rank correlation rho
## 
## data:  df$GROVER_PC1 and log10(df$frequency)
## S = 4327716, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8803844

Figure 4J

ggplot(df, aes(x=GROVER_PC2, y=GC, colour=token))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  scale_x_continuous(name=paste("PC2 (",round(100*Gvarvec[2],2),"%)",sep=""))+
  scale_y_continuous(name=paste("GC [%]",sep=""))+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

cor.test(df$GROVER_PC2,df$GC, method="spearman")

## Warning in cor.test.default(df$GROVER_PC2, df$GC, method = "spearman"): Cannot
## compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  df$GROVER_PC2 and df$GC
## S = 70998078, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.9623462

Figure 4K

ggplot(df, aes(x=GROVER_PC3, y=AG, colour=W2V_hex))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  scale_x_continuous(name=paste("PC3 (",round(100*Gvarvec[3],2),"%)",sep=""))+
  scale_y_continuous(name=paste("AG [%]",sep=""))+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

cor.test(df$GROVER_PC3,df$AG,method="spearman")

## Warning in cor.test.default(df$GROVER_PC3, df$AG, method = "spearman"): Cannot
## compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  df$GROVER_PC3 and df$AG
## S = 2083581, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##      rho 
## 0.942411

Figure 4L

df$length <- as.numeric(df$length)
ggplot(df, aes(x=GROVER_PC4, y=length, colour=W2V_hex))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  #geom_label_repel(aes(label = token ))+
  scale_x_continuous(name=paste("PC4 (",round(100*Gvarvec[4],2),"%)",sep=""))+
  scale_y_continuous(name=paste("token length",sep=""))+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

cor.test(df$GROVER_PC4,df$length,method="spearman")

## Warning in cor.test.default(df$GROVER_PC4, df$length, method = "spearman"):
## Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  df$GROVER_PC4 and df$length
## S = 22032923, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.3910226

Figure 4M

ggplot(df, aes(x=GROVER_PC5, y=A+C, colour=W2V_hex))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  scale_x_continuous(name=paste("PC5 (",round(100*Gvarvec[5],2),"%)",sep=""))+
  scale_y_continuous(name=paste("AC",sep=""))+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

cor.test(df$GROVER_PC5,df$A+df$C,method="spearman")

## Warning in cor.test.default(df$GROVER_PC5, df$A + df$C, method = "spearman"):
## Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  df$GROVER_PC5 and df$A + df$C
## S = 65480318, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.8098385

Figure 4N

ggplot(df, aes(x=GROVER_PC6, y=length, colour=W2V_hex))+
  geom_point()+
  scale_colour_manual(values=df$W2V_hex)+
  scale_x_continuous(name=paste("PC6 (",round(100*Gvarvec[6],2),"%)",sep=""))+
  scale_y_continuous(name=paste("token length",sep=""))+
  theme_cowplot()+
  theme(legend.position = "none")+
  NULL

cor.test(df$GROVER_PC6,df$length,method="spearman")

## Warning in cor.test.default(df$GROVER_PC6, df$length, method = "spearman"):
## Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  df$GROVER_PC6 and df$length
## S = 20756012, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.4263157

Figure 5

Average GROVER token embeddings cluster by token characteristics and annotation

Clustering and annotation were performed separately in that the order after clustering was obtained and then used for the associated bar graphs. The results were then combined with Adobe Illustrator.

Gdist<- dist(Gemb,method = "euclidean")
Gdist <- as.matrix(Gdist)
Gdf <- data.frame(Gdist)
df$logfreq <- log10(df$frequency)
df$length <- as.factor(df$length)

annot <- df[,c("GROVER_W2V_PC1","GC","AG","length","logfreq")]
rownames(annot) <- df$token
annotcol <- list(length=mycolors,GROVER_W2V_PC1=hex, logfreq=viridis(100, option ="B",direction=-1)) 
breaks <- c(0, seq(min(Gdf[Gdf != 0]), max(Gdf), length.out = 99))

ph <- pheatmap(Gdf, 
         cluster_rows=TRUE,
         cluster_cols=TRUE,
         annotation_col=annot,
         annotation_colors = annotcol,
         annotation_row = NULL,
         annotation_row_names = FALSE,
         annotation_col_names = FALSE,
         color= c("grey",viridis(100,direction=1)),
         show_rownames = F,
         show_colnames = F,
         breaks = breaks)

Ghm1 <- ph$tree_col[["order"]]

load("objects/230413_rep_annotation.rdat")
melted <- melt(repdf2, id="repClass")

rep_cols <- c(brewer.pal(n = 8, name = "Dark2"),brewer.pal(n = 8, name = "Set2"))
repcol <- rep_cols[c(1,9,2,10,3,11,4,12,5,13,6,14,7,15,8,16)]

ggplot(melted,aes(x=variable,y=value,fill=repClass))+
  geom_bar(stat="identity", width = 1)+
  theme_cowplot()+
  scale_fill_manual(values=repcol)+
 coord_flip()+
  NULL

HHMdf <- get(load("objects/230413_HMM_annotation.rdat"))
HHMcol <- c("#FF0000","#FF6969","#CF0BC6", "#FACA00", "#FACA01", "#FFFC04", "#FFFC05", "#0ABEFE", "#00B050", "#00B051", "#99FF66","#7F7F7F", "#F5F5F5", "#F5F5F6", "#F5F5F7")
HHMdf <- data.frame(HHMdf)
 
HHMdf[,3:603] <- t(t(HHMdf[,3:603])/colSums(HHMdf[,3:603]))

melted <- melt(HHMdf[,-2], id=c("ID"))

ggplot(melted,aes(x=variable,y=value,fill=ID))+
  geom_bar(stat="identity", width = 1)+
  theme_cowplot()+
  scale_fill_manual(values=HHMcol)+
  coord_flip()+
  NULL

Figure 6

GROVER learns token context and genome annotation

Clustering and annotation were performed separately in that the order after clustering was obtained and then used for the associated bar graphs. The results were then combined with Adobe Illustrator.

Figure 6A

self_sim <- read.csv("data/self_similarity_per_token.csv")

ss <- self_sim[,3:14]

df <- merge(df, self_sim, id="token")
rownames(ss) <- self_sim$token

df$logfreq <- log10(df$frequency)

annot <- df[,c("logfreq","length","GROVER_W2V_PC1","GROVER_AUC")[4:1]]
annot$length <- factor(annot$length)
rownames(annot) <- df$token

annotcol <- list(length=mycolors,GROVER_W2V_PC1=hex, logfreq=viridis(100, option ="B",direction=-1)) 

phss <-  pheatmap(ss,
         cluster_cols=FALSE,
         annotation_row = annot,
         annotation_colors = annotcol,
         show_rownames = F,
         color= viridis(100,direction=1))

sshm <- phss$tree_row[["order"]]

token_order <- rownames(ss)[sshm]

HHMdf <- get(load("objects/230413_HMM_annotation.rdat"))
HHMdf <- data.frame(HHMdf)
HHMdf[,3:603] <- t(t(HHMdf[,3:603])/colSums(HHMdf[,3:603]))
HHMdf <- HHMdf[,c("ID",token_order)]
melted <- melt(HHMdf[,-2], id=c("ID"))

ggplot(melted,aes(x=variable,y=value,fill=ID))+
  geom_bar(stat="identity", width = 1)+
  theme_cowplot()+
  scale_fill_manual(values=HHMcol)+
 coord_flip()+
  NULL

load("objects/230413_rep_annotation.rdat")

repdf2 <- repdf2[,c("repClass",token_order)]
melted <- melt(repdf2, id="repClass")

rep_cols <- c(brewer.pal(n = 8, name = "Dark2"),brewer.pal(n = 8, name = "Set2"))
repcol <- rep_cols[c(1,9,2,10,3,11,4,12,5,13,6,14,7,15,8,16)]

ggplot(melted,aes(x=variable,y=value,fill=repClass))+
  geom_bar(stat="identity", width = 1)+
  theme_cowplot()+
  scale_fill_manual(values=repcol)+
  coord_flip()+
  NULL

Figure 6B

emb <- read.table("data/umap_pca_50_cls_tokens.bed")
colnames(emb)<- c("chromosome","start","end","x","y")
emb$start <- emb$start+1
GR <- GRanges(emb)
seqinfo(GR)<- seqinfo(Hsapiens)


#annotate to repeats

  repdb <- get(load("data/repdb_UCSC.Rdata"))
  seqlevels(repdb)<- seqlevels(Hsapiens)
  seqinfo(repdb)<- seqinfo(Hsapiens)

## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 14 out-of-bound ranges located on sequences
##   chrUn_gl000211, chrUn_gl000217, chrUn_gl000220, chrUn_gl000227,
##   chrUn_gl000230, chrUn_gl000232, chrUn_gl000233, chrUn_gl000235,
##   chrUn_gl000236, chrUn_gl000244, chrUn_gl000246, chrUn_gl000247,
##   chr1_gl000191_random, and chr19_gl000208_random. Note that ranges
##   located on a sequence whose length is unknown (NA) or on a circular
##   sequence are not considered out-of-bound (use seqlengths() and
##   isCircular() to get the lengths and circularity flags of the underlying
##   sequences). You can use trim() to trim these ranges. See
##   ?`trim,GenomicRanges-method` for more information.

  rep_ss <- repdb[repdb$repClass %in% c("LINE", "LINE?") & strand(repdb) == "-"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[1], alpha=0.4)+
  scale_x_continuous(limits=c(-4.411,20.5))+
  scale_y_continuous(limits=c(-7.69,16.12))+
  ggtitle("LINE-")+
  NULL

  rep_ss <- repdb[repdb$repClass %in% c("LINE", "LINE?") & strand(repdb) == "+"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[9], alpha=0.4)+
    scale_x_continuous(limits=c(-4.411,20.5))+
  scale_y_continuous(limits=c(-7.69,16.12))+
  ggtitle("LINE+")+
  NULL

  rep_ss <- repdb[repdb$repClass %in% c("Low_complexity")]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[2], alpha=0.4)+
    scale_x_continuous(limits=c(-4.411,20.5))+
  scale_y_continuous(limits=c(-7.69,16.12))+
  ggtitle("Low_complexity")+
  NULL

  rep_ss <- repdb[repdb$repClass %in% c("LTR","LTR?") & strand(repdb) == "-"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[3], alpha=0.4)+
    scale_x_continuous(limits=c(-4.411,20.5))+
  scale_y_continuous(limits=c(-7.69,16.12))+
  ggtitle("LTR-")+
  NULL

  rep_ss <- repdb[repdb$repClass %in% c("LTR","LTR?") & strand(repdb) == "+"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[11], alpha=0.4)+
    scale_x_continuous(limits=c(-4.411,20.5))+
  scale_y_continuous(limits=c(-7.69,16.12))+
  ggtitle("LTR+")+
  NULL

  rep_ss <- repdb[repdb$repClass %in% c("other","DNA","DNA?","RC","RNA","scRNA","snRNA","srpRNA","tRNA","Unknown","Unknown?") & strand(repdb) == "-"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[4], alpha=0.4)+
    scale_x_continuous(limits=c(-4.411,20.5))+
  scale_y_continuous(limits=c(-7.69,16.12))+
  ggtitle("other-")+
  NULL

    rep_ss <- repdb[repdb$repClass %in% c("other","DNA","DNA?","RC","RNA","scRNA","snRNA","srpRNA","tRNA","Unknown","Unknown?") & strand(repdb) == "+"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[12], alpha=0.4)+
    scale_x_continuous(limits=c(-4.411,20.5))+
  scale_y_continuous(limits=c(-7.69,16.12))+
  ggtitle("other+")+
  NULL

    rep_ss <- repdb[repdb$repClass %in% c("rRNA") & strand(repdb) == "-"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[5], alpha=0.4)+
    scale_x_continuous(limits=c(-4.411,20.5))+
  scale_y_continuous(limits=c(-7.69,16.12))+
  ggtitle("rRNA-")+
  NULL

    rep_ss <- repdb[repdb$repClass %in% c("rRNA") & strand(repdb) == "+"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[13], alpha=0.4)+
    scale_x_continuous(limits=c(-4.411,20.5))+
  scale_y_continuous(limits=c(-7.69,16.12))+
  ggtitle("rRNA+")+
  NULL

     rep_ss <- repdb[repdb$repClass %in% c("Satellite") & strand(repdb) == "-"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[6], alpha=0.4)+
    scale_x_continuous(limits=c(-4.411,20.5))+
  scale_y_continuous(limits=c(-7.69,16.12))+
  ggtitle("Satellite-")+
  NULL

    rep_ss <- repdb[repdb$repClass %in% c("Satellite") & strand(repdb) == "+"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[14], alpha=0.4)+
    scale_x_continuous(limits=c(-4.411,20.5))+
  scale_y_continuous(limits=c(-7.69,16.12))+
  ggtitle("Satellite+")+
  NULL

      rep_ss <- repdb[repdb$repClass %in% c("Simple_repeat") ]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[15], alpha=0.4)+
    scale_x_continuous(limits=c(-4.411,20.5))+
  scale_y_continuous(limits=c(-7.69,16.12))+
  ggtitle("Simple")+
  NULL

     rep_ss <- repdb[repdb$repClass %in% c("SINE", "SINE?") & strand(repdb) == "-"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[8], alpha=0.4)+
    scale_x_continuous(limits=c(-4.411,20.5))+
  scale_y_continuous(limits=c(-7.69,16.12))+
  ggtitle("SINE-")+
  NULL

    rep_ss <- repdb[repdb$repClass %in% c("SINE","SINE?") & strand(repdb) == "+"]
  dat <- GR[GR %over% rep_ss]
  dat <- data.frame(elementMetadata(dat))
  
  ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=rep_cols[16], alpha=0.4)+
    scale_x_continuous(limits=c(-4.411,20.5))+
  scale_y_continuous(limits=c(-7.69,16.12))+
  ggtitle("SINE+")+
  NULL

Figure 6C

HHMcol[13:15] <- "black"

for(i in c(1:15)){
  GR2 <- HMM[HMM$ID == levels(HHMdf$ID)[i]]
  dat <- GR[GR %over% GR2]
  dat <- data.frame(elementMetadata(dat))
  
  p1 <- ggplot(dat, aes(x=x, y=y))+
  geom_point(colour=HHMcol[i], alpha=0.4)+
  scale_x_continuous(limits=c(-4.411,20.5))+
  scale_y_continuous(limits=c(-7.69,16.12))+
  ggtitle(levels(HHMdf$ID)[i])+
  NULL
  
  print(p1)
}

Figure 7

GROVER outperforms models with fixed-size k-mers for biological fine-tuning tasks

Figure 7A,C,E

Figures 7A,C,E are directly genome browser tracks from the IGV browser for illustration.

Figure 7B

metr <- read.csv("data/fine_tuning_metrics.csv")

prom300 <- metr[c(5:11,16:18),2:7]
melted <- melt(prom300[4:10,], id=c("Dataset","Model"))
melted$Dataset <- factor(melted$Dataset,levels=c("BP shuffle","4mer-shuffle","5mer-shuffle","6mer-shuffle"))

ggplot(melted, aes(x=interaction(Dataset,Model), group=Dataset, y=value, fill=interaction(Dataset, Model)))+
  geom_bar(stat="identity")+
  scale_y_continuous(limits=c(0,1))+
  scale_fill_manual(values=modelcols)+
  facet_wrap(~variable,nrow=1)+
  theme(axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
  NULL

Figure 7B

promscan <- metr[19:22,3:7]
melted <- melt(promscan)

## Using Model as id variables

ggplot(melted, aes(x=Model, y=value, fill=Model))+
  geom_bar(stat="identity")+
  scale_fill_manual(values=modelcols[c(1,2,4,6)])+
  facet_wrap(~variable,nrow=1)+
  theme(axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
  NULL

Figure 7B

CTCF <- metr[23:26,3:7]
melted <- melt(CTCF)

## Using Model as id variables

ggplot(melted, aes(x=Model, y=value, fill=Model))+
  geom_bar(stat="identity")+
  scale_fill_manual(values=modelcols[c(1,2,4,6)])+
  facet_wrap(~variable,nrow=1)+
  theme(axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
  NULL

Session Info

sessionInfo()

## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] clusterProfiler_4.6.0                  
##  [2] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
##  [3] GenomicFeatures_1.50.4                 
##  [4] AnnotationDbi_1.60.0                   
##  [5] Biobase_2.58.0                         
##  [6] BSgenome.Hsapiens.UCSC.hg19_1.4.3      
##  [7] BSgenome_1.66.3                        
##  [8] rtracklayer_1.58.0                     
##  [9] Biostrings_2.66.0                      
## [10] XVector_0.38.0                         
## [11] GenomicRanges_1.50.2                   
## [12] GenomeInfoDb_1.34.9                    
## [13] IRanges_2.32.0                         
## [14] S4Vectors_0.36.1                       
## [15] BiocGenerics_0.44.0                    
## [16] wordcloud2_0.2.2                       
## [17] lsa_0.73.3                             
## [18] SnowballC_0.7.1                        
## [19] umap_0.2.10.0                          
## [20] ggrepel_0.9.3                          
## [21] ggfortify_0.4.15                       
## [22] migest_2.0.3                           
## [23] viridis_0.6.2                          
## [24] viridisLite_0.4.1                      
## [25] pheatmap_1.0.12                        
## [26] gridExtra_2.3                          
## [27] cowplot_1.1.1                          
## [28] reshape2_1.4.4                         
## [29] RColorBrewer_1.1-3                     
## [30] colorspace_2.1-0                       
## [31] pals_1.7                               
## [32] lubridate_1.9.2                        
## [33] forcats_1.0.0                          
## [34] stringr_1.5.0                          
## [35] dplyr_1.1.0                            
## [36] purrr_1.0.1                            
## [37] readr_2.1.4                            
## [38] tidyr_1.3.0                            
## [39] tibble_3.1.8                           
## [40] tidyverse_2.0.0                        
## [41] ggpubr_0.6.0                           
## [42] ggplot2_3.4.1                          
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.3                  reticulate_1.28            
##   [3] tidyselect_1.2.0            RSQLite_2.3.0              
##   [5] htmlwidgets_1.6.2           BiocParallel_1.32.5        
##   [7] scatterpie_0.1.8            munsell_0.5.0              
##   [9] codetools_0.2-19            withr_2.5.0                
##  [11] GOSemSim_2.24.0             filelock_1.0.2             
##  [13] highr_0.10                  knitr_1.43                 
##  [15] rstudioapi_0.14             ggsignif_0.6.4             
##  [17] DOSE_3.24.2                 labeling_0.4.2             
##  [19] MatrixGenerics_1.10.0       GenomeInfoDbData_1.2.9     
##  [21] polyclip_1.10-4             bit64_4.0.5                
##  [23] farver_2.1.1                downloader_0.4             
##  [25] treeio_1.22.0               vctrs_0.6.3                
##  [27] generics_0.1.3              gson_0.0.9                 
##  [29] xfun_0.39                   timechange_0.2.0           
##  [31] BiocFileCache_2.6.1         R6_2.5.1                   
##  [33] graphlayouts_0.8.4          bitops_1.0-7               
##  [35] cachem_1.0.8                fgsea_1.24.0               
##  [37] gridGraphics_0.5-1          DelayedArray_0.24.0        
##  [39] assertthat_0.2.1            BiocIO_1.8.0               
##  [41] scales_1.2.1                ggraph_2.1.0               
##  [43] enrichplot_1.18.3           gtable_0.3.1               
##  [45] tidygraph_1.2.3             rlang_1.1.1                
##  [47] splines_4.2.1               lazyeval_0.2.2             
##  [49] rstatix_0.7.2               dichromat_2.0-0.1          
##  [51] broom_1.0.3                 yaml_2.3.7                 
##  [53] abind_1.4-5                 backports_1.4.1            
##  [55] qvalue_2.30.0               tools_4.2.1                
##  [57] ggplotify_0.1.0             ellipsis_0.3.2             
##  [59] jquerylib_0.1.4             Rcpp_1.0.10                
##  [61] plyr_1.8.8                  progress_1.2.2             
##  [63] zlibbioc_1.44.0             RCurl_1.98-1.10            
##  [65] prettyunits_1.1.1           openssl_2.0.5              
##  [67] SummarizedExperiment_1.28.0 magrittr_2.0.3             
##  [69] data.table_1.14.8           RSpectra_0.16-1            
##  [71] matrixStats_0.63.0          hms_1.1.2                  
##  [73] patchwork_1.1.2             evaluate_0.21              
##  [75] HDO.db_0.99.1               XML_3.99-0.13              
##  [77] compiler_4.2.1              biomaRt_2.54.0             
##  [79] maps_3.4.1                  shadowtext_0.1.2           
##  [81] crayon_1.5.2                htmltools_0.5.5            
##  [83] mgcv_1.8-41                 ggfun_0.0.9                
##  [85] tzdb_0.3.0                  aplot_0.1.9                
##  [87] DBI_1.1.3                   tweenr_2.0.2               
##  [89] dbplyr_2.3.0                MASS_7.3-58.2              
##  [91] rappdirs_0.3.3              Matrix_1.5-3               
##  [93] car_3.1-1                   cli_3.6.1                  
##  [95] parallel_4.2.1              igraph_1.4.0               
##  [97] pkgconfig_2.0.3             GenomicAlignments_1.34.0   
##  [99] xml2_1.3.3                  ggtree_3.6.2               
## [101] bslib_0.5.0                 yulab.utils_0.0.6          
## [103] digest_0.6.31               rmarkdown_2.25             
## [105] fastmatch_1.1-3             tidytree_0.4.2             
## [107] restfulr_0.0.15             curl_5.0.0                 
## [109] Rsamtools_2.14.0            rjson_0.2.21               
## [111] nlme_3.1-162                lifecycle_1.0.3            
## [113] jsonlite_1.8.5              carData_3.0-5              
## [115] mapproj_1.2.11              askpass_1.1                
## [117] fansi_1.0.4                 pillar_1.8.1               
## [119] lattice_0.20-45             KEGGREST_1.38.0            
## [121] fastmap_1.1.1               httr_1.4.5                 
## [123] GO.db_3.16.0                glue_1.6.2                 
## [125] png_0.1-8                   bit_4.0.5                  
## [127] ggforce_0.4.1               stringi_1.7.12             
## [129] sass_0.4.6                  blob_1.2.3                 
## [131] memoise_2.0.1               ape_5.7

The human genome’s vocabulary as proposed by the DNA language model GROVER

Melissa Sanabria, Jonas Hirsch, Anna R Poetsch

17/07/2023

Introduction

Figure 1

DNA byte-pair tokenisation and model architecture

Figure 2

Performance based selection of the vocabulary identifies 600 cycles of Byte-Pair Tokenization as optimal.

Figure 2A

Figure 2B

Figure 2C

Figure 2D

Data preparation for characterisation of vocabulary

Load embeddings

Figure 3

The frequency balanced GROVER vocabulary shows differential learning performance by token length

Figure 3A

Figure 3B

Figure 3C

Figure 3D

Figure 3E

Figure 3F

Figure 3G

Figure 4

Figure 4A

Figure 4B

Figure 4C

Figure 4D

Figure 4E

Figure 4F

Annotation of Tokens

Integration of annotation into df

Regression with GC content

Correlation of PCs with annotation

Figure 4G & H

Figure 4I

Figure 4J

Figure 4K

Figure 4L

Figure 4M

Figure 4N

Figure 5

Average GROVER token embeddings cluster by token characteristics and annotation

Figure 6

GROVER learns token context and genome annotation

Figure 6A

Figure 6B

Figure 6C

Figure 7

GROVER outperforms models with fixed-size k-mers for biological fine-tuning tasks

Figure 7A,C,E

Figure 7B

Figure 7B

Figure 7B

Session Info