GROVER ("Genome Rules Obtained Via Extracted Representations") is a foundation DNA language model with an optimized vocabulary for the human genome.

If you have any questions/requests/comments please contact melissa.sanabria@tu-dresden.de or arpoetsch@gmail.com

The preprint that is associated with this script can be found at: https://www.biorxiv.org/content/10.1101/2023.07.19.549677v1

A tutorial on how to use GROVER as a foundation model can be found at: https://doi.org/10.5281/zenodo.8373159

The pretrained model can be found at: https://doi.org/10.5281/zenodo.8373117

The data for the tokenised genome are at: https://doi.org/10.5281/zenodo.8373053

Introduction

Scripts to reproduce each figure can be found in the respective R-markdown document. The required input files, which can be produced with this Jupyter notebook, are also provided there separately.

The following code shows how every data file was generated. You will find a flag FINAL_FILE that indicates the generation of a file that will be used in the R code.

Figure 1

Figure 1 includes the tokenization results that are derived from the code for Figure 2.

Figure 2

Performance based selection of the vocabulary identifies 600 cycles of Byte-Pair Tokenization as optimal.

Figure 2A

Selection of the optimal vocabulary through accuracy of next-token prediction as a fine-tuning task for the foundation models, using prediction of 2 to 6 nucleotide long next-k-mers as readout. Depicted is accuracy with a loess fit and 95% confidence interval.

root_path = "chr21/"
import random
import glob
import pickle
import os
import math

import numpy as np
import pandas as pd

from transformers import (
    get_linear_schedule_with_warmup, AdamW, BertTokenizer, BertConfig,
    BertForMaskedLM, BertForSequenceClassification, BertModel,
    PreTrainedTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments,
)
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from sklearn.preprocessing import OneHotEncoder

from itertools import product
## Dataset: 400,000 sequences from chromosome 21 for training and 100,000 sequences from chromosome 21 for testing.

train_file_path = root_path + "train.txt"
test_file_path = root_path + "test.txt"
## Vocabulary information per iteration
vocabs_file_path = root_path + "vocabs_chr21.pkl"
## Tokenize sequences

def find_indices(seq, first, second):
    # find indices of all first tokens of the most common bigram
    ixs = {}
    ix = 0
    just_added = False # avoid overlapping matches, e.g. AAA is tokenized as AA A, not AA AA
    for ch1, ch2 in zip(seq, seq[1:]):
        if ((ch1 == first) and (ch2 == second)) and not just_added:
            ixs[ix] = 1
            just_added = True
        else:
            just_added = False
        ix += 1
    return ixs

def merge_tokens(seq, ixs, first, second):
    # merge the most common bigram in place at the first token (the second token is removed below)
    new_token = first + second
    for i in ixs.keys():
        seq[i] = new_token
    
    # remove the token that was merged into its predecessor
    seq = [x for i, x in enumerate(seq) if i-1 not in ixs]

    return seq
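
As a quick illustration of how these two helpers apply a single merge rule, consider the following toy example (not part of the original pipeline; sequence and merge rule chosen arbitrarily):

# Toy example: apply the merge rule ("A", "C") to the sequence ACGACAC.
# Matches are found left to right without overlaps, then merged in place.
toy_seq = list("ACGACAC")
toy_ixs = find_indices(toy_seq, "A", "C")           # {0: 1, 3: 1, 5: 1}
toy_seq = merge_tokens(toy_seq, toy_ixs, "A", "C")
print(toy_seq)                                      # ['AC', 'G', 'AC', 'AC']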

vocabs = pickle.load(open(vocabs_file_path, "rb"))

for iteration in vocabs.keys():
    iteration_path = root_path + str(iteration)+"/"
    if not os.path.exists(iteration_path):
        os.mkdir(iteration_path)
    else:
        continue
    
    data_path = iteration_path + "data/"
    if not os.path.exists(data_path):
        os.mkdir(data_path)
    for set_name in ["train", "test"]:
        with open(root_path + set_name + ".txt", "r") as samples_file:
            with open(data_path + set_name + ".txt", "w") as iter_set_file:
                for sample in samples_file.readlines(): 
                    tokenized_sequence = list(sample.replace("\n", ""))                        
                    for token, info in vocabs[iteration].items(): 
                        merge_rule = info[1]
                        ixs = find_indices(tokenized_sequence, merge_rule[0], merge_rule[1])
                        tokenized_sequence = merge_tokens(tokenized_sequence, ixs, merge_rule[0], merge_rule[1])
                    sample = " ".join(tokenized_sequence[:512])
                    iter_set_file.write(sample + "\n")

The following code is an example for one of the vocabulary iterations. It can be repeated for each of them.

iteration = 600
iteration_path = root_path + str(iteration)+"/"
with open(iteration_path +"vocab.txt", "w") as vocab_file:
        vocab_file.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\n")
        for key, value in vocabs[iteration].items():
            vocab_file.write(value[0] + "\n")
            
tokenizer = BertTokenizer.from_pretrained(iteration_path +"vocab.txt")


class LineByLineTextDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size=512):

        with open(file_path, encoding="utf-8") as f:

            lines = f.read().splitlines()[:-1]
            self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, truncation=True, max_length=block_size)["input_ids"]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i], dtype=torch.long)


test_dataset = LineByLineTextDataset(tokenizer, root_path + str(iteration)+"/data/test.txt")
train_dataset = LineByLineTextDataset(tokenizer, root_path + str(iteration)+"/data/train.txt")
    
mlm_probability = 0.022
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
        mlm_probability=mlm_probability)


config = BertConfig.from_json_file(root_path + "config.json")
model = BertForMaskedLM(config)
output_path = ""

training_args = TrainingArguments(
    output_dir=output_path,
    gradient_accumulation_steps=25,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=6,
    save_steps=500,
    save_total_limit=20,
    max_steps=20000,
    learning_rate=4e-4,
    adam_epsilon=1e-6,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.98,
    warmup_steps=1000,
    evaluation_strategy="steps",
    eval_steps=500,
)

# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()
trainer.save_model() 

perplexity_history = {}

step = 500
for log_history in trainer.state.log_history:
    if 'eval_loss' in log_history.keys():
        perplexity_history[step] = math.exp(log_history['eval_loss'])
        step += 500
pickle.dump(perplexity_history, open(root_path + "perplexity.pkl", "wb"))

Fine-tuning on next-k-mer

We choose the training step with best performance (best perplexity) on the test set.

Here we show an example for predicting the next 2-mer. The process for the other next-k-mers is the same.
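
As a minimal sketch of that selection step (assumptions: perplexity.pkl written above maps evaluation step to perplexity, and checkpoints follow the Trainer's default "checkpoint-<step>" naming under the training output directory):

perplexity_history = pickle.load(open(root_path + "perplexity.pkl", "rb"))
best_step = min(perplexity_history, key=perplexity_history.get)
# checkpoint of the best evaluation step; use it as the model path for fine-tuning
best_checkpoint_path = output_path + "checkpoint-" + str(best_step)
print(best_step, perplexity_history[best_step])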

## Creation of the next-k-mer dataset
next_kmer = 2

# class labels: one class per possible k-mer (here the 16 dinucleotides)
nucleotides = ["A", "C", "G", "T"]
kmer_vocab = {"".join(p): i for i, p in enumerate(product(nucleotides, repeat=next_kmer))}

grover_data = pd.read_csv(root_path + str(iteration)+"/data/test.txt", header=None, names=["sequence"])
grover_data_next_kmer = pd.DataFrame()

grover_data_next_kmer["sequence"] = grover_data["sequence"].apply(lambda x: x.split(" ")[:50])
grover_data_next_kmer["next_kmer"] = grover_data["sequence"].apply(lambda x: "".join(x.split(" ")[50:])[:next_kmer])
       
grover_data_next_kmer["class"] = grover_data_next_kmer["next_kmer"].apply(lambda x: kmer_vocab[x])
test_data_grover = pd.DataFrame({'X': grover_data_next_kmer["sequence"], 'class': grover_data_next_kmer["class"]})


grover_data = pd.read_csv(root_path + str(iteration)+"/data/train.txt", header=None, names=["sequence"])
grover_data_next_kmer = pd.DataFrame()

grover_data_next_kmer["sequence"] = grover_data["sequence"].apply(lambda x: x.split(" ")[:50])
grover_data_next_kmer["next_kmer"] = grover_data["sequence"].apply(lambda x: "".join(x.split(" ")[50:])[:next_kmer])
       
grover_data_next_kmer["class"] = grover_data_next_kmer["next_kmer"].apply(lambda x: kmer_vocab[x])
train_data_grover = pd.DataFrame({'X': grover_data_next_kmer["sequence"], 'class': grover_data_next_kmer["class"]})
class GroverDataSet(Dataset):
    def __init__(self, sequences, y, tokenizer, max_length=50):
        print("Loading GROVER Dataset")
        self.BERTtokenizer = tokenizer # to convert ATG ATCGA CG -> [CLS] ATG ATCGA CG [SEP] -> [2, 123, 456, 789, 101, 3]
        self.sequences = sequences
        self.max_length = max_length
        self.y = np.array(y, dtype=np.float32).reshape(-1, 1)

        self.label_encoder = OneHotEncoder(sparse_output=False, categories=[list(range(16))])
        self.label_encoder.fit(self.y)
        self.y = self.label_encoder.transform(self.y)

        self.y = torch.tensor(self.y)
        print(self.y.shape)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]
        tokenizer_res = self.BERTtokenizer.encode_plus(seq, add_special_tokens=True, padding="max_length", return_tensors="pt", max_length=self.max_length, truncation=True)
        ids = tokenizer_res["input_ids"].squeeze(0)
        attention_masks = tokenizer_res["attention_mask"]
        return ids, self.y[idx], attention_masks, idx
grover_tokenizer = BertTokenizer.from_pretrained(output_path)
train_grover = GroverDataSet(train_data_grover["X"].values, train_data_grover["class"], grover_tokenizer, max_length=50)
test_grover = GroverDataSet(test_data_grover["X"].values, test_data_grover["class"], grover_tokenizer, max_length=50)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
grover = BertForSequenceClassification.from_pretrained(output_path, num_labels=len(kmer_vocab)).to(device)
output_path_next_kmer = ""

training_args = TrainingArguments(
    output_dir=output_path_next_kmer,
    num_train_epochs=100,
    learning_rate=1e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_steps=1000,
    logging_steps=1000,
    evaluation_strategy="steps",
    eval_steps=1000,
)
def compute_metrics(pred):
    # labels come one-hot encoded from GroverDataSet, reduce them to class indices
    labels = pred.label_ids
    if labels.ndim > 1:
        labels = labels.argmax(axis=1)
    preds = pred.predictions.argmax(axis=1)
    return {
        'accuracy': (preds == labels).mean()
    }

# Initialize our Trainer
trainer = Trainer(
    model=grover,
    args=training_args,
    train_dataset=train_grover,
    eval_dataset=test_grover,
    tokenizer=grover_tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.save_model()

accuracy_history = {}

step = 1000
for log_history in trainer.state.log_history:
    if 'eval_accuracy' in log_history.keys():
        accuracy_history[step] = log_history['eval_accuracy']
        step += 1000
pickle.dump(accuracy_history, open(output_path_next_kmer + "accuracy.pkl", "wb"))
## Creating performance file
results_dict = {}

for iteration in [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000]:
    results_dict[iteration] = {}
    for n_mer in range(2, 7):
        # accuracy.pkl of the corresponding fine-tuning run (one per iteration and next-k-mer)
        results_dict[iteration][n_mer] = pickle.load(open(output_path_next_kmer + "accuracy.pkl", "rb"))

performance_to_save = {}
for n_mer in range(2, 7):
    performance_to_save[n_mer] = {}
    for iteration in [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000]:
        performance_to_save[n_mer][iteration] = np.max(list(results_dict[iteration][n_mer].values()))

df = pd.DataFrame.from_dict(performance_to_save)
df.to_csv("performance.csv", index=False)
## FINAL_FILE

Figure 2B

Performance comparison using accuracy of next-k-mer prediction as a fine-tuning task. Compared are GROVER with 600 cycles of Byte-Pair Tokenization with models based on k-mer-tokenization, with length of 4, 5, and 6 nucleotides.
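
For reference, fixed-size k-mer tokenization simply cuts a sequence into consecutive, non-overlapping k-mers; a toy illustration (not part of the pipeline below):

# Toy illustration: non-overlapping 4-mer tokenization of a short sequence
toy_sequence = "AACGTTGCAACG"
toy_4mers = [toy_sequence[i:i + 4] for i in range(0, len(toy_sequence) - 3, 4)]
print(" ".join(toy_4mers))  # AACG TTGC AACG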

letters = ['A', 'C', 'G', 'T']
for kmer in [4, 5, 6]:
    kmer_path = root_path + str(kmer)+"/"
    if not os.path.exists(kmer_path):
        os.mkdir(kmer_path)
    else:
        continue
    
    vocab = product(letters, repeat=kmer)
    with open(kmer_path +"vocab.txt", "w") as vocab_file:
        vocab_file.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\n")
        for token in vocab:
            vocab_file.write("".join(token) + "\n")
    
    data_path = kmer_path + "data/"
    if not os.path.exists(data_path):
        os.mkdir(data_path)
    for set_name in ["train", "test"]:
        with open(root_path + set_name + ".txt", "r") as samples_file:
            with open(data_path + set_name + ".txt", "w") as iter_set_file:
                for sample in samples_file.readlines(): 
                    new_line = ""
                    count = 0
                    for init in range(0, len(sample) - kmer, kmer):
                        new_line += sample[init: init + kmer] + " "
                        count += 1
                        if count > 510:
                            break
                    new_line = " ".join(new_line[:-1].split(" "))
                    iter_set_file.write(new_line + "\n")

We will show how to do it for one k-mer; it can be repeated for the others.

kmer = 4

kmer_path = root_path + str(kmer)+"/"

tokenizer = BertTokenizer.from_pretrained(kmer_path +"vocab.txt")


class LineByLineTextDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size=512):

        with open(file_path, encoding="utf-8") as f:

            lines = f.read().splitlines()[:-1]
            self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, truncation=True, max_length=block_size)["input_ids"]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i], dtype=torch.long)


test_dataset = LineByLineTextDataset(tokenizer, kmer_path+"/data/test.txt")
train_dataset = LineByLineTextDataset(tokenizer, kmer_path+"/data/train.txt")
mlm_probability = 0.022
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
        mlm_probability=mlm_probability)


config = BertConfig.from_json_file(root_path + "config_4mer.json")
model = BertForMaskedLM(config)
output_path = ""

training_args = TrainingArguments(
    output_dir=output_path,
    gradient_accumulation_steps=25,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=6,
    save_steps=500,
    save_total_limit=20,
    max_steps=20000,
    learning_rate=4e-4,
    adam_epsilon=1e-6,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.98,
    warmup_steps=1000,
    evaluation_strategy="steps",
    eval_steps=500,
)

# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()
trainer.save_model()

perplexity_history = {}

step = 500
for log_history in trainer.state.log_history:
    if 'eval_loss' in log_history.keys():
        perplexity_history[step] = math.exp(log_history['eval_loss'])
        step += 500
pickle.dump(perplexity_history, open(root_path + "perplexity.pkl", "wb"))

Fine-tuning on next-k-mers

We choose the training step with best performance on the test set.

Here we show an example for predicting the next 2-mer. The process for the other next-k-mers is the same.

## Creation of the next-k-mers dataset
next_kmer = 2

# class labels: one class per possible k-mer (here the 16 dinucleotides)
nucleotides = ["A", "C", "G", "T"]
kmer_vocab = {"".join(p): i for i, p in enumerate(product(nucleotides, repeat=next_kmer))}

kmer_data = pd.read_csv(kmer_path+"/data/test.txt", header=None, names=["sequence"])
kmer_data_next_kmer = pd.DataFrame()

kmer_data_next_kmer["sequence"] = kmer_data["sequence"].apply(lambda x: x.split(" ")[:50])
kmer_data_next_kmer["next_kmer"] = kmer_data["sequence"].apply(lambda x: "".join(x.split(" ")[50:])[:next_kmer])
       
kmer_data_next_kmer["class"] = kmer_data_next_kmer["next_kmer"].apply(lambda x: kmer_vocab[x])
test_data_kmer = pd.DataFrame({'X': kmer_data_next_kmer["sequence"], 'class': grover_data_next_kmer["class"]})


kmer_data = pd.read_csv(kmer_path+"/data/train.txt", header=None, names=["sequence"])
kmer_data_next_kmer = pd.DataFrame()

kmer_data_next_kmer["sequence"] = kmer_data["sequence"].apply(lambda x: x.split(" ")[:50])
kmer_data_next_kmer["next_kmer"] = kmer_data["sequence"].apply(lambda x: "".join(x.split(" ")[50:])[:next_kmer])
       
kmer_data_next_kmer["class"] = kmer_data_next_kmer["next_kmer"].apply(lambda x: kmer_vocab[x])
train_data_kmer = pd.DataFrame({'X': kmer_data_next_kmer["sequence"], 'class': grover_data_next_kmer["class"]})
class GroverDataSet(Dataset):
    def __init__(self, sequences, y, tokenizer, max_length=50):
        print("Loading GROVER Dataset")
        self.BERTtokenizer = tokenizer # to convert ATG ATCGA CG -> [CLS] ATG ATCGA CG [SEP] -> [2, 123, 456, 789, 101, 3]
        self.sequences = sequences
        self.max_length = max_length
        self.y = np.array(y, dtype=np.float32).reshape(-1, 1)

        self.label_encoder = OneHotEncoder(sparse_output=False, categories=[list(range(16))])
        self.label_encoder.fit(self.y)
        self.y = self.label_encoder.transform(self.y)

        self.y = torch.tensor(self.y)
        print(self.y.shape)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]
        tokenizer_res = self.BERTtokenizer.encode_plus(seq, add_special_tokens=True, padding="max_length", return_tensors="pt", max_length=self.max_length, truncation=True)
        ids = tokenizer_res["input_ids"].squeeze(0)
        attention_masks = tokenizer_res["attention_mask"]
        return ids, self.y[idx], attention_masks, idx
kmer_tokenizer = BertTokenizer.from_pretrained(output_path)
train_kmer = GroverDataSet(train_data_kmer["X"].values, train_data_kmer["class"], kmer_tokenizer, max_length=50)
test_kmer = GroverDataSet(test_data_kmer["X"].values, test_data_kmer["class"], kmer_tokenizer, max_length=50)
kmer_model = BertForSequenceClassification.from_pretrained(output_path, num_labels=len(kmer_vocab)).to(device)
output_path_next_kmer = ""

training_args = TrainingArguments(
    output_dir=output_path_next_kmer,
    num_train_epochs=100,
    learning_rate=1e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_steps=1000,
    logging_steps=1000,
    evaluation_strategy="steps",
    eval_steps=1000,
)

def compute_metrics(pred):
    # labels come one-hot encoded from GroverDataSet, reduce them to class indices
    labels = pred.label_ids
    if labels.ndim > 1:
        labels = labels.argmax(axis=1)
    preds = pred.predictions.argmax(axis=1)
    return {
        'accuracy': (preds == labels).mean()
    }

# Initialize our Trainer
trainer = Trainer(
    model=kmer_model,
    args=training_args,
    train_dataset=train_kmer,
    eval_dataset=test_kmer,
    tokenizer=kmer_tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.save_model()  

accuracy_history = {}

step = 1000
for log_history in trainer.state.log_history:
    if 'eval_accuracy' in log_history.keys():
        accuracy_history[step] = log_history['eval_accuracy']
        step += 1000
pickle.dump(accuracy_history, open(output_path_next_kmer + "accuracy.pkl", "wb"))
## Creating performance file
results_dict = {}

for iteration in [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000]:
    results_dict[iteration] = {}
    for n_mer in range(2, 7):
        # accuracy.pkl of the corresponding fine-tuning run (one per model and next-k-mer)
        results_dict[iteration][n_mer] = pickle.load(open(output_path_next_kmer + "accuracy.pkl", "rb"))

performance_to_save = {}
for n_mer in range(2, 7):
    performance_to_save[n_mer] = {}
    for iteration in [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000]:
        performance_to_save[n_mer][iteration] = np.max(list(results_dict[iteration][n_mer].values()))

df = pd.DataFrame.from_dict(performance_to_save)
df.to_csv("performance.csv", index=False)
## FINAL_FILE

Figure 2C

Performance assessment of GROVER with 600 cycles Byte-Pair Tokenization using accuracy for the masked token being predicted as the TOP 1 token, up to TOP 60, i.e. the TOP 10%.
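
The TOP-k accuracy used here counts a prediction as correct if the true token is among the k highest-scoring tokens. A small sketch with sklearn's top_k_accuracy_score (toy scores and a vocabulary of 4 classes, for illustration only):

import numpy as np
from sklearn.metrics import top_k_accuracy_score

toy_labels = np.array([0, 2, 3])
toy_scores = np.array([[0.6, 0.2, 0.1, 0.1],   # true class ranked 1st
                       [0.4, 0.3, 0.2, 0.1],   # true class ranked 3rd
                       [0.1, 0.2, 0.3, 0.4]])  # true class ranked 1st
print(top_k_accuracy_score(toy_labels, toy_scores, k=1, labels=np.arange(4)))  # ~0.67
print(top_k_accuracy_score(toy_labels, toy_scores, k=3, labels=np.arange(4)))  # 1.0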

First, we will explain how we obtained the data from the whole reference genome. Then we will show how to train GROVER.

root_path = "wg/"

The tokenized chromosomes can be found in the folder tokenized_chromosomes. In case you want to know how they were generated, these are the steps to tokenize the whole genome.

import random
import glob
import pickle
import os
import argparse
from Bio import SeqIO

def n_intervals(seq):
    # replace IUPAC ambiguity codes with N
    for n_code in ["R", "Y", "S", "W", "K", "M", "B", "D", "H", "V"]:
        seq = seq.replace(n_code, "N")

    # positions of all N characters
    N_positions = [i for i in range(len(seq)) if seq.startswith("N", i)]
    # return the non-N intervals that lie between consecutive runs of N
    N_intervals = []
    for idx in range(1, len(N_positions)):
        if N_positions[idx] != (N_positions[idx - 1] + 1):
            N_intervals.append([N_positions[idx - 1] + 1, N_positions[idx]])
    return N_intervals

def find_indices(seq, first, second):
    # find indices of all first tokens of the most common bigram
    ixs = {}
    ix = 0
    just_added = False # avoid overlapping matches, e.g. AAA is tokenized as AA A, not AA AA
    for ch1, ch2 in zip(seq, seq[1:]):
        if ((ch1 == first) and (ch2 == second)) and not just_added:
            ixs[ix] = 1
            just_added = True
        else:
            just_added = False
        ix += 1
    return ixs

def merge_tokens(seq, ixs, first, second):
    # merge the most common bigram in place at the first token (the second token is removed below)
    new_token = first + second
    for i in ixs.keys():
        seq[i] = new_token
    
    # remove the token that was merged into its predecessor
    seq = [x for i, x in enumerate(seq) if i-1 not in ixs]

    return seq

os.makedirs("tokenized_chromosomes", exist_ok=True)
for chrom in range(1, 25):

    chrom_file = "seq_chr_"+str(chrom)+".fasta" ## Fasta file per chromosome
    chrom_seq = str(SeqIO.read(chrom_file, "fasta").seq)
    these_n_intervals = n_intervals(chrom_seq)
    sequences = []
    for interval in these_n_intervals:
        sequences.append(chrom_seq[interval[0] : interval[1]])

    vocab_iter_file = root_path + "vocab_info.pkl"
    vocab_iter = pickle.load(open(vocab_iter_file, "rb"))

    tok_chrom = []
    for sequence in sequences:
        tokenized_sequence = list(sequence)
        for key, value in vocab_iter.items(): 
            rule = value[1]
            ixs = find_indices(tokenized_sequence, rule[0], rule[1])
            tokenized_sequence = merge_tokens(tokenized_sequence, ixs, rule[0], rule[1])

        tok_chrom.append(tokenized_sequence)
        
    pickle.dump(tok_chrom, open("tokenized_chromosomes/chr_"+str(chrom)+".pkl", "wb"))

You can access the train and test data from the root folder.

In case you want to know how we generated the data, this is the code.

## Create windows
import random
import pickle
from random import shuffle

windows = []
for chrom in range(1, 25):
    tok_chrom = pickle.load(open("tokenized_chromosomes/chr_"+str(chrom)+".pkl", "rb"))
    for s, sequence in enumerate(tok_chrom):
        length_seq = len(sequence)
        window_start = 0
        remaining_length = length_seq - window_start
        while remaining_length > 510:
            prob = random.uniform(0, 1)
            if prob < 0.5:
                length = 510
            else:
                length = random.randint(20, 510)
            windows.append([chrom, s, window_start, window_start + length])
            
            window_start = window_start + length
            remaining_length = length_seq - window_start

            
shuffle(windows)
# number of windows assigned to the training split (assumption: the split fraction is not
# specified in this notebook; 90% is used here as a placeholder, adjust as needed)
train_length = int(0.9 * len(windows))
windows_per_chromosome = {}
for w, window in enumerate(windows):
    chrom = window[0]
    if chrom not in windows_per_chromosome:
        windows_per_chromosome[chrom] = {}
        windows_per_chromosome[chrom]["train"] = []
        windows_per_chromosome[chrom]["test"] = []
    if w > train_length:
        split = "test"
    else:
        split = "train"
    
    windows_per_chromosome[chrom][split].append(window[1:])
    
samples_per_chromosome = {}
train_factor = 3
test_factor = 2

nb_train_windows = 0
nb_test_windows = 0
for chrom in windows_per_chromosome.keys():
    for split in windows_per_chromosome[chrom].keys():
        if split in "train":
            factor = train_factor
        if split in "test":
            factor = test_factor
        
        new_windows = []
        for window in windows_per_chromosome[chrom][split]:
            window_start = window[-2]
            window_end = window[-1]
            window_length = window_end - window_start
            if window_length > 50:
                for i in range(factor):
                    random_start = random.randint(window_start + 1, window_end - 20)
                    random_end = random.randint(random_start + 20, window_end)
                    new_windows.append([window[0], random_start, random_end])
        windows_per_chromosome[chrom][split] += new_windows
        
with open(root_folder + "train.txt", "w") as train_file:
    for chrom in range(1, 25):
        tok_chrom = pickle.load(open("chr_"+str(chrom)+".pkl", "rb"))
        for window in windows_per_chromosome[chrom]["train"]:
            sample = tok_chrom[window[0]][window[1]: window[2]]
            train_file.write(" ".join(sample) + "\n")
            
with open(root_folder + "test.txt", "w") as train_file:
    for chrom in range(1, 25):
        tok_chrom = pickle.load(open("chr_"+str(chrom)+".pkl", "rb"))
        for window in windows_per_chromosome[chrom]["test"]:
            sample = tok_chrom[window[0]][window[1]: window[2]]
            train_file.write(" ".join(sample) + "\n")
## Training GROVER

with open(root_path +"vocab.txt", "w") as vocab_file:
        vocab_file.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\n")
        for key, value in vocabs[iteration].items():
            vocab_file.write(value[0] + "\n")
            
tokenizer = BertTokenizer.from_pretrained(root_path +"vocab.txt")


class LineByLineTextDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size=512):

        with open(file_path, encoding="utf-8") as f:

            lines = f.read().splitlines()[:-1]
            self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, truncation=True, max_length=block_size)["input_ids"]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i], dtype=torch.long)


test_dataset = LineByLineTextDataset(tokenizer, root_path + "/test.txt")
train_dataset = LineByLineTextDataset(tokenizer, root_path + "/train.txt")
    
mlm_probability = 0.022
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
        mlm_probability=mlm_probability)


config = BertConfig.from_json_file(root_path + "config.json")
model = BertForMaskedLM(config)
output_path = ""

training_args = TrainingArguments(
    output_dir=output_path,
    gradient_accumulation_steps=25,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=6,
    save_steps=500,
    save_total_limit=20,
    max_steps=20000,
    learning_rate=4e-4,
    adam_epsilon=1e-6,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.98,
    warmup_steps=1000,
    evaluation_strategy="steps",
    eval_steps=500,
)

# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()
trainer.save_model()  

perplexity_history = {}

step = 500
for log_history in trainer.state.log_history:
    if 'eval_loss' in log_history.keys():
        perplexity_history[step] = math.exp(log_history['eval_loss'])
        step += 500
pickle.dump(perplexity_history, open(root_path + "perplexity.pkl", "wb"))
## Prediction of a masked token per sample

from typing import List

from sklearn.metrics import top_k_accuracy_score
from scipy.special import softmax
from tqdm import tqdm

def collate(examples: List[torch.Tensor]):
    if tokenizer._pad_token is None:
        return pad_sequence(examples, batch_first=True)
    return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

eval_sampler = SequentialSampler(test_dataset)
eval_dataloader = DataLoader(
    test_dataset, sampler=eval_sampler, batch_size=32, collate_fn=collate)


model = BertForMaskedLM.from_pretrained(output_path, config=root_path + "config.json").to("cuda:0")
model.eval()
predictions = []
labels = []
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    with torch.no_grad():
        
        probability_matrix = torch.rand(batch.shape)
        special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in batch.tolist()]
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
        if tokenizer._pad_token is not None:
            padding_mask = batch.eq(tokenizer.pad_token_id)
            probability_matrix.masked_fill_(padding_mask, value=0.0)
        
        token_per_sample = torch.argmax(probability_matrix, dim=1)

        # keep the original token ids before masking; they are the labels
        original_tokens = batch[torch.arange(batch.shape[0]), token_per_sample].clone()
        batch[torch.arange(batch.shape[0]), token_per_sample] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

        batch = batch.to("cuda:0")
        outputs = model(batch)
        prediction = outputs[0].detach().cpu().numpy()[torch.arange(batch.shape[0]), token_per_sample]
        predictions.append(softmax(prediction, axis=-1))

        labels.append(original_tokens.numpy())
        
labels = np.hstack(labels)
predictions = np.vstack(predictions)
top_k = np.zeros(300)
for k in range(1, 301):
   
    top_k[k-1] = top_k_accuracy_score(labels, predictions, k=k, labels=np.arange(609))

    print(k, top_k[k-1] )

pickle.dump(top_k, open(root_path + "top_k.pkl", "wb")) ## FINAL_FILE
pickle.dump(labels, open(root_path + "labels.pkl", "wb"))
pickle.dump(predictions, open(root_path + "predictions.pkl", "wb"))

Figure 2D

Performance assessment of GROVER with 600 cycles Byte-Pair Tokenization using perplexity, divided by the total number of words in the dictionary. Comparison with models based on k-mer-tokenization, with length of 4, 5, and 6 nucleotides.
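
This panel only requires the perplexity values saved above, divided by the respective vocabulary size. A minimal sketch of the normalization (assumptions: per-model directories, each holding the perplexity.pkl and vocab.txt produced above; the directory names are placeholders):

normalized_perplexity = {}
for model_name, model_dir in [("GROVER", "600/"), ("4mer", "4/"), ("5mer", "5/"), ("6mer", "6/")]:
    with open(model_dir + "vocab.txt") as vocab_file:
        vocab_size = len(vocab_file.readlines())
    perplexity_history = pickle.load(open(model_dir + "perplexity.pkl", "rb"))
    normalized_perplexity[model_name] = min(perplexity_history.values()) / vocab_size
print(normalized_perplexity)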

Figure 3

The frequency balanced GROVER vocabulary shows differential learning performance by token length

To generate Figure 3F and Figure 3G we need the metrics per token.

from sklearn.metrics import roc_auc_score

def roc_auc_score_multiclass(actual_class, pred_class, average = "weighted"):
    
    #creating a set of all the unique classes using the actual class list
    unique_class = list(set(actual_class))
    roc_auc_dict = {}
    for per_class in unique_class:
        
        #creating a list of all the classes except the current class 
        other_class = [x for x in unique_class if x != per_class]

        #marking the current class as 1 and all other classes as 0
        new_actual_class = [0 if x in other_class else 1 for x in actual_class]
        new_pred_class = [0 if x in other_class else 1 for x in pred_class]

        #using the sklearn metrics method to calculate the roc_auc_score
        roc_auc = roc_auc_score(new_actual_class, new_pred_class, average = average)
        roc_auc_dict[per_class] = roc_auc
        
    return roc_auc_dict


labels = pickle.load(open(root_path + "labels.pkl", "rb"))
predictions = pickle.load(open(root_path + "predictions.pkl", "rb"))

acc_per_token = np.zeros((609, 300))
for k in range(1, 301):
    for t in range(609):
        this_token_ids = np.where(labels == t)[0]
        if len(this_token_ids) > 0:
            this_token_labels = labels[this_token_ids]
            this_token_predictions = predictions[this_token_ids]
            acc_per_token[t, k - 1] = top_k_accuracy_score(this_token_labels, this_token_predictions, k=k, labels=np.arange(609))
        

# AUC per token, computed from the predicted (argmax) token classes
auc_per_token = roc_auc_score_multiclass(labels, predictions.argmax(axis=1), "macro")
        
        
output_file_path = "metrics_per_token.csv" ## FINAL_FILE
with open(output_file_path, "w") as output:
    header = "token_id,token,auc,"
    for k in range(1,301):
        header += "top"+str(k)+","
    header = header[:-1] ## Remove comma
    output.write(header + "\n")
    for t, token in enumerate(tokens):
        auc = 0
        if t in auc_per_token:
            auc = auc_per_token[t]
        
        line_to_write = str(t) +","+ token +","+ str(auc) +","
        for k in range(300):
            line_to_write += str(acc_per_token[t,k]) + ","
        output.write(line_to_write[:-1] + "\n")

Figure 4

Average GROVER token embedding shows learning of genome information content

from gensim.models import Word2Vec
from sklearn.decomposition import PCA

with open(root_path + "/train.txt") as train_file:
    data = train_file.readlines()[:-1]
sent = [row.split() for row in data[:300000]]
w2v_model = Word2Vec(sent, min_count=1, vector_size=768, window=5, workers=10)

pickle.dump(w2v_model.wv.vectors, open(root_path + "iter600_w2v.pkl", "wb")) ## FINAL_FILE

with open(root_path + "iter600_w2v_vocab.txt", "w") as vocab_file: ## FINAL_FILE
    for word in w2v_model.wv.index_to_key:
        vocab_file.write(word + "\n")
model = BertModel.from_pretrained(output_path, config=root_path + "config.json")

vocabulary = ""
with open(root_path +"vocab.txt", "r") as vocabulary_file:
    for line in vocabulary_file.readlines()[5:]:
        vocabulary += line.replace("\n", "") + " "
        
embedding_matrix = model.embeddings.word_embeddings.weight.to("cpu").detach().numpy()

pickle.dump(embedding_matrix, open("vocab_embedding.pkl", "wb")) ## FINAL_FILE

Figure 6

GROVER learns token context and genome annotation

Figure 6A

Self-similarity per token sequence as extracted by cosine similarity of the same token in different contexts throughout the 12 transformer layers.
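
The core computation, applied per token and per layer in the pipeline below, is the mean pairwise cosine similarity of a token's contextual embeddings; a minimal sketch with a toy array:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

toy_embeddings = np.random.rand(100, 768)               # 100 occurrences of one token at one layer
sim = cosine_similarity(toy_embeddings)
upper = sim[np.triu_indices(len(toy_embeddings), k=1)]  # pairwise values, excluding the diagonal
print(upper.mean())                                     # self-similarity of this token at this layer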

def collate(examples: List[torch.Tensor]):
    if tokenizer._pad_token is None:
        return pad_sequence(examples, batch_first=True)
    return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

eval_sampler = SequentialSampler(test_dataset)
eval_dataloader = DataLoader(
    test_dataset, sampler=eval_sampler, batch_size=32, collate_fn=collate)

config = BertConfig.from_pretrained(root_path + "config.json", output_hidden_states=True)
model = BertModel.from_pretrained(output_path, config=config).to("cuda:0")
model.eval()
os.makedirs("embeddings_test", exist_ok=True)
count = 0
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    with torch.no_grad():
        batch = batch.to("cuda:0")
        outputs = model(batch)
        # hidden states of the embedding layer plus the 12 transformer layers
        embeddings = torch.stack(list(outputs.hidden_states), dim=0).detach().cpu().numpy()
        pickle.dump(embeddings, open("embeddings_test/"+str(count)+".pkl", "wb"))
        del embeddings
        count += 1


## Group the embeddings per token
tokens = []
with open(root_path +"vocab.txt") as vocab_file:
    for line in vocab_file.readlines():
        tokens.append(line.replace("\n", ""))

tokens_to_get = np.asarray([[7] + list(range(9, len(tokens)))][0])

total_samples = 5000

for layer in range(1,13):
    token_embeddings = [[] for _ in range(len(tokens_to_get))]
    token_counts = [0 for _ in range(len(tokens_to_get))]
    count = 0
    for batch in eval_dataloader:
        batch = batch.numpy()
        if not os.path.exists("embeddings_test/"+str(count)+".pkl"):
            break
        embeddings = pickle.load(open("embeddings_test/"+str(count)+".pkl", "rb"))[layer]
        for t_idx, token_id in enumerate(tokens_to_get):
            if token_counts[t_idx] >= total_samples:
                print("Token ",tokens[token_id], "is full")
                continue   
            condition = np.nonzero(batch == token_id)
            times = len(condition[0])
            if times > 0:   
                token_embeddings[t_idx].append(embeddings[condition])
                token_counts[t_idx] += times


        del embeddings
        count += 1
        all_full = True
        for t_idx in range(len(tokens_to_get)):
            if token_counts[t_idx] < total_samples:
                all_full = False
                break
        if all_full:
            break
    for t_idx, token_id in enumerate(tokens_to_get):
        print("Saving file for token ", tokens[token_id])
        pickle.dump(np.vstack(token_embeddings[t_idx]), open(str(token_id)+"_layer_"+str(layer)+".pkl", "wb"))
        
from sklearn.metrics.pairwise import cosine_similarity

self_sim_per_token = np.zeros((len(tokens_to_get), 12))
for t_idx, token_id in enumerate(tokens_to_get):
    for l_idx, layer in enumerate(range(1,13)):
        token_embeddings = pickle.load(open(str(token_id)+"_layer_"+str(layer)+".pkl", "rb"))[:5000]
        sim = cosine_similarity(token_embeddings)
        upper = sim[np.triu_indices(len(token_embeddings), k = 1)]
        self_sim_per_token[t_idx, l_idx] = upper.mean()
        
        
output_file_path = "self_similarity_per_token.csv" ## FINAL_FILE
with open(output_file_path, "w") as output:
    header = "token_id,token,"
    for k in range(12):
        header += "layer"+str(k + 1)+","
    header = header[:-1] 
    output.write(header + "\n")
    for t_idx, t in enumerate(tokens_to_get):
        token = tokens[t]
        line_to_write = str(t) +","+ token +","
        for k in range(12):
            line_to_write += str(self_sim_per_token[t_idx,k]) + ","
        output.write(line_to_write[:-1] + "\n")

Figure 7

GROVER outperforms models with fixed-size k-mers for biological fine-tuning tasks

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import os
import sys
import transformers
from transformers import BertModel, BertTokenizer
import sklearn.metrics as metrics
## Helper classes
class GroverDataSet(Dataset):
    def __init__(self, sequences, y, pretrained_model_path, max_length=512):
        print("Loading GROVER Dataset")
        self.BERTtokenizer = BertTokenizer.from_pretrained(pretrained_model_path) # to convert ATG ATCGA CG -> [CLS] ATG ATCGA CG [SEP] -> [2, 123, 456, 789, 101, 3]
        self.sequences = sequences
        self.max_length = max_length
        self.y = np.array(y, dtype=np.float32)
        
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        seq = self.sequences[idx]
        tokenizer_res = self.BERTtokenizer.encode_plus(seq, add_special_tokens=True, padding="max_length", return_tensors="pt", max_length=self.max_length, truncation=True)
        ids = tokenizer_res["input_ids"]
        attention_masks = tokenizer_res["attention_mask"]
        return ids, self.y[idx], attention_masks, idx

class KmerDataSet(Dataset):
    def __init__(self, sequences, y, pretrained_model_path, kmer, max_length=512):
        print(f"Loading {kmer}mer Dataset")
        self.BERTtokenizer = BertTokenizer.from_pretrained(pretrained_model_path) # to convert ATG ATC GGA CGC -> [CLS] ATG ATC GGA CGC [SEP] -> [2, 123, 456, 789, 101, 3]
        #print(sequences[:2])
        self.sequences = ["".join(seq) for seq in sequences]
        print(self.sequences[0][:10])
        # split into non-overlapping kmers
        kmer_seqs = []
        for i in range(len(self.sequences)):
            seq = self.sequences[i]
            kmer_seq = []
            for j in range(0, len(seq), kmer):
                kmer_seq.append(seq[j:j+kmer])
            kmer_seqs.append(kmer_seq)
        self.sequences = kmer_seqs
        self.max_length = max_length
        self.y = np.array(y, dtype=np.float32)

        print(len(self.sequences))

    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        seq = self.sequences[idx]
        tokenizer_res = self.BERTtokenizer.encode_plus(seq, add_special_tokens=True, padding="max_length", return_tensors="pt", max_length=self.max_length, truncation=True)
        ids = tokenizer_res["input_ids"]
        attention_masks = tokenizer_res["attention_mask"]
        return ids, self.y[idx], attention_masks, idx

class OverlappingKmerDataSet(Dataset):
    def __init__(self, sequences, y, pretrained_model_path, kmer, max_length=512):
        print(f"Loading overlapping {kmer}mer Dataset")
        self.BERTtokenizer = BertTokenizer.from_pretrained(pretrained_model_path)# to convert ATG TGC GCC CCA -> [CLS] ATG TGC GCC CCA [SEP] -> [2, 123, 456, 789, 101, 3]
        self.sequences = ["".join(seq) for seq in sequences]
        print(self.sequences[0][:10])
        # split into overlapping kmers
        kmer_seqs = []
        for i in range(len(self.sequences)):
            seq = self.sequences[i]
            kmer_seq = []
            for j in range(len(seq)-kmer+1):
                kmer_seq.append(seq[j:j+kmer])
            kmer_seqs.append(kmer_seq)
        self.sequences = kmer_seqs
        self.max_length = max_length
        self.y = np.array(y, dtype=np.float32)
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        seq = self.sequences[idx]
        tokenizer_res = self.BERTtokenizer.encode_plus(seq, add_special_tokens=True, padding="max_length", return_tensors="pt", max_length=self.max_length, truncation=True)
        ids = tokenizer_res["input_ids"]
        attention_masks = tokenizer_res["attention_mask"]
        return ids, self.y[idx], attention_masks, idx
def predict(args):
    checkpoint_path = args.get("checkpoint_path")
    model_path = args.get("model_path")
    test_data = args.get("test_data")
    dataset_type = args.get("dataset_type")
    max_length = args.get("max_length")
    kmer = args.get("kmer")
    batch_size = args.get("batch_size")

    if max_length is None:
        max_length = 512

    device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
    print("device: ", device)

    if device == "cuda":
        torch.set_float32_matmul_precision('medium')

    print(f"Loading test data from : {test_data}")
    test_data = pd.read_csv(test_data, sep="\t", converters={"X": eval, "y": eval})
    print("Data loaded")

    y_test = test_data["y"].values
    
    if dataset_type == "BP":
        test = GroverDataSet(test_data["X"].values, y_test, model_path, max_length=max_length)
    elif dataset_type == "OverlappingKmers":
        test = OverlappingKmerDataSet(test_data["X"].values, y_test, model_path, max_length=max_length, kmer=kmer)
    elif dataset_type == "Kmer":
        test = KmerDataSet(test_data["X"].values, y_test, model_path, max_length=max_length, kmer=kmer)
    else:
        print("dataset type has to be one of: BP, OverlappingKmers or Kmer.\nAborting script.")
        sys.exit(1)
    print(test.sequences[0][:10])
    print(test.y[0])

    output_head = nn.Linear(768, 1)

    print("Loading model")
    model = LitGROVER_for_binary_classification.load_from_checkpoint(
            checkpoint_path,
            map_location=device,
            grover_path=model_path,
            output_head=output_head
            ).eval()
    model.to(device)
    print("Model loaded")

    test_loader = DataLoader(test, batch_size=batch_size, shuffle=False, num_workers=0)

    print("Starting testing")
    all_probs = []
    with torch.no_grad():
        for batch in test_loader:
            ids, y, attention_masks, idx = batch
            ids = ids.to(device)
            attention_masks = attention_masks.to(device)
            y = y.unsqueeze(1).to(device)
            idx = idx.to(device)
            logits = model(ids, attention_masks)
            probs = torch.sigmoid(logits)
            for prob in probs.cpu().numpy():
                all_probs.append(prob[0])
        test_data["probs"] = all_probs
    return test_data[["y", "probs"]]
import torch
from torch import nn
from transformers import BertModel
import pytorch_lightning as pl
import torchmetrics
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.optim as optim

class LitGROVER_for_binary_classification(pl.LightningModule):
    def __init__(self, grover_path="../../models/Grover/GROVER_full/", loss_fn=nn.BCEWithLogitsLoss(), output_attentions=False, dropout=0.3, lr=0.0005, output_head=None, positive_class=1):
        super().__init__()
        self.output_attentions = output_attentions
        self.lr = lr
        self.positive_class = positive_class
        self.train_accuracy = torchmetrics.Accuracy(task="binary")
        self.train_precision = torchmetrics.Precision(task="binary")
        self.train_recall = torchmetrics.Recall(task="binary")
        self.train_f1score = torchmetrics.F1Score(task="binary", average="macro")
        self.val_accuracy = torchmetrics.Accuracy(task="binary")
        self.val_precision = torchmetrics.Precision(task="binary")
        self.val_recall = torchmetrics.Recall(task="binary")
        self.val_f1score = torchmetrics.F1Score(task="binary", average="macro")
        self.grover = BertModel.from_pretrained(grover_path, output_attentions=self.output_attentions)
        self.dropout = nn.Dropout(dropout)
        self.loss_fn = loss_fn
        if output_head is None:
            self.output_head = nn.Sequential(
                nn.Linear(768, 2)
            )
        else:
            self.output_head = output_head

    def forward(self, x, attention_masks=None):
        x = x.squeeze(1)
        outputs = self.grover(x, attention_mask=attention_masks)
        if self.output_attentions:
            x, h, attentions = outputs[0], outputs[1], outputs[2]
        else:
            x, h = outputs[0], outputs[1]
        
        # calculate output logits
        x = self.dropout(x)
        x = torch.mean(x, dim=1)
        logits = self.output_head(x)
        
        if self.output_attentions:
            return logits, attentions, x
        else:
            return logits

    def training_step(self, batch, batch_idx):
        # training_step defines the train loop.
        # it is independent of forward
        x, y, attention_masks, _ = batch
        y=y.unsqueeze(1)#.squeeze()#.to(torch.int64)
        #print(y.shape)
        
        if self.output_attentions:
            logits, attentions, _ = self.forward(x, attention_masks)
        else:
            logits = self.forward(x, attention_masks)
        probs = torch.sigmoid(logits)
        loss = self.loss_fn(logits, y)

        #probs_positive_class = probs[:, self.positive_class]
        probs_positive_class = probs

        acc = self.train_accuracy(probs_positive_class, y)
        precision = self.train_precision(probs_positive_class, y)
        recall = self.train_recall(probs_positive_class, y)
        f1score = self.train_f1score(probs_positive_class, y)

        # Logging to TensorBoard (if installed) by default
        self.log_dict({"train_loss": loss, "train_acc": acc, "train_recall": recall, "train_precision": precision, "train_f1score": f1score, "lr": self.lr}, logger=True, on_step=True, on_epoch=True)
        #self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        # this is the test loop
        x, y, attention_masks, _ = batch
        y=y.unsqueeze(1)#.squeeze()#.to(torch.int64)
        #print(y.shape)
        
        if self.output_attentions:
            logits, attentions, _ = self.forward(x, attention_masks)
        else:
            logits = self.forward(x, attention_masks)

        #probs = self.softmax(logits)
        probs = torch.sigmoid(logits)
        loss = self.loss_fn(logits, y)

        #probs_positive_class = probs[:, self.positive_class]
        probs_positive_class = probs

        
        acc = self.val_accuracy(probs_positive_class, y)
        precision = self.val_precision(probs_positive_class, y)
        recall = self.val_recall(probs_positive_class, y)
        f1score = self.val_f1score(probs_positive_class, y)
        # Logging to TensorBoard (if installed) by default
        self.log_dict({"val_loss": loss, "val_acc": acc, "val_recall": recall, "val_precision": precision, "val_f1score": f1score}, logger=True, on_epoch=True)
        #self.log("val_loss", loss, )
        return loss

    # The ReduceLROnPlateau scheduler requires a monitor
    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.lr)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": ReduceLROnPlateau(optimizer, mode="max", patience=15, verbose=True, factor=0.5),
                "monitor": "val_f1score",
                "frequency": 1
                # If "monitor" references validation metrics, then "frequency" should be set to a
                # multiple of "trainer.check_val_every_n_epoch".
            },
        }

Figure 7B

Performance for the Prom300 task of GROVER versus models of fixed-size k-mers, with shuffling based on GROVER tokenization and on the tokenization that matches the respective benchmarking models. Metrics are given as F1 score, precision, recall, and Matthews Correlation Coefficient (MCC).

Dataset

Annotations come from: https://epd.epfl.ch/human/human_database.php?db=human

root_path = "finetuning_tasks/Prom300/"
annotations = pd.read_csv(root_path+'promoterRangesHg19.bed', sep='\t', header=None)
annotations.columns = ['chr', 'start', 'end', 'name', 'score', 'strand']
annotations.sort_values(by=['chr', 'start'], inplace=True)

annotations["start"] = annotations.apply(lambda x: x["start"]-249 if x["strand"] == "+" else x["start"]-50, axis=1)
annotations["end"] = annotations.apply(lambda x: x["end"]+50 if x["strand"] == "+" else x["end"]+249, axis=1)


# load BP chromosome mapper
def load_mapper(BP_chr):
    # create mapper that maps nucleotide position to BP position
    mapper = []
    curr_BP_pos = 0
    for i in range(len(BP_chr)):
        for e in range(len(BP_chr[i])):
            mapper.append(i)
    return mapper
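
As a toy example of what the mapper contains (assuming BP_chr is a flat list of token strings for one chromosome), every nucleotide position maps to the index of the token that covers it:

# Toy example, not part of the original pipeline
toy_BP_chr = ["AT", "GCA", "T"]
print(load_mapper(toy_BP_chr))  # [0, 0, 1, 1, 1, 2]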


# iterate through each chromosome
BP_seqs = []
for chrom in annotations["chr"].unique():
    chrom_nr = chrom[3:]
    if chrom_nr == "X":
        chrom_nr = 23
    elif chrom_nr == "Y":
        chrom_nr = 24
    # slice the annotations for the current chromosome
    annotations_chr = annotations.loc[annotations["chr"] == chrom]
    # load BP tokenized chromosome
    with open(f"tokenized_chromosomes/chr_{chrom_nr}.pkl", "rb") as f:
        BP_chr = pickle.load(f)
    
    # load mapper that maps nucleotide position to BP position
    print(f"loading mapper {chrom}")
    mapper = load_mapper(BP_chr)

    BP_starts = annotations.loc[annotations["chr"] == chrom]["start"].apply(lambda x: mapper[x])
    BP_ends = annotations.loc[annotations["chr"] == chrom]["end"].apply(lambda x: mapper[x])
    for start, end in zip(BP_starts, BP_ends):
        BP_seqs.append(BP_chr[start:end])

annotations["BPseq"] = BP_seqs
annotations["sequence"] = annotations["BPseq"].apply(lambda x: list("".join(x)))
Random mutate

Preprocessing like https://github.com/egochao/DeePromoter

Procedure for creating the negative dataset as described in the paper:

Step 1: Break the sequence into N parts (20 as in the paper).

Old Step 2: Randomly choose M parts of the original sequence to keep, and randomly initialize the rest.

New Step 2: Randomly choose M parts of the original sequence to keep, and shuffle the rest around (this preserves sequence attributes such as GC content and AG balance).
# function that returns the indices of all elements of k chunks of size (len(lst) // k) in a list
def get_chunks(lst, k):
    chunkSize = len(lst) // k
    indices = []
    for i in range(0, len(lst), chunkSize):
        indices.append([e for e in range(i, min(i + chunkSize, len(lst)))]) # min is used to avoid index out of bounds
    return indices

def choose_random_chunks(chunks, nrOfChunksToMutate):
    random_indices = np.random.choice(len(chunks), nrOfChunksToMutate, replace=False)
    return [chunks[i] for i in random_indices]

def unroll_chunks(chunks):
    return [e for chunk in chunks for e in chunk]

def shuffleIndices(indices):
    shuffleIndices = indices.copy()
    np.random.shuffle(shuffleIndices)
    return shuffleIndices

def random_mutate(seq, k, nrOfChunksToMutate):
    seq = seq.copy()
    original = seq.copy()
    chunks = get_chunks(seq, k=k)
    chunksToMutate = choose_random_chunks(chunks, nrOfChunksToMutate=nrOfChunksToMutate)
    indicesToMutate = unroll_chunks(chunksToMutate)
    shuffledIndicesToMutate = shuffleIndices(indicesToMutate)
    # read from the untouched copy so that positions already overwritten are not reused
    for x, y in zip(indicesToMutate, shuffledIndicesToMutate):
        seq[x] = original[y]
    return seq
Shuffle GROVER sequence
seq = annotations.loc[1, 'BPseq']
nrOfChunks = 8 # in how many splits do we want to divide the sequence
nrOfChunksToMutate = 6 # how many of these splits do we want to mutate (randomly shuffle)

annotations["BPseqMutated"] = annotations["BPseq"].apply(lambda x: random_mutate(x, nrOfChunks, nrOfChunksToMutate))

data_non_mutated = pd.DataFrame({"X": annotations["BPseq"], "y": [1] * len(annotations), "start": annotations["start"], "end": annotations["end"], "name": annotations["name"], "strand": annotations["strand"], "chr": annotations["chr"]})
data_mutated = pd.DataFrame({"X": annotations["BPseqMutated"], "y": [0] * len(annotations), "start": annotations["start"], "end": annotations["end"], "name": annotations["name"], "strand": annotations["strand"], "chr": annotations["chr"]})
data = pd.concat([data_non_mutated, data_mutated], ignore_index=True)
# split data into train, val, test: 80%, 10%, 10%
train = data.sample(frac=0.8, random_state=42)
val_test = data.drop(train.index)
val = val_test.sample(frac=0.5, random_state=42)
test = val_test.drop(val.index)

train.to_csv(root_path + "train.tsv", index=False, sep='\t')
val.to_csv(root_path + "validate.tsv", index=False, sep='\t')
test.to_csv(root_path + "test.tsv", index=False, sep='\t')

Shuffle kmers

def kmerize(seq, k):
    return ["".join(seq[i:i+k]) for i in range(0, len(seq)-k+1, k)]

annotations["4mer_seq"] = annotations["sequence"].apply(lambda x: kmerize(x, 4))
annotations["5mer_seq"] = annotations["sequence"].apply(lambda x: kmerize(x, 5))
annotations["6mer_seq"] = annotations["sequence"].apply(lambda x: kmerize(x, 6))

seq = annotations.loc[1, '4mer_seq']
nrOfChunks = 8 # in how many splits do we want to divide the sequence
nrOfChunksToMutate = 6 # how many of these splits do we want to mutate (randomly shuffle)

shuffled_seq = random_mutate(seq, nrOfChunks, nrOfChunksToMutate)
nrOfChunks = 8 # in how many splits do we want to divide the sequence
nrOfChunksToMutate = 6 # how many of these splits do we want to mutate (randomly shuffle)

for kmer in [4,5,6]:
    annotations[f"{kmer}mer_seq_mut"] = annotations[f"{kmer}mer_seq"].apply(lambda x: random_mutate(x, nrOfChunks, nrOfChunksToMutate))
for kmer in [4,5,6]:
    data_non_mutated = pd.DataFrame({"X": annotations[f"{kmer}mer_seq"], "y": [1] * len(annotations), "start": annotations["start"], "end": annotations["end"], "name": annotations["name"], "strand": annotations["strand"], "chr": annotations["chr"]})
    data_mutated = pd.DataFrame({"X": annotations[f"{kmer}mer_seq_mut"], "y": [0] * len(annotations), "start": annotations["start"], "end": annotations["end"], "name": annotations["name"], "strand": annotations["strand"], "chr": annotations["chr"]})
    data = pd.concat([data_non_mutated, data_mutated], ignore_index=True)

    # split data into train, val, test: 80%, 10%, 10%
    train = data.sample(frac=0.8, random_state=42)
    val_test = data.drop(train.index)
    val = val_test.sample(frac=0.5, random_state=42)
    test = val_test.drop(val.index)

    train.to_csv(f"{root_path}train_{kmer}mer.tsv", index=False, sep='\t')
    val.to_csv(f"{root_path}val_{kmer}mer.tsv", index=False, sep='\t')
    test.to_csv(f"{root_path}test_{kmer}mer.tsv", index=False, sep='\t')

Prediction

GROVER shuffle
## GROVER
args = {}

args["checkpoint_path"] = root_path + "grover_epoch_04-val_loss_0.02.ckpt"
args["model_path"] = "GROVER_pretrained/"
args["test_data"] = root_path + "test.tsv"
args["dataset_type"] = "BP"
args["max_length"] = 100
args["batch_size"] = 4

grover_res = predict(args)

## 4mer
args = {}

args["checkpoint_path"] = root_path + "4mer_epoch_18-val_loss_0.25.ckpt"
args["model_path"] = "../../kmer/4mer/"
args["test_data"] = root_path + "test.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 4
args["max_length"] = 300//args["kmer"] + 10
args["batch_size"] = 4

res_4mer = predict(args)

## 5mer
args = {}

args["checkpoint_path"] = root_path + "5mer_epoch_15-val_loss_0.29.ckpt"
args["model_path"] = "../../kmer/5mer/"
args["test_data"] = root_path + "test.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 5
args["max_length"] = 300//args["kmer"] + 10
args["batch_size"] = 4

res_5mer = predict(args)

## 6mer
args = {}

args["checkpoint_path"] = root_path + "6mer_epoch_28-val_loss_0.36.ckpt"
args["model_path"] = "../../kmer/6mer/"
args["test_data"] = root_path + "test.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 6
args["max_length"] = 300//args["kmer"] + 10
args["batch_size"] = 4

res_6mer = predict(args)
k-mer shuffle
## 4mer
args = {}

args["checkpoint_path"] = root_path + "4mer_4mer_shuffle_epoch_19-val_loss_0.42.ckpt"
args["model_path"] = "../../kmer/4mer/"
args["test_data"] = root_path + "test_4mer.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 4
args["max_length"] = 300//args["kmer"] + 10
args["batch_size"] = 4

res_4mer_4mer_shuffle = predict(args)

## 5mer
args = {}

args["checkpoint_path"] = root_path + "5mer_5mer_shuffle_epoch_02-val_loss_0.58.ckpt"
args["model_path"] = "../../kmer/5mer/"
args["test_data"] = root_path + "test_5mer.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 5
args["max_length"] = 300//args["kmer"] + 10
args["batch_size"] = 4

res_5mer_5mer_shuffle = predict(args)

## 6mer
args = {}

args["checkpoint_path"] = root_path + "6mer_6mer_shuffle_epoch_05-val_loss_0.67.ckpt"
args["model_path"] = "../../kmer/6mer/"
args["test_data"] = root_path + "test_6mer.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 6
args["max_length"] = 300//args["kmer"] + 10
args["batch_size"] = 4

res_6mer_6mer_shuffle = predict(args)
BP_shuffle = pd.read_csv(root_path + "test.tsv", sep="\t", converters={"X": eval, "y": eval})
shuffle_4mer = pd.read_csv(root_path + "test_4mer.tsv", sep="\t", converters={"X": eval, "y": eval})
shuffle_5mer = pd.read_csv(root_path + "test_5mer.tsv", sep="\t", converters={"X": eval, "y": eval})
shuffle_6mer = pd.read_csv(root_path + "test_6mer.tsv", sep="\t", converters={"X": eval, "y": eval})

test_dfs = [
    BP_shuffle,
    shuffle_4mer,
    shuffle_5mer,
    shuffle_6mer
]


preds_dfs = [
    grover_res,
    res_4mer_4mer_shuffle,
    res_5mer_5mer_shuffle,
    res_6mer_6mer_shuffle
]

names = [
    "GROVER",
    "4mer",
    "5mer",
    "6mer"
]

for test_df, preds_df, name in zip(test_dfs, preds_dfs, names):
    preds_df["preds"] = preds_df["probs"].apply(lambda x: 1 if x > 0.5 else 0)
    test_df[name] = preds_df["preds"]

results = {"model_name": [], "f1": [], "precision": [], "recall": [], "mcc": []}

for name, test_df in zip(names, test_dfs):
    print(name)
    results["model_name"].append(name)
    results["f1"].append(metrics.f1_score(test["y"], test_df[name]))
    results["precision"].append(metrics.precision_score(test_df["y"], test_df[name]))
    results["recall"].append(metrics.recall_score(test_df["y"], test_df[name]))
    results["mcc"].append(metrics.matthews_corrcoef(test_df["y"], test_df[name]))

    
metrics_df = pd.DataFrame(results) ## FINAL_FILE

Figure 7D

Performance on the Prom-scan task for GROVER versus models with fixed-size k-mers. Metrics are given as F1 score, precision, recall, and Matthews correlation coefficient (MCC).

Dataset

Promoter annotations come from: https://epd.epfl.ch/human/human_database.php?db=human

root_path = "finetuning_tasks/PromScan/"
annotations = pd.read_csv(root_path + 'promoterRangesHg19.bed', sep='\t', header=None)
annotations.columns = ['chr', 'start', 'end', 'name', 'score', 'strand']

annotations["tss"] = annotations["start"].copy()
annotations["start"] = annotations["start"] - 5000
annotations["end"] = annotations["end"] + 5000
## Split long sequences in overlapping windows of size 1001 nucleotides and stride of 300 nucl.

def split_in_overlapping_windows(start, end, window_size, stride, tss_pos, chr, strand, name):
    windows = []
    for i in range(start, end, stride):
        this_start = i
        this_end = i + window_size
        this_window = {"start": this_start, "end": this_end, "chr": chr, "is_tss": this_start <= tss_pos <= this_end, "strand": strand, "name": name}
        windows.append(this_window)
    return windows
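
To make the windowing concrete, here is a toy call with invented coordinates (a 2 kb region, window size 1001, stride 300); only the values are illustrative, the function is the one defined above.

# Toy example (coordinates invented for illustration)
toy_windows = split_in_overlapping_windows(
    start=10000, end=12000, window_size=1001, stride=300,
    tss_pos=10500, chr="chr21", strand="+", name="toy_promoter")
for w in toy_windows[:3]:
    print(w["start"], w["end"], w["is_tss"])
# 10000 11001 True
# 10300 11301 True
# 10600 11601 False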

stacked_windows = annotations.apply(lambda x: split_in_overlapping_windows(x["start"], x["end"], 1001, 300, x["tss"], x["chr"], x["strand"], x["name"]), axis=1)


windows_dict = {"start": [], "end": [], "chr": [], "is_tss": [], "strand": [], "name": []}

for sample in stacked_windows:
    for window in sample:
        windows_dict["start"].append(window["start"])
        windows_dict["end"].append(window["end"])
        windows_dict["chr"].append(window["chr"])
        windows_dict["is_tss"].append(window["is_ tss"])
        windows_dict["strand"].append(window["strand"])
        windows_dict["name"].append(window["name"])
        
window_df = pd.DataFrame(windows_dict)
# load BP chromosome mapper
def load_mapper(BP_chr):
    # create mapper that maps nucleotide position to BP position
    mapper = []
    for i in range(len(BP_chr)):
        for e in range(len(BP_chr[i])):
            mapper.append(i)
    return mapper
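
As a small illustration of the mapper's data structure (the token list below is invented), every nucleotide position is mapped to the index of the GROVER token that covers it:

# Toy tokenized chromosome (invented): three tokens covering 2 + 4 + 3 = 9 nucleotides
toy_BP_chr = ["AC", "GTAC", "GTA"]
toy_mapper = load_mapper(toy_BP_chr)
print(toy_mapper)     # [0, 0, 1, 1, 1, 1, 2, 2, 2]
print(toy_mapper[3])  # nucleotide position 3 lies inside token index 1 ("GTAC")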


# iterate through each chromosome
BP_seqs = []
for chrom in annotations["chr"].unique():
    chrom_nr = chrom[3:]
    if chrom_nr == "X":
        chrom_nr = 23
    elif chrom_nr == "Y":
        chrom_nr = 24
    # slice the annotations for the current chromosome
    annotations_chr = window_df.loc[window_df["chr"] == chrom]
    # load BP tokenized chromosome
    with open(f"tokenized_chromosomes/chr_{chrom_nr}.pkl", "rb") as f:
        BP_chr = pickle.load(f)
    
    # load mapper that maps nucleotide position to BP position
    print(f"loading mapper {chrom}")
    mapper = load_mapper(BP_chr)

    BP_starts = annotations_chr["start"].apply(lambda x: mapper[x])
    BP_ends = annotations_chr["end"].apply(lambda x: mapper[x])
    for start, end in zip(BP_starts, BP_ends):
        BP_seqs.append(BP_chr[start:end])

window_df["BPseq"] = BP_seqs
window_df["sequence"] = annotations["BPseq"].apply(lambda x: list("".join(x)))

window_df.rename(columns={"BPseq": "X", "is_tss": "y"}, inplace=True)
# split into train, val, test: 80/10/10
train, validate, test = np.split(window_df.sample(frac=1, random_state=42), [int(.8*len(window_df)), int(.9*len(window_df))])
train.to_csv(root_path + "train.tsv", index=False, sep="\t")
validate.to_csv(root_path + "validate.tsv", index=False, sep="\t")
test.to_csv(root_path + "test.tsv", index=False, sep="\t")

Prediction

## GROVER
args = {}

args["checkpoint_path"] = root_path + "grover_epoch_18-val_loss_0.18.ckpt"
args["model_path"] = "GROVER_pretrained/"
args["test_data"] = root_path + "test.tsv"
args["dataset_type"] = "BP"
args["max_length"] = 256
args["batch_size"] = 4

grover_res = predict(args)

## 4mer
args = {}

args["checkpoint_path"] = root_path + "4mer_epoch_12-val_loss_0.20.ckpt"
args["model_path"] = "../../kmer/4mer/"
args["test_data"] = root_path + "test.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 4
args["max_length"] = 1000//args["kmer"] + 10
args["batch_size"] = 4

res_4mer = predict(args)

## 5mer
args = {}

args["checkpoint_path"] = root_path + "5mer_epoch_10-val_loss_0.20.ckpt"
args["model_path"] = "../../kmer/5mer/"
args["test_data"] = root_path + "test.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 5
args["max_length"] = 1000//args["kmer"] + 10
args["batch_size"] = 4

res_5mer = predict(args)

## 6mer
args = {}

args["checkpoint_path"] = root_path + "6mer_epoch_13-val_loss_0.21.ckpt"
args["model_path"] = "../../kmer/6mer/"
args["test_data"] = root_path + "test.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 6
args["max_length"] = 1000//args["kmer"] + 10
args["batch_size"] = 4

res_6mer = predict(args)
test = pd.read_csv(root_path + "test.tsv", sep="\t", converters={"X": eval, "y": eval})

preds_dfs = [
    grover_res,
    res_4mer,
    res_5mer,
    res_6mer
]

names = [
    "GROVER",
    "4mer",
    "5mer",
    "6mer"
]

for preds_df, name in zip(preds_dfs, names):
    preds_df["preds"] = preds_df["probs"].apply(lambda x: 1 if x > 0.5 else 0)
    test[name] = preds_df["preds"]

results = {"model_name": [], "f1": [], "precision": [], "recall": [], "mcc": []}

for name in names:
    print(name)
    results["model_name"].append(name)
    results["f1"].append(metrics.f1_score(test["y"], test[name]))
    results["precision"].append(metrics.precision_score(test["y"], test[name]))
    results["recall"].append(metrics.recall_score(test["y"], test[name]))
    results["mcc"].append(metrics.matthews_corrcoef(test["y"], test[name]))

    
metrics_df = pd.DataFrame(results) ## FINAL_FILE

Figure 7F

Performance on the CTCF-motif-binding task for GROVER versus models with fixed-size k-mers. Metrics are given as F1 score, precision, recall, and Matthews correlation coefficient (MCC).

Dataset

CTCF peaks come from: https://www.encodeproject.org/experiments/ENCSR000BIE/

root_path = "finetuning_tasks/TF_binding/"
peaks = pd.read_csv(root_path + 'ENCFF915BIE.bed', sep='\t', header=None)
motif_sites  = pd.read_csv(root_path + 'CTCF_motif_sites.gff', sep='\t', header=None, skiprows=1)

Run in the command line (bedtools intersectBed with -loj performs a left outer join, so every motif is reported and the peak columns are filled with '.' where no peak overlaps):

  • intersectBed -loj -a CTCF_motif_sites.gff -b ENCFF915BIE.bed > CTCF_motifs_with_peak_annotation.bed
motifs_with_peaks = pd.read_csv(root_path + 'CTCF_motifs_with_peak_annotation.bed', sep='\t', header=None)
## Getting the center of motif
motifs_with_peaks["center_of_motif"] = motifs_with_peaks[3] + (motifs_with_peaks[4] - motifs_with_peaks[3])//2

## Get 1kb area around motif
motifs_with_peaks["start_of_bin"] = motifs_with_peaks["center_of_motif"] - 500
motifs_with_peaks["end_of_bin"] = motifs_with_peaks["center_of_motif"] + 500
motifs_with_peaks["width"] = motifs_with_peaks["end_of_bin"] - motifs_with_peaks["start_of_bin"]

## Add target annotation column for machine learning task
motifs_with_peaks["y"] = motifs_with_peaks[9].apply(lambda x: 1 if x != '.' else 0)
## Retrieve GROVER tokens

data = motifs_with_peaks[[0, "start_of_bin", "end_of_bin", "y"]]
data = data.loc[data[0].apply(lambda x: len(x) <= 5)]  # drop non-standard contigs such as chr1_gl000191_random
data = data.sort_values(by=[0, "start_of_bin"], inplace=False)
def load_mapper(BP_chr):
    # create mapper that maps nucleotide position to BP position
    mapper = []
    for i in range(len(BP_chr)):
        for e in range(len(BP_chr[i])):
            mapper.append(i)
    return mapper

# iterate through each chromosome
BP_seqs = []
for chrom in data[0].unique():
    chrom_nr = chrom[3:]
    if chrom_nr == "X":
        chrom_nr = 23
    elif chrom_nr == "Y":
        chrom_nr = 24
    # slice the data for the current chromosome
    data_chr = data.loc[data[0] == chrom]
    # load BP tokenized chromosome
    with open(f"tokenized_chromosomes/chr_{chrom_nr}.pkl", "rb") as f:
        BP_chr = pickle.load(f)
    
    # load mapper that maps nucleotide position to BP position
    print(f"loading mapper {chrom}")
    mapper = load_mapper(BP_chr)

    BP_starts = data.loc[data[0] == chrom]["start_of_bin"].apply(lambda x: mapper[x])
    BP_ends = data.loc[data[0] == chrom]["end_of_bin"].apply(lambda x: mapper[x])
    for start, end in zip(BP_starts, BP_ends):
        BP_seqs.append(BP_chr[start:end])
data["X"] = BP_seqs

## create train val test (80/10/10)
train = data.sample(frac=0.8, random_state=0)
val = data.drop(train.index)
test = val.sample(frac=0.5, random_state=0)
val = val.drop(test.index)

train.to_csv(root_path + "CTCF_train.tsv", index=False, sep="\t")
val.to_csv(root_path + "CTCF_val.tsv", index=False, sep="\t")
test.to_csv(root_path + "CTCF_test.tsv", index=False, sep="\t")

Prediction

## GROVER
args = {}

args["checkpoint_path"] = root_path + "grover_epoch_18-val_loss_0.45.ckpt"
args["model_path"] = "GROVER_pretrained/"
args["test_data"] = root_path + "CTCF_test.tsv"
args["dataset_type"] = "BP"
args["max_length"] = 310
args["batch_size"] = 4

grover_res = predict(args)

## 4mer
args = {}

args["checkpoint_path"] = root_path + "4mer_epoch_29-val_loss_0.48.ckpt"
args["model_path"] = "../../kmer/4mer/"
args["test_data"] = root_path + "CTCF_test.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 4
args["max_length"] = 1000//args["kmer"] + 10
args["batch_size"] = 4          

res_4mer = predict(args)

## 5mer
args = {}

args["checkpoint_path"] = root_path + "5mer_epoch_23-val_loss_0.53.ckpt"
args["model_path"] = "../../kmer/5mer/"
args["test_data"] = root_path + "CTCF_test.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 5
args["max_length"] = 1000//args["kmer"] + 10
args["batch_size"] = 4
            
res_5mer = predict(args)

## 6mer
args = {}

args["checkpoint_path"] = root_path + "6mer_epoch_19-val_loss_0.56.ckpt"
args["model_path"] = "../../kmer/6mer/"
args["test_data"] = root_path + "CTCF_test.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 6
args["max_length"] = 1000//args["kmer"] + 10
args["batch_size"] = 4

res_6mer = predict(args)
test = pd.read_csv(root_path + "CTCF_test.tsv", sep="\t", converters={"X": eval, "y": eval})

preds_dfs = [
    grover_res,
    res_4mer,
    res_5mer,
    res_6mer
]

names = [
    "GROVER",
    "4mer",
    "5mer",
    "6mer"
]

for preds_df, name in zip(preds_dfs, names):
    preds_df["preds"] = preds_df["probs"].apply(lambda x: 1 if x > 0.5 else 0)
    test[name] = preds_df["preds"]

results = {"model_name": [], "f1": [], "precision": [], "recall": [], "mcc": []}

for name in names:
    print(name)
    results["model_name"].append(name)
    results["f1"].append(metrics.f1_score(test["y"], test[name]))
    results["precision"].append(metrics.precision_score(test["y"], test[name]))
    results["recall"].append(metrics.recall_score(test["y"], test[name]))
    results["mcc"].append(metrics.matthews_corrcoef(test["y"], test[name]))

    
metrics_df = pd.DataFrame(results) ## FINAL_FILE