GROVER ("Genome Rules Obtained Via Extracted Representations") is a foundation DNA language model with an optimized vocabulary for the human genome.
If you have any questions/requests/comments please contact melissa.sanabria@tu-dresden.de or arpoetsch@gmail.com
The preprint that is associated with this script can be found at: https://www.biorxiv.org/content/10.1101/2023.07.19.549677v1
A tutorial on how to use GROVER as a foundation model can be found at: https://doi.org/10.5281/zenodo.8373159
The pretrained model can be found at: https://doi.org/10.5281/zenodo.8373117
The data for the tokenised genome are at: https://doi.org/10.5281/zenodo.8373053
Introduction¶
Scripts to reproduce each figure can be found in the respective R Markdown document. The required input files, which can be produced with this Jupyter notebook, are also provided separately there.
The following code shows how every data file was generated. You will find a flag FINAL_FILE that indicates the generation of a file that will be used in the R code.
Figure 1¶
Figure 1 includes the tokenization results, which are derived from the code for Figure 2.
Figure 2¶
Performance-based selection of the vocabulary identifies 600 cycles of Byte-Pair Tokenization as optimal.
Figure 2A¶
Selection of the optimal vocabulary through accuracy of next-token prediction as a fine-tuning task for the foundation models, using prediction of 2 to 6 nucleotide long next-k-mers as readout. Depicted is accuracy with a loess fit and 95% confidence interval.
root_path = "chr21/"
import random
import glob
import pickle
import os
import math
from itertools import product
from typing import List
import numpy as np
import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm
from transformers import (get_linear_schedule_with_warmup, AdamW, BertTokenizer, BertConfig,
                          BertForMaskedLM, BertForSequenceClassification, BertModel,
                          DataCollatorForLanguageModeling, Trainer, TrainingArguments,
                          PreTrainedTokenizer)
## Dataset: 400,000 sequences from chromosome 21 for training and 100,000 sequences from chromosome 21 for testing.
train_file_path = root_path + "train.csv"
test_file_path = root_path + "test.csv"
## Vocabulary information per iteration
vocabs_file_path = root_path + "vocabs_chr21.pkl"
## Tokenize sequences
def find_indices(seq, first, second):
    # find indices of all first tokens of the most common bigram
ixs = {}
ix = 0
    just_added = False  # avoid overlapping merges: AAA is tokenized as AA A, not as AA AA
for ch1, ch2 in zip(seq, seq[1:]):
if ((ch1 == first) and (ch2 == second)) and not just_added:
ixs[ix] = 1
just_added = True
else:
just_added = False
ix += 1
return ixs
def merge_tokens(seq, ixs, first, second):
# merge most common tokens inplace at the first token (remove second token later)
new_token = first + second
for i in ixs.keys():
seq[i] = new_token
    # remove the token that was merged into its predecessor
seq = [x for i, x in enumerate(seq) if i-1 not in ixs]
return seq
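As a quick sanity check, here is a toy application of the two helpers above; the sequence and the merge rule ("A" + "T") are made up for illustration:
# Toy example: apply a single merge rule to a character-level sequence
seq = list("GATATTAC")                   # ['G', 'A', 'T', 'A', 'T', 'T', 'A', 'C']
ixs = find_indices(seq, "A", "T")        # {1: 1, 3: 1} -> first positions of non-overlapping "AT"
seq = merge_tokens(seq, ixs, "A", "T")
print(seq)                               # ['G', 'AT', 'AT', 'T', 'A', 'C']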
vocabs = pickle.load(open(vocabs_file_path, "rb"))
for iteration in vocabs.keys():
iteration_path = root_path + str(iteration)+"/"
if not os.path.exists(iteration_path):
os.mkdir(iteration_path)
else:
continue
data_path = iteration_path + "data/"
if not os.path.exists(data_path):
os.mkdir(data_path)
for set_name in ["train", "test"]:
with open(root_path + set_name + ".txt", "r") as samples_file:
with open(data_path + set_name + ".txt", "w") as iter_set_file:
for sample in samples_file.readlines():
tokenized_sequence = list(sample.replace("\n", ""))
for token, info in vocabs[iteration].items():
merge_rule = info[1]
ixs = find_indices(tokenized_sequence, merge_rule[0], merge_rule[1])
tokenized_sequence = merge_tokens(tokenized_sequence, ixs, merge_rule[0], merge_rule[1])
sample = " ".join(tokenized_sequence[:512])
iter_set_file.write(sample + "\n")
The following code is an example for one of the vocabulary iterations. You can repeat it for each of them.
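To run all vocabularies instead of a single one, the same cells can be wrapped in a loop. A minimal hedged sketch, where run_iteration is a hypothetical placeholder for the pre-training and next-k-mer fine-tuning cells shown below for iteration 600:
# Hypothetical driver loop over all vocabulary iterations (illustration only)
for iteration in sorted(vocabs.keys()):
    iteration_path = root_path + str(iteration) + "/"
    with open(iteration_path + "vocab.txt", "w") as vocab_file:
        vocab_file.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\n")
        for key, value in vocabs[iteration].items():
            vocab_file.write(value[0] + "\n")
    run_iteration(iteration_path)  # placeholder: pre-training + fine-tuning for this vocabulary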
iteration = 600
iteration_path = root_path + str(iteration)+"/"
with open(iteration_path +"vocab.txt", "w") as vocab_file:
vocab_file.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\n")
for key, value in vocabs[iteration].items():
vocab_file.write(value[0] + "\n")
tokenizer = BertTokenizer.from_pretrained(iteration_path +"vocab.txt")
class LineByLineTextDataset(Dataset):
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size=512):
with open(file_path, encoding="utf-8") as f:
lines = f.read().splitlines()[:-1]
self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]
def __len__(self):
return len(self.examples)
def __getitem__(self, i):
return torch.tensor(self.examples[i], dtype=torch.long)
test_dataset = LineByLineTextDataset(tokenizer, root_path + str(iteration)+"/data/test.txt")
train_dataset = LineByLineTextDataset(tokenizer, root_path + str(iteration)+"/data/train.txt")
mlm_probability = 0.022
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm_probability=mlm_probability)
model = BertForMaskedLM(config=BertConfig.from_json_file(root_path + "config.json"))
output_path = ""
training_args = TrainingArguments(
    output_dir=output_path,
    gradient_accumulation_steps=25,
    per_gpu_train_batch_size=10,
    per_gpu_eval_batch_size=6,
    save_steps=500,
    save_total_limit=20,
    max_steps=20000,
    learning_rate=4e-4,
    adam_epsilon=1e-6,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.98,
    warmup_steps=1000,
    evaluate_during_training=True,
)
# block_size=512 is handled by the dataset and mlm_probability=0.022 by the data collator;
# with max_steps set, num_train_epochs does not need to be specified.
# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)
trainer.train()
trainer.save_model()
perplexity_history = {}
step = 500
for log_history in trainer.state.log_history:
    if 'eval_loss' in log_history.keys():
        perplexity_history[step] = math.exp(log_history['eval_loss'])
        step += 500
pickle.dump(perplexity_history, open(root_path + "perplexity.pkl", "wb"))
Fine-tuning on next-k-mer¶
We choose the training step with the best performance (lowest perplexity) on the test set.
Here we show an example for predicting the next 2-mer. The process for the other next-k-mers is the same.
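To pick that checkpoint programmatically, a minimal sketch (assuming the default Hugging Face checkpoint-&lt;step&gt; folder naming) could be:
# Hedged sketch: select the checkpoint with the lowest test-set perplexity
perplexity_history = pickle.load(open(root_path + "perplexity.pkl", "rb"))
best_step = min(perplexity_history, key=perplexity_history.get)
best_checkpoint = os.path.join(output_path, "checkpoint-" + str(best_step))  # assumes default Trainer naming
print(best_step, perplexity_history[best_step], best_checkpoint)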
## Creation of the next-k-mer dataset
next_kmer = 2
nucleotides = ["A", "C", "G", "T"]
# map every possible next-k-mer to a class index (16 classes for k = 2)
kmer_vocab = {}
for count, kmer_tuple in enumerate(product(nucleotides, repeat=next_kmer)):
    kmer_vocab["".join(kmer_tuple)] = count
grover_data = pd.read_csv(root_path + str(iteration)+"/data/test.txt", header=None, names=["sequence"])
grover_data_next_kmer = pd.DataFrame()
grover_data_next_kmer["sequence"] = grover_data["sequence"].apply(lambda x: x.split(" ")[:50])
grover_data_next_kmer["next_kmer"] = grover_data["sequence"].apply(lambda x: "".join(x.split(" ")[50:])[:next_kmer])
grover_data_next_kmer["class"] = grover_data_next_kmer["next_kmer"].apply(lambda x: kmer_vocab[x])
test_data_grover = pd.DataFrame({'X': grover_data_next_kmer["sequence"], 'class': grover_data_next_kmer["class"]})
grover_data = pd.read_csv(root_path + str(iteration)+"/data/train.txt", header=None, names=["sequence"])
grover_data_next_kmer = pd.DataFrame()
grover_data_next_kmer["sequence"] = grover_data["sequence"].apply(lambda x: x.split(" ")[:50])
grover_data_next_kmer["next_kmer"] = grover_data["sequence"].apply(lambda x: "".join(x.split(" ")[50:])[:next_kmer])
grover_data_next_kmer["class"] = grover_data_next_kmer["next_kmer"].apply(lambda x: kmer_vocab[x])
train_data_grover = pd.DataFrame({'X': grover_data_next_kmer["sequence"], 'class': grover_data_next_kmer["class"]})
class GroverDataSet(Dataset):
def __init__(self, sequences, y, tokenizer, max_length=50):
print("Loading GROVER Dataset")
self.BERTtokenizer = tokenizer # to convert ATG ATCGA CG -> [CLS] ATG ATCGA CG [SEP] -> [2, 123, 456, 789, 101, 3]
self.sequences = sequences
self.max_length = max_length
self.y = np.array(y, dtype=np.float32).reshape(-1, 1)
self.label_encoder = OneHotEncoder(sparse_output=False, categories=[list(range(16))])
self.label_encoder.fit(self.y)
self.y = self.label_encoder.transform(self.y)
self.y = torch.tensor(self.y)
print(self.y.shape)
def __len__(self):
return len(self.sequences)
def __getitem__(self, idx):
seq = self.sequences[idx]
tokenizer_res = self.BERTtokenizer.encode_plus(seq, add_special_tokens=True, padding="max_length", return_tensors="pt", max_length=self.max_length, truncation=True)
ids = tokenizer_res["input_ids"].squeeze(0)
attention_masks = tokenizer_res["attention_mask"]
return ids, self.y[idx], attention_masks, idx
grover_tokenizer = BertTokenizer.from_pretrained(output_path)
train_grover = GroverDataSet(train_data_grover["X"].values, train_data_grover["class"], grover_tokenizer, max_length=50)
test_grover = GroverDataSet(test_data_grover["X"].values, test_data_grover["class"], grover_tokenizer, max_length=50)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
grover = BertForSequenceClassification.from_pretrained(output_path, num_labels=len(kmer_vocab)).to(device)
output_path_next_kmer = ""
training_args = TrainingArguments(
    output_dir=output_path_next_kmer,
    num_train_epochs=100,
    learning_rate=1e-6,
    per_gpu_train_batch_size=16,
    per_gpu_eval_batch_size=16,
    save_steps=1000,
    logging_steps=1000,
    evaluate_during_training=True,
)
def compute_metrics(pred):
    labels = pred.label_ids.argmax(axis=1)  # labels are one-hot encoded in GroverDataSet
    preds = pred.predictions.argmax(axis=1)
    return {
        'accuracy': (preds == labels).mean()
    }
# Initialize our Trainer
trainer = Trainer(
    model=grover,
    args=training_args,
    train_dataset=train_grover,
    eval_dataset=test_grover,
    tokenizer=grover_tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()
trainer.save_model()
accuracy_history = {}
step = 1000
for log_history in trainer.state.log_history:
    if 'eval_accuracy' in log_history.keys():
        accuracy_history[step] = log_history['eval_accuracy']
        step += 1000
pickle.dump(accuracy_history, open(output_path_next_kmer + "accuracy.pkl", "wb"))
## Creating performance file
results_dict = {}
for iteration in [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000]:
    results_dict[iteration] = {}
    for n_mer in range(2, 7):
        # output_path_next_kmer points to the fine-tuning output of this iteration / n_mer combination
        results_dict[iteration][n_mer] = pickle.load(open(output_path_next_kmer + "accuracy.pkl", "rb"))
performance_to_save = {}
for idx, n_mer in enumerate(range(2, 7)):
    performance_to_save[n_mer] = {}
    for iteration in [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000]:
        performance_to_save[n_mer][iteration] = np.max(list(results_dict[iteration][n_mer].values()))
df = pd.DataFrame.from_dict(performance_to_save)
df.to_csv("performance.csv", index=False)
## FINAL_FILE
Figure 2B¶
Performance comparison using accuracy of next-k-mer prediction as a fine-tuning task. Compared is GROVER with 600 cycles of Byte-Pair Tokenization against models based on k-mer tokenization with lengths of 4, 5, and 6 nucleotides.
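For intuition, a toy comparison (sequence made up; the byte-pair tokens shown are invented for the example) of fixed-size k-mer tokenization versus a learned byte-pair vocabulary:
# Toy illustration only
seq = "ACGTACGGTTAC"
kmer4 = [seq[i:i + 4] for i in range(0, len(seq) - 3, 4)]
print(kmer4)  # ['ACGT', 'ACGG', 'TTAC'] -- fixed-size, non-overlapping
# A byte-pair vocabulary instead produces variable-length tokens,
# e.g. ['ACGT', 'AC', 'GGTT', 'AC'], depending on the learned merge rules.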
letters = ['A', 'C', 'G', 'T']
for kmer in [4, 5, 6]:
kmer_path = root_path + str(kmer)+"/"
if not os.path.exists(kmer_path):
os.mkdir(kmer_path)
else:
continue
    vocab = product(letters, repeat=kmer)
    with open(kmer_path + "vocab.txt", "w") as vocab_file:
        vocab_file.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\n")
        for token in vocab:
            vocab_file.write("".join(token) + "\n")
data_path = kmer_path + "data/"
if not os.path.exists(data_path):
os.mkdir(data_path)
for set_name in ["train", "test"]:
with open(root_path + set_name + ".txt", "r") as samples_file:
with open(data_path + set_name + ".txt", "w") as iter_set_file:
for sample in samples_file.readlines():
new_line = ""
count = 0
for init in range(0, len(sample) - kmer, kmer):
new_line += sample[init: init + kmer] + " "
count += 1
if count > 510:
break
new_line = " ".join(new_line[:-1].split(" "))
iter_set_file.write(new_line + "\n")
We show how to do this for one k-mer; it can be repeated for the others.
kmer = 4
kmer_path = root_path + str(kmer)+"/"
tokenizer = BertTokenizer.from_pretrained(kmer_path +"vocab.txt")
class LineByLineTextDataset(Dataset):
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size=512):
with open(file_path, encoding="utf-8") as f:
lines = f.read().splitlines()[:-1]
self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]
def __len__(self):
return len(self.examples)
def __getitem__(self, i):
return torch.tensor(self.examples[i], dtype=torch.long)
test_dataset = LineByLineTextDataset(tokenizer, kmer_path+"/data/test.txt")
train_dataset = LineByLineTextDataset(tokenizer, kmer_path+"/data/train.txt")
mlm_probability = 0.022
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm_probability=mlm_probability)
model = BertForMaskedLM(config=BertConfig.from_json_file(root_path + "config_4mer.json"))
output_path = ""
training_args = TrainingArguments(
    output_dir=output_path,
    gradient_accumulation_steps=25,
    per_gpu_train_batch_size=10,
    per_gpu_eval_batch_size=6,
    save_steps=500,
    save_total_limit=20,
    max_steps=20000,
    learning_rate=4e-4,
    adam_epsilon=1e-6,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.98,
    warmup_steps=1000,
    evaluate_during_training=True,
)
# block_size=512 is handled by the dataset and mlm_probability=0.022 by the data collator;
# with max_steps set, num_train_epochs does not need to be specified.
# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)
trainer.train()
trainer.save_model()
perplexity_history = {}
step = 500
for log_history in trainer.state.log_history:
    if 'eval_loss' in log_history.keys():
        perplexity_history[step] = math.exp(log_history['eval_loss'])
        step += 500
pickle.dump(perplexity_history, open(root_path + "perplexity.pkl", "wb"))
Fine-tuning on next-k-mers¶
We choose the training step with the best performance on the test set.
Here we show an example for predicting the next 2-mer. The process for the other next-k-mers is the same.
## Creation of the next-k-mers dataset
next_kmer = 2
nucleotides = ["A", "C", "G", "T"]
# map every possible next-k-mer to a class index (16 classes for k = 2)
kmer_vocab = {}
for count, kmer_tuple in enumerate(product(nucleotides, repeat=next_kmer)):
    kmer_vocab["".join(kmer_tuple)] = count
kmer_data = pd.read_csv(kmer_path+"/data/test.txt", header=None, names=["sequence"])
kmer_data_next_kmer = pd.DataFrame()
kmer_data_next_kmer["sequence"] = kmer_data["sequence"].apply(lambda x: x.split(" ")[:50])
kmer_data_next_kmer["next_kmer"] = kmer_data["sequence"].apply(lambda x: "".join(x.split(" ")[50:])[:next_kmer])
kmer_data_next_kmer["class"] = kmer_data_next_kmer["next_kmer"].apply(lambda x: kmer_vocab[x])
test_data_kmer = pd.DataFrame({'X': kmer_data_next_kmer["sequence"], 'class': kmer_data_next_kmer["class"]})
kmer_data = pd.read_csv(kmer_path+"/data/train.txt", header=None, names=["sequence"])
kmer_data_next_kmer = pd.DataFrame()
kmer_data_next_kmer["sequence"] = kmer_data["sequence"].apply(lambda x: x.split(" ")[:50])
kmer_data_next_kmer["next_kmer"] = kmer_data["sequence"].apply(lambda x: "".join(x.split(" ")[50:])[:next_kmer])
kmer_data_next_kmer["class"] = kmer_data_next_kmer["next_kmer"].apply(lambda x: kmer_vocab[x])
train_data_kmer = pd.DataFrame({'X': kmer_data_next_kmer["sequence"], 'class': kmer_data_next_kmer["class"]})
class GroverDataSet(Dataset):
def __init__(self, sequences, y, tokenizer, max_length=50):
print("Loading GROVER Dataset")
self.BERTtokenizer = tokenizer # to convert ATG ATCGA CG -> [CLS] ATG ATCGA CG [SEP] -> [2, 123, 456, 789, 101, 3]
self.sequences = sequences
self.max_length = max_length
self.y = np.array(y, dtype=np.float32).reshape(-1, 1)
self.label_encoder = OneHotEncoder(sparse_output=False, categories=[list(range(16))])
self.label_encoder.fit(self.y)
self.y = self.label_encoder.transform(self.y)
self.y = torch.tensor(self.y)
print(self.y.shape)
def __len__(self):
return len(self.sequences)
def __getitem__(self, idx):
seq = self.sequences[idx]
tokenizer_res = self.BERTtokenizer.encode_plus(seq, add_special_tokens=True, padding="max_length", return_tensors="pt", max_length=self.max_length, truncation=True)
ids = tokenizer_res["input_ids"].squeeze(0)
attention_masks = tokenizer_res["attention_mask"]
return ids, self.y[idx], attention_masks, idx
kmer_tokenizer = BertTokenizer.from_pretrained(output_path)
train_kmer = GroverDataSet(train_data_kmer["X"].values, train_data_kmer["class"], kmer_tokenizer, max_length=50)
test_kmer = GroverDataSet(test_data_kmer["X"].values, test_data_kmer["class"], kmer_tokenizer, max_length=50)
kmer_model = BertForSequenceClassification.from_pretrained(output_path, num_labels=len(kmer_vocab)).to(device)
output_path_next_kmer = ""
training_args = TrainingArguments(
    output_dir=output_path_next_kmer,
    num_train_epochs=100,
    learning_rate=1e-6,
    per_gpu_train_batch_size=16,
    per_gpu_eval_batch_size=16,
    save_steps=1000,
    logging_steps=1000,
    evaluate_during_training=True,
)
def compute_metrics(pred):
    labels = pred.label_ids.argmax(axis=1)  # labels are one-hot encoded in GroverDataSet
    preds = pred.predictions.argmax(axis=1)
    return {
        'accuracy': (preds == labels).mean()
    }
# Initialize our Trainer
trainer = Trainer(
    model=kmer_model,
    args=training_args,
    train_dataset=train_kmer,
    eval_dataset=test_kmer,
    tokenizer=kmer_tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()
trainer.save_model()
accuracy_history = {}
step = 1000
for log_history in trainer.state.log_history:
    if 'eval_accuracy' in log_history.keys():
        accuracy_history[step] = log_history['eval_accuracy']
        step += 1000
pickle.dump(accuracy_history, open(output_path_next_kmer + "accuracy.pkl", "wb"))
## Creating performance file
results_dict = {}
for iteration in [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000]:
    results_dict[iteration] = {}
    for n_mer in range(2, 7):
        # output_path_next_kmer points to the fine-tuning output of this iteration / n_mer combination
        results_dict[iteration][n_mer] = pickle.load(open(output_path_next_kmer + "accuracy.pkl", "rb"))
performance_to_save = {}
for idx, n_mer in enumerate(range(2, 7)):
    performance_to_save[n_mer] = {}
    for iteration in [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000]:
        performance_to_save[n_mer][iteration] = np.max(list(results_dict[iteration][n_mer].values()))
df = pd.DataFrame.from_dict(performance_to_save)
df.to_csv("performance.csv", index=False)
## FINAL_FILE
Figure 2C¶
Performance assessment of GROVER with 600 cycles of Byte-Pair Tokenization, using accuracy for the masked token being predicted as the TOP 1 token, up to TOP 60, i.e. the TOP 10%.
First, we explain how we obtained the data from the whole reference genome. Then we show how to train GROVER.
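As a reminder of what is computed, a small hedged example of TOP-k accuracy on toy data, using the same sklearn helper as in the evaluation below:
# Toy illustration: the true token counts as correct if it is among the k highest-scoring tokens
import numpy as np
from sklearn.metrics import top_k_accuracy_score
labels = np.array([0, 2, 1])
probs = np.array([[0.6, 0.3, 0.1],
                  [0.2, 0.3, 0.5],
                  [0.5, 0.4, 0.1]])
print(top_k_accuracy_score(labels, probs, k=1, labels=np.arange(3)))  # 0.666... (TOP 1)
print(top_k_accuracy_score(labels, probs, k=2, labels=np.arange(3)))  # 1.0 (TOP 2)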
root_path = "wg/"
The tokenized chromosomes can be found in the folder tokenized_chromosomes. In case you want to know how they were generated, these are the steps to tokenize the whole genome.
import random
import glob
import pickle
import os
import argparse
from Bio import SeqIO
def n_intervals(seq):
    # collapse all ambiguity codes to N and return the intervals of sequence
    # that lie between runs of N (i.e. the usable, fully resolved segments)
    for n_code in ["R", "Y", "S", "W", "K", "M", "B", "D", "H", "V"]:
        seq = seq.replace(n_code, "N")
    N_positions = [i for i in range(len(seq)) if seq.startswith("N", i)]
    N_intervals = []
    for idx in range(1, len(N_positions)):
        if N_positions[idx] != (N_positions[idx - 1] + 1):
            N_intervals.append([N_positions[idx - 1] + 1, N_positions[idx]])
    return N_intervals
def find_indices(seq, first, second):
    # find indices of all first tokens of the most common bigram
ixs = {}
ix = 0
    just_added = False  # avoid overlapping merges: AAA is tokenized as AA A, not as AA AA
for ch1, ch2 in zip(seq, seq[1:]):
if ((ch1 == first) and (ch2 == second)) and not just_added:
ixs[ix] = 1
just_added = True
else:
just_added = False
ix += 1
return ixs
def merge_tokens(seq, ixs, first, second):
# merge most common tokens inplace at the first token (remove second token later)
new_token = first + second
for i in ixs.keys():
seq[i] = new_token
    # remove the token that was merged into its predecessor
seq = [x for i, x in enumerate(seq) if i-1 not in ixs]
return seq
for chrom in range(1, 25):
chrom_file = "seq_chr_"+str(chrom)+".fasta" ## Fasta file per chromosome
chrom_seq = SeqIO.read(chrom_file, "fasta").seq
these_n_intervals = n_intervals(chrom_seq)
sequences = []
for interval in these_n_intervals:
sequences.append(chrom_seq[interval[0] : interval[1]])
    vocab_iter_file = root_path + "vocab_info.pkl"
vocab_iter = pickle.load(open(vocab_iter_file, "rb"))
tok_chrom = []
for sequence in sequences:
tokenized_sequence = list(sequence)
for key, value in vocab_iter.items():
rule = value[1]
ixs = find_indices(tokenized_sequence, rule[0], rule[1])
tokenized_sequence = merge_tokens(tokenized_sequence, ixs, rule[0], rule[1])
tok_chrom.append(tokenized_sequence)
pickle.dump(tok_chrom, open("chr_"+str(chrom)+".pkl", "wb"))
You can access the train and test data from the root folder.
In case you want to know how we generated the data, this is the code.
## Create windows
import random
import pickle
from random import shuffle
windows = []
for chrom in range(1, 25):
tok_chrom = pickle.load(open("tokenized_chromosomes/chr_"+str(chrom)+".pkl", "rb"))
for s, sequence in enumerate(tok_chrom):
length_seq = len(sequence)
window_start = 0
remaining_length = length_seq - window_start
while remaining_length > 510:
prob = random.uniform(0, 1)
if prob < 0.5:
length = 510
else:
length = random.randint(20, 510)
windows.append([chrom, s, window_start, window_start + length])
window_start = window_start + length
remaining_length = length_seq - window_start
shuffle(windows)
train_length = int(0.8 * len(windows))  # train_length is not defined in the original notebook; an 80/20 split is assumed here
windows_per_chromosome = {}
for w, window in enumerate(windows):
    chrom = window[0]
    if chrom not in windows_per_chromosome:
        windows_per_chromosome[chrom] = {}
        windows_per_chromosome[chrom]["train"] = []
        windows_per_chromosome[chrom]["test"] = []
    if w > train_length:
        split = "test"
    else:
        split = "train"
    windows_per_chromosome[chrom][split].append(window[1:])
samples_per_chromosome = {}
train_factor = 3
test_factor = 2
nb_train_windows = 0
nb_test_windows = 0
for chrom in windows_per_chromosome.keys():
    for split in windows_per_chromosome[chrom].keys():
        if split == "train":
            factor = train_factor
        if split == "test":
            factor = test_factor
        new_windows = []
        for window in windows_per_chromosome[chrom][split]:
            window_start = window[-2]
            window_end = window[-1]
            window_length = window_end - window_start
            if window_length > 50:
                for i in range(factor):
                    random_start = random.randint(window_start + 1, window_end - 20)
                    random_end = random.randint(random_start + 20, window_end)
                    new_windows.append([window[0], random_start, random_end])
        windows_per_chromosome[chrom][split] += new_windows
with open(root_path + "train.txt", "w") as train_file:
    for chrom in range(1, 25):
        tok_chrom = pickle.load(open("chr_"+str(chrom)+".pkl", "rb"))
        for window in windows_per_chromosome[chrom]["train"]:
            sample = tok_chrom[window[0]][window[1]: window[2]]
            train_file.write(" ".join(sample) + "\n")
with open(root_path + "test.txt", "w") as test_file:
    for chrom in range(1, 25):
        tok_chrom = pickle.load(open("chr_"+str(chrom)+".pkl", "rb"))
        for window in windows_per_chromosome[chrom]["test"]:
            sample = tok_chrom[window[0]][window[1]: window[2]]
            test_file.write(" ".join(sample) + "\n")
## Training GROVER
# vocabs and iteration (= 600) come from the vocabulary selection above (Figure 2A)
with open(root_path + "vocab.txt", "w") as vocab_file:
    vocab_file.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\n")
    for key, value in vocabs[iteration].items():
        vocab_file.write(value[0] + "\n")
tokenizer = BertTokenizer.from_pretrained(root_path +"vocab.txt")
class LineByLineTextDataset(Dataset):
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size=512):
with open(file_path, encoding="utf-8") as f:
lines = f.read().splitlines()[:-1]
self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]
def __len__(self):
return len(self.examples)
def __getitem__(self, i):
return torch.tensor(self.examples[i], dtype=torch.long)
test_dataset = LineByLineTextDataset(tokenizer, root_path + "/test.txt")
train_dataset = LineByLineTextDataset(tokenizer, root_path + "/train.txt")
mlm_probability = 0.022
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm_probability=mlm_probability)
model = BertForMaskedLM(config=BertConfig.from_json_file(root_path + "config.json"))
output_path = ""
training_args = TrainingArguments(
    output_dir=output_path,
    gradient_accumulation_steps=25,
    per_gpu_train_batch_size=10,
    per_gpu_eval_batch_size=6,
    save_steps=500,
    save_total_limit=20,
    max_steps=20000,
    learning_rate=4e-4,
    adam_epsilon=1e-6,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.98,
    warmup_steps=1000,
    evaluate_during_training=True,
)
# block_size=512 is handled by the dataset and mlm_probability=0.022 by the data collator;
# with max_steps set, num_train_epochs does not need to be specified.
# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)
trainer.train()
trainer.save_model()
perplexity_history = {}
step = 500
for log_history in trainer.state.log_history:
    if 'eval_loss' in log_history.keys():
        perplexity_history[step] = math.exp(log_history['eval_loss'])
        step += 500
pickle.dump(perplexity_history, open(root_path + "perplexity.pkl", "wb"))
## Prediction of a masked token per sample
from sklearn.metrics import top_k_accuracy_score
from scipy.special import softmax
def collate(examples: List[torch.Tensor]):
if tokenizer._pad_token is None:
return pad_sequence(examples, batch_first=True)
return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
eval_sampler = SequentialSampler(test_dataset)
eval_dataloader = DataLoader(
test_dataset, sampler=eval_sampler, batch_size=32, collate_fn=collate)
# BertForMaskedLM returns vocabulary logits, which the TOP-k evaluation below requires
model = BertForMaskedLM.from_pretrained(output_path, config=root_path + "config.json")
model.to("cuda:0")
model.eval()
predictions = []
labels = []
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    with torch.no_grad():
        # mask exactly one randomly chosen (non-special, non-padding) token per sample
        probability_matrix = torch.rand(batch.shape)
        special_tokens_mask = [
            tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in batch.tolist()]
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
        if tokenizer._pad_token is not None:
            padding_mask = batch.eq(tokenizer.pad_token_id)
            probability_matrix.masked_fill_(padding_mask, value=0.0)
        token_per_sample = torch.argmax(probability_matrix, dim=1)
        # record the true token ids before they are overwritten with the mask token
        true_tokens = batch[torch.arange(batch.shape[0]), token_per_sample].numpy()
        batch[torch.arange(batch.shape[0]), token_per_sample] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
        batch = batch.to("cuda:0")
        outputs = model(batch)
        prediction = outputs[0].detach().cpu().numpy()[np.arange(batch.shape[0]), token_per_sample.numpy()]
        predictions.append(softmax(prediction, axis=-1))
        labels.append(true_tokens)
labels = np.hstack(labels)
predictions = np.vstack(predictions)
top_k = np.zeros(300)
for k in range(1, 301):
    top_k[k-1] = top_k_accuracy_score(labels, predictions, k=k, labels=np.arange(609))
    print(k, top_k[k-1])
pickle.dump(top_k, open(root_path + "top_k.pkl", "wb")) ## FINAL_FILE
pickle.dump(labels, open(root_path + "labels.pkl", "wb"))
pickle.dump(predictions, open(root_path + "predictions.pkl", "wb"))
Figure 2D¶
Performance assessment of GROVER with 600 cycles of Byte-Pair Tokenization using perplexity, divided by the total number of words in the dictionary. Comparison with models based on k-mer tokenization, with lengths of 4, 5, and 6 nucleotides.
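There is no dedicated code cell for this panel; the values can be derived from the perplexity files saved above. A minimal hedged sketch (vocabulary size is read from the respective vocab.txt):
# Hedged sketch: best perplexity normalized by vocabulary size
def normalized_perplexity(perplexity_file, vocab_file):
    perplexity_history = pickle.load(open(perplexity_file, "rb"))
    best = min(perplexity_history.values())
    with open(vocab_file) as f:
        vocab_size = len(f.read().splitlines())
    return best / vocab_size
# e.g. normalized_perplexity(root_path + "perplexity.pkl", root_path + "vocab.txt")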
Figure 3¶
The frequency balanced GROVER vocabulary shows differential learning performance by token length
To generate Figure 3F and Figure 3G we need the metrics per token.
from sklearn.metrics import roc_auc_score
def roc_auc_score_multiclass(actual_class, pred_class, average = "weighted"):
#creating a set of all the unique classes using the actual class list
unique_class = list(set(actual_class))
roc_auc_dict = {}
for per_class in unique_class:
#creating a list of all the classes except the current class
other_class = [x for x in unique_class if x != per_class]
#marking the current class as 1 and all other classes as 0
new_actual_class = [0 if x in other_class else 1 for x in actual_class]
new_pred_class = [0 if x in other_class else 1 for x in pred_class]
#using the sklearn metrics method to calculate the roc_auc_score
roc_auc = roc_auc_score(new_actual_class, new_pred_class, average = average)
roc_auc_dict[per_class] = roc_auc
return roc_auc_dict
labels = pickle.load(open(root_path + "labels.pkl", "rb"))
predictions = pickle.load(open(root_path + "predictions.pkl", "rb"))
acc_per_token = np.zeros((609, 300))
for k in range(300):
    for t in range(609):
        this_token_ids = np.where(labels == t)[0]
        if len(this_token_ids) > 0:
            this_token_labels = labels[this_token_ids]
            this_token_predictions = predictions[this_token_ids]
            acc_per_token[t, k] = top_k_accuracy_score(this_token_labels, this_token_predictions, k=k + 1, labels=np.arange(609))
# the multiclass AUC helper expects predicted class labels, hence argmax over token probabilities
auc_per_token = roc_auc_score_multiclass(labels, predictions.argmax(axis=1), "macro")
output_file_path = "metrics_per_token.csv" ## FINAL_FILE
# token strings in vocabulary order (same file as used for the tokenizer)
tokens = []
with open(root_path + "vocab.txt") as vocab_file:
    for line in vocab_file.readlines():
        tokens.append(line.replace("\n", ""))
with open(output_file_path, "w") as output:
    header = "token_id,token,auc,"
    for k in range(1, 301):
        header += "top" + str(k) + ","
    header = header[:-1]  ## Remove trailing comma
    output.write(header + "\n")
    for t, token in enumerate(tokens):
        auc = 0
        if t in auc_per_token:
            auc = auc_per_token[t]
        line_to_write = str(t) + "," + token + "," + str(auc) + ","
        for k in range(300):
            line_to_write += str(acc_per_token[t, k]) + ","
        output.write(line_to_write[:-1] + "\n")
Figure 4¶
Average GROVER token embedding shows learning of genome information content
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
with open(root_path + "train.txt") as train_file:
    data = train_file.readlines()[:-1]
sent = [row.split() for row in data[:300000]]
w2v_model = Word2Vec(sent, min_count=1, vector_size=768, window=5, workers=10)
pickle.dump(w2v_model.wv.vectors, open(root_path + "iter600_w2v.pkl", "wb")) ## FINAL_FILE
with open(root_path + "iter600_w2v_vocab.txt", "w") as vocab_file: ## FINAL_FILE
for word in w2v_model.wv.index_to_key:
vocab_file.write(word + "\n")
model = BertModel.from_pretrained(output_path, config=root_path + "config.json")
vocabulary = ""
with open(root_path +"vocab.txt", "r") as vocabulary_file:
for line in vocabulary_file.readlines()[5:]:
vocabulary += line.replace("\n", "") + " "
embedding_matrix = model.embeddings.word_embeddings.weight.to("cpu").detach().numpy()
pickle.dump(embedding_matrix, open("vocab_embedding.pkl", "wb")) ## FINAL_FILE
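PCA is imported above, but the projection itself is produced in the R scripts. If you want to inspect it here, a minimal hedged sketch could be:
# Hedged sketch: 2-D PCA of the learned token embedding matrix (inspection only)
pca = PCA(n_components=2)
embedding_2d = pca.fit_transform(embedding_matrix)
print(embedding_2d.shape, pca.explained_variance_ratio_)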
Figure 6¶
GROVER learns token context and genome annotation
Figure 6A¶
Self-similarity per token sequence as extracted by cosine similarity of the same token in different contexts throughout the 12 transformer layers.
def collate(examples: List[torch.Tensor]):
if tokenizer._pad_token is None:
return pad_sequence(examples, batch_first=True)
return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
eval_sampler = SequentialSampler(test_dataset)
eval_dataloader = DataLoader(
test_dataset, sampler=eval_sampler, batch_size=32, collate_fn=collate)
model = BertModel.from_pretrained(output_path, config=root_path + "config.json")
model.to("cuda:0")
model.eval()
count = 0
for batch in tqdm(eval_dataloader, desc="Evaluating"):
with torch.no_grad():
batch = batch.to("cuda:0")
outputs = model(batch)
embeddings = torch.stack(list(outputs[1]), dim=0).detach().cpu().numpy()
pickle.dump(embeddings, open("embeddings_test/"+str(count)+".pkl", "wb"))
del embeddings
count += 1
## Group the embeddings per token
tokens = []
with open(root_path +"vocab.txt") as vocab_file:
for line in vocab_file.readlines():
tokens.append(line.replace("\n", ""))
tokens_to_get = np.asarray([7] + list(range(9, len(tokens))))
total_samples = 5000
for layer in range(1,13):
token_embeddings = [[] for _ in range(len(tokens_to_get))]
token_counts = [0 for _ in range(len(tokens_to_get))]
count = 0
for batch in eval_dataloader:
batch = batch.numpy()
if not os.path.exists("embeddings_test/"+str(count)+".pkl"):
break
embeddings = pickle.load(open("embeddings_test/"+str(count)+".pkl", "rb"))[layer]
for t_idx, token_id in enumerate(tokens_to_get):
if token_counts[t_idx] >= total_samples:
print("Token ",tokens[token_id], "is full")
continue
condition = np.nonzero(batch == token_id)
times = len(condition[0])
if times > 0:
token_embeddings[t_idx].append(embeddings[condition])
token_counts[t_idx] += times
del embeddings
count += 1
all_full = True
for t_idx in range(len(tokens_to_get)):
if token_counts[t_idx] < total_samples:
all_full = False
break
if all_full:
break
for t_idx, token_id in enumerate(tokens_to_get):
print("Saving file for token ", tokens[token_id])
pickle.dump(np.vstack(token_embeddings[t_idx]), open(str(token_id)+"_layer_"+str(layer)+".pkl", "wb"))
from sklearn.metrics.pairwise import cosine_similarity
self_sim_per_token = np.zeros((len(tokens_to_get), 12))
for t_idx, token_id in enumerate(tokens_to_get):
    for l_idx, layer in enumerate(range(1, 13)):
        token_embeddings = pickle.load(open(str(token_id)+"_layer_"+str(layer)+".pkl", "rb"))[:5000]
        sim = cosine_similarity(token_embeddings)
        # self-similarity = mean pairwise cosine similarity (upper triangle, diagonal excluded)
        upper = sim[np.triu_indices(len(token_embeddings), k=1)]
        self_sim_per_token[t_idx, l_idx] = upper.mean()
output_file_path = "self_similarity_per_token.csv" ## FINAL_FILE
with open(output_file_path, "w") as output:
header = "token_id,token,"
for k in range(12):
header += "layer"+str(k + 1)+","
header = header[:-1]
output.write(header + "\n")
for t_idx, t in enumerate(tokens_to_get):
token = tokens[t]
line_to_write = str(t) +","+ token +","
for k in range(12):
line_to_write += str(self_sim_per_token[t_idx,k]) + ","
output.write(line_to_write[:-1] + "\n")
Figure 7¶
GROVER outperforms models with fixed-size k-mers for biological fine-tuning tasks
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import BertModel, BertTokenizer
import sklearn.metrics as metrics
## Helper classes
class GroverDataSet(Dataset):
def __init__(self, sequences, y, pretrained_model_path, max_length=512):
print("Loading GROVER Dataset")
self.BERTtokenizer = BertTokenizer.from_pretrained(pretrained_model_path) # to convert ATG ATCGA CG -> [CLS] ATG ATCGA CG [SEP] -> [2, 123, 456, 789, 101, 3]
self.sequences = sequences
self.max_length = max_length
self.y = np.array(y, dtype=np.float32)
def __len__(self):
return len(self.sequences)
def __getitem__(self, idx):
seq = self.sequences[idx]
tokenizer_res = self.BERTtokenizer.encode_plus(seq, add_special_tokens=True, padding="max_length", return_tensors="pt", max_length=self.max_length, truncation=True)
ids = tokenizer_res["input_ids"]
attention_masks = tokenizer_res["attention_mask"]
return ids, self.y[idx], attention_masks, idx
class KmerDataSet(Dataset):
def __init__(self, sequences, y, pretrained_model_path, kmer, max_length=512):
print(f"Loading {kmer}mer Dataset")
self.BERTtokenizer = BertTokenizer.from_pretrained(pretrained_model_path) # to convert ATG ATC GGA CGC -> [CLS] ATG ATC GGA CGC [SEP] -> [2, 123, 456, 789, 101, 3]
#print(sequences[:2])
self.sequences = ["".join(seq) for seq in sequences]
print(self.sequences[0][:10])
# split into non-overlapping kmers
kmer_seqs = []
for i in range(len(self.sequences)):
seq = self.sequences[i]
kmer_seq = []
for j in range(0, len(seq), kmer):
kmer_seq.append(seq[j:j+kmer])
kmer_seqs.append(kmer_seq)
self.sequences = kmer_seqs
self.max_length = max_length
self.y = np.array(y, dtype=np.float32)
print(len(self.sequences))
def __len__(self):
return len(self.sequences)
def __getitem__(self, idx):
seq = self.sequences[idx]
tokenizer_res = self.BERTtokenizer.encode_plus(seq, add_special_tokens=True, padding="max_length", return_tensors="pt", max_length=self.max_length, truncation=True)
ids = tokenizer_res["input_ids"]
attention_masks = tokenizer_res["attention_mask"]
return ids, self.y[idx], attention_masks, idx
class OverlappingKmerDataSet(Dataset):
def __init__(self, sequences, y, pretrained_model_path, kmer, max_length=512):
print(f"Loading overlapping {kmer}mer Dataset")
self.BERTtokenizer = BertTokenizer.from_pretrained(pretrained_model_path)# to convert ATG TGC GCC CCA -> [CLS] ATG TGC GCC CCA [SEP] -> [2, 123, 456, 789, 101, 3]
self.sequences = ["".join(seq) for seq in sequences]
print(self.sequences[0][:10])
# split into overlapping kmers
kmer_seqs = []
for i in range(len(self.sequences)):
seq = self.sequences[i]
kmer_seq = []
for j in range(len(seq)-kmer+1):
kmer_seq.append(seq[j:j+kmer])
kmer_seqs.append(kmer_seq)
self.sequences = kmer_seqs
self.max_length = max_length
self.y = np.array(y, dtype=np.float32)
def __len__(self):
return len(self.sequences)
def __getitem__(self, idx):
seq = self.sequences[idx]
tokenizer_res = self.BERTtokenizer.encode_plus(seq, add_special_tokens=True, padding="max_length", return_tensors="pt", max_length=self.max_length, truncation=True)
ids = tokenizer_res["input_ids"]
attention_masks = tokenizer_res["attention_mask"]
return ids, self.y[idx], attention_masks, idx
def predict(args):
checkpoint_path = args.get("checkpoint_path")
model_path = args.get("model_path")
test_data = args.get("test_data")
dataset_type = args.get("dataset_type")
max_length = args.get("max_length")
kmer = args.get("kmer")
batch_size = args.get("batch_size")
if max_length is None:
max_length = 512
    device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
    print("device: ", device)
    if device.type == "cuda":
        torch.set_float32_matmul_precision('medium')
print(f"Loading test data from : {test_data}")
test_data = pd.read_csv(test_data, sep="\t", converters={"X": eval, "y": eval})
print("Data loaded")
y_test = test_data["y"].values
if dataset_type == "BP":
test = GroverDataSet(test_data["X"].values, y_test, model_path, max_length=max_length)
elif dataset_type == "OverlappingKmers":
test = OverlappingKmerDataSet(test_data["X"].values, y_test, model_path, max_length=max_length, kmer=kmer)
elif dataset_type == "Kmer":
test = KmerDataSet(test_data["X"].values, y_test, model_path, max_length=max_length, kmer=kmer)
else:
print("dataset type has to be one of: BP, OverlappingKmers or Kmer.\nAborting script.")
sys.exit(1)
print(test.sequences[0][:10])
print(test.y[0])
output_head = nn.Linear(768, 1)
print("Loading model")
model = LitGROVER_for_binary_classification.load_from_checkpoint(
checkpoint_path,
map_location=device,
grover_path=model_path,
output_head=output_head
).eval()
model.to(device)
print("Model loaded")
test_loader = DataLoader(test, batch_size=batch_size, shuffle=False, num_workers=0)
print("Starting testing")
all_probs = []
with torch.no_grad():
for batch in test_loader:
ids, y, attention_masks, idx = batch
ids = ids.to(device)
attention_masks = attention_masks.to(device)
y = y.unsqueeze(1).to(device)
idx = idx.to(device)
logits = model(ids, attention_masks)
probs = torch.sigmoid(logits)
for prob in probs.cpu().numpy():
all_probs.append(prob[0])
test_data["probs"] = all_probs
return test_data[["y", "probs"]]
import torch
from torch import nn
from transformers import BertModel
import pytorch_lightning as pl
import torchmetrics
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.optim as optim
class LitGROVER_for_binary_classification(pl.LightningModule):
    def __init__(self, grover_path="../../models/Grover/GROVER_full/", loss_fn=nn.BCEWithLogitsLoss(), output_attentions=False, dropout=0.3, lr=0.0005, output_head=None, positive_class=1):
super().__init__()
self.output_attentions = output_attentions
self.lr = lr
self.positive_class = positive_class
self.train_accuracy = torchmetrics.Accuracy(task="binary")
self.train_precision = torchmetrics.Precision(task="binary")
self.train_recall = torchmetrics.Recall(task="binary")
self.train_f1score = torchmetrics.F1Score(task="binary", average="macro")
self.val_accuracy = torchmetrics.Accuracy(task="binary")
self.val_precision = torchmetrics.Precision(task="binary")
self.val_recall = torchmetrics.Recall(task="binary")
self.val_f1score = torchmetrics.F1Score(task="binary", average="macro")
self.grover = BertModel.from_pretrained(grover_path, output_attentions=self.output_attentions)
self.dropout = nn.Dropout(dropout)
self.loss_fn = loss_fn
        if output_head is None:
            # binary task with BCEWithLogitsLoss: a single output logit
            self.output_head = nn.Sequential(
                nn.Linear(768, 1)
            )
else:
self.output_head = output_head
def forward(self, x, attention_masks=None):
x = x.squeeze(1)
outputs = self.grover(x, attention_mask=attention_masks)
if self.output_attentions:
x, h, attentions = outputs[0], outputs[1], outputs[2]
else:
x, h = outputs[0], outputs[1]
# calculate output logits
x = self.dropout(x)
x = torch.mean(x, dim=1)
logits = self.output_head(x)
if self.output_attentions:
return logits, attentions, x
else:
return logits
def training_step(self, batch, batch_idx):
# training_step defines the train loop.
# it is independent of forward
x, y, attention_masks, _ = batch
y=y.unsqueeze(1)#.squeeze()#.to(torch.int64)
#print(y.shape)
        if self.output_attentions:
            logits, attentions, _ = self.forward(x, attention_masks)
        else:
            logits = self.forward(x, attention_masks)
probs = torch.sigmoid(logits)
loss = self.loss_fn(logits, y)
#probs_positive_class = probs[:, self.positive_class]
probs_positive_class = probs
acc = self.train_accuracy(probs_positive_class, y)
precision = self.train_precision(probs_positive_class, y)
recall = self.train_recall(probs_positive_class, y)
f1score = self.train_f1score(probs_positive_class, y)
# Logging to TensorBoard (if installed) by default
self.log_dict({"train_loss": loss, "train_acc": acc, "train_recall": recall, "train_precision": precision, "train_f1score": f1score, "lr": self.lr}, logger=True, on_step=True, on_epoch=True)
#self.log("train_loss", loss)
return loss
def validation_step(self, batch, batch_idx):
# this is the test loop
x, y, attention_masks, _ = batch
y=y.unsqueeze(1)#.squeeze()#.to(torch.int64)
#print(y.shape)
        if self.output_attentions:
            logits, attentions, _ = self.forward(x, attention_masks)
        else:
            logits = self.forward(x, attention_masks)
#probs = self.softmax(logits)
probs = torch.sigmoid(logits)
loss = self.loss_fn(logits, y)
#probs_positive_class = probs[:, self.positive_class]
probs_positive_class = probs
acc = self.val_accuracy(probs_positive_class, y)
precision = self.val_precision(probs_positive_class, y)
recall = self.val_recall(probs_positive_class, y)
f1score = self.val_f1score(probs_positive_class, y)
# Logging to TensorBoard (if installed) by default
self.log_dict({"val_loss": loss, "val_acc": acc, "val_recall": recall, "val_precision": precision, "val_f1score": f1score}, logger=True, on_epoch=True)
#self.log("val_loss", loss, )
return loss
# The ReduceLROnPlateau scheduler requires a monitor
def configure_optimizers(self):
optimizer = optim.Adam(self.parameters(), lr=self.lr)
return {
"optimizer": optimizer,
"lr_scheduler": {
"scheduler": ReduceLROnPlateau(optimizer, mode="max", patience=15, verbose=True, factor=0.5),
"monitor": "val_f1score",
"frequency": 1
# If "monitor" references validation metrics, then "frequency" should be set to a
# multiple of "trainer.check_val_every_n_epoch".
},
}
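The checkpoints referenced below (e.g. grover_epoch_04-val_loss_0.02.ckpt) come from fine-tuning runs of this LightningModule; the exact training scripts are not included here. A minimal hedged sketch of such a run, with placeholder paths, batch size, and epoch count:
# Hedged sketch of a fine-tuning run; all paths and hyperparameters are placeholders
task_path = "finetuning_tasks/Prom300/"  # produced in the dataset sections below
train_data = pd.read_csv(task_path + "train.tsv", sep="\t", converters={"X": eval, "y": eval})
val_data = pd.read_csv(task_path + "validate.tsv", sep="\t", converters={"X": eval, "y": eval})
train_set = GroverDataSet(train_data["X"].values, train_data["y"].values, "GROVER_pretrained/", max_length=100)
val_set = GroverDataSet(val_data["X"].values, val_data["y"].values, "GROVER_pretrained/", max_length=100)
train_loader = DataLoader(train_set, batch_size=4, shuffle=True)
val_loader = DataLoader(val_set, batch_size=4, shuffle=False)
lit_model = LitGROVER_for_binary_classification(grover_path="GROVER_pretrained/", output_head=nn.Linear(768, 1))
checkpoint_cb = pl.callbacks.ModelCheckpoint(monitor="val_loss", mode="min")
pl_trainer = pl.Trainer(max_epochs=30, callbacks=[checkpoint_cb])
pl_trainer.fit(lit_model, train_loader, val_loader)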
Figure 7B¶
Performance for the Prom300 task of GROVER versus models of fixed-size k-mers, with the shuffling based on GROVER tokenization and on tokenization that matches the respective benchmarking models. Metrics are given as F1 score, precision, recall, and Matthews correlation coefficient (MCC).
Dataset¶
Annotations come from: https://epd.epfl.ch/human/human_database.php?db=human
root_path = "finetuning_tasks/Prom300/"
annotations = pd.read_csv(root_path+'promoterRangesHg19.bed', sep='\t', header=None)
annotations.columns = ['chr', 'start', 'end', 'name', 'score', 'strand']
annotations.sort_values(by=['chr', 'start'], inplace=True)
annotations["start"] = annotations.apply(lambda x: x["start"]-249 if x["strand"] == "+" else x["start"]-50, axis=1)
annotations["end"] = annotations.apply(lambda x: x["end"]+50 if x["strand"] == "+" else x["end"]+249, axis=1)
# load BP chromosome mapper
def load_mapper(BP_chr):
    # create mapper that maps each nucleotide position to the index of the BP token covering it
    mapper = []
    for i in range(len(BP_chr)):
        for e in range(len(BP_chr[i])):
            mapper.append(i)
    return mapper
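For example, for a toy tokenized chromosome the mapper looks like this (illustration only):
# Toy illustration: three BP tokens covering seven nucleotides
print(load_mapper(["AT", "GCA", "TT"]))  # [0, 0, 1, 1, 1, 2, 2]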
# iterate through each chromosome
BP_seqs = []
for chrom in annotations["chr"].unique():
chrom_nr = chrom[3:]
if chrom_nr == "X":
chrom_nr = 23
elif chrom_nr == "Y":
chrom_nr = 24
# slice the annotations for the current chromosome
annotations_chr = annotations.loc[annotations["chr"] == chrom]
# load BP tokenized chromosome
with open(f"tokenized_chromosomes/chr_{chrom_nr}.pkl", "rb") as f:
BP_chr = pickle.load(f)
# load mapper that maps nucleotide position to BP position
print(f"loading mapper {chrom}")
mapper = load_mapper(BP_chr)
BP_starts = annotations.loc[annotations["chr"] == chrom]["start"].apply(lambda x: mapper[x])
BP_ends = annotations.loc[annotations["chr"] == chrom]["end"].apply(lambda x: mapper[x])
for start, end in zip(BP_starts, BP_ends):
BP_seqs.append(BP_chr[start:end])
annotations["BPseq"] = BP_seqs
annotations["sequence"] = annotations["BPseq"].apply(lambda x: list("".join(x)))
Random mutate¶
Preprocessing like https://github.com/egochao/DeePromoter
Procedure for creating the negative dataset as described in the paper:
Step 1: Break the sequence into N parts (20 in the paper).
Old Step 2: Randomly choose M parts of the original sequence to keep, and randomly re-initialize the rest.
New Step 2: Randomly choose M parts of the original sequence to keep, and shuffle the rest around (this preserves sequence attributes like GC content and AG balance).
# function that returns the indices of all elements of k chunks of size (len(lst) // k) in a list
def get_chunks(lst, k):
chunkSize = len(lst) // k
indices = []
for i in range(0, len(lst), chunkSize):
indices.append([e for e in range(i, min(i + chunkSize, len(lst)))]) # min is used to avoid index out of bounds
return indices
def choose_random_chunks(chunks, nrOfChunksToMutate):
random_indices = np.random.choice(len(chunks), nrOfChunksToMutate, replace=False)
return [chunks[i] for i in random_indices]
def unroll_chunks(chunks):
return [e for chunk in chunks for e in chunk]
def shuffleIndices(indices):
shuffleIndices = indices.copy()
np.random.shuffle(shuffleIndices)
return shuffleIndices
def random_mutate(seq, k, nrOfChunksToMutate):
    seq = seq.copy()
    original = list(seq)  # read from an unmodified snapshot so the result is a permutation of the chosen chunks
    chunks = get_chunks(seq, k=k)
    chunksToMutate = choose_random_chunks(chunks, nrOfChunksToMutate=nrOfChunksToMutate)
    indicesToMutate = unroll_chunks(chunksToMutate)
    shuffledIndicesToMutate = shuffleIndices(indicesToMutate)
    for x, y in zip(indicesToMutate, shuffledIndicesToMutate):
        seq[x] = original[y]
    return seq
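A toy usage example (token list and chunk numbers made up; the shuffle is random, so the output varies):
# Toy illustration of the chunk shuffle used to create negative samples
toy_tokens = list("ABCDEFGHIJ")
mutated = random_mutate(toy_tokens, k=5, nrOfChunksToMutate=2)
print(toy_tokens)  # the original list is left untouched
print(mutated)     # the tokens of two randomly chosen chunks are shuffled among their positions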
Shuffle GROVER sequence¶
seq = annotations.loc[1, 'BPseq']
nrOfChunks = 8 # in how many splits do we want to divide the sequence
nrOfChunksToMutate = 6 # how many of these splits do we want to mutate (randomly shuffle)
annotations["BPseqMutated"] = annotations["BPseq"].apply(lambda x: random_mutate(x, nrOfChunks, nrOfChunksToMutate))
data_non_mutated = pd.DataFrame({"X": annotations["BPseq"], "y": [1] * len(annotations), "start": annotations["start"], "end": annotations["end"], "name": annotations["name"], "strand": annotations["strand"], "chr": annotations["chr"]})
data_mutated = pd.DataFrame({"X": annotations["BPseqMutated"], "y": [0] * len(annotations), "start": annotations["start"], "end": annotations["end"], "name": annotations["name"], "strand": annotations["strand"], "chr": annotations["chr"]})
data = pd.concat([data_non_mutated, data_mutated], ignore_index=True)
# split data into train, val, test: 80%, 10%, 10%
train = data.sample(frac=0.8, random_state=42)
val_test = data.drop(train.index)
val = val_test.sample(frac=0.5, random_state=42)
test = val_test.drop(val.index)
train.to_csv(root_path + "train.tsv", index=False, sep='\t')
val.to_csv(root_path + "validate.tsv", index=False, sep='\t')
test.to_csv(root_path + "test.tsv", index=False, sep='\t')
Shuffle kmers¶
def kmerize(seq, k):
return ["".join(seq[i:i+k]) for i in range(0, len(seq)-k+1, k)]
annotations["4mer_seq"] = annotations["sequence"].apply(lambda x: kmerize(x, 4))
annotations["5mer_seq"] = annotations["sequence"].apply(lambda x: kmerize(x, 5))
annotations["6mer_seq"] = annotations["sequence"].apply(lambda x: kmerize(x, 6))
seq = annotations.loc[1, '4mer_seq']
nrOfChunks = 8 # in how many splits do we want to divide the sequence
nrOfChunksToMutate = 6 # how many of these splits do we want to mutate (randomly shuffle)
shuffled_seq = random_mutate(seq, nrOfChunks, nrOfChunksToMutate)
nrOfChunks = 8 # in how many splits do we want to divide the sequence
nrOfChunksToMutate = 6 # how many of these splits do we want to mutate (randomly shuffle)
for kmer in [4,5,6]:
annotations[f"{kmer}mer_seq_mut"] = annotations[f"{kmer}mer_seq"].apply(lambda x: random_mutate(x, nrOfChunks, nrOfChunksToMutate))
for kmer in [4,5,6]:
data_non_mutated = pd.DataFrame({"X": annotations[f"{kmer}mer_seq"], "y": [1] * len(annotations), "start": annotations["start"], "end": annotations["end"], "name": annotations["name"], "strand": annotations["strand"], "chr": annotations["chr"]})
data_mutated = pd.DataFrame({"X": annotations[f"{kmer}mer_seq_mut"], "y": [0] * len(annotations), "start": annotations["start"], "end": annotations["end"], "name": annotations["name"], "strand": annotations["strand"], "chr": annotations["chr"]})
data = pd.concat([data_non_mutated, data_mutated], ignore_index=True)
# split data into train, val, test: 80%, 10%, 10%
train = data.sample(frac=0.8, random_state=42)
val_test = data.drop(train.index)
val = val_test.sample(frac=0.5, random_state=42)
test = val_test.drop(val.index)
train.to_csv(f"{root_path}train_{kmer}mer.tsv", index=False, sep='\t')
val.to_csv(f"{root_path}val_{kmer}mer.tsv", index=False, sep='\t')
test.to_csv(f"{root_path}test_{kmer}mer.tsv", index=False, sep='\t')
Prediction¶
GROVER shuffle¶
## GROVER
args = {}
args["checkpoint_path"] = root_path + "grover_epoch_04-val_loss_0.02.ckpt"
args["model_path"] = "GROVER_pretrained/"
args["test_data"] = root_path + "test.tsv"
args["dataset_type"] = "BP"
args["max_length"] = 100
args["batch_size"] = 4
grover_res = predict(args)
## 4mer
args = {}
args["checkpoint_path"] = root_path + "4mer_epoch_18-val_loss_0.25.ckpt"
args["model_path"] = "../../kmer/4mer/"
args["test_data"] = root_path + "test.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 4
args["max_length"] = 300//args["kmer"] + 10
args["batch_size"] = 4
res_4mer = predict(args)
## 5mer
args = {}
args["checkpoint_path"] = root_path + "5mer_epoch_15-val_loss_0.29.ckpt"
args["model_path"] = "../../kmer/5mer/"
args["test_data"] = root_path + "test.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 5
args["max_length"] = 300//args["kmer"] + 10
args["batch_size"] = 4
res_5mer = predict(args)
## 6mer
args = {}
args["checkpoint_path"] = root_path + "6mer_epoch_28-val_loss_0.36.ckpt"
args["model_path"] = "../../kmer/6mer/"
args["test_data"] = root_path + "test.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 6
args["max_length"] = 300//args["kmer"] + 10
args["batch_size"] = 4
res_6mer = predict(args)
k-mer shuffle¶
## 4mer
args = {}
args["checkpoint_path"] = root_path + "4mer_4mer_shuffle_epoch_19-val_loss_0.42.ckpt"
args["model_path"] = "../../kmer/4mer/"
args["test_data"] = root_path + "test_4mer.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 4
args["max_length"] = 300//args["kmer"] + 10
args["batch_size"] = 4
res_4mer_4mer_shuffle = predict(args)
## 5mer
args = {}
args["checkpoint_path"] = root_path + "5mer_5mer_shuffle_epoch_02-val_loss_0.58.ckpt"
args["model_path"] = "../../kmer/5mer/"
args["test_data"] = root_path + "test_5mer.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 5
args["max_length"] = 300//args["kmer"] + 10
args["batch_size"] = 4
res_5mer_5mer_shuffle = predict(args)
## 6mer
args = {}
args["checkpoint_path"] = root_path + "6mer_6mer_shuffle_epoch_05-val_loss_0.67.ckpt"
args["model_path"] = "../../kmer/6mer/"
args["test_data"] = root_path + "test_6mer.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 6
args["max_length"] = 300//args["kmer"] + 10
args["batch_size"] = 4
res_6mer_6mer_shuffle = predict(args)
BP_shuffle = pd.read_csv(root_path + "test.tsv", sep="\t", converters={"X": eval, "y": eval})
shuffle_4mer = pd.read_csv(root_path + "test_4mer.tsv", sep="\t", converters={"X": eval, "y": eval})
shuffle_5mer = pd.read_csv(root_path + "test_5mer.tsv", sep="\t", converters={"X": eval, "y": eval})
shuffle_6mer = pd.read_csv(root_path + "test_6mer.tsv", sep="\t", converters={"X": eval, "y": eval})
test_dfs = [
BP_shuffle,
shuffle_4mer,
shuffle_5mer,
shuffle_6mer
]
preds_dfs = [
grover_res,
res_4mer_4mer_shuffle,
res_5mer_5mer_shuffle,
res_6mer_6mer_shuffle
]
names = [
"GROVER",
"4mer",
"5mer",
"6mer"
]
for test_df, preds_df, name in zip(test_dfs, preds_dfs, names):
preds_df["preds"] = preds_df["probs"].apply(lambda x: 1 if x > 0.5 else 0)
test_df[name] = preds_df["preds"]
results = {"model_name": [], "f1": [], "precision": [], "recall": [], "mcc": []}
for name, test_df in zip(names, test_dfs):
    print(name)
    results["model_name"].append(name)
    results["f1"].append(metrics.f1_score(test_df["y"], test_df[name]))
    results["precision"].append(metrics.precision_score(test_df["y"], test_df[name]))
    results["recall"].append(metrics.recall_score(test_df["y"], test_df[name]))
    results["mcc"].append(metrics.matthews_corrcoef(test_df["y"], test_df[name]))
metrics_df = pd.DataFrame(results) ## FINAL_FILE
Figure 7D¶
Performance of the Prom-scan task of GROVER versus models of fixed-size k-mers. Metrics are given as F1 score, precision, recall, and Matthews correlation coefficient (MCC).
Dataset¶
Annotations come from: https://epd.epfl.ch/human/human_database.php?db=human
root_path = "finetuning_tasks/PromScan/"
annotations = pd.read_csv(root_path + 'promoterRangesHg19.bed', sep='\t', header=None)
annotations.columns = ['chr', 'start', 'end', 'name', 'score', 'strand']
annotations["tss"] = annotations["start"].copy()
annotations["start"] = annotations["start"] - 5000
annotations["end"] = annotations["end"] + 5000
## Split long sequences in overlapping windows of size 1001 nucleotides and stride of 300 nucl.
def split_in_overlapping_windows(start, end, window_size, stride, tss_pos, chr, strand, name):
    windows = []
    for i in range(start, end, stride):
        this_start = i
        this_end = i + window_size
        this_window = {"start": this_start, "end": this_end, "chr": chr, "is_tss": this_start <= tss_pos and tss_pos <= this_end, "strand": strand, "name": name}
        windows.append(this_window)
    return windows
stacked_windows = annotations.apply(lambda x: split_in_overlapping_windows(x["start"], x["end"], 1001, 300, x["tss"], x["chr"], x["strand"], x["name"]), axis=1)
windows_dict = {"start": [], "end": [], "chr": [], "is_tss": [], "strand": [], "name": []}
for sample in stacked_windows:
for window in sample:
windows_dict["start"].append(window["start"])
windows_dict["end"].append(window["end"])
windows_dict["chr"].append(window["chr"])
windows_dict["is_tss"].append(window["is_ tss"])
windows_dict["strand"].append(window["strand"])
windows_dict["name"].append(window["name"])
window_df = pd.DataFrame(windows_dict)
# load BP chromosome mapper
def load_mapper(BP_chr):
    # create mapper that maps each nucleotide position to the index of the BP token covering it
    mapper = []
    for i in range(len(BP_chr)):
        for e in range(len(BP_chr[i])):
            mapper.append(i)
    return mapper
# iterate through each chromosome
BP_seqs = []
for chrom in annotations["chr"].unique():
chrom_nr = chrom[3:]
if chrom_nr == "X":
chrom_nr = 23
elif chrom_nr == "Y":
chrom_nr = 24
# slice the annotations for the current chromosome
annotations_chr = window_df.loc[window_df["chr"] == chrom]
# load BP tokenized chromosome
with open(f"tokenized_chromosomes/chr_{chrom_nr}.pkl", "rb") as f:
BP_chr = pickle.load(f)
# load mapper that maps nucleotide position to BP position
print(f"loading mapper {chrom}")
mapper = load_mapper(BP_chr)
BP_starts = window_df.loc[window_df["chr"] == chrom]["start"].apply(lambda x: mapper[x])
BP_ends = window_df.loc[window_df["chr"] == chrom]["end"].apply(lambda x: mapper[x])
for start, end in zip(BP_starts, BP_ends):
BP_seqs.append(BP_chr[start:end])
window_df["BPseq"] = BP_seqs
window_df["sequence"] = annotations["BPseq"].apply(lambda x: list("".join(x)))
window_df.rename(columns={"BPseq": "X", "is_tss": "y"}, inplace=True)
# split into train, val, test: 80/10/10
train, validate, test = np.split(window_df.sample(frac=1, random_state=42), [int(.8*len(window_df)), int(.9*len(window_df))])
train.to_csv(root_path + "train.tsv", index=False, sep="\t")
validate.to_csv(root_path + "validate.tsv", index=False, sep="\t")
test.to_csv(root_path + "test.tsv", index=False, sep="\t")
Prediction¶
## GROVER
args = {}
args["checkpoint_path"] = root_path + "grover_epoch_18-val_loss_0.18.ckpt"
args["model_path"] = "GROVER_pretrained/"
args["test_data"] = root_path + "test.tsv"
args["dataset_type"] = "BP"
args["max_length"] = 256
args["batch_size"] = 4
grover_res = predict(args)
## 4mer
args = {}
args["checkpoint_path"] = root_path + "4mer_epoch_12-val_loss_0.20.ckpt"
args["model_path"] = "../../kmer/4mer/"
args["test_data"] = root_path + "test.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 4
args["max_length"] = 1000//args["kmer"] + 10
args["batch_size"] = 4
res_4mer = predict(args)
## 5mer
args = {}
args["checkpoint_path"] = root_path + "5mer_epoch_10-val_loss_0.20.ckpt"
args["model_path"] = "../../kmer/5mer/"
args["test_data"] = root_path + "test.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 5
args["max_length"] = 1000//args["kmer"] + 10
args["batch_size"] = 4
res_5mer = predict(args)
## 6mer
args = {}
args["checkpoint_path"] = root_path + "6mer_epoch_13-val_loss_0.21.ckpt"
args["model_path"] = "../../kmer/6mer/"
args["test_data"] = root_path + "test.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 6
args["max_length"] = 1000//args["kmer"] + 10
args["batch_size"] = 4
res_6mer = predict(args)
test = pd.read_csv(root_path + "test.tsv", sep="\t", converters={"X": eval, "y": eval})
preds_dfs = [
grover_res,
res_4mer,
res_5mer,
res_6mer
]
names = [
"GROVER",
"4mer",
"5mer",
"6mer"
]
for preds_df, name in zip(preds_dfs, names):
preds_df["preds"] = preds_df["probs"].apply(lambda x: 1 if x > 0.5 else 0)
test[name] = preds_df["preds"]
results = {"model_name": [], "f1": [], "precision": [], "recall": [], "mcc": []}
for name in names:
print(name)
results["model_name"].append(name)
results["f1"].append(metrics.f1_score(test["y"], test[name]))
results["precision"].append(metrics.precision_score(test["y"], test[name]))
results["recall"].append(metrics.recall_score(test["y"], test[name]))
results["mcc"].append(metrics.matthews_corrcoef(test["y"], test[name]))
metrics_df = pd.DataFrame(results) ## FINAL_FILE
Figure 7F¶
Performance of the CTCF-motif-binding task of GROVER versus models of fixed-size k-mers. Metrics are given as F1 score, precision, recall, and Matthews correlation coefficient (MCC).
Dataset¶
CTCF peaks from https://www.encodeproject.org/experiments/ENCSR000BIE/
root_path = "finetuning_tasks/TF_binding/"
peaks = pd.read_csv(root_path + 'ENCFF915BIE.bed', sep='\t', header=None)
motif_sites = pd.read_csv(root_path + 'CTCF_motif_sites.gff', sep='\t', header=None, skiprows=1)
Run in the command line:
- intersectBed -loj -a CTCF_motif_sites.gff -b ENCFF915BIE.bed > CTCF_motifs_with_peak_annotation.bed
motifs_with_peaks = pd.read_csv(root_path + 'CTCF_motifs_with_peak_annotation.bed', sep='\t', header=None)
## Getting the center of motif
motifs_with_peaks["center_of_motif"] = motifs_with_peaks[3] + (motifs_with_peaks[4] - motifs_with_peaks[3])//2
## Get 1kb area around motif
motifs_with_peaks["start_of_bin"] = motifs_with_peaks["center_of_motif"] - 500
motifs_with_peaks["end_of_bin"] = motifs_with_peaks["center_of_motif"] + 500
motifs_with_peaks["width"] = motifs_with_peaks["end_of_bin"] - motifs_with_peaks["start_of_bin"]
## Add target annotation column for machine learning task
motifs_with_peaks["y"] = motifs_with_peaks[9].apply(lambda x: 1 if x != '.' else 0)
## Retrieve GROVER tokens
data = motifs_with_peaks[[0, "start_of_bin", "end_of_bin", "y"]]
data = data.loc[data[0].apply(lambda x: True if len(x) <= 5 else False)] # some random lines with chr1_gl000191_random etc
data = data.sort_values(by=[0, "start_of_bin"], inplace=False)
def load_mapper(BP_chr):
# create mapper that maps nucleotide position to BP position
mapper = []
for i in range(len(BP_chr)):
for e in range(len(BP_chr[i])):
mapper.append(i)
return mapper
# iterate through each chromosome
BP_seqs = []
for chrom in data[0].unique():
chrom_nr = chrom[3:]
if chrom_nr == "X":
chrom_nr = 23
elif chrom_nr == "Y":
chrom_nr = 24
# slice the data for the current chromosome
data_chr = data.loc[data[0] == chrom]
# load BP tokenized chromosome
with open(f"tokenized_chromosomes/chr_{chrom_nr}.pkl", "rb") as f:
BP_chr = pickle.load(f)
# load mapper that maps nucleotide position to BP position
print(f"loading mapper {chrom}")
mapper = load_mapper(BP_chr)
BP_starts = data.loc[data[0] == chrom]["start_of_bin"].apply(lambda x: mapper[x])
BP_ends = data.loc[data[0] == chrom]["end_of_bin"].apply(lambda x: mapper[x])
for start, end in zip(BP_starts, BP_ends):
BP_seqs.append(BP_chr[start:end])
data["X"] = BP_seqs
## create train val test (80/10/10)
train = data.sample(frac=0.8, random_state=0)
val = data.drop(train.index)
test = val.sample(frac=0.5, random_state=0)
val = val.drop(test.index)
train.to_csv(root_path + "CTCF_train.tsv", index=False, sep="\t")
val.to_csv(root_path + "CTCF_val.tsv", index=False, sep="\t")
test.to_csv(root_path + "CTCF_test.tsv", index=False, sep="\t")
Prediction¶
## GROVER
args = {}
args["checkpoint_path"] = root_path + "grover_epoch_18-val_loss_0.45.ckpt"
args["model_path"] = "GROVER_pretrained/"
args["test_data"] = root_path + "CTCF_test.tsv"
args["dataset_type"] = "BP"
args["max_length"] = 310
args["batch_size"] = 4
grover_res = predict(args)
## 4mer
args = {}
args["checkpoint_path"] = root_path + "4mer_epoch_29-val_loss_0.48.ckpt"
args["model_path"] = "../../kmer/4mer/"
args["test_data"] = root_path + "CTCF_test.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 4
args["max_length"] = 1000//args["kmer"] + 10
args["batch_size"] = 4
res_4mer = predict(args)
## 5mer
args = {}
args["checkpoint_path"] = root_path + "5mer_epoch_23-val_loss_0.53.ckpt"
args["model_path"] = "../../kmer/5mer/"
args["test_data"] = root_path + "CTCF_test.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 5
args["max_length"] = 1000//args["kmer"] + 10
args["batch_size"] = 4
res_5mer = predict(args)
## 6mer
args = {}
args["checkpoint_path"] = root_path + "6mer_epoch_19-val_loss_0.56.ckpt"
args["model_path"] = "../../kmer/6mer/"
args["test_data"] = root_path + "CTCF_test.tsv"
args["dataset_type"] = "Kmer"
args["kmer"] = 6
args["max_length"] = 1000//args["kmer"] + 10
args["batch_size"] = 4
res_6mer = predict(args)
test = pd.read_csv(root_path + "CTCF_test.tsv", sep="\t", converters={"X": eval, "y": eval})
preds_dfs = [
grover_res,
res_4mer,
res_5mer,
res_6mer
]
names = [
"GROVER",
"4mer",
"5mer",
"6mer"
]
for preds_df, name in zip(preds_dfs, names):
preds_df["preds"] = preds_df["probs"].apply(lambda x: 1 if x > 0.5 else 0)
test[name] = preds_df["preds"]
results = {"model_name": [], "f1": [], "precision": [], "recall": [], "mcc": []}
for name in names:
print(name)
results["model_name"].append(name)
results["f1"].append(metrics.f1_score(test["y"], test[name]))
results["precision"].append(metrics.precision_score(test["y"], test[name]))
results["recall"].append(metrics.recall_score(test["y"], test[name]))
results["mcc"].append(metrics.matthews_corrcoef(test["y"], test[name]))
metrics_df = pd.DataFrame(results) ## FINAL_FILE