from argparse import ArgumentParser
import gzip
import numpy as np
import os
import pandas as pd
def get_args():
parser = ArgumentParser(
description="Creates list of unique Kmers "
"for each 1Mb of the genome. Relies on an environmental "
"variable such as SGE_TASK_ID to identify the correct "
"megabase of the genome for this purpose. "
"If not using job arrays, specify -job_id manually.")
parser.add_argument(
"chrsize_path",
help="Path to 2 column tsv file where first column is "
"chromosome name and second column is chromosome size")
parser.add_argument(
"out_dir",
help="Path to the directory for creating "
"<chromosome>.<Megabase>.<kmer>.kmer.gz files")
parser.add_argument(
"kmer",
default="infer",
help="The software would infer it based on the "
"name of the 'out_dir'. If it is set and "
"contradicts the 'out_dir', a subfolder "
"under out_dir will be created named 'kmer' and "
"out_dir will be changed to that.")
parser.add_argument(
"chr_dir",
help="Path to directory with <chromosome>.fasta files.")
parser.add_argument(
"-job_id",
default=0,
type=int,
help="If not submitted in job array, would require this "
"parameter to be set. (1-based index)")
parser.add_argument(
"-var_id",
default="SGE_TASK_ID",
help="The variable name that the script would use "
"for identifying the job id. By default: SGE_TASK_ID.")
args = parser.parse_args()
out_dir = args.out_dir
inferred_kmer = args.out_dir.split("/")[-1]
kmer = args.kmer
job_id = args.job_id
if args.kmer == "infer":
kmer = inferred_kmer
elif args.kmer != inferred_kmer:
kmer = args.kmer
out_dir = "{}/{}".format(out_dir, kmer)
if job_id == 0:
job_id = int(os.environ[args.var_id]) - 1
out_list = [args.chrsize_path, out_dir,
kmer, job_id, args.chr_dir]
return out_list
[docs]class GetKmers:
def __init__(self, out_dir, kmer, job_id,
chr_dir, chrsize_path):
"""Creates all the possible k-mers for part of the genome.
Used a referece file to find the appropriate chromosome,
start and end position. Passes through the fasta file
of the chromosome and generates all of the possible k-mers.
Args:
:param out_dir: Directory for saving <chrom>.<jobid>.kmer.gz files
:param str kmer: k-mer string such as 'k24'
:param int job_id: Reference ID used for finding chrom, start and end
:param chr_dir: Path to directory with chromosome fasta files
:param chrsize_path: Path to 2 column file of chrom\tsize\n
:returns: An object with methods such as get_step_fasta(),
get_seq_ar(), write_kmers() and write_regions().
:raises ValueError: if expected chromosome path does not exist
"""
self.out_dir = out_dir
self.kmer = kmer
self.job_id = job_id
self.chrsize_path = chrsize_path
self.chr_dir = chr_dir
self.chrom, self.start, self.end = self.get_region()
self.chr_path = "{}/{}.fasta".format(
self.chr_dir, self.chrom)
if not os.path.exists(self.chr_path):
self.chr_path = self.chr_path.replace(".fasta", ".fa")
if not os.path.exists(self.chr_path):
raise ValueError("{} does not exist".format(self.chr_path))
def get_region(self):
"""Find chromosomal chunk
Using job_id and chrsize_path, finds chromosome,
start and end. Looks if chrsize_index.tsv exists.
If it doesn't, it will create chrsize_index.tsv.
Otherwise will use it for finding that information.
:returns: chrom, start, end
:raises ValueError: If job_id is out of expected range
"""
chrsize_path = self.chrsize_path
job_id = self.job_id
CHUNK_SIZE = int(1e6)
ind_path = "{}/chrsize_index.tsv".format(
"/".join(chrsize_path.split("/")[:-1]))
if os.path.exists(ind_path):
ind_df = pd.read_csv(ind_path, sep="\t", index_col=0)
dict_inds = ind_df.to_dict()
else:
ind_link = open(ind_path, "w")
ind_link.write("Index\tChromosome\tStart\tEnd\n")
start = 1
ind = 0
# Identifying chromosome, start, end for job_id
with open(chrsize_path, "r") as chrsize_link:
for chrsize_line in chrsize_link:
chrom, len_chr = chrsize_line.rstrip().split("\t")
end = int(len_chr) + start
for pos in range(start, end, CHUNK_SIZE):
if pos < end:
if pos + CHUNK_SIZE - 1 > end:
pos_end = end
else:
pos_end = pos + CHUNK_SIZE - 1
ind_link.write(
"\t".join([str(ind), chrom, str(pos),
str(int(pos_end))]) +
"\n")
ind = ind + 1
ind_link.close()
ind_df = pd.read_csv(ind_path, sep="\t", index_col=0)
dict_inds = ind_df.to_dict()
try:
chrom, start, end = [
dict_inds[qry][job_id] for
qry in ["Chromosome", "Start", "End"]]
except:
raise ValueError(
"{} Job id is larger than available indices".format(
job_id))
return chrom, int(start), int(end)
def get_step_fasta(self):
"""Finds number of nucleotides at each line of FASTA
:raises ValueError: If top 10 FASTA lines have
sequences of varying length
"""
chr_path = self.chr_path
line_nums = 10
len_lines = []
with open(chr_path, "r") as chr_link:
for line_num in range(line_nums):
chr_line = chr_link.readline().rstrip()
if ">" not in chr_line:
len_line = len(chr_line)
if len_line not in len_lines:
if chr_line != "":
len_lines.append(len_line)
if len(len_lines) == 1:
return(len_lines[0])
else:
raise ValueError(
"Top 10 FASTA lines have sequences of varying length")
def get_seq_ar(self, fasta_fix):
"""Gets sequence as numpy array
get_seq_ar creates a numpy array matching sequence
of the region specified by chromosome, start and
end. However it starts the sequence with kmer - 1
less than the specified start to account for the
missing kmers from the previous chunk of chromosome.
Each element is one line of a fasta file that may be
cut to match the exact start/end positions.
Args:
:param int fasta_fix: Number of nucleotides per fasta line.
:returns: Numpy array with each member being one line of FASTA
"""
start = self.start
end, chr_path = self.end, self.chr_path
kmer = int(self.kmer.replace("k", ""))
start = start - kmer
if start < 0:
start = 0
st_line = int(np.floor(start / fasta_fix))
st_line_pos = start % fasta_fix
end_line = int(np.floor(end / fasta_fix))
end_line_pos = (end % fasta_fix) + 1
seq_ar = np.empty(int(end_line - st_line + 1),
dtype="|S{}".format(fasta_fix + 1))
ind_seq = 0
STORE_LINE = False
with open(chr_path, "r") as chr_link:
line_ind = 0
for each_line in chr_link:
if line_ind == st_line:
STORE_LINE = True
each_line = each_line.rstrip()[st_line_pos:]
if STORE_LINE:
if line_ind == end_line:
STORE_LINE = False
each_line = each_line.rstrip()[:end_line_pos]
seq_ar[ind_seq] = each_line.rstrip()
ind_seq = ind_seq + 1
line_ind = line_ind + 1
if "chr" in seq_ar[0]:
seq_ar = seq_ar[1:]
return seq_ar
def write_kmers(self, cur_seq, kmer_int, out_link):
"""Creates k-mers and saves them to the file
Using a numpy array of sequences, generates and
saves all of the possible k-mers.
Args:
:param cur_seq: A string of sequences
:param kmer_int: An integer denoting the k-mer size
:param out_link: File link for writing k-mers
:returns: May modify cur_seq by deducing the k-mers that
are written to the out_link file object.
"""
if len(cur_seq) < kmer_int:
return cur_seq
elif len(cur_seq) == kmer_int:
out_link.write(cur_seq + "\n")
cur_seq = cur_seq[1:]
return cur_seq
else:
for i in range(len(cur_seq) - kmer_int):
j = i + kmer_int
if j > len(cur_seq):
j = len(cur_seq)
seq_out = cur_seq[i:j]
if "N" not in seq_out:
out_link.write(seq_out + "\n")
cur_seq = cur_seq[(i + 1):]
return cur_seq
def write_region(self):
"""Wrapper of GetKmers to get and write k-mers
write_region method executes all the other methods
of the GetKmers class for writing out all of the
possible k-mers from chunk of a FASTA file.
"""
out_dir, kmer, job_id = self.out_dir, self.kmer, self.job_id
chrom = self.chrom
start, end = self.start, self.end
out_path = "{}/{}.{}.{}.kmer.gz".format(
out_dir, chrom, job_id, kmer.replace("k", ""))
out_link = gzip.open(out_path, "wb")
fasta_fix = self.get_step_fasta()
kmer_int = int(kmer.replace("k", ""))
seq_ar = self.get_seq_ar(fasta_fix)
cur_seq = ""
for each_seq in seq_ar:
cur_seq = cur_seq + each_seq
cur_seq = self.write_kmers(cur_seq, kmer_int, out_link)
print("Created all sequences for {}:{}-{}".format(
chrom, start, end))
out_link.close()
if __name__ == "__main__":
chrsize_path, out_dir, kmer, job_id, chr_dir = get_args()
GetKmerObj = GetKmers(
out_dir, kmer, job_id,
chr_dir, chrsize_path)
GetKmerObj.write_region()