Source code for umap.unify_bowtie
from argparse import ArgumentParser
import gzip
import numpy as np
import os
import pandas as pd
import re
def subset_list(list_items, regex):
out_list = []
for each_item in list_items:
RE = re.search(regex, each_item)
if RE:
out_list.append(each_item)
return out_list
[docs]class UnifyBowtie:
def __init__(self, bowtie_outdir, chrsize_path, job_id):
"""Merges bowtie.gz outputs of run_bowtie
Based on outputs of run_bowtie that are saved in format
of <chr>.<kmer>.<job_id>.bowtie.gz, this class uses
a variable ID to select a particular chromosome and
merge the data in all of the different bowtie.gz files
of that chromosome.
:param bowtie_outdir: Directory with <chr>.<kmer>.<job_id>.bowtie.gz
:param chrsize_path: Path to 2-column file: <chr>\t<size>\n...
:param int job_id: A 0-based index to select chromosome
based on chrsize_path
:returns: Saves the output to bowtie_outdir/
<chr>.<kmer>.uint8.unique.gz file
"""
self.bowtie_outdir = bowtie_outdir
self.chrsize_path = chrsize_path
self.ind_chr = job_id
self.chr_dict = self.make_chr_dict()
self.bowtie_to_unique()
def make_chr_dict(self):
"""Makes a dictionary using self.chrsize_path"""
chr_dict = {}
with open(self.chrsize_path, "r") as chrsize_link:
for chrsize_line in chrsize_link:
chrom, size = chrsize_line.rstrip().split("\t")
chr_dict[chrom] = int(size)
return chr_dict
def get_mapped_positions(self, bowtie_path):
"""Finds mapped regions in bowtie output
In a gzipped bowtie output with perfect matches,
filters the results to those in the forward strand
and saves them to an array.
Why results filtered for forward strand?
The results uniquely mapped
to the reverse strand are rare and a mistake of the
aligning algorithm because all of the k-mers are
generated from the forward strand. If match happens
on reverse strand, it means that the match is not the
only unique match (reads were generated from forward
strand).
"""
bowtie_df = pd.read_csv(
bowtie_path, sep="\t",
compression="gzip",
names=["Ind", "Strand", "Chr", "Start"])
bowtie_df = bowtie_df[bowtie_df["Strand"] == "+"]
ind_ar = bowtie_df.iloc[:, 3]
return ind_ar
def get_other_chr_name(self, chr_name):
"""Finds name of the reverse complement chromosome
In Bismap, we generate a set of reverse complemented
chromosomes to account for bisulfite conversion before
reverse complementation. This function finds those
chromosomes"""
new_name = chr_name
if "RC" in chr_name:
len_paths = len(
subset_list(
os.listdir(self.bowtie_outdir), chr_name))
if len_paths == 0:
new_name = chr_name + "_RC"
return new_name
def bowtie_to_unique(self):
"""Wrapper method of UnifyBowtie
Uses information of bowtie.gz files to find mappability
of a given chromosome. Is automatically called by
UnifyBowtie."""
KMER = int(self.bowtie_outdir.split("/")[-1].split("k")[-1])
all_chrs = self.chr_dict.keys()
all_chrs.sort()
chrom = all_chrs[self.ind_chr]
size = self.chr_dict[chrom]
new_chr_name = self.get_other_chr_name(chrom)
chr_paths = ["{}/{}".format(self.bowtie_outdir, bowtie_path)
for bowtie_path in subset_list(
os.listdir(self.bowtie_outdir),
"{}\.".format(new_chr_name))]
bowtie_paths = subset_list(chr_paths, ".bowtie.gz")
unique_ar = np.zeros(size, dtype=np.uint8)
for bowtie_path in bowtie_paths:
mapped_indices = self.get_mapped_positions(bowtie_path)
for st_index in mapped_indices:
end_index = st_index + KMER
unique_ar[st_index:end_index] = 1
print("Done with {}".format(bowtie_path))
out_path = "{}/{}.k{}.uint8.unique.gz".format(
self.bowtie_outdir, chrom, KMER)
out_link = gzip.open(out_path, "wb")
# np.save(out_link, unique_ar)
out_link.write(unique_ar.tobytes())
# unique_ar.tofile(out_link)
out_link.close()
print("Saved {}".format(out_path))
print("Exiting successfully")
if __name__ == "__main__":
parser = ArgumentParser(
description="")
parser.add_argument(
"bowtie_outdir",
help="Directory containing bowtie output files")
parser.add_argument(
"chrsize_path",
help="A file containing the order of chromosome names\
to consider (one chromosome name per line)")
parser.add_argument(
"-job_id",
default=0,
type=int,
help="If not using a cluster for submitting jobs, "
"specify the job_id by integer ranging from 1 to "
"total number of chromosomes in chrsize_path")
parser.add_argument(
"-var_id",
default="SGE_TASK_ID",
help="HPC variable name for job ID (1-based index)")
args = parser.parse_args()
job_id = args.job_id
if job_id == 0:
job_id = int(os.environ[args.var_id]) - 1
UnifyBowtie(args.bowtie_outdir, args.chrsize_path, job_id)