Source code for umap.combine_umaps

from argparse import ArgumentParser
import gzip
import numpy as np
import os


def get_args():
    parser = ArgumentParser(
        description="Combines mappability uint8 vectors of "
        "several kmers into 1 uint8 vector per chromosome. "
        "It requires a directory with subfolders names as "
        "k<read length>. This script requires a number to infer "
        "chromosome. If not specifying -job_id, it will "
        "identify the chromosome using -var_id environmental "
        "varibale which by default is SGE_TASK_ID.")
    parser.add_argument(
        "kmer_dir",
        help="Directory with subfolders "
        "named as k<read length>)")
    parser.add_argument(
        "chrsize_path",
        help="Path to 2 column tsv file with first column "
        "as chromosome and second column as its size. Will "
        "be used to identify order of the chromosomes.")
    parser.add_argument(
        "-out_dir",
        default="infer",
        help="If not specified, a subfolder "
        "will be created in kmer_dir names as "
        "globalmap_k<smallestkmer>tok<largestkmer>")
    parser.add_argument(
        "-job_id",
        default=0,
        type=int,
        help="1-based index for finding chromosome from -chrsize_path. "
        "If not specified, it will user -var_id to "
        "infer the chromosome for combining mappabilitiy of "
        "different kmers.")
    parser.add_argument(
        "-var_id",
        default="SGE_TASK_ID",
        help="If -job_id is not specified, job_id will be inferred "
        "from environmental variable -var_id.")
    parser.add_argument(
        "-kmer_dir_2",
        default="NA",
        help="Specify to merge kmers of two different directories "
        "by logical operation AND.")
    args = parser.parse_args()
    job_id = args.job_id
    out_dir = args.out_dir
    kmers = [each_kmer for each_kmer in next(os.walk(args.kmer_dir))[1]
             if "k" == each_kmer[0]]
    if args.job_id == 0:
        job_id = int(os.environ[args.var_id]) - 1
    else:
        job_id = job_id - 1
    if args.out_dir == "infer":
        kmer_ints = [int(each_kmer.replace("k", "")) for each_kmer in kmers]
        out_dir = "{}/globalmap_k{}tok{}".format(
            args.kmer_dir, min(kmer_ints), max(kmer_ints))
    try:
        out_list = [args.kmer_dir, args.chrsize_path,
                    out_dir, job_id, args.kmer_dir_2]
    except:
        raise ValueError(
            "chrsize_path or job_id were invalid.")
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    return out_list


[docs]class CombineUmaps: def __init__(self, kmer_dir, chrsize_path, out_dir, job_id, kmer_dir_2): """Combine <chr>.<kmer>.uint8.unique.gz files The kmer_dir has subfolders named as k<integer> with <chr>.<kmer>.uint8.unique.gz files inside. This script will use the job_id to use a particular chromosome and merge the uint8.gz files across all of the different kmers into one uint8.gz. Note: All methods would run consequently when creating the class instance. Args: kmer_dir: Directory with k<integer> subfolders chrsize_path: 2-column tsv file of <chrom>\t<size>\n... out_dir: Output files will be saved to this folder job_id: 0-based index to find chromosome from chrsize_path kmer_dir_2: If using Bismap and want to merge C2T and G2A data Returns Saves the output to outdir/<chrom>.uint8.unique.gz files """ self.kmer_dir = kmer_dir self.kmer_dir_2 = kmer_dir_2 self.chrsize_path = chrsize_path self.out_dir = out_dir self.job_id = job_id self.chrom, self.size = self.get_chrom_size() self.kmers = [each_kmer for each_kmer in next(os.walk(self.kmer_dir))[1] if "k" == each_kmer[0]] combined_ar = self.combine_uints() self.write_ar(combined_ar) def get_chrom_size(self): """Finds chrom and size from self.chrsize_path""" with open(self.chrsize_path, "r") as chrsize_link: ind_chr = 0 for chrsize_line in chrsize_link: if ind_chr == self.job_id: chromosome = chrsize_line.rstrip().split("\t")[0] size = int(chrsize_line.rstrip().split("\t")[1]) ind_chr = ind_chr + 1 return chromosome, size def combine_uints(self): """Merged different kmer arrays of one chromosome """ MergeKmers = False if os.path.exists(self.kmer_dir_2): print( "Limit mappability to regions {} {} and {}".format( "that are unique in both", kmer_dir, kmer_dir_2)) MergeKmers = True combined_ar = np.zeros(self.size, dtype=np.uint8) kmer_nums = [int(kmer.replace("k", "")) for kmer in self.kmers] kmer_nums.sort() for kmer_num in kmer_nums: kmer = "k{}".format(kmer_num) full_kmer_path = "{}/{}/{}.{}.uint8.unique.gz".format( self.kmer_dir, kmer, self.chrom, kmer) if not os.path.exists(full_kmer_path): raise ValueError("{} does not exist".format(full_kmer_path)) kmer_link = gzip.open(full_kmer_path, "rb") kmer_ar = np.frombuffer(kmer_link.read(), dtype=np.uint8) if len(kmer_ar) != self.size: kmer_ar = np.append(kmer_ar, np.zeros(kmer_num)) kmer_link.close() index_comb_0 = combined_ar == 0 index_kmer = kmer_ar != 0 index_adkmer = np.logical_and(index_comb_0, index_kmer) if MergeKmers: full_kmer_path_2 = "{}/{}/{}.{}.uint8.unique.gz".format( self.kmer_dir_2, kmer, self.chrom, kmer) kmer_link_2 = gzip.open(full_kmer_path_2, "rb") kmer_ar_2 = np.frombuffer(kmer_link_2.read(), dtype=np.uint8) if len(kmer_ar_2) != self.size: kmer_ar_2 = np.append(kmer_ar_2, np.zeros(kmer_num)) kmer_link_2.close() index_kmer_2 = kmer_ar_2 != 0 index_adkmer_2 = np.logical_and(index_comb_0, index_kmer_2) index_adkmer = np.logical_and(index_adkmer, index_adkmer_2) combined_ar[index_adkmer] = kmer_num print( "Added information of {} for {}".format( kmer, self.chrom)) return combined_ar def write_ar(self, combined_ar): """Writes merged array to unsigned 8bit integer file. Used self.out_dir and self.chrom. Args: combined_ar: Can be any numpy array """ if not os.path.exists(self.out_dir): os.makedirs(self.out_dir) out_path = "{}/{}.uint8.unique.gz".format( self.out_dir, self.chrom) out_link = gzip.open(out_path, "wb") out_link.write(combined_ar.tobytes()) out_link.close()
if __name__ == "__main__": kmer_dir, chrsize_path, out_dir, job_id, kmer_dir_2 = get_args() CombineUmaps(kmer_dir, chrsize_path, out_dir, job_id, kmer_dir_2)