Source code for umap.uint8_to_bed

import argparse
from datetime import datetime
import gzip
import numpy as np
import os
import re
import warnings


def subset_list(my_list, regex, reg_neg=False):
    out_list = []
    for item in my_list:
        RE = re.search(regex, item)
        if RE and not reg_neg:
            out_list.append(item)
        elif not RE and reg_neg:
            out_list.append(item)
    return out_list


[docs]class Int8Handler: def __init__(self, in_dir, out_dir, C2T, G2A, chrsize_path): """Infers important directory structure from in_dir The Int8Handler class requires in_dir argument which is a directory with subfolders named as k<integer> that have <chrom>.<kmer>.uint8.unique.gz files. The Int8Handler.write_beds method creates one browser extensible data (BED) file for each of the k<integer> subfolders. :param in_dir: Directory with <chrom>.uint8.unique.gz files :param out_dir: Directory for saving output BED files """ self.in_dir = in_dir self.C2T = C2T self.G2A = G2A self.chrsize_path = chrsize_path self.in_prev_dir = "/".join(in_dir.split("/")[:-1]) self.fix = ".uint8.unique.gz" self.uint8s = subset_list(os.listdir(in_dir), self.fix) self.prev_dir = "/".join(self.in_dir.split("/")[:-1]) self.kmers = subset_list(next(os.walk(self.in_prev_dir))[1], "^k") self.out_dir = out_dir self.type = self.find_type() self.chrsize_dict = self.make_chrsize_dict() self.chroms = [each_chr for each_chr in self.chrsize_dict.keys() if "RC" not in each_chr] def find_type(self): if self.C2T: self.type = "Bismap.Forward" elif self.G2A: self.type = "Bismap.Reverse" else: self.type = "Umap" return self.type
[docs] def write_beds(self, out_label, kmer_cur, WriteUnique=False): """Convert uint8 files to BED A method of class Int8Handler, it reads uint8 files of different chromosomes and saves them into a BED file. This is the only method in this class that needs to be called manually by the user. Args: out_label: Will be used in output names: <kmer>.<out_label>.bed.gz kmer_cur: k-mer size in format of k<integer> WriteUnique: Defaults to False. Will merge and save uint files. Use if working with -Bismap and have reverse complemented chromosomes. Returns: Saved ths output to a gzipped BED file Raises: Warning: If no uniquely mappable read is found in any of the uint8 arrays. """ kmer = int(kmer_cur.replace("k", "")) STRAND = "+" if self.G2A: STRAND = "-" in_dir = self.in_dir all_chrs = self.chroms all_chrs.sort() for cur_chr in self.chroms: other_chr = cur_chr + "_RC" uint_path = "{}/{}{}".format( in_dir, cur_chr, self.fix) uniquely_mappable = self.load_uint_ar( uint_path, kmer, cur_chr) if self.type != "Umap": uint_path_other = in_dir +\ "/" + uint_path.replace(cur_chr, other_chr) uniquely_mappable_other = self.load_uint_ar( uint_path_other, kmer, other_chr) # Remove the last k nucleotides of the reverse chromosome uniquely_mappable_other = uniquely_mappable_other[:-kmer] # Reverse the chromosome (Hence it's a reverse complement) uniquely_mappable_other = uniquely_mappable_other[::-1] # Remove the last k nucleotides of the forward chromosome uniquely_mappable = uniquely_mappable[:-kmer] # Merge to find uniquely mappable reads in both strands uniquely_mappable = np.array( np.logical_and( uniquely_mappable_other, uniquely_mappable), dtype=int) if WriteUnique: unique_ar = np.array(uniquely_mappable, dtype=np.uint8) out_dir_uint = "{}/k{}".format(self.out_dir, kmer) if not os.path.exists(out_dir_uint): os.makedirs(out_dir_uint) out_path_uint = "{}/{}.k{}.uint8.unique.gz".format( (out_dir_uint, cur_chr, kmer)) out_link_uint = gzip.open(out_path_uint, "wb") out_link_uint.write(unique_ar.tobytes()) out_link_uint.close() bed_kmer_pos = self.get_bed6( uniquely_mappable, kmer, STRAND, uint_path, cur_chr) # Write the BED6 to a gzipped tsv file out_name = "{}/{}.{}.bed.gz".format( self.out_dir, kmer_cur, out_label) if cur_chr == all_chrs[0]: header = self.make_header(kmer_cur, type) out_link = gzip.open(out_name, "wb") out_link.write(header) else: out_link = gzip.open(out_name, "ab") if len(bed_kmer_pos) > 0: print "Found %d regions in %s" %\ (bed_kmer_pos.shape[0], cur_chr) for bed_line in bed_kmer_pos: line_out = [str(val) for val in bed_line] line_out = "\t".join(line_out) + "\n" out_link.write(line_out) print "Created data of %s at %s" %\ (cur_chr, str(datetime.now())) out_link.close()
def get_bed6(self, uniquely_mappable, kmer, STRAND, uint_path, cur_chr): """Make BED6 from a binary vector Converts a binary vector with the same length as the chromosome to a BED6 file. Each 1 entry in the binary file indicates that the k-mer starting at that position and ending at k nucleotides downstream is uniquely mappable. Thus the BED6 would show any region in the genome that is uniquely mappable by at least one k-mer. :param uniquely_mappable: numpy binary array (0 or 1 values) :param kmer: Integer scalar showing read length (k-mer) :param STRAND: Strand that will be saved to the BED6 :param uint_path: Path of the uint8 array that was used :param cur_chr: Chromosome name the the data is from :returns: A numpy string array with BED6 information """ unimap_diff = np.diff(uniquely_mappable) poses_start, = np.where(unimap_diff == 1) poses_end, = np.where(unimap_diff == -1) if len(poses_start) != len(poses_end): if len(poses_start) > len(poses_end): poses_end = np.append(poses_end, [len(uniquely_mappable)]) else: poses_start = np.append([0], poses_start) elif uniquely_mappable[0] == 1: poses_start = np.append([0], poses_start) poses_end = np.append(poses_end, [len(uniquely_mappable)]) if len(poses_start) == 0: warnings.warn( "Found no uniquely mappable reads for {}!".format( uint_path)) bed_kmer_pos = [] else: chr_length = self.chrsize_dict[cur_chr] bed_kmer_pos = np.empty(shape=(len(poses_start), 6), dtype="S16") for ind_st in range(len(poses_start)): ad_ar = [cur_chr, str(poses_start[ind_st] + 1), str(poses_end[ind_st] + kmer - 1), "k" + str(kmer), 1, STRAND] if poses_end[ind_st] + kmer - 1 > chr_length: ad_ar[2] = chr_length bed_kmer_pos = np.array( [[cur_chr, str(poses_start[i] + 1), str(poses_end[i] + kmer - 1), "k" + str(kmer), 1, STRAND] for i in range(len(poses_start))], dtype="S64") return bed_kmer_pos def load_uint_ar(self, uint_path, kmer, cur_chr): """Loads a gzipped unsigned 8-bit integer as a numpy array """ uint_link = gzip.open(uint_path, "rb") uint_ar = np.frombuffer(uint_link.read(), dtype=np.uint8) uint_link.close() print "Processing {} for {} {}".format( uint_path, kmer, cur_chr) less_than_kmer = uint_ar <= kmer not_zero = uint_ar != 0 uniquely_mappable = np.array( np.logical_and(less_than_kmer, not_zero), dtype=int) return uniquely_mappable def make_header(self, kmer, type): """Created BED6 header :param str kmer: k<integer> :param type: One of Bismap.Forward, Bismap.Reverse or Umap """ dict_col = {"Bisulfite.Converted": "240,40,80", "Umap": "80,40,240", "Bismap.Forward": "220,20,80", "Bismap.Reverse": "80,20,220"} header = 'track name="{} {}"'.format(type, kmer) +\ 'description="Single-read mappability with {}-mers'.format( kmer) +\ 'color=%s \n'.format(dict_col.get(type, "40,40,240")) return header def make_chrsize_dict(self): """Creates dictionary from 2-column chromosome size file """ chrsize_dict = {} with open(self.chrsize_path, "r") as chrsize_link: for each_line in chrsize_link: chr = each_line.rstrip("\n").split("\t")[0] length = int(each_line.rstrip("\n").split("\t")[1]) chrsize_dict[chr] = length return chrsize_dict def bin_arr_to_wig(self, uniquely_mappable, kmer): """Converts a uniquely a binary array to mult-read mappability Unique mappability array is a binary array where each value of 1 indicates that the k-mer starting at that position is uniquely mappable. This function generates an array of the same length and saves the multi-read mappability of each position in that array. Multi-read mappability is the probability of finding a uniquely mappable k-mer among all of the k-mers that overlap with a given position. :param uniquely_mappable: A binary numpy array :param kmer: An integer defining read length :returns: Multi-read mappability array. """ LEN_AR = len(uniquely_mappable) poses_start, = np.where(uniquely_mappable == 1) ar_quant = np.zeros(len(uniquely_mappable), dtype=float) for ind_st in poses_start: ind_end = ind_st + kmer if ind_end > LEN_AR: ind_end = LEN_AR ar_quant[ind_st:ind_end] = ar_quant[ind_st:ind_end] + 1.0 print("Finished generating multi-read mappability date at {}".format( str(datetime.now()))) del poses_start ar_quant = ar_quant / float(kmer) return ar_quant
[docs] def write_as_wig(self, uint_path, out_path, kmer, chrom): """unsigned 8-bit integer array file to wiggle For a given numeric unsigned 8-bit integer vector that is generated by Umap, this method save the wiggle file which is specific for one chromosome over one read length. :param uint_path: Path to a gzipped numeric unsigned 8-bit array :param out_path: Gzipped path for saving wiggle :param kmer: Integer defining read length :param chrom: Chromosome that the uint_path is specific to """ uniquely_mappable = self.load_uint_ar(uint_path, kmer, chrom) ar_quant = self.bin_arr_to_wig(uniquely_mappable, kmer) out_link = gzip.open(out_path, "wb") unimap_diff = np.diff(np.array(ar_quant > 0, dtype=int)) poses_start, = np.where(unimap_diff == 1) poses_end, = np.where(unimap_diff == -1) if len(poses_start) != len(poses_end): if len(poses_start) > len(poses_end): poses_end = np.append(poses_end, [len(uniquely_mappable)]) else: poses_start = np.append([0], poses_start) elif uniquely_mappable[0] == 1: poses_start = np.append([0], poses_start) poses_end = np.append(poses_end, [len(uniquely_mappable)]) for ind_st in range(len(poses_start)): pos_st = poses_start[ind_st] + 1 pos_end = poses_end[ind_st] + 1 start_line = "fixedStep chrom={} start={} step=1 span=1\n".format( chrom, pos_st) out_link.write(start_line) for each_pos in range(pos_st, pos_end): out_link.write(str(ar_quant[each_pos]) + "\n") print( "Finished saving the data to {} at {}".format( out_path, str(datetime.now()))) out_link.close()
if __name__ == "__main__": parser = argparse.ArgumentParser(description="Converts\ unionized uint8 outputs of umap/ubismap to bed files") parser.add_argument("in_dir", help="folder with <chrom>.uint8.unique.gz files") parser.add_argument("out_dir", help="Folder for writing the output files") parser.add_argument("out_label", help="File names would be kmer.<out_label>.bed.gz") parser.add_argument("-C2T", action="store_true", help="If using converted genomes specify\ -C2T or -G2A") parser.add_argument("-G2A", action="store_true", help="If using converted genomes specify\ -C2T or -G2A") parser.add_argument("-chrsize_path", default="../../chrsize.tsv", help="Path to a 2 column file of chromosome and length.\ By default it goes to ../../chrsize.tsv from\ out_dir") parser.add_argument("-WriteUnique", action="store_true", help="If -Bismap is true and want to store the merged\ uint file, specify this option") args = parser.parse_args() if not os.path.exists(args.out_dir): os.makedirs(args.out_dir) FileHandler = Int8Handler(args.in_dir, args.out_dir, args.C2T, args.G2A, args.chrsize_path) kmers = FileHandler.kmers # for kmer in kmers: for kmer in kmers: FileHandler.write_beds(args.out_label, kmer, args.WriteUnique) for chrom in FileHandler.chroms: out_path = "{}/{}.{}.{}.MultiReadMappability.wg.gz".format( args.out_dir, args.out_label, chrom, kmer) uint_path = "{}/{}.uint8.unique.gz".format( args.in_dir, chrom) kmer_num = int(kmer.replace("k", "")) FileHandler.write_as_wig(uint_path, out_path, kmer_num, chrom)