from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
"""Modified version of BioPython.bgzf module. Includes LRU buffer dictionary.
Copyright (c) 2018, Marcus D. Sherman
This code is part of the bamnostic distribution and governed by its
license. Please see the LICENSE file that should have been included
as part of this package.
Some methods are modified versions of their counterparts within the BioPython.bgzf
module. BioPython BGZF is now under a 3-clause BSD license. The same license
found in this package. Refer to LICENSE for information.
Copyright (c) 2010-2015 by Peter Cock.
Description: Read and write BGZF compressed files (the GZIP variant used in BAM).
Significant changes were made to the original BGZF module, produced by
Peter Cock. Aside from adding an LRU dictionary, the new BGZF module can read
BAM files directly, decompressing and unpacking the byte-encoded data structure
outlined in the BAM_ format.
.. _BAM: https://samtools.github.io/hts-specs/SAMv1.pdf
@author: "Marcus D. Sherman"
@copyright: "Copyright 2018, University of Michigan, Mills Lab
@email: "mdsherman<at>betteridiot<dot>tech"
"""
import sys
import zlib
import struct
import io
import os
import warnings
import array
import re
import bamnostic
from bamnostic.utils import *
_PY_VERSION = sys.version
if _PY_VERSION.startswith('2'):
from io import open
def _format_warnings(message, category, filename, lineno, file=None, line=None):
""" Warning formatter
Args:
message: warning message
category (str): level of warning
filename (str): path for warning output
lineno (int): Where the warning originates
Returns:
Formatted warning for logging purposes
"""
return ' {}:{}:{}: {}\n'.format(category.__name__, filename, lineno, message)
warnings.formatwarning = _format_warnings
# Constants used in BGZF format
_bgzf_magic = b"\x1f\x8b\x08\x04" # First 4 bytes of BAM file
_bgzf_header = b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00" # Ideal GZIP header
_bgzf_eof = b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00BC\x02\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00" # 28 null byte signature at the end of a non-truncated BAM file
_bytes_BC = b"BC" # "Payload" or Subfield Identifiers 1 & 2 of GZIP header
def _as_bytes(s):
""" Used to ensure string is treated as bytes
The output
Args:
s (str): string to convert to bytes
Returns:
byte-encoded string
Example:
>>> str(_as_bytes('Hello, World').decode()) # Duck typing to check for byte-type object
'Hello, World'
"""
if isinstance(s, bytes):
return s
return bytes(s, encoding='latin_1')
# Helper compiled structures
unpack_gzip_header = struct.Struct('<4BI2BH').unpack
_gzip_header_size = struct.calcsize('<4BI2BH')
unpack_subfields = struct.Struct('<2s2H').unpack
_subfield_size = struct.calcsize('<2s2H')
unpack_gzip_integrity = struct.Struct('<2I').unpack
_integrity_size = struct.calcsize('<2I')
unpack_bgzf_metaheader = struct.Struct('<4BI2BH2BH').unpack
_metaheader_size = struct.calcsize('<4BI2BH2BH')
def _bgzf_metaheader(handle):
""" Pull out the metadata header for a BGZF block
BAM files essentially concatenated GZIP blocks put together into a cohesive file
format. The caveat to this is that the GZIP blocks are specially formatted to contain
metadata that indicates them as being part of a larger BAM file. Due to these specifications,
these blocks are identified as BGZF blocks. Listed below are those specifications. These
specifications can be found `here <https://samtools.github.io/hts-specs/SAMv1.pdf>`_.
The following GZIP fields are to have these expected values:
==================== =========== ===========
Field: Label: Exp.Value:
==================== =========== ===========
Identifier1 **ID1** 31
Identifier2 **ID2** 139
Compression Method **CM** 8
Flags **FLG** 4
Subfield Identifier1 **SI1** 66
Subfield Identifier2 **SI2** 67
Subfield Length **SLEN** 2
==================== =========== ===========
Args:
handle (:py:obj:`file`): Open BAM file object
Returns:
:py:obj:`tuple` of (:py:obj:`tuple`, :py:obj:`bytes`): the unpacked metadata and its raw bytestring
Raises:
ValueError: if the header does not match expected values
.. _here: https://samtools.github.io/hts-specs/SAMv1.pdf
"""
meta_raw = handle.read(_metaheader_size)
meta = unpack_bgzf_metaheader(meta_raw)
ID1, ID2, CM, FLG, MTIME, XFL, OS, XLEN, SI1, SI2, SLEN = meta
# check the header integrity
checks = [
ID1 == 31,
ID2 == 139,
CM == 8,
FLG == 4,
SI1 == 66,
SI2 == 67,
SLEN == 2]
if not all(checks):
raise ValueError('Malformed BGZF block')
return meta, meta_raw
[docs]def get_block(handle, offset=0):
r""" Pulls out entire GZIP block
Used primarily for copying the header block of a BAM file. However,
it can be used to copy any BGZF block within a BAM file that starts at
the given offset.
Note:
Does not progress file cursor position.
Args:
handle (:py:obj:`file`): open BAM file
offset (int): offset of BGZF block (default: 0)
Returns:
Complete BGZF block
Raises:
ValueError: if the BGZF block header is malformed
Example:
>>> with open('./bamnostic/data/example.bam','rb') as bam:
... bam_header = get_block(bam)
... try:
... bam_header.startswith(b'\x1f\x8b\x08\x04')
... except SyntaxError:
... bam_header.startswith('\x1f\x8b\x08\x04')
True
"""
if isinstance(handle, bamnostic.core.AlignmentFile):
handle = handle._handle
with open(handle.name, 'rb') as header_handle:
header_handle.seek(offset) # get to the start of the BGZF block
# Capture raw bytes of metadata header
_, meta_raw = _bgzf_metaheader(header_handle)
BSIZE_raw = header_handle.read(2)
BSIZE = struct.unpack('<H', BSIZE_raw)[0]
# capture the CRC32 and ISIZE fields in addition to compressed data
# 6 = XLEN, 19 = spec offset, 8 = CRC32 & ISIZE -> -5
block_tail = header_handle.read(BSIZE - 5)
return meta_raw + BSIZE_raw + block_tail
def _load_bgzf_block(handle):
r"""Load the next BGZF block of compressed data (PRIVATE).
BAM files essentially concatenated GZIP blocks put together into a cohesive file
format. The caveat to this is that the GZIP blocks are specially formatted to contain
metadata that indicates them as being part of a larger BAM file. Due to these specifications,
these blocks are identified as BGZF blocks.
Args:
handle (:py:obj:`file`): open BAM file
Returns:
deflated GZIP data
Raises:
ValueError: if CRC32 or ISIZE do not match deflated data
Example:
>>> with open('./bamnostic/data/example.bam','rb') as bam:
... block = _load_bgzf_block(bam)
... try:
... block[0] == 53 and block[1].startswith(b'BAM\x01')
... except TypeError:
... block[0] == 53 and block[1].startswith('BAM\x01')
True
"""
# Pull in the BGZF block header information
header, _ = _bgzf_metaheader(handle)
XLEN = header[-4]
BSIZE = struct.unpack('<H', handle.read(2))[0]
# Expose the compressed data
d_size = BSIZE - XLEN - 19
d_obj = zlib.decompressobj(-15)
data = d_obj.decompress(handle.read(d_size)) + d_obj.flush()
# Checking data integrity
CRC32, ISIZE = unpack_gzip_integrity(handle.read(_integrity_size))
deflated_crc = zlib.crc32(data)
if deflated_crc < 0:
deflated_crc = deflated_crc % (1 << 32)
if CRC32 != deflated_crc:
raise ValueError('CRCs are not equal: is {}, not {}'.format(CRC32, deflated_crc))
if ISIZE != len(data):
raise ValueError('unequal uncompressed data size')
return BSIZE + 1, data
[docs]class BgzfReader(object):
""" The BAM reader. Heavily modified from Peter Cock's BgzfReader.
Attributes:
header: representation of header data (if present)
lengths (:py:obj:`list` of :py:obj:`int`): lengths of references listed in header
nocoordinate (int): number of reads that have no coordinates
nreferences (int): number of references in header
ref2tid (:py:obj:`dict` of :py:obj:`str`, :py:obj:`int`): refernce names and refID dictionary
references (:py:obj:`list` of :py:obj:`str`): names of references listed in header
text (str): SAM header (if present)
unmapped (int): number of unmapped reads
Note:
This implementation is likely to change. While the API was meant to
mirror `pysam`, it makes sense to include the `pysam`-like API in an extension
that will wrap the core reader. This would be a major refactor, and therefore
will not happen any time soon (30 May 2018).
"""
def __init__(self, filepath_or_object, mode="rb", max_cache=128, index_filename=None,
filename=None, check_header=False, check_sq=True, reference_filename=None,
filepath_index=None, require_index=False, duplicate_filehandle=None,
ignore_truncation=False):
"""Initialize the class.
Args:
filepath_or_object (str | :py:obj:`file`): the path or file object of the BAM file
mode (str): Mode for reading. BAM files are binary by nature (default: 'rb').
max_cache (int): number of desired LRU cache size, preferably a multiple of 2 (default: 128).
index_filename (str): path to index file (BAI) if it is named differently than the BAM file (default: None).
filename (str | :py:obj:`file`): synonym for `filepath_or_object`
check_header (bool): Obsolete method maintained for backwards compatibility (default: False)
check_sq (bool): Inspect BAM file for `@SQ` entries within the header
reference_filename (str): Not implemented. Maintained for backwards compatibility
filepath_index (str): synonym for `index_filename`
require_index (bool): require the presence of an index file or raise (default: False)
duplicate_filehandle (bool): Not implemented. Raises warning if True.
ignore_truncation (bool): Whether or not to allow trucated file processing (default: False).
"""
# Set up the LRU buffer dictionary
if max_cache < 1:
raise ValueError("Use max_cache with a minimum of 1")
self._buffers = LruDict(max_cache=max_cache)
# handle contradictory arguments caused by synonyms
if filepath_or_object and filename and filename != filepath_or_object:
raise ValueError('filepath_or_object and filename parameters do not match. Try using only one')
elif filepath_or_object:
pass
else:
if filename:
filepath_or_object = filename
else:
raise ValueError('either filepath_or_object or filename must be set')
# Check to see if file object or path was passed
if isinstance(filepath_or_object, io.IOBase):
handle = fileobj
else:
handle = open(filepath_or_object, "rb")
self._text = "b" not in mode.lower()
if 'b' not in mode.lower():
raise IOError('BAM file requires binary mode ("rb")')
# Connect to the BAM file
self._handle = handle
# Check BAM file integrity
if not ignore_truncation:
if self._check_truncation():
raise Exception('BAM file may be truncated. Turn off ignore_truncation if you wish to continue')
# Connect and process the Index file (if present)
self._index = None
if filepath_index and index_filename and index_filename != filepath_index:
raise IOError('Use index_filename or filepath_or_object. Not both')
self._check_idx = self.check_index(index_filename if index_filename else filepath_index, require_index)
self._init_index()
# Load the first block into the buffer and intialize cursor attributes
self._block_start_offset = None
self._block_raw_length = None
self._load_block(handle.tell())
# Load in the BAM header as an instance attribute
self._load_header(check_sq)
# Helper dictionary for changing reference names to refID/TID
self.ref2tid = {v[0]: k for k, v in self._header.refs.items()}
# Final exception handling
if check_header:
warnings.warn('Obsolete method', UserWarning)
if duplicate_filehandle:
warnings.warn('duplicate_filehandle not necessary as the C API for samtools is not used', UserWarning)
if reference_filename:
raise NotImplementedError('CRAM file support not yet implemented')
def _load_block(self, start_offset=None):
"""(PRIVATE) Used to load next BGZF block into the buffer, and orients the cursor position.
Args:
start_offset (int): byte offset of BGZF block (default: None)
"""
if start_offset is None:
# If the file is being read sequentially, then _handle.tell()
# should be pointing at the start of the next block.
# However, if seek has been used, we can't assume that.
start_offset = self._block_start_offset + self._block_raw_length
if start_offset == self._block_start_offset:
self._within_block_offset = 0
return
elif start_offset in self._buffers:
# Already in cache
try:
self._buffer, self._block_raw_length = self._buffers[start_offset]
except TypeError:
pass
self._within_block_offset = 0
self._block_start_offset = start_offset
return
# Now load the block
handle = self._handle
if start_offset is not None:
handle.seek(start_offset)
self._block_start_offset = handle.tell()
try:
block_size, self._buffer = _load_bgzf_block(handle)
except StopIteration:
# EOF
block_size = 0
if self._text:
self._buffer = ""
else:
self._buffer = b""
self._within_block_offset = 0
self._block_raw_length = block_size
# Finally save the block in our cache,
self._buffers[self._block_start_offset] = self._buffer, block_size
[docs] def check_index(self, index_filename=None, req_idx=False):
""" Checks to make sure index file is available. If not, it disables random access.
Args:
index_filename (str): path to index file (BAI) if it does not fit naming convention (default: None).
req_idx (bool): Raise error if index file is not present (default: False).
Returns:
(bool): True if index is present, else False
Raises:
IOError: If the index file is closed or index could not be opened
Warns:
UserWarning: If index could not be loaded. Random access is disabled.
Examples:
>>> bam = bamnostic.AlignmentFile(bamnostic.example_bam)
>>> bam.check_index(bamnostic.example_bam + '.bai')
True
>>> bam.check_index('not_a_file.bai')
False
"""
if index_filename is None:
possible_index_path = r'./{}.bai'.format(os.path.relpath(self._handle.name))
if os.path.isfile(possible_index_path):
self._index_path = possible_index_path
self._random_access = True
return True
else:
if req_idx:
raise IOError('htsfile is closed or index could not be opened')
warnings.warn("No supplied index file and '{}' was not found. Random access disabled".format(possible_index_path), UserWarning)
self._random_access = False
return False
else:
if os.path.isfile(index_filename):
self._index_path = index_filename
self._random_access = True
return True
else:
if req_idx:
raise IOError('htsfile is closed or index could not be opened')
warnings.warn("Index file '{}' was not found. Random access disabled".format(index_filename), UserWarning)
self._random_access = False
return False
def _init_index(self):
"""Initialize the index file (BAI)"""
if self._check_idx:
self._index = bamnostic.bai.Bai(self._index_path)
self.__nocoordinate = self._index.n_no_coor
self.__mapped = sum(self._index.unmapped[mapped].n_mapped for mapped in self._index.unmapped) + self.nocoordinate
self.__unmapped = sum(self._index.unmapped[unmapped].n_unmapped for unmapped in self._index.unmapped) + self.nocoordinate
@property
def nocoordinate(self):
"""Get the number of reads without coordiantes according to the statistics recorded in the index.
Returns:
(int): sum of reads without coordinates
"""
return self.__nocoordinate
@property
def mapped(self):
"""Get the number of mapped reads according to the statistics recorded in the index.
Returns:
(int): sum of mapped reads
"""
return self.__mapped
@property
def unmapped(self):
"""Get the number of unmapped reads without coordiantes **and**
without coordinate.
Returns:
(int): sum of unmapped reads and reads without coordinates
"""
return self.__unmapped + self.nocoordinate
def _check_sq(self):
""" Inspect BAM file for @SQ entries within the header
The implementation of this check is for BAM files specifically. I inspects
the SAM header (if present) for the `@SQ` entires. However, if the SAM header
is not present, will inspect the BAM header for reference sequence entries. If this
test ever returns `FALSE`, the BAM file is not operational.
Returns:
(bool): True if present, else false
Example:
>>> bam = bamnostic.AlignmentFile(bamnostic.example_bam, 'rb')
>>> bam._check_sq()
True
"""
if self._header._header_length == 0:
if not self._header.refs:
return False
else:
if 'SQ' not in self._header.SAMheader:
return False
return True
def _load_header(self, check_sq=True):
""" Loads the header into the reader object
Args:
check_sq (bool): whether to check for file header or not (default: True).
Raises:
KeyError: If 'SQ' entry is not present in BAM header
"""
self._header = BAMheader(self)
self.header = self._header.SAMheader if self._header.SAMheader else self._header
self.text = self._header._SAMheader_raw
# make compatible with pysam attributes, even though the data exists elsewhere
self.__references = []
self.__lengths = []
for n in range(self._header.n_refs):
self.__references.append(self._header.refs[n][0])
self.__lengths.append(self._header.refs[n][1])
self.__nreferences = self._header.n_refs
if check_sq:
if not self._check_sq():
raise KeyError('No SQ entries in header')
@property
def references(self):
""" Get all references used in alignment.
Returns:
(:py:obj:`tuple` of :py:obj:`str`): reference names
"""
return tuple(self.__references)
@property
def nreferences(self):
""" Get the number of references listed in alignment
Returns:
(int): count of references
"""
return self.__nreferences
@property
def lengths(self):
""" Get all reference lengths used in alignment.
Returns:
(:py:obj:`tuple` of :py:obj:`int`): reference lengths
"""
return tuple(self.__lengths)
def _check_truncation(self):
""" Confusing function to check for file truncation.
Every BAM file should contain an EOF signature within the last
28 bytes of the file. This function checks for that signature.
Returns:
(bool): True if truncated, else False
Warns:
BytesWarning: if no EOF signature found.
"""
temp_pos = self._handle.tell()
self._handle.seek(-28, 2)
eof = self._handle.read()
self._handle.seek(temp_pos)
if eof == _bgzf_eof:
return False
else:
warnings.BytesWarning('No EOF character found. File may be truncated')
return True
def pileup(self):
raise NotImplementedError('Pileup is not implemented. Consider using `fetch` instead')
[docs] def has_index(self):
"""Checks if file has index and it is open
Returns:
bool: True if present and opened, else False
"""
if self._check_idx and self._index:
return self._check_idx
[docs] def tell(self):
"""Return a 64-bit unsigned BGZF virtual offset."""
return make_virtual_offset(self._block_start_offset, self._within_block_offset)
[docs] def seek(self, virtual_offset):
"""Seek to a 64-bit unsigned BGZF virtual offset.
A virtual offset is a composite number made up of the compressed
offset (`coffset`) position of the start position of the BGZF block that
the position originates within, and the uncompressed offset (`uoffset`)
within the deflated BGZF block where the position starts. The virtual offset
is defined as
`virtual_offset = coffset << 16 | uoffset`
Args:
virtual_offset (int): 64-bit unsigned composite byte offset
Returns:
virtual_offset (int): an echo of the new position
Raises:
ValueError: if within block offset is more than block size
AssertionError: if the start position is not the block start position
Example:
>>> bam = bamnostic.AlignmentFile(bamnostic.example_bam, 'rb')
>>> bam.seek(10)
10
>>> bam.seek(bamnostic.utils.make_virtual_offset(0, 42))
Traceback (most recent call last):
...
ValueError: Within offset 42 but block size only 38
"""
# Do this inline to avoid a function call,
# start_offset, within_block = split_virtual_offset(virtual_offset)
start_offset = virtual_offset >> 16
within_block = virtual_offset ^ (start_offset << 16)
if start_offset != self._block_start_offset:
# Don't need to load the block if already there
# (this avoids a function call since _load_block would do nothing)
self._load_block(start_offset)
assert start_offset == self._block_start_offset
if within_block > len(self._buffer):
if not (within_block == 0 and len(self._buffer) == 0):
raise ValueError("Within offset %i but block size only %i"
% (within_block, len(self._buffer)))
self._within_block_offset = within_block
return virtual_offset
[docs] def read(self, size=-1):
"""Read method for the BGZF module.
Args:
size (int): the number of bytes to read from file. Advances the cursor.
Returns:
data (:py:obj:`bytes`): byte string of length `size`
Raises:
NotImplementedError: if the user tries to read the whole file
AssertionError: if read does not return any data
"""
if size < 0:
raise NotImplementedError("Don't be greedy, that could be massive!")
elif size == 0:
if self._text:
return ""
else:
return b""
elif self._within_block_offset + size <= len(self._buffer):
# This may leave us right at the end of a block
# (lazy loading, don't load the next block unless we have too)
data = self._buffer[self._within_block_offset:self._within_block_offset + size]
self._within_block_offset += size
assert data # Must be at least 1 byte
return data
else:
# if read data overflows to next block
# pull in rest of data in current block
data = self._buffer[self._within_block_offset:]
# decrement size so that we only pull the rest of the data
# from next block
size -= len(data)
self._load_block() # will reset offsets
if not self._buffer:
return data # EOF
# if there is still more to read
elif size:
# pull rest of data from next block
return data + self.read(size)
else:
# Only needed the end of the last block
return data
[docs] def readline(self):
"""Read a single line for the BGZF file.
Binary operations do not support `readline()`. Code is commented
out for posterity sake
"""
raise NotImplementedError("Readline does not work on byte data")
"""
i = self._buffer.find(self._newline, self._within_block_offset)
# Three cases to consider,
if i == -1:
# No newline, need to read in more data
data = self._buffer[self._within_block_offset:]
self._load_block() # will reset offsets
if not self._buffer:
return data # EOF
else:
# TODO - Avoid recursion
return data + self.readline()
elif i + 1 == len(self._buffer):
# Found new line, but right at end of block (SPECIAL)
data = self._buffer[self._within_block_offset:]
# Must now load the next block to ensure tell() works
self._load_block() # will reset offsets
assert data
return data
else:
# Found new line, not at end of block (easy case, no IO)
data = self._buffer[self._within_block_offset:i + 1]
self._within_block_offset = i + 1
# assert data.endswith(self._newline)
return data
"""
[docs] def fetch(self, contig=None, start=None, stop=None, region=None,
tid=None, until_eof=False, multiple_iterators=False,
reference=None, end=None):
r"""Creates a generator that returns all reads within the given region
Args:
contig (str): name of reference/contig
start (int): start position of region of interest (0-based)
stop (int): stop position of region of interest (0-based)
region (str): SAM region formatted string. Accepts tab-delimited values as well
tid (int): the refID or target id of a reference/contig
until_eof (bool): iterate until end of file
mutiple_iterators (bool): allow multiple iterators over region. Not Implemented. \
Notice: each iterator will open up a new view into the BAM file, so overhead will apply.
reference (str): synonym for `contig`
end (str): synonym for `stop`
Yields:
reads over the region of interest if any
Raises:
ValueError: if the genomic coordinates are out of range or invalid
KeyError: Reference is not found in header
Notes:
SAM region formatted strings take on the following form:
'chr1:100000-200000'
Usage:
AlignmentFile.fetch(contig='chr1', start=1, stop= 1000)
AlignmentFile.fetch('chr1', 1, 1000)
AlignmentFile.fetch('chr1:1-1000')
AlignmentFile.fetch('chr1', 1)
AlignmentFile.fetch('chr1')
Examples:
>>> bam = bamnostic.AlignmentFile(bamnostic.example_bam, 'rb')
>>> next(bam.fetch('chr1', 1, 10)) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
EAS56_57:6:190:289:82 ... MF:C:192
>>> next(bam.fetch('chr10', 1, 10))
Traceback (most recent call last):
...
KeyError: 'chr10 was not found in the file header'
>>> next(bam.fetch('chr1', 1700, 1701))
Traceback (most recent call last):
...
ValueError: Genomic region out of bounds.
>>> next(bam.fetch('chr1', 100, 10))
Traceback (most recent call last):
...
AssertionError: Malformed region: start should be <= stop, you entered 100, 10
"""
if not self._random_access:
raise ValueError('Random access not available due to lack of index file')
if multiple_iterators:
raise NotImplementedError('multiple_iterators not yet implemented')
signature = locals()
for key in ['self', 'multiple_iterators']:
signature.pop(key)
query = parse_region(**signature)
if query.tid is not None and query.contig is None:
query.contig = self.get_reference_name(tid)
elif query.tid is not None and query.contig is not None:
if self.ref2tid[query.contig] != tid:
raise ValueError('tid and contig name do not match')
elif query.contig is not None and query.tid is None:
try:
query.tid = self.ref2tid[query.contig]
except KeyError:
raise KeyError('{} was not found in the file header'.format(query.contig))
try:
if query.start > self._header.refs[query.tid][1]:
raise ValueError('Genomic region out of bounds.')
if query.stop is None:
# set end to length of chromosome
query.stop = self._header.refs[query.tid][1]
assert query.start <= query.stop, 'Malformed region: start should be <= stop, you entered {}, {}'.format(query.start, query.stop)
except KeyError:
raise KeyError('{} was not found in the file header'.format(query.contig))
# from the index, get the virtual offset of the chunk that
# begins the overlapping region of interest
first_read_block = self._index.query(query.tid, query.start, query.stop)
if first_read_block is None:
return
# move to that virtual offset...should load the block into the cache
# if it hasn't been visited before
self.seek(first_read_block)
boundary_check = True
while boundary_check:
next_read = next(self)
if not until_eof:
# check to see if the read is out of bounds of the region
if next_read.reference_name != query.contig:
boundary_check = False
if query.start < query.stop < next_read.pos:
boundary_check = False
# check for stop iteration
if next_read:
yield next_read
else:
return
else:
try:
yield next_read
except:
return
else:
return
[docs] def count(self, contig=None, start=None, stop=None, region=None,
until_eof=False, tid=None, read_callback='nofilter',
reference=None, end=None):
r"""Count the number of reads in the given region
Note: this counts the number of reads that **overlap** the given region.
Can potentially make use of a filter for the reads (or custom function
that returns `True` or `False` for each read).
Args:
contig (str): the reference name (Default: None)
reference (str): synonym for `contig` (Default: None)
start (int): 0-based inclusive start position (Default: None)
stop (int): 0-based exclusive start position (Default: None)
end (int): Synonym for `stop` (Default: None)
region (str): SAM-style region format. \
Example: 'chr1:10000-50000' (Default: None)
until_eof (bool): count number of reads from start to end of file \
Note, this can potentially be an expensive operation. \
(Default: False)
read_callback (str|function): select (or create) a filter of which \
reads to count. Built-in filters:
* `all`: skips reads that contain the following flags:
* 0x4 (4): read unmapped
* 0x100 (256): not primary alignment
* 0x200 (512): QC Fail
* 0x400 (1024): PCR or optical duplicate
* `nofilter`: uses all reads (Default)
* The user can also supply a custom function that \
returns boolean objects for each read
Returns:
(int): count of reads in the given region that meet parameters
Raises:
ValueError: if genomic coordinates are out of range or invalid or random access is disabled
RuntimeError: if `read_callback` is not properly set
KeyError: Reference is not found in header
AssertionError: if genomic region is malformed
Notes:
SAM region formatted strings take on the following form:
'chr1:100000-200000'
Usage:
AlignmentFile.count(contig='chr1', start=1, stop= 1000)
AlignmentFile.count('chr1', 1, 1000)
AlignmentFile.count('chr1:1-1000')
AlignmentFile.count('chr1', 1)
AlignmentFile.count('chr1')
Example:
>>> bam = bamnostic.AlignmentFile(bamnostic.example_bam, 'rb')
>>> bam.count('chr1', 1, 100)
3
>>> bam.count('chr1', 1, 100, read_callback='all')
2
>>> bam.count('chr10', 1, 10)
Traceback (most recent call last):
...
KeyError: 'chr10 was not found in the file header'
>>> bam.count('chr1', 1700, 1701)
Traceback (most recent call last):
...
ValueError: Genomic region out of bounds.
>>> bam.count('chr1', 100, 10)
Traceback (most recent call last):
...
AssertionError: Malformed region: start should be <= stop, you entered 100, 10
"""
# pass the signature to fetch
signature = locals()
signature.pop('read_callback')
signature.pop('self')
roi_reads = self.fetch(**signature)
# make `nofilter` the default filter unless told otherwise
# read_callback = kwargs.get('read_callback', 'nofilter')
# go through all the reads over a given region and count them
count = 0
for read in roi_reads:
if filter_read(read, read_callback):
count += 1
return count
[docs] def count_coverage(self, contig=None, start=None, stop=None, region=None,
quality_threshold=15, read_callback='all',
reference=None, end=None, base_quality_threshold=0):
""" Counts the coverage of each base supported by a read the given interval.
Given an interval (inclusive, inclusive), this method pulls each read that overlaps
with the region. To ensure that the read truly overlaps with the region, the CIGAR string
is required. These reads can further be filtered out by their flags, MAPQ qualities, or
custom filtering function. Using the CIGAR string, the aligned portion of the read
is traversed and the presence of each nucleotide base is tallied into respective arrays.
Additionally, the user can choose to filter the counted bases based on its base quality
score that is stored in the quality string.
Args:
contig (str): the reference name (Default: None)
reference (str): synonym for `contig` (Default: None)
start (int): 0-based inclusive start position (Default: None)
stop (int): 0-based exclusive start position (Default: None)
end (int): Synonym for `stop` (Default: None)
region (str): SAM-style region format. \
Example: 'chr1:10000-50000' (Default: None)
quality_threshold (int): MAPQ quality threshold (Default: 15)
base_quality_threshold (int): base quality score threshold (Default: 0)
read_callback (str|function): select (or create) a filter of which \
reads to count. Built-in filters:
* `all`: skips reads that contain the following flags:
* 0x4 (4): read unmapped
* 0x100 (256): not primary alignment
* 0x200 (512): QC Fail
* 0x400 (1024): PCR or optical duplicate
* `nofilter`: uses all reads (Default)
* The user can also supply a custom function that \
returns boolean objects for each read
Returns:
(:py:obj:`array.array`): Four arrays in the order of **A**, **C**, **G**, **T**
Raises:
ValueError: if genomic coordinates are out of range or invalid, random access is disabled, or nucleotide base is unrecognized
RuntimeError: if `read_callback` is not properly set
KeyError: Reference is not found in header
AssertionError: if genomic region is malformed
Notes:
SAM region formatted strings take on the following form:
'chr1:100-200'
Usage:
AlignmentFile.count_coverage(contig='chr1', start=1, stop= 1000)
AlignmentFile.count_coverage('chr1', 1, 1000)
AlignmentFile.count_coverage('chr1:1-1000')
AlignmentFile.count_coverage('chr1', 1)
AlignmentFile.count_coverage('chr1')
Examples:
>>> bam = bamnostic.AlignmentFile(bamnostic.example_bam, 'rb')
>>> bam.count_coverage('chr1', 100, 150) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
(array('L', [0, 0, 0, 0, ..., 0, 0, 0, 0, 0]),
array('L', [0, 0, 0, 0, ..., 0, 0, 0, 0, 0]),
array('L', [1, 1, 2, 2, ..., 0, 14, 0, 14, 14]),
array('L', [0, 0, 0, 0, ..., 15, 0, 14, 0, 0]))
>>> bam.count_coverage('chr1', 100, 150, quality_threshold=20, base_quality_threshold=25) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
(array('L', [0, 0, 0, 0, ..., 0, 0, 0, 0, 0]),
array('L', [0, 0, 0, 0, ..., 0, 0, 0, 0, 0]),
array('L', [1, 1, 2, 2, ..., 0, 14, 0, 13, 11]),
array('L', [0, 0, 0, 0,..., 14, 0, 13, 0, 0]))
"""
signature = locals()
for key in ['self', 'quality_threshold', 'read_callback', 'base_quality_threshold']:
signature.pop(key)
adenine = array.array('L', [0] * (stop - start + 1))
cytosine = adenine[:]
guanine = adenine[:]
thymine = adenine[:]
for read in self.fetch(**signature):
if read.cigarstring is not None and read.mapq >= quality_threshold:
if filter_read(read, read_callback):
for base, index in cigar_alignment(seq=read.seq, cigar=read.cigarstring,
start_pos=read.pos, qualities=read.query_qualities,
base_qual_thresh=base_quality_threshold):
if start <= index <= stop:
if base == 'A':
adenine[index - start] += 1
elif base == 'G':
guanine[index - start] += 1
elif base == 'C':
guanine[index - start] += 1
elif base == 'T':
thymine[index - start] += 1
else:
raise ValueError('Read base was {}, not A, T, C, or G'.format(base))
return adenine, cytosine, guanine, thymine
[docs] def get_index_stats(self):
""" Inspects the index file (BAI) for alignment statistics.
Every BAM index file contains metrics regarding the alignment
process for the given BAM file. The stored data are the number
of mapped and unmapped reads for a given reference. Unmapped reads
are paired end reads where only one part is mapped. Additionally,
index files also contain the number of unplaced unmapped reads. This
is stored within the `nocoordinate` instance attribute (if present).
Returns:
idx_stats (:py:obj:`list` of :py:obj:`tuple`): list of tuples for each reference in the order seen in the header. Each tuple contains the number of mapped reads, unmapped reads, and the sum of both.
Raises:
AssertionError: if the index file is not available
Example:
>>> bam = bamnostic.AlignmentFile(bamnostic.example_bam, 'rb')
>>> bam.get_index_stats()
[(1446, 18, 1464), (1789, 17, 1806)]
>>> bam_no_bai = bamnostic.AlignmentFile(bamnostic.example_bam, 'rb', index_filename='not_a_file.bai')
>>> bam_no_bai.get_index_stats()
Traceback (most recent call last):
...
AssertionError: No index available
"""
assert self._check_idx, 'No index available'
idx_stats = []
for ref in range(self._header.n_refs):
try:
mapped = self._index.unmapped[ref].n_mapped
unmapped = self._index.unmapped[ref].n_unmapped
idx_stats.append((mapped, unmapped, mapped + unmapped))
except KeyError:
idx_stats.append((0, 0, 0))
return idx_stats
[docs] def is_valid_tid(self, tid):
""" Return `True` if TID/RefID is valid.
Returns:
`True` if TID/refID is valid, else `False`
Example:
>>> bam = bamnostic.AlignmentFile(bamnostic.example_bam, 'rb')
>>> bam.is_valid_tid(0)
True
>>> bam.is_valid_tid(10) # because there are only 2 in this file
False
"""
return True if tid in self._header.refs else False
[docs] def get_reference_name(self, tid):
""" Convert TID/refID to reference name.
The TID/refID is the position a reference sequence is seen
within the header file of the BAM file. The references are
sorted by ASCII order. Therefore, for a **Homo sapien** aligned
to GRCh38, 'chr10' comes before 'chr1' in the header. Therefore,
'chr10' would have the TID/refID of 0, not 'chr1'.
Args:
tid (int): TID/refID of desired reference/contig
Returns:
String representation of chromosome if valid, else None
Raises:
KeyError: if TID/refID is not valid
Examples:
>>> bam = bamnostic.AlignmentFile(bamnostic.example_bam, 'rb')
>>> bam.get_reference_name(0)
\'chr1'
>>> bam.get_reference_name(10)
Traceback (most recent call last):
...
KeyError: '10 is not a valid TID/refID for this file.'
"""
if self.is_valid_tid(tid):
return self._header.refs[tid][0]
else:
raise KeyError('{} is not a valid TID/refID for this file.'.format(tid))
[docs] def get_tid(self, reference):
""" Convert reference/contig name to refID/TID.
The TID/refID is the position a reference sequence is seen
within the header file of the BAM file. The references are
sorted by ASCII order. Therefore, for a **Homo sapien** aligned
to GRCh38, 'chr10' comes before 'chr1' in the header. Therefore,
'chr10' would have the TID/refID of 0, not 'chr1'.
Args:
reference (str): reference/contig name
Returns:
(int): the TID/refID of desired reference/contig
Raises:
KeyError: if reference name not found file header
Example:
>>> bam = bamnostic.AlignmentFile(bamnostic.example_bam, 'rb')
>>> bam.get_tid('chr1')
0
>>> bam.get_tid('chr10')
Traceback (most recent call last):
...
KeyError: 'chr10 was not found in the file header'
"""
tid = self.ref2tid.get(reference, -1)
if tid == -1:
raise KeyError('{} was not found in the file header'.format(reference))
return tid
[docs] def mate(self, AlignedSegment):
""" Gets the mate to a given AlignedSegment.
Note:
Slow, when compared to the C-API. Not meant for high-throughput analysis.
Does not advance current iterator position.
Args:
AlignedSegment (:py:class:`bamnostic.AlignedSegment`): a bamnostic AlignedSegment read with a mate
Returns:
(:py:class:`bamnostic.AlignedSegment`): if read has a valid mate, else None
Raises:
ValueError: if AlignedSegment is unpaired
"""
with bamnostic.AlignmentFile(self._handle.name, index_filename=self._index_path) as mate_head:
# Don't look if there isn't a pair
if not AlignedSegment.is_paired:
raise ValueError('Read is unpaired')
# Based on standard convention
read_name_base = read_name_pat.search(AlignedSegment.read_name).groupdict()['read_name']
rnext = AlignedSegment.next_reference_id
# Look for available mate information
if rnext < 0:
return None # Information is unavailable
pnext = AlignedSegment.next_reference_start
if pnext < 0:
return None # no information available on read
mate_gen = mate_head.fetch(tid=rnext, start=pnext, stop=pnext + 1)
for read in mate_gen:
if read.read_name.split('/')[0].split('#')[0] == read_name_base:
if AlignedSegment.is_read1 is not read.is_read1:
return read
[docs] def head(self, n=5, multiple_iterators=False):
""" List out the first **n** reads of the file.
This method is primarily used when doing an initial exploration
of the data. Whether or not `multiple_iterators` is used, cursor
position within the file will not change.
Note:
Using `multiple_interators` opens a new file object of the
same file currently in use and, thus, impacts the memory
footprint of your analysis.
Args:
n (int): number of aligned reads to print (default: 5)
mutliple_iterators (bool): Whether to use current file object or create a new one (default: False).
Returns:
head_reads (:py:obj:`list` of :py:obj:`AlignedSegment`): list of **n** reads from the front of the BAM file
Example:
>>> bam = bamnostic.AlignmentFile(bamnostic.example_bam, 'rb')
>>> bam.head(n=5)[0] # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
EAS56_57:6:190:289:82 ... MF:C:192
>>> bam.head(n = 5, multiple_iterators = True)[1] # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
EAS56_57:6:190:289:82 ... H1:C:0
"""
if multiple_iterators:
head_iter = bamnostic.AlignmentFile(self._handle.name, index_filename=self._index_path)
else:
curr_pos = self.tell()
# BAMheader uses byte specific positions (and not BGZF virtual offsets)
self._handle.seek(self._header._BAMheader_end)
self._load_block(self._handle.tell())
head_iter = self
head_reads = [next(head_iter) for read in range(n)]
if multiple_iterators:
# close the independent file object
head_iter.close()
else:
# otherwise, just go back to old position
self.seek(curr_pos)
assert self.tell() == curr_pos
return head_reads
def __next__(self):
"""Return the next line (Py2 Compatibility)."""
read = bamnostic.AlignedSegment(self)
if not read:
raise StopIteration
return read
[docs] def next(self):
"""Return the next line."""
read = bamnostic.AlignedSegment(self)
if not read:
raise StopIteration
return read
def __iter__(self):
"""Iterate over the lines in the BGZF file."""
return self
[docs] def close(self):
"""Close BGZF file."""
self._handle.close()
self._buffer = None
self._block_start_offset = None
self._buffers = None
[docs] def seekable(self):
"""Return True indicating the BGZF supports random access.
Note:
Modified from original Bio.BgzfReader: checks to see if BAM
file has associated index file (BAI)
"""
return self._check_idx
[docs] def isatty(self):
"""Return True if connected to a TTY device."""
return False
[docs] def fileno(self):
"""Return integer file descriptor."""
return self._handle.fileno()
def __enter__(self):
"""Open a file operable with WITH statement."""
return self
def __exit__(self, type, value, traceback):
"""Close a file with WITH statement."""
self.close()
[docs]class BgzfWriter(object):
"""Define a BGZFWriter object."""
def __init__(self, filepath_or_object, mode="wb", compresslevel=6):
"""Initialize the class."""
if isinstance(filepath_or_object, io.IOBase):
handle = fileobj
else:
handle = open(filepath_or_object, "wb")
"""
if fileobj:
assert filename is None
handle = fileobj
else:
if "w" not in mode.lower() and "a" not in mode.lower():
raise ValueError("Must use write or append mode, not %r" % mode)
if "a" in mode.lower():
handle = open(filename, "ab")
else:
handle = open(filename, "wb")
"""
self._text = "b" not in mode.lower()
self._handle = handle
self._buffer = b""
self.compresslevel = compresslevel
def _write_block(self, block):
"""Write provided data to file as a single BGZF compressed block (PRIVATE)."""
# print("Saving %i bytes" % len(block))
start_offset = self._handle.tell()
assert len(block) <= 65536
# Giving a negative window bits means no gzip/zlib headers,
# -15 used in samtools
c = zlib.compressobj(self.compresslevel,
zlib.DEFLATED,
-15,
zlib.DEF_MEM_LEVEL,
0)
compressed = c.compress(block) + c.flush()
del c
assert len(compressed) < 65536, \
"TODO - Didn't compress enough, try less data in this block"
crc = zlib.crc32(block)
# Should cope with a mix of Python platforms...
if crc < 0:
crc = struct.pack("<i", crc)
else:
crc = struct.pack("<I", crc)
bsize = struct.pack("<H", len(compressed) + 25) # includes -1
crc = struct.pack("<I", zlib.crc32(block) & 0xffffffff)
uncompressed_length = struct.pack("<I", len(block))
# Fixed 16 bytes,
# gzip magic bytes (4) mod time (4),
# gzip flag (1), os (1), extra length which is six (2),
# sub field which is BC (2), sub field length of two (2),
# Variable data,
# 2 bytes: block length as BC sub field (2)
# X bytes: the data
# 8 bytes: crc (4), uncompressed data length (4)
data = _bgzf_header + bsize + compressed + crc + uncompressed_length
self._handle.write(data)
[docs] def write(self, data):
"""Write method for the class."""
# TODO - Check bytes vs unicode
data = _as_bytes(data)
# block_size = 2**16 = 65536
data_len = len(data)
if len(self._buffer) + data_len < 65536:
# print("Cached %r" % data)
self._buffer += data
return
else:
# print("Got %r, writing out some data..." % data)
self._buffer += data
while len(self._buffer) >= 65536:
self._write_block(self._buffer[:65536])
self._buffer = self._buffer[65536:]
[docs] def flush(self):
"""Flush data explicitly."""
while len(self._buffer) >= 65536:
self._write_block(self._buffer[:65535])
self._buffer = self._buffer[65535:]
self._write_block(self._buffer)
self._buffer = b""
self._handle.flush()
[docs] def close(self):
"""Flush data, write 28 bytes BGZF EOF marker, and close BGZF file.
samtools will look for a magic EOF marker, just a 28 byte empty BGZF
block, and if it is missing warns the BAM file may be truncated. In
addition to samtools writing this block, so too does bgzip - so this
implementation does too.
"""
if self._buffer:
self.flush()
self._handle.write(_bgzf_eof)
self._handle.flush()
self._handle.close()
[docs] def tell(self):
"""Return a BGZF 64-bit virtual offset."""
return make_virtual_offset(self._handle.tell(), len(self._buffer))
[docs] def seekable(self):
"""Return True indicating the BGZF supports random access."""
# Not seekable, but we do support tell...
return False
[docs] def isatty(self):
"""Return True if connected to a TTY device."""
return False
[docs] def fileno(self):
"""Return integer file descriptor."""
return self._handle.fileno()
def __enter__(self):
"""Open a file operable with WITH statement."""
return self
def __exit__(self, type, value, traceback):
"""Close a file with WITH statement."""
self.close()