#!/usr/bin/env python
"""This module contains |BED_Reader|, an iterator that reads each line of a `BED`_
or :term:`extended BED` file into a |SegmentChain|, |Transcript|, or similar object. 

.. contents::
   :local:

Module contents
---------------

.. autosummary::

   BED_Reader
   bed_x_formats

Examples
--------
Read entries in a `BED`_ file as |Transcripts|. `thickEnd` and `thickStart`
columns will be interpreted as the endpoints of coding regions::

    >>> bed_reader = BED_Reader("some_file.bed",return_type=Transcript)
    >>> for transcript in bed_reader:
            pass # do something fun with each Transcript/SegmentChain

If `return_type` is unspecified, `BED`_ lines are read as |SegmentChains|::

    >>> my_chains = list(BED_Reader("some_file.bed"))
    >>> my_chains[:5]
        [list of segment chains as output...]

Open an :term:`extended BED` file, which contains additional columns for `gene_id`
and `favorite_color`. Values for these attributes will be stored in the `attr`
dict of each |Transcript|::

    >>> bed_reader = BED_Reader("some_file.bed",return_type=Transcript,extra_columns=["gene_id","favorite_color"])

Open several `Tabix`_-compressed `BED`_ files, and iterate over them as if
they were one stream::

    >>> bed_reader = BED_Reader("file1.bed.gz","file2.bed.gz",tabix=True)
    >>> for chain in bed_reader:
    >>>     pass # do something interesting with each chain

See Also
--------
`UCSC file format FAQ <http://genome.ucsc.edu/FAQ/FAQformat.html>`_.
    BED format specification at UCSC
"""
__date__ = "Aug 23, 2011"
__author__ = "joshua"

import shlex
from plastid.readers.common import AssembledFeatureReader
from plastid.util.services.exceptions import FileFormatWarning, warn

bed_x_formats = {
    "bedDetail": [
        ("ID", str),
        ("description", str),
    ],
    "narrowPeak": [("signalValue", float),
                   ("pValue", float),
                   ("qValue", float),
                   ("peak", int)],
    "broadPeak": [("signalValue", float),
                  ("pValue", float),
                  ("qValue", float)],
    "gappedPeak": [("signalValue", float),
                   ("pValue", float),
                   ("qValue", float)],
    "tagAlign": [("sequence", str),
                 ("score", float),
                 ("strand", str)],
    "pairedTagAlign": [("seq1", str),
                       ("seq2", str)],
    "peptideMapping":
    [("rawScore", float),
     ("spectrumId", str),
     ("peptideRank", int),
     ("peptideRepeatCount", int)],
}
"""Column names and types for various :term:`extended BED` formats used by 
the `ENCODE`_ project. These can be passed to the `extra_columns` keyword of
:class:`BED_Reader`."""
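
# A minimal usage sketch (the file name is hypothetical): one of the column
# specifications above can be passed directly to the ``extra_columns`` keyword
# of ``BED_Reader``, so that e.g. the four extra fields of an ENCODE narrowPeak
# file are parsed with the names and types listed in
# ``bed_x_formats["narrowPeak"]`` and stored in each feature's `attr` dict::
#
#     >>> reader = BED_Reader("my_peaks.narrowPeak", extra_columns=bed_x_formats["narrowPeak"])
#     >>> peak = next(iter(reader))
#     >>> peak.attr["signalValue"]   # stored as a float via the formatter funcs above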


class BED_Reader(AssembledFeatureReader):
    """
    BED_Reader(*streams, return_type=SegmentChain, add_three_for_stop=False, extra_columns=0, printer=None, tabix=False)

    Reads `BED`_ and :term:`extended BED` files line-by-line into |SegmentChains|
    or |Transcripts|. Metadata, if present in a track declaration, is saved in
    `self.metadata`. Malformed lines are stored in `self.rejected`, while parsing
    continues.

    Parameters
    ----------
    *streams : file-like
        One or more open filehandles of input data.

    return_type : |SegmentChain| or subclass, optional
        Type of feature to return from assembled subfeatures (Default: |SegmentChain|)

    add_three_for_stop : bool, optional
        Some annotation files exclude the stop codon from CDS annotations. If set
        to `True`, three nucleotides will be added to the threeprime end of each
        CDS annotation, **UNLESS** the annotated transcript contains an explicit
        stop_codon feature. (Default: `False`)

    extra_columns : int or list, optional
        Extra, non-BED columns in an :term:`extended BED` format file corresponding
        to feature attributes. This is common in `ENCODE`_-specific `BED`_ variants.

        If `extra_columns` is:

          - an :class:`int`, it is taken to be the number of attribute columns.
            Attributes will be stored in the `attr` dictionary of the |SegmentChain|,
            under names like `custom0`, `custom1`, ... , `customN`.

          - a :class:`list` of :class:`str`, it is taken to be the names of the
            attribute columns, in order, from left to right in the file. In this
            case, attributes in extra columns will be stored under their respective
            names in the `attr` dict.

          - a :class:`list` of :class:`tuple`, each tuple is taken to be a pair of
            `(attribute_name, formatter_func)`. In this case, the value of
            `attribute_name` in the `attr` dict of the |SegmentChain| will be set
            to `formatter_func(column_value)`.

        (Default: 0)

    printer : file-like, optional
        Logger implementing a ``write()`` method. Default: |NullWriter|

    tabix : boolean, optional
        `streams` point to `tabix`_-compressed files or are open
        :class:`~pysam.ctabix.tabix_file_iterator` objects (Default: `False`)

    Examples
    --------
    Read entries in a `BED`_ file as |Transcripts|. `thickEnd` and `thickStart`
    columns will be interpreted as the endpoints of coding regions::

        >>> bed_reader = BED_Reader(open("some_file.bed"),return_type=Transcript)
        >>> for transcript in bed_reader:
        >>>     pass # do something fun

    Open an :term:`extended BED` file that contains additional columns for `gene_id`
    and `favorite_color`. Values for these attributes will be stored in the `attr`
    dict of each |Transcript|::

        >>> bed_reader = BED_Reader(open("some_file.bed"),return_type=Transcript,extra_columns=["gene_id","favorite_color"])

    Open several `Tabix`_-compressed `BED`_ files, and iterate over them as if
    they were one uncompressed stream::

        >>> bed_reader = BED_Reader("file1.bed.gz","file2.bed.gz",tabix=True)
        >>> for chain in bed_reader:
        >>>     pass # do something more interesting

    Attributes
    ----------
    streams : file-like
        One or more open streams (usually filehandles) of input data.

    return_type : class
        The type of object assembled by the reader. Typically a |SegmentChain|
        or a subclass thereof. Must implement a method called ``from_bed()``.

    counter : int
        Cumulative line number counter over all streams

    rejected : list
        List of `BED`_ lines that could not be parsed

    metadata : dict
        Attributes declared in track line, if any

    extra_columns : int or list, optional
        Extra, non-BED columns in an :term:`extended BED` format file corresponding
        to feature attributes. This is common in `ENCODE`_-specific `BED`_ variants.

        If `extra_columns` is:

          - an :class:`int`, it is taken to be the number of attribute columns.
            Attributes will be stored in the `attr` dictionary of the |SegmentChain|,
            under names like `custom0`, `custom1`, ... , `customN`.

          - a :class:`list` of :class:`str`, it is taken to be the names of the
            attribute columns, in order, from left to right in the file. In this
            case, attributes in extra columns will be stored under their respective
            names in the `attr` dict.

          - a :class:`list` of :class:`tuple`, each tuple is taken to be a pair of
            `(attribute_name, formatter_func)`. In this case, the value of
            `attribute_name` in the `attr` dict of the |SegmentChain| will be set
            to `formatter_func(column_value)`.

        If unspecified, :class:`BED_Reader` reads the track declaration line
        (if present), and:

          - if a known track type is specified by the `type` field, it attempts
            to format the extra columns as specified by that type. Known track
            types presently include:

              - bedDetail
              - narrowPeak
              - broadPeak
              - gappedPeak
              - tagAlign
              - pairedTagAlign
              - peptideMapping

          - if not, it assumes 0 non-`BED`_ fields are present, and that all
            columns are `BED`_ formatted.
    """

    def __init__(self, *args, **kwargs):
        """
        BED_Reader(*streams, return_type=SegmentChain, add_three_for_stop=False, extra_columns=0, printer=None, tabix=False)

        Parameters
        ----------
        *streams : file-like
            One or more open filehandles of input data.

        return_type : |SegmentChain| or subclass, optional
            Type of feature to return from assembled subfeatures (Default: |SegmentChain|)

        add_three_for_stop : bool, optional
            Some annotation files exclude the stop codon from CDS annotations. If set
            to `True`, three nucleotides will be added to the threeprime end of each
            CDS annotation, **UNLESS** the annotated transcript contains an explicit
            stop_codon feature. (Default: `False`)

        extra_columns : int or list, optional
            Extra, non-BED columns in an :term:`extended BED` format file corresponding
            to feature attributes. This is common in `ENCODE`_-specific `BED`_ variants.

            If `extra_columns` is:

              - an :class:`int`, it is taken to be the number of attribute columns.
                Attributes will be stored in the `attr` dictionary of the |SegmentChain|,
                under names like `custom0`, `custom1`, ... , `customN`.

              - a :class:`list` of :class:`str`, it is taken to be the names of the
                attribute columns, in order, from left to right in the file. In this
                case, attributes in extra columns will be stored under their respective
                names in the `attr` dict.

              - a :class:`list` of :class:`tuple`, each tuple is taken to be a pair of
                `(attribute_name, formatter_func)`. In this case, the value of
                `attribute_name` in the `attr` dict of the |SegmentChain| will be set
                to `formatter_func(column_value)`.

            (Default: 0)

        printer : file-like, optional
            Logger implementing a ``write()`` method. Default: |NullWriter|

        tabix : boolean, optional
            `streams` are `tabix`_-compressed (Default: `False`)
        """
        AssembledFeatureReader.__init__(self, *args, **kwargs)
        self.extra_columns = kwargs.get("extra_columns", 0)

    def _parse_track_line(self, inp):
        """Parse a track definition line from a `BED`_ / extended BED file,
        storing its key-value pairs in `self.metadata`.

        Parameters
        ----------
        inp : str
            track definition line from `BED`_ / extended BED file
        """
        self.metadata = {}
        ltmp = shlex.split(inp.strip("\n"))
        for item in ltmp:
            k, v = item.split("=")
            self.metadata[k] = v

        track_type = self.metadata.get("type", None)
        if track_type is not None:
            if track_type in bed_x_formats:
                self.printer.write(
                    "Found track type '%s' in track definition line. "
                    "Assuming extra columns follow UCSC definitions." % track_type
                )
                if self.extra_columns == 0:
                    self.extra_columns = bed_x_formats[track_type]
                elif self.extra_columns != bed_x_formats[track_type]:
                    my_columns = self._get_extra_column_names()
                    track_format_columns = ",".join([X[0] for X in bed_x_formats[track_type]])
                    warn(
                        "Extra columns specified by %s track type declaration (%s) don't match "
                        "those specified by user (%s). Using those specified by user." %
                        (track_type, track_format_columns, my_columns), FileFormatWarning
                    )
                    self.metadata["type"] = "custom"
            else:
                self.printer.write("Found track type '%s' in track definition line." % track_type)

    def _get_extra_column_names(self):
        """Return names of extra columns in an extended BED file"""
        if isinstance(self.extra_columns, int):
            my_columns = "%s unnamed columns" % self.extra_columns
        elif isinstance(self.extra_columns, list):
            if all([isinstance(X, tuple) for X in self.extra_columns]):
                my_columns = ",".join([X[0] for X in self.extra_columns])
            elif all([isinstance(X, str) for X in self.extra_columns]):
                my_columns = ",".join(self.extra_columns)

        return my_columns

    def _assemble(self, line):
        """Read `BED`_ files line-by-line into types specified by `self.return_type`"""
        self.counter += 1
        if line.strip() == "":
            return self.__next__()
        elif line.startswith("browser"):
            return self.__next__()
        elif line.startswith("track"):
            # reset metadata
            self._parse_track_line(line[5:])
            return self.__next__()
        elif line.startswith("#"):
            return self.__next__()
        else:
            try:
                return self.return_type.from_bed(line, extra_columns=self.extra_columns)
            except:
                self.rejected.append(line)
                msg = "Cannot parse BED line number %s. " % self.counter
                if self.metadata.get("type", None) is not None:
                    msg += (
                        "Are you sure this is a %s BED file with extra columns (%s)?"
                        % (self.metadata.get("type"), self._get_extra_column_names())
                    )
                elif self.extra_columns != 0:
                    msg += (
                        "Are you sure this BED file has extra columns (%s)?"
                        % self._get_extra_column_names()
                    )
                else:
                    msg += "Maybe this BED has extra columns (i.e. is an extended BED file)?"

                msg += ("\n    %s" % line)
                warn(msg, FileFormatWarning)
                return self.__next__()
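
# A minimal sketch of the track-line handling above (file name and track line
# are hypothetical): when a file begins with a declaration such as
#
#     track type=narrowPeak name="my peaks"
#
# ``_parse_track_line()`` splits the declaration with :func:`shlex.split`, stores
# the key-value pairs in ``self.metadata``, and, because "narrowPeak" is a key of
# ``bed_x_formats`` and ``extra_columns`` was left at its default of 0, adopts
# that column specification for the rest of the file::
#
#     >>> reader = BED_Reader("auto_detected.narrowPeak.bed")
#     >>> peaks = list(reader)              # iteration parses the track line first
#     >>> reader.metadata["type"]
#     'narrowPeak'
#     >>> [name for name, func in reader.extra_columns]
#     ['signalValue', 'pValue', 'qValue', 'peak']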