#!/usr/bin/env python
"""This module contains |BED_Reader|, an iterator that reads each line of a `BED`_
or :term:`extended BED` file into a |SegmentChain|, |Transcript|, or similar object. 

.. contents::
   :local:

Module contents
---------------

.. autosummary::

   BED_Reader
   bed_x_formats

Examples
--------
Read entries in a `BED`_ file as |Transcripts|. `thickEnd` and `thickStart`
columns will be interpreted as the endpoints of coding regions::

    >>> bed_reader = BED_Reader("some_file.bed",return_type=Transcript)
    >>> for transcript in bed_reader:
            pass # do something fun with each Transcript/SegmentChain

If `return_type` is unspecified, `BED`_ lines are read as |SegmentChains|::

    >>> my_chains = list(BED_Reader("some_file.bed"))
    >>> my_chains[:5]
        [list of segment chains as output...]

Open an :term:`extended BED` file, which contains additional columns for `gene_id`
and `favorite_color`. Values for these attributes will be stored in the `attr`
dict of each |Transcript|::

    >>> bed_reader = BED_Reader("some_file.bed",return_type=Transcript,extra_columns=["gene_id","favorite_color"])

Open several `Tabix`_-compressed `BED`_ files, and iterate over them as if
they were one stream::

    >>> bed_reader = BED_Reader("file1.bed.gz","file2.bed.gz",tabix=True)
    >>> for chain in bed_reader:
    >>>     pass # do something interesting with each chain

See Also
--------
`UCSC file format FAQ <http://genome.ucsc.edu/FAQ/FAQformat.html>`_.
    BED format specification at UCSC
"""
__date__ = "Aug 23, 2011"
__author__ = "joshua"

import shlex
from plastid.readers.common import AssembledFeatureReader
from plastid.util.services.exceptions import FileFormatWarning, warn

bed_x_formats = {
    "bedDetail": [
        ("ID", str),
        ("description", str),
    ],
    "narrowPeak": [("signalValue", float),
                   ("pValue", float),
                   ("qValue", float),
                   ("peak", int)],
    "broadPeak": [("signalValue", float),
                  ("pValue", float),
                  ("qValue", float)],
    "gappedPeak": [("signalValue", float),
                   ("pValue", float),
                   ("qValue", float)],
    "tagAlign": [("sequence", str),
                 ("score", float),
                 ("strand", str)],
    "pairedTagAlign": [("seq1", str),
                       ("seq2", str)],
    "peptideMapping":
    [("rawScore", float),
     ("spectrumId", str),
     ("peptideRank", int),
     ("peptideRepeatCount", int)],
}
"""Column names and types for various :term:`extended BED` formats used by 
the `ENCODE`_ project. These can be passed to the `extra_columns` keyword of
:class:`BED_Reader`."""
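
# A minimal usage sketch (the file name is hypothetical): one of the column
# specifications above can be passed directly to the ``extra_columns`` keyword
# of ``BED_Reader``, so that e.g. the four extra fields of an ENCODE narrowPeak
# file are parsed with the names and types listed in
# ``bed_x_formats["narrowPeak"]`` and stored in each feature's `attr` dict::
#
#     >>> reader = BED_Reader("my_peaks.narrowPeak", extra_columns=bed_x_formats["narrowPeak"])
#     >>> peak = next(iter(reader))
#     >>> peak.attr["signalValue"]   # stored as a float via the formatter funcs above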


class BED_Reader(AssembledFeatureReader):
    """
    BED_Reader(*streams, return_type=SegmentChain, add_three_for_stop=False, extra_columns=0, printer=None, tabix=False)

    Reads `BED`_ and :term:`extended BED` files line-by-line into |SegmentChains|
    or |Transcripts|. Metadata, if present in a track declaration, is saved in
    `self.metadata`. Malformed lines are stored in `self.rejected`, while parsing
    continues.

    Parameters
    ----------
    *streams : file-like
        One or more open filehandles of input data.

    return_type : |SegmentChain| or subclass, optional
        Type of feature to return from assembled subfeatures (Default: |SegmentChain|)

    add_three_for_stop : bool, optional
        Some annotation files exclude the stop codon from CDS annotations. If set
        to `True`, three nucleotides will be added to the threeprime end of each
        CDS annotation, **UNLESS** the annotated transcript contains an explicit
        stop_codon feature. (Default: `False`)

    extra_columns : int or list, optional
        Extra, non-BED columns in an :term:`extended BED` format file corresponding
        to feature attributes. This is common in `ENCODE`_-specific `BED`_ variants.

        If `extra_columns` is:

          - an :class:`int`, it is taken to be the number of attribute columns.
            Attributes will be stored in the `attr` dictionary of the |SegmentChain|,
            under names like `custom0`, `custom1`, ... , `customN`.

          - a :class:`list` of :class:`str`, it is taken to be the names of the
            attribute columns, in order, from left to right in the file. In this
            case, attributes in extra columns will be stored under their respective
            names in the `attr` dict.

          - a :class:`list` of :class:`tuple`, each tuple is taken to be a pair of
            `(attribute_name, formatter_func)`. In this case, the value of
            `attribute_name` in the `attr` dict of the |SegmentChain| will be set
            to `formatter_func(column_value)`.

        (Default: 0)

    printer : file-like, optional
        Logger implementing a ``write()`` method. Default: |NullWriter|

    tabix : boolean, optional
        `streams` point to `tabix`_-compressed files or are open
        :class:`~pysam.ctabix.tabix_file_iterator` objects (Default: `False`)

    Examples
    --------
    Read entries in a `BED`_ file as |Transcripts|. `thickEnd` and `thickStart`
    columns will be interpreted as the endpoints of coding regions::

        >>> bed_reader = BED_Reader(open("some_file.bed"),return_type=Transcript)
        >>> for transcript in bed_reader:
        >>>     pass # do something fun

    Open an :term:`extended BED` file that contains additional columns for `gene_id`
    and `favorite_color`. Values for these attributes will be stored in the `attr`
    dict of each |Transcript|::

        >>> bed_reader = BED_Reader(open("some_file.bed"),return_type=Transcript,extra_columns=["gene_id","favorite_color"])

    Open several `Tabix`_-compressed `BED`_ files, and iterate over them as if
    they were one uncompressed stream::

        >>> bed_reader = BED_Reader("file1.bed.gz","file2.bed.gz",tabix=True)
        >>> for chain in bed_reader:
        >>>     pass # do something more interesting

    Attributes
    ----------
    streams : file-like
        One or more open streams (usually filehandles) of input data.

    return_type : class
        The type of object assembled by the reader. Typically a |SegmentChain|
        or a subclass thereof. Must implement a method called ``from_bed()``.

    counter : int
        Cumulative line number counter over all streams

    rejected : list
        List of `BED`_ lines that could not be parsed

    metadata : dict
        Attributes declared in track line, if any

    extra_columns : int or list, optional
        Extra, non-BED columns in an :term:`extended BED` format file corresponding
        to feature attributes. This is common in `ENCODE`_-specific `BED`_ variants.

        If `extra_columns` is:

          - an :class:`int`, it is taken to be the number of attribute columns.
            Attributes will be stored in the `attr` dictionary of the |SegmentChain|,
            under names like `custom0`, `custom1`, ... , `customN`.

          - a :class:`list` of :class:`str`, it is taken to be the names of the
            attribute columns, in order, from left to right in the file. In this
            case, attributes in extra columns will be stored under their respective
            names in the `attr` dict.

          - a :class:`list` of :class:`tuple`, each tuple is taken to be a pair of
            `(attribute_name, formatter_func)`. In this case, the value of
            `attribute_name` in the `attr` dict of the |SegmentChain| will be set
            to `formatter_func(column_value)`.

        If unspecified, :class:`BED_Reader` reads the track declaration line
        (if present), and:

          - if a known track type is specified by the `type` field, it attempts
            to format the extra columns as specified by that type. Known track
            types presently include:

              - bedDetail
              - narrowPeak
              - broadPeak
              - gappedPeak
              - tagAlign
              - pairedTagAlign
              - peptideMapping

          - if not, it assumes 0 non-`BED`_ fields are present, and that all
            columns are `BED`_ formatted.
    """

    def __init__(self, *args, **kwargs):
        """
        BED_Reader(*streams, return_type=SegmentChain, add_three_for_stop=False, extra_columns=0, printer=None, tabix=False)

        Parameters
        ----------
        *streams : file-like
            One or more open filehandles of input data.

        return_type : |SegmentChain| or subclass, optional
            Type of feature to return from assembled subfeatures (Default: |SegmentChain|)

        add_three_for_stop : bool, optional
            Some annotation files exclude the stop codon from CDS annotations. If set
            to `True`, three nucleotides will be added to the threeprime end of each
            CDS annotation, **UNLESS** the annotated transcript contains an explicit
            stop_codon feature. (Default: `False`)

        extra_columns : int or list, optional
            Extra, non-BED columns in an :term:`extended BED` format file corresponding
            to feature attributes. This is common in `ENCODE`_-specific `BED`_ variants.

            If `extra_columns` is:

              - an :class:`int`, it is taken to be the number of attribute columns.
                Attributes will be stored in the `attr` dictionary of the |SegmentChain|,
                under names like `custom0`, `custom1`, ... , `customN`.

              - a :class:`list` of :class:`str`, it is taken to be the names of the
                attribute columns, in order, from left to right in the file. In this
                case, attributes in extra columns will be stored under their respective
                names in the `attr` dict.

              - a :class:`list` of :class:`tuple`, each tuple is taken to be a pair of
                `(attribute_name, formatter_func)`. In this case, the value of
                `attribute_name` in the `attr` dict of the |SegmentChain| will be set
                to `formatter_func(column_value)`.

            (Default: 0)

        printer : file-like, optional
            Logger implementing a ``write()`` method. Default: |NullWriter|

        tabix : boolean, optional
            `streams` are `tabix`_-compressed (Default: `False`)
        """
        AssembledFeatureReader.__init__(self, *args, **kwargs)
        self.extra_columns = kwargs.get("extra_columns", 0)

    def _parse_track_line(self, inp):
        """Parse a track definition line from a `BED`_ / extended BED file,
        storing its key-value pairs in `self.metadata`.

        Parameters
        ----------
        inp : str
            track definition line from `BED`_ / extended BED file
        """
        self.metadata = {}
        ltmp = shlex.split(inp.strip("\n"))
        for item in ltmp:
            k, v = item.split("=")
            self.metadata[k] = v

        track_type = self.metadata.get("type", None)
        if track_type is not None:
            if track_type in bed_x_formats:
                self.printer.write(
                    "Found track type '%s' in track definition line. "
                    "Assuming extra columns follow UCSC definitions." % track_type
                )
                if self.extra_columns == 0:
                    self.extra_columns = bed_x_formats[track_type]
                elif self.extra_columns != bed_x_formats[track_type]:
                    my_columns = self._get_extra_column_names()
                    track_format_columns = ",".join([X[0] for X in bed_x_formats[track_type]])
                    warn(
                        "Extra columns specified by %s track type declaration (%s) don't match "
                        "those specified by user (%s). Using those specified by user." %
                        (track_type, track_format_columns, my_columns), FileFormatWarning
                    )
                    self.metadata["type"] = "custom"
            else:
                self.printer.write("Found track type '%s' in track definition line." % track_type)

    def _get_extra_column_names(self):
        """Return names of extra columns in an extended BED file"""
        if isinstance(self.extra_columns, int):
            my_columns = "%s unnamed columns" % self.extra_columns
        elif isinstance(self.extra_columns, list):
            if all([isinstance(X, tuple) for X in self.extra_columns]):
                my_columns = ",".join([X[0] for X in self.extra_columns])
            elif all([isinstance(X, str) for X in self.extra_columns]):
                my_columns = ",".join(self.extra_columns)

        return my_columns

    def _assemble(self, line):
        """Read `BED`_ files line-by-line into types specified by `self.return_type`"""
        self.counter += 1
        if line.strip() == "":
            return self.__next__()
        elif line.startswith("browser"):
            return self.__next__()
        elif line.startswith("track"):
            # reset metadata
            self._parse_track_line(line[5:])
            return self.__next__()
        elif line.startswith("#"):
            return self.__next__()
        else:
            try:
                return self.return_type.from_bed(line, extra_columns=self.extra_columns)
            except:
                self.rejected.append(line)
                msg = "Cannot parse BED line number %s. " % self.counter
                if self.metadata.get("type", None) is not None:
                    msg += (
                        "Are you sure this is a %s BED file with extra columns (%s)?"
                        % (self.metadata.get("type"), self._get_extra_column_names())
                    )
                elif self.extra_columns != 0:
                    msg += (
                        "Are you sure this BED file has extra columns (%s)?"
                        % self._get_extra_column_names()
                    )
                else:
                    msg += "Maybe this BED has extra columns (i.e. is an extended BED file)?"

                msg += ("\n    %s" % line)
                warn(msg, FileFormatWarning)
                return self.__next__()
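
# A minimal sketch of the track-line handling above (file name and track line
# are hypothetical): when a file begins with a declaration such as
#
#     track type=narrowPeak name="my peaks"
#
# ``_parse_track_line()`` splits the declaration with :func:`shlex.split`, stores
# the key-value pairs in ``self.metadata``, and, because "narrowPeak" is a key of
# ``bed_x_formats`` and ``extra_columns`` was left at its default of 0, adopts
# that column specification for the rest of the file::
#
#     >>> reader = BED_Reader("auto_detected.narrowPeak.bed")
#     >>> peaks = list(reader)              # iteration parses the track line first
#     >>> reader.metadata["type"]
#     'narrowPeak'
#     >>> [name for name, func in reader.extra_columns]
#     ['signalValue', 'pValue', 'qValue', 'peak']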