# Source code for plastid.readers.common

#!/usr/bin/env python
"""Constants, functions, and classes used by multiple readers in this subpackage


Functions & classes
-------------------
:func:`get_identical_attributes`
    Return a dictionary of all key-value pairs that are common to all `attr`
    dictionaries in a set of |SegmentChains|
    
|AssembledFeatureReader|
    Base class for readers that assemble high-level features (e.g. gapped
    alignments or transcripts) from one or more sub-features in an annotation file

"""
import itertools
import pysam

try:
    from pysam.ctabix import tabix_generic_iterator, tabix_file_iterator
except ImportError:
    from pysam.libctabix import tabix_generic_iterator, tabix_file_iterator

from plastid.util.io.filters import AbstractReader
from plastid.util.io.openers import NullWriter, multiopen
from plastid.genomics.roitools import (
    GenomicSegment,
    SegmentChain,
    Transcript,
    add_three_for_stop_codon,
)
from abc import abstractmethod

#===============================================================================
# INDEX: helper functions
#===============================================================================


def get_identical_attributes(features, exclude=None):
    """Return a dictionary of all key-value pairs that are identical for all |SegmentChains| in `features`
    
    Parameters
    ----------
    features : list 
        list of |SegmentChains|. Any iterable of objects exposing an `attr`
        dictionary is accepted; it is consumed once.

    exclude : set, optional
        attributes to exclude from identity criteria (Default: exclude nothing)
    
    Returns
    -------
    dict
        Dictionary of all key-value pairs that have identical values in all the
        `attr` dictionaries of all the features in `features`. Empty if
        `features` is empty.
    """
    # Materialize once so generators work and the emptiness check is cheap
    attr_dicts = [feature.attr for feature in features]
    if not attr_dicts:
        return {}

    excluded = set() if exclude is None else set(exclude)
    reference = attr_dicts[0]

    # The reference dictionary is deliberately compared against itself as well:
    # values that are not equal to themselves (e.g. NaN) are thereby never
    # reported as shared, matching the long-standing behavior of this function.
    return {
        key: value
        for key, value in reference.items()
        if key not in excluded
        and all(key in attr and attr[key] == value for attr in attr_dicts)
    }


#===============================================================================
# INDEX: classes
#===============================================================================


class AssembledFeatureReader(AbstractReader):
    """
    AssembledFeatureReader(*streams, return_type=SegmentChain, add_three_for_stop=False, tabix=False, printer=None, **kwargs)
    
    Abstract base class for readers that yield complex or discontinuous features
    such as transcripts or gapped alignments.
    
    For memory efficiency, all readers function as iterators. Readers built
    by subclassing |AssembledFeatureReader| are responsible for:
    
      - choosing when to yield assembled features
      
      - deciding how many subfeatures to hold in memory
      
      - overloading :meth:`~AssembledFeatureReader._assemble`
      

    Parameters
    ----------
    *streams : file-like
        One or more filenames or open filehandles of input data.
    
    return_type : |SegmentChain| or subclass, optional
        Type of feature to return from assembled subfeatures (Default: |SegmentChain|)

    add_three_for_stop : bool, optional
        Some annotation files exclude the stop codon from CDS annotations. If set to
        `True`, three nucleotides will be added to the threeprime end of each
        CDS annotation, **UNLESS** the annotated transcript contains explicit stop_codon 
        feature. (Default: `False`)
                    
    printer : file-like, optional
        Logger implementing a ``write()`` method. Default: |NullWriter|
    
    tabix : boolean, optional
        `streams` are `tabix`_-compressed (Default: `False`)

    **kwargs
        Other keyword arguments used by specific parsers
      
    
    Attributes
    ----------
    stream : iterator
        Single iterator chaining together all input streams, in the order given
    
    metadata : dict
        Various attributes gleaned from the stream, if any

    counter : int
        Cumulative line number counter over all streams

    printer : file-like
        Logger implementing a ``write()`` method.

    return_type : class
        The type of object assembled by the reader. Typically a |SegmentChain|
        or a subclass thereof.
    
    rejected : list
        A list of transcript IDs that failed to assemble properly
    """

    def __init__(self, *streams, **kwargs):
        """
        AssembledFeatureReader(*streams, return_type=SegmentChain, add_three_for_stop=False, printer=None, tabix=False, **kwargs)
        
        Parameters
        ----------
        streams : file-like
            One or more filenames or open filehandles of input data.
        
        return_type : |SegmentChain| or subclass, optional
            Type of feature to return from assembled subfeatures (Default: |SegmentChain|)

        add_three_for_stop : bool, optional
            Some annotation files exclude the stop codon from CDS annotations. If set to
            `True`, three nucleotides will be added to the threeprime end of each
            CDS annotation, **UNLESS** the annotated transcript contains explicit stop_codon 
            feature. (Default: `False`)
                        
        printer : file-like, optional
            Logger implementing a ``write()`` method. If omitted or `None`,
            a |NullWriter| is used.
        
        tabix : boolean, optional
            `streams` point to `tabix`_-compressed files or are open
            :class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)

        **kwargs
            Other keyword arguments used by specific parsers
        """
        streams = multiopen(streams, fn=open, kwargs=dict(mode="rb"))

        # Chain every input into one flat iterator of records; tabix inputs
        # are first adapted so that they also yield plain strings
        if kwargs.get("tabix", False):
            self.stream = itertools.chain.from_iterable(_tabix_iteradaptor(X) for X in streams)
        else:
            self.stream = itertools.chain.from_iterable(streams)

        self.counter = 0

        # The documented signature advertises `printer=None`, so an explicit
        # `None` must fall back to a NullWriter rather than be stored as-is
        printer = kwargs.get("printer")
        self.printer = NullWriter() if printer is None else printer

        self.return_type = kwargs.get("return_type", SegmentChain)

        # Post-processing hook applied to every assembled feature in `filter`:
        # either stop-codon extension or the identity function
        if kwargs.get("add_three_for_stop", False):
            self._finalize = add_three_for_stop_codon
        else:
            self._finalize = lambda x: x

        self.metadata = {}
        self.rejected = []

    @abstractmethod
    def _assemble(self, data):
        """Assemble features from data. This must be implemented in subclass.
        
        Parameters
        ----------
        data
            Unit of raw data handed over by :meth:`filter`

        Returns
        -------
        |SegmentChain| or subclass
            Next feature assembled from `self.stream`, type specified by `self.return_type`
        """

    def filter(self, data):
        """Assemble a feature from `data` and apply the reader's finalizing step

        `data` is passed to :meth:`_assemble`; the result is then passed through
        the finalizer chosen at construction (stop codon extension if
        `add_three_for_stop` was set, otherwise no change).
        
        Parameters
        ----------
        data
            Unit of raw data to assemble

        Returns
        -------
        |SegmentChain| or subclass
            Next feature assembled from `self.stream`, type specified by `self.return_type`
        """
        return self._finalize(self._assemble(data))


def _tabix_iteradaptor(stream):
    """Adapt `stream` into a generator of string records from `tabix`_ data
    
    Parameters
    ----------
    stream : open file-like, :class:`pysam.ctabix.tabix_file_iterator`
        Input to adapt. If it is not already a tabix iterator, it is wrapped
        in a :class:`~pysam.ctabix.tabix_file_iterator` that parses records
        with :func:`pysam.asTuple`
    
    Returns
    -------
    generator
        Generator yielding the ``str()`` form of each record in `stream`
    """
    tabix_types = (tabix_generic_iterator, tabix_file_iterator)

    if isinstance(stream, tabix_types):
        # Already a tabix iterator: iterate over it directly
        records = stream
    else:
        records = tabix_file_iterator(stream, pysam.asTuple())

    return (str(record) for record in records)