Source code for plastid.readers.psl

#!/usr/bin/env python
"""This module defines a two classes for reading `PSL`_ files (made by, for example,
`blat`_):


|PSL_Reader|
    Read a `PSL`_ file line-by-line, converting each line into a |SegmentChain|
    or |Transcript|

|BundledPSL_Reader|
    Read `PSL`_ files, returning lists of |SegmentChain| objects grouped by query sequence.
"""
__date__ = "2011-09-01"
__author__ = "joshua"
import itertools
from plastid.readers.common import AssembledFeatureReader
from plastid.genomics.roitools import SegmentChain
from plastid.util.services.exceptions import FileFormatWarning, warn


class PSL_Reader(AssembledFeatureReader):
    """
    PSL_Reader(*streams, return_type=SegmentChain, add_three_for_stop=False, tabix=False, printer=None, **kwargs)

    Read `PSL`_ files into |SegmentChain| or |Transcript| objects

    Parameters
    ----------
    *streams : file-like
        One or more open filehandles of input data.

    return_type : |SegmentChain| or subclass, optional
        Type of feature to return from assembled subfeatures
        (Default: |SegmentChain|)

    add_three_for_stop : bool, optional
        Some annotation files exclude the stop codon from CDS annotations. If set
        to `True`, three nucleotides will be added to the threeprime end of each
        CDS annotation, **UNLESS** the annotated transcript contains an explicit
        stop_codon feature. (Default: `False`)

    printer : file-like, optional
        Logger implementing a ``write()`` method. Default: |NullWriter|

    tabix : bool, optional
        `streams` point to `tabix`_-compressed files or are open
        :class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)

    **kwargs
        Other keyword arguments used by specific parsers


    Attributes
    ----------
    streams : file-like
        One or more open streams (usually filehandles) of input data.

    return_type : class
        The type of object assembled by the reader. Typically |SegmentChain|
        or a subclass thereof. Must implement a method called ``from_psl()``.

    counter : int
        Cumulative line number counter over all streams

    rejected : list
        A list of lines from the `PSL`_ file that did not assemble properly

    metadata : dict
        Various attributes gleaned from the stream, if any
    """

    def _assemble(self, line):
        """Read `PSL`_ files line-by-line into types specified by ``self.return_type``"""
        self.counter += 1
        # skip blank lines, psLayout headers, column headers, separator rows,
        # and comments
        if line.strip() == "":
            return self.__next__()
        elif line.startswith("psLayout"):
            return self.__next__()
        elif line.lstrip().startswith("match"):
            return self.__next__()
        elif line.startswith("--"):
            return self.__next__()
        elif line.startswith("#"):
            return self.__next__()
        else:
            try:
                return self.return_type.from_psl(line)
            except Exception as e:
                # malformed records are kept in ``self.rejected`` and skipped
                self.rejected.append(line)
                warn(
                    "Rejecting line %s because of %s: %s" % (self.counter, e, line),
                    FileFormatWarning
                )
                return self.__next__()
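
# A minimal usage sketch (illustrative; not part of the original module): iterate
# over an open `PSL`_ file with PSL_Reader to get one |SegmentChain| per alignment
# record. The filename below is hypothetical; ``query_name`` is the attribute set
# by ``SegmentChain.from_psl``, as also used by BundledPSL_Reader.filter below.
#
#     >>> with open("alignments.psl") as fh:
#     ...     for chain in PSL_Reader(fh):
#     ...         print(chain.attr["query_name"])
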
class BundledPSL_Reader(PSL_Reader):
    """
    BundledPSL_Reader(*streams, return_type=SegmentChain, add_three_for_stop=False, tabix=False, printer=None, **kwargs)

    Read `PSL`_ files, returning lists of |SegmentChain| objects grouped by query
    sequence. Use this when a given query sequence has multiple hits in your
    `PSL`_ file, and you want the output to be grouped.

    Parameters
    ----------
    *streams : file-like
        One or more open filehandles of input data.

    return_type : |SegmentChain| or subclass, optional
        Type of feature to return from assembled subfeatures
        (Default: |SegmentChain|)

    add_three_for_stop : bool, optional
        Some annotation files exclude the stop codon from CDS annotations. If set
        to `True`, three nucleotides will be added to the threeprime end of each
        CDS annotation, **UNLESS** the annotated transcript contains an explicit
        stop_codon feature. (Default: `False`)

    printer : file-like, optional
        Logger implementing a ``write()`` method. Default: |NullWriter|

    tabix : bool, optional
        `streams` point to `tabix`_-compressed files or are open
        :class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)

    **kwargs
        Other keyword arguments used by specific parsers
    """

    def filter(self, line):
        """Process lines of `PSL`_ input into |SegmentChain| objects, and group
        these by query sequence.

        Parameters
        ----------
        line : str
            line of `PSL`_ input

        Returns
        -------
        list
            list of |SegmentChain| objects sharing a query sequence
        """
        ltmp = []
        aln = SegmentChain.from_psl(line)
        last_name = aln.attr["query_name"]
        try:
            # collect alignments until the query name changes
            while last_name == aln.attr["query_name"]:
                ltmp.append(aln)
                line = next(self.stream)
                aln = SegmentChain.from_psl(line)

            # push the first line of the next bundle back onto the stream
            self.stream = itertools.chain([line], self.stream)
            return ltmp
        except StopIteration:
            # stream exhausted; send final bundle
            return ltmp
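
if __name__ == "__main__":
    # Minimal demonstration sketch, not part of the plastid API: group alignments
    # by query sequence with BundledPSL_Reader. It assumes a header-free `PSL`_
    # file (e.g. made with `blat`_'s ``-noHead`` option) is passed as the first
    # command-line argument; the printed fields are illustrative only.
    import sys

    with open(sys.argv[1]) as fh:
        for bundle in BundledPSL_Reader(fh):
            # each `bundle` is a list of |SegmentChain| objects sharing a query name
            print("%s\t%d hit(s)" % (bundle[0].attr["query_name"], len(bundle)))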