#!/usr/bin/env python
"""Tools for reading, writing, analyzing, and manipulating GFF file subtypes
(e.g. `GTF2`_ and `GFF3`_).

.. contents::
   :local:


Summary
-------

Because `GTF2`_/`GFF3`_ files are hierarchically structured -- i.e. a complex
feature can be assembled from several component features, each of which has
its own record on its own line -- two interfaces for reading
`GTF2`_/`GFF3`_ files are included:

    Assembly of transcripts from exon, CDS, & UTR annotations
        |GTF2_TranscriptAssembler| and |GFF3_TranscriptAssembler| collect
        individual exon and CDS features, and assemble these into |Transcripts|.

        Features are read from `GTF2`_/`GFF3`_ files and grouped by their
        `transcript_id`, `Parent`, or `ID` attributes, depending on file type.
        Assembled |Transcripts| are yielded only once all of their component
        features have been collected.

    Low-level parsing of simple features
        |GTF2_Reader| and |GFF3_Reader| read raw features (such as individual
        exons, stop codons, SNPs, et c) from `GTF2`_/`GFF3`_ files. Each line
        is returned as a |SegmentChain|.

Module contents
---------------

.. autosummary::

   GTF2_Reader
   GTF2_TranscriptAssembler
   GFF3_Reader
   GFF3_TranscriptAssembler


Examples
--------

|GTF2_Reader| and |GFF3_Reader| return raw, unmodified features from `GTF2`_ or
`GFF3`_ files -- e.g. exons, coding regions, stop codons -- without assembling
them into transcripts::

    >>> feature_reader = GTF2_Reader("some_file.gtf")
    >>> for feature in feature_reader:
    >>>     print(feature.get_name(), feature.attr["type"], str(feature))
    ('YAL030W_mRNA',  'exon',        'chrI:87262-87387(+)')
    ('YAL030W_mRNA',  'exon',        'chrI:87500-87857(+)')
    ('YAL030W_mRNA',  'CDS',         'chrI:87285-87387(+)')
    ('YAL030W_mRNA',  'CDS',         'chrI:87500-87749(+)')
    ('YAL030W_mRNA',  'start_codon', 'chrI:87285-87288(+)')
    ('YAL030W_mRNA',  'stop_codon',  'chrI:87749-87752(+)')
    ('YBL092W_mRNA',  'exon',        'chrII:45643-45644(+)')
    ('YBL092W_mRNA',  'exon',        'chrII:45977-46440(+)')
    ('YBL092W_mRNA',  'CDS',         'chrII:45977-46367(+)')
    ('YBL092W_mRNA',  'start_codon', 'chrII:45977-45980(+)')
    [rest of output omitted]

In contrast, |GTF2_TranscriptAssembler| and |GFF3_TranscriptAssembler| reconstruct
transcripts from their components, based upon their `transcript_id`, `ID`, or
`Parent` attributes. Note how all features are of type `mRNA`, and how some
contain multiple exons (coordinates separated by `'^'`)::

    >>> transcript_reader = GTF2_TranscriptAssembler("some_file.gtf")
    >>> for transcript in transcript_reader:
    >>>     print(transcript.get_name(), transcript.attr["type"], str(transcript))
    ('YAL030W_mRNA',   'mRNA',  'chrI:87262-87387^87500-87857(+)')
    ('YBL092W_mRNA',   'mRNA',  'chrII:45643-45644^45977-46440(+)')
    ('YBL057C_mRNA',   'mRNA',  'chrII:112749-113427^113444-113450(-)')
    ('YBL040C_mRNA',   'mRNA',  'chrII:142033-142749^142846-142891(-)')
    ('YBL018C_mRNA',   'mRNA',  'chrII:185961-186352^186427-186504(-)')
    ('YBR012W-B',      'mRNA',  'chrII:259868-261173^261174-265140(+)')
    ('YBR044C_mRNA',   'mRNA',  'chrII:324292-324336^324340-326127(-)')
    ('YBR082C_mRNA',   'mRNA',  'chrII:406506-407027^407122-407379(-)')
    ('YBR126W-B_mRNA', 'mRNA',  'chrII:490824-491202(+)')
    ('YBR138C_mRNA',   'mRNA',  'chrII:513636-515391(-)')
    [rest of output omitted]
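
Transcripts that fail to assemble (for example, because their exons lie on
multiple chromosomes or strands) are skipped; their IDs are collected in the
assembler's ``rejected`` attribute, which can be inspected after iteration.
Continuing the example above::

    >>> failed_ids = transcript_reader.rejected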


See Also
--------
`GFF3 specification <http://song.sourceforge.net/gff3.shtml>`_
    GFF3 specification by the Sequence Ontology consortium

`GTF2.2 specification <http://mblab.wustl.edu/GTF22.html>`_
    Hosted by the Brent lab

`UCSC file format FAQ <http://genome.ucsc.edu/FAQ/FAQformat.html>`_
    GFF & GTF descriptions at UCSC
"""
__author__ = "joshua"
__date__ = "$Dec 1, 2010 11:00:55 AM$"
import itertools
import gc
import copy
import sys
from abc import abstractmethod
from plastid.util.io.filters import AbstractReader, SkipBlankReader
from plastid.util.io.openers import NullWriter, multiopen
from plastid.readers.common import get_identical_attributes, \
                                   AssembledFeatureReader
from plastid.genomics.roitools import Transcript, SegmentChain, \
                                      GenomicSegment, add_three_for_stop_codon
from plastid.readers.gff_tokens import parse_GFF3_tokens, parse_GTF2_tokens
from plastid.util.services.exceptions import DataWarning, warn



#===============================================================================
# INDEX: SO v2.5.3 feature types
#   see: http://www.sequenceontology.org/resources/intro.html
#===============================================================================

_DEFAULT_GFF3_GENE_TYPES = {
    "gene",
    "candidate_gene",
    "functional_candidate_gene",
    "positional_candidate_gene",
    "cryptic_gene",
    "cryptogene",
    "engineered_gene",
    "engineered_foreign_gene",
    "engineered_foreign_transposable_element_gene",
    "engineered_fusion_gene",
    "epigenetically_modified_gene",
    "allelically_excluded_gene",
    "gene_rearranged_at_DNA_level",
    "maternally_imprinted_gene",
    "paternally_imprinted_gene",
    "foreign_gene",
    "fusion_gene",
    "gene_cassette",
    "gene_with_non_canonical_start_codon",
    "gene_with_start_codon_CUG",
    "gene_with_polycistronic_transcript",
    "gene_with_dicistronic_transcript",
    "gene_with_dicistronic_mRNA",
    "gene_with_dicistronic_primary_transcript",
    "gene_with_trans_spliced_transcript",
    "mt_gene",
    "kinetoplast_gene",
    "maxicircle_gene",
    "minicircle_gene",
    "ncRNA_gene",
    "gRNA_gene",
    "lincRNA_gene",
    "miRNA_gene",
    "piRNA_gene",
    "RNase_MRP_RNA_gene",
    "RNase_P_RNA_gene",
    "rRNA_gene",
    "scRNA_gene",
    "snoRNA_gene",
    "snRNA_gene",
    "SRP_RNA_gene",
    "telomerase_RNA_gene",
    "tmRNA_gene",
    "tRNA_gene",
    "negatively_autoregulated_gene",
    "nuclear_gene",
    "nucleomorph_gene",
    "plasmid_gene",
    "plastid_gene",
    "apicoplast_gene",
    "chromoplast_gene",
    "ct_gene",
    "cyanelle_gene",
    "leucoplast_gene",
    "proplastid_gene",
    "positively_autoregulated_gene",
    "post_translationally_regulated_gene",
    "predicted_gene",
    "protein_coding_gene",
    "gene_with_edited_transcript",
    "gene_with_mRNA_with_frameshift",
    "gene_with_polyadenylated_mRNA",
    "gene_with_recoded_mRNA",
    "gene_with_mRNA_recoded_by_translational_bypass",
    "gene_with_stop_codon_read_through",
    "gene_with_stop_codon_redefined_as_pyrrollysine",
    "gene_with_stop_codon_redefined_as_selenocysteine",
    "gene_with_transcript_with_translational_frameshift", "proviral_gene",
    "endogenous_retroviral_gene", "non_functional_homolog_of_pseudogene",
    "non_processed_pseudogene", "cassette_pseudogene", "duplicated_pseudogene",
    "nuclear_mt_pseudogene", "pseudogene_by_unequal_crossing_over", "unitary_pseudogene",
    "polymorphic_pseudogene", "processed_pseudogene", "transposable_element_pseudogene",
    "recombinatorially_rearranged_gene", "recombinatorially_inverted_gene",
    "recombinatorially_rearranged_vertebrate_immune_system_gene", "rescue_gene",
    "wild_type_rescue_gene", "retrogene", "silenced_gene", "gene_silenced_by_DNA_modification",
    "gene_silenced_by_DNA_methylation", "gene_silenced_by_histone_modification",
    "gene_silenced_by_histone_deacetylation", "gene_silenced_by_histone_methylation",
    "gene_silenced_by_RNA_interference", "transgene", "floxed_gene",
    "translationally_regulated_gene", "transposable_element_gene",
    "engineered_foreign_transposable_element_gene",
}
"""GFF3 gene types as annotated by `SO 2.5.3 <http://www.sequenceontology.org/resources/intro.html>`_"""

_DEFAULT_GFF3_TRANSCRIPT_TYPES = {
    "transcript",
    "mature_transcript",
    "enzymatic_RNA",
    "mRNA",
    "mRNA_with_frameshift",
    "mRNA_with_minus_1_frameshift",
    "mRNA_with_minus_2_frameshift",
    "mRNA_with_plus_1_frameshift",
    "mRNA_with_plus_2_frameshift",
    "polyadenylated_mRNA",
    "polycistronic_mRNA",
    "dicistronic_mRNA",
    "monocistronic_mRNA",
    "recoded_mRNA",
    "mRNA_recoded_by_codon_redefinition",
    "mRNA_recoded_by_translational_bypass",
    "trans_spliced_mRNA",
    "ncRNA",
    "antisense_RNA",
    "MicF_RNA",
    "class_I_RNA",
    "class_II_RNA",
    "enhancerRNA",
    "guide_RNA",
    "lnc_RNA",
    "antisense_lncRNA",
    "intronic_lncRNA",
    "lincRNA",
    "piRNA",
    "priRNA",
    "rasiRNA",
    "RNase_MRP_RNA",
    "RNase_P_RNA",
    "rRNA",
    "large_subunit_rRNA",
    "rRNA_21S",
    "rRNA_23S",
    "rRNA_25S",
    "rRNA_28S",
    "rRNA_5_8S",
    "rRNA_5S",
    "small_subunit_rRNA",
    "rRNA_16S",
    "rRNA_18S",
    "rRNA_cleavage_RNA",
    "scRNA",
    "shRNA",
    "siRNA",
    "small_regulatory_ncRNA",
    "CsrB_RsmB_RNA",
    "DsrA_RNA",
    "GcvB_RNA",
    "IoR",
    "miRNA",
    "moR",
    "OxyS_RNA",
    "RNA_6S",
    "RprA_RNA",
    "RRE_RNA",
    "spot42_RNA",
    "tmRNA",
    "snoRNA",
    "C_D_box_snoRNA",
    "methylation_guide_snoRNA",
    "U14_snoRNA",
    "U3_snoRNA",
    "H_ACA_box_snoRNA",
    "pseudouridylation_guide_snoRNA",
    "snRNA",
    "U11_snRNA",
    "U12_snRNA",
    "U1_snRNA",
    "U2_snRNA",
    "U4_snRNA",
    "U4atac_snRNA",
    "U5_snRNA",
    "U6_snRNA",
    "U6atac_snRNA",
    "SRP_RNA",
    "tasiRNA",
    "telomerase_RNA",
    "telomeric_transcript",
    "anti_ARRET",
    "ARIA",
    "ARRET",
    "TERRA",
    "tRNA",
    "alanyl_tRNA",
    "arginyl_tRNA",
    "asparaginyl_tRNA",
    "aspartyl_tRNA",
    "cysteinyl_tRNA",
    "glutaminyl_tRNA",
    "glutamyl_tRNA",
    "glycyl_tRNA",
    "histidyl_tRNA",
    "isoleucyl_tRNA",
    "leucyl_tRNA",
    "lysyl_tRNA",
    "methionyl_tRNA",
    "phenylalanyl_tRNA",
    "prolyl_tRNA",
    "pyrrollysyl_tRNA",
    "selenocysteinyl_tRNA",
    "seryl_tRNA",
    "threonyl_tRNA",
    "tryptophanyl_tRNA",
    "tyrosyl_tRNA",
    "valyl_tRNA",
    "vault_RNA",
    "Y_RNA",
    "edited_transcript",
    "edited_mRNA",
    "edited_transcript_by_A_to_I_substitution",
    "non_functional_homolog_of_pseudogenic_transcript",
    "pseudogenic_transcript",
}
"""GFF3 mature transcript types as annotated by `SO 2.5.3
<http://www.sequenceontology.org/resources/intro.html>`_"""

_DEFAULT_GFF3_EXON_TYPES = {
    "exon",
    "coding_exon",
    "noncoding_exon",
    "exon_of_single_exon_gene",
    "interior_exon",
    "interior_coding_exon",
    "five_prime_coding_exon",
    "three_prime_coding_exon",
    "five_prime_noncoding_exon",
    "three_prime_noncoding_exon",
    "pseudogenic_exon",
}
"""GFF3 exon feature types as annotated by `SO 2.5.3
<http://www.sequenceontology.org/resources/intro.html>`_"""

_DEFAULT_GFF3_CDS_TYPES = {
    "CDS",
    "CDS_fragment",
    "CDS_independently_known",
    "CDS_predicted",
}
"""GFF3 CDS feature types as annotated by `SO 2.5.3
<http://www.sequenceontology.org/resources/intro.html>`_"""

#===============================================================================
# INDEX: Readers for GFF formats
#===============================================================================

StopFeature = SegmentChain(GenomicSegment("Stop", 0, 1, "."), type="StopFeature", ID="StopFeature")
"""Special |SegmentChain| emitted from GFF readers when:

- the special line ``###`` is encountered
- the special line ``##FASTA`` is encountered
- a GFF file is marked as sorted, and the contig/chromosome changes
- the source stream of features is changed

indicating that all previously returned features may be assembled into full
objects.

.. note::

   Because :obj:`StopFeature` is zero-length, it does not evaluate as equal to
   itself. Use ``x is StopFeature`` or ``x is not StopFeature`` when testing for
   equality.
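
For example (a sketch, assuming a hypothetical ``some_file.gff``)::

    >>> reader = GFF3_Reader("some_file.gff", return_stopfeatures=True)
    >>> for feature in reader:
    >>>     if feature is StopFeature:
    >>>         pass  # all features collected so far can safely be assembled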
"""


class AbstractGFF_Reader(AbstractReader):
    """Abstract base class for GFF readers.

    Parses GFF streams line by line into |SegmentChain| objects

    Attributes
    ----------
    metadata : dict
        Dictionary of metadata found in file headers
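
    Examples
    --------
    Concrete subclasses such as |GFF3_Reader| populate ``metadata`` (and
    ``chromosomes``) from ``##`` header pragmas. A sketch, assuming a
    hypothetical ``some_file.gff`` whose only header pragma is
    ``##gff-version 3``::

        >>> reader = GFF3_Reader("some_file.gff")
        >>> reader.metadata
        {'gff-version': '3'}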
    """

    def __init__(self, *streams, **kwargs):
        """Create an |AbstractGFF_Reader|

        Parameters
        ----------
        *streams : one or more str or file-like
            One or more input streams or filenames pointing to GFF information

        adjust_to_0 : bool, optional
            Boolean, whether or not to convert feature coordinates
            to a 0-based system. `True` for `GTF2`_ and `GFF3`_
            files, as these are 1-indexed. (Default: `True`)

        end_included : bool, optional
            Boolean, whether the end coordinate is
            included in the feature (closed or 'end-included' intervals)
            or not (half-open intervals). (Default: `True`)

        return_stopfeatures : bool, optional
            If `True`, will return a special |SegmentChain| called
            :obj:`StopFeature` signifying that all previously emitted GFF
            entries may be assembled into complete entities. These are emitted
            when the line `'###'` is encountered in a GFF. (Default: `True`)

        is_sorted : bool, optional
            If True and `return_stopfeatures` is True, assume the GFF is sorted.
            The reader will return :obj:`StopFeature` when the chromosome name
            of a given feature differs from that of the previous feature.
            (Default: `False`)

        tabix : boolean, optional
            `streams` point to `tabix`_-compressed files or are open
            :class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)
        """
        stream = itertools.chain.from_iterable(multiopen(streams, fn=open))
        if kwargs.get("tabix", False) == True:
            stream = ("\t".join(X) for X in stream)

        self.chromosomes = {}
        self.metadata = {}
        self.line_queue = []

        self.adjust_to_0 = kwargs.get("adjust_to_0", True)
        self.end_included = kwargs.get("end_included", True)
        self.return_stopfeatures = kwargs.get("return_stopfeatures", True)
        self.is_sorted = kwargs.get("is_sorted", False)

        line = next(stream)
        while line[0:2] == "##":
            self._parse_metatokens(line[2:])
            line = next(stream)

        self.line_queue.append(line)
        self._last_chrom = None

        self.stream = itertools.chain(self.line_queue, SkipBlankReader(stream))
        super(AbstractGFF_Reader, self).__init__(self.stream)

    def _parse_metatokens(self, inp):
        """Parses metadata embedded in a GFF stream, and stores
        these in appropriate attributes.

        Parameters
        ----------
        inp : str
            line of GFF input, with the leading ``##`` stripped

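        Examples
        --------
        A sketch of how a ``##sequence-region`` pragma is recorded (assuming
        ``reader`` is an open |GFF3_Reader|; note the leading ``##`` has
        already been stripped before this method is called, and values are
        kept as strings)::

            >>> reader._parse_metatokens("sequence-region chrI 1 230218")
            >>> reader.chromosomes["chrI"]
            ('1', '230218')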
        """
        items = inp.rstrip().split()
        if len(items) > 0:
            key = items[0]
            if key == "sequence-region":
                try:
                    self.chromosomes[items[1]] = (items[2], items[3])
                except IndexError:
                    self.chromosomes[items[1]] = tuple(items[1:])

            elif key in self.metadata.keys():
                self.metadata[key] += ";" + " ".join(items[1:])

            else:
                self.metadata[key] = " ".join(items[1:])

    @abstractmethod
    def _parse_tokens(self, attr_string):
        """Placeholder function to parse column 9, which is formatted
        differently in different GFF subtypes. Implement this
        in subclasses

        Parameters
        ----------
        attr_string : str
            Ninth column of GFF

        Returns
        -------
        dict
            Dictionary of parsed tokens from ninth GFF column
        """
        pass

    def _parse_genomic_feature(self, line):
        """Parse GFF lines into |SegmentChain| objects

        Parameters
        ----------
        line : str
            Valid line of a GFF formatted file

        Returns
        -------
        |SegmentChain|
        """
        # yapf: disable
        items = line.rstrip("\n").split("\t")
        chrom        = items[0]
        source       = items[1]
        feature_type = items[2]
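        # Convert coordinates to a 0-based, half-open system. With the default
        # ``adjust_to_0=True`` and ``end_included=True``, a record spanning
        # bases 11..20 (1-based, end-included) becomes the half-open interval
        # [10, 20): start = 11 - 1, end = 20 - 1 + 1.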
        start        = int(items[3]) - int(self.adjust_to_0)
        end          = int(items[4]) - int(self.adjust_to_0) + int(self.end_included)
        score        = items[5]
        strand       = items[6]
        phase        = items[7]
        attr_string  = items[8]

        info_dict = self._parse_tokens(attr_string)
        info_dict["source"] = source
        info_dict["score"]  = score
        info_dict["phase"]  = phase
        info_dict["type"]   = feature_type
        # yapf: enable

        my_iv = GenomicSegment(chrom, start, end, strand)
        my_feature = SegmentChain(my_iv, **info_dict)

        if chrom != self._last_chrom:
            old_chrom = self._last_chrom
            self._last_chrom = chrom
            if old_chrom is not None:
                if self.is_sorted == True and self.return_stopfeatures == True:
                    self.stream = itertools.chain([line], self.stream)
                    return StopFeature

        return my_feature

    def filter(self, line):
        """Parses lines of the GFF stream into |SegmentChain|
        When metadata is found, temporarily delegates processing to
        :meth:`_parse_metatokens`, and then reads the next genomic feature

        Parameters
        ----------
        line
            Next line from GFF stream

        Returns
        -------
        |SegmentChain|
            Next feature in file
        """
        if line.startswith("###"):
            if self.return_stopfeatures == True:
                return StopFeature
            else:
                return next(self)

        elif line.startswith("##FASTA"):
            return StopFeature

        elif line.startswith("##"):
            self._parse_metatokens(line[2:])
            return next(self)

        elif line.startswith("#"):
            return next(self)

        else:
            return self._parse_genomic_feature(line)


class GFF3_Reader(AbstractGFF_Reader):
    """
    GFF3_Reader(*streams, end_included=True, return_stopfeatures=False, is_sorted=False, tabix=False)

    Read raw features in `GFF3`_ files as |SegmentChains|. Users who wish to
    reconstruct |Transcripts| from raw features should instead use
    |GFF3_TranscriptAssembler|, which performs this task automatically.

    Assumes input stream to use 1-indexed coordinates, in compliance with the
    `Sequence Ontology GFF3 specification <http://song.sourceforge.net/gff3.shtml>`_.

    `GFF3`_ attributes (from column 9) for each record are stored in its
    ``attr`` dictionary. Names and values of attributes are unescaped. The
    values for the attributes `Parent`, `Alias`, `Dbxref`, `dbxref`, and
    `Note`, if present, are lists rather than strings, because the `GFF3`_
    spec enables these to have multiple values.

    Parameters
    ----------
    *streams : one or more str or file-like
        One or more input streams or filenames pointing to GFF information

    end_included : bool, optional
        Boolean, whether the end coordinate is included in the feature (closed
        or 'end-included' intervals) or not (half-open intervals). All
        coordinates will be normalized to 0-indexed, half-open
        (Default: `True`)

    return_stopfeatures : bool, optional
        If `True`, return a special |SegmentChain| called :py:obj:`StopFeature`
        signifying that all previously emitted GFF entries may be assembled
        into complete entities. These are emitted when the line "###" is
        encountered in a `GFF3`_. (Default: `False`)

    is_sorted : bool, optional
        If `True` and `return_stopfeatures` is `True`, assume the `GFF3`_ is
        sorted. The reader will return :obj:`StopFeature` when the chromosome
        name of a given feature differs from that of the previous feature.
        (Default: `False`)

    tabix : boolean, optional
        `streams` point to `tabix`_-compressed files or are open
        :class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)

    Attributes
    ----------
    metadata : dict
        Dictionary of metadata found in file headers

    Examples
    --------
    Read raw features from a `GFF3`_ file::

        >>> feature_reader = GFF3_Reader(open("./some_file.gff"))
        >>> for feature in feature_reader:
        >>>     print(feature.get_name(), feature.attr["type"], str(feature))
        ('chrI', 'chromosome', 'chrI:0-230218(.)')
        ('TEL01L-TR', 'telomeric_repeat', 'chrI:0-62(-)')
        ('TEL01L', 'telomere', 'chrI:0-801(-)')
        ('TEL01L-XR', 'X_element_combinatorial_repeat', 'chrI:62-336(-)')
        ('YAL069W', 'gene', 'chrI:334-649(+)')
        ('TEL01L-XC', 'X_element', 'chrI:336-801(-)')
        ('TEL01L-XC_nucleotide_match', 'nucleotide_match', 'chrI:752-763(-)')
        ('TEL01L-XC_binding_site', 'binding_site', 'chrI:531-544(-)')
        ('YAL068W-A', 'gene', 'chrI:537-792(+)')
        ('ARS102', 'ARS', 'chrI:649-1791(.)')
        [rest of output omitted]
    """

    def __init__(self, *streams, **kwargs):
        """
        GFF3_Reader(*streams, end_included=True, return_stopfeatures=False, is_sorted=False, tabix=False)

        Parameters
        ----------
        *streams : one or more str or file-like
            One or more input streams or filenames pointing to GFF information

        end_included : bool, optional
            Boolean, whether the end coordinate is included in the feature
            (closed or 'end-included' intervals) or not (half-open intervals).
            (Default: `True`)

        return_stopfeatures : bool, optional
            If `True`, return a special |SegmentChain| called
            :py:obj:`StopFeature` signifying that all previously emitted GFF
            entries may be assembled into complete entities. These are emitted
            when the line "###" is encountered in a `GFF3`_. (Default: `False`)

        is_sorted : bool, optional
            If `True` and `return_stopfeatures` is `True`, assume the `GFF3`_
            is sorted. The reader will return :obj:`StopFeature` when the
            chromosome name of a given feature differs from that of the
            previous feature. (Default: `False`)

        tabix : boolean, optional
            `streams` point to `tabix`_-compressed files or are open
            :class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)
        """
        super(GFF3_Reader, self).__init__(*streams, adjust_to_0=True, **kwargs)

    def _parse_tokens(self, inp):
        """Parse column 9 of `GFF3`_ into attribute dictionary

        Parameters
        ----------
        inp : str
            Ninth column of `GFF3`_

        Returns
        -------
        dict
            Dictionary of parsed tokens from ninth `GFF3`_ column
        """
        return parse_GFF3_tokens(inp)


class GTF2_Reader(AbstractGFF_Reader):
    """
    GTF2_Reader(*streams, end_included=True, return_stopfeatures=False, is_sorted=False, tabix=False)

    Read raw features in `GTF2`_ files as |SegmentChains|. To assemble
    transcripts from raw features, use |GTF2_TranscriptAssembler|.

    Assumes input to comply with the
    `GTF2 specification <http://mblab.wustl.edu/GTF22.html>`_. Each element must:

      - use 1-indexed, fully-closed coordinates

      - have defined `gene_id` and `transcript_id` attributes

    All |SegmentChain| objects returned by the reader have 0-indexed, half-open
    coordinates in keeping with Python conventions.

    Parameters
    ----------
    *streams : one or more str or file-like
        One or more input streams or filenames pointing to GFF information

    end_included : bool, optional
        Boolean, whether the end coordinate is included in the feature (closed
        or 'end-included' intervals) or not (half-open intervals).
        (Default: `True`)

    return_stopfeatures : bool, optional
        If `True`, will return a special |SegmentChain| called
        :py:obj:`StopFeature` signifying that all previously emitted
        SegmentChains may be assembled into complete entities. These are
        emitted when the line "###" is encountered in a `GTF2`_.
        (Default: `False`)

    is_sorted : bool, optional
        If `True` and `return_stopfeatures` is `True`, assume the `GTF2`_ is
        sorted by chromosome. The reader will return :obj:`StopFeature` when
        the chromosome name of a given feature differs from that of the
        previous feature. (Default: `False`)

    tabix : boolean, optional
        `streams` point to `tabix`_-compressed files or are open
        :class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)

    Examples
    --------
    Read raw features from a `GTF2`_ file::

        >>> feature_reader = GTF2_Reader(open("some_file.gtf"))
        >>> for feature in feature_reader:
        >>>     print(feature.get_name(), feature.attr["type"], str(feature))
        ('YAL030W_mRNA', 'exon', 'chrI:87262-87387(+)')
        ('YAL030W_mRNA', 'exon', 'chrI:87500-87857(+)')
        ('YAL030W_mRNA', 'CDS', 'chrI:87285-87387(+)')
        ('YAL030W_mRNA', 'CDS', 'chrI:87500-87749(+)')
        ('YAL030W_mRNA', 'start_codon', 'chrI:87285-87288(+)')
        ('YAL030W_mRNA', 'stop_codon', 'chrI:87749-87752(+)')
        ('YBL092W_mRNA', 'exon', 'chrII:45643-45644(+)')
        ('YBL092W_mRNA', 'exon', 'chrII:45977-46440(+)')
        ('YBL092W_mRNA', 'CDS', 'chrII:45977-46367(+)')
        ('YBL092W_mRNA', 'start_codon', 'chrII:45977-45980(+)')
        [rest of output omitted]

    Attributes
    ----------
    metadata : dict
        Dictionary of metadata found in file headers
    """

    def __init__(self, *streams, **kwargs):
        """
        GTF2_Reader(*streams, end_included=True, return_stopfeatures=False, is_sorted=False, tabix=False)

        Parameters
        ----------
        *streams : one or more str or file-like
            One or more input streams or filenames pointing to GFF information

        end_included : bool, optional
            Boolean, whether the end coordinate is included in the feature
            (closed or 'end-included' intervals) or not (half-open intervals).
            (Default: `True`)

        return_stopfeatures : bool, optional
            If `True`, will return a special |SegmentChain| called
            :py:obj:`StopFeature` signifying that all previously emitted
            SegmentChains may be assembled into complete entities. These are
            emitted when the line "###" is encountered in a `GTF2`_.
            (Default: `False`)

        is_sorted : bool, optional
            If `True` and `return_stopfeatures` is `True`, assume the `GTF2`_
            is sorted by chromosome. The reader will return :obj:`StopFeature`
            when the chromosome name of a given feature differs from that of
            the previous feature. (Default: `False`)

        tabix : boolean, optional
            `streams` point to `tabix`_-compressed files or are open
            :class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)
        """
        super(GTF2_Reader, self).__init__(*streams, adjust_to_0=True, **kwargs)

    def _parse_tokens(self, inp):
        """Parse column 9 of `GTF2`_ into dictionary

        Parameters
        ----------
        inp : str
            Ninth column of `GTF2`_

        Returns
        -------
        dict
            Dictionary of parsed tokens from ninth `GTF2`_ column
        """
        return parse_GTF2_tokens(inp)


class AbstractGFF_Assembler(AssembledFeatureReader):
    """Abstract base class for readers that assemble composite features --
    e.g. |Transcript| objects -- from one or more features in one or more
    streams of `GTF2`_ or `GFF3`_ data.

    Attributes
    ----------
    stream : file-like
        Input stream, usually constructed from one or more open filehandles

    metadata : dict
        Various attributes gleaned from the stream, if any

    counter : int
        Cumulative line number counter over all streams

    printer : file-like, optional
        Logger implementing a ``write()`` method.

    rejected : list
        A list of transcript IDs that failed to assemble properly
    """

    def __init__(self, *streams, **kwargs):
        """Create an |AbstractGFF_Assembler|

        Parameters
        ----------
        *streams : one or more str or file-like
            One or more input streams or filenames pointing to GFF information

        is_sorted : bool, optional
            GFF is sorted by chromosome name, allowing some memory savings
            (Default: `False`)

        return_type : |SegmentChain| or subclass, optional
            Type of feature to return from assembled subfeatures
            (Default: |SegmentChain|)

        add_three_for_stop : bool, optional
            Some annotation files exclude the stop codon from CDS annotations.
            If set to `True`, three nucleotides will be added to the threeprime
            end of each CDS annotation. (Default: `False`)

        printer : file-like, optional
            Logger implementing a ``write()`` method. (Default: |NullWriter|)

        reader_class : class
            |GFF3_Reader| or |GTF2_Reader|

        tabix : boolean, optional
            `streams` point to `tabix`_-compressed files or are open
            :class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)

        **kwargs
            Other keyword arguments used by specific parsers
        """
        tabix = kwargs.get("tabix", False)
        end_included = kwargs.get("end_included", True)
        return_stopfeatures = kwargs.get("return_stopfeatures", True)
        is_sorted = kwargs.get("is_sorted", False)
        reader_class = kwargs.get("reader_class")

        streams = multiopen(streams, open)

        iterables = []
        for stream in streams:
            iterables.append(
                reader_class(
                    stream,
                    end_included=end_included,
                    return_stopfeatures=return_stopfeatures,
                    is_sorted=is_sorted,
                    tabix=tabix
                )
            )
            iterables.append([StopFeature])

        self.stream = itertools.chain.from_iterable(iterables)

        self.printer = kwargs.get("printer", NullWriter())
        self.return_type = kwargs.get("return_type", SegmentChain)
        self.add_three_for_stop = kwargs.get("add_three_for_stop", False)

        self.metadata = {}
        self.rejected = []
        self.counter = 0

        self._transcript_cache = iter([])
        self._feature_cache = {}

    def _finalize(self, tx):
        return tx

    @abstractmethod
    def _collect(self, feature):
        """Collect relevant features of transcripts

        Parameters
        ----------
        feature : |SegmentChain|
            Feature to collect
        """
        pass

    @abstractmethod
    def _assemble_transcripts(self):
        """Assemble |Transcript| objects from collected features

        Returns
        -------
        list
            list of transcripts
        """
        pass

    @abstractmethod
    def _reset(self):
        """Release memory and reset internal hashes"""
        pass

    def _get_transcript_batches(self):
        """Lazily assemble batches of transcripts from components in
        `self.stream`, when signals in `self.stream` indicate it is safe to
        assemble transcripts from previously collected features. This method
        is called by `self.__iter__`, which returns individual transcripts.

        Signals that indicate it is safe to assemble previously collected
        features include:

          - The special line ``###``, which indicates that the preceding
            collection of lines in a GFF3 file is safe to assemble

          - The special line ``##FASTA``, which indicates the end of the
            feature section of a GFF3 file

          - A change in source files (if multiple input files were given)

        When an assembly signal is given, transcripts are assembled and
        internal caches of collected features are reset to free memory.

        Yields
        ------
        list
            Sorted list of transcripts in current GFF block
        """
        for feature in self.stream:
            self.counter += 1

            # collect features until a stop signal, such as:
            #   - change of contig if GTF2/GFF3 is sorted
            #   - '##FASTA' block, indicating end of features
            #   - switch between source files in `self.stream`
            if feature is not StopFeature:
                self._collect(feature)

            # if stop signal is reached, clear memory, because those features
            # are no longer needed
            else:
                self.printer.write("Assembling next batch of transcripts ...")
                transcripts, rejected = self._assemble_transcripts()

                # remove previously assembled transcripts
                # these lines required to free memory in Python 2.7
                # and early versions of Python 3.x
                gc.collect()
                del gc.garbage[:]

                # sort and prep next set of transcripts
                transcripts = sorted(transcripts)
                self.rejected.extend(rejected)
                self._reset()
                yield transcripts

        # assemble any remaining features into transcripts, e.g. if no
        # stop signals present within a single file
        transcripts, rejected = self._assemble_transcripts()
        gc.collect()
        del gc.garbage[:]
        transcripts = sorted(transcripts)
        self.rejected.extend(rejected)
        self._reset()
        yield transcripts

    def __iter__(self):
        """Return next assembled transcript from GFF

        Yields
        ------
        |Transcript|
            Next complex feature in annotation (usually a transcript)
        """
        for my_batch in self._get_transcript_batches():
            for my_tx in my_batch:
                yield self._finalize(my_tx)


class GTF2_TranscriptAssembler(AbstractGFF_Assembler):
    """
    GTF2_TranscriptAssembler(*streams, is_sorted=False, return_type=SegmentChain, add_three_for_stop=False, printer=None, tabix=False)

    Assemble |Transcripts| from raw features in `GTF2`_ format.

    Exons and CDS features are grouped by shared ``transcript_id``. Attributes
    that have common values for all exons and CDS within a transcript are
    propagated up to the `attr` dict of the assembled |Transcript|. Other
    attributes from individual CDS or exon components are discarded.

    The assembler functions as an iterator. Within each chromosome,
    transcripts are returned in lexical order.

    For access to raw features, instead use |GTF2_Reader|.

    Parameters
    ----------
    *streams : one or more str or file-like
        One or more input streams or filenames pointing to `GTF2`_ data

    is_sorted : bool, optional
        `GTF2`_ is sorted by chromosome name, allowing some memory savings
        (Default: `False`)

    return_type : |SegmentChain| or subclass, optional
        Type of feature to return from assembled subfeatures
        (Default: |SegmentChain|)

    add_three_for_stop : bool, optional
        Some annotation files exclude the stop codon from CDS annotations. If
        set to `True`, three nucleotides will be added to the threeprime end
        of each CDS annotation, UNLESS the annotated transcript contains an
        explicit `stop_codon` feature. (Default: `False`)

    printer : file-like, optional
        Logger implementing a ``write()`` method. Default: |NullWriter|

    tabix : boolean, optional
        `streams` point to `tabix`_-compressed files or are open
        :class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)

    Examples
    --------
    Assemble transcripts from a `GTF2`_ file::

        >>> transcript_reader = GTF2_TranscriptAssembler(open("some_file.gtf"))
        >>> for transcript in transcript_reader:
        >>>     print(transcript.get_name(), transcript.attr["type"], str(transcript))
        ('YAL030W_mRNA', 'mRNA', 'chrI:87262-87387^87500-87857(+)')
        ('YBL092W_mRNA', 'mRNA', 'chrII:45643-45644^45977-46440(+)')
        ('YBL057C_mRNA', 'mRNA', 'chrII:112749-113427^113444-113450(-)')
        ('YBL040C_mRNA', 'mRNA', 'chrII:142033-142749^142846-142891(-)')
        ('YBL018C_mRNA', 'mRNA', 'chrII:185961-186352^186427-186504(-)')
        ('YBR012W-B', 'mRNA', 'chrII:259868-261173^261174-265140(+)')
        ('YBR044C_mRNA', 'mRNA', 'chrII:324292-324336^324340-326127(-)')
        ('YBR082C_mRNA', 'mRNA', 'chrII:406506-407027^407122-407379(-)')
        ('YBR126W-B_mRNA', 'mRNA', 'chrII:490824-491202(+)')
        ('YBR138C_mRNA', 'mRNA', 'chrII:513636-515391(-)')
        [rest of output omitted]

    Attributes
    ----------
    streams : file-like
        Input streams, usually constructed from one or more open filehandles

    metadata : dict
        Various attributes gleaned from the streams, if any

    counter : int
        Cumulative line number counter over all streams

    printer : file-like, optional
        Logger implementing a ``write()`` method.

    rejected : list
        A list of transcript IDs from transcripts that failed to assemble
        properly
    """

    # transcripts can be represented as collections of exons + CDS,
    # or CDS + UTRs, etc. We consider all UTRs and exons as exon-like,
    # and CDS, start codons, & stop codons as CDS-like
    _feature_map = {
        "exon"        : ["exon_like"],
        "5UTR"        : ["exon_like"],
        "3UTR"        : ["exon_like"],
        "CDS"         : ["CDS_like", "exon_like"],
        "start_codon" : ["CDS_like", "exon_like"],
        "stop_codon"  : ["CDS_like", "exon_like"],
    } # yapf: disable

    dtmp = {"exon_like": {}, "CDS_like": {}}

    def __init__(self, *streams, **kwargs):
        """
        GTF2_TranscriptAssembler(*streams, is_sorted=False, return_type=SegmentChain, add_three_for_stop=False, printer=None, tabix=False)

        Parameters
        ----------
        *streams : one or more str or file-like
            One or more input streams or filenames pointing to `GTF2`_ data

        is_sorted : bool, optional
            `GTF2`_ is sorted by chromosome name, allowing some memory savings
            (Default: `False`)

        return_type : |SegmentChain| or subclass, optional
            Type of feature to return from assembled subfeatures
            (Default: |SegmentChain|)

        add_three_for_stop : bool, optional
            Some annotation files exclude the stop codon from CDS annotations.
            If set to `True`, three nucleotides will be added to the
            threeprime end of each CDS annotation, UNLESS the annotated
            transcript contains an explicit `stop_codon` feature.
            (Default: `False`)

        printer : file-like, optional
            Logger implementing a ``write()`` method. Default: |NullWriter|

        tabix : boolean, optional
            `streams` point to `tabix`_-compressed files or are open
            :class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)
        """
        AbstractGFF_Assembler.__init__(self, *streams, reader_class=GTF2_Reader, **kwargs)
        self._feature_cache = {"exon_like": {}, "CDS_like": {}}

    def _collect(self, feature):
        """Collect transcript component CDS and exon objects from
        `self.streams`, and populate `self._feature_cache`

        Parameters
        ----------
        feature : |SegmentChain|
            Feature to collect
        """
        feature_classes = self._feature_map.get(feature.attr["type"], None)
        if feature_classes is not None:
            tname = feature.attr.get("transcript_id")
            for feature_class in feature_classes:
                try:
                    self._feature_cache[feature_class][tname].append(feature)
                except KeyError:
                    self._feature_cache[feature_class][tname] = [feature]
                    sys.exc_traceback = None

    def _assemble_transcripts(self):
        """Assemble |Transcript| objects from `self._feature_cache`, mapping
        transcript IDs to corresponding CDS and exon features.

        Attributes common to all CDS and exons for a given transcript (e.g.
        `gene_id` and `transcript_id`) are propagated up to the |Transcript|.
        Other component attributes are discarded.
        """
        rejected_transcripts = []
        transcripts = []
        for tname in set(self._feature_cache["exon_like"].keys()) \
                     | set(self._feature_cache["CDS_like"].keys()):
            exons = self._feature_cache["exon_like"].pop(tname, [])
            cds = self._feature_cache["CDS_like"].pop(tname, [])

            if len(exons) > 0:
                exons = sorted(exons)
                exon_segments = [X.spanning_segment for X in exons]
            elif len(cds) > 0:
                # if cds but no exons, create exons since they are implied
                exons = sorted(cds)
                exon_segments = [X.spanning_segment for X in exons]

            # propagate attributes that are the same in all exons/cds
            # to parent. This should include `gene_id` and `transcript_id`
            attr = get_identical_attributes(exons + cds)

            if len(cds) > 0:
                cds = sorted(cds)
                attr["cds_genome_end"] = cds[-1].spanning_segment.end
                attr["cds_genome_start"] = cds[0].spanning_segment.start

            try:
                my_tx = Transcript(*tuple(exon_segments), **attr)
                if self.add_three_for_stop == True:
                    # only move stop codons if no exon feature is of type "stop_codon"
                    if "stop_codon" not in set([X.attr["type"] for X in exons]):
                        my_tx = add_three_for_stop_codon(my_tx)

                transcripts.append(my_tx)
            except ValueError:
                warn(
                    "Rejecting transcript '%s' because it contains exons on "
                    "multiple chromosomes or strands." % tname, DataWarning
                )
                # transcripts with exons on two strands
                rejected_transcripts.append(tname)
            except KeyError:
                # transcripts where CDS ends outside bounds of transcript
                # there are 25 of these in flybase r5.43
                rejected_transcripts.append(tname)
                warn(
                    "Rejecting transcript '%s' because start or stop codons "
                    "are outside exon boundaries." % tname, DataWarning
                )

        transcripts.sort()
        sys.exc_traceback = None
        return transcripts, rejected_transcripts

    def _reset(self):
        """Release memory and reset internal hashes"""
        del self._feature_cache
        gc.collect()
        del gc.garbage[:]
        self._feature_cache = {"exon_like": {}, "CDS_like": {}}


class GFF3_TranscriptAssembler(AbstractGFF_Assembler):
    """
    GFF3_TranscriptAssembler(*streams, is_sorted=False, return_type=SegmentChain, add_three_for_stop=False, printer=None, tabix=False)

    Assemble |Transcripts| from raw features in `GFF3`_ format.

    Within a chromosome, transcripts are returned in lexical order.

    Features that do not constitute portions of transcripts (e.g. origins of
    replication) are ignored. For access to those, read raw features using
    |GFF3_Reader|.

    Parameters
    ----------
    streams : one or more str or file-like
        One or more input streams or filenames pointing to `GFF3`_ data

    is_sorted : bool, optional
        `GFF3`_ is sorted by chromosome name, allowing some memory savings
        (Default: `False`)

    return_type : |SegmentChain| or subclass, optional
        Type of feature to return from assembled subfeatures
        (Default: |SegmentChain|)

    add_three_for_stop : bool, optional
        Some annotation files exclude the stop codon from CDS annotations. If
        set to `True`, three nucleotides will be added to the threeprime end
        of each CDS annotation. (Default: `False`)

    transcript_types : list, optional
        List of `GFF3`_ feature types that should be considered as transcripts
        (Default: as specified in SO 2.5.3)

    exon_types : list, optional
        List of `GFF3`_ feature types that should be considered as exons or
        contributing to transcript nucleotide positions during transcript
        assembly (Default: as specified in SO 2.5.3)

    cds_types : list, optional
        List of `GFF3`_ feature types that should be considered as CDS or
        contributing to transcript coding regions during transcript assembly
        (Default: as specified in SO 2.5.3)

    printer : file-like, optional
        Logger implementing a ``write()`` method. Default: |NullWriter|

    tabix : boolean, optional
        `streams` point to `tabix`_-compressed files or are open
        :class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)

    Examples
    --------
    Assemble transcripts from a `GFF3`_ file::

        >>> transcript_reader = GFF3_TranscriptAssembler(open("some_file.gff"))
        >>> for transcript in transcript_reader:
        >>>     print(transcript.get_name(), transcript.attr["type"], str(transcript))
        ('YAL030W_mRNA', 'mRNA', 'chrI:87262-87387^87500-87857(+)')
        ('YBL092W_mRNA', 'mRNA', 'chrII:45643-45644^45977-46440(+)')
        ('YBL057C_mRNA', 'mRNA', 'chrII:112749-113427^113444-113450(-)')
        ('YBL040C_mRNA', 'mRNA', 'chrII:142033-142749^142846-142891(-)')
        ('YBL018C_mRNA', 'mRNA', 'chrII:185961-186352^186427-186504(-)')
        ('YBR012W-B', 'mRNA', 'chrII:259868-261173^261174-265140(+)')
        ('YBR044C_mRNA', 'mRNA', 'chrII:324292-324336^324340-326127(-)')
        ('YBR082C_mRNA', 'mRNA', 'chrII:406506-407027^407122-407379(-)')
        ('YBR126W-B_mRNA', 'mRNA', 'chrII:490824-491202(+)')
        ('YBR138C_mRNA', 'mRNA', 'chrII:513636-515391(-)')
        [rest of output omitted]

    Attributes
    ----------
    streams : file-like
        Input streams, usually constructed from one or more open filehandles

    metadata : dict
        Various attributes gleaned from the stream, if any

    counter : int
        Cumulative line number counter over all streams

    printer : file-like, optional
        Logger implementing a ``write()`` method.

    rejected : list
        A list of transcript IDs from transcripts that failed to assemble
        properly

    Notes
    -----
    `GFF3`_ schemas vary
        `GFF3`_ files can have many different schemas of hierarchy. We deal
        with that here by allowing users to supply `transcript_types` and
        `exon_types`, to indicate which sorts of features should be included.
        By default, we use a subset of the schema set out in `Sequence
        Ontology 2.5.3 <http://www.sequenceontology.org/resources/intro.html>`_

        Briefly:

        1. The `GFF3`_ file is combed for transcripts of the types specified
           by `transcript_types`, exons specified by `exon_types`, and CDS
           specified by types listed in `cds_types`.

        2. Exons and CDS are matched with their parent transcripts by matching
           the `Parent` attributes of CDS and exons to the `ID` of
           transcripts. Transcripts are then constructed from those intervals,
           and coding regions set accordingly.

        3. If exons and/or CDS features point to a `Parent` that is not in
           `transcript_types`, they are grouped into a new transcript, whose
           ID is set to the value of their shared `Parent`. However, this
           value for `Parent` might refer to a gene rather than a transcript;
           unfortunately this cannot be known without other information.
           Attributes that are common to all CDS and exon features are bubbled
           up to the transcript.

        4. If exons and/or CDS features have no `Parent`, but share a common
           ID, they are grouped by ID into a single transcript. Attributes
           common to all CDS and exon features are bubbled up to the
           transcript. The `Parent` attribute is left unset.

        5. If a transcript feature is annotated but has no child CDS or exons,
           the transcript is assumed to be non-coding and is assembled from
           any transcript-type features that share its `ID` attribute.

    Identity relationships between elements vary between `GFF3`_ files
        Different `GFF3`_ files specify discontiguous features differently.
        For example, in `Flybase`_, different exons of a transcript will have
        unique IDs, but will share the same `'Parent'` attribute in column 9
        of the GFF. In `Wormbase`_, however, different exons of the same
        transcript will share the same ID. Here, we first check for the
        Flybase style (by Parent), then fall back to the Wormbase style (by
        shared ID).

    Transcript assembly
        To save memory, transcripts are assembled lazily as follows:

        #. If there exist assembled transcripts in `self._transcript_cache`,
           return the next transcript. Transcripts in the cache are stored
           lexically.

        #. Otherwise, collect features from the `GFF3`_ stream until either a
           `'###'` line or EOF is encountered. Then, assemble transcripts and
           store them in `self._transcript_cache`. Delete unused features from
           memory. If the `GFF3`_ is sorted, then a change in chromosome name
           will also trigger assembly of collected features.
    """

    def __init__(self, *streams, **kwargs):
        """
        GFF3_TranscriptAssembler(*streams, is_sorted=False, return_type=SegmentChain, add_three_for_stop=False, printer=None, tabix=False)

        Parameters
        ----------
        *streams : one or more str or file-like
            One or more input streams or filenames pointing to `GFF3`_ data

        is_sorted : bool, optional
            `GFF3`_ is sorted by chromosome name, allowing some memory savings
            (Default: `False`)

        return_type : |SegmentChain| or subclass, optional
            Type of feature to return from assembled subfeatures
            (Default: |SegmentChain|)

        add_three_for_stop : bool, optional
            Some annotation files exclude the stop codon from CDS annotations.
            If set to `True`, three nucleotides will be added to the
            threeprime end of each CDS annotation. (Default: `False`)

        transcript_types : list, optional
            List of `GFF3`_ feature types that should be considered as
            transcripts (Default: as specified in SO 2.5.3)

        exon_types : list, optional
            List of `GFF3`_ feature types that should be considered as exons
            or contributing to transcript nucleotide positions during
            transcript assembly (Default: as specified in SO 2.5.3)

        cds_types : list, optional
            List of `GFF3`_ feature types that should be considered as CDS or
            contributing to transcript coding regions during transcript
            assembly (Default: as specified in SO 2.5.3)

        printer : file-like, optional
            Logger implementing a ``write()`` method. Default: |NullWriter|

        tabix : boolean, optional
            `streams` point to `tabix`_-compressed files or are open
            :class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)

        Notes
        -----
        Sequence Ontology 2.5.3
            By default, this assembler constructs transcripts following a
            subset of the `GFF3`_ schema from the `SO Consortium
            <http://www.sequenceontology.org/resources/intro.html>`_. For
            details on assembly see the
            :class:`class docstring <GFF3_TranscriptAssembler>`, above.
        """
        AbstractGFF_Assembler.__init__(self, *streams, reader_class=GFF3_Reader, **kwargs)

        self.transcript_types = set(kwargs.get("transcript_types", _DEFAULT_GFF3_TRANSCRIPT_TYPES))
        self.exon_types       = set(kwargs.get("exon_types",       _DEFAULT_GFF3_EXON_TYPES))
        self.cds_types        = set(kwargs.get("cds_types",        _DEFAULT_GFF3_CDS_TYPES)) # yapf: disable

        self.transcript_components = self.exon_types | self.cds_types

        self._feature_cache = {}
        self._tx_features = {}
        self._reset()

    def _collect(self, feature):
        """Collect CDS and exon components of transcripts from `self.streams`,
        and populate `self._feature_cache`

        Parameters
        ----------
        feature : |SegmentChain|
            Feature to collect
        """
        feature_name = feature.get_name()
        if feature.attr["type"] in self.transcript_types:
            try:
                self._tx_features[feature_name].append(feature)
            except KeyError:
                self._tx_features[feature_name] = [feature]
                sys.exc_traceback = None

        elif feature.attr["type"] in self.transcript_components:
            # assume parent is transcript ID.
            # If no Parent, assume transcript is described as exon or CDS
            # with identical ID attributes
            tnames = feature.attr.get("Parent", [feature.attr.get("ID", [])])
            if len(tnames) == 0:
                warn(
                    "Found %s at %s with no `Parent` or `ID`. Ignoring."
                    % (feature.attr["type"], str(feature.spanning_segment)), DataWarning
                )

            for tname in tnames:
                try:
                    self._feature_cache[feature.attr["type"]][tname].append(feature)
                except KeyError:
                    self._feature_cache[feature.attr["type"]][tname] = [feature]
                    sys.exc_traceback = None

    def _assemble_transcripts(self):
        """Assemble |Transcript| s from components in `self._feature_cache`

        Returns
        -------
        list
            list of transcripts
        """
        rejected = []
        transcripts = []
        tx_features_counted = []

        # names of transcripts in transcript_types
        tnames = set([])

        # find parents of exons & cds that are not present
        # in types from transcript_types
        for type_ in self.exon_types | self.cds_types:
            tnames |= set(self._feature_cache[type_].keys())

        for tname in tnames:
            tx_features_counted.append(tname)

            exons = []
            for type_ in self.exon_types:
                exons.extend(self._feature_cache[type_].get(tname, []))

            cds = []
            for cds_type in self.cds_types:
                cds.extend(self._feature_cache[cds_type].get(tname, []))

            # if transcript is represented, not just implied, use its attributes
            if tname in self._tx_features:
                # use transcript name as gene if no Parent
                gene_id = self._tx_features[tname][0].attr.get("Parent", [tname])

                # gene IDs are now returned as lists from GFF3 parser
                gene_id = ",".join(sorted(gene_id))

                # get attr from transcript object
                attr = self._tx_features[tname][0].attr
                attr["ID"] = tname
                attr["transcript_id"] = tname
                attr["gene_id"] = gene_id

            # if transcript is just implied by presence of CDS and exons
            # TODO: make sure "Parent" will carry over sensibly from lists
            else:
                attr = get_identical_attributes(exons + cds)
                attr["ID"] = tname
                attr["transcript_id"] = tname
                attr["type"] = "mRNA"

            exon_segments = [X.spanning_segment for X in exons]
            cds_segments = [X.spanning_segment for X in cds]

            if len(exon_segments) + len(cds_segments) > 0:
                # transcript with child features
                if len(cds) > 0:
                    cds = sorted(cds)
                    attr["cds_genome_start"] = cds[0].spanning_segment.start
                    attr["cds_genome_end"] = cds[-1].spanning_segment.end

                try:
                    my_tx = Transcript(*tuple(exon_segments + cds_segments), **attr)
                    if self.add_three_for_stop == True:
                        my_tx = add_three_for_stop_codon(my_tx)

                    transcripts.append(my_tx)
                except ValueError:
                    # transcripts with exons on two strands
                    warn(
                        "Rejecting transcript '%s' because it contains exons "
                        "on multiple strands." % tname, DataWarning
                    )
                    rejected.append(tname)
                except KeyError:
                    # transcripts where CDS ends outside bounds of transcript
                    # ideally this would not occur but does in rare cases
                    # e.g. there are 25 of these in flybase r5.43
                    warn(
                        "Rejecting transcript '%s' because start or stop codons "
                        "are outside exon boundaries." % tname, DataWarning
                    )
                    rejected.append(tname)
            else:
                # transcript that has multiple subfeatures with shared ID but no children
                attr = self._tx_features[tname][0].attr
                attr["ID"] = tname
                attr["transcript_id"] = tname
                segments = [X.spanning_segment for X in self._tx_features[tname]]
                my_tx = Transcript(*tuple(segments), **attr)
                transcripts.append(my_tx)

        sys.exc_traceback = None
        return transcripts, rejected

    def _reset(self):
        """Release memory and reset internal hashes"""
        del self._feature_cache
        del self._tx_features
        gc.collect()
        del gc.garbage[:]
        self._tx_features = {}
        self._feature_cache = {X: copy.deepcopy({}) for X in self.transcript_components}