#!/usr/bin/env python
"""Tools for reading, writing, analyzing, and manipulating GFF file subtypes
(e.g. `GTF2`_ and `GFF3`_).
.. contents::
:local:
Summary
-------
Because `GTF2`_/`GFF3`_ files are hierarchically structured -- i.e. a complex
feature can be assembled from several component features; each component
feature having its own record on its own line -- two interfaces for reading
`GTF2`_/`GFF3`_ files are included:
Assembly of transcripts from exon, CDS, & UTR annotations
|GTF2_TranscriptAssembler| and |GFF3_TranscriptAssembler| collect
individual exon and CDS features, and assemble these into |Transcripts|.
Features are read from `GTF2`_/`GFF3`_ files, grouped by `transcript_id`,
`Parent`, or `ID` attributes, depending on file type. Assembled
|Transcripts| are yielded only when their component features have fully
been collected.
Low-level parsing of simple features
|GTF2_Reader| and |GFF3_Reader| read raw features (such as individual
exons, stop codons, SNPs, et c) from `GTF2`_/`GFF3`_ files. Each line
is returned as a |SegmentChain|.
Module contents
---------------
.. autosummary::
GTF2_Reader
GTF2_TranscriptAssembler
GFF3_Reader
GFF3_TranscriptAssembler
Examples
--------
|GTF2_Reader| and |GFF3_Reader| return raw, unmodified features from `GTF2`_ or
`GFF3`_ files -- e.g. exons, coding regions, stop codons -- without assembling
them into transcripts::
>>> feature_reader = GTF2_Reader("some_file.gtf")
>>> for feature in reader:
>>> print(feature.get_name(),feature.attr["type"],str(feature))
('YAL030W_mRNA', 'exon', 'chrI:87262-87387(+)')
('YAL030W_mRNA', 'exon', 'chrI:87500-87857(+)')
('YAL030W_mRNA', 'CDS', 'chrI:87285-87387(+)')
('YAL030W_mRNA', 'CDS', 'chrI:87500-87749(+)')
('YAL030W_mRNA', 'start_codon', 'chrI:87285-87288(+)')
('YAL030W_mRNA', 'stop_codon', 'chrI:87749-87752(+)')
('YBL092W_mRNA', 'exon', 'chrII:45643-45644(+)')
('YBL092W_mRNA', 'exon', 'chrII:45977-46440(+)')
('YBL092W_mRNA', 'CDS', 'chrII:45977-46367(+)')
('YBL092W_mRNA', 'start_codon', 'chrII:45977-45980(+)')
[rest of output omitted]
In contrast, |GTF2_TranscriptAssembler| and |GFF3_TranscriptAssembler| reconstruct
transcripts from their components, based upon their `transcript_id`, `ID`, or
`Parent` attributes. Note how all features are of type `mRNA`, and how some
contain multiple exons (coordinates separated by `'^'`)::
>>> transcript_reader = GTF2_TranscriptAssembler("some_file.gtf")
>>> for transcript in reader:
>>> print(transcript.get_name(),transcript.attr["type"],str(transcript))
('YAL030W_mRNA', 'mRNA', 'chrI:87262-87387^87500-87857(+)')
('YBL092W_mRNA', 'mRNA', 'chrII:45643-45644^45977-46440(+)')
('YBL057C_mRNA', 'mRNA', 'chrII:112749-113427^113444-113450(-)')
('YBL040C_mRNA', 'mRNA', 'chrII:142033-142749^142846-142891(-)')
('YBL018C_mRNA', 'mRNA', 'chrII:185961-186352^186427-186504(-)')
('YBR012W-B', 'mRNA', 'chrII:259868-261173^261174-265140(+)')
('YBR044C_mRNA', 'mRNA', 'chrII:324292-324336^324340-326127(-)')
('YBR082C_mRNA', 'mRNA', 'chrII:406506-407027^407122-407379(-)')
('YBR126W-B_mRNA', 'mRNA', 'chrII:490824-491202(+)')
('YBR138C_mRNA', 'mRNA', 'chrII:513636-515391(-)')
[rest of output omitted]
See Also
--------
`GFF3 specification <http://song.sourceforge.net/gff3.shtml>`_
GFF3 specification by the Sequence Ontology consortium
`GTF2.2 specification <http://mblab.wustl.edu/GTF22.html>`_
Hosted by the Brent lab
`UCSC file format FAQ <http://genome.ucsc.edu/FAQ/FAQformat.html>`_.
GFF & GTF descriptions at UCSC
"""
__author__ = "joshua"
__date__ = "$Dec 1, 2010 11:00:55 AM$"
import itertools
import gc
import copy
import sys
from abc import abstractmethod
from plastid.util.io.filters import AbstractReader, SkipBlankReader
from plastid.util.io.openers import NullWriter, multiopen
from plastid.readers.common import get_identical_attributes, \
AssembledFeatureReader
from plastid.genomics.roitools import Transcript, SegmentChain, \
GenomicSegment, add_three_for_stop_codon
from plastid.readers.gff_tokens import parse_GFF3_tokens, parse_GTF2_tokens
from plastid.util.services.exceptions import DataWarning, warn
#===============================================================================
# INDEX: SO v2.5.3 feature types
# see: http://www.sequenceontology.org/resources/intro.html
#===============================================================================
_DEFAULT_GFF3_GENE_TYPES = {
"gene",
"candidate_gene",
"functional_candidate_gene",
"positional_candidate_gene",
"cryptic_gene",
"cryptogene",
"engineered_gene",
"engineered_foreign_gene",
"engineered_foreign_transposable_element_gene",
"engineered_fusion_gene"
"epigenetically_modified_gene",
"allelically_excluded_gene",
"gene_rearranged_at_DNA_level",
"maternally_imprinted_gene",
"paternally_imprinted_gene",
"foreign_gene",
"fusion_gene",
"gene_cassette",
"gene_with_non_canonical_start_codon",
"gene_with_start_codon_CUG",
"gene_with_polycistronic_transcript",
"gene_with_dicistronic_transcript",
"gene_with_dicistronic_mRNA",
"gene_with_dicistronic_primary_transcript",
"gene_with_trans_spliced_transcript",
"mt_gene",
"kinetoplast_gene",
"maxicircle_gene",
"minicircle_gene",
"ncRNA_gene",
"gRNA_gene",
"lincRNA_gene",
"miRNA_gene",
"piRNA_gene",
"RNase_MRP_RNA_gene",
"RNase_P_RNA_gene",
"rRNA_gene",
"scRNA_gene",
"snoRNA_gene",
"snRNA_gene",
"SRP_RNA_gene",
"telomerase_RNA_gene",
"tmRNA_gene",
"tRNA_gene",
"negatively_autoregulated_gene",
"nuclear_gene",
"nucleomorph_gene",
"plasmid_gene",
"plastid_gene",
"apicoplast_gene",
"chromoplast_gene",
"ct_gene",
"cyanelle_gene",
"leucoplast_gene",
"proplastid_gene",
"positively_autoregulated_gene",
"post_translationally_regulated_gene",
"predicted_gene",
"protein_coding_gene",
"gene_with_edited_transcript",
"gene_with_mRNA_with_frameshift",
"gene_with_polyadenylated_mRNA",
"gene_with_recoded_mRNA",
"gene_with_mRNA_recoded_by_translational_bypass",
"gene_with_stop_codon_read_through",
"gene_with_stop_codon_redefined_as_pyrrollysine",
"gene_with_stop_codon_redefined_as_selenocysteine",
"gene_with_transcript_with_translational_frameshift", "proviral_gene",
"endogenous_retroviral_gene", "non_functional_homolog_of_pseudogene",
"non_processed_pseudogene", "cassette_pseudogene", "duplicated_pseudogene",
"nuclear_mt_pseudogene", "pseudogene_by_unequal_crossing_over", "unitary_pseudogene",
"polymorphic_pseudogene", "processed_pseudogene", "transposable_element_pseudogene",
"recombinatorially_rearranged_gene", "recombinatorially_inverted_gene",
"recombinatorially_rearranged_vertebrate_immune_system_gene", "rescue_gene",
"wild_type_rescue_gene", "retrogene", "silenced_gene", "gene_silenced_by_DNA_modification",
"gene_silenced_by_DNA_methylation", "gene_silenced_by_histone_modification",
"gene_silenced_by_histone_deacetylation", "gene_silenced_by_histone_methylation",
"gene_silenced_by_RNA_interference", "transgene", "floxed_gene",
"translationally_regulated_gene", "transposable_element_gene",
"engineered_foreign_transposable_element_gene",
}
"""GFF3 gene types as annotated by `SO 2.5.3 <http://www.sequenceontology.org/resources/intro.html>`_"""
_DEFAULT_GFF3_TRANSCRIPT_TYPES = {
"transcript",
"mature_transcript",
"enzymatic_RNA",
"mRNA",
"mRNA_with_frameshift",
"mRNA_with_minus_1_frameshift",
"mRNA_with_minus_2_frameshift",
"mRNA_with_plus_1_frameshift",
"mRNA_with_plus_2_frameshift",
"polyadenylated_mRNA",
"polycistronic_mRNA",
"dicistronic_mRNA",
"monocistronic_mRNA",
"recoded_mRNA",
"mRNA_recoded_by_codon_redefinition",
"mRNA_recoded_by_translational_bypass",
"trans_spliced_mRNA",
"ncRNA",
"antisense_RNA",
"MicF_RNA",
"class_I_RNA",
"class_II_RNA",
"enhancerRNA",
"guide_RNA",
"lnc_RNA",
"antisense_lncRNA",
"intronic_lncRNA",
"lincRNA",
"piRNA",
"priRNA",
"rasiRNA",
"RNase_MRP_RNA",
"RNase_P_RNA",
"rRNA",
"large_subunit_rRNA",
"rRNA_21S",
"rRNA_23S",
"rRNA_25S",
"rRNA_28S",
"rRNA_5_8S",
"rRNA_5S",
"small_subunit_rRNA",
"rRNA_16S",
"rRNA_18S",
"rRNA_cleavage_RNA",
"scRNA",
"shRNA",
"siRNA",
"small_regulatory_ncRNA",
"CsrB_RsmB_RNA",
"DsrA_RNA",
"GcvB_RNA",
"IoR",
"miRNA",
"moR",
"OxyS_RNA",
"RNA_6S",
"RprA_RNA",
"RRE_RNA",
"spot42_RNA",
"tmRNA",
"snoRNA",
"C_D_box_snoRNA",
"methylation_guide_snoRNA",
"U14_snoRNA",
"U3_snoRNA",
"H_ACA_box_snoRNA",
"pseudouridylation_guide_snoRNA",
"snRNA",
"U11_snRNA",
"U12_snRNA",
"U1_snRNA",
"U2_snRNA",
"U4_snRNA",
"U4atac_snRNA",
"U5_snRNA",
"U6_snRNA",
"U6atac_snRNA",
"SRP_RNA",
"tasiRNA",
"telomerase_RNA",
"telomeric_transcript",
"anti_ARRET",
"ARIA",
"ARRET",
"TERRA",
"tRNA",
"alanyl_tRNA",
"arginyl_tRNA",
"asparaginyl_tRNA",
"aspartyl_tRNA",
"cysteinyl_tRNA",
"glutaminyl_tRNA",
"glutamyl_tRNA",
"glycyl_tRNA",
"histidyl_tRNA",
"isoleucyl_tRNA",
"leucyl_tRNA",
"lysyl_tRNA",
"methionyl_tRNA",
"phenylalanyl_tRNA",
"prolyl_tRNA",
"pyrrollysyl_tRNA",
"selenocysteinyl_tRNA",
"seryl_tRNA",
"threonyl_tRNA",
"tryptophanyl_tRNA",
"tyrosyl_tRNA",
"valyl_tRNA",
"vault_RNA",
"Y_RNA",
"edited_transcript",
"edited_mRNA",
"edited_transcript_by_A_to_I_substitution",
"non_functional_homolog_of_pseudogenic_transcript",
"pseudogenic_transcript",
}
"""GFF3 mature transcript types as annotated by `SO 2.5.3
<http://www.sequenceontology.org/resources/intro.html>`_"""
_DEFAULT_GFF3_EXON_TYPES = {
"exon",
"coding_exon",
"noncoding_exon",
"exon_of_single_exon_gene",
"interior_exon",
"interior_coding_exon",
"five_prime_coding_exon",
"three_prime_coding_exon"
"five_prime_noncoding_exon",
"three_prime_noncoding_exon",
"pseudogenic_exon",
}
"""GFF3 exon feature types as annotated by `SO 2.5.3
<http://www.sequenceontology.org/resources/intro.html>`_"""
_DEFAULT_GFF3_CDS_TYPES = {
"CDS",
"CDS_fragment",
"CDS_indpendently_known",
"CDS_predicted",
}
"""GFF3 CDS feature types as annotated by `SO 2.5.3
<http://www.sequenceontology.org/resources/intro.html>`_"""
#===============================================================================
# INDEX: Readers for GFF formats
#===============================================================================
StopFeature = SegmentChain(GenomicSegment("Stop", 0, 1, "."), type="StopFeature", ID="StopFeature")
"""Special |SegmentChain| emitted from GFF readers when:
- the special line ``###`` is encountered
- the special line ``###FASTA`` is encountered
- a GFF file is marked as sorted, and the contig/chromosome changes
- the source stream of features is changed
indicating that all previously returned features may be assembled into full
objects.
.. note::
Because :obj:`StopFeature` is zero-length, it does not evaluate as equal to
itself. Use ``x is StopFeature`` or ``x is not StopFeature`` it testing for
equality.
"""
class AbstractGFF_Reader(AbstractReader):
"""Abstract base class for GFF readers.
Parses GFF streams line by line into |GenomicSegment|
Attributes
----------
metadata : dict
Dictionary of metadata found in file headers
"""
def __init__(self, *streams, **kwargs):
"""Create an |AbstractGFF_Reader|
Parameters
----------
*streams : one or more str or file-like
One or more input streams or filenames pointing to GFF information
adjust_to_0 : bool, optional
Boolean, whether or not to adjust feature
indices to a 0 base. True for `GTF2`_ and `GFF3`_
files, as these are 1-indexed. (Default: `True`)
end_included : bool, optional
Boolean, whether the end coordinate is
included in the feature (closed or 'end-included' intervals)
or not (half-open intervals). (Default: `True`)
return_stopfeatures : bool, optional
If `True`, will return a special |SegmentChain| called
:obj:`StopFeature` signifying that all previously emitted GFF
entries may be assembled into complete entities. These are emitted
when the line `'###'` is encountered in a GFF. (Default: `True`)
is_sorted : bool, optional
If True and `return_stopfeatures` is True, assume the GFF is sorted.
The reader will return :obj:`StopFeature` when the chromosome name
of a given feature differs from that of the previous feature.
(Default: `False`)
tabix : boolean, optional
`streams` point to `tabix`_-compressed files or are open
:class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)
"""
stream = itertools.chain.from_iterable(multiopen(streams, fn=open))
if kwargs.get("tabix", False) == True:
stream = ("\t".join(X) for X in stream)
self.chromosomes = {}
self.metadata = {}
self.line_queue = []
self.adjust_to_0 = kwargs.get("adjust_to_0", True)
self.end_included = kwargs.get("end_included", True)
self.return_stopfeatures = kwargs.get("return_stopfeatures", True)
self.is_sorted = kwargs.get("is_sorted", False)
line = next(stream)
while line[0:2] == "##":
self._parse_metatokens(line[2:])
line = next(stream)
self.line_queue.append(line)
self._last_chrom = None
self.stream = itertools.chain(self.line_queue, SkipBlankReader(stream))
super(AbstractGFF_Reader, self).__init__(self.stream)
def _parse_metatokens(self, inp):
"""Parses metadata embedded in a GFF stream, and stores
these in appropriate attributes.
Parameters
----------
inp : str
line of GFF input
"""
items = inp.rstrip().split()
if len(items) > 0:
key = items[0]
if key == "sequence-region":
try:
self.chromosomes[items[1]] = (items[2], items[3])
except IndexError:
self.chromosomes[items[1]] = tuple(items[1:])
elif key in self.metadata.keys():
self.metadata[key] += ";" + " ".join(items[1:])
else:
self.metadata[key] = " ".join(items[1:])
@abstractmethod
def _parse_tokens(self, attr_string):
"""Placeholder function to parse column 9, which is formatted
differently in different GFF subtypes. Implement this
in subclasses
Parameters
----------
attr_string : str
Ninth column of GFF
Returns
-------
dict
Dictionary of parsed tokens from ninth GFF column
"""
pass
def _parse_genomic_feature(self, line):
"""Parse GFF lines into |SegmentChain| objects
Parameters
----------
line : str
Valid line of a GFF formatted file
Returns
-------
|SegmentChain|
"""
# yapf: disable
items = line.rstrip("\n").split("\t")
chrom = items[0]
source = items[1]
feature_type = items[2]
start = int(items[3]) - int(self.adjust_to_0)
end = int(items[4]) - int(self.adjust_to_0) + int(self.end_included)
score = items[5]
strand = items[6]
phase = items[7]
attr_string = items[8]
info_dict = self._parse_tokens(attr_string)
info_dict["source"] = source
info_dict["score"] = score
info_dict["phase"] = phase
info_dict["type"] = feature_type
# yapf: enable
my_iv = GenomicSegment(chrom, start, end, strand)
my_feature = SegmentChain(my_iv, **info_dict)
if chrom != self._last_chrom:
old_chrom = self._last_chrom
self._last_chrom = chrom
if old_chrom is not None:
if self.is_sorted == True and self.return_stopfeatures == True:
self.stream = itertools.chain([line], self.stream)
return StopFeature
return my_feature
def filter(self, line):
"""Parses lines of the GFF stream into |SegmentChain|
When metadata is found, temporarily delegates processing to
:meth:`_parse_metatokens`, and then reads the next genomic feature
Parameters
----------
line
Next line from GFF stream
Returns
-------
|SegmentChain|
Next feature in file
"""
if line.startswith("###"):
if self.return_stopfeatures == True:
return StopFeature
else:
return next(self)
elif line.startswith("##FASTA"):
return StopFeature
elif line.startswith("##"):
self._parse_metatokens(line[2:])
return next(self)
elif line.startswith("#"):
return next(self)
else:
return self._parse_genomic_feature(line)
[docs]class GFF3_Reader(AbstractGFF_Reader):
"""
GFF3_Reader(*streams, end_included=True, return_stopfeatures=False, is_sorted=False, tabix=False)
Read raw features in `GFF3`_ files as |SegmentChains|.
Users who wish to reconstruct |Transcripts| from raw features should instead
use |GFF3_TranscriptAssembler|, which performs this task automatically.
Assumes input stream to use 1-indexed coordinates, in compliance with the
`Sequence Ontology GFF3 specification <http://song.sourceforge.net/gff3.shtml>`_.
`GFF3`_ attributes (from column 9) for each record are stored in its ``attr``
dictionary. Names and values of attributes are unescaped. The values for the
attributes `Parent`, `Alias`, `Dbxref`, `dbxref`, and `Note`, if present, are
lists rather than strings, because the `GFF3`_ spec enables these to have
multiple values.
Parameters
----------
*streams : one or more str or file-like
One or more input streams or filenames pointing to GFF information
end_included : bool, optional
Boolean, whether the end coordinate is included in the feature (closed
or 'end-included' intervals) or not (half-open intervals). All
coordinates will be normalized to 0-indexed, half-open (Default: `True`)
return_stopfeatures : bool, optional
If `True`, return a special |SegmentChain| called :py:obj:`StopFeature`
signifying that all previously emitted GFF entries may be assembled
into complete entities. These are emitted when the line "###"
is encountered in a `GFF3`_. (Default: `False`)
is_sorted : bool, optional
If `True` and `return_stopfeatures` is `True`, assume the `GFF3`_ is
sorted. The reader will return :obj:`StopFeature` when the chromosome
name of a given feature differs from that of the previous feature.
(Default: `False`)
tabix : boolean, optional
`streams` point to `tabix`_-compressed files or are open
:class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)
Attributes
----------
metadata : dict
Dictionary of metadata found in file headers
Examples
--------
Read raw features from a `GFF3`_ file::
>>> feature_reader = GFF3_Reader(open("./some_file.gff"))
>>> for feature in feature_reader:
>>> print(feature.get_name(), feature.attr["type"], str(feature))
('chrI', 'chromosome', 'chrI:0-230218(.)')
('TEL01L-TR', 'telomeric_repeat', 'chrI:0-62(-)')
('TEL01L', 'telomere', 'chrI:0-801(-)')
('TEL01L-XR', 'X_element_combinatorial_repeat', 'chrI:62-336(-)')
('YAL069W', 'gene', 'chrI:334-649(+)')
('TEL01L-XC', 'X_element', 'chrI:336-801(-)')
('TEL01L-XC_nucleotide_match', 'nucleotide_match', 'chrI:752-763(-)')
('TEL01L-XC_binding_site', 'binding_site', 'chrI:531-544(-)')
('YAL068W-A', 'gene', 'chrI:537-792(+)')
('ARS102', 'ARS', 'chrI:649-1791(.)')
[rest of output omitted]
"""
def __init__(self, *streams, **kwargs):
"""
GFF3_Reader(*streams, end_included=True, return_stopfeatures=False, is_sorted=False, tabix=False)
Parameters
----------
*streams : one or more str or file-like
One or more input streams or filenames pointing to GFF information
end_included : bool, optional
Boolean, whether the end coordinate is
included in the feature (closed or 'end-included' intervals)
or not (half-open intervals). (Default: `True`)
return_stopfeatures : bool, optional
If `True`, return a special |SegmentChain| called :py:obj:`StopFeature`
signifying that all previously emitted GFF entries may be assembled
into complete entities. These are emitted when the line "###"
is encountered in a `GFF3`_. (Default: `False`)
is_sorted : bool, optional
If `True` and `return_stopfeatures` is `True`, assume the `GFF3`_ is
sorted. The reader will return :obj:`StopFeature` when the
chromosome name of a given feature differs from that of the previous
feature. (Default: `False`)
tabix : boolean, optional
`streams` point to `tabix`_-compressed files or are open
:class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)
"""
super(GFF3_Reader, self).__init__(*streams, adjust_to_0=True, **kwargs)
def _parse_tokens(self, inp):
"""Parse column 9 of `GFF3`_ into attribute dictionary
Parameters
----------
inp : str
Ninth column of `GFF3`_
Returns
-------
dict
Dictionary of parsed tokens from ninth `GFF3`_ column
"""
return parse_GFF3_tokens(inp)
[docs]class GTF2_Reader(AbstractGFF_Reader):
"""
GTF2_Reader(*streams, end_included=True, return_stopfeatures=False, is_sorted=False, tabix=False)
Read raw features in `GTF2`_ files as |SegmentChains|. To assemble transcripts
from raw features, use |GTF2_TranscriptAssembler|.
Assumes input to comply with the
`GTF2 specification <http://mblab.wustl.edu/GTF22.html>`_. Each element must:
- use 1-indexed, fully-closed coordinates
- have defined `gene_id` and `transcript_id` attributes
All |SegmentChain| objects returned by the reader have 0-indexed,
half-open coordinates in keeping with Python conventions.
Parameters
----------
*streams : one or more str or file-like
One or more input streams or filenames pointing to GFF information
end_included : bool, optional
Boolean, whether the end coordinate is included in the feature (closed
or 'end-included' intervals) or not (half-open intervals).
(Default: `True`)
return_stopfeatures : bool, optional
If `True`, will return a special |SegmentChain| called
:py:obj:`StopFeature` signifying that all previously emitted
SegmentChains may be assembled into complete entities. These are emitted
when the line "###" is encountered in a `GTF2`_. (Default: `False`)
is_sorted : bool, optional
If `True` and `return_stopfeatures` is `True`, assume the `GTF2`_ is
sorted by chromosome. The reader will return :obj:`StopFeature` when the
chromosome name of a given feature differs from that of the previous
feature. (Default: `False`)
tabix : boolean, optional
`streams` point to `tabix`_-compressed files or are open
:class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)
Examples
--------
Read raw features from a `GTF2`_ file::
>>> feature_reader = GTF2_Reader(open("some_file.gtf"))
>>> for feature in reader:
>>> print(feature.get_name(),feature.attr["type"],str(feature))
('YAL030W_mRNA', 'exon', 'chrI:87262-87387(+)')
('YAL030W_mRNA', 'exon', 'chrI:87500-87857(+)')
('YAL030W_mRNA', 'CDS', 'chrI:87285-87387(+)')
('YAL030W_mRNA', 'CDS', 'chrI:87500-87749(+)')
('YAL030W_mRNA', 'start_codon', 'chrI:87285-87288(+)')
('YAL030W_mRNA', 'stop_codon', 'chrI:87749-87752(+)')
('YBL092W_mRNA', 'exon', 'chrII:45643-45644(+)')
('YBL092W_mRNA', 'exon', 'chrII:45977-46440(+)')
('YBL092W_mRNA', 'CDS', 'chrII:45977-46367(+)')
('YBL092W_mRNA', 'start_codon', 'chrII:45977-45980(+)')
[rest of output omitted]
Attributes
----------
metadata : dict
Dictionary of metadata found in file headers
"""
def __init__(self, *streams, **kwargs):
"""
GTF2_Reader(*streams, end_included=True, return_stopfeatures=False, is_sorted=False, tabix=False)
Parameters
----------
*streams : one or more str or file-like
One or more input streams or filenames pointing to GFF information
end_included : bool, optional
Boolean, whether the end coordinate is
included in the feature (closed or 'end-included' intervals)
or not (half-open intervals). (Default: `True`)
return_stopfeatures : bool, optional
If `True`, will return a special |SegmentChain| called
:py:obj:`StopFeature` signifying that all previously emitted
SegmentChains may be assembled into complete entities. These are
emitted when the line "###" is encountered in a `GTF2`_. (Default:
`False`)
is_sorted : bool, optional
If `True` and `return_stopfeatures` is `True`, assume the `GTF2`_ is
sorted by chromosome. The reader will return :obj:`StopFeature` when
the chromosome name of a given feature differs from that of the
previous feature. (Default: `False`)
tabix : boolean, optional
`streams` point to `tabix`_-compressed files or are open
:class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)
"""
super(GTF2_Reader, self).__init__(*streams, adjust_to_0=True, **kwargs)
def _parse_tokens(self, inp):
"""Parse column 9 of `GTF2`_ into dictionary
Parameters
----------
inp : str
Ninth column of `GTF2`_
Returns
-------
dict
Dictionary of parsed tokens from ninth `GTF2`_ column
"""
return parse_GTF2_tokens(inp)
class AbstractGFF_Assembler(AssembledFeatureReader):
"""Abstract base class for readers that assemble composite features
-- e.g. |Transcript| objects -- from one or more features in one or
more streams of `GTF2`_ or `GFF3`_ data.
Attributes
----------
stream : file-like
Input stream, usually constructed from or more open filehandles
metadata : dict
Various attributes gleaned from the stream, if any
counter : int
Cumulative line number counter over all streams
printer : file-like, optional
Logger implementing a ``write()`` method.
rejected : list
A list of transcript IDs that failed to assemble properly
"""
def __init__(self, *streams, **kwargs):
"""Create a |AbstractGFF_Assembler|
Parameters
----------
*streams : one or more str or file-like
One or more input streams or filenames pointing to GFF information
is_sorted : bool, optional
GFF is sorted by chromosome name, allowing some memory savings
(Default: `False`)
return_type : |SegmentChain| or subclass, optional
Type of feature to return from assembled subfeatures (Default:
|SegmentChain|)
add_three_for_stop : bool, optional
Some annotation files exclude the stop codon from CDS annotations.
If set to `True`, three nucleotides will be added to the threeprime
end of each CDS annotation. (Default: `False`)
printer : file-like, optional
Logger implementing a ``write()`` method. (Default: |NullWriter|)
reader_class : class
|GFF3_Reader| or |GTF2_Reader|
tabix : boolean, optional
`streams` point to `tabix`_-compressed files or are open
:class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)
**kwargs
Other keyword arguments used by specific parsers
"""
tabix = kwargs.get("tabix", False)
end_included = kwargs.get("end_included", True)
return_stopfeatures = kwargs.get("return_stopfeatures", True)
is_sorted = kwargs.get("is_sorted", False)
reader_class = kwargs.get("reader_class")
streams = multiopen(streams, open)
iterables = []
for stream in streams:
iterables.append(
reader_class(
stream,
end_included=end_included,
return_stopfeatures=return_stopfeatures,
is_sorted=is_sorted,
tabix=tabix
)
)
iterables.append([StopFeature])
self.stream = itertools.chain.from_iterable(iterables)
self.printer = kwargs.get("printer", NullWriter())
self.return_type = kwargs.get("return_type", SegmentChain)
self.add_three_for_stop = kwargs.get("add_three_for_stop", False)
self.metadata = {}
self.rejected = []
self.counter = 0
self._transcript_cache = iter([])
self._feature_cache = {}
def _finalize(self, tx):
return tx
@abstractmethod
def _collect(self, feature):
"""Collect relevant features of transcripts
Parameters
----------
feature : |SegmentChain|
Feature to collect
"""
pass
@abstractmethod
def _assemble_transcripts(self):
"""Assemble |Transcript| objects from collected features
Returns
-------
list
list of transcripts
"""
pass
@abstractmethod
def _reset(self):
"""Release memory and reset internal hashes"""
pass
def _get_transcript_batches(self):
"""Lazily assemble batches of transcripts from components in
`self.stream`, when signals in `self.stream` indicate it is safe to
assemble transcripts from previously collected features. This feature is
called by `self.__iter__`, which returns individual transcripts.
Signals that can signify safety to assemble previous transcripts
include:
- The special line ``###``, which indicates that the preceding
collection of lines in a GFF3 file are safe to assemble
- The special line ``###FASTA`` which indicates end of the feature
section of a GFF3 file
- A change in reference files (if multiple input files were given)
When an assembly signal is given, transcripts are assembled an internal
caches of collected feature are reset to free memory.
Yields
------
list
Sorted list of transcripts in current GFF block
"""
for feature in self.stream:
self.counter += 1
# collect features until a stop signal, such as:
# - change of contig if GTF2/GFF3 is sorted
# - '###FASTA' block, indicating end of features
# - switch between source files in `self.stream`
if feature is not StopFeature:
self._collect(feature)
# if stop signal is reached, clear memory, because those features
# no longer needed
else:
self.printer.write("Assembling next batch of transcripts ...")
transcripts, rejected = self._assemble_transcripts()
# remove previously assembled transcripts
# these lines required to free memory in Python 2.7
# and early versions of Python 3.x
gc.collect()
del gc.garbage[:]
# sort and prep next set of transcripts
transcripts = sorted(transcripts)
self.rejected.extend(rejected)
self._reset()
yield transcripts
# assemble any remaining features into transcripts, e.g. if no
# stop signals present within a single file
transcripts, rejected = self._assemble_transcripts()
gc.collect()
del gc.garbage[:]
transcripts = sorted(transcripts)
self.rejected.extend(rejected)
self._reset()
yield transcripts
def __iter__(self):
"""Return next assembled transcript from GFF
Yields
------
|Transcript|
Next complex feature in annotation (usually a transcript)
"""
for my_batch in self._get_transcript_batches():
for my_tx in my_batch:
yield self._finalize(my_tx)
[docs]class GTF2_TranscriptAssembler(AbstractGFF_Assembler):
"""
GTF2_TranscriptAssembler(*streams, is_sorted=False, return_type=SegmentChain, add_three_for_stop=False, printer=None, tabix=False)
Assemble |Transcripts| from raw features in `GTF2`_ format.
Exons and CDS features are grouped by shared ``transcript_id``.
Attributes that have common values for all exons and CDS within a transcript
are propagated up to the `attr` dict of the assembled |Transcript|. Other
attributes from individual CDS or exon components are discarded.
The assembler functions as an iterator. Within each chromosome, transcripts
are returned in lexical order.
For access to raw features, instead use |GTF2_Reader|.
Parameters
----------
*streams : one or more str or file-like
One or more input streams or filenames pointing to `GTF2`_ data
is_sorted : bool, optional
`GTF2`_ is sorted by chromosome name, allowing some memory savings
(Default: `False`)
return_type : |SegmentChain| or subclass, optional
Type of feature to return from assembled subfeatures (Default:
|SegmentChain|)
add_three_for_stop : bool, optional
Some annotation files exclude the stop codon from CDS annotations. If
set to `True`, three nucleotides will be added to the threeprime end of
each CDS annotation, UNLESS the annotated transcript contains explicit
`stop_codon` feature. (Default: `False`)
printer : file-like, optional
Logger implementing a ``write()`` method. Default: |NullWriter|
tabix : boolean, optional
`streams` point to `tabix`_-compressed files or are open
:class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)
Examples
--------
Assemble transcripts from a `GTF2`_ file::
>>> transcript_reader = GTF2_TranscriptAssembler(open("some_file.gtf"))
>>> for transcript in reader:
>>> print(transcript.get_name(),transcript.attr["type"],str(transcript)) # do something
('YAL030W_mRNA', 'mRNA', 'chrI:87262-87387^87500-87857(+)')
('YBL092W_mRNA', 'mRNA', 'chrII:45643-45644^45977-46440(+)')
('YBL057C_mRNA', 'mRNA', 'chrII:112749-113427^113444-113450(-)')
('YBL040C_mRNA', 'mRNA', 'chrII:142033-142749^142846-142891(-)')
('YBL018C_mRNA', 'mRNA', 'chrII:185961-186352^186427-186504(-)')
('YBR012W-B', 'mRNA', 'chrII:259868-261173^261174-265140(+)')
('YBR044C_mRNA', 'mRNA', 'chrII:324292-324336^324340-326127(-)')
('YBR082C_mRNA', 'mRNA', 'chrII:406506-407027^407122-407379(-)')
('YBR126W-B_mRNA', 'mRNA', 'chrII:490824-491202(+)')
('YBR138C_mRNA', 'mRNA', 'chrII:513636-515391(-)')
[rest of output omitted]
Attributes
----------
streams : file-like
Input streams, usually constructed from one or more open filehandles
metadata : dict
Various attributes gleaned from the streams, if any
counter : int
Cumulative line number counter over all streams
printer : file-like, optional
Logger implementing a ``write()`` method.
rejected : list
A list of transcript IDs from transcripts that failed to assemble
properly
"""
# transcripts can be represented as collections of exons + cds
# or cds + UTRs, et c. We consider all UTR and exons as exons
# and CDS, and start & stop codons as CDS areas
_feature_map = {
"exon": ["exon_like"],
"5UTR": ["exon_like"],
"3UTR": ["exon_like"],
"CDS" : ["CDS_like", "exon_like"],
"start_codon": ["CDS_like", "exon_like"],
"stop_codon" : ["CDS_like", "exon_like"],
} # yapf: disable
dtmp = {"exon_like": {}, "CDS_like": {}}
def __init__(self, *streams, **kwargs):
"""
GTF2_TranscriptAssembler(*streams, is_sorted=False, return_type=SegmentChain, add_three_for_stop=False, printer=None, tabix=False)
Parameters
----------
*streams : one or more str or file-like
One or more input streams or filenames pointing to `GTF2`_ data
is_sorted : bool, optional
`GTF2`_ is sorted by chromosome name, allowing some memory savings
(Default: `False`)
return_type : |SegmentChain| or subclass, optional
Type of feature to return from assembled subfeatures (Default: |SegmentChain|)
add_three_for_stop : bool, optional
Some annotation files exclude the stop codon from CDS annotations. If set to
`True`, three nucleotides will be added to the threeprime end of each
CDS annotation, UNLESS the annotated transcript contains explicit `stop_codon`
feature. (Default: `False`)
printer : file-like, optional
Logger implementing a ``write()`` method. Default: |NullWriter|
tabix : boolean, optional
`streams` point to `tabix`_-compressed files or are open
:class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)
"""
AbstractGFF_Assembler.__init__(self, *streams, reader_class=GTF2_Reader, **kwargs)
self._feature_cache = {"exon_like": {}, "CDS_like": {}}
def _collect(self, feature):
"""Collect transcript component CDS and exons objects from
`self.streams`, and populate `self._feature_cache`
Parameters
----------
feature : |SegmentChain|
Feature to collect
"""
feature_classes = self._feature_map.get(feature.attr["type"], None)
if feature_classes is not None:
tname = feature.attr.get("transcript_id")
for feature_class in feature_classes:
try:
self._feature_cache[feature_class][tname].append(feature)
except KeyError:
self._feature_cache[feature_class][tname] = [feature]
sys.exc_traceback = None
def _assemble_transcripts(self):
"""Assemble |Transcript| objects from `self._feature_cache`,
mapping transcript IDs to corresponding CDS and exon features.
Attributes common to all CDS and exons for a given transcript (e.g.
`gene_id` and `transcript_id`) are propagated up to the |Transcript|.
Other component attributes are discarded.
"""
rejected_transcripts = []
transcripts = []
for tname in set(self._feature_cache["exon_like"].keys()) \
| set(self._feature_cache["CDS_like"].keys()):
exons = self._feature_cache["exon_like"].pop(tname, [])
cds = self._feature_cache["CDS_like"].pop(tname, [])
if len(exons) > 0:
exons = sorted(exons)
exon_segments = [X.spanning_segment for X in exons]
elif len(cds) > 0:
# if cds but no exons, create exons since they are implied
exons = sorted(cds)
exon_segments = [X.spanning_segment for X in exons]
# propagate attributes that are the same in all exons/cds
# to parent. This should include `gene_id` and `transcript_id`
attr = get_identical_attributes(exons + cds)
if len(cds) > 0:
cds = sorted(cds)
attr["cds_genome_end"] = cds[-1].spanning_segment.end
attr["cds_genome_start"] = cds[0].spanning_segment.start
try:
my_tx = Transcript(*tuple(exon_segments), **attr)
if self.add_three_for_stop == True:
# only move stop codons if no exon feature is of type "stop_codon"
if "stop_codon" not in set([X.attr["type"] for X in exons]):
my_tx = add_three_for_stop_codon(my_tx)
transcripts.append(my_tx)
except ValueError:
warn(
"Rejecting transcript '%s' because it contains exons on "
"multiple chromosomes or strands." % tname, DataWarning
)
# transcripts with exons on two strands
rejected_transcripts.append(tname)
except KeyError:
# transcripts where CDS ends outside bounds of transcript
# there are 25 of these in flybase r5.43
rejected_transcripts.append(tname)
warn(
"Rejecting transcript '%s' because start or stop codons "
"are outside exon boundaries." % tname, DataWarning
)
transcripts.sort()
sys.exc_traceback = None
return transcripts, rejected_transcripts
def _reset(self):
"""Release memory and reset internal hashes"""
del self._feature_cache
gc.collect()
del gc.garbage[:]
self._feature_cache = {"exon_like": {}, "CDS_like": {}}
[docs]class GFF3_TranscriptAssembler(AbstractGFF_Assembler):
"""
GFF3_TranscriptAssembler(*streams, is_sorted=False, return_type=SegmentChain, add_three_for_stop=False, printer=None, tabix=False)
Assemble |Transcripts| from raw features in `GFF3`_ format.
Within a chromosome, transcripts are returned in lexical order. Features
that do not constitute portions of transcripts (e.g. origins of replication)
are ignored. For access to those, read raw features using |GFF3_Reader|.
Parameters
----------
streams : one or more str or file-like
One or more input streams or filenames pointing to `GFF3`_ data
is_sorted : bool, optional
`GFF3`_ is sorted by chromosome name, allowing some memory savings
(Default: `False`)
return_type : |SegmentChain| or subclass, optional
Type of feature to return from assembled subfeatures (Default:
|SegmentChain|)
add_three_for_stop : bool, optional
Some annotation files exclude the stop codon from CDS annotations. If
set to `True`, three nucleotides will be added to the threeprime end of
each CDS annotation. (Default: `False`)
transcript_types : list, optional
List of `GFF3`_ feature types that should be considered as transcripts
(Default: as specified in SO 2.5.3 )
exon_types : list, optional
List of `GFF3`_ feature types that should be considered as exons or
contributing to transcript nucleotide positions
during transcript assembly (Default: as specified in SO 2.5.3 )
cds_types : list, optional
List of `GFF3`_ feature types that should be considered as CDS or
contributing to transcript coding regions during transcript assembly
(Default: as specified in SO 2.5.3 )
printer : file-like, optional
Logger implementing a ``write()`` method. Default: |NullWriter|
tabix : boolean, optional
`streams` point to `tabix`_-compressed files or are open
:class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)
Examples
--------
Assemble transcripts from a `GFF3`_ file::
>>> transcript_reader = GFF3_TranscriptAssembler(open("some_file.gff"))
>>> for transcript in reader:
>>> print(transcript.get_name(),transcript.attr["type"],str(transcript)) # do something
('YAL030W_mRNA', 'mRNA', 'chrI:87262-87387^87500-87857(+)')
('YBL092W_mRNA', 'mRNA', 'chrII:45643-45644^45977-46440(+)')
('YBL057C_mRNA', 'mRNA', 'chrII:112749-113427^113444-113450(-)')
('YBL040C_mRNA', 'mRNA', 'chrII:142033-142749^142846-142891(-)')
('YBL018C_mRNA', 'mRNA', 'chrII:185961-186352^186427-186504(-)')
('YBR012W-B', 'mRNA', 'chrII:259868-261173^261174-265140(+)')
('YBR044C_mRNA', 'mRNA', 'chrII:324292-324336^324340-326127(-)')
('YBR082C_mRNA', 'mRNA', 'chrII:406506-407027^407122-407379(-)')
('YBR126W-B_mRNA', 'mRNA', 'chrII:490824-491202(+)')
('YBR138C_mRNA', 'mRNA', 'chrII:513636-515391(-)')
[rest of output omitted]
Attributes
----------
streams : file-like
Input stream, usually constructed from or more open filehandles
metadata : dict
Various attributes gleaned from the stream, if any
counter : int
Cumulative line number counter over all streams
printer : file-like, optional
Logger implementing a ``write()`` method.
rejected : list
A list of transcript IDs from transcripts that failed to assemble
properly
Notes
-----
`GFF3`_ schemas vary
`GFF3`_ files can have many different schemas of hierarchy. We deal with
that here by allowing users to supply `transcript_types` and
`exon_types`, to indicate which sorts of features should be included. By
default, we use a subset of the schema set out in `Seqence Ontology 2.5.3
<http://www.sequenceontology.org/resources/intro.html>`_
Briefly:
1. The GFF3 file is combed for transcripts of the types specified by
`transcript_types`, exons specified by `exon_types`, and CDS specified
by types listed in `cds_types`.
2. Exons and CDS are matched with their parent transcripts by matching
the `Parent` attributes of CDS and exons to the `ID` of transcripts.
Transcripts are then constructed from those intervals, and coding
regions set accordingly.
3. If exons and/or CDS features point to a `Parent` that is not
in `transcript_types`, they are grouped into a new transcript,
whose ID is set to the value of their shared `Parent`. However,
this value for `Parent` might refer to a gene rather than
a transcript; unfortunately this cannot be known without other
information. Attributes that are common to all CDS and exon
features are bubbled up to the transcript.
4. If exons and/or CDS features have no `Parent`, but share a common ID,
they are grouped by ID into a single transcript. Attributes common
to all CDS and exon features are bubbled up to the transcript.
The `Parent` attribute is left unset.
5. If a transcript feature is annotated but has no child CDS or exons,
the transcript is assumed to be non-coding and is assembled from
any transcript-type features that share its `ID` attribute.
Identity relationships between elements vary between `GFF3`_ files
Different `GFF3`_ files specify discontiguous features differently. For
example, in `Flybase`_, different exons of a transcript will have unique
IDs, but will share the same `'Parent'` attribute in column 9 of the GFF.
In `Wormbase`_, however, different exons of the same transcript will
share the same ID. Here, we first check for the Flybase style (by
Parent), then fall back to Wormbase style (by shared ID).
Transcript assembly
To save memory, transcripts are assembled lazily as follows:
#. If there exist assembled transcripts in `self._transript_cache`,
return the next transcript. Transcripts in the cache are stored
lexically.
#. Otherwise, collect features from the `GFF3`_ stream until either a
`'###'` line or EOF is encountered. Then, assemble transcripts and
store them in `self._transcript_cache`. Delete unused features
from memory. If the `GFF3`_ is sorted, then a change in chromosome
name will also trigger assembly of collected features.
"""
def __init__(self, *streams, **kwargs):
"""
GFF3_TranscriptAssembler(*streams, is_sorted=False, return_type=SegmentChain, add_three_for_stop=False, printer=None, tabix=False)
Parameters
----------
*streams : one or more str or file-like
One or more input streams or filenames pointing to `GFF3`_ data
is_sorted : bool, optional
`GFF3`_ is sorted by chromosome name, allowing some memory savings
(Default: `False`)
return_type : |SegmentChain| or subclass, optional
Type of feature to return from assembled subfeatures (Default:
|SegmentChain|)
add_three_for_stop : bool, optional
Some annotation files exclude the stop codon from CDS annotations.
If set to `True`, three nucleotides will be added to the threeprime
end of each CDS annotation. (Default: `False`)
transcript_types : list, optional
List of `GFF3`_ feature types that should be considered as
transcripts (Default: as specified in SO 2.5.3 )
exon_types : list, optional
List of `GFF3`_ feature types that should be considered as exons or
contributing to transcript nucleotide positions
during transcript assembly (Default: as specified in SO 2.5.3 )
cds_types : list, optional
List of `GFF3`_ feature types that should be considered as CDS or
contributing to transcript coding regions during transcript assembly
(Default: as specified in SO 2.5.3 )
printer : file-like, optional
Logger implementing a ``write()`` method. Default: |NullWriter|
tabix : boolean, optional
`streams` point to `tabix`_-compressed files or are open
:class:`~pysam.ctabix.tabix_file_iterator` (Default: `False`)
Notes
-----
Sequence Ontology 2.5.3
By default, this assembler constructs transcripts following a subset
of the `GFF3`_ schema from the `SO Consortium
<http://www.sequenceontology.org/resources/intro.html>`_. For
details on assembly see the :class:`class docstring
<GFF3_TranscriptAssembler>`, above.
"""
AbstractGFF_Assembler.__init__(self, *streams, reader_class=GFF3_Reader, **kwargs)
self.transcript_types = set(kwargs.get("transcript_types", _DEFAULT_GFF3_TRANSCRIPT_TYPES))
self.exon_types = set(kwargs.get("exon_types", _DEFAULT_GFF3_EXON_TYPES))
self.cds_types = set(kwargs.get("cds_types", _DEFAULT_GFF3_CDS_TYPES)) # yapf: disable
self.transcript_components = self.exon_types | self.cds_types
self._feature_cache = {}
self._tx_features = {}
self._reset()
def _collect(self, feature):
"""Collect CDS and exon components of transcripts from `self.streams`,
and populate `self._feature_cache`
Parameters
----------
feature : |SegmentChain|
Feature to collect
"""
feature_name = feature.get_name()
if feature.attr["type"] in self.transcript_types:
try:
self._tx_features[feature_name].append(feature)
except KeyError:
self._tx_features[feature_name] = [feature]
sys.exc_traceback = None
elif feature.attr["type"] in self.transcript_components:
# assume parent is transcript ID.
# If no Parent, assume transcript is described as exon or CDS
# with identical ID attributes
tnames = feature.attr.get("Parent", [feature.attr.get("ID", [])])
if len(tnames) == 0:
warn(
"Found %s at %s with no `Parent` or `ID`. Ignoring." %
(feature.attr["type"], str(feature.spanning_segment)), DataWarning
)
for tname in tnames:
try:
self._feature_cache[feature.attr["type"]][tname].append(feature)
except KeyError:
self._feature_cache[feature.attr["type"]][tname] = [feature]
sys.exc_traceback = None
def _assemble_transcripts(self):
"""Assemble |Transcript| s from components in `self._feature_cache`
Returns
-------
list
list of transcripts
"""
rejected = []
transcripts = []
tx_features_counted = []
# names of transcripts in transcript_types
tnames = set([])
# find parents of exons & cds that are not present
# in types from transcript_types
for type_ in self.exon_types | self.cds_types:
tnames |= set(self._feature_cache[type_].keys())
for tname in tnames:
tx_features_counted.append(tname)
exons = []
for type_ in self.exon_types:
exons.extend(self._feature_cache[type_].get(tname, []))
cds = []
for cds_type in self.cds_types:
cds.extend(self._feature_cache[cds_type].get(tname, []))
# if transcript is represented, not just implied, use its attributes
if tname in self._tx_features:
# use transcript name as gene if no Parent
gene_id = self._tx_features[tname][0].attr.get("Parent", [tname])
# gene IDs are now returned as lists from GFF3 parser
gene_id = ",".join(sorted(gene_id))
# get attr from transcript object
attr = self._tx_features[tname][0].attr
attr["ID"] = tname
attr["transcript_id"] = tname
attr["gene_id"] = gene_id
# if transcript is just implied by presence of CDS and exons
# TODO: make sure "Parent" will carry over sensibly from lists
else:
attr = get_identical_attributes(exons + cds)
attr["ID"] = tname
attr["transcript_id"] = tname
attr["type"] = "mRNA"
exon_segments = [X.spanning_segment for X in exons]
cds_segments = [X.spanning_segment for X in cds]
if len(exon_segments) + len(cds_segments) > 0:
# transcript with child features
if len(cds) > 0:
cds = sorted(cds)
attr["cds_genome_start"] = cds[0].spanning_segment.start
attr["cds_genome_end"] = cds[-1].spanning_segment.end
try:
my_tx = Transcript(*tuple(exon_segments + cds_segments), **attr)
if self.add_three_for_stop == True:
my_tx = add_three_for_stop_codon(my_tx)
transcripts.append(my_tx)
except ValueError:
warn(
"Rejecting transcript '%s' because it contains exons "
" on multiple strands." % tname, DataWarning
)
# transcripts with exons on two strands
rejected.append(tname)
except KeyError:
# transcripts where CDS ends outside bounds of transcript
# ideally this would not occur but does in rare cases
# e.g. there are 25 of these in flybase r5.43
warn(
"Rejecting transcript '%s because start or stop codons "
"are outside exon boundaries." % tname, DataWarning
)
rejected.append(tname)
else:
# transcript that has multiple subfeatures with shared ID but no children
attr = self._tx_features[tname][0].attr
attr["ID"] = tname
attr["transcript_id"] = tname
segments = [X.spanning_segment for X in self._tx_features[tname]]
my_tx = Transcript(*tuple(segments), **attr)
transcripts.append(my_tx)
sys.exc_traceback = None
return transcripts, rejected
def _reset(self):
"""Release memory and reset internal hashes"""
del self._feature_cache
del self._tx_features
gc.collect()
del gc.garbage[:]
self._tx_features = {}
self._feature_cache = {X: copy.deepcopy({}) for X in self.transcript_components}