Source code for plastid.readers.gff_tokens

#!/usr/bin/env python
"""This module contains functions for escaping, unescaping, and parsing 
tokens from the ninth column of `GTF2`_ and `GFF3`_ files.

Important methods
-----------------
:py:func:`make_GTF2_tokens`
    Format a dictionary of attributes as `GTF2`_ column 9 attributes

:py:func:`make_GFF3_tokens`
    Format a dictionary of attributes as `GFF3`_ column 9 attributes

:py:func:`parse_GTF2_tokens`
    Parse `GTF2`_ column 9 tokens into a dictionary of key-value pairs

:py:func:`parse_GFF3_tokens`
    Parse `GFF3`_ column 9 tokens into a dictionary of key-value pairs

See also
--------

  - `The Sequence Ontology GFF3 specification <http://www.sequenceontology.org/gff3.shtml>`_
  - `The Brent lab GTF2.2 specification <http://mblab.wustl.edu/GTF22.html>`_
"""

# Unit tests for these are in :py:mod:`plastid.test.unit.genomics.readers.test_gff`

import re
import shlex
import copy
from plastid.util.services.exceptions import FileFormatWarning, warn

# Pattern for a single space-delimited `key value` token: optional leading
# spaces, group 1 = the run of non-space characters (the key), one or more
# separating spaces, group 2 = everything that follows. Because `(.*)` is
# greedy, any trailing spaces end up inside group 2.
# NOTE(review): not referenced by any function in the visible source.
gtfpat = re.compile(r"^ *([^ ]*) +(.*) *$")

# From the spec: http://www.sequenceontology.org/gff3.shtml
# In addition to Parent, the Alias, Note, Dbxref and Ontology_term attributes can have multiple values.
# Also, SGD uses 'dbxref' instead of 'Dbxref'
# These are the attribute names whose values `parse_GFF3_tokens` returns as
# lists when the caller does not supply its own `list_types`.
_GFF3_DEFAULT_LISTS = ("Parent", "Alias", "Note", "Dbxref", "Ontology_term", "dbxref")

#===============================================================================
# INDEX: helper functions for escaping
#===============================================================================

# The percent sign must come FIRST: every escape code itself begins with a
# percent sign, so escaping it later would re-escape the codes produced for
# the other characters.
_GFF3_escape_sequences = (
    [
        ('%', '%25'),  # percent signs MUST be escaped FIRST
        (';', '%3B'),
        (',', '%2C'),
        ('=', '%3D'),
        ('&', '%26'),
    ]
    # C0 control characters 0x00-0x1F (includes tab, newline, carriage return)
    + [(chr(_code), "%%%02X" % _code) for _code in range(0x00, 0x20)]
    # DEL (0x7F) followed by the C1 control characters 0x80-0x9F
    + [(chr(_code), "%%%02X" % _code) for _code in range(0x7F, 0xA0)]
)
"""List mapping characters to their escape sequences, per the `GFF3`_ specification"""

# Build a new list so that the GFF3 table is left untouched; the tuples
# themselves are immutable and can be shared safely.
_GTF2_escape_sequences = _GFF3_escape_sequences + [("\"", "%22")]
"""List mapping characters to their escape sequences for `GTF2`_. These are undefined,
but we are using `GFF3`_ characters plus double quotation marks as a convention.
"""


def escape(inp, char_pairs):
    """Escape reserved characters specified in the list of tuples `char_pairs`

    Replacements are applied one after another, in the order in which the
    pairs are listed. If the escape sequences share a prefix character
    (e.g. the percent sign), the pair escaping that character must therefore
    be listed first, or the sequences generated for other characters would
    themselves be escaped.

    Parameters
    ----------
    inp : str
        Input string

    char_pairs : list
        List of tuples of (character, escape sequence for character)


    Returns
    -------
    str
        Escaped output


    See also
    --------
    unescape
    """
    for char_, repl in char_pairs:
        inp = inp.replace(char_, repl)

    return inp


def unescape(inp, char_pairs):
    """Unescape reserved characters specified in the list of tuples `char_pairs`

    The pairs are processed in REVERSE order, mirroring :func:`escape`:
    whichever pair was escaped first (by convention the percent sign) is
    restored last, so that a literal escaped percent sign is not mistaken
    for the start of another escape sequence.

    Parameters
    ----------
    inp : str
        Input string

    char_pairs : list
        List of tuples of (character, escape sequence for character), in the
        same order as was used for escaping. Must support `reversed()`.


    Returns
    -------
    str
        Unescaped output


    See also
    --------
    escape
    """
    for char_, repl in reversed(char_pairs):
        # swap each escape sequence back to the character it stands for
        inp = inp.replace(repl, char_)

    return inp


def escape_GFF3(inp):
    """Escape reserved characters in `GFF3`_ tokens using percentage notation.

    The characters escaped are those listed in the module-level table
    `_GFF3_escape_sequences`:

        - control characters (ASCII 0-31, 127, and 128-159), which
          include tab, newline, & carriage return

        - semicolons & commas

        - the percent sign

        - the equals sign

        - the ampersand

    Parameters
    ----------
    inp : str
        Input string


    Returns
    -------
    str
        Escaped output


    See also
    --------
    unescape_GFF3
    """
    return escape(inp, _GFF3_escape_sequences)


def unescape_GFF3(inp):
    """Unescape reserved characters in `GFF3`_ tokens using percentage notation.

    The escape sequences recognized are those listed in the module-level
    table `_GFF3_escape_sequences`, which covers:

        - control characters (ASCII 0-31, 127, and 128-159), which
          include tab, newline, & carriage return

        - semicolons & commas

        - the percent sign

        - the equals sign

        - the ampersand

    Escape sequences are matched exactly as written in that table
    (upper-case hexadecimal digits).

    Parameters
    ----------
    inp : str
        Input string


    Returns
    -------
    str
        Unescaped output


    See also
    --------
    escape_GFF3
    """
    return unescape(inp, _GFF3_escape_sequences)


def escape_GTF2(inp):
    """Escape reserved characters in `GTF2`_ tokens using percentage notation.
    While the `GTF2`_ spec is agnostic for escaping, it is useful when adding
    extra attributes to files. As a convention, we escape the characters
    specified in the `GFF3`_ spec, as well as double quotation marks.

    The characters escaped are those listed in the module-level table
    `_GTF2_escape_sequences`:

        - control characters (ASCII 0-31, 127, and 128-159), which
          include tab, newline, & carriage return

        - semicolons & commas

        - the percent sign

        - the equals sign

        - the ampersand

        - the double quotation mark

    Parameters
    ----------
    inp : str
        Input string


    Returns
    -------
    str
        Escaped output


    See also
    --------
    unescape_GTF2
    """
    return escape(inp, _GTF2_escape_sequences)


def unescape_GTF2(inp):
    """Unescape reserved characters in `GTF2`_ tokens using percentage notation.
    While the `GTF2`_ spec is agnostic for escaping, it is useful when adding
    extra attributes to files. As a convention, we escape the characters
    specified in the `GFF3`_ spec, as well as double quotation marks.

    The escape sequences recognized are those listed in the module-level
    table `_GTF2_escape_sequences`, which covers:

        - control characters (ASCII 0-31, 127, and 128-159), which
          include tab, newline, & carriage return

        - semicolons & commas

        - the percent sign

        - the equals sign

        - the ampersand

        - the double quotation mark

    Parameters
    ----------
    inp : str
        Input string


    Returns
    -------
    str
        Unescaped output


    See also
    --------
    escape_GTF2
    """
    return unescape(inp, _GTF2_escape_sequences)


#===============================================================================
# INDEX: attribute token formatting and parsing
#===============================================================================


def _make_generic_tokens(attr, excludes=None, join_pat='%s %s; ', escape=None):
    """Helper function to convert the `attr` dict of a |SegmentChain|
    into the string representation used in GFF files. This includes
    optional escaping of keys and values, and catenating lists with `','`
    before string conversion

    Parameters
    ----------
    attr : dict
        Dictionary of key-value pairs to export

    excludes : list<str>, optional
        List of keys to exclude from string. Not modified.
        (Default: `None`, exclude nothing)

    join_pat : str, optional
        printf-style pattern with two `%s` placeholders, explaining how to
        format each key:value pair. The formatted pairs are concatenated
        without any further separator. (Default: `'%s %s; '`)

    escape : None or func, optional
        If None, no special characters are escaped. If a function, each key
        and value is converted to `str` and passed through that function.
        (Default: `None`)

    Returns
    -------
    str
        Formatted pairs, in the iteration order of `attr`
    """
    excluded = () if excludes is None else excludes

    def esc(value):
        # keys and scalar values pass through untouched unless escaping
        # was requested; `join_pat` stringifies them afterwards
        return value if escape is None else escape(str(value))

    pieces = []
    for key, val in attr.items():
        if key in excluded:
            continue

        if isinstance(val, list):
            # Format every element with '%s' -- exactly as a scalar would be
            # by `join_pat` -- so that lists of non-string values (e.g.
            # ints) can be joined even when no escape function is given.
            val = ",".join(["%s" % (esc(item),) for item in val])
        else:
            val = esc(val)

        pieces.append(join_pat % (esc(key), val))

    return ''.join(pieces)


def make_GFF3_tokens(attr, excludes=None, escape=True):
    """Helper function to convert the `attr` dict of a |SegmentChain|
    into the string representation used in `GFF3`_ files. This includes
    URL escaping of special characters, and catenating lists with '`,`'
    before string conversion

    Each pair is written as `key=value;`, in the iteration order of `attr`,
    so the result ends with a trailing semicolon.

    Examples
    --------
        >>> make_GFF3_tokens({'text': "something; with escape sequences"})
        'text=something%3B with escape sequences;'

        >>> make_GFF3_tokens({'a': 1, 'b': 2}, excludes=['a'])
        'b=2;'

        >>> make_GFF3_tokens({'c': [3, 7]})
        'c=3,7;'


    Parameters
    ----------
    attr : dict
        Dictionary of key-value pairs to export

    excludes : list, optional
        List of keys to exclude from string

    escape : bool, optional
        If true, special characters in output are `GFF3`_-escaped (Default: `True`)

    Returns
    -------
    str
        Data formatted for *attributes* column of `GFF3`_ (column 9)
    """
    escape_func = escape_GFF3 if escape else None
    excludes = [] if excludes is None else excludes

    return _make_generic_tokens(attr, excludes=excludes, join_pat="%s=%s;", escape=escape_func)


def make_GTF2_tokens(attr, excludes=None, escape=True):
    """Helper function to convert the `attr` dict  of a |SegmentChain|
    into the string representation used in `GTF2`_ files. By default, special
    characters defined in the `GFF3`_ spec will be URL-escaped.

    `gene_id` and `transcript_id` are always written first, in that order;
    if either is missing from `attr`, its value is written as `None`. The
    remaining attributes follow in the iteration order of `attr`.

    Examples
    --------
        >>> d = {'transcript_id' : 't;id', 'gene_id' : 'gid', 'a' : 1, 'b' : 2}
        >>> make_GTF2_tokens(d)
        'gene_id "gid"; transcript_id "t%3Bid"; a "1"; b "2";'

        >>> make_GTF2_tokens(d, excludes=['a'])
        'gene_id "gid"; transcript_id "t%3Bid"; b "2";'


    Parameters
    ----------
    attr : dict
        Dictionary of key-value pairs to export

    excludes : list, optional
        List of keys to exclude from string. Not modified.

    escape : bool, optional
        If true, special characters in output are `GTF2`_-escaped (Default: `True`)

    Returns
    -------
    str
        Data formatted for *attributes* column of `GTF2`_ (column 9)
    """
    # Work on a copy: extending the caller's list in place would leak the
    # two mandatory keys into it (and grow it on every call).
    excluded = [] if excludes is None else list(excludes)
    excluded.extend(["transcript_id", "gene_id"])

    escape_func = escape_GTF2 if escape else None

    def fmt_id(value):
        # The mandatory IDs go through the same escaping as every other
        # value, so that `parse_GTF2_tokens` (which unescapes every value)
        # round-trips them.
        return value if escape_func is None else escape_func(str(value))

    stmp = 'gene_id "%s"; transcript_id "%s"; ' % (
        fmt_id(attr.get("gene_id")),
        fmt_id(attr.get("transcript_id")),
    )

    # NOTE: only the optional-attribute part is stripped, so when there are
    # no further attributes the result keeps the trailing space of `stmp`.
    return stmp + _make_generic_tokens(
        attr, excludes=excluded, join_pat='%s "%s"; ', escape=escape_func
    ).strip(" ")


def parse_GFF3_tokens(inp, list_types=None):
    """Helper function to parse tokens in the final column of a `GFF3`_ file
    into a dictionary of attributes. Because the following attributes are
    permitted to have multiple values in the `GFF3`_ spec, their values, if
    present, are returned as lists in the dictionary rather than strings:

        - `Parent`
        - `Alias`
        - `Note`
        - `Dbxref`
        - `Ontology_term`

    All keys and values are unescaped following the `GFF3`_ specification.

    If a key occurs more than once, a `FileFormatWarning` is issued and the
    values are merged: lists are concatenated, strings are joined by a comma.

    Examples
    --------
        >>> tokens = 'a=1;c=3;b=2;Parent=gene01'
        >>> parse_GFF3_tokens(tokens)
        {'a': '1', 'c': '3', 'b': '2', 'Parent': ['gene01']}

        >>> tokens = 'a=1;c=3,7;Parent=gene01,gene02'
        >>> parse_GFF3_tokens(tokens)
        {'a': '1', 'c': '3,7', 'Parent': ['gene01', 'gene02']}


    Parameters
    ----------
    inp : str
        Ninth column of `GFF3`_ entry

    list_types : list, optional
        Names of attributes that should be returned as lists
        (Default: the module-level `_GFF3_DEFAULT_LISTS`, i.e.
        Parent, Alias, Note, Dbxref, Ontology_term, dbxref)

    Returns
    -------
    dict : key-value pairs

    Raises
    ------
    ValueError
        If a non-blank token contains no `=` sign
    """
    if list_types is None:
        list_types = _GFF3_DEFAULT_LISTS

    d = {}
    for item in inp.strip("\n").strip(";").split(";"):
        # skip empty tokens, e.g. from ';;' or the blank after a final '; '
        if not item.strip(" "):
            continue

        # Split on the FIRST '=' only: an unescaped '=' inside the value
        # must not break parsing. A token without any '=' still raises
        # ValueError on unpacking.
        key, val = item.split("=", 1)
        key = unescape_GFF3(key.strip(" "))
        is_list = key in list_types
        if is_list:
            # split before unescaping, so escaped commas stay inside items
            val = [unescape_GFF3(X) for X in val.strip(" ").split(",")]
        else:
            val = unescape_GFF3(val.strip(" "))

        if key in d:
            warn(
                "Found duplicate attribute key '%s' in GFF3 line. Catenating value with previous value for key in attr dict:\n    %s"
                % (key, inp), FileFormatWarning
            )
            if is_list:
                # keep list-typed attributes as one flat list
                val = d[key] + val
            else:
                val = "%s,%s" % (d[key], val)
        d[key] = val
    return d


def parse_GTF2_tokens(inp):
    """Helper function to parse tokens in the final column of a `GTF2`_ file
    into a dictionary of attributes. All attributes are returned as strings,
    and are unescaped if GFF escape sequences (e.g. *'%2B'*) are present.

    If duplicate keys are present (e.g. as in GENCODE `GTF2`_ files),
    a `FileFormatWarning` is issued and their values are catenated,
    separated by a comma.

    Examples
    --------
        >>> tokens = 'gene_id "mygene"; transcript_id "mytranscript";'
        >>> parse_GTF2_tokens(tokens)
        {'gene_id' : 'mygene', 'transcript_id' : 'mytranscript'}

        >>> tokens = 'gene_id "mygene"; transcript_id "mytranscript"'
        >>> parse_GTF2_tokens(tokens)
        {'gene_id' : 'mygene', 'transcript_id' : 'mytranscript'}

        >>> tokens = 'gene_id "mygene;"; transcript_id "myt;ranscript"'
        >>> parse_GTF2_tokens(tokens)
        {'gene_id' : 'mygene;', 'transcript_id' : 'myt;ranscript'}

        >>> tokens = 'gene_id "mygene"; transcript_id "mytranscript"; tag "tag value";'
        >>> parse_GTF2_tokens(tokens)
        {'gene_id' : 'mygene', 'tag' : 'tag value', 'transcript_id' : 'mytranscript'}

        >>> tokens = 'gene_id "mygene"; transcript_id "mytranscript"; tag "tag value"; tag "tag value 2";'
        >>> parse_GTF2_tokens(tokens)
        {'gene_id' : 'mygene', 'tag' : 'tag value,tag value 2', 'transcript_id' : 'mytranscript'}



    Parameters
    ----------
    inp : str
        Ninth column of `GTF2`_ entry

    Returns
    -------
    dict : key-value pairs

    Raises
    ------
    AssertionError
        If the tokens do not pair up as key/value, or if a value other than
        the final one is not terminated by a semicolon

    ValueError
        Raised by `shlex.split` if `inp` cannot be tokenized
        (e.g. unbalanced quotation marks)
    """
    d = {}
    # shlex honors the double quotes, so quoted values may contain spaces
    # and semicolons; the quotes themselves are removed
    items = shlex.split(inp.strip("\n"))

    # Explicit raises instead of `assert` statements, so that validation is
    # not stripped when Python runs with -O. The exception type is unchanged.
    if len(items) % 2 != 0:
        raise AssertionError(
            "GTF2 attributes do not form key/value pairs:\n    %s" % inp
        )

    keys = items[0::2]
    vals = items[1::2]
    final = len(vals) - 1

    for n, (key, val) in enumerate(zip(keys, vals)):
        key = unescape_GTF2(key)

        # require separation by semicolons for all but final token
        if n < final and not val.endswith(";"):
            raise AssertionError(
                "GTF2 attribute '%s' is not terminated by a semicolon:\n    %s" % (key, inp)
            )

        # drop the separator only; a semicolon inside the quotes is kept
        if val.endswith(";"):
            val = val[:-1]

        if key in d:
            warn(
                "Found duplicate attribute key '%s' in GTF2 line. Catenating value with previous value for key in attr dict:\n    %s"
                % (key, inp), FileFormatWarning
            )
            d[key] = "%s,%s" % (d[key], unescape_GTF2(val))

        else:
            d[key] = unescape_GTF2(val)

    return d