#!/usr/bin/env python
"""This module contains functions for escaping, unescaping, and parsing
tokens from the ninth column of `GTF2`_ and `GFF3`_ files.
Important methods
-----------------
:py:func:`make_GTF2_tokens`
Format a dictionary of attributes as `GTF2`_ column 9 attributes
:py:func:`make_GFF3_tokens`
Format a dictionary of attributes as `GFF3`_ column 9 attributes
:py:func:`parse_GTF2_tokens`
Parse `GTF2`_ column 9 tokens into a dictionary of key-value pairs
:py:func:`parse_GFF3_tokens`
Parse `GFF3`_ column 9 tokens into a dictionary of key-value pairs
See also
--------
- `The Sequence Ontology GFF3 specification <http://www.sequenceontology.org/gff3.shtml>`_
- `The Brent lab GTF2.2 specification <http://mblab.wustl.edu/GTF22.html>`_
"""
# Unit tests for these are in :py:mod:`plastid.test.unit.genomics.readers.test_gff`
import re
import shlex
import copy
from plastid.util.services.exceptions import FileFormatWarning, warn
gtfpat = re.compile(r"^ *([^ ]*) +(.*) *$")
# From the spec: http://www.sequenceontology.org/gff3.shtml
# In addition to Parent, the Alias, Note, Dbxref and Ontology_term attributes can have multiple values.
# Also, SGD uses 'dbxref' instead of 'Dbxref'
_GFF3_DEFAULT_LISTS = ("Parent", "Alias", "Note", "Dbxref", "Ontology_term", "dbxref")
#===============================================================================
# INDEX: helper functions for escaping
#===============================================================================
# must escape % first, otherwise we'll end up escaping everything else,
# since other escape codes start with percent signs
_GFF3_escape_sequences = [
('%', '%25'), # percent signs MUST be escaped FIRST
(';', '%3B'),
(',', '%2C'),
('=', '%3D'),
('&', '%26'),
('\x00', '%00'),
('\x01', '%01'),
('\x02', '%02'),
('\x03', '%03'),
('\x04', '%04'),
('\x05', '%05'),
('\x06', '%06'),
('\x07', '%07'),
('\x08', '%08'),
('\t', '%09'),
('\n', '%0A'),
('\x0b', '%0B'),
('\x0c', '%0C'),
('\r', '%0D'),
('\x0e', '%0E'),
('\x0f', '%0F'),
('\x10', '%10'),
('\x11', '%11'),
('\x12', '%12'),
('\x13', '%13'),
('\x14', '%14'),
('\x15', '%15'),
('\x16', '%16'),
('\x17', '%17'),
('\x18', '%18'),
('\x19', '%19'),
('\x1a', '%1A'),
('\x1b', '%1B'),
('\x1c', '%1C'),
('\x1d', '%1D'),
('\x1e', '%1E'),
('\x1f', '%1F'),
('\x7f', '%7F'),
('\x80', '%80'),
('\x81', '%81'),
('\x82', '%82'),
('\x83', '%83'),
('\x84', '%84'),
('\x85', '%85'),
('\x86', '%86'),
('\x87', '%87'),
('\x88', '%88'),
('\x89', '%89'),
('\x8a', '%8A'),
('\x8b', '%8B'),
('\x8c', '%8C'),
('\x8d', '%8D'),
('\x8e', '%8E'),
('\x8f', '%8F'),
('\x90', '%90'),
('\x91', '%91'),
('\x92', '%92'),
('\x93', '%93'),
('\x94', '%94'),
('\x95', '%95'),
('\x96', '%96'),
('\x97', '%97'),
('\x98', '%98'),
('\x99', '%99'),
('\x9a', '%9A'),
('\x9b', '%9B'),
('\x9c', '%9C'),
('\x9d', '%9D'),
('\x9e', '%9E'),
('\x9f', '%9F')
]
"""List mapping characters to their escape sequences, per the `GFF3`_ specification"""
_GTF2_escape_sequences = copy.deepcopy(_GFF3_escape_sequences)
_GTF2_escape_sequences.append(("\"", "%22"))
"""List mapping characters to their escape sequences for `GTF2`_. These are undefined,
but we are using `GFF3`_ characters plus double quotation marks as a convention.
"""
[docs]def escape(inp, char_pairs):
"""Escape reserved characters specified in the list of tuples `char_pairs`
Parameters
----------
inp : str
Input string
chair_pairs : list
List of tuples of (character, escape sequence for character)
Returns
-------
str
Escaped output
See also
--------
unescape_GFF3
"""
for char_, repl in char_pairs:
inp = inp.replace(char_, repl)
return inp
[docs]def unescape(inp, char_pairs):
"""Unescape reserved characters specified in the list of tuples `char_pairs`
Parameters
----------
inp : str
Input string
Returns
-------
str
Unescaped output
See also
--------
escape_GFF3
"""
for repl, char_ in reversed(char_pairs):
inp = inp.replace(char_, repl)
return inp
[docs]def escape_GFF3(inp):
"""Escape reserved characters in `GFF3`_ tokens using percentage notation.
In the `GFF3`_ spec, reserved characters include:
- control characters (ASCII 0-32, 127, and 128-159)
- tab, newline, & carriage return
- semicolons & commas
- the percent sign
- the equals sign
- the ampersand
Parameters
----------
inp : str
Input string
chair_pairs : list
List of tuples of (character, escape sequence for character)
Returns
-------
str
Escaped output
See also
--------
unescape_GFF3
"""
return escape(inp, _GFF3_escape_sequences)
[docs]def unescape_GFF3(inp):
"""Unescape reserved characters in `GFF3`_ tokens using percentage notation.
In the `GFF3`_ spec, reserved characters include:
- control characters (ASCII 0-32, 127, and 128-159)
- tab, newline, & carriage return
- semicolons & commas
- the percent sign
- the equals sign
- the ampersand
Parameters
----------
inp : str
Input string
Returns
-------
str
Unescaped output
See also
--------
escape_GFF3
"""
return unescape(inp, _GFF3_escape_sequences)
[docs]def escape_GTF2(inp):
"""Escape reserved characters in `GTF2`_ tokens using percentage notation.
While the `GTF2`_ spec is agnostic for escaping, it is useful when adding
extra attributes to files. As a convention, we escape the characters
specified in the `GFF3`_ spec, as well as double quotation marks.
In the `GTF2`_ spec, reserved characters include:
- control characters (ASCII 0-32, 127, and 128-159)
- tab, newline, & carriage return
- semicolons & commas
- the percent sign
- the equals sign
- the ampersand
Parameters
----------
inp : str
Input string
chair_pairs : list
List of tuples of (character, escape sequence for character)
Returns
-------
str
Escaped output
See also
--------
unescape_GFF3
"""
return escape(inp, _GTF2_escape_sequences)
[docs]def unescape_GTF2(inp):
"""Unescape reserved characters in `GTF2`_ tokens using percentage notation.
While the `GTF2`_ spec is agnostic for escaping, it is useful when adding
extra attributes to files. As a convention, we escape the characters
specified in the `GFF3`_ spec, as well as single quotation marks.
In the `GFF3`_ spec, reserved characters include:
- control characters (ASCII 0-32, 127, and 128-159)
- tab, newline, & carriage return
- semicolons & commas
- the percent sign
- the equals sign
- the ampersand
Parameters
----------
inp : str
Input string
Returns
-------
str
Unescaped output
See also
--------
escape_GFF3
"""
return unescape(inp, _GTF2_escape_sequences)
#===============================================================================
# INDEX: attribute token formatting and parsing
#===============================================================================
def _make_generic_tokens(attr, excludes=None, join_pat='%s %s; ', escape=None):
"""Helper function to convert the `attr` dict of a |SegmentChain|
into the string representation used in GFF files. This includes
URL escaping of keys and values, and catenating lists with `','`
before string conversion
Parameters
----------
attr : dict
Dictionary of key-value pairs to export
excludes : list<str>
List of keys to exclude from string
join_pat
printf-style pattern explaining how to join key:value pairs
escape : None or func, optional
If None, no special characters are escaped. If a function, that
funciton will be used to perform the escaping. (Default: `False`)
Returns
-------
str
"""
f = lambda x: x[0] not in excludes
if escape is None:
esc = lambda inp: inp
else:
esc = lambda inp: escape(str(inp))
excludes = [] if excludes is None else excludes
ltmp = []
for key, val in filter(f, attr.items()):
if isinstance(val, list):
val = ",".join([esc(X) for X in val])
else:
val = esc(val)
ltmp.append(join_pat % (esc(key), val))
return ''.join(ltmp)
[docs]def make_GFF3_tokens(attr, excludes=None, escape=True):
"""Helper function to convert the `attr` dict of a |SegmentChain|
into the string representation used in `GFF3`_ files. This includes
URL escaping of special characters, and catenating lists with '`,`'
before string conversion
Examples
--------
>>> d = {'a':1,'b':2,'c':3,'d':4,'e':5,'z':26,'text':"something; with escape sequences"}
>>> _make_GFF3_tokens(d)
'a=1;c=3;b=2;e=5;d=4;z=26;text=something%3B with escape sequences'
>>> excludes=['a','b','c']
>>> _make_GFF3_tokens(d,excludes)
'e=5;d=4;z=26;text=something%3B with escape sequences'
>>> d = {'a':1,'b':2,'c':[3,7],'d':4,'e':5,'z':26}
>>> _make_GFF3_tokens(d)
'a=1;c=3,7;b=2;e=5;d=4;z=26'
Parameters
----------
attr : dict
Dictionary of key-value pairs to export
excludes : list, optional
List of keys to exclude from string
escape : bool, optional
If True, special characters in output are `GFF3`_-escaped (Default: `True`)
Returns
-------
str
Data formatted for *attributes* column of `GFF3`_ (column 9)
"""
escape = escape_GFF3 if escape == True else None
excludes = [] if excludes is None else excludes
return _make_generic_tokens(attr, excludes=excludes, join_pat="%s=%s;", escape=escape)
[docs]def make_GTF2_tokens(attr, excludes=None, escape=True):
"""Helper function to convert the `attr` dict of a |SegmentChain|
into the string representation used in `GTF2`_ files. By default, special
characters defined in the `GFF3`_ spec will be URL-escaped.
Examples
--------
>>> d = {'transcript_id' : 't;id', 'a':1,'b':2,'c':3,'d':4,'e':5,'z':26,
'gene_id' : 'gid'}
>>> _make_GTF2_tokens(d)
'transcript_id "t%3Bid"; gene_id "gid"; a "1"; c "3"; b "2"; e "5"; d "4"; z "26";'
>>> excludes=['a','b','c']
>>> _make_GTF2_tokens(d,excludes)
'transcript_id "t%3Bid"; gene_id "gid"; e "5"; d "4"; z "26";'
Parameters
----------
attr : dict
Dictionary of key-value pairs to export
excludes : list, optional
List of keys to exclude from string
escape : bool, optional
If True, special characters in output are `GTF2`_-escaped (Default: `True`)
Returns
-------
str
Data formatted for *attributes* column of `GTF2`_ (column 9)
"""
excludes = [] if excludes is None else excludes
excludes.extend(["transcript_id", "gene_id"])
stmp = 'gene_id "%s"; transcript_id "%s"; ' % (attr.get("gene_id"), attr.get("transcript_id"))
if escape == True:
escape = escape_GTF2
else:
escape = None
return stmp + _make_generic_tokens(
attr, excludes=excludes, join_pat='%s "%s"; ', escape=escape
).strip(" ")
[docs]def parse_GFF3_tokens(inp, list_types=None):
"""Helper function to parse tokens in the final column of a `GFF3`_ file
into a dictionary of attributes. Because, the following attributes are
permitted to have multiple values in the `GFF3`_ spec, their values, if present
are returned as lists in the dictionary rather than strings:
- `Parent`
- `Alias`
- `Note`
- `Dbxref`
- `Ontology_term`
All values are unescaped folowing the `GFF3`_ specification.
Examples
--------
>>> tokens = 'a=1;c=3;b=2;e=5;d=4;z=26,Parents=gene01'
>>> parse_GFF3_tokens(tokens)
{'a': '1', 'c': '3', 'b': '2', 'e': '5', 'd': '4', 'z': '26', 'parents' : ['gene01'] }
>>> tokens = 'a=1;c=3,7;b=2;e=5;d=4;z=26,Parents=gene01,gene02'
>>> parse_GFF3_tokens(tokens)
{'a': '1', 'c': '3,7', 'b': '2', 'e': '5', 'd': '4', 'z': '26', 'parents' : ['gene01','gene02']}
Parameters
----------
inp : str
Ninth column of `GFF3`_ entry
list_types : list, optional
Names of attributes that should be returned as lists
(Default: %s)
Returns
-------
dict : key-value pairs
""" % ",".join(_GFF3_DEFAULT_LISTS)
if list_types is None:
list_types = _GFF3_DEFAULT_LISTS
d = {}
items = inp.strip("\n").strip(";").split(";")
for item in items:
if len(item) > 0:
key, val = item.split("=")
key = unescape_GFF3(key.strip(" "))
if key in list_types:
val = [unescape_GFF3(X) for X in val.strip(" ").split(",")]
else:
val = unescape_GFF3(val.strip(" "))
if key in d:
warn(
"Found duplicate attribute key '%s' in GFF3 line. Catenating value with previous value for key in attr dict:\n %s"
% (key, inp), FileFormatWarning
)
val = "%s,%s" % (d[key], val)
d[key] = val
return d
[docs]def parse_GTF2_tokens(inp):
"""Helper function to parse tokens in the final column of a `GTF2`_ file
into a dictionary of attributes. All attributes are returned as strings,
and are unescaped if GFF escape sequences (e.g. *'%2B'*) are present.
If duplicate keys are present (e.g. as in GENCODE `GTF2`_ files),
their values are catenated, separated by a comma.
Examples
--------
>>> tokens = 'gene_id "mygene"; transcript_id "mytranscript";'
>>> parse_GTF2_tokens(tokens)
{'gene_id' : 'mygene', 'transcript_id' : 'mytranscript'}
>>> tokens = 'gene_id "mygene"; transcript_id "mytranscript"'
>>> parse_GTF2_tokens(tokens)
{'gene_id' : 'mygene', 'transcript_id' : 'mytranscript'}
>>> tokens = 'gene_id "mygene;"; transcript_id "myt;ranscript"'
>>> parse_GTF2_tokens(tokens)
{'gene_id' : 'mygene;', 'transcript_id' : 'myt;ranscript'}
>>> tokens = 'gene_id "mygene"; transcript_id "mytranscript"; tag "tag value";'
>>> parse_GTF2_tokens(tokens)
{'gene_id' : 'mygene', 'tag' : 'tag value', 'transcript_id' : 'mytranscript'}
>>> tokens = 'gene_id "mygene"; transcript_id "mytranscript"; tag "tag value"; tag "tag value 2";'
>>> parse_GTF2_tokens(tokens)
{'gene_id' : 'mygene', 'tag' : 'tag value,tag value 2', 'transcript_id' : 'mytranscript'}
Parameters
----------
inp : str
Ninth column of `GTF2`_ entry
Returns
-------
dict : key-value pairs
"""
d = {}
items = shlex.split(inp.strip("\n"))
assert len(items) % 2 == 0
for i in range(0, len(items), 2):
key = unescape_GTF2(items[i])
val = items[i + 1]
# require separation by semicolons for all but final token
if i + 1 < len(items) - 2:
assert val.endswith(";")
if val.endswith(";"):
val = val[:-1]
if key in d:
warn(
"Found duplicate attribute key '%s' in GTF2 line. Catenating value with previous value for key in attr dict:\n %s"
% (key, inp), FileFormatWarning
)
d[key] = "%s,%s" % (d[key], unescape_GTF2(val))
else:
d[key] = unescape_GTF2(val)
return d