Source code for plastid.readers.autosql

#!/usr/bin/env python
"""This module contains parsers for data structures written in the `autoSql`_
object specification language, used by the `UCSC genome browser`_, `BigBed`_ files
and `BigWig`_ files.

.. contents::
   :local:
   
   
Summary
-------

Parsers are constructed by initializing an |AutoSqlDeclaration| with a block of
`autoSql`_ text::

    >>> declaration = '''table easy_table
    "A table with a comment on the next line" 
        (
        uint number auto; "a number with a token"
        uint [3] points ; "r,g,b values"
        lstring  my_string ; "a long string"
        uint a_field_size ; "the size for the next field"
        float [a_field_size] float_array ; "an array of floats"
        set(a,b,c) alpha ; "the first three letters of the alphabet"
        )
    '''
    >>> record_parser = AutoSqlDeclaration(declaration)


The parser that is created can then be called to parse text records into dictionaries::

    >>> record_parser("3    1,2,3    my string with spaces    5    1.1,1.2,1.3,1.4,1.5    a,b")
    OrderedDict([("number",3),
                 ("points",(1,2,3)),
                 ("my_string","my string with spaces"),
                 ("a_field_size",5),
                 ("float_array",(1.1,1.2,1.3,1.4,1.5)),
                 ("alpha",{'a','b'})])

Module contents
---------------
|AutoSqlDeclaration|
    Parses `autoSql`_ declarations for `table`, `simple`, and `object`
    declaration types. Delegates parsing of individual fields to appropriate subclasses 
    (e.g. |AutoSqlField|, |SizedAutoSqlField|, and |ValuesAutoSqlField|).

|AutoSqlField|, |SizedAutoSqlField|, |ValuesAutoSqlField|
    Parse various sorts of fields within an autoSql declaration block


Notes
-----
 #. These parsers seek only to provide Python bindings for `autoSql`_ declarations.
    They do **NOT** generate C or SQL code from `autoSql`_, as those functions
    are already provided by `Jim Kent's utilities <https://github.com/ENCODE-DCC/kentUtils/tree/master/>`_

 #. ``set`` and ``enum`` field types are parsed as ``sets`` of strings

 #. ``primary``, ``index``, and ``auto`` `autoSQL`_ tags are accepted in line declarations,
    but are ignored because they are not relevant for parsing

 #. The parsers assume that they will be parsing tab-delimited text blocks
  
 #. Although declarations are routinely nested as fields within other
    declarations in C ``struct`` s and in SQL databases, in the absence of a standard,
    it is unclear how these would be serialized within tab-delimited columns of `BigBed`_
    files. Therefore, nested declarations are not supported.


See Also
--------
`Updated autoSql Grammar specification <https://github.com/ENCODE-DCC/kentUtils/blob/36d6274459f644d5400843b8fa097b380b8f7867/src/hg/autoSql/autoSql.doc>`_
    Explanation of autoSql grammar

`The ENCODE project's tests for autoSql parsers <https://github.com/ENCODE-DCC/kentUtils/tree/master/src/hg/autoSql/tests/input>`_
    Official autoSql unit tests

`Kent & Brumbaugh, 2002 <http://www.linuxjournal.com/article/5949>`_
    First publication of autoSql & autoXml 

"""
import re
from collections import OrderedDict
from abc import abstractmethod
from plastid.util.services.exceptions import DataWarning, warn, filterwarnings

# use "once" because we want each literal to be shown once
# (one filter per warning message emitted by the parsers in this module)
filterwarnings("once", "Could not find formatter for field.*")
filterwarnings("once", "Could not convert autoSql value.*")
filterwarnings("once", ".*already found in autoSql declaration.*")

# regular expressions that recognize various autoSql elements
# TODO: migrate this to Parsimonious or something more sane

# One optional tag that may follow a field name: ``primary``, ``auto``, or
# ``index`` (the last optionally followed by a bracketed number).
_optional_tag = r"\s+primary|\s+auto|\s+index\s*(\[\s*\d+\s*\])?"

# Regex fragments, keyed by the autoSql element they recognize. Parser classes
# concatenate these fragments to build their own ``match_pattern``.
_pattern_bits = dict(
    # leading whitespace at the start of the text
    start=r"^\s*",
    # field type and field name
    type=r"(?P<type>\w+)",
    name=r"\s+(?P<name>\w+)\s*",
    # semicolon that separates a field declaration from its comment
    semi=r"\s*;\s*",
    # double-quoted comment (may be empty)
    comment=r"\"(?P<comment>[^\"]*)\"",
    # bracketed size of an array field: a number or a field name
    size=r"\s*\[\s*(?P<size>\w+)\s*\]\s*",
    # parenthesized value list of a set / enum field
    values=r"\s*\(\s*(?P<value_names>[^()]+)\s*\)\s*",
    # up to three optional tags, captured as opt1, opt2, and opt3
    optionals=r"\s*".join(r"(?P<opt%d>%s)?" % (n, _optional_tag) for n in (1, 2, 3)),
    # header of a declaration: its kind and its name
    declare_type_name=r"(?P<declare_type>object|simple|table)\s+(?P<declare_name>\w+)\s+",
    # everything between the parentheses that enclose the field list
    field_text=r"\s*\(\s*(?P<field_text>.*)\)",
)


class AbstractAutoSqlElement(object):
    """Abstract base class for parsers of autoSql elements

    Attributes
    ----------
    attr : dict
        Dictionary of attributes describing the element (e.g. *name,* *type,* etc.),
        populated from the named groups of ``match_pattern``

    autosql : str
        Block of autoSql text specifying format of element

    match_pattern : :py:class:`re.RegexObject`
        Pattern that determines whether or not a block of autoSql matches this object

    parent : instance of subclass of |AbstractAutoSqlElement|, or None
        Parent / enclosing element

    field_types : dict
        Dictionary matching type names (as strings) to formatters that parse them
        from plaintext. The built-in entries are ``(callable, code)`` tuples
        (the one-letter codes are presumably :py:mod:`struct` format
        characters -- TODO confirm)

    delim : str, optional
        Text delimiter for fields in blocks called by :py:meth:`__call__`
        (Default: "\\t")
    """
    match_str = ""
    match_pattern = re.compile(match_str)

    # Double-quoted comment, possibly empty. Uses ``*`` rather than ``+`` so an
    # empty comment ("") is consumed as one unit instead of causing its closing
    # quote to be paired with the opening quote of the next comment.
    _comment_pattern = re.compile(r"\"[^\"]*\"")

    def __init__(self, autosql, parent=None, delim="\t"):
        """Create an autoSql element parser

        Parameters
        ----------
        autosql : str
            Block of autoSql text specifying format of element

        parent : instance of subclass of |AbstractAutoSqlElement| or None, optional
            Parent / enclosing element. Default: None

        delim : str, optional
            Field delimiter (default: tab)

        Raises
        ------
        AttributeError
            If ``autosql`` is not matched by ``match_pattern`` (the search
            returns `None`). Use :py:meth:`matches` to check beforehand.
        """
        self.autosql = autosql
        self.parent = parent
        self.delim = delim
        self.field_types = {
            "int"     : (int, "i"),   #32-bit
            "uint"    : (int, "I"),   #32-bit
            "short"   : (int, "h"),   #16-bit
            "ushort"  : (int, "H"),   #16-bit
            "byte"    : (int, "b"),   #8-bit
            "ubyte"   : (int, "B"),   #8-bit
            "float"   : (float, "f"), #single-precision
            "char"    : (str, "c"),   #8-bit
            "string"  : (str, "s"),   #variable up to 255bytes
            "lstring" : (str, "s"),   #variable up to 2billion bytes
        } # yapf: disable
        self.attr = self.match_pattern.search(autosql).groupdict()

    def __repr__(self):
        return "<%s name=%s type=%s>" % (
            self.__class__.__name__, self.attr["name"],
            self.attr.get("type", self.__class__.__name__)
        )

    def add_type(self, name, formatter):
        """Add a type to the parser

        Parameters
        ----------
        name : str
            Name of data type

        formatter : callable
            Function/callable that, when applied to autoSql text, yields
            an object of the type specified by ``name``. It is stored in
            ``self.field_types`` exactly as given.
        """
        self.field_types[name] = formatter

    @abstractmethod
    def __call__(self, text, rec=None):
        """Parse an OrderedDict matching ``self.autosql`` from a block of delimited text

        Parameters
        ----------
        text : str
            Multiline text block, formatted in autoSql

        rec : OrderedDict or None, optional
            Record whose attributes are being populated by recursive
            processing of ``text``. Passed in cases where fields sized by variables
            need to look up instance values of earlier fields to evaluate those
            variables.
        """
        pass

    @staticmethod
    def mask_comments(text):
        """Mask all comments in an autoSql block in order to facilitate parsing
        by regular expressions

        Parameters
        ----------
        text : str
            autoSql-formatted text

        Returns
        -------
        str
            Text in which the contents of every double-quoted comment are
            replaced by "x" characters. The quotation marks are kept and the
            length of the text is unchanged, so indices into the masked text
            are also valid for the original text.

        list
            List of ``(start, end)`` index pairs, one per comment in ``text``.
            Each pair delimits the comment text *between* the quotation marks:
            ``text[start:end]`` is the comment without its quotes, and ``end``
            is the index of the closing quote.
        """
        match_locs = []
        # finditer() keeps scanning the string it was given, so rebinding `text`
        # inside the loop is safe; masking never changes the length of the text
        for match in AbstractAutoSqlElement._comment_pattern.finditer(text):
            inner_start = match.start() + 1
            inner_end = match.end() - 1
            match_locs.append((inner_start, inner_end))
            text = text[:inner_start] + "x" * (inner_end - inner_start) + text[inner_end:]

        return text, match_locs

    @classmethod
    def matches(cls, text):
        """Determine whether autoSql formatting text matches this autoSql element

        Parameters
        ----------
        text : str
            Block of autoSql-formatted declaration text

        Returns
        -------
        bool
            True if an autoSql parser of this class's type can be made from this
            specification, otherwise False
        """
        return cls.match_pattern.search(text) is not None


class AutoSqlDeclaration(AbstractAutoSqlElement):
    """Parser factory that converts delimited text blocks into OrderedDicts,
    following the field names and types described by an autoSql declaration element

    Parameters
    ----------
    autosql : str
        Block of autoSql text specifying format of element

    parent : instance of subclass of |AbstractAutoSqlElement| or `None`, optional
        Parent / enclosing element. Default: None

    delim : str, optional
        Field delimiter (default: tab)

    Attributes
    ----------
    attr : dict
        Dictionary of descriptive attributes (e.g. *name,* *type,* *declare_type,* etc.)

    field_formatters : OrderedDict
        Dictionary mapping field names to the field parsers that format them

    field_comments : OrderedDict
        Dictionary mapping field names to comments

    field_types : dict
        Dictionary matching type names (as strings) to formatters that parse them
        from plaintext

    autosql : str
        Block of autoSql text specifying format of element

    match_pattern : :py:class:`re.RegexObject`
        Pattern that determines whether or not a block of autoSql matches this object

    parent : instance of subclass of |AbstractAutoSqlElement|, or None
        Parent / enclosing element. Default: None

    delim : str, optional
        Text delimiter for fields in blocks called by :py:meth:`__call__`
        (Default: "\\t")

    Methods
    -------
    :py:meth:`AutoSqlDeclaration.__call__`
        Parse autoSql-formatted blocks of text according to this declaration
    """
    match_str = r"".join(
        [_pattern_bits[_X] for _X in ("start", "declare_type_name", "comment", "field_text")]
    )
    match_pattern = re.compile(match_str, re.S)

    def __init__(self, autosql, parent=None, delim="\t"):
        """Create an |AutoSqlDeclaration|

        Parameters
        ----------
        autosql : str
            Block of autoSql text specifying format of element

        parent : instance of subclass of |AbstractAutoSqlElement| or None, optional
            Parent / enclosing element. Default: None

        delim : str, optional
            Field delimiter (default: tab)
        """
        AbstractAutoSqlElement.__init__(self, autosql, parent=parent, delim=delim)

        # re-do regex match masking out comments, in case the comments
        # contain special characters that would mess up the parsing
        masked_sql, _ = self.mask_comments(autosql)
        match = self.match_pattern.search(masked_sql)
        self.attr["declare_type"] = match.group("declare_type")
        self.attr["name"] = match.group("declare_name")

        # Masking preserves string length, so spans found in the masked text
        # can be used to slice the original, unmasked text.
        comment_start, comment_end = match.span("comment")
        self.attr["comment"] = autosql[comment_start:comment_end].strip("\n").strip("\"")

        field_start, field_end = match.span("field_text")
        self._field_text = autosql[field_start:field_end]

        if self.parent is not None:
            self.parent.add_type(self.attr["declare_name"], self)

        self.field_formatters = OrderedDict()
        self.field_comments = OrderedDict()
        self._parse_fields()

    def _parse_fields(self):
        """Parse fields of an autoSql declaration, and populate
        ``self.field_formatters`` and ``self.field_comments``.

        Field text that matches none of the known field classes is skipped.
        A field whose name is already taken is renamed by appending the first
        free integer suffix (starting at 2), and a `DataWarning` is issued.
        """
        # order in which we try to match autoSql fields
        match_order = [AutoSqlField, SizedAutoSqlField, ValuesAutoSqlField]

        # fields are area of string from last starting point to end of comment
        # first starting point is 0; all subsequent starting points will be end
        # of previous comment
        _, comment_locs = self.mask_comments(self._field_text)
        last_index = 0
        for (_, next_index) in comment_locs:
            field_end = next_index + 1
            field_str = self._field_text[last_index:field_end]
            last_index = field_end

            for field_class in match_order:
                if not field_class.matches(field_str):
                    continue

                my_parser = field_class(field_str)
                name = my_parser.attr["name"]
                if name in self.field_formatters:
                    oldname = name
                    i = 1
                    current_formatter = self.field_formatters[name]
                    current_type = current_formatter.attr.get(
                        "type", current_formatter.__class__.__name__
                    )
                    new_type = my_parser.attr.get("type", my_parser.__class__.__name__)
                    while name in self.field_formatters:
                        i += 1
                        name = "%s%s" % (oldname, i)

                    warn(
                        "Element named '%s' of type '%s' already found in autoSql declaration '%s.' Renaming current element of type '%s' to '%s'"
                        % (
                            oldname, current_type,
                            self.attr.get("name", "unnamed declaration"), new_type, name
                        ), DataWarning
                    )
                    my_parser.attr["name"] = name

                self.field_formatters[name] = my_parser
                self.field_comments[name] = my_parser.attr["comment"]
                # the field patterns differ in the token that follows the type
                # name, so at most one class can match a given field
                break

    def __repr__(self):
        return "<%s name=%s type=%s fields=[%s]>" % (
            self.__class__.__name__, self.attr["name"],
            self.attr.get("type", self.__class__.__name__), ",".join(self.field_formatters.keys())
        )

    def __call__(self, text, rec=None):
        """Parse an OrderedDict matching ``self.autosql`` from a block of delimited text

        Parameters
        ----------
        text : str
            One record of text, with fields separated by ``self.delim``.
            A trailing line terminator is ignored.

        rec : OrderedDict or None, optional
            Accepted for compatibility with the other element parsers; it is not
            consulted here. Each field formatter instead receives the record
            being built, so that fields sized by variables can look up the
            values of earlier fields.

        Returns
        -------
        OrderedDict
            Dictionary mapping field names to their values. If ``text`` holds
            fewer items than the declaration has fields, the trailing fields
            are absent from the result.
        """
        # drop the line terminator so that it does not end up in the last field
        items = text.rstrip("\r\n").split(self.delim)
        obj = OrderedDict()
        for item, (field_name, formatter) in zip(items, self.field_formatters.items()):
            obj[field_name] = formatter(item, rec=obj)

        return obj


class AutoSqlField(AbstractAutoSqlElement):
    """Parser factory for autoSql fields of type ``fieldType fieldName ';' comment``

    Parameters
    ----------
    autosql : str
        Block of autoSql text specifying format of element

    parent : instance of subclass of |AbstractAutoSqlElement| or None, optional
        Parent / enclosing element. Default: None

    delim : str, optional
        Field delimiter (default: empty string). Stored on the instance, but
        not used by :meth:`__call__` of this class.

    Attributes
    ----------
    attr : dict
        Dictionary of descriptive attributes (e.g. name, type, etc.)

    formatter : callable
        Callable/function that converts plain text into an object of the correct type

    autosql : str
        Block of autoSql text specifying format of element

    match_pattern : :class:`re.RegexObject`
        Pattern that determines whether or not a block of autoSql matches this object

    parent : instance of subclass of :class:`AbstractAutoSqlElement` or `None`
        Parent / enclosing element (Default: None)

    delim : str, optional
        Text delimiter stored for the field (Default: empty string)
    """
    match_str = r"".join(
        [_pattern_bits[_X] for _X in ("start", "type", "name", "optionals", "semi", "comment")]
    )
    match_pattern = re.compile(match_str)

    def __init__(self, autosql, parent=None, delim=""):
        """Create an |AutoSqlField|

        The formatter is looked up by type name, first in this element's own
        ``field_types`` and then in those of ``parent``. If neither lookup
        succeeds, values are cast to `str` and a `DataWarning` is issued.

        Parameters
        ----------
        autosql : str
            Block of autoSql text specifying format of element

        parent : instance of subclass of |AbstractAutoSqlElement| or None, optional
            Parent / enclosing element. Default: None

        delim : str, optional
            Field delimiter (default: empty string)
        """
        AbstractAutoSqlElement.__init__(self, autosql, parent=parent, delim=delim)
        type_ = self.attr["type"]
        try:
            self.formatter = self.field_types[type_][0]
        except KeyError:
            try:
                self.formatter = self.parent.field_types[type_][0]
            except (AttributeError, KeyError, TypeError, IndexError):
                # AttributeError: no parent (or one without field_types)
                # KeyError: parent does not know the type either
                # TypeError / IndexError: the registered entry is not a
                #   non-empty (formatter, ...) sequence
                self.formatter = str
                warn(
                    "Could not find formatter for field '%s' of type '%s'. Casting to 'string' instead."
                    % (self.attr["name"], type_), DataWarning
                )

    def __call__(self, text, rec=None):
        """Parse a value matching the field described by ``self.autosql``
        from a block of delimited text

        Parameters
        ----------
        text : str
            Text of a single field

        rec : OrderedDict or None, optional
            Unused by this class; accepted so that all field parsers share
            one call signature.

        Returns
        -------
        Value or object of appropriate type. If the formatter raises
        `ValueError`, a `DataWarning` is issued and ``text`` is returned
        unchanged.
        """
        try:
            return self.formatter(text)
        except ValueError:
            message = "Could not convert autoSql value '%s' for field '%s' to type '%s'. Casting to 'string' instead. " % (
                text, self.attr["name"], self.formatter.__name__
            )
            warn(message, DataWarning)
            return text


class SizedAutoSqlField(AutoSqlField):
    """Parser factory for autoSql fields of type ``fieldType `[` fieldSize `]` fieldName ';' comment``

    Parameters
    ----------
    autosql : str
        Block of autoSql text specifying format of element

    parent : instance of subclass of |AbstractAutoSqlElement| or None, optional
        Parent / enclosing element. Default: None

    delim : str, optional
        Delimiter between the elements of the field (default: comma)

    Attributes
    ----------
    attr : dict
        Dictionary of descriptive attributes (e.g. *name*, *size,* *type,* etc.).
        ``attr["size"]`` is an `int` when ``attr["size_is_int"]`` is True;
        otherwise it is kept as text and used as the name of another field
        whose value gives the size.

    formatter : callable
        Callable/function that converts plain text into an object of the correct type

    autosql : str
        Block of autoSql text specifying format of element

    match_pattern : :class:`re.RegexObject`
        Pattern that determines whether or not a block of autoSql matches this object

    parent : instance of subclass of :class:`AbstractAutoSqlElement` or `None`
        Parent / enclosing element (Default: None)

    delim : str, optional
        Delimiter between elements in text passed to :meth:`__call__`
        (Default: comma)

    Methods
    -------
    :py:meth:`SizedAutoSqlField.__call__`
        Parse autoSql-formatted blocks of text into the tuples of the object
        type specified by this field
    """
    match_str = r"".join(
        [
            _pattern_bits[_X]
            for _X in ("start", "type", "size", "name", "optionals", "semi", "comment")
        ]
    )
    match_pattern = re.compile(match_str)

    def __init__(self, autosql, size=1, parent=None, delim=","):
        """Create a |SizedAutoSqlField|

        Parameters
        ----------
        autosql : str
            Block of autoSql text specifying format of element

        size : int, optional
            Unused; the size is read from the bracketed value in ``autosql``.
            Kept for backward compatibility of the signature.

        parent : instance of subclass of |AbstractAutoSqlElement| or None, optional
            Parent / enclosing element. Default: None

        delim : str, optional
            Delimiter between the elements of the field (default: comma)
        """
        AutoSqlField.__init__(self, autosql, parent=parent, delim=delim)
        try:
            self.attr["size"] = int(self.attr["size"])
            self.attr["size_is_int"] = True
        except ValueError:
            # not a number: the size is kept as text and looked up in the
            # record at parse time
            self.attr["size_is_int"] = False

    def __call__(self, text, rec=None):
        """Parse a value matching the field described by ``self.autosql``
        from a block of delimited text

        Parameters
        ----------
        text : str
            Text of a single field, with elements separated by ``self.delim``

        rec : OrderedDict or None, optional
            Record whose attributes are being populated by recursive
            processing of ``text``. Required when the size of this field is
            given by the name of an earlier field, whose value is then read
            from ``rec``.

        Returns
        -------
        tuple or str
            Tuple of converted elements. If the formatter is `str`, ``text``
            is returned as-is and its length in characters is checked instead.
            If an element cannot be converted, a `DataWarning` is issued and
            ``text`` is returned unchanged, without a size check.

        Raises
        ------
        AssertionError
            If the number of elements differs from the declared size
        """
        if self.formatter is not str:
            try:
                retval = tuple(
                    self.formatter(X) for X in text.strip().strip(self.delim).split(self.delim)
                )
            except ValueError:
                message = "Could not convert autoSql value '%s' in field '%s' to tuple of type '%s'. Leaving as str " % (
                    text, self.attr["name"], self.formatter.__name__
                )
                warn(message, DataWarning)
                return text
        else:
            retval = text

        if self.attr["size_is_int"]:
            expected = self.attr["size"]
        else:
            expected = rec[self.attr["size"]]

        # Raised explicitly, rather than with an `assert` statement, so that the
        # check is still performed when Python runs with optimizations (-O).
        if len(retval) != expected:
            raise AssertionError(
                "Field '%s' should contain %s elements, but found %s" %
                (self.attr["name"], expected, len(retval))
            )

        return retval


# for set, enum types
class ValuesAutoSqlField(AbstractAutoSqlElement):
    """Parser factory for autoSql fields of type ``fieldType `(` fieldValues `)` fieldName ';' comment``
    where ``fieldType`` would typically be ``set`` or ``enum``

    Parameters
    ----------
    autosql : str
        Block of autoSql text specifying format of element

    parent : instance of subclass of |AbstractAutoSqlElement| or None, optional
        Parent / enclosing element. Default: None

    delim : str, optional
        Delimiter between the values of the field (default: comma)
    """
    match_str = "".join(
        _pattern_bits[key]
        for key in ("start", "type", "values", "name", "optionals", "semi", "comment")
    )
    match_pattern = re.compile(match_str)

    def __init__(self, autosql, parent=None, delim=","):
        """Create a |ValuesAutoSqlField|

        Parameters
        ----------
        autosql : str
            Block of autoSql text specifying format of element

        parent : instance of subclass of |AbstractAutoSqlElement| or None, optional
            Parent / enclosing element. Default: None

        delim : str, optional
            Delimiter between the values of the field (default: comma)
        """
        AbstractAutoSqlElement.__init__(self, autosql, parent=parent, delim=delim)
        # turn the comma-separated list captured from the declaration into a
        # list of value names free of surrounding whitespace
        declared = self.attr["value_names"].split(",")
        self.attr["value_names"] = [value.strip() for value in declared]

    def __call__(self, text, rec=None):
        """Parse the set of values held in one column of delimited text

        Parameters
        ----------
        text : str
            Text of a single field, with values separated by ``self.delim``

        rec : OrderedDict or None, optional
            Unused by this class; accepted so that all field parsers share
            one call signature.

        Returns
        -------
        set
            Set of items found in column, each stripped of surrounding
            whitespace; empty items are dropped
        """
        tokens = text.strip(self.delim).split(self.delim)
        trimmed = (token.strip() for token in tokens)
        return {token for token in trimmed if token}