Source code for plastid.util.scriptlib.argparsers

#!/usr/bin/env python
"""This module contains classes that:

  - build :class:`argparse.ArgumentParser` objects for various data types
    used in genomics

  - parse those arguments into useful file types


Arguments are grouped into the following sets:

    ===========================================================   ======================================
    **Parameter/argument set**                                    **Parser building class**
    -----------------------------------------------------------   --------------------------------------
    Generic parameters (e.g. for error reporting, logging)        :class:`BaseParser`

    :term:`Read alignments` or :term:`count files <count file>`   :class:`AlignmentParser`

    Genomic feature or mask annotations                           :class:`AnnotationParser`

    Genomic sequence files                                        :class:`SequenceParser`

    Plotting parameters for charts                                :class:`PlottingParser`
    ===========================================================   ======================================


Example
-------
To use any of these in your own command line scripts, follow these steps:

  #. Import one or more of the classes above::

         >>> import argparse
         >>> from plastid.util.scriptlib.argparsers import AnnotationParser


  #. Use the first function to create an  :class:`~argparse.ArgumentParser`,
     and supply this object as a `parent` when you build your script's
     :py:class:`~argparse.ArgumentParser`::

         >>> ap = AnnotationParser()

         # create annotation file parser
         >>> annotation_file_parser = ap.get_parser(disabled=["some_option_to_disable"])

         # create my own parser, incorporating flags from annotation_file_parser
         >>> my_own_parser = argparse.ArgumentParser(parents=[annotation_file_parser])

         # add script-specific arguments
         >>> my_own_parser.add_argument("positional_argument",type=str)
         >>> my_own_parser.add_argument("--foo",type=int,default=5,help="Some option")
         >>> my_own_parser.add_argument("--bar",type=str,default="a string",help="Another option")

  #. Then, use the second function to parse the arguments::

         >>> args = my_own_parser.parse_args()

         # get transcript objects from arguments
         # this will be an iterator over |Transcripts|
         >>> transcripts = ap.get_transcripts_from_args(args)

         >>> pass # rest of your script


Your script will then be able to process whatever sorts of annotation files that
plastid currently supports.


See Also
--------
:py:mod:`argparse`
    Python documentation on argument parsing

:py:obj:`plastid.bin`
    Source code of command-line scripts, for further examples
"""
import sys
import os
import functools
import argparse
import warnings
import pkg_resources
import pysam

from plastid.util.services.exceptions import (
    MalformedFileError,
    ArgumentWarning,
    DataWarning,
    FileFormatWarning,
    filterwarnings,
)
from plastid.util.services.decorators import deprecated
from plastid.util.io.openers import opener, NullWriter
from plastid.util.io.filters import CommentReader
from plastid.genomics.roitools import Transcript, SegmentChain
from plastid.readers.gff import (
    _DEFAULT_GFF3_TRANSCRIPT_TYPES,
    _DEFAULT_GFF3_EXON_TYPES,
    _DEFAULT_GFF3_CDS_TYPES,
)

#===============================================================================
# INDEX: Constants used in parsers below
#===============================================================================

# Size above which we recommend sorting a GFF/GTF2 file
_GFF_SORT_SIZE = 100 * 1024 * 1024

_MAPPING_RULE_TITLE = "alignment mapping functions (BAM & bowtie files only)"
_MAPPING_RULE_DESCRIPTION = \
"""For BAM or bowtie files, one of the mutually exclusive read mapping functions
is required:
"""

_MAPPING_OPTION_TITLE = "filtering and alignment mapping options"
_MAPPING_OPTION_DESCRIPTION = \
"""
The remaining arguments are optional and affect the behavior of specific
mapping functions:
"""


_DEFAULT_ALIGNMENT_FILE_PARSER_DESCRIPTION = \
"Open alignment or count files and optionally set mapping rules"

_DEFAULT_ALIGNMENT_FILE_PARSER_TITLE = "count & alignment file options"



_DEFAULT_ANNOTATION_PARSER_DESCRIPTION = \
"Open one or more genome annotation files"

_DEFAULT_ANNOTATION_PARSER_TITLE = \
"annotation file options (one or more annotation files required)"

GFF_SORT_MESSAGE = \
"""Sort and index your GTF2/GFF with Tabix as follows:

    $ sort -k1,1 -k4,4n my_file.FORMAT | bgzip > my_file_sorted.FORMAT.gz
    $ tabix -p gff my_file_sorted.FORMAT.gz

See http://www.htslib.org/doc/tabix.html for download and documentation of
tabix and bgzip."""

_MASK_PARSER_TITLE = "mask file options (optional)"
_MASK_PARSER_DESCRIPTION = \
"""Add mask file(s) that annotate regions that should be excluded from analyses
(e.g. repetitive genomic regions)."""

_DEFAULT_SEQUENCE_PARSER_TITLE = "sequence options"
_DEFAULT_SEQUENCE_PARSER_DESCRIPTION = ""

_DEFAULT_PLOTTING_TITLE = "Plotting options"

#===============================================================================
# INDEX: Base class for parsers
#===============================================================================


class Parser(object):
    """Base class for the argument parser factories defined in this module

    Parameters
    ----------
    groupname : str, optional
        Name of argument group. If not `None`, arguments are placed in an
        argument group added to the parser; otherwise they are added to the
        parser's main group.

    prefix : str, optional
        String prefix prepended to every argument name (Default: "")

    disabled : list, optional
        Names of arguments (without preceding dashes or prefix) that should
        be left out of the parser
    """

    def __init__(self, groupname=None, prefix="", disabled=None, **kwargs):
        """Create a parser factory

        Parameters
        ----------
        groupname : str, optional
            Name of argument group. If not `None`, arguments are placed in an
            argument group added to the parser; otherwise they are added to
            the parser's main group.

        prefix : str, optional
            String prefix prepended to every argument name (Default: "")

        disabled : list, optional
            Names of arguments (without preceding dashes or prefix) that
            should be left out of the parser

        kwargs : keyword arguments
            Accepted for signature compatibility; not used here.
        """
        if disabled is None:
            disabled = []

        self.groupname = groupname
        self.prefix = prefix
        self.disabled = disabled

        # Subclasses fill this in their own __init__ with
        # ('argument_name', dict_of_add_argument_options) tuples
        self.arguments = []

    def get_parser(
            self,
            parser=None,
            groupname=None,
            arglist=None,
            title=None,
            description=None,
            **kwargs
    ):
        """Create and populate an :class:`argparse.ArgumentParser`

        Parameters
        ----------
        parser : :class:`argparse.ArgumentParser` or None, optional
            If `None`, a new parser is created (with ``add_help=False``) and
            arguments are added to it. Otherwise arguments are added to
            `parser`, and `kwargs` are not used. (Default: `None`)

        groupname : str or None, optional
            If `None`, defaults to `self.groupname`. If the resulting value is
            not `None`, an argument group is added to `parser` and arguments
            are added to that group instead of the parser's main group. In
            that case `title` and `description` are applied to the group
            rather than to `parser`. (Default: `None`)

        arglist : list, optional
            List of tuples of `('argument_name', dict_of_options)`, where
            `dict_of_options` holds keyword arguments for
            :meth:`argparse.ArgumentParser.add_argument`. If `None`,
            `self.arguments` is used.

        title : str, optional
            Title for the argument group (only used when a group is created)

        description : str, optional
            Description for the argument group, or for a newly created parser
            when no group is used

        kwargs : keyword arguments
            Additional arguments passed to :class:`argparse.ArgumentParser`
            when a new parser is created

        Returns
        -------
        :class:`argparse.ArgumentParser`
        """
        if groupname is None:
            groupname = self.groupname
        use_group = groupname is not None

        if parser is None:
            if use_group:
                # the description belongs to the group, not to the parser
                parser = argparse.ArgumentParser(add_help=False, **kwargs)
            else:
                parser = argparse.ArgumentParser(
                    description=description, add_help=False, **kwargs
                )

        if use_group:
            target = parser.add_argument_group(title=title, description=description)
        else:
            target = parser

        if arglist is None:
            arglist = self.arguments

        for entry in arglist:
            if entry[0] in self.disabled:
                continue
            arg_name, arg_opts = entry
            target.add_argument("--%s%s" % (self.prefix, arg_name), **arg_opts)

        return parser
#===============================================================================
# INDEX: Alignment & count file parser
#===============================================================================
class AlignmentParser(Parser):
    """Parser for files containing read alignments or quantitative data.

    Additional mapping rules and command-line arguments are discovered via
    the entry points ``plastid.mapping_rules`` and ``plastid.mapping_options``.

    Parameters
    ----------
    groupname : str, optional
        Name of argument group. If not `None`, an argument group with the
        specified name will be created and added to the parser. If not,
        arguments will be in the main group.

    prefix : str, optional
        string prefix to add to default argument options (Default: "")

    disabled : list, optional
        list of parameter names that should be disabled from parser,
        without preceding dashes

    input_choices : list, optional
        list of permitted alignment file type choices for input

    allow_mapping : bool, optional
        Enable/disable user configuration of mapping rules (default: True)
    """

    def __init__(
            self,
            prefix="",
            disabled=None,
            input_choices=("BAM", "bigwig", "bowtie", "wiggle"),
            groupname="alignment_options",
            allow_mapping=True
    ):
        """Create a parser for read alignments and/or quantitative data

        Parameters
        ----------
        groupname : str, optional
            Name of argument group. If not `None`, an argument group with the
            specified name will be created and added to the parser. If not,
            arguments will be in the main group.

        prefix : str, optional
            string prefix to add to default argument options (Default: "")

        disabled : list, optional
            list of parameter names that should be disabled from parser,
            without preceding dashes

        input_choices : list, optional
            list of permitted alignment file type choices for input

        allow_mapping : bool, optional
            Enable/disable user configuration of mapping rules (default: True)
        """
        Parser.__init__(self, groupname=groupname, prefix=prefix, disabled=disabled)
        self.input_choices = input_choices
        self.allow_mapping = allow_mapping

        # mapping functions contributed by plugins, keyed by mapping rule name
        self.bamfuncs = {}
        self.bowtiefuncs = {}

        self.arguments = [
            (
                "count_files",
                dict(
                    type=str,
                    default=[],
                    nargs="+",
                    help="One or more count or alignment file(s) from a single sample or set of samples to be pooled.",
                ),
            ),
            (
                "countfile_format",
                dict(
                    choices=input_choices,
                    default="BAM",
                    help="Format of file containing alignments or counts (Default: %(default)s)",
                ),
            ),
            (
                "normalize",
                dict(
                    action="store_true",
                    help=(
                        "Whether counts should be normalized"
                        " to counts per million (usually not. default: %(default)s)"
                    ),
                    default=False,
                ),
            ),
            (
                "sum",
                dict(
                    type=float,
                    default=None,
                    help=(
                        "Sum used in normalization of counts and RPKM/RPNT calculations "
                        "(Default: total mapped reads/counts in dataset)"
                    ),
                ),
            ),
        ]

        length_ops = [
            (
                "min_length",
                dict(
                    type=int,
                    default=25,
                    metavar="N",
                    help=(
                        "Minimum read length required to be included "
                        "(BAM & bowtie files only. Default: %(default)s)"
                    ),
                ),
            ),
            (
                "max_length",
                dict(
                    type=int,
                    default=100,
                    metavar="N",
                    help=(
                        "Maximum read length permitted to be included "
                        "(BAM & bowtie files only. Default: %(default)s)"
                    ),
                ),
            ),
        ]

        big_genome = [
            (
                "big_genome",
                dict(
                    action="store_true",
                    default=False,
                    help=(
                        "Use slower but memory-efficient implementation "
                        "for big genomes or for memory-limited computers. "
                        "For wiggle & bowtie files only."
                    ),
                ),
            ),
        ]

        maxmem = [
            (
                "maxmem",
                dict(
                    type=float,
                    default=0,
                    help=(
                        "Maximum desired memory footprint in MB to devote to "
                        "BigBed/BigWig files. May be exceeded by large queries. "
                        "(Default: 0, No maximum)"
                    )
                )
            ),
        ]

        # filetype-specific options
        self.filetype_options = {
            "BAM": length_ops,
            "bowtie": length_ops + big_genome,
            "wiggle": big_genome,
            "bigwig": maxmem,
        }

        if self.allow_mapping == False:
            self.map_rules = []
            self.map_ops = []
        else:
            # every mapping rule stores its own name in the shared
            # `<prefix>mapping` destination, so the last one given wins
            map_rules = [
                (
                    "fiveprime_variable",
                    dict(
                        action="store_const",
                        const="fiveprime_variable",
                        dest="%smapping" % prefix,
                        help=(
                            "Map read alignment to a variable offset from 5' "
                            "position of read, with offset determined by read "
                            "length. Requires `--offset` below"
                        ),
                    ),
                ),
                (
                    "fiveprime",
                    dict(
                        action="store_const",
                        const="fiveprime",
                        dest="%smapping" % prefix,
                        help="Map read alignment to 5' position."
                    ),
                ),
                (
                    "threeprime",
                    dict(
                        action="store_const",
                        const="threeprime",
                        dest="%smapping" % prefix,
                        help="Map read alignment to 3' position"
                    ),
                ),
                (
                    "center",
                    dict(
                        action="store_const",
                        const="center",
                        dest="%smapping" % prefix,
                        help=(
                            "Subtract N positions from each end of read, "
                            "and add 1/(length-N), to each remaining position, "
                            "where N is specified by `--nibble`"
                        )
                    )
                ),
            ]

            map_ops = [
                (
                    "offset",
                    dict(
                        default=0,
                        metavar="OFFSET",
                        help=(
                            "For `--fiveprime` or `--threeprime`, provide an integer "
                            "representing the offset into the read, starting "
                            "from either the 5\' or 3\' end, at which data "
                            "should be plotted. For `--fiveprime_variable`, "
                            "provide the filename of a two-column tab-delimited "
                            "text file, in which first column represents read "
                            "length or the special keyword `'default'`, and "
                            "the second column represents the offset from the "
                            "five prime end of that read length at which the "
                            "read should be mapped. (Default: %(default)s)"
                        ),
                    ),
                ),
                (
                    "nibble",
                    dict(
                        type=int,
                        default=0,
                        metavar="N",
                        help=(
                            "For use with `--center` only. nt to remove from "
                            "each end of read before mapping "
                            "(Default: %(default)s)"
                        ),
                    ),
                ),
            ]

            for epoint in pkg_resources.iter_entry_points(group="plastid.mapping_rules"):
                reg_name = epoint.name
                # Work on a copy: the loaded object belongs to the plugin, and
                # popping keys from it in place would strip 'name'/'bamfunc'/
                # 'bowtiefunc' for every AlignmentParser created afterwards.
                pdict = dict(epoint.load())
                if "name" in pdict:
                    reg_name = pdict.pop("name")

                pdict["const"] = reg_name
                pdict["action"] = "store_const"
                pdict["dest"] = "%smapping" % self.prefix

                # these keys are not valid add_argument() options; pull them
                # out and remember the functions under the rule's name
                bamfunc = pdict.pop("bamfunc", None)
                bowtiefunc = pdict.pop("bowtiefunc", None)
                if bamfunc is not None:
                    self.bamfuncs[reg_name] = bamfunc
                if bowtiefunc is not None:
                    self.bowtiefuncs[reg_name] = bowtiefunc

                map_rules.append((reg_name, pdict))

            for epoint in pkg_resources.iter_entry_points(group="plastid.mapping_options"):
                reg_name = epoint.name
                # copy for the same reason as above
                pdict = dict(epoint.load())
                if "name" in pdict:
                    reg_name = pdict.pop("name")

                map_ops.append((reg_name, pdict))

            self.map_rules = map_rules
            self.map_ops = map_ops

    def get_parser(
            self,
            title=_DEFAULT_ALIGNMENT_FILE_PARSER_TITLE,
            description=_DEFAULT_ALIGNMENT_FILE_PARSER_DESCRIPTION,
            **kwargs
    ):
        """Return an :py:class:`~argparse.ArgumentParser` that opens
        alignment or count files.

        Besides the general arguments, the parser receives the
        filetype-specific options of every format in `self.input_choices`
        and, if `self.allow_mapping` is `True`, the mapping rules and
        mapping options.

        Parameters
        ----------
        title : str, optional
            title for option group (used in command-line help screen)

        description : str, optional
            description of parser (used in command-line help screen)

        kwargs : keyword arguments
            Additional arguments to pass to :meth:`Parser.get_parser`

        Returns
        -------
        :class:`argparse.ArgumentParser`
        """
        parser = Parser.get_parser(self, title=title, description=description, **kwargs)

        # collect filetype-specific options once each, in order of appearance
        extra_args = []
        for k in self.input_choices:
            for arg in self.filetype_options.get(k, []):
                if arg not in extra_args:
                    extra_args.append(arg)

        if len(extra_args) > 0:
            # use mutator function- add new arguments to `parser`
            Parser.get_parser(
                self,
                parser=parser,
                arglist=extra_args,
                title=title,
                description=description,
            )

        if self.allow_mapping == True:
            Parser.get_parser(
                self,
                parser=parser,
                groupname="mapping_options",
                arglist=self.map_rules,
                title=_MAPPING_RULE_TITLE,
                description=_MAPPING_RULE_DESCRIPTION,
            )
            Parser.get_parser(
                self,
                parser=parser,
                groupname="sub_options",
                arglist=self.map_ops,
                title=_MAPPING_OPTION_TITLE,
                description=_MAPPING_OPTION_DESCRIPTION,
            )

        return parser

    def get_genome_array_from_args(self, args, printer=None):
        """Return a |GenomeArray|, |SparseGenomeArray|, |BigWigGenomeArray|
        or |BAMGenomeArray| from arguments parsed by a parser built with
        :meth:`AlignmentParser.get_parser`

        Parameters
        ----------
        args : :py:class:`argparse.Namespace`
            Arguments from the parser

        printer : file-like, optional
            A stream to which stderr-like info can be written
            (default: |NullWriter|)

        Returns
        -------
        |GenomeArray|, |SparseGenomeArray|, |BigWigGenomeArray|, or |BAMGenomeArray|

        Raises
        ------
        SystemExit
            Via :func:`sys.exit` if no count file is given, if a mapping rule
            is missing or not implemented for the file format, if BAM input
            is not indexed, or if `--fiveprime_variable` is used without an
            offset file
        """
        from plastid.genomics.genome_array import (
            GenomeArray,
            SparseGenomeArray,
            BAMGenomeArray,
            BigWigGenomeArray,
            SizeFilterFactory,
            CenterMapFactory,
            FivePrimeMapFactory,
            ThreePrimeMapFactory,
            VariableFivePrimeMapFactory,
            five_prime_map,
            three_prime_map,
            center_map,
            variable_five_prime_map,
        )
        args = PrefixNamespaceWrapper(args, self.prefix)
        disabled = self.disabled
        map_rule = args.mapping

        if printer is None:
            printer = NullWriter()

        # require at least one countfile
        if len(args.count_files) == 0:
            printer.write("Please include at least one input file.")
            sys.exit(1)

        # require mapping rules unless wiggle
        if map_rule is None and args.countfile_format in ("BAM", "bowtie"):
            printer.write("Please specify a read mapping rule.")
            sys.exit(1)

        if "countfile_format" not in disabled:
            if args.countfile_format in ("BAM", "CRAM"):
                count_files = [pysam.Samfile(X, "rb") for X in args.count_files]
                try:
                    ga = BAMGenomeArray(count_files)
                except ValueError:
                    printer.write("Input BAM file(s) not indexed. Please index via:")
                    printer.write("")
                    for fn in args.count_files:
                        printer.write("    samtools index [-b|-c] %s" % fn)
                    printer.write("")
                    printer.write("Exiting.")
                    sys.exit(1)

                size_filter = SizeFilterFactory(min=args.min_length, max=args.max_length)
                ga.add_filter("size:%s-%s" % (args.min_length, args.max_length), size_filter)

                if map_rule == "fiveprime":
                    map_function = FivePrimeMapFactory(int(args.offset))
                elif map_rule == "threeprime":
                    map_function = ThreePrimeMapFactory(int(args.offset))
                elif map_rule == "center":
                    map_function = CenterMapFactory(args.nibble)
                elif map_rule == "fiveprime_variable":
                    # `--offset` defaults to 0; here it must name a file
                    if str(args.offset) == "0":
                        printer.write(
                            "Please specify a filename to use for fiveprime "
                            "variable offsets in --offset."
                        )
                        sys.exit(1)
                    # close the offset file once parsed, as the bowtie
                    # branch below does
                    with open(args.offset) as offset_fh:
                        offset_dict = _parse_variable_offset_file(CommentReader(offset_fh))
                    map_function = VariableFivePrimeMapFactory(offset_dict)
                elif map_rule in self.bamfuncs:
                    map_function = functools.partial(self.bamfuncs[map_rule], args=args)
                else:
                    printer.write(
                        "Mapping rule '%s' not implemented for BAM input. "
                        "Exiting." % map_rule
                    )
                    sys.exit(1)

                ga.set_mapping(map_function)

            elif args.countfile_format == "bigwig":
                ga = BigWigGenomeArray(maxmem=args.maxmem)
                # each entry is a filename stub for a pair of stranded files
                for align_file in args.count_files:
                    ga.add_from_bigwig("%s_fw.bw" % align_file, "+")
                    ga.add_from_bigwig("%s_rc.bw" % align_file, "-")

            # wiggle/bedGraph and bowtie
            else:
                if "big_genome" not in disabled and args.big_genome == True:
                    ga = SparseGenomeArray()
                else:
                    ga = GenomeArray()

                # wiggle/bedGraph
                if args.countfile_format == "wiggle":
                    for align_file in args.count_files:
                        printer.write("Opening wiggle files %s ..." % align_file)
                        with open("%s_fw.wig" % align_file) as fh:
                            ga.add_from_wiggle(fh, "+")
                        with open("%s_rc.wig" % align_file) as fh:
                            ga.add_from_wiggle(fh, "-")

                # bowtie
                elif args.countfile_format == "bowtie":
                    trans_args = {"nibble": int(args.nibble)}
                    if map_rule == "fiveprime_variable":
                        transformation = variable_five_prime_map
                        if str(args.offset) == "0":
                            printer.write(
                                "Please specify a filename to use for "
                                "fiveprime variable offsets in --offset."
                            )
                            sys.exit(1)
                        else:
                            with open(args.offset) as myfile:
                                trans_args["offset"] = _parse_variable_offset_file(
                                    CommentReader(myfile)
                                )
                    else:
                        trans_args["offset"] = int(args.offset)
                        if map_rule == "fiveprime":
                            transformation = five_prime_map
                        elif map_rule == "threeprime":
                            transformation = three_prime_map
                        elif map_rule == "entire":
                            transformation = center_map
                        elif map_rule == "center":
                            transformation = center_map
                        elif map_rule in self.bowtiefuncs:
                            transformation = self.bowtiefuncs[map_rule]
                            trans_args["args"] = args
                        else:
                            printer.write(
                                "Mapping rule '%s' not implemented for bowtie "
                                "input. Exiting." % map_rule
                            )
                            sys.exit(1)

                    for infile in args.count_files:
                        with opener(infile) as my_file:
                            ga.add_from_bowtie(
                                my_file,
                                transformation,
                                min_length=args.min_length,
                                max_length=args.max_length,
                                **trans_args
                            )

            printer.write("Counted %s total reads." % ga.sum())

        if "sum" not in disabled and args.sum is not None:
            ga.set_sum(args.sum)

        if "normalize" not in disabled and args.normalize == True:
            printer.write("Normalizing to reads per million.")
            ga.set_normalize(True)

        return ga
#===============================================================================
# INDEX: Annotation file parser
#===============================================================================
class AnnotationParser(Parser):
    """Parser for annotation files in various formats

    Parameters
    ----------
    groupname : str, optional
        Name of argument group. If not `None`, an argument group with the
        specified name will be created and added to the parser. If not,
        arguments will be in the main group.

    prefix : str, optional
        string prefix to add to default argument options (Default: "")

    disabled : list, optional
        list of parameter names that should be disabled from parser,
        without preceding dashes

    input_choices : list, optional
        list of permitted annotation file type choices for input
    """

    def __init__(
            self,
            prefix="",
            disabled=None,
            groupname="annotation_options",
            input_choices=("BED", "BigBed", "GTF2", "GFF3")
    ):
        """Create a parser for genomic features in an annotation file

        Parameters
        ----------
        groupname : str, optional
            Name of argument group. If not `None`, an argument group with the
            specified name will be created and added to the parser. If not,
            arguments will be in the main group.

        prefix : str, optional
            string prefix to add to default argument options (Default: "")

        disabled : list, optional
            list of parameter names that should be disabled from parser,
            without preceding dashes

        input_choices : list, optional
            list of permitted annotation file type choices for input
        """
        Parser.__init__(self, groupname=groupname, prefix=prefix, disabled=disabled)
        self.input_choices = input_choices
        self.arguments = [
            (
                "annotation_files",
                dict(
                    metavar="infile.[%s]" % " | ".join(input_choices),
                    type=str,
                    nargs="+",
                    default=[],
                    help="Zero or more annotation files (max 1 file if BigBed)"
                )
            ),
            (
                "annotation_format",
                dict(
                    choices=input_choices,
                    default="GTF2",
                    help=(
                        "Format of %sannotation_files (Default: GTF2). "
                        "Note: GFF3 assembly assumes SO v.2.5.2 feature "
                        "ontologies, which may or may not match your specific "
                        "file." % prefix
                    )
                )
            ),
            (
                "add_three",
                dict(
                    default=False,
                    action="store_true",
                    help=(
                        "If supplied, coding regions will be extended by 3 "
                        "nucleotides at their 3\' ends (except for GTF2 files "
                        "that explicitly include `stop_codon` features). "
                        "Use if your annotation file excludes stop codons "
                        "from CDS."
                    ),
                )
            ),
            (
                "tabix",
                dict(
                    default=False,
                    action="store_true",
                    help=(
                        "%sannotation_files are tabix-compressed and indexed "
                        "(Default: False). Ignored for BigBed files." % prefix
                    )
                )
            ),
            (
                "sorted",
                dict(
                    default=False,
                    action="store_true",
                    help=(
                        "%sannotation_files are sorted by chromosomal position "
                        "(Default: False)" % prefix
                    )
                )
            ),
        ]

        # options for specific filetypes
        self.filetype_options = {
            "BED": [
                (
                    "bed_extra_columns",
                    dict(
                        default=0,
                        nargs="+",
                        help=(
                            "Number of extra columns in BED file (e.g. in custom ENCODE formats) "
                            "or list of names for those columns. (Default: %(default)s)."
                        ),
                    ),
                ),
            ],
            "BigBed": [
                (
                    "maxmem",
                    dict(
                        type=float,
                        default=0,
                        help="Maximum desired memory footprint in MB to devote to BigBed/BigWig files. May be exceeded by large queries. (Default: 0, No maximum)",
                    ),
                ),
            ],
            "GFF3": [
                (
                    "gff_transcript_types",
                    dict(
                        type=str,
                        default=_DEFAULT_GFF3_TRANSCRIPT_TYPES,
                        nargs="+",
                        help=(
                            "GFF3 feature types to include as transcripts, even "
                            "if no exons are present (for GFF3 only; default: use SO v2.5.3 specification)"
                        ),
                    ),
                ),
                (
                    "gff_exon_types",
                    dict(
                        type=str,
                        default=_DEFAULT_GFF3_EXON_TYPES,
                        nargs="+",
                        help="GFF3 feature types to include as exons (for GFF3 only; default: use SO v2.5.3 specification)",
                    ),
                ),
                (
                    "gff_cds_types",
                    dict(
                        type=str,
                        default=_DEFAULT_GFF3_CDS_TYPES,
                        nargs="+",
                        help="GFF3 feature types to include as CDS (for GFF3 only; default: use SO v2.5.3 specification)",
                    ),
                ),
            ],
        }

    @staticmethod
    def _coerce_bed_extra_columns(value):
        """Turn the parsed `bed_extra_columns` value into an int when it is a count

        The option is declared with ``nargs="+"``, so a value given on the
        command line always arrives as a list of strings. A single numeric
        token (e.g. ``["3"]``) is treated as a number of columns; any other
        list is returned unchanged as a list of column names. Non-list values
        (such as the default, `0`) are converted with :func:`int` when
        possible.
        """
        if isinstance(value, list):
            if len(value) == 1:
                try:
                    return int(value[0])
                except (TypeError, ValueError):
                    return value
            return value
        try:
            return int(value)
        except ValueError:
            return value

    def get_parser(
            self,
            title=_DEFAULT_ANNOTATION_PARSER_TITLE,
            description=_DEFAULT_ANNOTATION_PARSER_DESCRIPTION,
            **kwargs
    ):
        """Return an :class:`~argparse.ArgumentParser` that opens annotation files.

        Filetype-specific options for each format in `self.input_choices` are
        added in their own argument groups.

        Parameters
        ----------
        title : str, optional
            title for option group (used in command-line help screen)

        description : str, optional
            description of parser (used in command-line help screen)

        kwargs : keyword arguments
            Additional arguments to pass to :meth:`Parser.get_parser`

        Returns
        -------
        :class:`argparse.ArgumentParser`
        """
        parser = Parser.get_parser(self, title=title, description=description, **kwargs)
        for k in self.input_choices:
            arglist = self.filetype_options.get(k)
            if arglist is not None:
                # use mutator function- add new arguments to `parser`
                Parser.get_parser(
                    self,
                    parser=parser,
                    groupname="%s_%s_options" % (self.groupname, k),
                    title="%s-specific options" % k,
                    arglist=arglist
                )

        return parser

    def get_transcripts_from_args(self, args, printer=None, return_type=None, require_sort=False):
        """Return an iterator of |Transcript| objects from arguments parsed
        by a parser built with :meth:`AnnotationParser.get_parser`

        Thin wrapper around :meth:`get_segmentchains_from_args` whose default
        `return_type` is |Transcript|.

        Parameters
        ----------
        args : :py:class:`argparse.Namespace`
            Namespace object from the parser

        printer : file-like, optional
            A stream to which stderr-like info can be written
            (Default: |NullWriter|)

        return_type : |SegmentChain| or subclass, optional
            Type of object to return (Default: |Transcript|)

        require_sort : bool, optional
            If True, quit if the annotation file(s) are not sorted or indexed

        Returns
        -------
        iterator
            Objects of type `return_type`, as produced by the reader for the
            chosen annotation format
        """
        if return_type is None:
            return_type = Transcript

        return self.get_segmentchains_from_args(
            args, printer=printer, return_type=return_type, require_sort=require_sort
        )

    def get_segmentchains_from_args(self, args, printer=None, return_type=None, require_sort=False):
        """Return an iterator of |SegmentChain| objects from arguments parsed
        by a parser built with :meth:`AnnotationParser.get_parser`

        Parameters
        ----------
        args : :py:class:`argparse.Namespace`
            Namespace object from the parser

        printer : file-like, optional
            A stream to which stderr-like info can be written
            (Default: |NullWriter|)

        return_type : |SegmentChain| or subclass, optional
            Type of object to return (Default: |SegmentChain|)

        require_sort : bool, optional
            If True, quit if the annotation file(s) are not sorted or indexed

        Returns
        -------
        iterator
            Objects of type `return_type`, as produced by the reader or
            transcript assembler for the chosen annotation format

        Raises
        ------
        SystemExit
            Via :func:`sys.exit` if `require_sort` is set and the files are
            neither sorted nor tabix-indexed, or if more than one BigBed file
            is given
        """
        if printer is None:
            printer = NullWriter()

        if return_type is None:
            return_type = SegmentChain

        args = PrefixNamespaceWrapper(args, self.prefix)
        disabled = self.disabled

        # Options that were disabled were never added to the parser, so only
        # read them from `args` when enabled.
        tabix = args.tabix if "tabix" not in disabled else False
        add_three = args.add_three if "add_three" not in disabled else False
        is_sorted = args.sorted if "sorted" not in disabled else False

        unsorted_unindexed = (
            "sorted" not in disabled
            and args.sorted == False
            and "tabix" not in disabled
            and args.tabix == False
        )

        if require_sort == True \
           and args.annotation_format in ("BED", "GTF2", "GFF3") \
           and unsorted_unindexed:
            printer.write(
                "Using unsorted/unindexed annotation files requires "
                "impractical amounts of memory."
            )
            if args.annotation_format == "BED":
                # NOTE(review): line breaks inside this message were lost in
                # the copy this was restored from; layout follows
                # GFF_SORT_MESSAGE -- confirm against upstream.
                printer.write(
                    "Convert BED to BigBed using Jim Kent's bedToBigBed utility as follows:\n"
                    "\n"
                    "    $ sort -k1,1 -k2,2n my_file > my_file_sorted.bed\n"
                    "    $ bedToBigBed my_file_sorted.bed chrom.sizes my_file_sorted.bb\n"
                    "\n"
                    "See https://github.com/ENCODE-DCC/kentUtils/tree/master/src/product/scripts\n"
                    "for download & documentation of Kent utilities"
                )
            else:
                printer.write(GFF_SORT_MESSAGE.replace("FORMAT", args.annotation_format))
            sys.exit(1)

        printer.write("Parsing features in %s ..." % ", ".join(args.annotation_files))

        annotation_format = args.annotation_format.lower()

        if annotation_format == "bigbed":
            if len(args.annotation_files) > 1:
                printer.write("Bad arguments: we can only process one BigBed file.")
                sys.exit(2)
            if tabix == True:
                warnings.warn(
                    "Tabix compression is incompatible with BigBed files. Ignoring.",
                    ArgumentWarning,
                )
            from plastid.readers.bigbed import BigBedReader
            maxmem = args.maxmem if "maxmem" not in disabled else 0
            transcripts = iter(
                BigBedReader(
                    args.annotation_files[0],
                    return_type=return_type,
                    cache_depth=1,
                    add_three_for_stop=add_three,
                    printer=printer,
                    maxmem=maxmem
                )
            )
        elif tabix == True:
            streams = [
                pysam.tabix_iterator(opener(X), pysam.asTuple()) for X in args.annotation_files
            ]
        else:
            streams = (opener(X) for X in args.annotation_files)

        if args.annotation_format in ("GFF3", "GTF2"):
            from plastid.readers.gff import GFF3_TranscriptAssembler, GTF2_TranscriptAssembler
            if unsorted_unindexed \
               and any((os.stat(X).st_size >= _GFF_SORT_SIZE for X in args.annotation_files)):
                msg = (
                    "Transcript assembly on large FORMAT files can require "
                    "a lot of memory. Consider using a sorted file with "
                    "the '--sorted' flag and/or tabix-compression. "
                )
                msg += GFF_SORT_MESSAGE
                msg = msg.replace("FORMAT", args.annotation_format)
                warnings.warn(msg, ArgumentWarning)

            if annotation_format == "gff3":
                transcripts = GFF3_TranscriptAssembler(
                    *streams,
                    transcript_types=args.gff_transcript_types,
                    exon_types=args.gff_exon_types,
                    cds_types=args.gff_cds_types,
                    printer=printer,
                    add_three_for_stop=add_three,
                    tabix=tabix,
                    return_type=return_type,
                    is_sorted=is_sorted
                )
            elif annotation_format == "gtf2":
                transcripts = GTF2_TranscriptAssembler(
                    *streams,
                    printer=printer,
                    tabix=tabix,
                    return_type=return_type,
                    add_three_for_stop=add_three,
                    is_sorted=is_sorted
                )
        elif annotation_format == "bed":
            from plastid.readers.bed import BED_Reader
            # read the BED-only option here so that parsers built without
            # BED support do not need to define it
            if "bed_extra_columns" not in disabled:
                bed_extra_columns = self._coerce_bed_extra_columns(args.bed_extra_columns)
            else:
                bed_extra_columns = 0
            transcripts = BED_Reader(
                *streams,
                add_three_for_stop=add_three,
                tabix=tabix,
                return_type=return_type,
                printer=printer,
                extra_columns=bed_extra_columns
            )
        elif annotation_format == "psl":
            from plastid.readers.psl import PSL_Reader
            transcripts = PSL_Reader(
                *streams, tabix=tabix, return_type=return_type, printer=printer
            )

        return transcripts

    def get_genome_hash_from_args(self, args, printer=None):
        """Return a |GenomeHash| of regions from command-line arguments

        Parameters
        ----------
        args : :py:class:`argparse.Namespace`
            Namespace object from a parser built with :meth:`get_parser`

        printer : file-like
            A stream to which stderr-like info can be written
            (Default: |NullWriter|)

        Returns
        -------
        |GenomeHash|
            A |BigBedGenomeHash| for BigBed input, a |TabixGenomeHash| for
            tabix-indexed input, otherwise a |GenomeHash| built from all
            features in the files (empty if no files were given)

        Raises
        ------
        SystemExit
            Via :func:`sys.exit` if more than one BigBed file is given

        ValueError
            If the annotation format has no matching reader
        """
        from plastid.genomics.genome_hash import GenomeHash, BigBedGenomeHash, TabixGenomeHash
        from plastid.readers.bed import BED_Reader
        from plastid.readers.gff import GTF2_Reader, GFF3_Reader
        from plastid.readers.psl import PSL_Reader

        if printer is None:
            printer = NullWriter()

        args = PrefixNamespaceWrapper(args, self.prefix)

        if len(args.annotation_files) == 0:
            return GenomeHash()

        # only consult `args.tabix` when the option exists on the parser
        tabix = "tabix" not in self.disabled and args.tabix == True

        printer.write(
            "Opening mask annotation file(s) %s ..." % ", ".join(args.annotation_files)
        )
        if args.annotation_format in ("BED", "GTF2", "GFF3") and not tabix:
            msg = (
                "Unindexed mask files can require lots of memory in large "
                "(e.g. mammalian) genomes. Consider converting to BigBed "
                "or using tabix to index your mask file."
            )
            warnings.warn(msg, ArgumentWarning)

        if args.annotation_format == "BigBed":
            if len(args.annotation_files) > 1:
                printer.write("Bad arguments: we can only process one BigBed file.")
                sys.exit(2)
            return BigBedGenomeHash(args.annotation_files[0])

        if tabix:
            return TabixGenomeHash(
                args.annotation_files, args.annotation_format, printer=printer
            )

        readers = {
            "BED": BED_Reader,
            "GTF2": GTF2_Reader,
            "GFF3": GFF3_Reader,
            "PSL": PSL_Reader,
        }
        try:
            reader = readers[args.annotation_format]
        except KeyError:
            # explicit error instead of `assert False`, which is stripped
            # when Python runs with -O
            raise ValueError(
                "Unsupported mask annotation format: %r" % (args.annotation_format,)
            )

        streams = (opener(X) for X in args.annotation_files)
        hash_ivcs = list(reader(*streams))
        return GenomeHash(hash_ivcs)
class MaskParser(AnnotationParser):
    """Create a parser for masking genomic features given in an annotation file

    Parameters
    ----------
    prefix : str, optional
        string prefix to add to default argument options (Default: "mask_")

    disabled : list, optional
        list of parameter names that should be disabled from parser,
        without preceding dashes

    groupname : str, optional
        Name of argument group. If not `None`, an argument group with
        the specified name will be created and added to the parser.
        If not, arguments will be in the main group.
        (Default: "mask_options")

    input_choices : list, optional
        list of permitted annotation file type choices for input
    """

    def __init__(
        self,
        prefix="mask_",
        disabled=None,
        groupname="mask_options",
        input_choices=("BED", "BigBed", "GTF2", "GFF3", "PSL")
    ):
        """Create a parser for genomic features in a mask annotation file

        Parameters
        ----------
        prefix : str, optional
            string prefix to add to default argument options
            (Default: "mask_")

        disabled : list, optional
            list of parameter names that should be disabled from parser,
            without preceding dashes

        groupname : str, optional
            Name of argument group. If not `None`, an argument group with
            the specified name will be created and added to the parser.
            If not, arguments will be in the main group.

        input_choices : list, optional
            list of permitted annotation file type choices for input
        """
        # Forward the caller's `input_choices`; previously a hard-coded
        # tuple was passed and the parameter was silently ignored.
        AnnotationParser.__init__(
            self,
            prefix=prefix,
            disabled=disabled,
            groupname=groupname,
            input_choices=input_choices
        )

    def get_parser(self, title=_MASK_PARSER_TITLE, description=_MASK_PARSER_DESCRIPTION, **kwargs):
        """Return an :py:class:`~argparse.ArgumentParser` that opens
        annotation files as masks

        Parameters
        ----------
        title : str, optional
            title for option group (used in command-line help screen)

        description : str, optional
            description of parser (used in command-line help screen)

        kwargs : keyword arguments
            Additional arguments forwarded unchanged to
            :meth:`AnnotationParser.get_parser`

        Returns
        -------
        :class:`argparse.ArgumentParser`
        """
        return AnnotationParser.get_parser(self, title=title, description=description, **kwargs)
#=============================================================================== # INDEX: Sequence parser #===============================================================================
class SequenceParser(AnnotationParser):
    """Parser for sequence files

    Parameters
    ----------
    groupname : str, optional
        Name of argument group. If not `None`, an argument group with
        the specified name will be created and added to the parser.
        If not, arguments will be in the main group.

    prefix : str, optional
        string prefix to add to default argument options (Default: "")

    disabled : list, optional
        list of parameter names that should be disabled from parser,
        without preceding dashes

    input_choices : list, optional
        list of permitted sequence file type choices for input
    """

    def __init__(
        self,
        groupname="sequence_options",
        prefix="",
        disabled=None,
        input_choices=("fasta", "fastq", "twobit", "genbank", "embl"),
    ):
        """Create a parser for genomic sequence

        Registers two arguments: `sequence_file` and `sequence_format`
        (the latter restricted to `input_choices`, defaulting to "fasta").

        Parameters
        ----------
        groupname : str, optional
            Name of argument group. If not `None`, an argument group with
            the specified name will be created and added to the parser.
            If not, arguments will be in the main group.

        prefix : str, optional
            string prefix to add to default argument options (Default: "")

        disabled : list, optional
            list of parameter names that should be disabled from parser,
            without preceding dashes

        input_choices : list, optional
            list of permitted sequence file type choices for input
        """
        Parser.__init__(self, groupname=groupname, prefix=prefix, disabled=disabled)
        self.input_choices = input_choices

        file_options = {
            "metavar": "infile.[%s]" % " | ".join(input_choices),
            "type": str,
            "help": "A file of DNA sequence",
        }
        format_options = {
            "choices": input_choices,
            "default": "fasta",
            "help": "Format of %ssequence_file (Default: fasta)." % prefix,
        }
        self.arguments = [
            ("sequence_file", file_options),
            ("sequence_format", format_options),
        ]

    def get_parser(
        self,
        title=_DEFAULT_SEQUENCE_PARSER_TITLE,
        description=_DEFAULT_SEQUENCE_PARSER_DESCRIPTION,
        **kwargs
    ):
        """Return an :py:class:`~argparse.ArgumentParser` that opens sequence files

        Parameters
        ----------
        title : str, optional
            title for option group (used in command-line help screen)

        description : str, optional
            description of parser (used in command-line help screen)

        kwargs : keyword arguments
            Additional arguments to pass to :meth:`Parser.get_parser`

        Returns
        -------
        :class:`argparse.ArgumentParser`

        See also
        --------
        get_seqdict_from_args
            function that parses the :py:class:`~argparse.Namespace` returned
            by this :py:class:`~argparse.ArgumentParser`
        """
        return Parser.get_parser(self, title=title, description=description, **kwargs)

    def get_seqdict_from_args(self, args, index=True, printer=None):
        """Retrieve a dictionary-like object of sequences

        Parameters
        ----------
        args : :py:class:`argparse.Namespace`
            Namespace object of parsed command-line arguments

        index : bool, optional
            If sequence format is anything other than twobit, open with
            lazily-evaluating :func:`Bio.SeqIO.index` instead of
            :func:`Bio.SeqIO.to_dict` (Default: `True`)

        printer : file-like
            A stream to which stderr-like info can be written
            (Default: |NullWriter|)

        Returns
        -------
        dict-like
            Dictionary-like object mapping chromosome names to
            :class:`Bio.SeqRecord.SeqRecord`-like objects
        """
        if printer is None:
            printer = NullWriter()

        wrapped = PrefixNamespaceWrapper(args, self.prefix)
        printer.write("Opening sequence file '%s'." % wrapped.sequence_file)

        # twobit files get a dedicated adaptor; everything else goes via Biopython
        if wrapped.sequence_format == "twobit":
            from plastid.genomics.seqtools import TwoBitSeqRecordAdaptor
            return TwoBitSeqRecordAdaptor(wrapped.sequence_file)

        from Bio import SeqIO
        if index == True:
            return SeqIO.index(wrapped.sequence_file, wrapped.sequence_format)

        records = SeqIO.parse(wrapped.sequence_file, wrapped.sequence_format)
        return SeqIO.to_dict(records)
#=============================================================================== # INDEX: Plotting parser #===============================================================================
class PlottingParser(Parser):
    """Parser for plotting options

    Parameters
    ----------
    groupname : str, optional
        Name of argument group. If not `None`, an argument group with
        the specified name will be created and added to the parser.
        If not, arguments will be in the main group.

    prefix : str, optional
        string prefix to add to default argument options (Default: "")

    disabled : list, optional
        list of parameter names that should be disabled from parser,
        without preceding dashes
    """

    def __init__(self, groupname="plotting_options", prefix="", disabled=None):
        """Create a parser for plotting arguments

        Registers `figformat`, `figsize`, `title`, `cmap`, `dpi`, and,
        when :mod:`matplotlib.style` can be imported and the option is
        not disabled, `stylesheet`.

        Parameters
        ----------
        groupname : str, optional
            Name of argument group. If not `None`, an argument group with
            the specified name will be created and added to the parser.
            If not, arguments will be in the main group.

        prefix : str, optional
            string prefix to add to default argument options (Default: "")

        disabled : list, optional
            list of parameter names that should be disabled from parser,
            without preceding dashes
        """
        Parser.__init__(self, groupname=groupname, prefix=prefix, disabled=disabled)
        from matplotlib.backend_bases import FigureCanvasBase as fcb

        try:
            filetypes = sorted(fcb.get_supported_filetypes().keys())
            default_ftype = fcb.get_default_filetype()
        except Exception:
            # matplotlib < 1.4.0. Best-effort fallback, but no longer
            # swallows KeyboardInterrupt / SystemExit as a bare except did.
            filetypes = ["eps", "jpeg", "pdf", "png", "svg"]
            default_ftype = "pdf"

        self.arguments = [
            (
                "figformat",
                dict(
                    default=default_ftype,
                    type=str,
                    choices=filetypes,
                    help="File format for figure(s); Default: %(default)s)"
                ),
            ),
            (
                "figsize",
                dict(
                    nargs=2,
                    default=None,
                    type=float,
                    metavar="N",
                    help=(
                        "Figure width and height, in inches. "
                        "(Default: use matplotlibrc params)"
                    ),
                ),
            ),
            ("title", dict(type=str, default=None, help="Base title for plot(s).")),
            (
                "cmap",
                dict(
                    type=str,
                    default=None,
                    help=(
                        "Matplotlib color map from which palette will be made "
                        "(e.g. 'Blues','autumn','Set1'; default: use colors "
                        "from ``--stylesheet`` "
                        "if given, or color cycle in matplotlibrc)"
                    ),
                ),
            ),
            ("dpi", dict(type=int, default=150, help="Figure resolution (Default: %(default)s)")),
        ]

        try:
            import matplotlib.style
            stylesheets = matplotlib.style.available
            if "stylesheet" not in self.disabled:
                self.arguments.append(
                    (
                        "stylesheet",
                        dict(
                            default=None,
                            choices=stylesheets,
                            help=(
                                "Use this matplotlib stylesheet instead "
                                "of matplotlibrc params"
                            ),
                        )
                    )
                )
        except ImportError:
            # matplotlib < 1.4.0 has no style module; simply omit the option
            pass

    def get_parser(self, title=_DEFAULT_PLOTTING_TITLE, description=None):
        """Return an :py:class:`~argparse.ArgumentParser` to control plotting

        Parameters
        ----------
        title : str, optional
            title for option group (used in command-line help screen)

        description : str, optional
            description of parser (used in command-line help screen)

        Returns
        -------
        :class:`argparse.ArgumentParser`
        """
        return Parser.get_parser(self, title=title, description=description)

    def set_style_from_args(self, args):
        """Apply the matplotlib stylesheet named in `args`, if any

        Nothing happens if :mod:`matplotlib.style` cannot be imported, or
        if `args` has no non-`None` `stylesheet` attribute.

        Parameters
        ----------
        args : :class:`argparse.Namespace`
            Namespace object of parsed command-line arguments
        """
        try:
            import matplotlib.style
        except ImportError:
            return

        # Honor `self.prefix`, as the other `*_from_args` methods do
        wrapped = PrefixNamespaceWrapper(args, self.prefix)
        stylesheet = getattr(wrapped, "stylesheet", None)
        if stylesheet is not None:
            matplotlib.style.use(stylesheet)

    def get_figure_from_args(self, args, **kwargs):
        """Return a :class:`matplotlib.figure.Figure` following arguments
        parsed from the command line

        A new figure is created with parameters specified in `args`. If these
        are not found, values found in `**kwargs` will instead be used. If these
        are not found, we fall back to matplotlibrc values.

        Parameters
        ----------
        args : :class:`argparse.Namespace`
            Namespace object of parsed command-line arguments

        kwargs : keyword arguments
            Fallback arguments for items not defined in `args`, plus
            any other keyword arguments, all of which are passed to
            :func:`matplotlib.pyplot.figure`.

        Returns
        -------
        :class:`matplotlib.figure.Figure`
            Matplotlib figure
        """
        import matplotlib.pyplot as plt
        args = PrefixNamespaceWrapper(args, self.prefix)
        fargs = {}

        # keep this loop in place in case we add additional
        # command line attributes as fig properties later
        for attr in ("figsize", ):
            # Command-line value wins; `kwargs` is only the fallback, as
            # documented. Previously a value in `kwargs` shadowed the
            # user's command-line choice.
            value = getattr(args, attr, None)
            if value is None:
                value = kwargs.get(attr)
            if value is not None:
                fargs[attr] = value

        # copy values from fargs
        kwargs.update(fargs)

        return plt.figure(**kwargs)

    def get_colors_from_args(self, args, num_colors):
        """Return a list of colors from parsed plotting arguments

        If a matplotlib colormap is specified in `args.cmap`, colors will
        be generated from that map. Otherwise colors are drawn, cycling
        as needed, from the color cycle currently set in
        ``matplotlib.rcParams``.

        Parameters
        ----------
        args : :class:`argparse.Namespace`
            Namespace object of parsed command-line arguments

        num_colors : int
            Number of colors to fetch

        Returns
        -------
        list
            List of matplotlib colors
        """
        import matplotlib
        import matplotlib.cm
        args = PrefixNamespaceWrapper(args, self.prefix)
        figcolors = getattr(args, "cmap", None)

        if figcolors is not None:
            import numpy
            # `matplotlib.cm.get_cmap` is absent from recent matplotlib
            # releases; fall back to the colormap registry when it is gone.
            get_cmap = getattr(matplotlib.cm, "get_cmap", None)
            if get_cmap is None:
                get_cmap = matplotlib.colormaps.get_cmap
            cmap = get_cmap(figcolors)
            if num_colors > 1:
                colors = cmap(numpy.linspace(0, 1.0, num_colors))
            else:
                colors = [cmap(0.5)]
        else:
            from itertools import cycle
            try:
                color_cycle = cycle(matplotlib.rcParams["axes.prop_cycle"].by_key()["color"])
            except KeyError:
                # older matplotlib exposes the cycle under a different key
                color_cycle = cycle(matplotlib.rcParams["axes.color_cycle"])
            colors = [next(color_cycle) for _ in range(num_colors)]

        return colors
#=============================================================================== # INDEX: Parser for generic command-line options (e.g. warning control) #===============================================================================
class BaseParser(Parser):
    """Parser basic options

    Parameters
    ----------
    groupname : str, optional
        Name of argument group. If not `None`, an argument group with
        the specified name will be created and added to the parser.
        If not, arguments will be in the main group.

    prefix : str, optional
        string prefix to add to default argument options (Default: "")

    disabled : list, optional
        list of parameter names that should be disabled from parser,
        without preceding dashes
    """

    def __init__(
        self,
        groupname="base_options",
        prefix="",
        disabled=None,
    ):
        """Create a parser for basic options for command-line scripts,
        such as warnings and logging

        Parameters
        ----------
        groupname : str, optional
            Name of argument group. If not `None`, an argument group with
            the specified name will be created and added to the parser.
            If not, arguments will be in the main group.

        prefix : str, optional
            string prefix to add to default argument options (Default: "")

        disabled : list, optional
            list of parameter names that should be disabled from parser,
            without preceding dashes
        """
        Parser.__init__(self, groupname=groupname, prefix=prefix, disabled=disabled)
        # The warning flags are added directly in `get_parser`
        self.arguments = []

    def get_parser(self, title=None, description=None):
        """Return an :py:class:`~argparse.ArgumentParser`

        The returned parser has a "warning/error options" group holding
        ``-q/--quiet`` and ``-v/--verbose``, both of which write to the
        `warnlevel` attribute (default 0).

        Parameters
        ----------
        title : str, optional
            title for option group. Accepted for signature compatibility;
            not forwarded to :meth:`Parser.get_parser` here.

        description : str, optional
            description of parser. Accepted for signature compatibility;
            not forwarded to :meth:`Parser.get_parser` here.

        Returns
        -------
        :class:`argparse.ArgumentParser`
        """
        p = Parser.get_parser(self)
        g = p.add_argument_group(title="warning/error options")
        g.add_argument(
            "-q",
            "--quiet",
            dest="warnlevel",
            action="store_const",
            const=-1,
            help="Suppress all warning messages. Cannot use with '-v'."
        )
        g.add_argument(
            "-v",
            "--verbose",
            dest="warnlevel",
            action="count",
            help=(
                "Increase verbosity. With '-v', show every warning. "
                "With '-vv', turn warnings into exceptions. "
                "Cannot use with '-q'. "
                "(Default: show each type of warning once)"
            )
        )
        p.set_defaults(warnlevel=0)
        return p

    def get_base_ops_from_args(self, args):
        """Install warning filters according to the parsed warning level

        The level is mapped to a filter action: -1 to "ignore", 0 to
        "onceperfamily", 1 to "always", and 2 or greater to "error". That
        action is then registered via `filterwarnings` for every
        (category, message) pair in `PLASTID_WARNINGS`.

        Levels below -1 are invalid: a `UserWarning` is issued and the
        "onceperfamily" action is used.

        Parameters
        ----------
        args : :py:class:`argparse.Namespace`
            Namespace object of parsed command-line arguments
        """
        try:
            warnlevel = PrefixNamespaceWrapper(args, self.prefix).warnlevel
        except AttributeError:
            # `get_parser` registers `warnlevel` with an unprefixed dest,
            # so fall back to it when a non-empty prefix is in use.
            warnlevel = args.warnlevel

        actions = ["ignore", "onceperfamily", "always", "error"]

        # Clamp high verbosity to the strictest action
        index = min(warnlevel, len(actions) - 2) + 1

        if index < 0:
            # Guard explicitly: a negative index would otherwise wrap
            # around and silently select an unrelated action.
            warnings.warn(
                (
                    "Invalid warning level. Expected 0-3, found %s. "
                    "Defaulting to level 1 (`--once`)." % warnlevel
                ),
                UserWarning,
            )
            action = actions[1]
        else:
            action = actions[index]

        for type_, msg in PLASTID_WARNINGS:
            filterwarnings(action, message=msg, category=type_)
# (warning category, message pattern) pairs. BaseParser.get_base_ops_from_args
# passes each pair to `filterwarnings` with the action selected by the
# command-line warning level. Entries are grouped by the module named in
# the comment above them.
PLASTID_WARNINGS = [
    # mapping rules
    (DataWarning, "File contains read alignments shorter"),
    (DataWarning, "No offset for reads of length"),
    (DataWarning, "longer than read length"),

    # genome_array
    (DataWarning, "Temporarily turning off normalization"),

    # roi_tools
    (DataWarning, "is a zero-length SegmentChain. Returning 0-length count vector"),

    # metagene
    (Warning, r"IndexError finding common positions at region.*"),
    (DataWarning, "has no gene_id. Inferring gene_id"),
    (DataWarning, "has no attribute"),
    (DataWarning, "Ignoring labels"),

    # phase_by_size
    (DataWarning, "is not divisible by 3. Ignoring last partial codon."),

    # util.io.filters
    (Warning, "Could not alert listener"),

    # util.services.decorators
    (DeprecationWarning, "is deprecated and will be removed from module"),

    # gff
    (DataWarning, "because it contains exons on multiple chromosomes or strands"),
    (DataWarning, "because start or stop codons are outside exon boundaries"),
    (DataWarning, "with no `Parent` or `ID`. Ignoring."),
    (DataWarning, "because it contains exons on multiple strands"),
    (DataWarning, "because start or stop codons are outside exon boundaries."),

    # bed
    (FileFormatWarning, "Extra columns specified by."),
    (FileFormatWarning, "Are you sure this is a"),
    (FileFormatWarning, "Are you sure this BED file has extra columns"),
    (FileFormatWarning, "Maybe this BED has extra columns"),

    # gff_tokens
    (FileFormatWarning, "Found duplicate attribute key"),

    # BigBed
    (FileFormatWarning, "Could not find or could not parse autoSql declaration in BigBedFile"),

    # autoSql
    (DataWarning, "Could not convert autoSql value"),

    # psl
    (FileFormatWarning, "Rejecting line")
]

#===============================================================================
# INDEX: Deprecated alignment functions, now aliased to classes above
#===============================================================================
@deprecated(version="0.6.1", instead="AlignmentParser")
def get_alignment_file_parser(
        input_choices=("BAM", "bigwig", "bowtie", "wiggle"),
        disabled=None,
        prefix="",
        title=_DEFAULT_ALIGNMENT_FILE_PARSER_TITLE,
        description=_DEFAULT_ALIGNMENT_FILE_PARSER_DESCRIPTION,
        map_desc=_MAPPING_RULE_DESCRIPTION,
        return_subparsers=False
):
    """Deprecated wrapper: build an |AlignmentParser| and return its parser

    Parameters
    ----------
    input_choices : list, optional
        list of permitted alignment file type choices

    disabled : list, optional
        list of parameter names that should be disabled from parser,
        without preceding dashes

    prefix : str, optional
        string prefix to add to default argument options (Default: "")

    title : str, optional
        title for option group (used in command-line help screen)

    description : str, optional
        description of parser (used in command-line help screen)

    map_desc : str, optional
        Accepted for backward compatibility; not used by this wrapper

    return_subparsers : bool, optional
        Accepted for backward compatibility; not used by this wrapper

    Returns
    -------
    :class:`argparse.ArgumentParser`
    """
    builder = AlignmentParser(input_choices=input_choices, prefix=prefix, disabled=disabled)
    parser = builder.get_parser(title=title, description=description)
    return parser
@deprecated(version="0.6.1", instead="AlignmentParser.get_genome_array_from_args()")
def get_genome_array_from_args(args, prefix="", disabled=None, printer=None):
    """Return a |GenomeArray|, |SparseGenomeArray| or |BAMGenomeArray|
    from arguments parsed by :py:func:`get_alignment_file_parser`

    Deprecated wrapper that delegates to
    :meth:`AlignmentParser.get_genome_array_from_args`.

    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        Namespace object from :py:func:`get_alignment_file_parser`

    prefix : str, optional
        string prefix to add to default argument options. Must be the
        same prefix that was used in the call to
        :py:func:`get_alignment_file_parser` (Default: "")

    disabled : list, optional
        list of parameter names that were disabled when the argparser was
        created in :py:func:`get_alignment_file_parser`.

    printer : file-like, optional
        A stream to which stderr-like info can be written

    Returns
    -------
    |GenomeArray|, |SparseGenomeArray|, or |BAMGenomeArray|

    See Also
    --------
    get_alignment_file_parser
        Function that creates :py:class:`~argparse.ArgumentParser` whose
        output :py:class:`~argparse.Namespace` is processed by this function
    """
    builder = AlignmentParser(prefix=prefix, disabled=disabled)
    genome_array = builder.get_genome_array_from_args(args, printer=printer)
    return genome_array
#=============================================================================== # INDEX: deprecated annotation file parser, and helper functions #===============================================================================
@deprecated(version="0.6.1", instead="AnnotationParser")
def get_annotation_file_parser(
        input_choices=["BED", "BigBed", "GTF2", "GFF3"],
        disabled=[],
        prefix="",
        title=_DEFAULT_ANNOTATION_PARSER_TITLE,
        description=_DEFAULT_ANNOTATION_PARSER_DESCRIPTION,
        return_subparsers=False
):
    """Return an :py:class:`~argparse.ArgumentParser` that opens
    annotation files from `BED`_, `BigBed`_, `GTF2`_, or `GFF3`_ formats

    Deprecated wrapper around :class:`AnnotationParser`.

    Parameters
    ----------
    input_choices : list, optional
        list of permitted annotation file type choices.
        (Default: '["BED","BigBed","GTF2","GFF3"]'). 'PSL'_ may also be added

    disabled : list, optional
        list of parameter names that should be disabled from parser
        without preceding dashes

    prefix : str, optional
        string prefix to add to default argument options (Default: `''`)

    title : str, optional
        title for option group (used in command-line help screen)

    description : str, optional
        description of parser (used in command-line help screen)

    return_subparsers : bool, optional
        Accepted for backward compatibility; not used by this wrapper

    Returns
    -------
    :class:`argparse.ArgumentParser`

    See also
    --------
    get_transcripts_from_args
        function that parses the :py:class:`~argparse.Namespace`
        returned by this :py:class:`~argparse.ArgumentParser`
    """
    builder = AnnotationParser(
        groupname="annotation_options",
        prefix=prefix,
        disabled=disabled,
        input_choices=input_choices,
    )
    return builder.get_parser(title, description)
@deprecated(version="0.6.1", instead="AnnotationParser.get_transcripts_from_args()")
def get_transcripts_from_args(
        args, prefix="", disabled=[], printer=NullWriter(), return_type=None, require_sort=False
):
    """Return |Transcript| objects from arguments parsed by
    :py:func:`get_annotation_file_parser`

    Deprecated wrapper that delegates to
    :meth:`AnnotationParser.get_transcripts_from_args`.

    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        Namespace object from :py:func:`get_annotation_file_parser`

    prefix : str, optional
        string prefix to add to default argument options.
        Must be same prefix that was added in call to
        :py:func:`get_annotation_file_parser` (Default: `''`)

    disabled : list, optional
        list of parameter names that were disabled when the annotation file
        parser was created by :py:func:`get_annotation_file_parser`.
        (Default: `[]`)

    printer : file-like, optional
        A stream to which stderr-like info can be written
        (Default: |NullWriter|)

    return_type : |SegmentChain| or subclass, optional
        Type of object to return; forwarded unchanged

    require_sort : bool, optional
        If True, quit if the annotation file(s) are not sorted or indexed

    Returns
    -------
    iterator
        Whatever :meth:`AnnotationParser.get_transcripts_from_args` returns

    See Also
    --------
    get_annotation_file_parser
        Function that creates :py:class:`argparse.ArgumentParser` whose
        output :py:class:`~argparse.Namespace` is processed by this function
    """
    builder = AnnotationParser(groupname="annotation_options", prefix=prefix, disabled=disabled)
    transcripts = builder.get_transcripts_from_args(
        args,
        printer=printer,
        return_type=return_type,
        require_sort=require_sort,
    )
    return transcripts
@deprecated(version="0.6.1", instead="AnnotationParser.get_parser()")
def get_segmentchain_file_parser(
        input_choices=["BED", "BigBed", "GTF2", "GFF3", "PSL"],
        disabled=[],
        prefix="",
        title=_DEFAULT_ANNOTATION_PARSER_TITLE,
        description=_DEFAULT_ANNOTATION_PARSER_DESCRIPTION
):
    """Create an :class:`~argparse.ArgumentParser` to open annotation files
    as |SegmentChains|

    Deprecated wrapper around :func:`get_annotation_file_parser` that
    additionally disables the `add_three` option.

    Parameters
    ----------
    input_choices : list, optional
        list of permitted annotation file type choices
        (Default: `["BED","BigBed","GTF2","GFF3", "PSL"]`)

    disabled : list, optional
        list of parameter names that should be disabled from parser
        without preceding dashes. The list passed in is not modified.

    prefix : str, optional
        string prefix to add to default argument options (Default: `''`)

    title : str, optional
        title for option group (used in command-line help screen)

    description : str, optional
        description of parser (used in command-line help screen)

    Returns
    -------
    :class:`argparse.ArgumentParser`

    See Also
    --------
    get_segmentchains_from_args
        function that parses the :py:class:`~argparse.Namespace` returned
        by this :py:class:`~argparse.ArgumentParser`
    """
    # Build a new list rather than appending in place: the original code
    # mutated the shared default / caller's list, and appended a nested
    # list (`[name]`) that could never match a parameter name.
    # NOTE(review): the class-based API appears to match unprefixed names
    # in `disabled`; confirm whether `prefix` belongs in this name.
    disabled = list(disabled) + [prefix + "add_three"]
    return get_annotation_file_parser(
        input_choices=input_choices,
        prefix=prefix,
        title=title,
        disabled=disabled,
        description=description
    )
@deprecated(version="0.6.1", instead="AnnotationParser.get_transcripts_from_args()")
def get_segmentchains_from_args(
        args, prefix="", disabled=[], printer=NullWriter(), require_sort=False
):
    """Return |SegmentChain| objects from arguments parsed by an
    :class:`~argparse.ArgumentParser` created by
    :func:`get_segmentchain_file_parser`

    Deprecated wrapper around :func:`get_transcripts_from_args` that
    requests |SegmentChain| as the return type and additionally disables
    the `add_three` option.

    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        Namespace object from :py:func:`get_segmentchain_file_parser`

    prefix : str, optional
        string prefix to add to default argument options.
        Must be same prefix that was added in call to
        :py:func:`get_segmentchain_file_parser` (Default: "")

    disabled : list, optional
        list of parameter names that were disabled when the annotation file
        parser was created by :py:func:`get_segmentchain_file_parser`.
        (Default: ``[]``). The list passed in is not modified.

    printer : file-like
        A stream to which stderr-like info can be written
        (Default: |NullWriter|)

    require_sort : bool, optional
        If True, quit if the annotation file(s) are not sorted or indexed

    Returns
    -------
    iterator
        sequence of |SegmentChain| objects, as produced by
        :func:`get_transcripts_from_args`

    See Also
    --------
    get_segmentchain_file_parser
        Function that creates :py:class:`argparse.ArgumentParser` whose
        output :py:class:`~argparse.Namespace` is processed by this function
    """
    # Build a new list rather than appending in place: the original code
    # mutated the shared default / caller's list, and appended a nested
    # list (`[name]`) that could never match a parameter name.
    # NOTE(review): the class-based API appears to match unprefixed names
    # in `disabled`; confirm whether `prefix` belongs in this name.
    disabled = list(disabled) + [prefix + "add_three"]
    return get_transcripts_from_args(
        args,
        prefix=prefix,
        disabled=disabled,
        printer=printer,
        return_type=SegmentChain,
        require_sort=require_sort
    )
@deprecated(version="0.6.1", instead="AnnotationParser")
def get_mask_file_parser(prefix="mask_", disabled=[]):
    """Create an :class:`~argparse.ArgumentParser` to open annotation files
    that describe regions of the genome to mask from analyses

    Deprecated wrapper around :class:`AnnotationParser`. The argument
    group is named ``"<prefix>_options"``, and the parser is titled and
    described with the module's mask-parser title and description.

    Parameters
    ----------
    prefix : str, optional
        Prefix to add to default argument options (Default: `'mask_'`)

    disabled : list, optional
        list of parameter names to disable from the mask file parser
        (Default: `[]`)

    Returns
    -------
    argparse.ArgumentParser

    See Also
    --------
    get_genome_hash_from_mask_args
        function that parses the :py:class:`~argparse.Namespace` returned
        by this :py:class:`~argparse.ArgumentParser`
    """
    group = "%s_options" % prefix
    formats = ["BED", "GTF2", "GFF3", "BigBed", "PSL"]
    builder = AnnotationParser(
        groupname=group,
        prefix=prefix,
        disabled=disabled,
        input_choices=formats,
    )
    return builder.get_parser(_MASK_PARSER_TITLE, _MASK_PARSER_DESCRIPTION)
@deprecated(version="0.6.1", instead="AnnotationParser.get_genome_hash()")
def get_genome_hash_from_mask_args(args, prefix="mask_", printer=NullWriter()):
    """Return a |GenomeHash| of regions from command-line arguments

    Deprecated wrapper that delegates to
    :meth:`AnnotationParser.get_genome_hash_from_args`.

    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        Namespace object from :py:func:`get_mask_file_parser`

    prefix : str, optional
        string prefix to add to default argument options.
        Must be same prefix that was added in call to
        :py:func:`get_mask_file_parser` (Default: "mask_")

    printer : file-like
        A stream to which stderr-like info can be written
        (Default: |NullWriter|)

    Returns
    -------
    |GenomeHash|
        Hashed data structure of masked genomic regions

    See Also
    --------
    get_mask_file_parser
        Function that creates :py:class:`argparse.ArgumentParser` whose
        output :py:class:`~argparse.Namespace` is processed by this function
    """
    builder = AnnotationParser(groupname="mask_options", prefix=prefix)
    genome_hash = builder.get_genome_hash_from_args(args, printer=printer)
    return genome_hash
#=============================================================================== # INDEX: deprecated sequence file parser #===============================================================================
@deprecated(version="0.6.1", instead="SequenceParser")
def get_sequence_file_parser(
        input_choices=("fasta", "fastq", "twobit", "genbank", "embl"),
        disabled=(),
        prefix="",
        title=_DEFAULT_SEQUENCE_PARSER_TITLE,
        description=_DEFAULT_SEQUENCE_PARSER_DESCRIPTION
):
    """Return an :py:class:`~argparse.ArgumentParser` that opens
    sequence files

    Deprecated wrapper around :class:`SequenceParser`.

    Parameters
    ----------
    input_choices : list, optional
        list of permitted sequence file type choices.
        (Default: `("fasta", "fastq", "twobit", "genbank", "embl")`)

    disabled : list, optional
        list of parameter names that should be disabled from parser
        without preceding dashes

    prefix : str, optional
        string prefix to add to default argument options (Default: `''`)

    title : str, optional
        title for option group (used in command-line help screen)

    description : str, optional
        description of parser (used in command-line help screen)

    Returns
    -------
    :class:`argparse.ArgumentParser`

    See also
    --------
    get_seqdict_from_args
        function that parses the :py:class:`~argparse.Namespace`
        returned by this :py:class:`~argparse.ArgumentParser`
    """
    builder = SequenceParser(disabled=disabled, prefix=prefix, input_choices=input_choices)
    parser = builder.get_parser(title=title, description=description)
    return parser
@deprecated(version="0.6.1", instead="SequenceParser.get_seqdict_from_args()")
def get_seqdict_from_args(args, index=True, prefix="", printer=NullWriter()):
    """Retrieve a dictionary-like object of sequences

    Deprecated wrapper that delegates to
    :meth:`SequenceParser.get_seqdict_from_args`.

    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        Namespace object from :py:func:`get_sequence_file_parser`

    index : bool, optional
        If sequence format is anything other than twobit, open with
        lazily-evaluating :func:`Bio.SeqIO.index` instead of
        :func:`Bio.SeqIO.to_dict` (Default: `True`)

    prefix : str, optional
        string prefix to add to default argument options.
        Must be same prefix that was added in call to
        :py:func:`get_sequence_file_parser` (Default: "")

    printer : file-like
        A stream to which stderr-like info can be written
        (Default: |NullWriter|)

    Returns
    -------
    dict-like
        Dictionary-like object mapping chromosome names to
        :class:`Bio.SeqRecord.SeqRecord`-like objects
    """
    builder = SequenceParser(prefix=prefix)
    seqdict = builder.get_seqdict_from_args(args, index=index, printer=printer)
    return seqdict
#=============================================================================== # INDEX: deprecated plotting #===============================================================================
@deprecated(version="0.6.1", instead="PlottingParser")
def get_plotting_parser(prefix="", disabled=[], title=_DEFAULT_PLOTTING_TITLE):
    """Return an :py:class:`~argparse.ArgumentParser` to control plotting

    Deprecated wrapper around :class:`PlottingParser`.

    Parameters
    ----------
    prefix : str, optional
        string prefix to add to default argument options (Default: `''`)

    disabled : list, optional
        list of parameter names that should be disabled from parser
        without preceding dashes

    title : str, optional
        title for option group (used in command-line help screen)

    Returns
    -------
    :class:`argparse.ArgumentParser`

    See also
    --------
    get_colors_from_args
        parse colors and/or colormaps from this argument parser
    """
    builder = PlottingParser(prefix=prefix, disabled=disabled)
    parser = builder.get_parser(title=title)
    return parser
@deprecated(version="0.6.1", instead="PlottingParser.get_figure_from_args()")
def get_figure_from_args(args, **kwargs):
    """Return a :class:`matplotlib.figure.Figure` following arguments
    from :func:`get_plotting_parser`

    Deprecated wrapper that delegates to
    :meth:`PlottingParser.get_figure_from_args` on a default-constructed
    :class:`PlottingParser`.

    Parameters
    ----------
    args : :class:`argparse.Namespace`
        Namespace object from :func:`get_plotting_parser`

    kwargs : keyword arguments
        Forwarded unchanged to :meth:`PlottingParser.get_figure_from_args`

    Returns
    -------
    :class:`matplotlib.figure.Figure`
        Matplotlib figure
    """
    builder = PlottingParser()
    figure = builder.get_figure_from_args(args, **kwargs)
    return figure
@deprecated(version="0.6.1", instead="PlottingParser.get_colors_from_args()")
def get_colors_from_args(args, num_colors):
    """Return a list of colors from arguments parsed by a parser
    from :func:`get_plotting_parser`

    Deprecated wrapper that delegates to
    :meth:`PlottingParser.get_colors_from_args` on a default-constructed
    :class:`PlottingParser`.

    Parameters
    ----------
    args : :class:`argparse.Namespace`
        Namespace object from :func:`get_plotting_parser`

    num_colors : int
        Number of colors to fetch

    Returns
    -------
    list
        List of matplotlib colors
    """
    builder = PlottingParser()
    colors = builder.get_colors_from_args(args, num_colors)
    return colors
#=============================================================================== # INDEX: Utility classes #===============================================================================
class PrefixNamespaceWrapper(object):
    """Wrapper class to facilitate processing of :py:class:`~argparse.Namespace`
    objects created by :py:func:`get_alignment_file_parser` or
    :py:func:`get_annotation_file_parser` with non-empty ``prefix`` values, as
    if no prefix had been used.

    Attribute lookups that are not satisfied by the wrapper itself are
    forwarded to `self.namespace`, with `self.prefix` prepended to the
    requested name.

    Attributes
    ----------
    namespace : :py:class:`~argparse.Namespace`
        Result of calling :py:meth:`argparse.ArgumentParser.parse_args`

    prefix : str
        Prefix that will be prepended to names of attributes of `self.namespace`
        before they are fetched. Must match prefix that was used in creation
        of the :py:class:`argparse.ArgumentParser` that created `self.namespace`

    See Also
    --------
    get_annotation_file_parser
    get_alignment_file_parser
    get_genome_array_from_args
    get_transcripts_from_args
    """

    def __init__(self, namespace, prefix):
        """Create a |PrefixNamespaceWrapper|

        Parameters
        ----------
        namespace : :py:class:`~argparse.Namespace`
            Result of calling :py:meth:`argparse.ArgumentParser.parse_args`

        prefix : str
            Prefix that will be prepended to items from the
            :py:class:`~argparse.Namespace` before they are checked
        """
        self.namespace = namespace
        self.prefix = prefix

    def __getattr__(self, k):
        """Fetch an attribute from `self.namespace`, prepending `self.prefix`
        to `k` before fetching

        Python only calls this method after normal attribute lookup on the
        wrapper has failed.

        Parameters
        ----------
        k : str
            Attribute to fetch

        Returns
        -------
        object
            Value of attribute ``prefix + k`` of `self.namespace`

        Raises
        ------
        AttributeError
            If `self.namespace` has no attribute ``prefix + k``, or if `k` is
            ``namespace`` or ``prefix`` and the wrapper has not been
            initialized
        """
        # `namespace` and `prefix` only reach this method when they are missing
        # from the instance, i.e. the object was created without __init__
        # (as `copy` and `pickle` do). Reading them below would then re-enter
        # __getattr__ until RecursionError, so report them as absent instead.
        if k in ("namespace", "prefix"):
            raise AttributeError(k)

        return getattr(self.namespace, "%s%s" % (self.prefix, k))
#===============================================================================
# INDEX: Utility functions
#===============================================================================


def _parse_variable_offset_file(fh):
    """Read a variable-offset text file into a dictionary.

    These text files contain two columns and are tab-delimited. The first
    column specifies the read length, or contains the special value
    `'default'`. The second column specifies the offset from the 5' end
    of that read length to use.

    Lines beginning with ``length`` (the header row) and lines containing
    only whitespace are skipped.

    Parameters
    ----------
    fh : file-like
        open filehandle pointing to data

    Returns
    -------
    dict
        dictionary mapping sequencing read lengths to their 5' offsets

    Raises
    ------
    MalformedFileError
        If a data line does not have exactly two tab-separated columns, if a
        read length is neither an integer nor `'default'`, if an offset is not
        an integer, or if a read length is listed more than once
    """
    # Label used in error messages. `__name__` is honored first for backward
    # compatibility; ordinary file objects expose their path as `name`.
    source = getattr(fh, "__name__", getattr(fh, "name", "Variable offset file"))

    offsets = {}
    for line in fh:
        # header row, or blank line (e.g. trailing newline at end of file)
        if line.startswith("length") or not line.strip():
            continue

        text = line.strip("\n")
        items = text.split("\t")
        if len(items) != 2:
            raise MalformedFileError(
                source, "More or fewer than two columns on line:\n\t%s" % text
            )

        raw_length, raw_offset = items

        try:
            key = raw_length if raw_length == "default" else int(raw_length)
        except ValueError:
            raise MalformedFileError(
                source,
                "Non integer value for key '%s' on line:\n\t%s" % (raw_length, text),
            )

        if key in offsets:
            raise MalformedFileError(
                source, "multiple offsets defined for read length %s" % key
            )

        try:
            offsets[key] = int(raw_offset)
        except ValueError:
            raise MalformedFileError(
                source,
                "Non integer value for value '%s' on line:\n\t%s" % (raw_offset, text),
            )

    return offsets