Source code for plastid.bin.gff_parent_types

#!/usr/bin/env python
"""Exports a table of parent-child feature relationships for all feature types
found in a `GFF3`_ file. Features with multiple parents are counted in a column
called `multiple parents`.
"""
from plastid.readers.gff import GFF3_Reader
from plastid.util.io.filters import NameDateWriter
from plastid.util.io.openers import get_short_name, opener, argsopener
from plastid.util.scriptlib.help_formatters import format_module_docstring
from plastid.util.scriptlib.argparsers import BaseParser
from collections import Counter
import argparse
import sys
import inspect
import warnings

# Show each matching warning only the first time it occurs.
warnings.simplefilter("once")
# Module-level status writer used by main() for progress messages. It is
# labelled using the file name (index 1) of the last entry returned by
# inspect.stack(), i.e. the outermost frame on the call stack, after passing
# it through get_short_name.
# NOTE(review): get_short_name presumably strips the directory/extension and
# NameDateWriter presumably prefixes messages with that name and a timestamp;
# confirm against their definitions.
printer = NameDateWriter(get_short_name(inspect.stack()[-1][1]))


def main(argv=sys.argv[1:]):
    """Tabulate parent-child relationships between feature types in a GFF3 file.

    Every feature in the input is read once. Features whose type is not
    excluded are counted per type, and their names are recorded so that the
    `Parent` attributes of other features can be resolved to a parent type.
    The result is written as a tab-delimited table with one row per feature
    type, one column per possible parent type, plus the columns
    `parent unspecified`, `parent not in database`, and `multiple parents`.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    bp = BaseParser()
    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser()],
    )
    parser.add_argument(
        "--exclude",
        nargs="+",
        default=[],
        help="Feature types to exclude from consideration",
    )
    parser.add_argument("infile", metavar="infile.gff", type=str, help="Input GFF3 file")
    parser.add_argument("outfile", metavar="outfile.txt", type=str, help="Name of output file")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    excluded = set(args.exclude)

    # "-" selects standard input, which this function does not own and must
    # therefore never close.
    reading_stdin = args.infile == "-"
    fin = sys.stdin if reading_stdin else opener(args.infile)

    feature_counts = Counter()      # feature type -> number of features seen
    features_with_parents = []      # features carrying a `Parent` attribute
    feature_types = {}              # feature type -> Counter of parent categories
    name_type = {}                  # feature name -> feature type

    printer.write("Opening %s..." % args.infile)
    try:
        # First pass: count features and learn the type of every named
        # feature. Parents are resolved afterwards because a child may appear
        # in the file before its parent.
        for c, feature in enumerate(GFF3_Reader(fin, return_stopfeatures=False)):
            if c % 10000 == 0:
                printer.write("Processed %s features..." % c)

            ftype = feature.attr["type"]
            fname = feature.get_name()
            if ftype in excluded:
                continue

            if ftype not in feature_types:
                feature_types[ftype] = Counter()

            feature_counts[ftype] += 1
            if fname is not None:
                name_type[fname] = ftype

            if "Parent" in feature.attr:
                features_with_parents.append(feature)
            else:
                feature_types[ftype]["parent unspecified"] += 1
    finally:
        # Release the input handle even if reading fails part-way through.
        if not reading_stdin:
            fin.close()

    printer.write("Sorting parents...")
    # Second pass: classify each feature by the type of its parent.
    for c, feature in enumerate(features_with_parents):
        if c % 10000 == 0:
            printer.write("Processed %s parents..." % c)

        pnames = feature.attr["Parent"]
        ftype = feature.attr["type"]
        if not pnames:
            # An empty `Parent` value (empty string or empty sequence) counts
            # as unspecified; this also keeps `pnames[0]` below from failing
            # on an empty sequence.
            feature_types[ftype]["parent unspecified"] += 1
        elif len(pnames) > 1:
            feature_types[ftype]["multiple parents"] += 1
        else:
            # Parents that were excluded, unnamed, or absent from the file
            # were never recorded in `name_type`.
            ptype = name_type.get(pnames[0], "parent not in database")
            feature_types[ftype][ptype] += 1

    rows = sorted(feature_types)
    cols = rows + ["parent unspecified", "parent not in database", "multiple parents"]

    with argsopener(args.outfile, args, "w") as fh:
        printer.write("Writing %s..." % args.outfile)
        fh.write("#feature_type\tcount\t" + "\t".join(cols) + "\n")
        for r in rows:
            parent_counts = feature_types[r]
            fields = [str(r), str(feature_counts[r])]
            fields.extend(str(parent_counts.get(col, 0)) for col in cols)
            fh.write("\t".join(fields) + "\n")

    printer.write("Done.")
# Run the command-line program only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__": main()