Source code for plastid.bin.get_count_vectors

#!/usr/bin/env python
"""Fetch vectors of :term:`counts` at each nucleotide position in one or more
regions of interest (ROIs).


Output files
------------
Vectors are saved as individual line-delimited files -- one position per line --
in a user-specified output folder. Each file is named for the ROI to which it
corresponds. If a :term:`mask file` -- e.g. from  :py:mod:`~plastid.bin.crossmap`
-- is provided, masked positions will be have value `nan` in output.
"""
import argparse
import inspect
import os
import warnings
import sys
import numpy

from plastid.util.scriptlib.argparsers import (
    AlignmentParser,
    AnnotationParser,
    MaskParser,
    BaseParser,
)
from plastid.util.io.openers import get_short_name
from plastid.util.io.filters import NameDateWriter
from plastid.util.scriptlib.help_formatters import format_module_docstring

warnings.simplefilter("once")
printer = NameDateWriter(get_short_name(inspect.stack()[-1][1]))


[docs]def main(args=sys.argv[1:]): """Command-line program Parameters ---------- argv : list, optional A list of command-line arguments, which will be processed as if the script were called from the command line if :func:`main` is called directly. Default: `sys.argv[1:]`. The command-line arguments, if the script is invoked from the command line """ al = AlignmentParser() an = AnnotationParser() mp = MaskParser() bp = BaseParser() alignment_file_parser = al.get_parser(conflict_handler="resolve") annotation_file_parser = an.get_parser(conflict_handler="resolve") mask_file_parser = mp.get_parser() base_parser = bp.get_parser() parser = argparse.ArgumentParser( description = format_module_docstring(__doc__), formatter_class = argparse.RawDescriptionHelpFormatter, conflict_handler = "resolve", parents = [base_parser, alignment_file_parser, annotation_file_parser, mask_file_parser] ) # yapf: disable parser.add_argument("out_folder", type=str, help="Folder in which to save output vectors") parser.add_argument( "--out_prefix", default="", type=str, help="Prefix to prepend to output files (default: no prefix)" ) parser.add_argument( "--format", default="%.8f", type=str, help=r"printf-style format string for output (default: '%%.8f')" ) args = parser.parse_args(args) bp.get_base_ops_from_args(args) # if output folder doesn't exist, create it if not os.path.isdir(args.out_folder): os.mkdir(args.out_folder) # parse args # yapf: disable ga = al.get_genome_array_from_args(args, printer=printer) transcripts = an.get_segmentchains_from_args(args, printer=printer) mask_hash = mp.get_genome_hash_from_args(args, printer=printer) # yapf: enable # evaluate for n, tx in enumerate(transcripts): if n % 1000 == 0: printer.write("Processed %s regions of interest" % n) filename = "%s%s.txt" % (args.out_prefix, tx.get_name()) full_filename = os.path.join(args.out_folder, filename) # mask out overlapping masked regions overlapping = mask_hash.get_overlapping_features(tx) for feature in overlapping: tx.add_masks(*feature.segments) count_vec = tx.get_masked_counts(ga) numpy.savetxt(full_filename, count_vec, fmt=args.format)
if __name__ == "__main__": main()