Source code for plastid.bin.counts_in_region

#!/usr/bin/env python
"""Count the number of :term:`read alignments<alignment>` covering 
regions of interest in the genome, and calculate read densities (in reads
per nucleotide and in :term:`RPKM`) over these regions.

Results are output as a table with the following columns:

    ========================  ==================================================
    **Name**                  **Definition**
    ------------------------  --------------------------------------------------
    `region_name`             Name or ID of region of interest
    
    `region`                  Genomic coordinates of region, formatted as
                              described in
                              :meth:`plastid.genomics.roitools.SegmentChain.from_str`
                              
    `counts`                  Number of reads mapping to region
    
    `counts_per_nucleotide`   Read density, measured in number of reads mapping
                              to region, divided by length of region
                              
    `rpkm`                    Read density, measured in :term:`RPKM`
    
    `length`                  Region length, in nucleotides
    ========================  ==================================================
    
If a :term:`mask annotation file` is supplied, masked portions of regions
will be excluded when tabulating counts, measuring region length, and calculating
`counts_per_nucleotide` and `rpkm`.


See also
--------
:mod:`~plastid.bin.cs` script
    Count the number of :term:`read alignments<alignment>` and calculate
    read densities (in :term:`RPKM`) specifically for genes and sub-regions
    (5\' UTR, CDS, 3\' UTR), excluding positions covered by multiple genes
"""
import argparse
import inspect
import sys
import itertools
import warnings
import numpy

from plastid.genomics.roitools import SegmentChain
from plastid.util.io.filters import NameDateWriter
from plastid.util.io.openers import argsopener, get_short_name
from plastid.util.scriptlib.argparsers import (
    AnnotationParser,
    AlignmentParser,
    MaskParser,
    BaseParser,
)
from plastid.util.scriptlib.help_formatters import format_module_docstring

warnings.simplefilter("once")
printer = NameDateWriter(get_short_name(inspect.stack()[-1][1]))

_DISABLED = ["normalize"]


[docs]def main(argv=sys.argv[1:]): """Command-line program Parameters ---------- argv : list, optional A list of command-line arguments, which will be processed as if the script were called from the command line if :func:`main` is called directly. Default: `sys.argv[1:]`. The command-line arguments, if the script is invoked from the command line """ ap = AnnotationParser() annotation_file_parser = ap.get_parser(conflict_handler="resolve") al = AlignmentParser(disabled=_DISABLED) alignment_file_parser = al.get_parser(conflict_handler="resolve") mp = MaskParser() mask_file_parser = mp.get_parser() bp = BaseParser() base_parser = bp.get_parser() parser = argparse.ArgumentParser( description = format_module_docstring(__doc__), formatter_class = argparse.RawDescriptionHelpFormatter, parents = [ base_parser, alignment_file_parser, annotation_file_parser, mask_file_parser, ], ) # yapf: disable parser.add_argument("outfile", type=str, help="Output filename") args = parser.parse_args(argv) bp.get_base_ops_from_args(args) ga = al.get_genome_array_from_args(args, printer=printer) transcripts = ap.get_transcripts_from_args(args, printer=printer, return_type=SegmentChain) crossmap = mp.get_genome_hash_from_args(args, printer=printer) ga_sum = ga.sum() normconst = 1000.0 * 1e6 / ga_sum with argsopener(args.outfile, args, "w") as fout: fout.write("## total_dataset_counts: %s\n" % ga_sum) fout.write("region_name\tregion\tcounts\tcounts_per_nucleotide\trpkm\tlength\n") for n, ivc in enumerate(transcripts): name = ivc.get_name() masks = crossmap.get_overlapping_features(ivc) ivc.add_masks(*itertools.chain.from_iterable((X for X in masks))) if n % 1000 == 0: printer.write("Processed %s regions..." % n) counts = numpy.nansum(ivc.get_masked_counts(ga)) length = ivc.masked_length rpnt = numpy.nan if length == 0 else float(counts) / length rpkm = numpy.nan if length == 0 else rpnt * normconst ltmp = [name, str(ivc), "%.8e" % counts, "%.8e" % rpnt, "%.8e" % rpkm, "%d" % length] fout.write("%s\n" % "\t".join(ltmp)) fout.close() printer.write("Processed %s regions total." % n) printer.write("Done.")
if __name__ == '__main__': main()