Source code for plastid.bin.psite

#!/usr/bin/env python
"""This script estimates :term:`P-site offsets <P-site offset>`, stratified by read length,
in a :term:`ribosome profiling` dataset. To do so, read alignments are mapped to
their fiveprime ends, and a :term:`metagene` profile surrounding the start 
codon is calculated separately for each read length.

The start codon peak for each read length is heuristically identified as the
largest peak upstream of the start codon, or within a region defined by the user.
The distance between that peak and the start codon itself is taken to be the
:term:`P-site offset` for that read length.


Notes
------
Generate an *ROI file first*
    This script requires an ROI file of :term:`maximal spanning windows <maximal spanning window>`
    surrounding each gene's start codon. This file can be generated by the
    ``generate`` subprogram of the |metagene| script.
    
Check the data
    Users should examine the graphical output to make sure the P-site estimates
    are reasonable, because if clear start codon peaks are not present in the
    data, the algorithm described above will have trouble.

For RNase I only
    This algorithm presumes that the RNase used to prepare the ribosome-protected
    footprints has no appreciable cutting bias, so that footprints may be
    clearly resolved to the edge of the ribosome.


Output files
------------
    OUTBASE_p_offsets.txt
        Tab-delimited text file with two columns. The first is read length,
        and the second the offset from the fiveprime end of that read length
        to the ribosomal P-site. This table can be supplied as the argument 
        for ``--offset`` when using ``--fiveprime_variable`` mapping in any
        of the other scripts in :obj:`plastid.bin`

    OUTBASE_p_offsets.[svg | png | pdf | et c]
        Plot of metagene profiles for each read length, when reads are mapped
        to their 5' ends, :term:`P-site offsets <P-site offset>` are applied.

    OUTBASE_metagene_profiles.txt
        Metagene profiles, stratified by read length, before :term:`P-site offsets <P-site offset>`
        are applied.

    OUTBASE_K_rawcounts.txt
        Saved if ``--keep`` is given on command line. Raw count vectors for each
        :term:`metagene` window specified in input ROI file, without P-site
        mapping rules applied, for reads of length `K`

    OUTBASE_K_normcounts.txt
        Saved if ``--keep`` is given on command line. Normalized count vectors
        for each metagene window specified in input ROI file, without P-site
        mapping rules applied, for reads of length `K`

where `OUTBASE` is supplied by the user.
"""
import sys
import argparse
import inspect
import warnings

import matplotlib
matplotlib.use("Agg")
import numpy
import pandas as pd
import matplotlib.pyplot as plt

from collections import OrderedDict
from plastid.util.scriptlib.argparsers import (
    AlignmentParser,
    PlottingParser,
    BaseParser,
)
from plastid.genomics.roitools import SegmentChain
from plastid.util.io.openers import get_short_name, argsopener, NullWriter, opener
from plastid.util.io.filters import NameDateWriter
from plastid.util.scriptlib.help_formatters import format_module_docstring
from plastid.util.services.exceptions import ArgumentWarning

# to handle deprecated command-line arguments
from plastid.bin.metagene import _get_norm_region

warnings.simplefilter("once")
printer = NameDateWriter(get_short_name(inspect.stack()[-1][1]))


[docs]def do_count( roi_table, ga, norm_start, norm_end, min_counts, min_len, max_len, aggregate=False, printer=NullWriter() ): """Calculate a :term:`metagene profile` for each read length in the dataset Parameters ---------- roi_table : :class:`pandas.DataFrame` Table specifying regions of interest, generated by :py:func:`plastid.bin.metagene.do_generate` ga : |BAMGenomeArray| Count data norm_start : int Coordinate in window specifying normalization region start norm_end : int Coordinate in window specifying normalization region end min_counts : float Minimum number of counts in `window[norm_start:norm_end]` required for inclusion in metagene profile min_len : int Minimum read length to include max_len : int Maximum read length to include aggregate : bool, optional Estimate P-site from aggregate reads at each position, instead of median normalized read density. Potentially noisier, but helpful for lower-count data or read lengths with few counts. (Default: False) printer : file-like, optional filehandle to write logging info to (Default: :func:`~plastid.util.io.openers.NullWriter`) Returns ------- dict Dictionary of :class:`numpy.ndarray` s of raw counts at each position (column) for each window (row) dict Dictionary of :class:`numpy.ndarray` s of normalized counts at each position (column) for each window (row), normalized by the total number of counts in that row from `norm_start` to `norm_end` :class:`pandas.DataFrame` Metagene profile of median normalized counts at each position across all windows, and the number of windows included in the calculation of each median, stratified by read length """ window_size = roi_table["window_size"][0] upstream_flank = roi_table["zero_point"][0] raw_count_dict = OrderedDict() norm_count_dict = OrderedDict() shape = (len(roi_table), window_size) for i in range(min_len, max_len + 1): # mask all by default raw_count_dict[i] = numpy.ma.MaskedArray( numpy.tile(numpy.nan, shape), mask=numpy.tile(True, shape), dtype=float ) for i, row in roi_table.iterrows(): if i % 1000 == 0: printer.write("Counted %s ROIs ..." % (i + 1)) roi = SegmentChain.from_str(row["region"]) mask = SegmentChain.from_str(row["masked"]) roi.add_masks(*mask) valid_mask = roi.get_masked_counts(ga).mask offset = int(round((row["alignment_offset"]))) assert offset + roi.length <= window_size count_vectors = {} for k in raw_count_dict: count_vectors[k] = [] for seg in roi: reads = ga.get_reads(seg) read_dict = {} for k in raw_count_dict: read_dict[k] = [] for read in filter(lambda x: len(x.positions) in read_dict, reads): read_dict[len(read.positions)].append(read) for k in read_dict: count_vector = ga.map_fn(read_dict[k], seg)[1] count_vectors[k].extend(count_vector) for k in raw_count_dict: if roi.strand == "-": count_vectors[k] = count_vectors[k][::-1] raw_count_dict[k].data[i, offset:offset + roi.length] = numpy.array(count_vectors[k]) raw_count_dict[k].mask[i, offset:offset + roi.length] = valid_mask profile_table = {"x": numpy.arange(-upstream_flank, window_size - upstream_flank)} printer.write("Counted %s ROIs total." % (i + 1)) for k in raw_count_dict: k_raw = raw_count_dict[k] denominator = numpy.nansum(k_raw[:, norm_start:norm_end], axis=1) norm_count_dict[k] = (k_raw.T.astype(float) / denominator).T # copy mask from raw counts, then add nans and infs norm_counts = numpy.ma.MaskedArray(norm_count_dict[k], mask=k_raw.mask) norm_counts.mask[numpy.isnan(norm_counts)] = True norm_counts.mask[numpy.isinf(norm_counts)] = True with warnings.catch_warnings(): # ignore numpy mean of empty slice warning, given by numpy in Python 2.7-3.4 warnings.filterwarnings("ignore", ".*mean of empty.*", RuntimeWarning) try: if aggregate == False: profile = numpy.ma.median(norm_counts[denominator >= min_counts], axis=0) else: profile = numpy.nansum(k_raw[denominator >= min_counts], axis=0) # in numpy under Python3.5, this is an IndexError instead of a warning except IndexError: profile = numpy.zeros_like(profile_table["x"], dtype=float) # in new versions of numpy, this is a ValueEror instead of an IndexError except ValueError: profile = numpy.zeros_like(profile_table["x"], dtype=float) num_genes = ((~norm_counts.mask)[denominator >= min_counts]).sum(0) profile_table["%s-mers" % k] = profile profile_table["%s_regions_counted" % k] = num_genes profile_table = pd.DataFrame(profile_table) return raw_count_dict, norm_count_dict, profile_table
[docs]def main(argv=sys.argv[1:]): """Command-line program Parameters ---------- argv : list, optional A list of command-line arguments, which will be processed as if the script were called from the command line if :py:func:`main` is called directrly. Default: `sys.argv[1:]`. The command-line arguments, if the script is invoked from the command line """ ap = AlignmentParser( allow_mapping=False, input_choices=["BAM"], disabled=[ "normalize", "big_genome", ] ) bp = BaseParser() alignment_file_parser = ap.get_parser() base_parser = bp.get_parser() pp = PlottingParser() plotting_parser = pp.get_parser() parser = argparse.ArgumentParser( description=format_module_docstring(__doc__), formatter_class=argparse.RawDescriptionHelpFormatter, parents=[base_parser, alignment_file_parser, plotting_parser] ) parser.add_argument( "--min_counts", type=int, default=10, metavar="N", help="Minimum counts required in normalization region " + "to be included in metagene average (Default: 10)" ) parser.add_argument( "--normalize_over", type=int, nargs=2, metavar="N", default=None, #default=(20,50), help="Portion of each window against which its individual raw count profile" + " will be normalized. Specify two integers, in nucleotide" + " distance from landmark (negative for upstream, positive for downstream. Surround negative numbers with quotes.). (Default: 20 50)" ) parser.add_argument( "--norm_region", type=int, nargs=2, metavar="N", default=None, help="Deprecated. Use ``--normalize_over`` instead. " + "Formerly, Portion of each window against which its individual raw count profile" + " will be normalized. Specify two integers, in nucleotide" + " distance, from 5\' end of window. (Default: 70 100)" ) parser.add_argument( "--require_upstream", default=False, action="store_true", help="If supplied, the P-site offset is taken to be the distance " + "between the largest peak upstream of the start codon and " + "the start codon itself. Otherwise, the P-site offset is taken " + "to be the distance between the largest peak in the entire ROI " + "and the start codon. Ignored if ``--constrain`` is used." ) parser.add_argument( "--constrain", type=int, nargs=2, default=None, metavar="X", help="Constrain P-site offset to be between specified distance from " + "start codon. Useful for noisy data. " + "(Reasonable set: 10 15; default: not constrained)" ) parser.add_argument( "--aggregate", default=False, action="store_true", help="Estimate P-site from aggregate reads at each position, instead " + "of median normalized read density. Noisier, but helpful for " + "lower-count data or read lengths with few counts. (Default: False)" ), parser.add_argument( "--keep", default=False, action="store_true", help="Save intermediate count files. Useful for additional computations (Default: False)" ) parser.add_argument( "--default", type=int, default=13, help= "Default 5\' P-site offset for read lengths that are not present or evaluated in the dataset. Unaffected by ``--constrain`` (Default: 13)" ) parser.add_argument( "roi_file", type=str, help="ROI file surrounding start codons, from ``metagene generate`` subprogram" ) parser.add_argument("outbase", type=str, help="Basename for output files") # set manual options args = parser.parse_args(argv) bp.get_base_ops_from_args(args) # set defaults args.mapping = "fiveprime" args.offset = 0 args.nibble = 0 # process arguments min_len = args.min_length max_len = args.max_length profiles = max_len + 1 - min_len lengths = list(range(min_len, max_len + 1)) outbase = args.outbase title = "Fiveprime read offsets by length" if args.title is None else args.title pp.set_style_from_args(args) colors = pp.get_colors_from_args(args, profiles) printer.write("Opening ROI file %s ..." % args.roi_file) with opener(args.roi_file) as roi_fh: roi_table = pd.read_table(roi_fh, sep="\t", comment="#", index_col=None, header=0) roi_fh.close() printer.write("Opening count files %s ..." % ",".join(args.count_files)) ga = ap.get_genome_array_from_args(args, printer=printer) # remove default size filters my_filters = ga._filters.keys() for f in my_filters: ga.remove_filter(f) norm_start, norm_end = _get_norm_region(roi_table, args) # count count_dict, norm_count_dict, metagene_profile = do_count( roi_table, ga, norm_start, norm_end, args.min_counts, min_len, max_len, aggregate=args.aggregate, printer=printer ) # save counts profile_fn = "%s_metagene_profiles.txt" % outbase with argsopener(profile_fn, args, "w") as metagene_out: metagene_profile.to_csv( metagene_out, sep="\t", header=True, index=False, na_rep="nan", columns=["x"] + ["%s-mers" % X for X in lengths] ) metagene_out.close() if args.keep == True: printer.write("Saving raw and normalized counts ...") for k in count_dict: count_fn = "%s_%s_rawcounts.txt.gz" % (outbase, k) normcount_fn = "%s_%s_normcounts.txt.gz" % (outbase, k) mask_fn = "%s_%s_mask.txt.gz" % (outbase, k) numpy.savetxt(count_fn, count_dict[k], delimiter="\t") numpy.savetxt(normcount_fn, norm_count_dict[k], delimiter="\t") numpy.savetxt(mask_fn, norm_count_dict[k].mask, delimiter="\t") # plotting & offsets printer.write("Plotting and determining offsets ...") offset_dict = OrderedDict() # Determine scaling factor for plotting metagene profiles max_y = numpy.nan with warnings.catch_warnings(): # ignore warnings for slices that contain only NaNs warnings.simplefilter("ignore", category=RuntimeWarning) for k in lengths: max_y = numpy.nanmax([max_y, numpy.nanmax(metagene_profile["%s-mers" % k].values)]) if numpy.isnan(max_y) or max_y == 0: max_y = 1.0 # parse arguments & set styles mplrc = matplotlib.rcParams plt_incr = 1.2 # use this figsize if not specified on command line figheight = 1.0 + 0.25 * (profiles - 1) + 0.75 * (profiles) default_figsize = (7.5, figheight) fig = pp.get_figure_from_args(args, figsize=default_figsize) ax = plt.gca() plt.title(title) plt.xlabel("Distance from CDS start, (nt; 5' end mapping)") if args.aggregate == True: plt.ylabel("Aggregate read counts (au)") else: plt.ylabel("Median normalized read density (au)") plt.axvline(0.0, color=mplrc["axes.edgecolor"], dashes=[3, 2]) x = metagene_profile["x"].values xmin = x.min() xmax = x.max() if args.constrain is not None: mask = numpy.tile(True, len(x)) zp = (x == 0).argmax() l, r = args.constrain if l == r: warnings.warn( "Minimum and maximum distance constraints are equal (both '%s')." % l, ArgumentWarning ) mindist = min(l, r) maxdist = max(l, r) mask[zp - maxdist:zp - mindist + 1] = False elif args.require_upstream == True: mask = x >= 0 else: mask = numpy.tile(False, len(x)) for n, k in enumerate(lengths): color = colors[n] baseline = plt_incr * n y = metagene_profile["%s-mers" % k].values #ymask = y[mask] ymask = numpy.ma.MaskedArray(y, mask=mask) if numpy.isnan(y).all(): plot_y = numpy.zeros_like(x) else: if args.aggregate == False: plot_y = y / max_y else: plot_y = y.astype(float) / numpy.nanmax(y) * 0.9 # plot metagene profiles on common scale, offset by baseline from bottom to top ax.plot(x, baseline + plot_y, color=color) ax.text( xmin, baseline, "%s-mers" % k, ha="left", va="bottom", color=color, transform=matplotlib.transforms.offset_copy( ax.transData, fig, x=6.0, y=3.0, units="points", ) ) ymax = baseline + numpy.nanmax(plot_y) # if all valid positions are nan, or if all valid positions are <= 0 if (~mask).sum() == numpy.isnan(ymask).sum() or numpy.nanmax(ymask) == 0: offset = args.default usedefault = True else: offset = -x[numpy.ma.argmax(ymask)] usedefault = False offset_dict[k] = offset if usedefault == False: yadj = ymax - 0.2 * plt_incr ax.plot([-offset, 0], [yadj, yadj], color=color, dashes=[3, 2]) ax.text( -offset / 2.0, yadj, "%s nt" % (offset), color=color, ha="center", va="bottom", transform=matplotlib.transforms.offset_copy( ax.transData, fig, x=0.0, y=3.0, units="points", ) ) plt.xlim(xmin, xmax) plt.ylim(-0.1, plt_incr + baseline) ax.yaxis.set_ticks([]) # save data as p-site offset table fn = "%s_p_offsets.txt" % outbase fout = argsopener(fn, args) printer.write("Writing offset table to %s ..." % fn) fout.write("length\tp_offset\n") for k in offset_dict: fout.write("%s\t%s\n" % (k, offset_dict[k])) fout.write("default\t%s" % args.default) fout.close() # save plot plot_fn = "%s_p_offsets.%s" % (outbase, args.figformat) printer.write("Saving plot to %s ..." % plot_fn) plt.savefig(plot_fn, dpi=args.dpi, bbox_inches="tight") printer.write("Done.")
if __name__ == "__main__": main()