#!/usr/bin/env python
"""This script estimates :term:`P-site offsets <P-site offset>`, stratified by read length,
in a :term:`ribosome profiling` dataset. To do so, read alignments are mapped to
their fiveprime ends, and a :term:`metagene` profile surrounding the start
codon is calculated separately for each read length.
The start codon peak for each read length is heuristically identified as the
largest peak upstream of the start codon, or within a region defined by the user.
The distance between that peak and the start codon itself is taken to be the
:term:`P-site offset` for that read length.
Notes
------
Generate an *ROI file first*
This script requires an ROI file of :term:`maximal spanning windows <maximal spanning window>`
surrounding each gene's start codon. This file can be generated by the
``generate`` subprogram of the |metagene| script.
Check the data
Users should examine the graphical output to make sure the P-site estimates
are reasonable, because if clear start codon peaks are not present in the
data, the algorithm described above will have trouble.
For RNase I only
This algorithm presumes that the RNase used to prepare the ribosome-protected
footprints has no appreciable cutting bias, so that footprints may be
clearly resolved to the edge of the ribosome.
Output files
------------
OUTBASE_p_offsets.txt
Tab-delimited text file with two columns. The first is read length,
and the second the offset from the fiveprime end of that read length
to the ribosomal P-site. This table can be supplied as the argument
for ``--offset`` when using ``--fiveprime_variable`` mapping in any
of the other scripts in :obj:`plastid.bin`
OUTBASE_p_offsets.[svg | png | pdf | et c]
Plot of metagene profiles for each read length, when reads are mapped
to their 5' ends, :term:`P-site offsets <P-site offset>` are applied.
OUTBASE_metagene_profiles.txt
Metagene profiles, stratified by read length, before :term:`P-site offsets <P-site offset>`
are applied.
OUTBASE_K_rawcounts.txt
Saved if ``--keep`` is given on command line. Raw count vectors for each
:term:`metagene` window specified in input ROI file, without P-site
mapping rules applied, for reads of length `K`
OUTBASE_K_normcounts.txt
Saved if ``--keep`` is given on command line. Normalized count vectors
for each metagene window specified in input ROI file, without P-site
mapping rules applied, for reads of length `K`
where `OUTBASE` is supplied by the user.
"""
import sys
import argparse
import inspect
import warnings
import matplotlib
matplotlib.use("Agg")
import numpy
import pandas as pd
import matplotlib.pyplot as plt
from collections import OrderedDict
from plastid.util.scriptlib.argparsers import (
AlignmentParser,
PlottingParser,
BaseParser,
)
from plastid.genomics.roitools import SegmentChain
from plastid.util.io.openers import get_short_name, argsopener, NullWriter, opener
from plastid.util.io.filters import NameDateWriter
from plastid.util.scriptlib.help_formatters import format_module_docstring
from plastid.util.services.exceptions import ArgumentWarning
# to handle deprecated command-line arguments
from plastid.bin.metagene import _get_norm_region
warnings.simplefilter("once")
printer = NameDateWriter(get_short_name(inspect.stack()[-1][1]))
[docs]def do_count(
roi_table,
ga,
norm_start,
norm_end,
min_counts,
min_len,
max_len,
aggregate=False,
printer=NullWriter()
):
"""Calculate a :term:`metagene profile` for each read length in the dataset
Parameters
----------
roi_table : :class:`pandas.DataFrame`
Table specifying regions of interest, generated
by :py:func:`plastid.bin.metagene.do_generate`
ga : |BAMGenomeArray|
Count data
norm_start : int
Coordinate in window specifying normalization region start
norm_end : int
Coordinate in window specifying normalization region end
min_counts : float
Minimum number of counts in `window[norm_start:norm_end]`
required for inclusion in metagene profile
min_len : int
Minimum read length to include
max_len : int
Maximum read length to include
aggregate : bool, optional
Estimate P-site from aggregate reads at each position, instead of median
normalized read density. Potentially noisier, but helpful for lower-count
data or read lengths with few counts. (Default: False)
printer : file-like, optional
filehandle to write logging info to (Default: :func:`~plastid.util.io.openers.NullWriter`)
Returns
-------
dict
Dictionary of :class:`numpy.ndarray` s of raw counts at each position (column)
for each window (row)
dict
Dictionary of :class:`numpy.ndarray` s of normalized counts at each position (column)
for each window (row), normalized by the total number of counts in that row
from `norm_start` to `norm_end`
:class:`pandas.DataFrame`
Metagene profile of median normalized counts at each position across
all windows, and the number of windows included in the calculation of each
median, stratified by read length
"""
window_size = roi_table["window_size"][0]
upstream_flank = roi_table["zero_point"][0]
raw_count_dict = OrderedDict()
norm_count_dict = OrderedDict()
shape = (len(roi_table), window_size)
for i in range(min_len, max_len + 1):
# mask all by default
raw_count_dict[i] = numpy.ma.MaskedArray(
numpy.tile(numpy.nan, shape), mask=numpy.tile(True, shape), dtype=float
)
for i, row in roi_table.iterrows():
if i % 1000 == 0:
printer.write("Counted %s ROIs ..." % (i + 1))
roi = SegmentChain.from_str(row["region"])
mask = SegmentChain.from_str(row["masked"])
roi.add_masks(*mask)
valid_mask = roi.get_masked_counts(ga).mask
offset = int(round((row["alignment_offset"])))
assert offset + roi.length <= window_size
count_vectors = {}
for k in raw_count_dict:
count_vectors[k] = []
for seg in roi:
reads = ga.get_reads(seg)
read_dict = {}
for k in raw_count_dict:
read_dict[k] = []
for read in filter(lambda x: len(x.positions) in read_dict, reads):
read_dict[len(read.positions)].append(read)
for k in read_dict:
count_vector = ga.map_fn(read_dict[k], seg)[1]
count_vectors[k].extend(count_vector)
for k in raw_count_dict:
if roi.strand == "-":
count_vectors[k] = count_vectors[k][::-1]
raw_count_dict[k].data[i, offset:offset + roi.length] = numpy.array(count_vectors[k])
raw_count_dict[k].mask[i, offset:offset + roi.length] = valid_mask
profile_table = {"x": numpy.arange(-upstream_flank, window_size - upstream_flank)}
printer.write("Counted %s ROIs total." % (i + 1))
for k in raw_count_dict:
k_raw = raw_count_dict[k]
denominator = numpy.nansum(k_raw[:, norm_start:norm_end], axis=1)
norm_count_dict[k] = (k_raw.T.astype(float) / denominator).T
# copy mask from raw counts, then add nans and infs
norm_counts = numpy.ma.MaskedArray(norm_count_dict[k], mask=k_raw.mask)
norm_counts.mask[numpy.isnan(norm_counts)] = True
norm_counts.mask[numpy.isinf(norm_counts)] = True
with warnings.catch_warnings():
# ignore numpy mean of empty slice warning, given by numpy in Python 2.7-3.4
warnings.filterwarnings("ignore", ".*mean of empty.*", RuntimeWarning)
try:
if aggregate == False:
profile = numpy.ma.median(norm_counts[denominator >= min_counts], axis=0)
else:
profile = numpy.nansum(k_raw[denominator >= min_counts], axis=0)
# in numpy under Python3.5, this is an IndexError instead of a warning
except IndexError:
profile = numpy.zeros_like(profile_table["x"], dtype=float)
# in new versions of numpy, this is a ValueEror instead of an IndexError
except ValueError:
profile = numpy.zeros_like(profile_table["x"], dtype=float)
num_genes = ((~norm_counts.mask)[denominator >= min_counts]).sum(0)
profile_table["%s-mers" % k] = profile
profile_table["%s_regions_counted" % k] = num_genes
profile_table = pd.DataFrame(profile_table)
return raw_count_dict, norm_count_dict, profile_table
[docs]def main(argv=sys.argv[1:]):
"""Command-line program
Parameters
----------
argv : list, optional
A list of command-line arguments, which will be processed
as if the script were called from the command line if
:py:func:`main` is called directrly.
Default: `sys.argv[1:]`. The command-line arguments, if the script is
invoked from the command line
"""
ap = AlignmentParser(
allow_mapping=False, input_choices=["BAM"], disabled=[
"normalize",
"big_genome",
]
)
bp = BaseParser()
alignment_file_parser = ap.get_parser()
base_parser = bp.get_parser()
pp = PlottingParser()
plotting_parser = pp.get_parser()
parser = argparse.ArgumentParser(
description=format_module_docstring(__doc__),
formatter_class=argparse.RawDescriptionHelpFormatter,
parents=[base_parser, alignment_file_parser, plotting_parser]
)
parser.add_argument(
"--min_counts",
type=int,
default=10,
metavar="N",
help="Minimum counts required in normalization region " +
"to be included in metagene average (Default: 10)"
)
parser.add_argument(
"--normalize_over",
type=int,
nargs=2,
metavar="N",
default=None,
#default=(20,50),
help="Portion of each window against which its individual raw count profile" +
" will be normalized. Specify two integers, in nucleotide" +
" distance from landmark (negative for upstream, positive for downstream. Surround negative numbers with quotes.). (Default: 20 50)"
)
parser.add_argument(
"--norm_region",
type=int,
nargs=2,
metavar="N",
default=None,
help="Deprecated. Use ``--normalize_over`` instead. " +
"Formerly, Portion of each window against which its individual raw count profile" +
" will be normalized. Specify two integers, in nucleotide" +
" distance, from 5\' end of window. (Default: 70 100)"
)
parser.add_argument(
"--require_upstream",
default=False,
action="store_true",
help="If supplied, the P-site offset is taken to be the distance " +
"between the largest peak upstream of the start codon and " +
"the start codon itself. Otherwise, the P-site offset is taken " +
"to be the distance between the largest peak in the entire ROI " +
"and the start codon. Ignored if ``--constrain`` is used."
)
parser.add_argument(
"--constrain",
type=int,
nargs=2,
default=None,
metavar="X",
help="Constrain P-site offset to be between specified distance from " +
"start codon. Useful for noisy data. " + "(Reasonable set: 10 15; default: not constrained)"
)
parser.add_argument(
"--aggregate",
default=False,
action="store_true",
help="Estimate P-site from aggregate reads at each position, instead " +
"of median normalized read density. Noisier, but helpful for " +
"lower-count data or read lengths with few counts. (Default: False)"
),
parser.add_argument(
"--keep",
default=False,
action="store_true",
help="Save intermediate count files. Useful for additional computations (Default: False)"
)
parser.add_argument(
"--default",
type=int,
default=13,
help=
"Default 5\' P-site offset for read lengths that are not present or evaluated in the dataset. Unaffected by ``--constrain`` (Default: 13)"
)
parser.add_argument(
"roi_file",
type=str,
help="ROI file surrounding start codons, from ``metagene generate`` subprogram"
)
parser.add_argument("outbase", type=str, help="Basename for output files")
# set manual options
args = parser.parse_args(argv)
bp.get_base_ops_from_args(args)
# set defaults
args.mapping = "fiveprime"
args.offset = 0
args.nibble = 0
# process arguments
min_len = args.min_length
max_len = args.max_length
profiles = max_len + 1 - min_len
lengths = list(range(min_len, max_len + 1))
outbase = args.outbase
title = "Fiveprime read offsets by length" if args.title is None else args.title
pp.set_style_from_args(args)
colors = pp.get_colors_from_args(args, profiles)
printer.write("Opening ROI file %s ..." % args.roi_file)
with opener(args.roi_file) as roi_fh:
roi_table = pd.read_table(roi_fh, sep="\t", comment="#", index_col=None, header=0)
roi_fh.close()
printer.write("Opening count files %s ..." % ",".join(args.count_files))
ga = ap.get_genome_array_from_args(args, printer=printer)
# remove default size filters
my_filters = ga._filters.keys()
for f in my_filters:
ga.remove_filter(f)
norm_start, norm_end = _get_norm_region(roi_table, args)
# count
count_dict, norm_count_dict, metagene_profile = do_count(
roi_table,
ga,
norm_start,
norm_end,
args.min_counts,
min_len,
max_len,
aggregate=args.aggregate,
printer=printer
)
# save counts
profile_fn = "%s_metagene_profiles.txt" % outbase
with argsopener(profile_fn, args, "w") as metagene_out:
metagene_profile.to_csv(
metagene_out,
sep="\t",
header=True,
index=False,
na_rep="nan",
columns=["x"] + ["%s-mers" % X for X in lengths]
)
metagene_out.close()
if args.keep == True:
printer.write("Saving raw and normalized counts ...")
for k in count_dict:
count_fn = "%s_%s_rawcounts.txt.gz" % (outbase, k)
normcount_fn = "%s_%s_normcounts.txt.gz" % (outbase, k)
mask_fn = "%s_%s_mask.txt.gz" % (outbase, k)
numpy.savetxt(count_fn, count_dict[k], delimiter="\t")
numpy.savetxt(normcount_fn, norm_count_dict[k], delimiter="\t")
numpy.savetxt(mask_fn, norm_count_dict[k].mask, delimiter="\t")
# plotting & offsets
printer.write("Plotting and determining offsets ...")
offset_dict = OrderedDict()
# Determine scaling factor for plotting metagene profiles
max_y = numpy.nan
with warnings.catch_warnings():
# ignore warnings for slices that contain only NaNs
warnings.simplefilter("ignore", category=RuntimeWarning)
for k in lengths:
max_y = numpy.nanmax([max_y, numpy.nanmax(metagene_profile["%s-mers" % k].values)])
if numpy.isnan(max_y) or max_y == 0:
max_y = 1.0
# parse arguments & set styles
mplrc = matplotlib.rcParams
plt_incr = 1.2
# use this figsize if not specified on command line
figheight = 1.0 + 0.25 * (profiles - 1) + 0.75 * (profiles)
default_figsize = (7.5, figheight)
fig = pp.get_figure_from_args(args, figsize=default_figsize)
ax = plt.gca()
plt.title(title)
plt.xlabel("Distance from CDS start, (nt; 5' end mapping)")
if args.aggregate == True:
plt.ylabel("Aggregate read counts (au)")
else:
plt.ylabel("Median normalized read density (au)")
plt.axvline(0.0, color=mplrc["axes.edgecolor"], dashes=[3, 2])
x = metagene_profile["x"].values
xmin = x.min()
xmax = x.max()
if args.constrain is not None:
mask = numpy.tile(True, len(x))
zp = (x == 0).argmax()
l, r = args.constrain
if l == r:
warnings.warn(
"Minimum and maximum distance constraints are equal (both '%s')." % l,
ArgumentWarning
)
mindist = min(l, r)
maxdist = max(l, r)
mask[zp - maxdist:zp - mindist + 1] = False
elif args.require_upstream == True:
mask = x >= 0
else:
mask = numpy.tile(False, len(x))
for n, k in enumerate(lengths):
color = colors[n]
baseline = plt_incr * n
y = metagene_profile["%s-mers" % k].values
#ymask = y[mask]
ymask = numpy.ma.MaskedArray(y, mask=mask)
if numpy.isnan(y).all():
plot_y = numpy.zeros_like(x)
else:
if args.aggregate == False:
plot_y = y / max_y
else:
plot_y = y.astype(float) / numpy.nanmax(y) * 0.9
# plot metagene profiles on common scale, offset by baseline from bottom to top
ax.plot(x, baseline + plot_y, color=color)
ax.text(
xmin,
baseline,
"%s-mers" % k,
ha="left",
va="bottom",
color=color,
transform=matplotlib.transforms.offset_copy(
ax.transData, fig, x=6.0, y=3.0, units="points",
)
)
ymax = baseline + numpy.nanmax(plot_y)
# if all valid positions are nan, or if all valid positions are <= 0
if (~mask).sum() == numpy.isnan(ymask).sum() or numpy.nanmax(ymask) == 0:
offset = args.default
usedefault = True
else:
offset = -x[numpy.ma.argmax(ymask)]
usedefault = False
offset_dict[k] = offset
if usedefault == False:
yadj = ymax - 0.2 * plt_incr
ax.plot([-offset, 0], [yadj, yadj], color=color, dashes=[3, 2])
ax.text(
-offset / 2.0,
yadj,
"%s nt" % (offset),
color=color,
ha="center",
va="bottom",
transform=matplotlib.transforms.offset_copy(
ax.transData, fig, x=0.0, y=3.0, units="points",
)
)
plt.xlim(xmin, xmax)
plt.ylim(-0.1, plt_incr + baseline)
ax.yaxis.set_ticks([])
# save data as p-site offset table
fn = "%s_p_offsets.txt" % outbase
fout = argsopener(fn, args)
printer.write("Writing offset table to %s ..." % fn)
fout.write("length\tp_offset\n")
for k in offset_dict:
fout.write("%s\t%s\n" % (k, offset_dict[k]))
fout.write("default\t%s" % args.default)
fout.close()
# save plot
plot_fn = "%s_p_offsets.%s" % (outbase, args.figformat)
printer.write("Saving plot to %s ..." % plot_fn)
plt.savefig(plot_fn, dpi=args.dpi, bbox_inches="tight")
printer.write("Done.")
if __name__ == "__main__":
main()