Source code for plastid.bin.cs

#!/usr/bin/env python
"""Count the number of :term:`read alignments<alignment>` and calculate
read densities (in :term:`RPKM`) over genes, correcting gene boundaries
for overlap with other genes or regions specified in a :term:`mask file`.

:term:`Counts <counts>` and densities are calculated separately per gene for
exons, 5' UTRs, coding regions, and 3' UTRs. In addition, positions overlapped
by multiple genes are excluded, as are positions annotated in
:term:`mask annotation files<crossmap>`, if one is provided.

The script's operation is divided into three subprograms:

Generate
    The :func:`generate <do_generate>` mode pre-process a genome annotation
    as follows:

     #. All genes whose transcripts share exact exons are collapsed to
        "merged" genes.

     #. Positions covered by more than one merged gene on the same strand
        are excluded from analysis, and subtracted from each merged genes.

     #. Remaining positions in each merged gene are then divided
        into the following groups:

            *exon*
                all positions in all transcripts mapping to the merged gene

            *CDS*
                positions which appear in coding regions in *all* transcript
                isoforms mapping to the merged gene. i.e. These positions
                are never part of a fiveprime or threeprime UTR in *any*
                transcript mapping to the merged gene

            *UTR5*
                positions which are annotated only as *5' UTR* in all
                transcript isoforms mapping to the merged gene

            *UTR3*
                positions which are annotated only as *3 UTR* in all
                transcript isoforms mapping to the merged gene

            *masked*
                positions excluded from analyses as directed in an optional
                :term:`mask file`

    The following files are output, where `OUTBASE` is a name supplied
    by the user:

        OUTBASE_gene.positions
            Tab-delimited text file. Each line is a merged gene, and columns
            indicate the genomic coordinates and lengths of each of the position
            sets above.

        OUTBASE_transcript.positions
            Tab-delimited text file. Each line is a transcript, and columns
            indicate the genomic coordinates and lengths of each of the position
            sets above.

        OUTBASE_gene_REGION.bed
             `BED`_ files showing position sets for `REGION`,
             where `REGION` is one of *exon*, *CDS*, *UTR5*, and *UTR3* or
             *masked*. These contain the same information in
             ``OUTBASE_gene.positions``, but can be visualized easily in a
             :term:`genome browser`


Count
    The :func:`count <do_count>` mode counts the number and density of
    read alignments in each sub-region (*exon*, *CDS*, *UTR5*, and *UTR3*)
    of each gene.


Chart
    The :func:`chart <do_chart>` mode takes output from one or more samples
    run under :func:`count <do_count>` mode and generates several tables and
    charts that provide broad overviews of the data.

See command-line help for each subprogram for details on each mode

See also
--------
:mod:`~plastid.bin.counts_in_region` script
    Calculate  the number and density  of :term:`read alignments <alignment>`
    covering any set of regions of interest, making no corrections for gene
    boundaries.
"""
import os
import sys
import argparse
import itertools
import copy
import inspect
import gc
import warnings

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy
import pandas as pd
import scipy.stats

from plastid.util.scriptlib.argparsers import AnnotationParser,\
                                              AlignmentParser,\
                                              MaskParser,\
                                              PlottingParser, \
                                              BaseParser

from plastid.util.scriptlib.help_formatters import format_module_docstring

from plastid.genomics.roitools import positions_to_segments, SegmentChain
from plastid.genomics.genome_hash import GenomeHash
from plastid.util.io.openers import opener, get_short_name, argsopener, read_pl_table
from plastid.util.io.filters import NameDateWriter
from plastid.util.services.sets import merge_sets
from plastid.util.services.decorators import skipdoc
from plastid.util.services.exceptions import DataWarning, FileFormatWarning
import numpy.ma as ma
from plastid.plotting.plots import scatterhist_xy, ma_plot, clean_invalid
from plastid.plotting.colors import process_black
from plastid.readers.bigbed import BigBedReader

printer = NameDateWriter(get_short_name(inspect.stack()[-1][1]))

#===============================================================================
# 'generate' subprogram
#===============================================================================


@skipdoc
def write_output_files(table, title, args):
    """Write gene info table from :py:func:`do_generate` to several output files:

    OUTBASE_gene.positions
        Tab-delimited text file. Each line is a merged gene, and columns
        indicate the genomic coordinates and lengths of each of the position
        sets above.

    OUTBASE_transcript.positions
        Tab-delimited text file. Each line is a transcript, and columns
        indicate the genomic coordinates and lengths of each of the position
        sets above.

    OUTBASE_gene_REGION.bed
         `BED`_ files showing position sets for `REGION`,
         where `REGION` is one of *exon*, *utr5*, *cds*, *utr3*, or
         *masked*. These contain the same information in
         ``OUTBASE_gene.positions``, but can be visualized easily in a
         :term:`genome browser`


    Parameters
    ----------
    table : :class:`pandas.DataFrame`
        Gene info table made in :py:func:`do_generate`

    title : str
        Title ("gene" or "transcript")

    args : :py:class:`argparse.Namespace`
        Command-line arguments
    """
    keys = ("utr5", "utr3", "cds", "masked", "exon")
    bed_columns = ["%s_bed" % K for K in keys]

    bedfiles = {X: argsopener("%s_%s_%s.bed" % (args.outbase, title, X), args) for X in keys}

    for _, row in table[bed_columns].iterrows():
        for k in keys:
            bedfiles[k].write(row["%s_bed" % k])

    for k in bedfiles:
        bedfiles[k].close()

    pos_out = argsopener("%s_%s.positions" % (args.outbase, title), args)
    table.to_csv(
        pos_out,
        sep          = "\t",
        header       = True,
        index        = False,
        na_rep       = "nan",
        float_format = "%.8f",
        columns      = [
            "region", "exon", "utr5", "cds", "utr3", "masked", "exon_unmasked",
            "transcript_ids"
        ]) # yapf: disable
    pos_out.close()


[docs]def merge_genes(tx_ivcs):
    """Merge genes whose transcripts share exons into a combined, "merged" gene

    Parameters
    ----------
    tx_ivcs : dict
        Dictionary mapping unique transcript IDs to |Transcripts|


    Returns
    -------
    dict
        Dictionary mapping raw gene names to the names of the merged genes
    """
    dout = {}
    exondicts = {"+": {}, "-": {}}

    # backmap exons to genes as tuples
    printer.write("Mapping exons to genes ...")
    for txid in tx_ivcs.keys():
        my_segmentchain = tx_ivcs[txid]
        my_gene = my_segmentchain.get_gene()
        chrom = my_segmentchain.spanning_segment.chrom
        strand = my_segmentchain.spanning_segment.strand
        for my_iv in my_segmentchain:
            start = my_iv.start
            end = my_iv.end
            # separate exons by chromosome and strand to reduce
            # downstream comparisons in merging
            if chrom not in exondicts[strand]:
                exondicts[strand][chrom] = {}
            try:
                exondicts[strand][chrom][(start, end)].append(my_gene)
            except KeyError:
                exondicts[strand][chrom][(start, end)] = [my_gene]

    for strand in exondicts:
        for chrom in exondicts[strand]:
            exondicts[strand][chrom] = {K: set(V) for K, V in exondicts[strand][chrom].items()}
            printer.write("Flattening genes on %s(%s) ..." % (chrom, strand))
            gene_groups = merge_sets(exondicts[strand][chrom].values(), printer=printer)

            for group in gene_groups:
                merged_name = ",".join(sorted(group))
                for gene in group:
                    dout[gene] = merged_name

    printer.write("Flattened to %s groups." % len(dout))
    return dout


[docs]def process_partial_group(transcripts, mask_hash, printer):
    """Correct boundaries of merged genes, as described in :func:`do_generate`

    Parameters
    ----------
    transcripts : dict
        Dictionary mapping unique transcript IDs to |Transcripts|.
        This set should be complete in the sense that it should contain
        all transcripts that have any chance of mutually overlapping
        each other (e.g. all on same chromosome and strand).

    mask_hash : |GenomeHash|
        |GenomeHash| of regions to exclude from analysis


    Returns
    -------
    :class:`pandas.DataFrame`
        Table of merged gene positions

    :class:`pandas.DataFrame`
        Table of adjusted transcript positions

    :class:`dict`
        Dictionary mapping raw gene names to merged gene names
    """
    gene_table = {
        "region"          : [],
        "transcript_ids"  : [],
        "exon_unmasked"   : [],
        "exon"            : [],
        "masked"          : [],
        "utr5"            : [],
        "cds"             : [],
        "utr3"            : [],
        "exon_bed"        : [],
        "utr5_bed"        : [],
        "cds_bed"         : [],
        "utr3_bed"        : [],
        "masked_bed"      : [],
    } # yapf: disable

    # data table for transcripts
    transcript_table = {
        "region"          : [],
        "exon"            : [],
        "utr5"            : [],
        "cds"             : [],
        "utr3"            : [],
        "masked"          : [],
        "exon_unmasked"   : [],
        "transcript_ids"  : [],
        "exon_bed"        : [],
        "utr5_bed"        : [],
        "cds_bed"         : [],
        "utr3_bed"        : [],
        "masked_bed"      : [],
    } # yapf: disable

    keycombos = list(itertools.permutations(("utr5", "cds", "utr3"), 2))

    # merge genes that share exons & write output
    printer.write("Collapsing genes that share exons ...")
    merged_genes = merge_genes(transcripts)

    # remap transcripts to merged genes
    # and vice-versa
    merged_gene_tx = {}
    tx_merged_gene = {}
    printer.write("Mapping transcripts to merged genes...")
    for txid in transcripts:
        my_tx = transcripts[txid]
        my_gene = my_tx.get_gene()
        my_merged = merged_genes[my_gene]
        tx_merged_gene[txid] = my_merged
        try:
            merged_gene_tx[my_merged].append(txid)
        except KeyError:
            merged_gene_tx[my_merged] = [txid]

    # flatten merged genes
    printer.write("Flattening merged genes, masking positions, and labeling subfeatures ...")
    for n, (gene_id, my_txids) in enumerate(merged_gene_tx.items()):
        if n % 1000 == 0 and n > 0:
            printer.write("    %s genes ..." % n)

        my_gene_positions = []
        chroms = []
        strands = []
        for my_txid in my_txids:
            my_segmentchain = transcripts[my_txid]
            chroms.append(my_segmentchain.chrom)
            strands.append(my_segmentchain.strand)
            my_gene_positions.extend(my_segmentchain.get_position_list())

            try:
                assert len(set(chroms)) == 1
            except AssertionError:
                printer.write(
                    "Skipping gene %s which contains multiple chromosomes: %s" %
                    (gene_id, ",".join(chroms))
                )

            try:
                assert len(set(strands)) == 1
            except AssertionError:
                printer.write(
                    "Skipping gene %s which contains multiple strands: %s" %
                    (gene_id, ",".join(strands))
                )

        my_gene_positions = set(my_gene_positions)
        gene_ivc_raw = SegmentChain(
            *positions_to_segments(chroms[0], strands[0], my_gene_positions)
        )
        gene_table["region"].append(gene_id)
        gene_table["transcript_ids"].append(",".join(sorted(my_txids)))
        gene_table["exon_unmasked"].append(gene_ivc_raw)

    printer.write("    %s genes total." % (n + 1))

    # mask genes
    printer.write("Masking positions and labeling subfeature positions ...")
    gene_hash = GenomeHash(gene_table["exon_unmasked"], do_copy=False)

    for n, (gene_id, gene_ivc_raw) in enumerate(zip(gene_table["region"],
                                                    gene_table["exon_unmasked"])):
        if n % 2000 == 0:
            printer.write("    %s genes ..." % n)

        my_chrom = gene_ivc_raw.spanning_segment.chrom
        my_strand = gene_ivc_raw.spanning_segment.strand

        masked_positions = []
        nearby_genes = gene_hash[gene_ivc_raw]

        # don't mask out positions from identical gene
        gene_ivc_raw_positions = gene_ivc_raw.get_position_set()
        nearby_genes = [X for X in nearby_genes if X.get_position_set() != gene_ivc_raw_positions]
        for gene in nearby_genes:
            masked_positions.extend(gene.get_position_list())

        nearby_masks = mask_hash[gene_ivc_raw]
        for mask in nearby_masks:
            masked_positions.extend(mask.get_position_list())

        masked_positions = set(masked_positions)

        gene_positions_raw = gene_ivc_raw.get_position_set()
        mask_ivc_positions = gene_positions_raw & masked_positions
        total_mask_ivc = SegmentChain(
            *positions_to_segments(my_chrom, my_strand, mask_ivc_positions),
            ID=gene_id
        )
        gene_table["masked"].append(total_mask_ivc)
        gene_table["masked_bed"].append(total_mask_ivc.as_bed())

        gene_post_mask = gene_positions_raw - masked_positions
        gene_post_mask_ivc = SegmentChain(
            *positions_to_segments(my_chrom, my_strand, gene_post_mask),
            ID=gene_id
        )
        gene_table["exon"].append(gene_post_mask_ivc)
        gene_table["exon_bed"].append(gene_post_mask_ivc.as_bed())

        masked_positions = total_mask_ivc.get_position_set()
        tmp_positions = {
            "utr5" : set(),
            "cds"  : set(),
            "utr3" : set(),
        } # yapf: disable
        txids = sorted(merged_gene_tx[gene_id])
        chrom = gene_post_mask_ivc.chrom
        strand = gene_post_mask_ivc.strand

        # pool transcript positions
        for txid in txids:
            transcript = transcripts[txid]

            # yapf: disable
            utr5pos = transcript.get_utr5().get_position_set()
            cdspos  = transcript.get_cds().get_position_set()
            utr3pos = transcript.get_utr3().get_position_set()

            tmp_positions["utr5"] |= utr5pos
            tmp_positions["cds"]  |= cdspos
            tmp_positions["utr3"] |= utr3pos
            # yapf: enable

        # eliminate positions in which CDS & UTRs overlap from each transcript
        for txid in txids:
            transcript = transcripts[txid]
            transcript_positions = {
                "utr5": transcript.get_utr5().get_position_set(),
                "cds" : transcript.get_cds().get_position_set(),
                "utr3": transcript.get_utr3().get_position_set(),
            } # yapf: disable

            for key1, key2 in keycombos:
                transcript_positions[key1] -= tmp_positions[key2]
                transcript_positions[key1] -= masked_positions

            transcript_table["region"].append(txid)

            # all unmasked positions
            my_chain = SegmentChain(
                *positions_to_segments(
                    chrom,
                    strand,
                    transcript.get_position_set() - masked_positions
                ),
                ID=txid
            )
            transcript_table["exon"].append(str(my_chain))
            transcript_table["exon_bed"].append(my_chain.as_bed())

            # all uniquely-labeled unmasked positions
            for k, v in transcript_positions.items():
                my_chain = SegmentChain(*positions_to_segments(chrom, strand, v), ID=txid)
                transcript_table[k].append(str(my_chain))
                transcript_table["%s_bed" % k].append(my_chain.as_bed())

            total_mask_ivc.attr["ID"] = txid
            transcript_table["masked"].append(str(total_mask_ivc))
            transcript_table["masked_bed"].append(total_mask_ivc.as_bed())
            transcript_table["exon_unmasked"].append(str(transcript))
            transcript_table["transcript_ids"].append(txid)

        tmp_positions2 = copy.deepcopy(tmp_positions)
        for k1, k2 in keycombos:
            tmp_positions[k1] -= tmp_positions2[k2]
            tmp_positions[k1] -= masked_positions

        for k in (tmp_positions.keys()):
            my_chain = SegmentChain(
                *positions_to_segments(chrom, strand, tmp_positions[k]), ID=gene_id
            )
            gene_table[k].append(str(my_chain))
            gene_table["%s_bed" % k].append(my_chain.as_bed())

    printer.write("    %s genes total." % (n + 1))

    # cast SegmentChains/Transcripts to strings to keep numpy from unpacking them
    conversion_keys = ["exon", "utr5", "cds", "utr3", "masked", "exon_unmasked"]
    for k in conversion_keys:
        gene_table[k] = [str(X) for X in gene_table[k]]
        transcript_table[k] = [str(X) for X in transcript_table[k]]

    gene_df = pd.DataFrame(gene_table)
    gene_df.sort_values(["region"], inplace=True)

    transcript_df = pd.DataFrame(transcript_table)
    transcript_df.sort_values(["region"], inplace=True)

    return gene_df, transcript_df, merged_genes


[docs]def do_generate(args, annotation_parser, mask_parser):
    """Generate gene position files from gene annotations.

     1. Genes whose transcripts share exons are first collapsed into merged
        genes.

     2. Within merged genes, all positions are classified. All positions are
        included in a set called *exon*. All positions that appear as coding
        regions in all transcripts (i.e. are never part of a 5'UTR or 3'UTR)
        included in a set called *CDS*. Similarly, all positions that appear
        as 5' UTR or 3' UTR in all transcripts are included in sets called
        *UTR5* or *UTR3*, respectively.

     3. Genomic positions that are overlapped by multiple merged genes are
        excluded from the position sets for those genes.

     4. If a :term:`mask file` is supplied, positions annotated in the mask file
        are also excluded

     5. Output is given as a series of `BED`_ files and a `positions` file
        containing the same data.

    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        command-line arguments for ``generate`` subprogram
    """
    # variables for transcript <-> merged gene mapping
    transcripts = {}
    merged_genes = {}

    # data table for merged genes
    gene_table = pd.DataFrame({
            "region"          : [],
            "transcript_ids"  : [],
            "exon_unmasked"   : [],
            "exon"            : [],
            "masked"          : [],
            "utr5"            : [],
            "cds"             : [],
            "utr3"            : [],
            "exon_bed"        : [],
            "utr5_bed"        : [],
            "cds_bed"         : [],
            "utr3_bed"        : [],
            "masked_bed"      : [],
        }) # yapf: disable

    # data table for transcripts
    transcript_table = pd.DataFrame({
        "region"          : [],
        "exon"            : [],
        "utr5"            : [],
        "cds"             : [],
        "utr3"            : [],
        "exon_bed"        : [],
        "utr5_bed"        : [],
        "cds_bed"         : [],
        "utr3_bed"        : [],
        "masked"          : [],
        "exon_unmasked"   : [],
        "transcript_ids"  : [],
        "masked_bed"      : [],
    }) # yapf: disable

    # data
    is_sorted = (args.sorted == True) or \
                (args.tabix == True) or \
                (args.annotation_format == "BigBed")

    annotation_message = (
        "`cs` relies upon relationships between transcripts and genes "
        "to collapse transcripts to genes for quantitation. "
        "Gene-transcript relationships are not generally preserved in BED "
        "or BigBed files, and a `gene_id` column could not be found in the "
        "input data. This may yield nonsensical results in the output. "
        "Consider either (1) using a GTF2 or GFF3 file or (2) creating an "
        "extended BED or BigBed file with a `gene_id` column."
    )

    if args.annotation_format == "BED":
        if not isinstance(args.bed_extra_columns, list) \
            or 'gene_id' not in args.bed_extra_columns:
            warnings.warn(annotation_message, FileFormatWarning)

    elif args.annotation_format == "BigBed":
        reader = BigBedReader(args.annotation_files[0])
        if 'gene_id' not in reader.extension_fields:
            warnings.warn(annotation_message, FileFormatWarning)

    source = iter(
        annotation_parser.get_transcripts_from_args(args, printer=printer)
    )
    mask_hash = mask_parser.get_genome_hash_from_args(args)

    # loop conditions
    last_chrom = None
    do_loop = True

    # to save memory, we process one chromosome at a time if input file is sorted
    # knowing that at that moment all transcript parts are assembled
    while do_loop == True:
        try:
            tx = next(source)
        except StopIteration:
            do_loop = False

        try:
            # if chromosome is completely processed or EOF
            if (is_sorted and tx.spanning_segment.chrom != last_chrom) \
                or do_loop == False:
                if do_loop == True:
                    source = itertools.chain([tx], source)

                if last_chrom is not None or do_loop == False:
                    printer.write("Merging genes on chromosome/contig '%s'" % last_chrom)
                    my_gene_table, my_transcript_table, my_merged_genes = process_partial_group(
                        transcripts,
                        mask_hash,
                        printer,
                    )
                    gene_table = pd.concat((gene_table, my_gene_table), axis=0)
                    transcript_table = pd.concat((transcript_table, my_transcript_table), axis=0)
                    merged_genes.update(my_merged_genes)

                del transcripts
                gc.collect()
                del gc.garbage[:]
                transcripts = {}

                # reset last chrom
                last_chrom = tx.spanning_segment.chrom

            # otherwise, remember transcript
            else:
                transcripts[tx.get_name()] = tx

        # exit gracefully if no transcripts found
        except UnboundLocalError:
            pass

    # write output
    printer.write("Writing output ...")

    merged_fn = "%s_merged.txt" % args.outbase
    number_merged = len(set(merged_genes.values()))
    printer.write(
        "Collapsed %s genes to %s merged groups. Writing to %s" %
        (len(merged_genes), number_merged, merged_fn)
    )
    fout = argsopener(merged_fn, args, "w")
    for gene, merged_name in sorted(merged_genes.items()):
        fout.write("%s\t%s\n" % (gene, merged_name))

    fout.close()

    printer.write("Writing gene table and BED files ...")
    write_output_files(gene_table, "gene", args)
    printer.write("Writing transcript summary table and BED files ...")
    write_output_files(transcript_table, "transcript", args)
    printer.write("Done!")


#===============================================================================
# 'count' subprogram
#===============================================================================


[docs]def do_count(args, alignment_parser):
    """Count the number and density covering each merged gene in an annotation
    made made using the `generate` subcommand).

    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        command-line arguments for ``count`` subprogram
    """
    # we expect many zero-lenght segmentchains, so turn these off for now
    warnings.filterwarnings(
        "ignore",
        ".*zero-length SegmentChain.*",
    )

    keys = ("exon", "utr5", "cds", "utr3")
    column_order = ["region"]
    gene_positions = read_pl_table(args.position_file)

    # read count files
    ga = alignment_parser.get_genome_array_from_args(args, printer=printer)
    total_counts = ga.sum()

    normconst = 1000.0 * 1e6 / total_counts

    printer.write("Dataset has %s counts in it." % total_counts)
    printer.write("Tallying genes ...")

    dtmp = {"region": []}
    for x in keys:
        for y in ("reads", "length", "rpkm"):
            label = "%s_%s" % (x, y)
            dtmp[label] = []
            column_order.append(label)

    for i, name in enumerate(gene_positions["region"]):
        dtmp["region"].append(name)
        if i % 500 == 0:
            printer.write("Processed %s genes ..." % i)

        for k in keys:
            ivc = SegmentChain.from_str(gene_positions[k][i])
            total = sum(ivc.get_counts(ga))
            length = ivc.length
            rpkm = (normconst * total / length) if length > 0 else numpy.nan
            dtmp["%s_reads" % k].append(total)
            dtmp["%s_length" % k].append(length)
            dtmp["%s_rpkm" % k].append(rpkm)

    fout = argsopener("%s.txt" % args.outbase, args, "w")
    dtmp = pd.DataFrame(dtmp)
    dtmp.to_csv(
        fout,
        sep     = "\t",
        header  = True,
        index   = False,
        columns = column_order,
        na_rep  = "nan",
        float_format="%.8f"
    ) # yapf: disable

    fout.close()
    printer.write("Done.")


#===============================================================================
# 'chart' subprogram
#===============================================================================


@skipdoc
def read_count_file(fh, genes_to_include=None):
    """Read rows selected by gene name from a count file

    Parameters
    ----------
    fname : str
        Name of output file from cs.py

    genes_to_include : list or None
        List of genes to include in output.
        If `None`, all are included

    Returns
    -------
    :class:`pandas.DataFrame`

        Count data
    """
    df = read_pl_table(fh)
    if genes_to_include is not None:
        import_mask = numpy.array(
            [True if X in genes_to_include else False for X in df["region"]]
        )
        return df[import_mask]
    else:
        return df


@skipdoc
def get_bin_mask_by_summed_key(rep_a, rep_b, bins, key="exon_reads"):
    """Bins genes into groups based upon the summed counts in samples `rep_a`
    and `rep_b`

    Parameters
    ----------
    rep_a : :class:`pandas.DataFrame`
        replicate a, from :func:`read_count_file`

    rep_b : :class:`pandas.DataFrame`
        replicate b, from :func:`read_count_file`

    bins : :class:`numpy.ndarray`
        sequence of bins, specified as floors

    key : str, optional
        quantity on which to perform binning (Default: `exon_reads`)

    Returns
    -------
    dict[bin]
        = numpy boolean mask
    """
    my_sums = rep_a[key] + rep_b[key]
    bins = sorted(bins)
    dout = dict.fromkeys(bins)
    for k in dout.keys():
        dout[k] = numpy.tile(False, len(my_sums))
    for n, my_sum in enumerate(my_sums):
        for i in range(len(bins) - 1):
            bin1 = bins[i]
            bin2 = bins[i + 1]
            if my_sum >= bin1 and my_sum < bin2:
                dout[bin1][n] = True
        if my_sum >= bins[-1]:
            dout[bins[-1]][n] = True
    return dout


@skipdoc
def get_nonzero_either_mask(vector_a, vector_b):
    """Returns a numpy array of boolean values indicating where values in two
    vectors are both greater than zero.

    Parameters
    ----------
    vector_a : numpy.ndarray
        Array of counts or RPKM

    vector_b : numpy.ndarray
        Array of counts or RPKM

    Returns
    -------
    numpy.ndarray
        Boolean array that is `True` where both `vector_a` and `vector_b`
        have values greater than zero, and `False` elsewhere.
    """
    return (vector_a > 0) & (vector_b > 0)


@skipdoc
def get_short_samplename(inp):
    """Creates a sample legend label from a sample filename by removing
    everything including and after ``".txt"``

    Parameters
    ----------
    inp : str
        filename

    Returns
    -------
    str
    """
    return get_short_name(inp, separator=os.path.sep, terminator=".txt")


[docs]def do_scatter(
        x,
        y,
        count_mask,
        plot_parser,
        args,
        pearsonr = None,
        xlabel   = None,
        ylabel   = None,
        title    = None,
        min_x    = 10**-3,
        min_y    = 10**-3
): # yapf: disable
    """Scatter plot helper for cs `chart` subprogram

    Parameters
    ----------
    x, y : :class:`numpy.ndarray`
        Data to plot

    count_mask : :class:`numpy.ndarray`
        Threshold mask

    args : :class:`~argparse.Namespace`
        Command-line arguments

    pearsonr : float
        Pearson's r of the two samples

    xlabel : str or None, optional
        If not `None`, an x-axis label

    ylabel : str or None, optional
        If not `None`, a y-axis label

    min_x,min_y : float
        value to which low x- or y-values will respectively be truncated

    Returns
    -------
    :class:`matplotlib.figure.Figure`
        Formatted figure
    """
    fig = plot_parser.get_figure_from_args(args)
    xm = x[count_mask]
    ym = y[count_mask]

    xb, yb = clean_invalid(x[~count_mask], y[~count_mask], min_x=min_x, min_y=min_y)

    _, axdict = scatterhist_xy(xm, ym, log="xy", label=">= 128 counts")
    _, _ = scatterhist_xy(
        xb,
        yb,
        axes  = axdict,
        color = process_black,
        label = "< 128 counts",
        mask_invalid=False,
    ) # yapf: disable

    text_kwargs = {
        "horizontalalignment": "left",
        "verticalalignment": "baseline",
        "transform": axdict["main"].transAxes
    }

    mainax = axdict["main"]
    mainax.legend(loc="lower right", frameon=False)

    if pearsonr is not None:
        mainax.text(0.02, 0.9, "Pearson r**2 >= 128: %0.4e" % pearsonr**2, **text_kwargs)

    if xlabel is not None:
        mainax.set_xlabel(xlabel)

    if ylabel is not None:
        mainax.set_ylabel(ylabel)

    if title is not None:
        plt.suptitle(title)

    mainax.set_xlim(min_x / 5.0, mainax.get_xlim()[1])
    mainax.set_ylim(min_y / 5.0, mainax.get_ylim()[1])

    return fig


[docs]def do_chart(args, plot_parser):
    """Produce a set of charts comparing multiple samples pairwise.

    Charts include histograms of log2 fold changes and scatter plots with
    correlation coefficients, both generated for raw count and RPKM data.

    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        command-line arguments for ``chart`` subprogram
    """
    plot_parser.set_style_from_args(args)

    outbase = args.outbase
    bins = numpy.array(args.bins)
    figformat = args.figformat

    # read input files
    printer.write("Reading input files: %s ..." % ", ".join(args.infiles))

    samples = { get_short_samplename(X) : read_count_file(opener(X),args.list_of_regions)\
                                          for X in args.infiles }

    # Define some variables for later
    sample_names = sorted(samples.keys())
    comparisons = [X for X in itertools.combinations(sample_names, 2) if X[0] != X[1]]
    colors = plot_parser.get_colors_from_args(args, len(comparisons))
    binkeys = tuple(["%s_reads" % k for k in args.regions])

    comparison_labels = sorted(["%s_vs_%s" % (X, Y) for X, Y in comparisons])
    bigtable = {}.fromkeys(comparison_labels)
    corrcoef_by_bin_table = {}.fromkeys(comparison_labels)

    # ki, kj = names of samples i and j
    # vi, vj = data of samples i and j
    for ki, kj in comparisons:
        try:
            assert (samples[ki]["region"] == samples[kj]["region"]).all()
        except AssertionError:
            printer.write(
                "Mismatched line entries for samples %s and %s. "
                "Were different gene lists used for counting?"
                % (ki, kj)
            )

        vi = samples[ki]
        vj = samples[kj]
        printer.write("Comparing %s to %s:" % (ki, kj))

        label = "%s_vs_%s" % (ki, kj)

        bigtable[label] = {}
        corrcoef_by_bin_table[label] = {}

        for binkey in binkeys:
            bigtable[label][binkey] = {K: copy.deepcopy({}) for K in args.metrics}
            corrcoef_by_bin_table[label][binkey] = {K: copy.deepcopy({}) for K in args.metrics}

        for region in args.regions:
            region_counts = "%s_reads" % region
            count_mask = (vi[region_counts].values + vj[region_counts].values >= 128)

            for metric in args.metrics:
                region_metric = "%s_%s" % (region, metric)
                printer.write("    -%s across %s for all >=128 ..." % (metric, region))

                # divide into regions >=128 and <128 and plot
                viover = vi[region_metric][count_mask].values
                vjover = vj[region_metric][count_mask].values

                viunder = vi[region_metric][~count_mask].values
                vjunder = vj[region_metric][~count_mask].values

                pearsonr = scipy.stats.pearsonr(viover, vjover)[0]
                spearmanr = scipy.stats.spearmanr(viover, vjover)[0]

                # log2 fold change stats
                log2_ratios = numpy.log2(numpy.ma.masked_invalid(vjover / viover))
                log2_mean = log2_ratios.mean()
                log2_std = log2_ratios.std()
                min_diff = 2**(3 * log2_std)
                num_genes_log2 = (~log2_ratios.mask).sum()

                # ma plot
                try:
                    ma_title = "%s vs %s (%s %s)" % (ki, kj, region, metric)
                    fig = plot_parser.get_figure_from_args(args)
                    _, axdict = ma_plot(
                        viunder,
                        vjunder,
                        color   = process_black,
                        label   = "< 128 counts",
                        axes    = plt.gca(),
                        kdalpha = 0.2,
                        title   = ma_title) # yapf: disable
                    _, _ = ma_plot(
                        viover,
                        vjover,
                        axes    = axdict,
                        label   = ">= 128 counts",
                        kdalpha = 0.8) # yapf: disable

                    mainax = axdict["main"]
                    mainax.set_xlabel("%s %s" % (ki, metric))
                    mainax.legend(loc="upper right", frameon=False)

                    text_kwargs = {
                        "horizontalalignment": "right",
                        "verticalalignment": "baseline",
                        "linespacing": 1.6,
                        "transform": mainax.transAxes,
                    }

                    plot_text = "\n".join(
                        [
                            "sample mean: %0.3e" % log2_mean,
                            "sample stdev: %0.3e" % log2_std,
                            "3-sigma fold change: %0.2f" % min_diff,
                            "regions_counted: %s" % count_mask.sum(),
                        ]
                    )

                    mainax.text(0.96, 0.04, plot_text, **text_kwargs)

                    plt.savefig(
                        "%s_ma_%s_%s_%s.%s" % (outbase, label, region, metric, figformat),
                        bbox_inches="tight"
                    )
                    plt.close()
                except Exception as e:
                    warnings.warn(
                        "Could not make MA plot for samples %s and %s. Error message:\n    %s" %
                        (ki, kj, e.message), DataWarning
                    )

                # scatter plot
                try:
                    scatter_title = "%s vs %s (%s %s)" % (ki, kj, region, metric)
                    do_scatter(
                        vi[region_metric].values,
                        vj[region_metric].values,
                        count_mask,
                        plot_parser,
                        args,
                        pearsonr = pearsonr,
                        xlabel   = ki,
                        ylabel   = kj,
                        title    = scatter_title
                    ) # yapf: disable

                    plt.savefig(
                        "%s_scatter_%s_%s_%s.%s" % (outbase, label, region, metric, figformat),
                        bbox_inches="tight"
                    )
                    plt.close()
                except ValueError as e:
                    warnings.warn(
                        "Could not make scatter plot for samples %s and %s. Error message:\n    %s"
                        % (ki, kj, e.message), DataWarning
                    )

                # TODO: make these tables into dataframes. this scheme is difficult to read

                # add entries to bigtable for export later
                # yapf: disable
                bigtable[label][region_counts][metric]["pearsonr"]  = pearsonr
                bigtable[label][region_counts][metric]["spearmanr"] = spearmanr
                bigtable[label][region_counts][metric]["log2_mean"] = log2_mean
                bigtable[label][region_counts][metric]["log2_std"]  = log2_std
                bigtable[label][region_counts][metric]["2**3sigma"] = 2**(3 * log2_std)
                bigtable[label][region_counts][metric][
                    "num_genes_128plus"] = count_mask.sum()
                bigtable[label][region_counts][metric][
                    "num_genes_log2"] = num_genes_log2
                # yapf: enable

                # do bin-by-bin counting
                printer.write("    -%s across %s by bin ..." % (metric, region))
                bin_masks = get_bin_mask_by_summed_key(vi, vj, bins=args.bins, key=region_counts)
                for my_bin, bin_mask in sorted(bin_masks.items()):

                    bin_vec_i = vi[region_metric][bin_mask]
                    bin_vec_j = vj[region_metric][bin_mask]

                    # make sure there are genes in bin before attempting calculations
                    if len(bin_vec_i) > 0:
                        nonzero_binmask = get_nonzero_either_mask(bin_vec_i, bin_vec_j)
                        bin_vi_log2 = bin_vec_i[nonzero_binmask]
                        bin_vj_log2 = bin_vec_j[nonzero_binmask]
                        my_logs = numpy.log2(bin_vj_log2 / bin_vi_log2)
                        my_logmean = numpy.mean(my_logs)
                        my_logstd = numpy.std(my_logs)
                        if len(bin_vec_i) > 2:
                            my_pearsonr = scipy.stats.pearsonr(bin_vec_i, bin_vec_j)[0]
                            my_spearmanr = scipy.stats.spearmanr(bin_vec_i, bin_vec_j)[0]
                        else:
                            my_spearmanr = numpy.nan
                            my_pearsonr = numpy.nan
                    else:
# yapf: disable
                        # fill with dummy values
                        my_logs      = numpy.array([])
                        my_logmean   = numpy.nan
                        my_logstd    = numpy.nan
                        my_spearmanr = numpy.nan
                        my_pearsonr  = numpy.nan

                    corrcoef_by_bin_table[label][region_counts][metric][my_bin] = {}
                    corrcoef_by_bin_table[label][region_counts][metric][my_bin]["pearsonr"]     = my_pearsonr
                    corrcoef_by_bin_table[label][region_counts][metric][my_bin]["spearmanr"]    = my_spearmanr
                    corrcoef_by_bin_table[label][region_counts][metric][my_bin]["genes_in_bin"] = sum(bin_mask)
                    corrcoef_by_bin_table[label][region_counts][metric][my_bin]["log2_genes_in_bin"] = sum(nonzero_binmask)
                    corrcoef_by_bin_table[label][region_counts][metric][my_bin]["log2_mean"]    = my_logmean
                    corrcoef_by_bin_table[label][region_counts][metric][my_bin]["log2_std"]     = my_logstd

    # export big (non-binned) table
    printer.write("Writing tables ...")

    bigtable_out = argsopener("%s_bigtable.txt" % args.outbase, args, "w")
    stats = (
        "num_genes_128plus",  # 0
        "pearsonr",           # 1
        "spearmanr",          # 2
        "num_genes_log2",     # 3
        "log2_mean",          # 4
        "log2_std",           # 5
        "2**3sigma",          # 6
    )
# yapf: enable

    header = ["#region", "metric", "statistic"]
    header += [X for X in comparison_labels]

    bigtable_out.write("\t".join(header) + "\n")
    for region in binkeys:
        for metric in args.metrics:
            for stat in stats:
                ltmp = [region, metric, stat]
                for key in comparison_labels:
                    ltmp.append(bigtable[key][region][metric][stat])
                ltmp = [str(X) for X in ltmp]
                bigtable_out.write("\t".join(ltmp) + "\n")

    bigtable_out.close()

    # export binned data table and make bin-by-bin plots
    bintable_out = argsopener("%s_bintable.txt" % args.outbase, args, "w")

    region_metrics = ["%s_%s" % (X, Y) for X in args.regions for Y in args.metrics]
    stats = [
        "genes_in_bin",       # 0
        "pearsonr",           # 1
        "spearmanr",          # 2
        "",                   # 3
        "log2_genes_in_bin",  # 4
        "log2_mean",          # 5
        "log2_std",           # 6
        "",                   # 7
    ] # yapf: disable
    for region_metric in region_metrics:
        plt.close()
        fig = plot_parser.get_figure_from_args(args)
        plt.semilogx(basex=2)
        plt.title("Correlation coefficients by bin for %s" % region_metric)
        plt.xlabel("Bin")
        plt.ylabel("Spearman rho")

        region, metric = region_metric.split("_")
        region_counts = "%s_reads" % region
        bintable_out.write("%s\n\n" % region_metric)

        try:
            for n, label in enumerate(comparison_labels):
                corrcoefs = []
                bintable_out.write("%s\t\t\t\t\t\t\t\t" % label)
                bintable_out.write("\n")
                bintable_out.write("bin\t" + "\t".join(stats) + "\n")
                for my_bin in bins:
                    ltmp = [my_bin]
                    for stat in stats:
                        ltmp.append(
                            corrcoef_by_bin_table[label][region_counts][metric][my_bin].get(
                                stat, ""
                            )
                        )
                    ltmp = [str(X) for X in ltmp]
                    bintable_out.write("\t".join(ltmp) + "\n\n")
                    corrcoefs.append(
                        corrcoef_by_bin_table[label][region_counts][metric][my_bin]["spearmanr"]
                    )
                corrcoefs = ma.masked_invalid(corrcoefs)
                plt.plot(
                    bins[~corrcoefs.mask], corrcoefs[~corrcoefs.mask], label=label, color=colors[n]
                )
            plt.legend(loc="lower right")
            plt.savefig(
                "%s_corrcoef_by_bin_%s_%s.%s" % (outbase, region_metric, label, figformat),
                bbox_inches="tight"
            )
        except Exception as e:
            warnings.warn(
                "Could not plot correlation-by-bin plot for '%s', '%s'" % (outbase, region_metric),
                DataWarning
            )

        bintable_out.write("\n\n")
    bintable_out.close()

    printer.write("Done.")


#===============================================================================
# main program body
#===============================================================================


[docs]def main(argv=sys.argv[1:]):
    """Command-line program

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    al = AlignmentParser(disabled=["normalize"])
    an = AnnotationParser()
    mp = MaskParser()
    pl = PlottingParser()
    bp = BaseParser()

    # yapf: disable
    alignment_file_parser  = al.get_parser()
    annotation_file_parser = an.get_parser()
    mask_file_parser       = mp.get_parser()
    plotting_parser        = pl.get_parser()
    base_parser            = bp.get_parser()

    generator_help = "Create unambiguous position file from GFF3 annotation"
    generator_desc = format_module_docstring(do_generate.__doc__)

    counter_help = "Count reads in unambiguous gene positions"
    counter_desc = format_module_docstring(do_count.__doc__)

    chart_help = "Produce charts comparing reads between samples"
    chart_desc = format_module_docstring(do_chart.__doc__)

    parser = argparse.ArgumentParser(
        description     = format_module_docstring(__doc__),
        formatter_class = argparse.RawDescriptionHelpFormatter
    )
    subparsers = parser.add_subparsers(
        title       = "subcommands",
        description = "choose one of the following",
        dest        = "program"
    )
    gparser = subparsers.add_parser(
        "generate",
        help            = generator_help,
        description     = generator_desc,
        parents         = [base_parser, annotation_file_parser, mask_file_parser],
        formatter_class = argparse.RawDescriptionHelpFormatter,
    )
    cparser = subparsers.add_parser(
        "count",
        help            = counter_help,
        description     = counter_desc,
        parents         = [base_parser, alignment_file_parser],
        formatter_class = argparse.RawDescriptionHelpFormatter,
    )
    pparser = subparsers.add_parser(
        "chart",
        help            = chart_help,
        description     = chart_desc,
        parents         = [base_parser, plotting_parser],
        formatter_class = argparse.RawDescriptionHelpFormatter
    )

    gparser.add_argument(
        "outbase",
        metavar ="outbase",
        type    = str,
        help    = "Basename for output files"
    )

    cparser.add_argument(
        "position_file",
        type    = str,
        metavar = "file.positions",
        help    = (
            "File assigning positions to genes or transcripts "
            "(made using 'generate' subcommand)"
        )
    )
    cparser.add_argument("outbase", type=str, help="Basename for output files")

    pparser.add_argument(
        "-i",
        "--in",
        nargs="+",
        type=str,
        dest="infiles",
        help="input files, made by 'count' subprogram")
    pparser.add_argument(
        "--bins",
        nargs="+",
        type=int,
        default=(0, 32, 64, 128, 256, 512, 1024, 2048, 4096),
        help="Bins into which features are partitioned based on counts")
    pparser.add_argument(
        "--regions",
        nargs="+",
        type=str,
        default=("exon", "utr5", "cds", "utr3"),
        help="Regions to compare (default: exon, utr5, cds, utr3)")
    pparser.add_argument(
        "--metrics",
        nargs="+",
        type=str,
        default=("rpkm", "reads"),
        help="Metrics to compare (default: rpkm, reads)")
    pparser.add_argument(
        "list_of_regions",
        type=str,
        metavar='gene_list.txt',
        nargs="?",
        default=None,
        help=(
            "Optional. File listing regions (genes or transcripts), one per "
            "line, to include in comparisons. If not given, all genes are "
            "included."
        )
    )
    pparser.add_argument("outbase", type=str, help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    if args.program == "generate":
        #generate position file
        do_generate(args, an, mp)

    elif args.program == "count":
        #use position file to count gene expression in infiles
        do_count(args, al)

    elif args.program == "chart":
        #use count files to generate a family of charts and tables
        do_chart(args, pl)


if __name__ == "__main__":
    main()