Source code for plastid.bin.reformat_transcripts

#!/usr/bin/env python
"""Convert transcripts from `BED`_, `BigBed`_, `GTF2`_, `GFF3`_, or `PSL`_ format
to `BED`_, :term:`extended BED`, or `GTF2`_ format.

 .. note::

    GFF3 schemas vary
        Different GFF3s have different schemas of hierarchy. By default, we
        assume the ontology used by the Sequence Ontology consortium. Users
        that require a different schema may supply `transcript_types` and
        `exon_types`, to indicate which sorts of features should be included.

    Identity relationships between elements vary between GFF3 files
        GFF3 files can represent discontiguous features using two strategies. In 
        one strategy, the exons of a transcript have unique IDs, but will share
        contain the same parent ID in their same `Parent` attribute in column 9
        of the GFF. In another strategy different exons of the same transcript
        simply share the same ID, and don't define a `Parent`. Here, both schemes
        are accepted, although what happens if they conflict within a single
        transcript is undefined.

"""
import argparse
import inspect
import warnings
import sys
import os
import copy

from plastid.util.scriptlib.argparsers import AnnotationParser, BaseParser
from plastid.util.services.exceptions import ArgumentWarning, warn
from plastid.util.io.openers import argsopener, get_short_name
from plastid.util.io.filters import NameDateWriter
from plastid.util.scriptlib.help_formatters import format_module_docstring

warnings.simplefilter("once")
printer = NameDateWriter(get_short_name(inspect.stack()[-1][1]))

MAKE_BIGBED_MESSAGE =\
"""The BED-formatted file `%(outbase)s` has been exported with extended BED
columns. Plastid and other software can detect these automatically if you convert
`%(outbase)s` to a BigBed file using Jim Kent's bedToBigBed utility, as follows:

    $ sort -k1,1 -k2,2n -t"\t" %(outbase)s >%(outbase)s_sorted.bed
    $ bedToBigBed -tab -type=bed12+%(numcols)s -as=fields.as %(outbase)s.bed -extraIndex=name chrom.sizes %(outbase)s.bb

where `chrom.sizes` is a two-column, tab-delimited table of chromosome name & size,
and `fields.as` is an autoSql declaration describing the types & names of the extra
columns. For `%(outbase)s.bed`, `fields.as` could be:

%(autosql)s

Simply edit the type of each custom field to match your data type. Field type
options are:

    int, uint, short, ushort, byte, ubyte, float, char, string, lstring, enum, set

For details, See the autoSql Grammar specification at:

https://github.com/ENCODE-DCC/kentUtils/blob/36d6274459f644d5400843b8fa097b380b8f7867/src/hg/autoSql/autoSql.doc
"""

BED12_RESERVED_NAMES = [
    "chrom",
    "chromStart",
    "chromEnd",
    "name",
    "score",
    "strand",
    "thickStart",
    "thickEnd",
    "reserved",
    "blockCount",
    "blockSizes",
    "chromStarts",
]

DEFAULT_AUTOSQL_STR =\
"""
table bigbed_columns "%s columns"
(
    string            chrom;          "Chromosome"
    uint              chromStart;     "chr start"
    uint              chromEnd;       "chr end"
    string            name;           "item name"
    uint              score;          "score"
    char[1]           strand;         "strand"
    uint              thickStart;     "thickstart"
    uint              thickEnd;       "thickend"
    uint              reserved;       "normally itemRgb"
    int               blockCount;     "block count"
    int[blockCount]   blockSizes;     "block sizes"
    int[blockCount]   chromStarts;    "block starts"
%s
)
"""

AUTOSQL_ROW_FMT_STR = '    string            %s;%s"description of custom field contents"'


[docs]def fix_name(inp, names_used):
    """Append a number if an autoSql field name is duplicated.
    """
    name = inp
    i = 2
    while name in names_used:
        name = "%s%s" % (inp, i)
        i += 1

    names_used.append(name)
    return name


#: TODO: no functional test for --extra_columns
[docs]def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: sys.argv[1:] (actually command-line arguments)
    """
    ap = AnnotationParser()
    bp = BaseParser()
    annotation_parser = ap.get_parser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        parents=[base_parser, annotation_parser],
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        "--no_escape",
        default=True,
        action="store_false",
        help=
        "If specified and output format is GTF2, special characters in column 9 will be escaped (default: True)"
    )
    parser.add_argument(
        "--output_format",
        choices=["BED", "GTF2"],
        default="GTF2",
        help="Format of output file. (default: GTF2)"
    )
    parser.add_argument(
        "--extra_columns",
        nargs="+",
        default=[],
        type=str,
        help=
        "Attributes (e.g. 'gene_id' to output as extra columns in extended BED format (BED output only)."
    )
    parser.add_argument(
        "--empty_value",
        default="na",
        type=str,
        help=
        "Value to use of an attribute in `extra_columns` is not defined for a particular record (Default: 'na'"
    )
    parser.add_argument("outfile", metavar="outfile.[ bed | gtf ]", type=str, help="Output file")
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    end_message = ""
    extra_cols = args.extra_columns
    if extra_cols is not None:
        if args.output_format == "BED":

            # avoid name clashes
            names_used = copy.copy(BED12_RESERVED_NAMES)
            asql_names = [fix_name(X, names_used) for X in extra_cols]
            autosql_str = "\n".join(
                AUTOSQL_ROW_FMT_STR % (X, " " * max(15 - len(X), 2)) for X in asql_names
            )

            file_info = {
                "outbase": args.outfile.replace(".bed", "").replace(".gtf", ""),
                "numcols": len(extra_cols),
                "autosql": DEFAULT_AUTOSQL_STR % (os.path.basename(args.outfile[:-4]), autosql_str),
            }
            end_message = MAKE_BIGBED_MESSAGE % file_info
        else:
            warn(
                "`--extra_columns` is ignored for %s-formatted output." % (args.output_format),
                ArgumentWarning
            )

    with argsopener(args.outfile, args, "w") as fout:
        c = 0
        transcripts = ap.get_transcripts_from_args(args, printer=printer)

        for transcript in transcripts:
            if args.output_format == "GTF2":
                fout.write(transcript.as_gtf(escape=args.no_escape))
            elif args.output_format == "BED":
                fout.write(
                    transcript.as_bed(extra_columns=extra_cols, empty_value=args.empty_value)
                )
            if c % 1000 == 1:
                printer.write("Processed %s transcripts ..." % c)
            c += 1

    printer.write("Processed %s transcripts total." % c)
    printer.write("Done.")
    print(end_message)


if __name__ == "__main__":
    main()