Source code for plastid.bin.findjuncs

#!/usr/bin/env python
"""This script identify all the unique splice junctions in one or more transcript
annotations, and exports these as a `BED`_ file with one splice junction per line
Optionally, this script can also export junctions as a `Tophat`_ ``.juncs`` file.

If a splice junction appears multiple times (e.g. used by more than one transcript),
only the first occurrence of the junction will be reported. Scores, if present,
are exported unaltered in `BED`_ output.

Examples:


.. code-block:: shell

   # identify splice junctions from a transcript annotation supplied in GTF2
   # creates output file 'annotation.bed'
   $ findjuncs my_annotation --annotation_format GTF2 \\
               --annotation_files transcripts.gtf
    
   # merge unique annotations from annotation.bed and newly_discovered.bed,
   # export only unique junctions to 'merged_unique.bed'
   $ findjuncs merged_unique --annotation_format BED \\
               --annotation_files annotation.bed newly_discovered.bed


See also
--------
:mod:`plastid.bin.slidejuncs`
   Script that makes richer comparisons between discovered and annotated
   junctions, using genomic sequence and :py:mod:`plastid.bin.crossmap`
   results to classify junctions
"""
"""
From the Tophat specification for `.juncs` files:

    Junctions are specified one per line, in a tab-delimited format. Records
    look like::
    
        <chrom> <left> <right> <+/->
    
    left and right are zero-based coordinates, and specify the last character of
    the left sequenced to be spliced to the first character of the right sequence,
    inclusive. That is, the last and the first positions of the flanking exons.
    Users can convert junctions.bed (one of the TopHat outputs) to this format
    using bed_to_juncs < junctions.bed > new_list.juncs where bed_to_juncs can
    be found under the same folder as tophat

See http://ccb.jhu.edu/software/tophat/index.shtml for more information.
"""
import sys
import argparse
import inspect
import warnings
from plastid.genomics.roitools import SegmentChain
from plastid.util.scriptlib.argparsers import AnnotationParser, BaseParser
from plastid.util.io.filters import NameDateWriter
from plastid.util.io.openers import argsopener, get_short_name
from plastid.util.scriptlib.help_formatters import format_module_docstring

warnings.simplefilter("once")
printer = NameDateWriter(get_short_name(inspect.stack()[-1][1]))

_ANNOTATION_INPUT_CHOICES = ["BED", "BigBed", "GTF2", "GFF3", "PSL"]
_ANNOTATION_DISABLED = ["add_three", "annotation_file"]


[docs]def main(argv=sys.argv[1:]): """Command-line program Parameters ---------- argv : list, optional A list of command-line arguments, which will be processed as if the script were called from the command line if :py:func:`main` is called directly. Default: sys.argv[1:] (actually command-line arguments) """ ap = AnnotationParser(input_choices=_ANNOTATION_INPUT_CHOICES) annotation_file_parser = ap.get_parser() bp = BaseParser() base_parser = bp.get_parser() # yapf: disable parser = argparse.ArgumentParser( description = format_module_docstring(__doc__), formatter_class = argparse.RawDescriptionHelpFormatter, parents =[base_parser, annotation_file_parser] ) parser.add_argument( "--export_tophat", default =False, action ="store_true", help ="Export tophat `.juncs` file in addition to BED output" ) parser.add_argument("outbase", type=str, help="Basename for output files") # yapf: enable args = parser.parse_args(argv) bp.get_base_ops_from_args(args) transcripts = ap.get_transcripts_from_args(args, printer=printer, return_type=SegmentChain) with argsopener("%s.bed" % args.outbase, args, "w") as bed_out: if args.export_tophat == True: tophat_out = open("%s.juncs" % args.outbase, "w") printer.write("params: " + " ".join(argv)) printer.write("Detecting & comparing junctions...") ex_pairs = {} c = 0 u = 0 for chain in transcripts: if len(chain) > 1: # if multi-exon chrom = chain.chrom strand = chain.strand try: ep = ex_pairs[(chrom, strand)] except KeyError: ex_pairs[(chrom, strand)] = [] ep = ex_pairs[(chrom, strand)] for i in range(0, len(chain) - 1): seg1 = chain[i] seg2 = chain[i + 1] if c % 1000 == 0 and c > 0: printer.write("Processed %s junctions. Found %s unique..." % (c, u)) c += 1 key = (seg1.end, seg2.start) if key not in ep: ep.append(key) u += 1 new_chain = SegmentChain(seg1, seg2) bed_out.write(new_chain.as_bed()) if args.export_tophat == True: my_junc = (chrom, seg1.end - 1, seg2.start, strand) tophat_out.write("%s\t%s\t%s\t%s\n" % my_junc) del new_chain del seg1 del seg2 del chain printer.write("Processed %s total junctions. Found %s unique." % (c, u)) bed_out.close() if args.export_tophat == True: tophat_out.close() printer.write("Done.")
if __name__ == "__main__": main()