#!/usr/bin/env python
"""This script empirically determines which positions in a genome yield
multimapping reads under a given set of alignment parameters. These positions
are saved in a `BED`_-formatted :term:`mask file`, so that they may be excluded
as from further analyses.
To identify multimapping regions, a genome sequence is diced into :term:`k-mers
<k-mer>` and the :term:`k-mers` aligned back to the genome. Positions in the
genome that give rise to :term:`k-mers <k-mer>` that align equally well to more
than one genomic location are then marked as multimapping..
`k` is specified by the user, as are the alignment parameters.
Output files
------------
The following files are made:
OUTBASE_READLENGTH_MISMATCHES_crossmap.bed
Final :term:`mask file` annotation, in `BED`_ format
OUTBASE_READLENGTH_MISMATCHES_CHROMOSOME_kmers.fa
:term:`K-mers <k-mer>` derived from chromosome `CHROMOSOME`. These
files can be reused in subsequent runs allowing a different number of
mismatches, using the ``--have_kmers`` option
where:
- `OUTBASE` is a name meaningful to the user
- `READLENGTH` is the :term:`k-mer` length chosen by the user
- `MISMATCHES` is the number of mismatches permitted during alignment,
also set by the user.
Considerations for large genomes
--------------------------------
For large genomes (e.g. vertebrate, plant, or some *very* big amoebas):
- |crossmap| can require a ton of memory if genome sequence is stored
in a fasta file. If |crossmap| maxes out your system's memory, it may
be terminated by your system before it completes.
Consider converting the file to a `2bit`_ file to save memory and
avoid this potential problem
- Enabling mismatches with short read sizes will make |crossmap| take
a lot longer, especially on large genomes. Consider using ``--mismatches
0`` if you run into this problem
- Using multiple processes (e.g. via ``-p 2``) will speed |crossmap|'s
execution time, but will increase its memory footprint, as each process
will need its own memory space to create and align k-mers from chromosomal
sequence
- By default, |crossmap| creates `BED`_ files. Consider converting these to
`BigBed`_ files will save substantial amounts of time and memory in the future.
This can be achieved using Jim Kent's ``bedToBigBed`` utility as follows
(from the terminal)::
$ bowtie-inspect --summary BOWTIE_INDEX | grep Sequence |\\
cut -f2,3 | sed -e "s/\([^ ]\+\).*\\t/\\1\\t/" >OUTFILE.sizes
$ sort -k1,1 -k2,2n OUTBASE.bed > OUTBASE_sorted.bed
$ bedToBigBed OUTBASE_sorted.bed OUTBASE.sizes OUTBASE_sorted.bb
See https://github.com/ENCODE-DCC/kentUtils/tree/master/src/product/scripts
for download & documentation of Kent utilities
"""
__author__ = "joshua"
import argparse
import functools
import glob
import inspect
import multiprocessing
import os
import re
import shutil
import subprocess
import sys
from plastid.util.io.filters import NameDateWriter, AbstractReader
from plastid.util.io.openers import get_short_name, argsopener
from plastid.genomics.roitools import SegmentChain, positionlist_to_segments, GenomicSegment
from plastid.util.scriptlib.help_formatters import format_module_docstring
from plastid.util.services.mini2to3 import xrange
from plastid.util.services.exceptions import MalformedFileError
from plastid.util.scriptlib.argparsers import SequenceParser, BaseParser
namepat = re.compile(r"(.*):([0-9]+)\(\+\)")
printer = NameDateWriter(get_short_name(inspect.stack()[-1][1]))
BigBedMessage = """Crossmap complete and saved as 'OUTFILE.bed'.
For large (e.g. mammalian) genomes, it is highly recommended to convert
the BED-format output to a BigBed file, using Jim Kent's bedToBigBed
utility as follows (from the terminal):
$ bowtie-inspect --summary BOWTIE_INDEX | grep Sequence |\\
cut -f2,3 | sed -e "s/\\([^ ]\+\\).*\\t/\\1\\t/" >OUTFILE.sizes
$ sort -k1,1 -k2,2n OUTFILE.bed > OUTFILE_sorted.bed
$ bedToBigBed OUTFILE_sorted.bed OUTFILE.sizes OUTFILE_sorted.bb
See https://github.com/ENCODE-DCC/kentUtils/tree/master/src/product/scripts
for download & documentation of Kent utilities
"""
[docs]def simulate_reads(seq_record, fh=sys.stdout, k=30):
"""Chops a DNA sequence into :term:`k-mers <k-mer>`, mimicking a sequencing
run. Output is delivered in fasta format. Sequences are named for position
of origin using 0-based indices.
Parameters
----------
seq_record : :py:class:`Bio.SeqRecord.SeqRecord`
DNA sequence
fh : file-like
filehandle to write output
k : int, optional
length of k-mers to generate (Default: `30`)
"""
seq = str(seq_record.seq)
for x in xrange(0, len(seq_record) - k + 1):
fh.write(">%s:%s(+)\n" % (seq_record.name, x))
fh.write("%s\n" % seq[x:x + k])
return None
[docs]class FastaNameReader(AbstractReader):
"""Returns names of sequences in a fasta file"""
[docs] def filter(self, line):
"""Return next sequence name in a fasta file
Parameters
----------
line : str
Line of text
Returns
-------
str
Name of next sequence, excluding prefix `'>'` and line terminator
"""
if line.startswith(">"):
return line[1:].rstrip()
else:
return self.__next__()
[docs]def revcomp_mask_chain(seg, k, offset=0):
"""Reverse-complement a single-interval mask, correcting for `offset`.
Parameters
----------
seg : |SegmentChain|
Plus-strand mask, including `offset`
k : int
Length of k-mers
offset : int, optional
Offset from 5' end of read at which to map mask (Default: `0`)
Returns
-------
|SegmentChain|
Mask on minus strand corresponding to `seg`
"""
# Algorithm note:
#
# Let
# FW = plus-strand coordinate
# RC = minus-strand coordinate
#
# Then
# RC = FW + k - 1 - offset
#
# But we are given FW + offset, so:
#
# RC + offset = (FW + offset) + k - 1 - offset
# RC = (FW + offset) + k - 1 - 2*offset
span = seg.spanning_segment
new_offset = k - 1 - 2 * offset
ivminus = GenomicSegment(span.chrom, span.start + new_offset, span.end + new_offset, "-")
return SegmentChain(ivminus)
[docs]def fa_to_bed(toomany_fh, k, offset=0):
"""Create a `BED`_ file indicating genomic origins of reads in a `bowtie`_
``toomany`` file
Parameters
----------
toomany_fh : file-like
Open filehandle to fasta-formatted ``toomany`` file from `bowtie`_
k : int
Length of k-mers
offset : int, optional
Offset from 5' end of read at which to map read, if any (Default: `0`)
Yields
------
|SegmentChain|
Plus-strand |SegmentChain| representing a repetitive region
|SegmentChain|
Minus-strand |SegmentChain| representing a repetitive region
"""
last_chrom = chrom = None
last_pos = None
start_pos = None
reader = FastaNameReader(toomany_fh)
for n, read_name in enumerate(reader):
chrom, pos = namepat.search(read_name).groups()
pos = int(pos) + offset
if chrom != last_chrom:
if last_chrom is not None:
plus_chain = SegmentChain(GenomicSegment(last_chrom, start_pos, last_pos + 1, "+"))
minus_chain = revcomp_mask_chain(plus_chain, k, offset)
last_chrom = chrom
start_pos = pos
last_pos = pos
yield plus_chain, minus_chain
else:
last_chrom = chrom
start_pos = pos
last_pos = pos
else:
delta = pos - last_pos
if delta > 1:
plus_chain = SegmentChain(GenomicSegment(chrom, start_pos, last_pos + 1, "+"))
minus_chain = revcomp_mask_chain(plus_chain, k, offset)
last_pos = pos
start_pos = pos
yield plus_chain, minus_chain
elif delta == 1:
last_pos = pos
else:
msg = "k-mers are not sorted at read '%s' ! Aborting." % read_name
raise MalformedFileError(toomany_fh, msg, line_num=n)
if chrom is not None:
plus_chain = SegmentChain(GenomicSegment(chrom, start_pos, last_pos + 1, "+"))
minus_chain = revcomp_mask_chain(plus_chain, k, offset)
yield plus_chain, minus_chain
[docs]def chrom_worker(chrom_seq, args=None):
name, seq_or_kmers = chrom_seq
printer.write("Processing chromosome '%s' ..." % name)
base = "%s_%s_%s_%s" % (args.outbase, args.read_length, args.mismatches, name)
kmer_file = "%s_kmers.fa" % base
toomany_file = "%s_multimap.fa" % base
bed_file = "%s_crossmap.bed" % base
if args.have_kmers == False:
with open(kmer_file, "w") as kmer:
simulate_reads(seq_or_kmers, kmer, args.read_length)
kmer.close()
else:
kmer_file = seq_or_kmers
argdict = {
"mismatches": args.mismatches,
"processors": 1,
"bowtie" : args.bowtie,
"toomany" : toomany_file,
"kmers" : kmer_file,
"ebwt" : args.ebwt,
"null" : os.devnull,
} # yapf: disable
cmd = "%(bowtie)s -m1 -a --best -f -v %(mismatches)s -p %(processors)s %(ebwt)s %(kmers)s --max %(toomany)s >%(null)s" % argdict
printer.write("Aligning %s-mers for chromosome '%s' :\n\t'%s'" % (args.read_length, name, cmd))
try:
retcode = subprocess.call(cmd, shell=True)
if retcode < 0 or retcode == 2:
printer.write(
"Alignment for chromosome '%s' terminated with status %s" % (name, retcode)
)
else:
if os.path.exists(toomany_file):
printer.write(
"Assembling multimappers from chromosome '%s' into crossmap ..." % name
)
with argsopener(bed_file, args, "w") as bed_out:
for plus_chain, minus_chain in fa_to_bed(open(toomany_file), args.read_length,
offset=args.offset):
bed_out.write(plus_chain.as_bed())
bed_out.write(minus_chain.as_bed())
bed_out.close()
os.remove(toomany_file)
else:
printer.write(
"Could not find multimapper source file '%s' ." % toomany_file
)
except OSError as e:
printer.write("Alignment failed for chromosome '%s': %s" % (name, e))
printer.write("Cleaning up chromosome '%s' ..." % name)
if args.have_kmers == False and args.save_kmers == False:
os.remove(kmer_file)
return bed_file
[docs]def main(argv=sys.argv[1:]):
"""Command-line program
Parameters
----------
argv : list, optional
A list of command-line arguments, which will be processed
as if the script were called from the command line if
:py:func:`main` is called directly.
Default: `sys.argv[1:]`. The command-line arguments, if the script is
invoked from the command line
"""
sp = SequenceParser()
bp = BaseParser()
parser = argparse.ArgumentParser(
description = format_module_docstring(__doc__),
formatter_class = argparse.RawDescriptionHelpFormatter,
parents = [bp.get_parser(), sp.get_parser()],
)
parser.add_argument(
"-k",
dest = "read_length",
metavar = "READ_LENGTH",
type = int,
default = 29,
help = "K-mer length to generate from input file. (Default: 29)",
)
parser.add_argument(
"--offset",
type = int,
default = 14,
help =
"Offset from 5' end of plus-strand read at which to attribute score (Default: 14)"
)
parser.add_argument(
"--mismatches",
metavar="N",
type=int,
default=0,
help="Number of mismatches tolerated in alignment. (Default: 0)"
)
parser.add_argument(
"--bowtie",
dest = "bowtie",
default = "/usr/bin/bowtie",
type = str,
help = "Location of bowtie binary (Default: /usr/bin/bowtie)",
)
parser.add_argument(
"--have_kmers",
default=False,
action="store_true",
help=(
"If specified, use k-mer files from previous run. In this case "
"'sequence_file' should be the value 'outbase' from the k-mer "
"files you want to use. (Default: False)"
),
)
parser.add_argument(
"--save_kmers",
default = False,
action = "store_true",
help = "Save k-mer files for reuse in a subsequent run.",
)
parser.add_argument(
"-p",
"--processes",
type = int,
default = 2,
metavar = "N",
help = (
"Number of processes to use (should be <= number of chromosomes; "
"default: 2)"
),
)
parser.add_argument(
"ebwt",
type = str,
help = (
"Bowtie index of genome against which crossmap will be made. "
"In most cases, should be generated from the same sequences that "
"are in `sequence_file`."
),
)
parser.add_argument("outbase", type=str, help="Basename for output files")
args = parser.parse_args(argv)
bp.get_base_ops_from_args(args)
#filenames
base = "%s_%s_%s" % (args.outbase, args.read_length, args.mismatches)
bed_file = "%s_crossmap.bed" % base
if args.have_kmers == True:
kmer_files = glob.glob(args.sequence_file + "*kmers.fa")
seq_pat = re.compile(r".*_([^_]*)_kmers.fa")
seqs = {seq_pat.search(X).groups()[0]: X for X in kmer_files}
else:
seqs = sp.get_seqdict_from_args(args, index=True)
worker = functools.partial(chrom_worker, args=args)
chroms = seqs.items()
pool = multiprocessing.Pool(processes=args.processes)
bed_filenames = pool.map(worker, chroms, 1)
pool.close()
pool.join()
with open(bed_file, "w") as fout:
for f in sorted(bed_filenames):
if os.path.exists(f):
shutil.copyfileobj(open(f, "r"), fout)
os.remove(f)
else:
printer.write("Could not find intermediate bed file '%s' ." % f)
fout.close()
printer.write("Done.")
printer.write(
BigBedMessage.replace(
"OUTFILE",
bed_file.replace(".bed", "")
).replace("BOWTIE_INDEX", args.ebwt)
)
if __name__ == "__main__":
main()