Source code for plastid.genomics.seqtools

#!/usr/bin/env python
"""Utilities for mutating and searching nucleic acid sequences.

Contents
--------

.. autosummary::

   TwoBitSeqRecordAdaptor
   mutate_seqs
   seq_to_regex
   IUPAC_TABLE
"""
import random, re
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from twobitreader import TwoBitFile

IUPAC_TABLE = {
    "A": "A",
    "C": "C",
    "T": ("T", "U"),
    "U": ("T", "U"),
    "G": "G",
    "N": ("A", "C", "T", "G", "U"),

    # puRines
    "R": ("A", "G"),

    # pYrimidines
    "Y": ("C", "T", "U"),

    # Strong binding
    "S": ("G", "C"),

    # Weak binding
    "W": ("A", "T", "U"),

     # Keto-functionalized
    "K": ("G", "T", "U"),

    # aMino-functionalized
    "M": ("A", "C"),

    # B = any but A
    "B": ("C", "G", "T", "U"),

    # C = any but C
    "D": ("A", "G", "T", "U"),

    # H = any but G
    "H": ("A", "C", "T", "U"),

    # V = any but T or U
    "V": ("A", "C", "G"),
}
"""Dictionary mapping IUPAC nucleotide symbols to tuples of nucleotides
they represent (e.g. R -> (A, G) )
"""


[docs]def seq_to_regex(inp, flags=0): """Convert a nucleotide sequence of IUPAC nucleotide characters as a regular expression. Ambiguous IUPAC characters are converted to groups (e.g. `'Y'` to `'[CTU]'`), and T and U are considered equivalent. Parameters ---------- inp : str Nucleotide sequence using IUPAC nucleotide codes flags : int, optional Flags to pass to :py:func:`re.compile` (Default: 0 / no flags) Examples -------- Convert a sequence to a regex:: >>> seq_to_regex("CARYYA").pattern 'CA[AG][CTU][CTU]A' Returns ------- :py:class:`re.RegexObject` Regular expression pattern corresponding to IUPAC sequence in `inp` """ out = [] for ch in inp: if len(IUPAC_TABLE.get(ch, ch)) == 1: out.append(ch) else: out.append("[" + "".join(IUPAC_TABLE.get(ch, ch)) + "]") return re.compile("".join(out), flags=flags)
[docs]def mutate_seqs(seqs, nucleotides="NACTG", mutations=1): """Generate all sequences within `mutations` distance from a reference sequence Parameters ---------- seqs : str or list of str Single reference sequence (a string) or a group of strings nucleotides : list of char, optional Permitted nucleotide substitutions (Default: `'NACTG'`) mutations : int, optional Number of substitutions to make (Default: `1`) Returns ------- set all sequences within `mutations` substitutions from the sequence(s) specified in `seqs` """ if isinstance(seqs, str): seqs = [seqs] if mutations == 0: return set(seqs) else: seqsout = [] for seq in seqs: for nuc in nucleotides: for i in range(len(seq)): newseq = list(seq)[:] newseq[i] = nuc seqsout.append("".join(newseq)) seqsout.extend(mutate_seqs(seqsout, nucleotides=nucleotides, mutations=mutations - 1)) return set(seqsout) | set(seqs)
[docs]def random_seq(size, nucleotides="ACTG"): """Generate a random nucleotide sequence of length `size` and composition `nucleotides` Parameters ---------- size : int length of desired sequence nucleotides : str, optional string of nucleotides to use in sequence, in desired base composition (i.e. need not be unique; can supply `'AATCG'` to increase `'A'` bias. Default: `'ACTG'`) Returns ------- str : randomized DNA sequence """ seq = "".join([nucleotides[random.randrange(0, len(nucleotides))] for _ in range(0, size)]) return seq
[docs]def revive(twobitreader, seqname): _TwoBitSeqProxy(twobitreader[seqname], twobitreader)
class _TwoBitSeqProxy(object): """Adaptor class that fetches :class:`Bio.SeqRecord.SeqRecord` objects from :class:`twobitreader.TwoBitSequence` objects Defines `seq` property and :meth:`~_TwoBitSeqProxy.reverse_complement` method """ def __init__(self, twobitfile, key): """ Parameters ---------- twobitseq : :class:`twobitfile.TwoBitSequence` """ self.name = key self._twobitfile = twobitfile self.twobitseq = twobitfile[key] self._seq = None def __reduce__(self): return (_TwoBitSeqProxy, (self._twobitfile, self.name)) def __getitem__(self, slice_): return SeqRecord( Seq(self.twobitseq.get_slice(min_=slice_.start, max_=slice_.stop)) ) def __len__(self): return len(self.twobitseq) def __str__(self): """Return sequence in `self.twobitseq` as str""" return str(self.twobitseq) def __getattr__(self, attr): if attr == "seq": if self._seq is None: self._seq = Seq(str(self.twobitseq)) return self._seq def reverse_complement(self): """Return the reverse complement of the TwoBitSequence Returns ------- :class:`Bio.SeqRecord.SeqRecord` Reverse complement of the sequence held in `self.twobitseq` """ return SeqRecord(self.seq).reverse_complement()
[docs]class TwoBitSeqRecordAdaptor(object): """Adaptor class that makes a :class:`twobitreader.TwoBitFile` behave like a dictionary of :class:`Bio.SeqRecord.SeqRecord` objects. """ def __init__(self, fh): self.twobitfile = TwoBitFile(fh) self._filename = fh self._chroms = {K: _TwoBitSeqProxy(self.twobitfile, K) for K in self.twobitfile} def __reduce__(self): return (TwoBitSeqRecordAdaptor, (self._filename, )) def __getitem__(self, key): return self._chroms[key] def __getattr__(self, attr): try: return getattr(self._chroms, attr) except AttributeError: return getattr(self.twobitfile, attr) def __iter__(self): return iter(self._chroms) def __len__(self): return len(self._chroms)