#!/usr/bin/env python
"""Utilities for mutating and searching nucleic acid sequences.
Contents
--------
.. autosummary::
TwoBitSeqRecordAdaptor
mutate_seqs
seq_to_regex
IUPAC_TABLE
"""
import random, re
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from twobitreader import TwoBitFile
IUPAC_TABLE = {
"A": "A",
"C": "C",
"T": ("T", "U"),
"U": ("T", "U"),
"G": "G",
"N": ("A", "C", "T", "G", "U"),
# puRines
"R": ("A", "G"),
# pYrimidines
"Y": ("C", "T", "U"),
# Strong binding
"S": ("G", "C"),
# Weak binding
"W": ("A", "T", "U"),
# Keto-functionalized
"K": ("G", "T", "U"),
# aMino-functionalized
"M": ("A", "C"),
# B = any but A
"B": ("C", "G", "T", "U"),
# C = any but C
"D": ("A", "G", "T", "U"),
# H = any but G
"H": ("A", "C", "T", "U"),
# V = any but T or U
"V": ("A", "C", "G"),
}
"""Dictionary mapping IUPAC nucleotide symbols to tuples of nucleotides
they represent (e.g. R -> (A, G) )
"""
[docs]def seq_to_regex(inp, flags=0):
"""Convert a nucleotide sequence of IUPAC nucleotide characters as a regular expression.
Ambiguous IUPAC characters are converted to groups (e.g. `'Y'` to `'[CTU]'`),
and T and U are considered equivalent.
Parameters
----------
inp : str
Nucleotide sequence using IUPAC nucleotide codes
flags : int, optional
Flags to pass to :py:func:`re.compile` (Default: 0 / no flags)
Examples
--------
Convert a sequence to a regex::
>>> seq_to_regex("CARYYA").pattern
'CA[AG][CTU][CTU]A'
Returns
-------
:py:class:`re.RegexObject`
Regular expression pattern corresponding to IUPAC sequence in `inp`
"""
out = []
for ch in inp:
if len(IUPAC_TABLE.get(ch, ch)) == 1:
out.append(ch)
else:
out.append("[" + "".join(IUPAC_TABLE.get(ch, ch)) + "]")
return re.compile("".join(out), flags=flags)
[docs]def mutate_seqs(seqs, nucleotides="NACTG", mutations=1):
"""Generate all sequences within `mutations` distance from a reference sequence
Parameters
----------
seqs : str or list of str
Single reference sequence (a string) or a group of strings
nucleotides : list of char, optional
Permitted nucleotide substitutions (Default: `'NACTG'`)
mutations : int, optional
Number of substitutions to make (Default: `1`)
Returns
-------
set
all sequences within `mutations` substitutions from the sequence(s)
specified in `seqs`
"""
if isinstance(seqs, str):
seqs = [seqs]
if mutations == 0:
return set(seqs)
else:
seqsout = []
for seq in seqs:
for nuc in nucleotides:
for i in range(len(seq)):
newseq = list(seq)[:]
newseq[i] = nuc
seqsout.append("".join(newseq))
seqsout.extend(mutate_seqs(seqsout, nucleotides=nucleotides, mutations=mutations - 1))
return set(seqsout) | set(seqs)
[docs]def random_seq(size, nucleotides="ACTG"):
"""Generate a random nucleotide sequence of length `size` and composition `nucleotides`
Parameters
----------
size : int
length of desired sequence
nucleotides : str, optional
string of nucleotides to use in sequence, in desired base composition
(i.e. need not be unique; can supply `'AATCG'` to increase `'A'` bias.
Default: `'ACTG'`)
Returns
-------
str : randomized DNA sequence
"""
seq = "".join([nucleotides[random.randrange(0, len(nucleotides))] for _ in range(0, size)])
return seq
[docs]def revive(twobitreader, seqname):
_TwoBitSeqProxy(twobitreader[seqname], twobitreader)
class _TwoBitSeqProxy(object):
"""Adaptor class that fetches :class:`Bio.SeqRecord.SeqRecord`
objects from :class:`twobitreader.TwoBitSequence` objects
Defines `seq` property and :meth:`~_TwoBitSeqProxy.reverse_complement` method
"""
def __init__(self, twobitfile, key):
"""
Parameters
----------
twobitseq : :class:`twobitfile.TwoBitSequence`
"""
self.name = key
self._twobitfile = twobitfile
self.twobitseq = twobitfile[key]
self._seq = None
def __reduce__(self):
return (_TwoBitSeqProxy, (self._twobitfile, self.name))
def __getitem__(self, slice_):
return SeqRecord(
Seq(self.twobitseq.get_slice(min_=slice_.start, max_=slice_.stop))
)
def __len__(self):
return len(self.twobitseq)
def __str__(self):
"""Return sequence in `self.twobitseq` as str"""
return str(self.twobitseq)
def __getattr__(self, attr):
if attr == "seq":
if self._seq is None:
self._seq = Seq(str(self.twobitseq))
return self._seq
def reverse_complement(self):
"""Return the reverse complement of the TwoBitSequence
Returns
-------
:class:`Bio.SeqRecord.SeqRecord`
Reverse complement of the sequence held in `self.twobitseq`
"""
return SeqRecord(self.seq).reverse_complement()
[docs]class TwoBitSeqRecordAdaptor(object):
"""Adaptor class that makes a :class:`twobitreader.TwoBitFile` behave
like a dictionary of :class:`Bio.SeqRecord.SeqRecord` objects.
"""
def __init__(self, fh):
self.twobitfile = TwoBitFile(fh)
self._filename = fh
self._chroms = {K: _TwoBitSeqProxy(self.twobitfile, K) for K in self.twobitfile}
def __reduce__(self):
return (TwoBitSeqRecordAdaptor, (self._filename, ))
def __getitem__(self, key):
return self._chroms[key]
def __getattr__(self, attr):
try:
return getattr(self._chroms, attr)
except AttributeError:
return getattr(self.twobitfile, attr)
def __iter__(self):
return iter(self._chroms)
def __len__(self):
return len(self._chroms)