# BSD 3-Clause License
#
# Copyright (c) 2025, Spill-Tea
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""Common utility functions to work with and analyze oligonucleotide sequences."""
from ._oligonucleotides import manacher
from ._oligos import (
complement,
nrepeats,
palindrome,
reverse,
reverse_complement,
stretch,
)
__all__ = [
"complement",
"complement_py",
"manacher",
"nrepeats",
"nrepeats_py",
"palindrome",
"palindrome_py",
"reverse",
"reverse_complement",
"reverse_complement_py",
"reverse_py",
"stretch",
"stretch_py",
]
BASEPAIRS_DNA: dict[str, str] = dict(
zip(
"AGTCNURYSWKMBVDH-.",
"TCAGNAYRSWMKVBHD-.",
strict=True,
)
)
BASEPAIRS_DNA.update({k.lower(): v for k, v in BASEPAIRS_DNA.items()})
BASEPAIRS_RNA: dict[str, str] = BASEPAIRS_DNA.copy()
BASEPAIRS_RNA.update({"A": "U", "a": "U"})
DEFAULT_ENCODING: str = "utf_8"
def _make_translation(
mapping: dict[str, str],
encoding: str = DEFAULT_ENCODING,
) -> bytes:
"""Construct a string translation table from a dictionary mapping.
Args:
mapping (dict): a dictionary of complements.
encoding (str): Valid and supported encoding schema.
Returns:
(bytes): translation table of the provided mapping, prepared as specified by the
encoding.
"""
keys: bytes = "".join(mapping.keys()).encode(encoding)
values: bytes = "".join(mapping.values()).encode(encoding)
return bytes.maketrans(keys, values)
DNA: bytes = _make_translation(BASEPAIRS_DNA, DEFAULT_ENCODING)
RNA: bytes = _make_translation(BASEPAIRS_RNA, DEFAULT_ENCODING)
[docs]
def reverse_py(sequence: str) -> str:
"""Reverse a nucleotide sequence.
Args:
sequence (str): Nucleotide sequence string.
Returns:
(str) Reverse a string.
Examples:
.. code-block:: python
reverse_py("ATATAT") == "TATATA"
reverse_py("AATATA") == "ATATAA"
"""
return sequence[::-1]
[docs]
def complement_py(sequence: str, dna: bool = True) -> str:
"""Return the complement of a nucleotide sequence.
Args:
sequence (str): Nucleotide sequence string.
dna (bool): If true, treat sequence as DNA, otherwise treat as RNA
Returns:
(str): Complement of input sequence.
Examples:
.. code-block:: python
complement_py("ATGC", True) == "TACG"
complement_py("ATGC", False) == "UACG"
"""
return (
sequence.encode(DEFAULT_ENCODING)
.translate(DNA if dna else RNA)
.decode(DEFAULT_ENCODING)
)
[docs]
def reverse_complement_py(sequence: str, dna: bool = True) -> str:
"""Reverse complement a nucleotide sequence.
Args:
sequence (str): Nucleotide sequence string.
dna (bool): sequence is dna, else rna.
Returns:
(str) Reverse complement of sequence string.
Examples:
.. code-block:: python
reverse_complement_py("ATGC", True) == "GCAT"
reverse_complement_py("ATGC", False) == "GCAU"
"""
return complement_py(reverse_py(sequence), dna)
def _center_expansion(
s: str,
c: str,
left: int,
right: int,
length: int,
) -> tuple[int, int]:
while left > -1 and right < length and s[left] == c[right] and s[right] == c[left]:
left -= 1
right += 1
return left + 1, right
[docs]
def palindrome_py(sequence: str, dna: bool = True) -> str:
"""Find the longest substring palindrome within a nucleotide sequence.
Args:
sequence (str): Nucleotide sequence string.
dna (bool): If true, treat sequence as DNA, otherwise treat as RNA
Returns:
(str): longest palindromic subsequence within sequence.
Examples:
.. code-block:: python
palindrome_py("ATAT") == "ATAT"
palindrome_py("GATATG") == "ATAT"
Notes:
* Algorithmic time complexity is O(N).
* If a sequence contains two or more palindromic substrings of equal size, the
first leftmost palindrome is prioritized.
"""
seq_length: int = len(sequence)
comp: str = complement_py(sequence, dna)
best_left: int = 0
best_right: int = 0
left: int = 0
right: int = 0
length: int = 0
for i in range(seq_length - 1):
# If we only consider ATGC based sequences, then Palindromic nucleotides are
# only even length, reducing search space in half
left, right = _center_expansion(sequence, comp, i, i + 1, seq_length)
current: int = right - left
if current > length:
length = current
best_left = left
best_right = right
# Handle Degenerate bases which equals it's complement (e.g. N, S, W)
# We can have odd length palindromes if a degenerate base is at the center
if sequence[i] != comp[i]:
continue
left, right = _center_expansion(sequence, comp, i - 1, i + 1, seq_length)
current = right - left
if current > length:
length = current
best_left = left
best_right = right
return sequence[best_left:best_right]
[docs]
def stretch_py(sequence: str) -> int:
"""Calculate the maximum stretch of a single character in a string.
Args:
sequence (str): Nucleotide sequence string.
Returns:
(int): maximum length observed within sequence of a repeated character.
Examples:
.. code-block:: python
stretch_py("AAAA") == 3
stretch_py("AATT") == 1
"""
if not sequence:
return 0
longest: int = 0
current: int = 0
last: str = sequence[0]
char: str
for char in sequence[1:]:
if char == last:
current += 1
if current > longest:
longest = current
else: # reset
current = 0
last = char
return longest
[docs]
def nrepeats_py(sequence: str, n: int) -> int:
"""Calculate the longest substring of n repeating characters.
Args:
sequence (str): Nucleotide string or Series of string
n (int): stretch of k-mer to observe
Returns:
(int) The longest run of repeating n-length characters.
Raises:
ValueError: when n < 1
Examples:
.. code-block:: python
nrepeats_py("AAAA", 1) == 3 # True
nrepeats_py("AAAA", 2) == 1 # True
nrepeats_py("ACAACAACA", 3) == 2 # True
"""
previous: list[str] = [sequence[i : n + i] for i in range(n)]
current: list[int] = [0] * n
max_val: int = 0
for j in range(n, len(sequence), n):
for k in range(n):
phase: str = sequence[j + k : j + k + n]
if phase == previous[k]:
current[k] += 1
if current[k] > max_val:
max_val = current[k]
else:
current[k] = 0
previous[k] = phase
return max_val