Source code for designer_dna.oligos

# BSD 3-Clause License
#
# Copyright (c) 2025, Spill-Tea
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""Common utility functions to work with and analyze oligonucleotide sequences."""

from ._oligonucleotides import manacher
from ._oligos import (
    complement,
    nrepeats,
    palindrome,
    reverse,
    reverse_complement,
    stretch,
)


__all__ = [
    "complement",
    "complement_py",
    "manacher",
    "nrepeats",
    "nrepeats_py",
    "palindrome",
    "palindrome_py",
    "reverse",
    "reverse_complement",
    "reverse_complement_py",
    "reverse_py",
    "stretch",
    "stretch_py",
]

BASEPAIRS_DNA: dict[str, str] = dict(
    zip(
        "AGTCNURYSWKMBVDH-.",
        "TCAGNAYRSWMKVBHD-.",
        strict=True,
    )
)
BASEPAIRS_DNA.update({k.lower(): v for k, v in BASEPAIRS_DNA.items()})
BASEPAIRS_RNA: dict[str, str] = BASEPAIRS_DNA.copy()
BASEPAIRS_RNA.update({"A": "U", "a": "U"})
DEFAULT_ENCODING: str = "utf_8"


def _make_translation(
    mapping: dict[str, str],
    encoding: str = DEFAULT_ENCODING,
) -> bytes:
    """Construct a string translation table from a dictionary mapping.

    Args:
        mapping (dict): a dictionary of complements.
        encoding (str): Valid and supported encoding schema.

    Returns:
        (bytes): translation table of the provided mapping, prepared as specified by the
        encoding.

    """
    keys: bytes = "".join(mapping.keys()).encode(encoding)
    values: bytes = "".join(mapping.values()).encode(encoding)

    return bytes.maketrans(keys, values)


DNA: bytes = _make_translation(BASEPAIRS_DNA, DEFAULT_ENCODING)
RNA: bytes = _make_translation(BASEPAIRS_RNA, DEFAULT_ENCODING)


[docs] def reverse_py(sequence: str) -> str: """Reverse a nucleotide sequence. Args: sequence (str): Nucleotide sequence string. Returns: (str) Reverse a string. Examples: .. code-block:: python reverse_py("ATATAT") == "TATATA" reverse_py("AATATA") == "ATATAA" """ return sequence[::-1]
[docs] def complement_py(sequence: str, dna: bool = True) -> str: """Return the complement of a nucleotide sequence. Args: sequence (str): Nucleotide sequence string. dna (bool): If true, treat sequence as DNA, otherwise treat as RNA Returns: (str): Complement of input sequence. Examples: .. code-block:: python complement_py("ATGC", True) == "TACG" complement_py("ATGC", False) == "UACG" """ return ( sequence.encode(DEFAULT_ENCODING) .translate(DNA if dna else RNA) .decode(DEFAULT_ENCODING) )
[docs] def reverse_complement_py(sequence: str, dna: bool = True) -> str: """Reverse complement a nucleotide sequence. Args: sequence (str): Nucleotide sequence string. dna (bool): sequence is dna, else rna. Returns: (str) Reverse complement of sequence string. Examples: .. code-block:: python reverse_complement_py("ATGC", True) == "GCAT" reverse_complement_py("ATGC", False) == "GCAU" """ return complement_py(reverse_py(sequence), dna)
def _center_expansion( s: str, c: str, left: int, right: int, length: int, ) -> tuple[int, int]: while left > -1 and right < length and s[left] == c[right] and s[right] == c[left]: left -= 1 right += 1 return left + 1, right
[docs] def palindrome_py(sequence: str, dna: bool = True) -> str: """Find the longest substring palindrome within a nucleotide sequence. Args: sequence (str): Nucleotide sequence string. dna (bool): If true, treat sequence as DNA, otherwise treat as RNA Returns: (str): longest palindromic subsequence within sequence. Examples: .. code-block:: python palindrome_py("ATAT") == "ATAT" palindrome_py("GATATG") == "ATAT" Notes: * Algorithmic time complexity is O(N). * If a sequence contains two or more palindromic substrings of equal size, the first leftmost palindrome is prioritized. """ seq_length: int = len(sequence) comp: str = complement_py(sequence, dna) best_left: int = 0 best_right: int = 0 left: int = 0 right: int = 0 length: int = 0 for i in range(seq_length - 1): # If we only consider ATGC based sequences, then Palindromic nucleotides are # only even length, reducing search space in half left, right = _center_expansion(sequence, comp, i, i + 1, seq_length) current: int = right - left if current > length: length = current best_left = left best_right = right # Handle Degenerate bases which equals it's complement (e.g. N, S, W) # We can have odd length palindromes if a degenerate base is at the center if sequence[i] != comp[i]: continue left, right = _center_expansion(sequence, comp, i - 1, i + 1, seq_length) current = right - left if current > length: length = current best_left = left best_right = right return sequence[best_left:best_right]
[docs] def stretch_py(sequence: str) -> int: """Calculate the maximum stretch of a single character in a string. Args: sequence (str): Nucleotide sequence string. Returns: (int): maximum length observed within sequence of a repeated character. Examples: .. code-block:: python stretch_py("AAAA") == 3 stretch_py("AATT") == 1 """ if not sequence: return 0 longest: int = 0 current: int = 0 last: str = sequence[0] char: str for char in sequence[1:]: if char == last: current += 1 if current > longest: longest = current else: # reset current = 0 last = char return longest
[docs] def nrepeats_py(sequence: str, n: int) -> int: """Calculate the longest substring of n repeating characters. Args: sequence (str): Nucleotide string or Series of string n (int): stretch of k-mer to observe Returns: (int) The longest run of repeating n-length characters. Raises: ValueError: when n < 1 Examples: .. code-block:: python nrepeats_py("AAAA", 1) == 3 # True nrepeats_py("AAAA", 2) == 1 # True nrepeats_py("ACAACAACA", 3) == 2 # True """ previous: list[str] = [sequence[i : n + i] for i in range(n)] current: list[int] = [0] * n max_val: int = 0 for j in range(n, len(sequence), n): for k in range(n): phase: str = sequence[j + k : j + k + n] if phase == previous[k]: current[k] += 1 if current[k] > max_val: max_val = current[k] else: current[k] = 0 previous[k] = phase return max_val