Source code for atomworks.io.tools.fasta

"""
Convenience utils for working with (generalized) FASTA files.
"""

import logging
import os
import re

from atomworks.enums import ChainType
from atomworks.io.constants import CCD_MIRROR_PATH
from atomworks.io.utils.ccd import (
    check_ccd_codes_are_available,
)
from atomworks.io.utils.sequence import get_3_from_1_letter_code

logger = logging.getLogger("atomworks.io")


[docs] def split_generalized_fasta_sequence(sequence: str) -> list[str]: """ Splits a sequence at each letter, keeping groups with parentheses intact. Args: - sequence (str): The input sequence to be split. Returns: - List[str]: A list of individual letters and/or groups with parentheses. Example: >>> split_generalized_fasta_sequence("ABC(DEF)GH(IJ)K") ['A', 'B', 'C', '(DEF)', 'G', 'H', '(IJ)', 'K'] """ pattern = r"\([^)]*\)|\w" return re.findall(pattern, sequence)
[docs] def one_letter_to_ccd_code( seq: list[str], chain_type: ChainType, ccd_mirror_path: os.PathLike = CCD_MIRROR_PATH, check_ccd_codes: bool = True ) -> list[str]: """ Convert a sequence of one-letter codes or parenthesized full CCD IDs to full CCD IDs. This function takes a list of either one-letter amino acid codes or parenthesized CCD IDs and converts them to their corresponding full CCD (Chemical Component Dictionary) IDs. It handles both standard amino acids and non-standard chemical components. Args: seq (list[str]): A list of one-letter codes or parenthesized CCD IDs. chain_type (ChainType): The type of chain (e.g., POLYPEPTIDE_L, DNA, RNA) to determine the correct conversion for one-letter codes. check_ccd_codes (bool): If True, check if the CCD IDs are available in the CCD mirror. Returns: - list[str]: A list of full CCD IDs corresponding to the input sequence. Raises: - ValueError: If a non-standard chemical component ID is not found in the processed CCD. Example: >>> seq = ["A", "C", "(SEP)", "G", "H"] >>> chain_type = ChainType.POLYPEPTIDE_L >>> one_letter_to_ccd_code(seq, chain_type) ['ALA', 'CYS', 'SEP', 'GLY', 'HIS'] """ seq_with_ccd_ids = [] for chem_comp_id in seq: if "(" in chem_comp_id: # ... this is a non-standard chemical component that only has a unique # >1 letter code # ... remove the parentheses and yield the 3-letter code chem_comp_id = chem_comp_id.strip("()") # ... ensure it is contained in the CCD mirror if check_ccd_codes: check_ccd_codes_are_available([chem_comp_id], ccd_mirror_path=ccd_mirror_path, mode="raise") else: chem_comp_id = get_3_from_1_letter_code(chem_comp_id, chain_type=chain_type) seq_with_ccd_ids.append(chem_comp_id) return seq_with_ccd_ids