Source code for atomworks.io.utils.ccd

import functools
import logging
import os
from collections import defaultdict
from collections.abc import Iterable
from typing import Literal

import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx
import networkx as nx
import numpy as np
import toolz

from atomworks.enums import ChainType, ChainTypeInfo
from atomworks.io.common import exists, immutable_lru_cache
from atomworks.io.constants import (
    AA_LIKE_CHEM_TYPES,
    CCD_MIRROR_PATH,
    DNA_LIKE_CHEM_TYPES,
    DO_NOT_MATCH_CCD,
    NA_LIKE_CHEM_TYPES,
    RNA_LIKE_CHEM_TYPES,
    UNKNOWN_AA,
    UNKNOWN_DNA,
    UNKNOWN_LIGAND,
    UNKNOWN_RNA,
)

logger = logging.getLogger(__name__)


[docs] @functools.cache def aa_chem_comps() -> frozenset[str]: """Set of amino acid chemical components. E.g. {'ALA', 'ARG', ...}""" return frozenset(struc.info.groups._get_group_members(list(AA_LIKE_CHEM_TYPES)))
[docs] @functools.cache def na_chem_comps() -> frozenset[str]: """Set of nucleic acid chemical components. E.g. {'DA', 'DC', ...}""" return frozenset(struc.info.groups._get_group_members(list(NA_LIKE_CHEM_TYPES)))
[docs] @functools.cache def rna_chem_comps() -> frozenset[str]: """Set of RNA chemical components. E.g. {'A', 'C', ...}""" return frozenset(struc.info.groups._get_group_members(list(RNA_LIKE_CHEM_TYPES)))
[docs] @functools.cache def dna_chem_comps() -> frozenset[str]: """Set of DNA chemical components. E.g. {'DA', 'DC', ...}""" return frozenset(struc.info.groups._get_group_members(list(DNA_LIKE_CHEM_TYPES)))
[docs] @functools.cache def chem_comp_to_one_letter() -> dict[str, str]: """Dictionary mapping the chemical components to their 1-letter code. NOTE: Chemical components historically used to be 3-letter codes, but nowadays longer codes exist. """ ccd = struc.info.ccd.get_ccd() three_letter_code = ccd["chem_comp"]["three_letter_code"].as_array() one_letter_code = ccd["chem_comp"]["one_letter_code"].as_array() three_to_one = {} for full, one in zip(three_letter_code, one_letter_code, strict=False): if (len(one) > 1) or (one == "?"): continue if full == "?": continue three_to_one[full] = one return three_to_one
[docs] @functools.cache def get_available_ccd_codes_in_mirror(ccd_mirror_path: os.PathLike = CCD_MIRROR_PATH) -> frozenset[str]: """Set of all CCD codes available in the local mirror. Only counts codes when they adhere to the CCD mirror layout (e.g. .../H/HEM/HEM.cif) """ root = os.fspath(ccd_mirror_path) codes: set[str] = set() # NOTE: The below is an optimized file-system scan since this is run at every with os.scandir(root) as level1: for l1 in level1: if not l1.is_dir(follow_symlinks=False): continue first_letter = l1.name if len(first_letter) != 1: continue with os.scandir(l1.path) as level2: for l2 in level2: if not l2.is_dir(follow_symlinks=False): continue code = l2.name if not code or code[0] != first_letter: continue expected = os.path.join(l2.path, f"{code}.cif") if os.path.isfile(expected): codes.add(code) return frozenset(codes)
[docs] @functools.cache def get_available_ccd_codes_in_biotite() -> frozenset[str]: """Set of all CCD codes available in Biotite's built-in Chemical Component Dictionary.""" return frozenset(struc.info.ccd.get_ccd()["chem_comp"]["id"].as_array())
[docs] @functools.cache def get_available_ccd_codes(ccd_mirror_path: os.PathLike | None = CCD_MIRROR_PATH) -> frozenset[str]: """Returns a frozenset of all CCD codes available. If a mirror path is provided, it will be used to check the local mirror first. Otherwise, Biotite's built-in CCD will be used. """ mirror_codes = get_available_ccd_codes_in_mirror(ccd_mirror_path) if ccd_mirror_path else frozenset() biotite_codes = get_available_ccd_codes_in_biotite() return mirror_codes | biotite_codes
[docs] def get_ccd_component_from_biotite(ccd_code: str, **parse_ccd_cif_kwargs) -> struc.AtomArray: """ Retrieves a component from the Chemical Component Dictionary using Biotite's built-in functionality. Args: - ccd_code (str): The three-letter code of the chemical component to retrieve. Returns: - AtomArray: The atomic structure of the requested component. """ try: block = _filter_biotite_ccd_for_ccd_code(ccd_code) atom_array = parse_ccd_cif(block, **parse_ccd_cif_kwargs) return atom_array except KeyError: raise ValueError(f"No atom information found for residue '{ccd_code}' in Biotite's CCD") from None
[docs] def check_ccd_codes_are_available( ccd_codes: Iterable[str], ccd_mirror_path: os.PathLike = CCD_MIRROR_PATH, mode: Literal["warn", "raise"] = "warn" ) -> bool: """Checks if the provided CCD codes are available in the local mirror.""" available_ccds = get_available_ccd_codes(ccd_mirror_path) invalid_ccds = set(ccd_codes) - available_ccds if invalid_ccds: which_mirror = "Biotite's built-in CCD" if ccd_mirror_path is None else f"the local mirror at {ccd_mirror_path}" if mode == "warn": logger.warning(f"The following CCD codes were not found in {which_mirror}: {invalid_ccds}") elif mode == "raise": raise ValueError(f"The following CCD codes were not found in {which_mirror}: {invalid_ccds}") return not bool(invalid_ccds)
def _get_ccd_path(ccd_code: str, ccd_mirror_path: os.PathLike = CCD_MIRROR_PATH) -> os.PathLike: """ Constructs the file path for a Chemical Component Dictionary entry in the local mirror. Args: - ccd_code (str): The three-letter code of the chemical component. - ccd_mirror_path (os.PathLike): Path to the root of the CCD mirror directory. Returns: - os.PathLike: Full path to the component's CIF file. """ return os.path.join(ccd_mirror_path, ccd_code[0], ccd_code, ccd_code + ".cif") def _filter_biotite_ccd_for_ccd_code(ccd_code: str) -> pdbx.CIFBlock: """Filter the Biotite CCD for a given CCD code.""" if ccd_code not in get_available_ccd_codes_in_biotite(): raise KeyError(f"CCD code `{ccd_code}` not found in Biotite's CCD") ccd = struc.info.get_ccd() # Chem comp chem_comp = ccd.get("chem_comp") chem_comp = pdbx.convert._filter(chem_comp, chem_comp["id"].as_array() == ccd_code) # Chem comp atom chem_comp_atom = ccd.get("chem_comp_atom") chem_comp_atom = pdbx.convert._filter(chem_comp_atom, chem_comp_atom["comp_id"].as_array() == ccd_code) # Chem comp bond chem_comp_bond = ccd.get("chem_comp_bond") chem_comp_bond = pdbx.convert._filter(chem_comp_bond, chem_comp_bond["comp_id"].as_array() == ccd_code) return pdbx.CIFBlock( { "chem_comp": chem_comp, "chem_comp_atom": chem_comp_atom, "chem_comp_bond": chem_comp_bond, } )
[docs] def parse_ccd_cif( cif: pdbx.CIFFile, coords: Literal["model", "ideal_pdbx", "ideal_rdkit"] | None | tuple[str, ...] = ( "ideal_pdbx", "model", "ideal_rdkit", ), add_properties: bool = False, add_mapping: bool = False, ) -> struc.AtomArray: """ Parses a Chemical Component Dictionary CIF file into a Biotite AtomArray structure. Args: - cif (CIFFile): The CIF file containing the component data. - coords (Literal["model", "ideal_pdbx", "ideal_rdkit"] | None | tuple[str, ...]): Type of coordinates to use. Defaults to ("ideal_pdbx", "model", "ideal_rdkit"). Can be a single coordinate type or a tuple of fallback preferences (e.g., ("ideal_pdbx", "model", "ideal_rdkit")). - "model": Use the coordinates that are found in a random (but fixed) pdb file. - "ideal_pdbx": Use the idealized coordinates computed by the RCSB PDB (sometimes not available). - "ideal_rdkit": Use the idealized coordinates computed by RDKit (sometimes unrealistic). - add_properties (bool): Whether to include RDKit-computed properties. Defaults to False. Properties are available under the `properties` attribute of the returned `AtomArray`. - add_mapping (bool): Whether to include external resource mappings, such as e.g. the ChEMBL ID. Defaults to False. Mappings are available under the `mapping` attribute of the returned `AtomArray`. Returns: - AtomArray: The parsed atomic structure with requested annotations and properties. Example: >>> cif = pdbx.CIFFile.read("path/to/ALA.cif") >>> atom_array = parse_ccd_cif(cif, coords="ideal_pdbx") >>> # With fallback preferences: >>> atom_array = parse_ccd_cif(cif, coords=["ideal_pdbx", "model", "ideal_rdkit"]) """ # Convert single value or list to tuple for uniform processing and hashability if isinstance(coords, str): coord_types = (coords,) elif isinstance(coords, list | tuple): coord_types = tuple(coords) else: coord_types = (coords,) if coords is not None else (None,) valid_types = ("model", "ideal_pdbx", "ideal_rdkit", None) # Validate all coord types for coord_type in coord_types: if coord_type not in valid_types: raise ValueError( f"Invalid coordinate type: {coord_type}. Must be one of 'model', 'ideal_pdbx', 'ideal_rdkit' or `None`." ) block = pdbx.convert._get_block(cif, None) # Extract metadata metadata = block.get("chem_comp") ccd_code = metadata["id"].as_item() # Extract atom specific information atom_data = block.get("chem_comp_atom") # Initialize the empty array: atoms = struc.AtomArray(atom_data.row_count) # Fill standard annotations as_bool = lambda x: np.where(x.as_array(str) == "Y", True, False) # noqa: E731 atoms.set_annotation("res_name", atom_data.get("comp_id").as_array(str)) atoms.set_annotation("atom_name", atom_data.get("atom_id").as_array(str)) atoms.set_annotation("alt_atom_id", atom_data.get("alt_atom_id").as_array(str)) atoms.set_annotation("element", atom_data.get("type_symbol").as_array(str)) atoms.set_annotation("charge", atom_data.get("charge").as_array(np.int8)) atoms.set_annotation("stereo", atom_data.get("pdbx_stereo_config").as_array(str)) atoms.set_annotation("is_aromatic", as_bool(atom_data.get("pdbx_aromatic_flag"))) atoms.set_annotation("is_leaving_atom", as_bool(atom_data.get("pdbx_leaving_atom_flag"))) atoms.set_annotation("is_backbone_atom", as_bool(atom_data.get("pdbx_backbone_atom_flag"))) atoms.set_annotation("is_n_terminal_atom", as_bool(atom_data.get("pdbx_n_terminal_atom_flag"))) atoms.set_annotation("is_c_terminal_atom", as_bool(atom_data.get("pdbx_c_terminal_atom_flag"))) atoms.set_annotation("res_id", np.full(len(atoms), 1)) # We 1-index residue IDs to be consistent with RCSB # Try setting hetero flag hetero = ccd_code not in struc.info.atoms.NON_HETERO_RESIDUES atoms.set_annotation("hetero", [hetero] * len(atoms)) # Define coordinate columns for each type coordinate_columns = { "model": ["model_Cartn_x", "model_Cartn_y", "model_Cartn_z"], "ideal_pdbx": ["pdbx_model_Cartn_x_ideal", "pdbx_model_Cartn_y_ideal", "pdbx_model_Cartn_z_ideal"], "ideal_rdkit": ["Cartn_x_rdkit", "Cartn_y_rdkit", "Cartn_z_rdkit"], } # Try each coordinate type until one works coords_set = False for coord_type in coord_types: if coord_type is None: # Skip if None is explicitly requested in the preference list continue try: if (coord_type == "ideal_rdkit") and (rdkit_data := block.get("pdbe_chem_comp_rdkit_conformer")): # Special case for rdkit as it uses a different dataset rdkit_data = block.get("pdbe_chem_comp_rdkit_conformer") assert np.all(rdkit_data["atom_id"].as_array(str) == atoms.get_annotation("atom_name")) for i, col in enumerate(coordinate_columns[coord_type]): atoms.coord[:, i] = rdkit_data[col].as_array(np.float32) else: # Standard case for model and ideal_pdbx for i, col in enumerate(coordinate_columns[coord_type]): atoms.coord[:, i] = atom_data[col].as_array(np.float32) # Check if the coordinates are valid (not all zeros/NaN) if np.all(atoms.coord == 0) or np.all(np.isnan(atoms.coord)): logger.debug( f"Coordinate type '{coord_type}' for '{ccd_code}' contains only zeros/NaN, trying next option" ) continue coords_set = True # If we're not using the first preference, log a warning if coord_type != coord_types[0]: logger.warning( f"Using fallback coordinate type '{coord_type}' for '{ccd_code}' instead of '{coord_types[0]}'" ) break except (KeyError, AssertionError): # Continue to next coordinate type if this one fails logger.debug(f"Coordinate type '{coord_type}' not available for '{ccd_code}', trying next option") continue # Log warning if no coordinates were set if not coords_set and coord_types and coord_types[0] is not None: logger.warning( f"No suitable coordinates found for '{ccd_code}' among preferences {coord_types}. Coordinates will be 'nan'." ) atoms.coord = np.full((len(atoms), 3), np.nan) # Extract bond data try: bond_data = block.get("chem_comp_bond") if bond_data is not None: bond_dict = pdbx.convert._parse_intra_residue_bonds(bond_data) atoms.bonds = struc.connect_via_residue_names(atoms, custom_bond_dict=bond_dict) except KeyError: atoms.bonds = None logger.warning(f"No bond data found for `{ccd_code}`. Bonds will be `None`.") # Set general annotations: if add_properties: try: atoms.properties = toolz.valmap(lambda x: x.as_item(), dict(block["pdbe_chem_comp_rdkit_properties"])) except KeyError: logger.warning(f"No properties data found for `{ccd_code}`. Properties will be `None`.") atoms.properties = None if add_mapping: try: mapping = block.get("pdbe_chem_comp_external_mappings") atoms.mapping = dict(zip(mapping["resource"], mapping["resource_id"], strict=True)) except KeyError: atoms.mapping = None logger.warning(f"No mapping data found for `{ccd_code}`. Mapping will be `None`.") return atoms
[docs] @immutable_lru_cache(maxsize=20000, deepcopy=True) def get_ccd_component_from_mirror( ccd_code: str, ccd_mirror_path: os.PathLike = CCD_MIRROR_PATH, **parse_ccd_cif_kwargs ) -> struc.AtomArray: """ Retrieves and parses a component from a local mirror of the Chemical Component Dictionary. Args: - ccd_code (str): The three-letter code of the chemical component. - ccd_mirror_path (os.PathLike): Path to the root of the CCD mirror directory. - **parse_ccd_cif_kwargs: Additional keyword arguments passed to parse_ccd_cif(): - coords (Literal["model", "ideal_pdbx", "ideal_rdkit"] | None): Type of coordinates to use. Defaults to "ideal_pdbx". - add_properties (bool): Whether to include RDKit-computed properties. Defaults to True. - add_mapping (bool): Whether to include external resource mappings, such as e.g. the ChEMBL ID. Defaults to False. Returns: - AtomArray: The parsed atomic structure of the requested component. Example: >>> atom_array = get_ccd_component_from_mirror("ALA", coords="ideal_pdbx") """ cif = pdbx.CIFFile.read(_get_ccd_path(ccd_code, ccd_mirror_path)) atom_array = parse_ccd_cif(cif, **parse_ccd_cif_kwargs) return atom_array
[docs] @immutable_lru_cache(maxsize=200, deepcopy=True) def atom_array_from_ccd_code( ccd_code: str, ccd_mirror_path: os.PathLike = CCD_MIRROR_PATH, **parse_ccd_cif_kwargs ) -> struc.AtomArray: """Retrieves and parses a component from the Chemical Component Dictionary. First attempts to retrieve the component from a local mirror if provided and the code exists there. Falls back to Biotite's built-in CCD if the code is not found in the local mirror or no mirror path is provided. Args: ccd_code: The three-letter code of the chemical component. ccd_mirror_path: Path to the root of the CCD mirror directory. **parse_ccd_cif_kwargs: Additional keyword arguments passed to parse_ccd_cif(): coords: Type of coordinates to use ("model", "ideal_pdbx", "ideal_rdkit", or None). Defaults to "ideal_pdbx". add_properties: Whether to include RDKit-computed properties. Defaults to True. add_mapping: Whether to include external resource mappings. Defaults to False. Returns: struc.AtomArray: The parsed atomic structure of the requested component. Raises: ValueError: If the CCD code is not found in the local mirror or Biotite's built-in CCD. Example: >>> atom_array = atom_array_from_ccd_code("ALA") """ if ccd_mirror_path and ccd_code in get_available_ccd_codes_in_mirror(ccd_mirror_path): return get_ccd_component_from_mirror(ccd_code, ccd_mirror_path, **parse_ccd_cif_kwargs) else: return get_ccd_component_from_biotite(ccd_code, **parse_ccd_cif_kwargs)
def _find_connected_components_after_removal(graph: nx.Graph, node_to_remove: int) -> list[list[int]]: """ Identifies connected components that would form after removing a node from a graph. Args: - graph (nx.Graph): The input graph. - node_to_remove (int): The node to hypothetically remove. Returns: - list[list[int]]: List of lists containing node indices in each new component. """ # Get the neighbors before removal neighbors = set(graph.neighbors(node_to_remove)) if not neighbors: return [] # Create subgraph without the node subgraph = graph.subgraph(set(graph.nodes) - {node_to_remove}) # Use BFS from any neighbor to find components components = [] unvisited = neighbors.copy() while unvisited: start = unvisited.pop() if start not in subgraph: continue # Find all nodes reachable from this neighbor component = list(nx.bfs_tree(subgraph, start)) components.append(component) unvisited -= set(component) return components
[docs] @functools.cache def get_chem_comp_leaving_atom_names( ccd_code: str, ccd_mirror_path: os.PathLike = CCD_MIRROR_PATH, mode: Literal["warn", "raise"] = "warn" ) -> dict[str, tuple[str, ...]]: """ Computes the canonical leaving groups for a given CCD entry based on the PDBs annotation of leaving atoms. The returned dictionary maps the name of the atom to the names of the atoms that would become disconnected if the atom were removed. Example: >>> get_chem_comp_leaving_atom_names("ALA") {'N': ('H2',), 'C': ('OXT', 'HXT'), 'OXT': ('HXT',)} """ # Skip CCD lookup for codes that shouldn't be matched against CCD (e.g., UNL, water-like codes) if ccd_code in DO_NOT_MATCH_CCD: if mode == "warn": logger.debug(f"Skipping CCD lookup for `{ccd_code}` as it's in DO_NOT_MATCH_CCD") return {} try: chem_comp = atom_array_from_ccd_code(ccd_code, ccd_mirror_path) except (ValueError, AttributeError) as e: if mode == "warn": logger.warning(f"Failed to compute leaving groups for `{ccd_code}`: {e}") elif mode == "raise": raise ValueError(f"Failed to compute leaving groups for `{ccd_code}`: {e}") from e return {} if "is_leaving_atom" not in chem_comp.get_annotation_categories(): if mode == "warn": logger.warning( f"No 'is_leaving_atom' annotation found for `{ccd_code}`. " "Cannot compute leaving groups, returning empty dictionary. " "Check if your CCD mirror is up to date." ) elif mode == "raise": raise ValueError( f"No 'is_leaving_atom' annotation found for `{ccd_code}`. " "Cannot compute leaving groups. Check if your CCD mirror is up to date." ) return {} # ... initialize output leaving_atom_names = defaultdict(list) # ... get relevant annotations is_leaving_atom = chem_comp.get_annotation("is_leaving_atom") atom_name = chem_comp.get_annotation("atom_name") element = chem_comp.get_annotation("element") # ... skip if no atoms are annotated as leaving atoms (majority of CCD entries) if not any(is_leaving_atom): return {} # ... compute the leaving groups based on the bond graph and annotation bond_graph = chem_comp.bonds.as_graph() for atom_idx in range(chem_comp.array_length()): # ... find the connected groups of atoms if the current atom were removed connected_groups = _find_connected_components_after_removal(bond_graph, atom_idx) # ... check if all atoms in the connected group are flagged as leaving atoms # by the CCD entry for connected_group in connected_groups: heavy_atoms: list[int] = list(filter(lambda x: element[x] != "H", connected_group)) is_leaving_group = ( all(is_leaving_atom[heavy_atoms]) if len(heavy_atoms) > 0 else all(is_leaving_atom[connected_group]) ) if is_leaving_group: leaving_atom_names[atom_name[atom_idx]] += [atom_name[idx] for idx in connected_group] # ... turn leaving_atom_names into a dictionary of tuples leaving_atom_names = {k: tuple(v) for k, v in leaving_atom_names.items()} return leaving_atom_names
@functools.cache def _chem_comp_type_dict() -> dict[str, str]: """Get a dictionary of all residue names and their corresponding chemical component types. Example: >>> _chem_comp_type_dict()["ALA"] 'L-PEPTIDE LINKING' """ ccd = struc.info.ccd.get_ccd() # NOTE: biotite caches this internally chem_comp_ids = np.char.upper(ccd["chem_comp"]["id"].as_array()) chem_comp_types = np.char.upper(ccd["chem_comp"]["type"].as_array()) return dict(zip(chem_comp_ids, chem_comp_types, strict=True))
[docs] def get_chem_comp_type(ccd_code: str, mode: Literal["warn", "raise"] = "warn") -> str: """Get the chemical component type for a CCD code from the Chemical Component Dictionary (CCD). Can be combined with CHEM_TYPES from `atomworks.io_biotite.constants` to determine if a component is a protein, nucleic acid, or carbohydrate. Args: ccd_code (str): The CCD code for the component. E.g. `ALA` for alanine, `NAP` for N-acetyl-D-glucosamine. mode (Literal["warn", "raise"]): How to handle unknown chemical component types. Example: >>> get_chem_comp_type("ALA") 'L-PEPTIDE LINKING' """ chem_comp_type = _chem_comp_type_dict().get(ccd_code, None) # ... handle unknown chemical component types if not exists(chem_comp_type): if mode == "raise": # ... raise an error if we want to fail loudly raise ValueError(f"Chemical component type for `{ccd_code=}` not found in CCD.") elif mode == "warn": # ... otherwise set chemical component type to "other" - the equivalent of unknown. logger.info(f"Chemical component type for `{ccd_code=}` not found in CCD. Using 'other'.") chem_comp_type = "OTHER" return chem_comp_type
[docs] def get_chain_type_from_chem_comp_type(chem_comp_type: str) -> ChainType: """Get the ChainType enum corresponding to a chemical component type.""" return ChainTypeInfo.CHEM_COMP_TYPE_TO_ENUM.get(chem_comp_type, ChainType.OTHER_POLYMER)
[docs] def get_chain_type_from_ccd_code(ccd_code: str) -> ChainType: """Get the ChainType enum corresponding to a CCD code.""" return get_chain_type_from_chem_comp_type(get_chem_comp_type(ccd_code))
[docs] def get_unknown_ccd_code_for_chem_comp_type(chem_comp_type: str) -> str: """Get the CCD code for an unknown chemical component type.""" if chem_comp_type in AA_LIKE_CHEM_TYPES: return UNKNOWN_AA elif chem_comp_type in DNA_LIKE_CHEM_TYPES: return UNKNOWN_DNA elif chem_comp_type in RNA_LIKE_CHEM_TYPES: return UNKNOWN_RNA else: return UNKNOWN_LIGAND
[docs] def get_std_to_alt_atom_name_map(ccd_code: str, ccd_mirror_path: os.PathLike = CCD_MIRROR_PATH) -> dict[str, str]: """Get a map from standard atom names to alternative atom names.""" chem_comp = atom_array_from_ccd_code(ccd_code, ccd_mirror_path) return dict(zip(chem_comp.atom_name, chem_comp.alt_atom_id, strict=True))