Source code for atomworks.ml.transforms.rdkit_utils

import logging
import time
from typing import Any, ClassVar, Literal

import numpy as np
from biotite.structure import AtomArray
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem, Mol, rdDistGeom

from atomworks.common import default
from atomworks.io.tools.rdkit import (
    add_hydrogens,
    atom_array_from_rdkit,
    atom_array_to_rdkit,
    ccd_code_to_rdkit,
    preserve_annotations,
    remove_hydrogens,
)
from atomworks.ml.transforms._checks import (
    check_atom_array_annotation,
    check_contains_keys,
    check_does_not_contain_keys,
    check_is_instance,
    check_nonzero_length,
)
from atomworks.ml.transforms.base import Transform
from atomworks.ml.utils import timer

logger = logging.getLogger(__name__)
# ... disable RDKit logging
RDLogger.DisableLog("rdApp.*")

# Set default pickle properties to all properties, otherwise
#  annotations get lost when pickling/unpickling molecules
# https://github.com/rdkit/rdkit/issues/1320
Chem.SetDefaultPickleProperties(Chem.PropertyPickleOptions.AllProps)


def _has_explicit_hydrogen(mol: Chem.Mol) -> bool:
    """Check if the molecule has explicit hydrogens."""
    return mol.GetNumAtoms() > mol.GetNumHeavyAtoms()


def _get_random_seed() -> int:
    """Get a random seed for RDKit. This needs to be a python 'int' to play well with RDKit."""
    return int(np.random.randint(0, 2**31 - 1))



[docs]
@preserve_annotations
def generate_conformers(
    mol: Mol,
    *,
    seed: int | None = None,
    n_conformers: int = 1,
    method: str = "ETKDGv3",
    num_threads: int = 1,
    hydrogen_policy: Literal["infer", "remove", "keep", "auto"] = "remove",
    optimize: bool = True,
    attempts_with_distance_geometry: int = 10,
    attempts_with_random_coordinates: int = 10_000,
    **uff_optimize_kwargs: dict,
) -> Mol:
    """
    Generate conformations for the given molecule.

    Args:
        - mol (rdkit.Chem.Mol): The RDKit molecule to generate conformations for.
        - seed (int | None): Random seed for reproducibility. If None, a random seed is used.
        - n_conformers (int): Number of conformations to generate.
        - method (str): The method to use for conformer generation. Default is "ETKDGv3".
            Allowed methods are: "ETDG", "ETKDG", "ETKDGv2", "ETKDGv3", "srETKDGv3"
            See https://rdkit.org/docs/RDKit_Book.html#conformer-generation for details.
        - num_threads (int): Number of threads to use for parallel computation. Default is 1.
        - hydrogen_policy (Literal["infer", "remove", "keep", "auto"]): Whether to add explicit
            hydrogens to the molecule. If "remove", hydrogens are temporarily added for conformer
            generation, but removed again before returning the molecule. If "keep" the molecule is
            used as-is (without adding or removing hydrogens). If "auto", the policy is set to "keep"
            if the molecule already has explicit hydrogens, otherwise it is set to "remove".
            If "infer", we follow the same behavior as "remove," but do not remove added hydrogens
            prior to returning the molecule.
        - optimize (bool): Whether to optimize the generated conformers using UFF.
            Default is True.
        - **uff_optimize_kwargs (dict): Additional keyword arguments for UFF optimization:
            - maxIters (int): Maximum number of iterations (default 200).
            - vdwThresh (float): Used to exclude long-range van der Waals interactions
              (default 10.0).
            - ignoreInterfragInteractions (bool): If True, nonbonded terms between
              fragments will not be added to the forcefield (default True).

    Returns:
        rdkit.Chem.Mol: The molecule with generated conformations.

    Note:
        - Optimizing conformers (optimize_conformers=True) is recommended for obtaining
          more realistic and lower-energy conformations. However, it may increase
          computation time.
        - The ETKDGv3 method is used for conformer generation, which incorporates
          torsion angle preferences and basic knowledge (e.g. aromatic rings are planar)
          for improved accuracy.
        - For macrocycles or complex ring systems, you may need to increase the number
          of conformers generated to ensure good sampling of the conformational space
          (if a representative ensemble of conformers is what you are after).

    Best Practices:
        1. Always add hydrogens before generating conformers unless you have a specific
           reason not to (e.g., you're working with a protein structure where hydrogens
           are already correctly placed).
        2. Use a non-zero seed for reproducibility in research or production environments.
        3. Generate multiple conformers (e.g., 50-100) for flexible molecules to sample
           the conformational space more thoroughly.
        4. Optimize conformers using UFF or MMFF94 for more realistic geometries, especially
           if the conformers will be used for further calculations or analysis.
        5. For very large or complex molecules, you may need to adjust parameters such as
           maxIterations or use more advanced sampling techniques.

    References:
        1. Conformer tutorial: https://rdkit.org/docs/RDKit_Book.html#conformer-generation
        1. RDKit Cookbook: https://www.rdkit.org/docs/Cookbook.html
        2. Riniker and Landrum, "Better Informed Distance Geometry: Using What We Know To
           Improve Conformation Generation", JCIM, 2015.

    """
    # Ensure that all properties are being pickled (needed when we use timeout)
    assert (
        Chem.GetDefaultPickleProperties() == Chem.PropertyPickleOptions.AllProps
    ), "Default pickle properties are not set to all properties. Annotation loss will occur."
    assert attempts_with_distance_geometry > 0, "Attempts with distance geometry must be greater than 0."
    assert attempts_with_random_coordinates > 0, "Attempts with random coordinates must be greater than 0."

    if hydrogen_policy == "auto":
        hydrogen_policy = "keep" if _has_explicit_hydrogen(mol) else "remove"
    if hydrogen_policy in ("infer", "remove"):
        mol = remove_hydrogens(mol)
        # ... temporarily add hydrogens for more realistic conformer generation
        mol = add_hydrogens(mol)
    elif hydrogen_policy == "keep":
        pass
    else:
        raise ValueError(f"Invalid hydrogen policy: {hydrogen_policy}. Must be one of 'infer', 'remove', or 'keep'.")

    # Setup the parameters for the coordinate embedding
    params = getattr(rdDistGeom, method)()
    params.clearConfs = True
    params.randomSeed = seed or _get_random_seed()
    params.enforceChirality = True
    params.useRandomCoords = False
    params.numThreads = num_threads

    # Newer RDKit version have renamed this parameter
    if hasattr(params, "maxAttempts"):
        params.maxAttempts = attempts_with_distance_geometry
    else:
        params.maxIterations = attempts_with_distance_geometry

    try:
        successful_cids = rdDistGeom.EmbedMultipleConfs(mol, numConfs=int(n_conformers), params=params)
        if len(successful_cids) < n_conformers:
            logger.warning(
                f"Initial conformer generation based on distance geometry failed. Successful: {len(successful_cids)}. "
                "Falling back to generating a conformer starting from random coordinates."
            )
            raise RuntimeError("Failed to generate enough conformers.")
    except RuntimeError:
        # Addresses issues with bad conformers, which happens when distance embeddings fail due to
        #  too many constraints or rotatable bonds, see for example:
        # https://github.com/rdkit/rdkit/issues/1433#issuecomment-305097888
        params.useRandomCoords = True
        params.enforceChirality = False
        if hasattr(params, "maxAttempts"):
            params.maxAttempts = attempts_with_random_coordinates
        else:
            params.maxIterations = attempts_with_random_coordinates
        successful_cids = rdDistGeom.EmbedMultipleConfs(mol, numConfs=int(n_conformers), params=params)
        if len(successful_cids) < n_conformers:
            raise RuntimeError(  # noqa: B904
                f"Requested {n_conformers} conformers, but only {mol.GetNumConformers()} were generated."
            )

    if optimize:
        mol = optimize_conformers(mol, **uff_optimize_kwargs)

    mol = remove_hydrogens(mol) if hydrogen_policy == "remove" else mol
    return mol




[docs]
@preserve_annotations
def optimize_conformers(
    mol: Mol,
    numThreads: int = 1,  # noqa: N803
    maxIters: int = 200,  # noqa: N803
    vdwThresh: float = 10.0,  # noqa: N803
    ignoreInterfragInteractions: bool = True,  # noqa: N803
) -> Mol:
    """
    Optimize the conformers of an RDKit molecule.

    Args:
        - mol (Mol): The RDKit molecule to optimize.
        - numThreads (int): Number of threads to use for parallel computation. Default is 1.
        - maxIters (int): Maximum number of iterations for UFF optimization. Defaults to 200.
        - vdwThresh (float): Used to exclude long-range van der Waals interactions. Defaults to 10.0.
        - ignoreInterfragInteractions (bool): If True, nonbonded terms between fragments will not be added to the
            forcefield. Defaults to True.

    Returns:
        Mol: The optimized RDKit molecule.
    """
    success = AllChem.UFFOptimizeMoleculeConfs(
        mol,
        numThreads=numThreads,
        maxIters=maxIters,
        vdwThresh=vdwThresh,
        ignoreInterfragInteractions=ignoreInterfragInteractions,
    )
    if not success:
        logger.warning("Conformer optimization did not converge.")
    return mol




[docs]
def get_chiral_centers(mol: Mol) -> list[int]:
    """Identify and return the tetrahedral chiral centers in an RDKit molecule.

    Finds all tetrahedral chiral centers in the given molecule
    and returns their information, including the chiral center atom index and
    the indices of the atoms bonded to it.

    Args:
        mol (rdkit.Chem.Mol): The RDKit molecule to analyze.

    Returns:
        - list[dict]: A list of dictionaries, where each dictionary contains:
            - "chiral_center_idx" (int): The index of the chiral center atom.
            - "bonded_explicit_atom_idxs" (list[int]): A list of indices of the atoms
              bonded to the chiral center.
            - "chirality" (str): The chirality of the center ('R' or 'S').

    Note:
        This function will generate a 3D conformation if one is not present, as
        chirality assignment requires 3D coordinates in RDKit to break the conditional
        tie between multiple possible chirality centers.
    """
    # Infer 3D coordinates if not present
    if mol.GetNumConformers() == 0:
        generate_conformers(mol, n_conformers=1)

    # Assign chiral tags based on the 3D structure
    Chem.AssignAtomChiralTagsFromStructure(mol)

    # Identify chiral centers
    chiral_centers = Chem.FindMolChiralCenters(mol, includeUnassigned=True)

    def _should_exclude_chiral_center(atom: Chem.Atom) -> bool:
        """Exclude edge cases for chiral centers."""
        # EDGE CASE: The chiral center is a Phosphorous (P, 15) or Sulfur (S, 16) atom bonded to 2 Oxygens (O, 8)
        # For an example of where this case occurs, see the CCD ligand `NAP` (e.g., in `5OCM`)
        if atom.GetAtomicNum() == 15 or atom.GetAtomicNum() == 16:
            # ... get the atomic numbers of the bonded atoms
            atomic_nums_of_bonded_atoms = [bond.GetOtherAtom(atom).GetAtomicNum() for bond in atom.GetBonds()]
            # ... check if there are 2 Oxygens (O, 8) bonded to the chiral center
            if atomic_nums_of_bonded_atoms.count(8) >= 2:
                return True

        return False

    # Filter chiral centers with tetrahedral geometry
    tetrahedral_chiral_centers = []
    for center in chiral_centers:
        idx, chirality = center
        atom = mol.GetAtomWithIdx(idx)
        if (
            atom.GetChiralTag() == Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW
            or atom.GetChiralTag() == Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW
        ):
            chiral_center_atom_name = atom.GetProp("atom_name") if atom.HasProp("atom_name") else None
            if not _should_exclude_chiral_center(atom):
                tetrahedral_chiral_centers.append(
                    {
                        "chiral_center_idx": idx,
                        "bonded_explicit_atom_idxs": [bond.GetOtherAtomIdx(idx) for bond in atom.GetBonds()],
                        "chirality": chirality,
                        # For debugging - currently unused by the pipeline
                        "chiral_center_atom_name": chiral_center_atom_name,
                        "bonded_explicit_atom_names": [
                            mol.GetAtomWithIdx(bond.GetOtherAtomIdx(idx)).GetProp("atom_name")
                            for bond in atom.GetBonds()
                            if chiral_center_atom_name
                        ]
                        if chiral_center_atom_name
                        else None,
                    }
                )

    # Remove chiral centers that are R or S bonded to 2 O

    return tetrahedral_chiral_centers




[docs]
def find_automorphisms_with_rdkit(
    mol: Chem.Mol,
    max_automorphs: int = 1000,
    timeout: float | None = None,
    timeout_strategy: Literal["signal", "subprocess"] = "subprocess",
) -> np.ndarray:
    """
    Find automorphisms of a given RDKit molecule.

    This function identifies the automorphisms (symmetry-related atom swaps) of the input molecule
    and returns them as a numpy array. If the search for automorphisms times out, it returns a single
    automorphism representing the identity (no swaps).

    Args:
        mol (Chem.Mol): The RDKit molecule for which to find automorphisms.
        max_automorphs (int): The maximum number of automorphisms to return. These are deterministically
            set to be the first `max_automorphs` automorphisms found by RDKit.
            For model training it is recommended to deterministically select the automorphisms
            to be used (as done in this transform) as a model might otherwise be nudged towards a specific
            automorph in one training step, but that automorph then does not show up in the next training
            step, leading to a moving target problem.
        timeout (float | None): The timeout for the automorphism search. If None, no timeout is applied and
            the timeout strategy is ignored (no subprocesses will be spawned).
        timeout_strategy (Literal["signal", "subprocess"]): The strategy to use for the timeout.
            Defaults to "subprocess".
    Returns:
        automorphs (np.ndarray): A numpy array of shape [n_automorphs, n_atoms, 2], where each element
            represents an automorphism as list of paired atom indices (from_idx, to_idx).
            If the search fails (e.g. due to running out of memory), returns an array with
            a single automorphism representing the identity (no swaps).

    References:
        - https://sourceforge.net/p/rdkit/mailman/message/27897393/

    Example:
        >>> from openbabel import pybel
        >>> mol = pybel.readstring("smi", "c1c(O)cccc1(O)").OBMol
        >>> automorphisms = find_automorphisms(mol)
        >>> print(automorphisms)
            [[[0 0]
              [1 1]
              [2 2]
              [3 3]
              [4 4]
              [5 5]
              [6 6]
              [7 7]]

             [[0 0]
              [1 6]
              [2 7]
              [3 5]
              [4 4]
              [5 3]
              [6 1]
              [7 2]]]
    """

    # NOTE: We compute the automorphisms via a substructure match. This may not be the computationally most
    #  efficient way, but still works well even for highly symmetric molecules (e.g. 60C).
    #  (c.f. https://sourceforge.net/p/rdkit/mailman/message/27897393/)
    #  The probably optimal way to do this would be to access internal symmetry labels for models
    #  (c.f. https://sourceforge.net/p/rdkit/mailman/message/27902778/)
    #  but this would require using an underlying graph librarly like nauty to determine the automorphisms of
    #  the coloured graph. Until we run into performance issues, we will stick with the current approach.
    @timer.timeout(timeout=timeout, strategy=timeout_strategy)
    def _find_automorphisms() -> tuple:
        return mol.GetSubstructMatches(mol, uniquify=False, maxMatches=max_automorphs, useChirality=False)

    _start = time.time()
    try:
        automorphs_tuple = _find_automorphisms()
    except TimeoutError:
        logger.warning(
            f"Automorphism search timed out after {time.time() - _start:.2f}s. Returning identity automorphism."
        )
        automorphs_tuple = (tuple(range(mol.GetNumAtoms())),)

    # Turn the tuple of automorphisms into a numpy array of shape [n_automorphs, n_atoms, 2]
    automorphs = np.array(automorphs_tuple)
    n_automorphs, n_atoms = automorphs.shape
    identity = np.tile(np.arange(n_atoms), (n_automorphs, 1))

    return np.stack([identity, automorphs], axis=-1)



def _auto_timeout_policy(n_conformers: int, offset: float = 3.0, slope: float = 0.15) -> float:
    return offset + slope * (n_conformers - 1)



[docs]
def sample_rdkit_conformer_for_atom_array(
    atom_array: AtomArray,
    n_conformers: int = 1,
    seed: int | None = None,
    timeout: float | None | tuple[float, float] = (3.0, 0.15),
    timeout_strategy: Literal["signal", "subprocess"] = "subprocess",
    return_mol: bool = False,
    **generate_conformers_kwargs,
) -> AtomArray:
    """Sample a conformer for a Biotite AtomArray using RDKit.

    Args:
        - atom_array: The Biotite AtomArray to sample a conformer for.
        - n_conformers: The number of conformers to sample.
        - timeout: The timeout for conformer generation. If None,
            no timeout is applied. If a tuple, the first element is the offset and the
            second element is the slope.
        - seed: The seed for conformer generation. If None, a random seed
            is generated using the global numpy RNG.
        - timeout_strategy: The strategy to use for the timeout.
            Defaults to "subprocess".
        - **generate_conformers_kwargs: Additional keyword arguments to pass to the
            generate_conformers function.

    Returns:
        - AtomArray: The AtomArray with updated coordinates from the sampled conformer.
        - Chem.Mol: The RDKit molecule with the generated conformer.

    Note:
        This function preserves the original atom order and properties of the input AtomArray.
    """
    seed = seed or _get_random_seed()
    atom_array = atom_array.copy()
    mol = atom_array_to_rdkit(atom_array, hydrogen_policy="remove")
    # ... remove RDKit's conformer if there is one
    if mol.GetNumConformers() > 0:
        mol.RemoveAllConformers()

    timeout = _auto_timeout_policy(n_conformers, *timeout) if isinstance(timeout, tuple) else timeout
    generate_conformers_with_timeout = timer.timeout(timeout=timeout, strategy=timeout_strategy)(generate_conformers)

    set_coord_if_available = True
    try:
        mol = generate_conformers_with_timeout(mol, n_conformers=n_conformers, seed=seed, **generate_conformers_kwargs)
    except (TimeoutError, RuntimeError):
        set_coord_if_available = False
        logger.warning(
            f"Failed to generate {n_conformers} conformers for {atom_array.res_name[0]}. Falling back to zeros."
        )

    new_atom_array = atom_array_from_rdkit(
        mol,
        set_coord_if_available=set_coord_if_available,
        remove_inferred_atoms=True,
        remove_hydrogens=False,
    )

    assert new_atom_array.array_length() == atom_array.array_length()
    assert np.all(new_atom_array.atom_name == atom_array.atom_name)
    assert np.all(new_atom_array.res_id == atom_array.res_id)
    assert np.all(new_atom_array.res_name == atom_array.res_name)
    atom_array.coord = new_atom_array.coord

    if return_mol:
        return atom_array, mol

    return atom_array




[docs]
def ccd_code_to_rdkit_with_conformers(
    ccd_code: str,
    n_conformers: int,
    *,
    seed: int | None = None,
    timeout: float | None | tuple[float, float] = (3.0, 0.15),
    timeout_strategy: Literal["signal", "subprocess"] = "subprocess",
    skip_rdkit_conformer_generation: bool = False,
    **generate_conformers_kwargs,
) -> Chem.Mol:
    """
    Generate an RDKit molecule with conformers for a given residue name.

    This function attempts to generate the specified number of conformers for the given CCD code
    using RDKit's conformer generation (based on ETKDGv3 per default).
    If conformer generation fails or times out, it falls back to using the idealized conformer
    from the CCD entry if one is available.
    Args:
        ccd_code: The CCD code to generate conformers for. E.g. 'ALA' or 'GLY', '9RH' etc.
        n_conformers: The number of conformers to generate for the given CCD code.
        seed: The seed for conformer generation. If None, a random seed
            is generated using the global numpy RNG.
        timeout: The timeout for the automorphism search. If None, no timeout is applied and
            the timeout strategy is ignored (no subprocesses will be spawned). If a tuple,
            the first element is the offset and the second element is the slope.
        timeout_strategy: The strategy to use for the timeout.
            Defaults to "subprocess".
        **generate_conformers_kwargs: Additional keyword arguments to pass to the
            generate_conformers function.

    Returns:
        Chem.Mol: An RDKit molecule with the specified number of conformers.
    """
    # ... get molecule from CCD with its idealized conformer (default conformer 0)
    mol = ccd_code_to_rdkit(ccd_code, hydrogen_policy="remove")

    # ... get idealized conformer from CCD entry
    idealized_conformer = Chem.Conformer(mol.GetConformer(0))  # creates a copy

    # ... try generating `count` conformers within a given time limit
    if not skip_rdkit_conformer_generation:
        timeout = _auto_timeout_policy(n_conformers, *timeout) if isinstance(timeout, tuple) else timeout
        generate_conformers_with_timeout = timer.timeout(timeout=timeout, strategy=timeout_strategy)(
            generate_conformers
        )
        try:
            seed = seed or _get_random_seed()
            mol = generate_conformers_with_timeout(
                mol, n_conformers=n_conformers, seed=seed, **generate_conformers_kwargs
            )
        except (TimeoutError, RuntimeError, Chem.MolSanitizeException) as e:
            logger.warning(
                f"Failed to generate {n_conformers} conformers for {ccd_code=}. Falling back to idealized conformer from the CCD. Error message: {e}"
            )

    # ... if conformer generation fails or is incomplete, return the idealized conformer (set `count` conformers)
    missing_conformers = n_conformers - mol.GetNumConformers()
    if missing_conformers > 0:
        for _ in range(missing_conformers):
            mol.AddConformer(Chem.Conformer(idealized_conformer), assignId=True)

    return mol



# -------------------------------------------------------------------------------------------------
# ---------------------  RDKit related transforms  ------------------------------------------------
# -------------------------------------------------------------------------------------------------



[docs]
class AddRDKitMoleculesForAtomizedMolecules(Transform):
    """
    Add RDKit molecules for atomized molecules in the atom array.

    This transform converts atomized molecules in the atom array to RDKit Mol objects and stores them in the
    `data` dictionary under the "rdkit" key. Each molecule is identified by its `pn_unit_iid`.

    Note:
        This transform requires the `AtomizeByCCDName` transform to be applied previously.

    Args:
        data (dict[str, Any]): A dictionary containing the input data, including the atom array.

    Returns:
        dict[str, Any]: The updated `data` dictionary with the added RDKit molecules under the
            `"rdkit"` key.

    Example:
        >>> data = {
        >>>     "atom_array": AtomArray(...),  # Your atom array here
        >>> }
        >>> transform = AddRDKitMoleculesForAtomizedMolecules()
        >>> data = transform(data)
        >>> print(data["rdkit"])
        {
            'A_1': <rdkit.Chem.rdchem.Mol object at 0x...>,
            'B_1': <rdkit.Chem.rdchem.Mol object at 0x...>,
            ...
        }
    """

    requires_previous_transforms: ClassVar[list[str | Transform]] = ["AtomizeByCCDName"]
    incompatible_previous_transforms: ClassVar[list[str | Transform]] = ["CropContiguousLikeAF3", "CropSpatialLikeAF3"]

    def __init__(self, hydrogen_policy: Literal["infer", "remove", "keep"] = "keep"):
        self.hydrogen_policy = hydrogen_policy


[docs]
    def check_input(self, data: dict[str, Any]) -> None:
        check_does_not_contain_keys(data, ["rdkit"])
        check_atom_array_annotation(data, ["atomize", "pn_unit_iid"])


    def _convert_atom_array_to_rdkit_robust(self, atom_array: AtomArray) -> Mol:
        """Convert an AtomArray to RDKit molecule with error sanitization failure handling"""

        conversion_kwargs = {
            "hydrogen_policy": self.hydrogen_policy,
            "annotations_to_keep": ["chain_id", "res_id", "res_name", "atom_name", "atom_id", "pn_unit_iid"],
            "sanitize": True,
        }

        try:
            # First attempt: try without fixing for better performance
            rdmol = atom_array_to_rdkit(atom_array, attempt_fixing_corrupted_molecules=False, **conversion_kwargs)

            # Check if sanitization actually succeeded
            sanitization_result = Chem.SanitizeMol(rdmol, catchErrors=True)
            if sanitization_result != Chem.SanitizeFlags.SANITIZE_NONE:
                # Molecule failed sanitization, retry with fixing enabled
                logger.warning(
                    f"Molecule {atom_array.res_name[0]} failed sanitization ({sanitization_result}). "
                    "Retrying with attempt_fixing_corrupted_molecules=True."
                )
                rdmol = atom_array_to_rdkit(atom_array, attempt_fixing_corrupted_molecules=True, **conversion_kwargs)

            return rdmol

        except KeyboardInterrupt:
            raise
        except Exception as e:
            logger.error(
                f"Failed to convert molecule {atom_array.res_name[0]} to RDKit: {e!s}. "
                "Trying again and attempting to fix the corrupted molecule."
            )
            return atom_array_to_rdkit(atom_array, attempt_fixing_corrupted_molecules=True, **conversion_kwargs)


[docs]
    def forward(self, data: dict[str, Any]) -> dict[str, Any]:
        atom_array: AtomArray = data["atom_array"]

        # Subset to atomized molecules
        _atom_array = atom_array[atom_array.atomize]

        # Iterate over unique pn_unit_iids
        data["rdkit"] = {}
        for pn_unit_iid in np.unique(_atom_array.pn_unit_iid):
            pn_unit_mask = _atom_array.pn_unit_iid == pn_unit_iid
            molecule = _atom_array[pn_unit_mask]
            rdmol = self._convert_atom_array_to_rdkit_robust(molecule)
            data["rdkit"][pn_unit_iid] = rdmol

        return data





[docs]
class GenerateRDKitConformers(Transform):
    """
    Generate conformers for RDKit molecules stored in the `data["rdkit"]` dictionary.

    This transform generates conformers for each RDKit molecule in the data dictionary and updates
    the molecules with the new conformers. The random seed for conformer generation is derived from
    the global numpy RNG state.

    Args:
        data (dict[str, Any]): A dictionary containing the input data, including RDKit molecules
            under the `"rdkit"` key.
        n_conformers (int): Number of conformations to generate for each molecule. Default is 1.

    Returns:
        dict[str, Any]: The updated `data` dictionary with RDKit molecules containing generated conformers.

    Example:
        >>> data = {
        >>>     "rdkit": {
        >>>         'A_1': <rdkit.Chem.rdchem.Mol object at 0x...>,
        >>>         'B_1': <rdkit.Chem.rdchem.Mol object at 0x...>,
        >>>     }
        >>> }
        >>> transform = GenerateRDKitConformers(n_conformers=3)
        >>> data = transform(data)
        >>> print(data["rdkit"]["A_1"].GetNumConformers())
        3
    """

    requires_previous_transforms: ClassVar[list[str | Transform]] = ["AddRDKitMoleculesForAtomizedMolecules"]

    def __init__(
        self, n_conformers: int = 1, optimize_conformers: bool = True, optimize_kwargs: dict[str, Any] | None = None
    ):
        self.n_conformers = n_conformers
        self.optimize_conformers = optimize_conformers
        self.optimize_kwargs = default(optimize_kwargs, {})


[docs]
    def check_input(self, data: dict[str, Any]) -> None:
        check_contains_keys(data, ["rdkit"])
        check_is_instance(data, "rdkit", dict)
        check_nonzero_length(data, "rdkit")



[docs]
    def forward(self, data: dict[str, Any]) -> dict[str, Any]:
        for pn_unit_iid, rdmol in data["rdkit"].items():
            try:
                # Generate a random seed using numpy's global RNG
                seed = np.random.randint(0, 2**16 - 1)

                rdmol_with_conformers = generate_conformers(
                    rdmol,
                    seed=seed,
                    n_conformers=self.n_conformers,
                    hydrogen_policy="auto",
                    optimize=self.optimize_conformers,
                    optimize_kwargs=self.optimize_kwargs,
                )
                data["rdkit"][pn_unit_iid] = rdmol_with_conformers
            except Exception as e:
                logger.warning(f"Failed to generate conformers for molecule {pn_unit_iid}: {e}")

        return data





[docs]
def get_rdkit_chiral_centers(rdkit_mols: dict[str, Mol]) -> dict:
    """Computes the chiral centers for a dictionary of RDKit molecules.

    See the `GetRDKitChiralCenters` transform for more details.
    """
    chiral_centers = {}

    # Get chiral centers for all rdkit mols
    for resname, rdmol in rdkit_mols.items():
        try:
            # Get the chiral centers (returned are the indices of the chiral center atoms within the `obmol` object)
            chiral_centers[resname] = get_chiral_centers(rdmol)

        except Exception as e:
            logger.warning(f"Failed to find chiral centers for molecule {resname}: {e}")

    return chiral_centers




[docs]
class GetRDKitChiralCenters(Transform):
    """
    Identify chiral centers in the RDKit molecules stored in the `data["rdkit"]` dictionary.
    Returns a dictionary mapping each residue name to a list of chiral centers, e.g:
      data["chiral_centers"] = {
          ...
          "ILE": [
              {'chiral_center_idx': 1, 'bonded_explicit_atom_idxs': [0, 2, 4], 'chirality': 'S'},
              {'chiral_center_idx': 4, 'bonded_explicit_atom_idxs': [1, 5, 6], 'chirality': 'S'}
          ],
          ...
      }
    Each chiral center is a dict with a center atom index, 3 or 4 bonded atom indices, and the
    RDKit-determined chirality.

    Uses RDKit molecules first computed in GetAF3ReferenceMoleculeFeatures.

    Args:
        data (dict[str, Any]): A dictionary containing the input data, including RDKit molecules
            under the `"rdkit"` key.

    Returns:
        dict[str, Any]: The updated `data` dictionary with `chiral_centers` containing chiral
            centers for each molecule.
    """

    requires_previous_transforms: ClassVar[list[str | Transform]] = ["GetAF3ReferenceMoleculeFeatures"]


[docs]
    def check_input(self, data: dict[str, Any]) -> None:
        check_contains_keys(data, ["rdkit"])
        check_is_instance(data, "rdkit", dict)
        check_nonzero_length(data, "rdkit")



[docs]
    def forward(self, data: dict[str, Any]) -> dict[str, Any]:
        data["chiral_centers"] = get_rdkit_chiral_centers(data["rdkit"])

        return data