Source code for atomworks.ml.transforms.covalent_modifications

"""Transforms to handle covalent modifications"""

from __future__ import annotations

from typing import ClassVar

import numpy as np
from biotite.structure import AtomArray

from atomworks.ml.preprocessing.utils.structure_utils import get_inter_pn_unit_bond_mask
from atomworks.ml.transforms._checks import (
    check_atom_array_annotation,
    check_contains_keys,
    check_is_instance,
)
from atomworks.ml.transforms.atomize import AtomizeByCCDName
from atomworks.ml.transforms.base import Transform


def flag_and_reassign_covalent_modifications(atom_array: AtomArray) -> AtomArray:
    """Mark covalent modifications for atomization and reassign the corresponding PN unit annotations.

    For every bond that links a polymer atom to a non-polymer atom, all atoms of the bonded
    polymer residue take over the `pn_unit_id` and `pn_unit_iid` of the non-polymer atom.
    Every atom that then carries that `pn_unit_iid` is flagged with `atomize = True` and
    `is_covalent_modification = True`.

    Args:
        atom_array (AtomArray): Current `AtomArray` within the Transform pipeline. Its
            annotations are modified in place.

    Returns:
        AtomArray: The same `AtomArray` with updated annotations for covalent modifications.
            The `pn_unit_id` and `pn_unit_iid` of the bonded polymer residue's atoms are
            reassigned to those of the non-polymer unit they are bound to, and the `atomize`
            annotation is set to `True` for these atoms. Additionally, the entire pn_unit is
            marked with `is_covalent_modification = True`.

    NOTE: If `atomize` annotation is not present in the `AtomArray`, it will be added.
    NOTE: If `is_covalent_modification` annotation is not present in the `AtomArray`, it will be added.
    NOTE: We do not modify the `is_polymer` annotation, which will still refer to the protein
        chain for the atomized polymer atoms.
    """
    # Select the candidate bonds using the mask from `get_inter_pn_unit_bond_mask`
    # (presumably the bonds whose two atoms belong to different PN units -- see that helper)
    inter_pn_unit_bond_mask = get_inter_pn_unit_bond_mask(atom_array)
    bonds_to_check = atom_array.bonds.as_array()[inter_pn_unit_bond_mask]

    # Filter out bonds that are not between a polymer and a non-polymer PN unit:
    # one atom is a polymer, the other is not => must be a polymer/non-polymer bond
    is_polymer = atom_array.is_polymer
    bonds_to_check = bonds_to_check[is_polymer[bonds_to_check[:, 0]] != is_polymer[bonds_to_check[:, 1]]]

    # Add the boolean flag annotations to the AtomArray, if not already present.
    # `np.zeros(..., dtype=bool)` keeps the dtype boolean even for an empty AtomArray, whereas
    # `np.array([False] * 0)` would silently produce a float64 array.
    for annotation in ("atomize", "is_covalent_modification"):
        if annotation not in atom_array.get_annotation_categories():
            atom_array.set_annotation(annotation, np.zeros(len(atom_array), dtype=bool))

    # Loop through the polymer/non-polymer bonds
    # NOTE: There aren't likely to be many such bonds in the entry, so vectorization is not
    #   necessary and would be less readable
    for bond in bonds_to_check:
        # Get the atoms involved in the bond (snapshots of the annotations at this point)
        atom_a = atom_array[bond[0]]
        atom_b = atom_array[bond[1]]

        # Note which atom is in the polymer and which is in the non-polymer
        polymer_atom, non_polymer_atom = (atom_a, atom_b) if atom_a.is_polymer else (atom_b, atom_a)

        # Create a mask of the atoms in the residue that is covalently bound to the non-polymer PN unit
        # We can uniquely identify a residue by its res_id, pn_unit_iid, and chain_id (or chain_iid, either works)
        polymer_residue_mask = (
            (atom_array.res_id == polymer_atom.res_id)
            & (atom_array.chain_id == polymer_atom.chain_id)
            & (atom_array.pn_unit_iid == polymer_atom.pn_unit_iid)
        )

        # For all atoms in the target polymer residue, set the pn_unit_id and the pn_unit_iid
        # to that of the non-polymer PN unit (the scalar is broadcast over the masked atoms)
        atom_array.pn_unit_id[polymer_residue_mask] = non_polymer_atom.pn_unit_id
        atom_array.pn_unit_iid[polymer_residue_mask] = non_polymer_atom.pn_unit_iid

        # Computed after the reassignment, so the mask now includes all atoms of the bonded
        # polymer residue in addition to the non-polymer PN unit itself
        modification_mask = atom_array.pn_unit_iid == non_polymer_atom.pn_unit_iid

        # Mark the entire PN unit for atomization and as a covalent modification
        atom_array.atomize[modification_mask] = True
        atom_array.is_covalent_modification[modification_mask] = True

    return atom_array
class FlagAndReassignCovalentModifications(Transform):
    """Handles covalent modifications within the AtomArray.

    Covalent modifications, e.g., glycosylation, are handled by the following algorithm::

        for polymer residues with atoms covalently bound to a NON-POLYMER:
            for ALL atoms in the polymer residue:
                set the pn_unit_iid and pn_unit_id identifying annotations to that of the
                NON-POLYMER polymer/non-polymer unit
            set atomize = true (thus, this transform must be run before the Atomize transform)
            set is_covalent_modification = true (for the entire pn_unit)
    """

    # This transform sets the `atomize` flag, so it has to run before the transforms listed here.
    incompatible_previous_transforms: ClassVar[list[str | Transform]] = [AtomizeByCCDName, "AddGlobalTokenIdAnnotation"]

    def check_input(self, data: dict) -> None:
        """Validate that `data` provides what `forward` reads.

        Args:
            data (dict): Pipeline data; must hold an `AtomArray` under the key "atom_array"
                carrying the `pn_unit_id`, `pn_unit_iid` and `is_polymer` annotations.
        """
        check_contains_keys(data, ["atom_array"])
        check_is_instance(data, "atom_array", AtomArray)
        # `is_polymer` is read unconditionally when filtering for polymer/non-polymer bonds, so it is
        # validated here together with the PN unit annotations rather than failing inside `forward`.
        check_atom_array_annotation(data, ["pn_unit_id", "pn_unit_iid", "is_polymer"])

    def forward(self, data: dict) -> dict:
        """Flag covalent modifications in `data["atom_array"]` and reassign their PN unit annotations.

        Args:
            data (dict): Pipeline data holding the `AtomArray` under the key "atom_array".

        Returns:
            dict: The same `data` dict, with "atom_array" set to the result of
                `flag_and_reassign_covalent_modifications`.
        """
        data["atom_array"] = flag_and_reassign_covalent_modifications(data["atom_array"])
        return data