Source code for atomworks.io.utils.non_rcsb

"""
Utility functions for handling non-RCSB CIF files.
Such files do not follow the standard CIF format and thus may require special handling.
"""

__all__ = [
    "get_identity_assembly_gen_category",
    "get_identity_op_expr_category",
    "initialize_chain_info_from_atom_array",
]

import logging
from collections.abc import Sequence

import numpy as np
from biotite.structure import AtomArray
from biotite.structure.io.pdbx import CIFCategory

from atomworks.enums import ChainType
from atomworks.io.constants import (
    AA_LIKE_CHEM_TYPES,
    DNA_LIKE_CHEM_TYPES,
    POLYPEPTIDE_D_CHEM_TYPES,
    POLYPEPTIDE_L_CHEM_TYPES,
    RNA_LIKE_CHEM_TYPES,
)
from atomworks.io.utils.ccd import get_chem_comp_type
from atomworks.io.utils.selection import get_residue_starts
from atomworks.io.utils.sequence import get_1_from_3_letter_code

logger = logging.getLogger("atomworks.io")


def infer_chain_type_from_ccd_codes(ccd_code_seq: Sequence[str]) -> ChainType:
    chain_type_counts = dict.fromkeys(
        [
            "aa_like",
            ChainType.POLYPEPTIDE_D,
            ChainType.POLYPEPTIDE_L,
            ChainType.DNA,
            ChainType.RNA,
            ChainType.NON_POLYMER,
        ],
        0,
    )

    # ... infer the chain type based on each residue
    for res_name in ccd_code_seq:
        chem_comp = get_chem_comp_type(res_name, mode="warn")
        # Increment the count for the appropriate chain type category
        # (All amino acid-like chem types are considered "aa_like")
        if chem_comp in AA_LIKE_CHEM_TYPES:
            chain_type_counts["aa_like"] += 1
            # (We further differentiate between L- and D-polypeptides)
            if chem_comp in POLYPEPTIDE_D_CHEM_TYPES:
                chain_type_counts[ChainType.POLYPEPTIDE_D] += 1
            elif chem_comp in POLYPEPTIDE_L_CHEM_TYPES:
                chain_type_counts[ChainType.POLYPEPTIDE_L] += 1
        # (We differentiate between RNA and DNA)
        elif chem_comp in RNA_LIKE_CHEM_TYPES:
            chain_type_counts[ChainType.RNA] += 1
        elif chem_comp in DNA_LIKE_CHEM_TYPES:
            chain_type_counts[ChainType.DNA] += 1
        # (All other chem types are considered non-polymer)
        else:
            chain_type_counts[ChainType.NON_POLYMER] += 1

    # WARNING: The following logic is heuristic, and may fail in cases of multiple residues types within a chain.

    # ... if we have both RNA and DNA, set the chain type to RNA/DNA hybrid
    if chain_type_counts[ChainType.RNA] > 0 and chain_type_counts[ChainType.DNA] > 0:
        chain_type = ChainType.DNA_RNA_HYBRID

    # ... if we have proteins, set to either L- or D-polypeptide, depending on the counts
    elif chain_type_counts[ChainType.POLYPEPTIDE_L] > 0 or chain_type_counts[ChainType.POLYPEPTIDE_D] > 0:
        # ... if we have equal or more L-polypeptides than D-polypeptides in the chain, set to L-polypeptide
        if chain_type_counts[ChainType.POLYPEPTIDE_L] >= chain_type_counts[ChainType.POLYPEPTIDE_D]:
            chain_type = ChainType.POLYPEPTIDE_L

        # ... if we have more D-polypeptides than L-polypeptides, set to D-polypeptide
        elif chain_type_counts[ChainType.POLYPEPTIDE_L] < chain_type_counts[ChainType.POLYPEPTIDE_D]:
            chain_type = ChainType.POLYPEPTIDE_D

    # ... if we only have "aa_like", default to "polypeptide(L)"
    elif (
        chain_type_counts["aa_like"] > 0
        and chain_type_counts[ChainType.POLYPEPTIDE_L] == 0
        and chain_type_counts[ChainType.POLYPEPTIDE_D] == 0
    ):
        chain_type = ChainType.POLYPEPTIDE_L

    # ... if we have RNA, set to polyribonucleotide
    elif chain_type_counts[ChainType.RNA] > 0:
        chain_type = ChainType.RNA
    # ... if we have DNA, set to polydeoxyribonucleotide
    elif chain_type_counts[ChainType.DNA] > 0:
        chain_type = ChainType.DNA
    # ... otherwise set to non-polymer, if we have non-polymer residues
    elif chain_type_counts[ChainType.NON_POLYMER] > 0:
        chain_type = ChainType.NON_POLYMER
    else:
        raise ValueError(f"Could not infer chain type from residue names: {ccd_code_seq}")

    return chain_type


[docs] def initialize_chain_info_from_atom_array( atom_array: AtomArray, infer_chain_type: bool = True, infer_chain_sequences: bool = True, use_chain_iids: bool = False, ) -> dict: """Infer specified information (chain type and sequence information) from an AtomArray. Required when the chain type or polymer sequence information is not explicitly provided in the CIF file. Such situations may arise when the CIF file is not from the RCSB PDB database (e.g., distillation). WARNING: Use this function for computationally predicted structures or for inference; do not use for files from the RCSB PDB database. In particular, this function adds the following information to the chain_info_dict: - The RCSB entity ID for each chain (e.g., 1, 2, 3, etc.), if present in the AtomArray (under the entity_id atom site label) - The unprocessed one-letter entity canonical and non-canonical sequences. - (OptionallyA boolean flag indicating whether the chain is a polymer. - (Optionally) The chain type as an IntEnum (e.g., polypeptide(L), non-polymer, etc.) - (Optionally) The residue IDs and residue names, inferred from the AtomArray. Args: atom_array (AtomArray): The AtomArray object to infer chain information from. infer_chain_type (bool, optional): Whether to infer the chain type from the AtomArray. Defaults to True. infer_chain_sequences (bool, optional): Whether to infer the chain sequences from the AtomArray. Defaults to True. use_chain_iids (bool, optional): Whether to use the chain_iid annotation rather than the chain_id. Defaults to False. Returns: dict: A dictionary containing the specified chain information as keys """ logger.info( "Could not read ChainType from CIF file, inferring from AtomArray (ensure this is the correct behavior)!" ) chain_info_dict = {} _res_starts = get_residue_starts(atom_array) if use_chain_iids: chain_identifiers = atom_array.chain_iid[_res_starts] else: chain_identifiers = atom_array.chain_id[_res_starts] res_ids = atom_array.res_id[_res_starts] res_names = atom_array.res_name[_res_starts] hetero = atom_array.hetero[_res_starts] # Loop through chains for chain_identifier in np.unique(chain_identifiers): is_in_chain = chain_identifiers == chain_identifier seq = res_names[is_in_chain] # ... EDGE CASE: If all atoms are "HETATM", override the chain type to non-polymer if np.all(hetero[is_in_chain]): chain_type = ChainType.NON_POLYMER else: chain_type = infer_chain_type_from_ccd_codes(seq) processed_entity_non_canonical_sequence = "".join( get_1_from_3_letter_code(ccd_code, chain_type, use_closest_canonical=False) for ccd_code in seq ) processed_entity_canonical_sequence = "".join( get_1_from_3_letter_code(ccd_code, chain_type, use_closest_canonical=True) for ccd_code in seq ) chain_info_dict[chain_identifier] = {} if "label_entity_id" in atom_array.get_annotation_categories(): # (Entity ID is the same for all atoms in the chain) atom_level_chain_identifiers = atom_array.chain_iid if use_chain_iids else atom_array.chain_id chain_info_dict[chain_identifier]["rcsb_entity"] = int( atom_array.label_entity_id[atom_level_chain_identifiers == chain_identifier][0] ) if infer_chain_type: chain_info_dict[chain_identifier]["chain_type"] = chain_type chain_info_dict[chain_identifier]["is_polymer"] = chain_type.is_polymer() if infer_chain_sequences: chain_info_dict[chain_identifier]["res_id"] = list(res_ids[is_in_chain]) chain_info_dict[chain_identifier]["res_name"] = list(res_names[is_in_chain]) chain_info_dict[chain_identifier]["processed_entity_non_canonical_sequence"] = ( processed_entity_non_canonical_sequence ) chain_info_dict[chain_identifier]["processed_entity_canonical_sequence"] = ( processed_entity_canonical_sequence ) return chain_info_dict
[docs] def get_identity_op_expr_category() -> CIFCategory: return CIFCategory.deserialize( """_pdbx_struct_oper_list.id 1 _pdbx_struct_oper_list.type 'identity operation' _pdbx_struct_oper_list.name 1_555 _pdbx_struct_oper_list.symmetry_operation x,y,z _pdbx_struct_oper_list.matrix[1][1] 1.0000000000 _pdbx_struct_oper_list.matrix[1][2] 0.0000000000 _pdbx_struct_oper_list.matrix[1][3] 0.0000000000 _pdbx_struct_oper_list.vector[1] 0.0000000000 _pdbx_struct_oper_list.matrix[2][1] 0.0000000000 _pdbx_struct_oper_list.matrix[2][2] 1.0000000000 _pdbx_struct_oper_list.matrix[2][3] 0.0000000000 _pdbx_struct_oper_list.vector[2] 0.0000000000 _pdbx_struct_oper_list.matrix[3][1] 0.0000000000 _pdbx_struct_oper_list.matrix[3][2] 0.0000000000 _pdbx_struct_oper_list.matrix[3][3] 1.0000000000 _pdbx_struct_oper_list.vector[3] 0.0000000000 """ # noqa: W291 )
[docs] def get_identity_assembly_gen_category(chain_ids: list[str]) -> CIFCategory: return CIFCategory.deserialize( f"""_pdbx_struct_assembly_gen.assembly_id 1 _pdbx_struct_assembly_gen.oper_expression 1 _pdbx_struct_assembly_gen.asym_id_list {",".join(chain_ids)} """ )