Note
Go to the end to download the full example code.
CIFUtils: User Example#
This example demonstrates how to use the atomworks.io package to load, inspect, and manipulate mmCIF structure files. You’ll see how to parse a structure, visualize it, and perform basic analysis.
"""
## 1.1 Loading from mmCIF
We start by loading a structure from an mmCIF file using `atomworks.io.parse`. This function supports various file formats and allows you to specify options such as which assembly to build, whether to add missing atoms, and more.
"""
import io
from biotite.database import rcsb
from atomworks.io import parse
from atomworks.io.utils.testing import get_pdb_path
from atomworks.io.utils.visualize import view
def get_example_path_or_buffer(pdb_id: str) -> str | io.StringIO:
try:
# ... if file is locally available
return get_pdb_path(pdb_id)
except FileNotFoundError:
# ... otherwise, fetch the file from RCSB
return rcsb.fetch(pdb_id, format="cif")
result_dict = parse(
filename=get_example_path_or_buffer("6lyz"),
build_assembly=["1"],
add_missing_atoms=True,
remove_waters=True,
hydrogen_policy="remove",
model=1,
)
print("Keys in parsed result:", list(result_dict.keys()))
Keys in parsed result: ['extra_info', 'metadata', 'chain_info', 'assemblies', 'ligand_info', 'asym_unit']
"""
## 1.2 Visualizing the asymmetric unit and assembly
You can visualize the asymmetric unit or any assembly using the built-in viewer from `atomworks.io`. This is helpful for quickly inspecting the structure and its components.
"""
asym_unit = result_dict["asym_unit"][0]
asym_unit = asym_unit[asym_unit.occupancy > 0]
view(asym_unit)
assembly = result_dict["assemblies"]["1"][0]
assembly = assembly[assembly.occupancy > 0]
view(assembly)
<py3Dmol.view object at 0x7fbeb2f8b830>
"""
## 1.3 Inspecting structure metadata
The parsed result contains rich metadata, including chain and ligand information, as well as annotation categories. This information is useful for downstream analysis and filtering.
"""
print("Chain info:", result_dict["chain_info"])
print("Ligand info:", result_dict["ligand_info"])
print("Metadata:", result_dict["metadata"])
print("Annotation categories:", result_dict["asym_unit"][0].get_annotation_categories())
Chain info: {'A': {'rcsb_entity': '1', 'chain_type': <ChainType.POLYPEPTIDE_L: 6>, 'unprocessed_entity_canonical_sequence': 'KVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL', 'unprocessed_entity_non_canonical_sequence': 'KVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL', 'is_polymer': True, 'ec_numbers': ['3.2.1.17'], 'res_name': ['LYS', 'VAL', 'PHE', 'GLY', 'ARG', 'CYS', 'GLU', 'LEU', 'ALA', 'ALA', 'ALA', 'MET', 'LYS', 'ARG', 'HIS', 'GLY', 'LEU', 'ASP', 'ASN', 'TYR', 'ARG', 'GLY', 'TYR', 'SER', 'LEU', 'GLY', 'ASN', 'TRP', 'VAL', 'CYS', 'ALA', 'ALA', 'LYS', 'PHE', 'GLU', 'SER', 'ASN', 'PHE', 'ASN', 'THR', 'GLN', 'ALA', 'THR', 'ASN', 'ARG', 'ASN', 'THR', 'ASP', 'GLY', 'SER', 'THR', 'ASP', 'TYR', 'GLY', 'ILE', 'LEU', 'GLN', 'ILE', 'ASN', 'SER', 'ARG', 'TRP', 'TRP', 'CYS', 'ASN', 'ASP', 'GLY', 'ARG', 'THR', 'PRO', 'GLY', 'SER', 'ARG', 'ASN', 'LEU', 'CYS', 'ASN', 'ILE', 'PRO', 'CYS', 'SER', 'ALA', 'LEU', 'LEU', 'SER', 'SER', 'ASP', 'ILE', 'THR', 'ALA', 'SER', 'VAL', 'ASN', 'CYS', 'ALA', 'LYS', 'LYS', 'ILE', 'VAL', 'SER', 'ASP', 'GLY', 'ASN', 'GLY', 'MET', 'ASN', 'ALA', 'TRP', 'VAL', 'ALA', 'TRP', 'ARG', 'ASN', 'ARG', 'CYS', 'LYS', 'GLY', 'THR', 'ASP', 'VAL', 'GLN', 'ALA', 'TRP', 'ILE', 'ARG', 'GLY', 'CYS', 'ARG', 'LEU'], 'res_id': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129'], 'processed_entity_non_canonical_sequence': 'KVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL', 'processed_entity_canonical_sequence': 'KVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL', 'has_sequence_heterogeneity': False}}
Ligand info: {'ligand_of_interest': [], 'has_ligand_of_interest': False}
Metadata: {'id': '6lyz', 'method': 'X-RAY_DIFFRACTION', 'deposition_date': '1975-02-01', 'release_date': '1977-04-12', 'resolution': 2.0, 'extra_metadata': None, 'crystallization_details': {'pH': None}}
Annotation categories: ['chain_id', 'res_id', 'ins_code', 'res_name', 'hetero', 'atom_name', 'element', 'is_aromatic', 'alt_atom_id', 'occupancy', 'b_factor', 'uses_alt_atom_id', 'is_polymer', 'is_backbone_atom', 'chain_type', 'charge', 'stereo', 'pn_unit_id', 'molecule_id', 'chain_entity', 'pn_unit_entity', 'molecule_entity', 'atomic_number']
"""
## 1.4 Manipulating AtomArray
You can easily extract coordinates for specific atoms or chains, and inspect bond information. This is useful for custom analysis or feature extraction.
"""
ca = assembly[(assembly.atom_name == "CA") & (assembly.occupancy > 0)]
print("Coordinates of all resolved CA atoms:", ca.coord.shape)
chain = assembly[assembly.chain_id == "A"]
print("Coordinates of chain A (all heavy atoms):", chain.coord.shape)
print("Bond array:", assembly.bonds.as_array())
Coordinates of all resolved CA atoms: (129, 3)
Coordinates of chain A (all heavy atoms): (1001, 3)
Bond array: [[ 2 9 1]
[ 11 16 1]
[ 18 27 1]
...
[996 997 1]
[997 998 1]
[997 999 1]]
"""
## 1.5 Distance computations
`biotite.structure` provides convenient functions for distance calculations between atoms or sets of atoms. Here we compute distances between C-alpha atoms.
"""
import biotite.structure as struc
distance = struc.distance(ca.coord[0], ca.coord[1])
print(f"Distance between first two C-alpha atoms: {distance:.2f} Å")
distance = struc.distance(ca[0], ca)
print(f"Distances between first C-alpha atom and all other C-alpha atoms: {distance}")
Distance between first two C-alpha atoms: 3.80 Å
Distances between first C-alpha atom and all other C-alpha atoms: [ 0. 3.7999492 6.156354 9.661932 12.549521 14.182252
11.581763 10.84263 14.616771 15.219817 13.272157 14.838497
18.069147 17.901442 17.395086 20.602865 19.68099 21.579292
24.697742 24.088991 26.870222 27.57354 24.496967 23.660084
20.183601 20.09242 20.787966 17.43983 15.200652 17.336142
17.041845 13.243525 13.362299 15.886873 14.705508 11.209983
10.677215 8.0477 5.7841344 5.6595535 6.6849732 9.572078
13.11964 16.576756 20.033554 22.90127 26.54439 27.58783
25.418657 22.863508 19.15349 16.59757 13.387169 10.174317
9.402125 13.177384 14.283987 16.653786 19.467123 20.810495
24.450527 24.489876 21.609152 20.13909 21.382933 19.977352
23.561102 24.538763 25.224674 28.811474 29.968405 27.057043
27.373085 24.488192 25.097178 21.882395 22.806473 19.33622
18.493265 16.118 13.694339 13.188874 11.805438 8.704005
7.6456475 5.4546723 8.908026 9.965934 13.512428 14.592234
13.578032 15.495068 18.250061 18.6872 18.320152 21.14647
23.370226 22.798538 23.847483 26.741693 28.212841 29.969715
28.73933 26.48814 23.56441 25.639193 23.927898 21.194921
22.984303 20.622774 21.259388 24.949894 25.389885 23.332045
22.84652 26.0037 28.647352 27.044004 26.817791 23.237877
23.657808 22.825241 19.110613 19.498287 21.972675 21.360188
19.54008 20.864511 20.917368 ]
"""
## 1.6 Efficient neighbor search with CellList
For efficient spatial queries, use `CellList` to find atoms within a certain radius. This is useful for contact analysis and neighborhood queries.
"""
resolved_atom_array = assembly[assembly.occupancy > 0]
cell_list = struc.CellList(resolved_atom_array, cell_size=5.0)
near_atoms = cell_list.get_atoms(resolved_atom_array[0].coord, radius=4)
print(f"Number of atoms within 7 Å of the first atom: {near_atoms.shape[0]}")
print(f"Atom indices: {near_atoms}")
print(f"Chain IDs: {resolved_atom_array.chain_id[near_atoms]}")
print(f"Residue IDs: {resolved_atom_array.res_id[near_atoms]}")
print(f"Residue names: {resolved_atom_array.res_name[near_atoms]}")
Number of atoms within 7 Å of the first atom: 8
Atom indices: [ 4 5 0 1 9 2 3 315]
Chain IDs: ['A' 'A' 'A' 'A' 'A' 'A' 'A' 'A']
Residue IDs: [ 1 1 1 1 2 1 1 40]
Residue names: ['LYS' 'LYS' 'LYS' 'LYS' 'VAL' 'LYS' 'LYS' 'THR']
Total running time of the script: (0 minutes 1.283 seconds)