Skip to content

Commit a05aecc

Browse files
authored
Add parsing steps described in second paragraph of AF3 supplement Section 2.1 (#71)
* Update data_pipeline.py
* Update biomolecule.py
* Update data_pipeline.py
1 parent 9380be0 commit a05aecc

File tree

2 files changed

+45
-12
lines changed

2 files changed

+45
-12
lines changed

alphafold3_pytorch/common/biomolecule.py

Lines changed: 32 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -298,6 +298,9 @@ def _from_mmcif_object(
298298
ValueError: If insertion code is detected at a residue.
299299
"""
300300
structure = mmcif_object.structure
301+
# Resolve alternative locations for atoms/residues by taking the one with the largest occupancy.
302+
# NOTE: For `DisorderedAtom` objects, selecting the highest-occupancy atom is already the default behavior in Biopython.
303+
# Reference: https://biopython-tutorial.readthedocs.io/en/latest/notebooks/11%20-%20Going%203D%20-%20The%20PDB%20module.html#Disordered-atoms[disordered-atoms]
301304
if isinstance(structure, Model):
302305
model = structure
303306
else:
@@ -333,6 +336,7 @@ def _from_mmcif_object(
333336
f" {res_chem_comp_details.id} in the mmCIF chemical component dictionary for {mmcif_object.file_id}."
334337
)
335338
is_polymer_residue = is_polymer(res_chem_comp_details.type)
339+
is_peptide_residue = "peptide" in res_chem_comp_details.type.lower()
336340
residue_constants = get_residue_constants(res_chem_type=res_chem_comp_details.type)
337341
res_shortname = residue_constants.restype_3to1.get(res.resname, "X")
338342
restype_idx = residue_constants.restype_order.get(
@@ -345,12 +349,36 @@ def _from_mmcif_object(
345349
for atom in res:
346350
if is_polymer_residue and atom.name not in residue_constants.atom_types_set:
347351
continue
348-
pos[residue_constants.atom_order[atom.name]] = atom.coord
349-
mask[residue_constants.atom_order[atom.name]] = 1.0
350-
res_b_factors[residue_constants.atom_order[atom.name]] = atom.bfactor
352+
elif is_peptide_residue and atom.name.upper() == "SE" and res.get_resname() == "MSE":
353+
# Put the coords of the selenium atom in the sulphur column.
354+
pos[residue_constants.atom_order["SD"]] = atom.coord
355+
mask[residue_constants.atom_order["SD"]] = 1.0
356+
res_b_factors[residue_constants.atom_order["SD"]] = atom.bfactor
357+
else:
358+
pos[residue_constants.atom_order[atom.name]] = atom.coord
359+
mask[residue_constants.atom_order[atom.name]] = 1.0
360+
res_b_factors[residue_constants.atom_order[atom.name]] = atom.bfactor
351361
if np.sum(mask) < 0.5:
352362
# If no known atom positions are reported for a polymer residue then skip it.
353363
continue
364+
if is_peptide_residue:
365+
# Fix naming errors in arginine residues where NH2 is incorrectly
366+
# assigned to be closer to CD than NH1
367+
cd = residue_constants.atom_order["CD"]
368+
nh1 = residue_constants.atom_order["NH1"]
369+
nh2 = residue_constants.atom_order["NH2"]
370+
if (
371+
res.get_resname() == "ARG"
372+
and all(mask[atom_index] for atom_index in (cd, nh1, nh2))
373+
and (np.linalg.norm(pos[nh1] - pos[cd]) > np.linalg.norm(pos[nh2] - pos[cd]))
374+
):
375+
pos[nh1], pos[nh2] = pos[nh2].copy(), pos[nh1].copy()
376+
mask[nh1], mask[nh2] = mask[nh2].copy(), mask[nh1].copy()
377+
res_b_factors[nh1], res_b_factors[nh2] = (
378+
res_b_factors[nh2].copy(),
379+
res_b_factors[nh1].copy(),
380+
)
381+
# Collect the residue's features.
354382
restype.append(restype_idx)
355383
chemid.append(res_chem_comp_details.id)
356384
chemtype.append(residue_constants.chemtype_num)
@@ -379,6 +407,7 @@ def _from_mmcif_object(
379407
# into a single ligand residue using indexing operations
380408
# working jointly on chain_index and residue_index.
381409
for atom in res:
410+
# NOTE: This code assumes water residues have previously been filtered out.
382411
pos = np.zeros((residue_constants.atom_type_num, 3))
383412
mask = np.zeros((residue_constants.atom_type_num,))
384413
res_b_factors = np.zeros((residue_constants.atom_type_num,))

alphafold3_pytorch/data/data_pipeline.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""General-purpose data pipeline."""
22

3+
import os
4+
35
from loguru import logger
46
from typing import MutableMapping, Optional, Tuple
57

@@ -111,10 +113,10 @@ def make_mmcif_features(
111113
)
112114
)
113115

114-
# Expand the first bioassembly/model sequence and structure, to obtain a biologically relevant complex (AF3 Supplement, Section 2.1).
116+
# As necessary, expand the first bioassembly/model sequence and structure, to obtain a biologically relevant complex (AF3 Supplement, Section 2.1).
115117
# Reference: https://github.com/biotite-dev/biotite/blob/1045f43f80c77a0dc00865e924442385ce8f83ab/src/biotite/structure/io/pdbx/convert.py#L1441
116118

117-
assembly = get_assembly(_from_mmcif_object(mmcif_object))
119+
assembly = _from_mmcif_object(mmcif_object) if "assembly" in description else get_assembly(_from_mmcif_object(mmcif_object))
118120

119121
mmcif_feats["all_atom_positions"] = assembly.atom_positions
120122
mmcif_feats["all_atom_mask"] = assembly.atom_mask
@@ -138,20 +140,22 @@ def make_mmcif_features(
138140

139141

140142
if __name__ == "__main__":
143+
filepath = "data/pdb_data/mmcifs/ak/7akd-assembly1.cif"
144+
file_id = os.path.splitext(os.path.basename(filepath))[0]
145+
141146
mmcif_object = mmcif_parsing.parse_mmcif_object(
142-
# Load an example mmCIF file that includes
143-
# protein, nucleic acid, and ligand residues.
144-
filepath="data/pdb_data/mmcifs/f7/4f7u.cif",
145-
file_id="4f7u",
147+
filepath=filepath,
148+
file_id=file_id,
146149
)
147150
mmcif_feats, assembly = make_mmcif_features(mmcif_object)
148151
mmcif_string = to_mmcif(
149152
assembly,
150-
file_id="4f7u",
153+
file_id=file_id,
151154
gapless_poly_seq=True,
152155
insert_alphafold_mmcif_metadata=False,
153156
unique_res_atom_names=assembly.unique_res_atom_names,
154157
)
155-
with open("4f7u_reconstructed.cif", "w") as f:
158+
with open(os.path.basename(filepath).replace(".cif", "_reconstructed.cif"), "w") as f:
156159
f.write(mmcif_string)
157-
print("Successfully reconstructed the mmCIF file after assembly expansion.")
160+
161+
print(f"Successfully reconstructed {filepath} after mmCIF featurization.")

0 commit comments

Comments
 (0)