Skip to content

Commit 99bc8ed

Browse files
authored
Clean up and revise the PDB filtering/clustering scripts (#121)
* Update filter_pdb_train_mmcifs.py
* Update filter_pdb_val_mmcifs.py
* Update filter_pdb_test_mmcifs.py
* Update biomolecule.py
* Update cluster_pdb_train_mmcifs.py
* Update cluster_pdb_val_mmcifs.py
* Update cluster_pdb_test_mmcifs.py
1 parent: 37f8108 · commit: 99bc8ed

File tree

7 files changed

+736
-2582
lines changed

7 files changed

+736
-2582
lines changed

alphafold3_pytorch/common/biomolecule.py

Lines changed: 14 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -372,9 +372,9 @@ def spatial_crop(
372372
token_res_rep_atom_indices[self.chemtype == 3] = np.where(
373373
self.atom_mask[self.chemtype == 3]
374374
)[1]
375-
token_res_atom_position_mask[np.arange(self.chain_id.size), token_res_rep_atom_indices] = (
376-
True
377-
)
375+
token_res_atom_position_mask[
376+
np.arange(self.chain_id.size), token_res_rep_atom_indices
377+
] = True
378378
token_center_atom_positions = self.atom_positions[token_res_atom_position_mask]
379379

380380
# potentially filter candidate token center atoms by chain ID
@@ -435,8 +435,8 @@ def crop(
435435
spatial_weight: float = 0.4,
436436
spatial_interface_weight: float = 0.4,
437437
n_res: int = 384,
438-
chain_1: Optional[str] = None,
439-
chain_2: Optional[str] = None,
438+
chain_1: str | None = None,
439+
chain_2: str | None = None,
440440
) -> "Biomolecule":
441441
"""Crop a Biomolecule using a randomly-sampled cropping function."""
442442
crop_fn_weights = [contiguous_weight, spatial_weight, spatial_interface_weight]
@@ -602,6 +602,7 @@ def get_unique_res_atom_names(
602602
def _from_mmcif_object(
603603
mmcif_object: mmcif_parsing.MmcifObject,
604604
chain_ids: Optional[Set[str]] = None,
605+
atomize_modified_polymer_residues: bool = False,
605606
) -> Biomolecule:
606607
"""Takes a Biopython structure/model mmCIF object and creates a `Biomolecule` instance.
607608
@@ -619,6 +620,10 @@ def _from_mmcif_object(
619620
:param mmcif_object: The parsed Biopython structure/model mmCIF object.
620621
:param chain_ids: If chain_ids are specified (e.g. A), then only these chains are parsed.
621622
Otherwise all chains are parsed.
623+
:param atomize_modified_polymer_residues: If True, then the atoms of modified
624+
polymer residues are treated as "pseudoresidues". This is useful for
625+
representing modified polymer residues as a collection of (e.g., ligand)
626+
atoms rather than as a single residue.
622627
623628
:return: A new `Biomolecule` created from the structure/model mmCIF object contents.
624629
@@ -673,7 +678,10 @@ def _from_mmcif_object(
673678
restype_idx = residue_constants.restype_order.get(
674679
res_shortname, residue_constants.restype_num
675680
)
676-
if is_polymer_residue:
681+
is_modified_polymer_residue = is_polymer_residue and res_shortname == "X"
682+
if is_polymer_residue and not (
683+
is_modified_polymer_residue and atomize_modified_polymer_residues
684+
):
677685
pos = np.zeros((residue_constants.atom_type_num, 3))
678686
mask = np.zeros((residue_constants.atom_type_num,))
679687
res_b_factors = np.zeros((residue_constants.atom_type_num,))

0 commit comments

Comments
 (0)