Fix OXT parsing bug for amino acids, and add new subsetting options (#155)

amorehead · web-flow · commit 8354dc6c655b · 2024-08-07T08:58:19.000-07:00
* Update inputs.py

* Update inputs.py

* Update amino_acid_constants.py
diff --git a/alphafold3_pytorch/common/amino_acid_constants.py b/alphafold3_pytorch/common/amino_acid_constants.py
@@ -44,7 +44,8 @@
     "CZ2",
     "CZ3",
     "NZ",
-    "OXT",
+    # "OXT",  # NOTE: This often appears in mmCIF files, but it will not be used for any amino acid type in AlphaFold.
+    "_",
     "_",
     "_",
     "_",
diff --git a/alphafold3_pytorch/inputs.py b/alphafold3_pytorch/inputs.py
@@ -2566,6 +2566,13 @@ def pdb_input_to_molecule_input(
     assert len(molecules) == len(missing_atom_indices)
     assert len(missing_token_indices) == num_tokens
 
+    mol_total_atoms = sum([mol.GetNumAtoms() for mol in molecules])
+    num_missing_atom_indices = sum(
+        len(mol_miss_atom_indices) for mol_miss_atom_indices in missing_atom_indices
+    )
+    num_present_atoms = mol_total_atoms - num_missing_atom_indices
+    assert num_present_atoms == int(biomol.atom_mask.sum())
+
     # TODO: install additional token features once MSAs are available
     # 0: f_profile
     # 1: f_deletion_mean
@@ -2645,6 +2652,7 @@ def __init__(
         spatial_interface_weight: float = 0.4,
         crop_size: int = 384,
         training: bool | None = None,  # extra training flag placed by Alex on PDBInput
+        sample_only_pdb_ids: Set[str] | None = None,
         **pdb_input_kwargs,
     ):
         if isinstance(folder, str):
@@ -2660,6 +2668,7 @@ def __init__(
         self.sampler = sampler
         self.sample_type = sample_type
         self.training = training
+        self.sample_only_pdb_ids = sample_only_pdb_ids
         self.pdb_input_kwargs = pdb_input_kwargs
 
         self.cropping_config = {
@@ -2679,6 +2688,12 @@ def __init__(
                 if file in sampler_pdb_ids
             }
 
+        if exists(sample_only_pdb_ids):
+            assert exists(self.sampler), "A sampler must be provided to use `sample_only_pdb_ids`."
+            assert all(
+                pdb_id in sampler_pdb_ids for pdb_id in sample_only_pdb_ids
+            ), "Some PDB IDs in `sample_only_pdb_ids` are not present in the dataset's sampler mappings."
+
         assert len(self) > 0, f"No valid mmCIFs / PDBs found at {str(folder)}"
 
     def __len__(self):
@@ -2690,10 +2705,18 @@ def __getitem__(self, idx: int | str) -> PDBInput:
         sampled_id = None
 
         if exists(self.sampler):
-            if self.sample_type == "clustered":
-                (sampled_id,) = self.sampler.cluster_based_sample(1)
-            else:
-                (sampled_id,) = self.sampler.sample(1)
+            sample_fn = (
+                self.sampler.cluster_based_sample
+                if self.sample_type == "clustered"
+                else self.sampler.sample
+            )
+            (sampled_id,) = sample_fn(1)
+
+            # ensure that the sampled PDB ID is in the specified set of PDB IDs from which to sample
+
+            if exists(self.sample_only_pdb_ids):
+                while sampled_id[0] not in self.sample_only_pdb_ids:
+                    (sampled_id,) = sample_fn(1)
 
         pdb_id, chain_id_1, chain_id_2 = None, None, None
 

-Original file line number
+Diff line change
     "CZ2",
     "CZ3",
     "NZ",
 -    "OXT",
 +    # "OXT",  # NOTE: This often appears in mmCIF files, but it will not be used for any amino acid type in AlphaFold.
 +    "_",
     "_",
     "_",
     "_",