add missing_atom_indices to both Alphafold3Input and MoleculeInput

lucidrains · lucidrains · commit 2a52b2aba1f8 · 2024-07-16T12:36:51.000-07:00
diff --git a/alphafold3_pytorch/inputs.py b/alphafold3_pytorch/inputs.py
@@ -213,6 +213,7 @@ class MoleculeInput:
     token_bonds:                Bool['n n']
     molecule_atom_indices:      List[int | None] | None = None
     distogram_atom_indices:     List[int | None] | None = None
+    missing_atom_indices:       List[Int[' _'] | None] | None = None
     atom_parent_ids:            Int[' m'] | None = None
     additional_token_feats:     Float[f'n dtf'] | None = None
     templates:                  Float['t n n dt'] | None = None
@@ -288,6 +289,24 @@ def molecule_to_atom_input(
     all_num_atoms = tensor([mol.GetNumAtoms() for mol in molecules])
     offsets = exclusive_cumsum(all_num_atoms)
 
+    # handle maybe missing atom indices
+
+    missing_atom_mask = None
+
+    if exists(i.missing_atom_indices) and len(i.missing_atom_indices) > 0:
+
+        missing_atom_mask = []
+
+        for num_atoms, mol_missing_atom_indices in zip(all_num_atoms, i.missing_atom_indices):
+            mol_miss_atom_mask = torch.zeros(num_atoms, dtype = torch.bool)
+
+            if exists(mol_missing_atom_indices) and mol_missing_atom_indices.numel() > 0:
+                mol_miss_atom_mask.scatter_(-1, mol_missing_atom_indices, True)
+
+            missing_atom_mask.append(mol_miss_atom_mask)
+
+        missing_atom_mask = torch.cat(missing_atom_mask)
+
     # handle maybe atompair embeds
 
     atompair_ids = None
@@ -420,6 +439,7 @@ def molecule_to_atom_input(
         molecule_ids = i.molecule_ids,
         molecule_atom_indices = i.molecule_atom_indices,
         distogram_atom_indices = i.distogram_atom_indices,
+        missing_atom_mask = missing_atom_mask,
         additional_token_feats = i.additional_token_feats,
         additional_molecule_feats = i.additional_molecule_feats,
         is_molecule_types = i.is_molecule_types,
@@ -448,6 +468,7 @@ class Alphafold3Input:
     ds_dna:                     List[Int[' _'] | str] = imm_list()
     ds_rna:                     List[Int[' _'] | str] = imm_list()
     atom_parent_ids:            Int[' m'] | None = None
+    missing_atom_indices:       List[List[int] | None] = imm_list()
     additional_token_feats:     Float[f'n dtf'] | None = None
     templates:                  Float['t n n dt'] | None = None
     msa:                        Float['s n dm'] | None = None
@@ -844,10 +865,25 @@ def get_num_atoms_per_chain(chains: List[List[Mol]]) -> List[int]:
     src_tgt_atom_indices = tensor(src_tgt_atom_indices)
     src_tgt_atom_indices = pad_to_len(src_tgt_atom_indices, num_tokens, value = -1, dim = -2)
 
-    # todo - handle atom positions for variable lengthed atoms (eventual missing atoms from mmCIF)
+    # atom positions
 
     atom_pos = i.atom_pos
 
+    # handle missing atom indices
+
+    missing_atom_indices = None
+
+    if exists(i.missing_atom_indices) and len(i.missing_atom_indices) > 0:
+        missing_atom_indices = []
+
+        for mol_miss_atom_indices in i.missing_atom_indices:
+            mol_miss_atom_indices = default(mol_miss_atom_indices, [])
+            mol_miss_atom_indices = tensor(mol_miss_atom_indices, dtype = torch.long)
+
+            missing_atom_indices.append(mol_miss_atom_indices)
+
+        assert len(molecules) == len(missing_atom_indices)
+
     # create molecule input
 
     molecule_input = MoleculeInput(
@@ -860,6 +896,7 @@ def get_num_atoms_per_chain(chains: List[List[Mol]]) -> List[int]:
         additional_molecule_feats = additional_molecule_feats,
         additional_token_feats = default(i.additional_token_feats, torch.zeros(num_tokens, 2)),
         is_molecule_types = is_molecule_types,
+        missing_atom_indices = missing_atom_indices,
         src_tgt_atom_indices = src_tgt_atom_indices,
         atom_pos = atom_pos,
         templates = i.templates,
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "alphafold3-pytorch"
-version = "0.1.129"
+version = "0.1.130"
 description = "Alphafold 3 - Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
diff --git a/tests/test_input.py b/tests/test_input.py
@@ -92,6 +92,7 @@ def test_atompos_input():
 
     train_alphafold3_input = Alphafold3Input(
         proteins = [contrived_protein],
+        missing_atom_indices = [[1, 2], None],
         atom_pos = mock_atompos
     )
 

Original file line number	Diff line number	Diff line change
`@@ -92,6 +92,7 @@ def test_atompos_input():`
`92`	`92`
`93`	`93`	`train_alphafold3_input = Alphafold3Input(`
`94`	`94`	`proteins = [contrived_protein],`
	`95`	`+ missing_atom_indices = [[1, 2], None],`
`95`	`96`	`atom_pos = mock_atompos`
`96`	`97`	`)`
`97`	`98`