first remove the per-monomer reordering of atoms, and for now allow the phosphodiester and peptide bonds between monomers to be marked incorrectly

lucidrains · lucidrains · commit dcde4454b9d3 · 2024-07-15T10:26:41.000-07:00
diff --git a/alphafold3_pytorch/alphafold3.py b/alphafold3_pytorch/alphafold3.py
@@ -3351,7 +3351,6 @@ def forward(
         molecule_atom_indices: Int['b n'] | None = None, # the 'token centre atoms' mentioned in the paper, unsure where it is used in the architecture
         num_sample_steps: int | None = None,
         atom_pos: Float['b m 3'] | None = None,
-        output_atompos_indices: Int['b m'] | None = None,
         distance_labels: Int['b n n'] | None = None,
         pae_labels: Int['b n n'] | None = None,
         pde_labels: Int['b n n'] | None = None,
@@ -3603,26 +3602,6 @@ def forward(
             if exists(atom_mask):
                 sampled_atom_pos = einx.where('b m, b m c, -> b m c', atom_mask, sampled_atom_pos, 0.)
 
-            if not exists(output_atompos_indices):
-                return sampled_atom_pos
-
-            # in the case the atoms are passed in not ordered canonically
-
-            order_mask = output_atompos_indices >= 0   # -1 is padding, which means do not order (metal ions, ligands, or entire row if None was passed in)
-
-            output_atompos_indices = einx.where(
-                'b m, b m, m -> b m',
-                order_mask,
-                output_atompos_indices,
-                torch.arange(atom_seq_len, device = self.device)
-            )
-
-            sampled_atom_pos = einx.get_at(
-                'b [m] 3, b rm -> b rm 3',
-                sampled_atom_pos,
-                output_atompos_indices
-            )
-
             return sampled_atom_pos
 
         # if being forced to return loss, but do not have sufficient information to return losses, just return 0
diff --git a/alphafold3_pytorch/inputs.py b/alphafold3_pytorch/inputs.py
@@ -135,7 +135,6 @@ class AtomInput:
     template_mask:              Bool[' t'] | None = None
     msa_mask:                   Bool[' s'] | None = None
     atom_pos:                   Float['m 3'] | None = None
-    output_atompos_indices:     Int[' m'] | None = None
     molecule_atom_indices:      Int[' n'] | None = None
     distogram_atom_indices:     Int[' n'] | None = None
     distance_labels:            Int['n n'] | None = None
@@ -166,7 +165,6 @@ class BatchedAtomInput:
     template_mask:              Bool['b t'] | None = None
     msa_mask:                   Bool['b s'] | None = None
     atom_pos:                   Float['b m 3'] | None = None
-    output_atompos_indices:     Int['b m'] | None = None
     molecule_atom_indices:      Int['b n'] | None = None
     distogram_atom_indices:     Int['b n'] | None = None
     distance_labels:            Int['b n n'] | None = None
@@ -215,7 +213,6 @@ class MoleculeInput:
     templates:                  Float['t n n dt'] | None = None
     msa:                        Float['s n dm'] | None = None
     atom_pos:                   List[Float['_ 3']] | Float['m 3'] | None = None
-    output_atompos_indices:     Int[' m'] | None = None
     template_mask:              Bool[' t'] | None = None
     msa_mask:                   Bool[' s'] | None = None
     distance_labels:            Int['n n'] | None = None
@@ -355,9 +352,13 @@ def molecule_to_atom_input(
             # and not the first biomolecule in the chain, add a single covalent bond between first atom of incoming biomolecule and the last atom  of the last biomolecule
 
             if is_chainable_biomolecule and not is_first_mol_in_chain:
+
+
                 atompair_ids[offset, offset - 1] = 1
                 atompair_ids[offset - 1, offset] = 1
 
+                last_mol = mol
+
     # atom_inputs
 
     atom_inputs: List[Float['m dai']] = []
@@ -444,7 +445,6 @@ class Alphafold3Input:
     resolved_labels:            Int[' n'] | None = None
     add_atom_ids:               bool = False
     add_atompair_ids:           bool = False
-    add_output_atompos_indices: bool = True
     directed_bonds:             bool = False
     extract_atom_feats_fn:      Callable[[Atom], Float['m dai']] = default_extract_atom_feats_fn
     extract_atompair_feats_fn:  Callable[[Mol], Float['m m dapi']] = default_extract_atompair_feats_fn
@@ -814,58 +814,9 @@ def get_num_atoms_per_chain(chains: List[List[Mol]]) -> List[int]:
     molecule_atom_indices = tensor(molecule_atom_indices)
     molecule_atom_indices = pad_to_len(molecule_atom_indices, num_tokens, value = -1)
 
-    # handle atom positions
+    # todo - handle atom positions for a variable number of atoms (e.g. atoms missing from the mmCIF)
 
     atom_pos = i.atom_pos
-    output_atompos_indices = None
-
-    if exists(atom_pos):
-        if isinstance(atom_pos, list):
-            atom_pos = torch.cat(atom_pos, dim = -2)
-
-        assert atom_pos.shape[-2] == total_atoms
-
-        # to automatically reorder the atom positions back to canonical
-
-        if i.add_output_atompos_indices:
-            offset = 0
-
-            reorder_atompos_indices = []
-            output_atompos_indices = []
-
-            for chain in chainable_biomol_entries:
-                for idx, entry in enumerate(chain):
-                    is_last = idx == (len(chain) - 1)
-
-                    mol = entry['rdchem_mol']
-                    num_atoms = mol.GetNumAtoms()
-                    atom_reorder_indices = entry['atom_reorder_indices']
-
-                    if not is_last:
-                        num_atoms -= 1
-                        atom_reorder_indices = atom_reorder_indices[:-1]
-
-                    reorder_back_indices = atom_reorder_indices.argsort()
-
-                    output_atompos_indices.append(reorder_back_indices + offset)
-                    reorder_atompos_indices.append(atom_reorder_indices + offset)
-
-                    offset += num_atoms
-
-            output_atompos_indices = torch.cat(output_atompos_indices, dim = -1)
-            output_atompos_indices = pad_to_length(output_atompos_indices, total_atoms, value = -1)
-
-        # if atom positions are passed in, need to be reordered for the bonds between residues / nucleotides to be contiguous
-        # todo - fix to have no reordering needed (bonds are built not contiguous, just hydroxyl removed)
-
-        if i.reorder_atom_pos:
-            orig_order = torch.arange(total_atoms)
-            reorder_atompos_indices = torch.cat(reorder_atompos_indices, dim = -1)
-            reorder_atompos_indices = pad_to_length(reorder_atompos_indices, total_atoms, value = -1)
-
-            reorder_indices = torch.where(reorder_atompos_indices != -1, reorder_atompos_indices, orig_order)
-
-            atom_pos = atom_pos[reorder_indices]
 
     # create molecule input
 
@@ -880,7 +831,6 @@ def get_num_atoms_per_chain(chains: List[List[Mol]]) -> List[int]:
         additional_token_feats = default(i.additional_token_feats, torch.zeros(num_tokens, 2)),
         is_molecule_types = is_molecule_types,
         atom_pos = atom_pos,
-        output_atompos_indices = output_atompos_indices,
         templates = i.templates,
         msa = i.msa,
         template_mask = i.template_mask,
diff --git a/alphafold3_pytorch/trainer.py b/alphafold3_pytorch/trainer.py
@@ -176,14 +176,6 @@ def collate_inputs_to_batched_atom_input(
 
     batched_atom_input_dict = dict(tuple(zip(keys, outputs)))
 
-    # just ensure output_atompos_indices has full atom_seq_len manually for now
-
-    output_atompos_indices = batched_atom_input_dict.get('output_atompos_indices', None)
-
-    if exists(output_atompos_indices):
-        atom_seq_len = batched_atom_input_dict['atom_inputs'].shape[-2]
-        batched_atom_input_dict.update(output_atompos_indices = pad_or_slice_to(output_atompos_indices, atom_seq_len, dim = -1, pad_value = -1))
-
     # reconstitute dictionary
 
     batched_atom_inputs = BatchedAtomInput(**batched_atom_input_dict)
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "alphafold3-pytorch"
-version = "0.1.120"
+version = "0.1.121"
 description = "Alphafold 3 - Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
diff --git a/tests/test_trainer.py b/tests/test_trainer.py
@@ -63,12 +63,6 @@ def __getitem__(self, idx):
         if random() > 0.5:
             msa_mask = torch.ones((7,)).bool()
 
-        # randomly reorder output atompos indices for testing
-
-        output_atompos_indices = None
-        if random() > 0.5:
-            output_atompos_indices = torch.arange(atom_seq_len).long()
-
         # required for training, but omitted on inference
 
         atom_pos = torch.randn(atom_seq_len, 3)
@@ -94,7 +88,6 @@ def __getitem__(self, idx):
             msa = msa,
             msa_mask = msa_mask,
             atom_pos = atom_pos,
-            output_atompos_indices = output_atompos_indices,
             molecule_atom_indices = molecule_atom_indices,
             distance_labels = distance_labels,
             pae_labels = pae_labels,
@@ -182,7 +175,7 @@ def test_trainer():
 
     # assert can load latest checkpoint by loading from a directory
 
-    trainer.load('./checkpoints')
+    trainer.load('./checkpoints', strict = False)
 
     assert exists(trainer.model_loaded_from_path)