Commit 0a8bae5

committed
address a few issues and fix molecule_ids not accounting for ligands having one token per atom
1 parent 08d4c55 commit 0a8bae5

File tree

7 files changed: +21 −10 lines


README.md

Lines changed: 2 additions & 0 deletions
@@ -30,6 +30,8 @@ A fork with full Lightning + Hydra support is being maintained by <a href="https
 
 - <a href="https://github.com/milot-mirdita">Milot</a> for optimizing the PDB dataset clustering script!
 
+- <a href="https://github.com/amorehead">Alex</a> for basically writing the entire gargantuan flow from parsing the PDB all the way to the molecule and atomic inputs for training
+
 - <a href="https://github.com/patrick-kidger">Patrick</a> for <a href="https://docs.kidger.site/jaxtyping/">jaxtyping</a>, <a href="https://github.com/fferflo">Florian</a> for <a href="https://github.com/fferflo/einx">einx</a>, and of course, <a href="https://github.com/arogozhnikov">Alex</a> for <a href="https://einops.rocks/">einops</a>
 
 ## Install

alphafold3_pytorch/alphafold3.py

Lines changed: 3 additions & 2 deletions
@@ -3381,7 +3381,7 @@ def forward(
         return_present_sampled_atoms: bool = False,
         num_rollout_steps: int = 20,
         rollout_show_tqdm_pbar: bool = False
-    ) -> Float['b m 3'] | Float['l 3'] | Float[''] | Tuple[Float[''], LossBreakdown]:
+    ) -> Float['b m 3'] | List[Float['l 3']] | Float[''] | Tuple[Float[''], LossBreakdown]:
 
         atom_seq_len = atom_inputs.shape[-2]
 

@@ -3624,8 +3624,9 @@ def forward(
 
         if exists(atom_mask):
             sampled_atom_pos = einx.where('b m, b m c, -> b m c', atom_mask, sampled_atom_pos, 0.)
+
         if exists(missing_atom_mask) and return_present_sampled_atoms:
-            sampled_atom_pos = sampled_atom_pos[~missing_atom_mask]
+            sampled_atom_pos = [one_sampled_atom_pos[~one_missing_atom_mask] for one_sampled_atom_pos, one_missing_atom_mask in zip(sampled_atom_pos, missing_atom_mask)]
 
         return sampled_atom_pos
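Why the return annotation changes from a single tensor to a list: boolean-indexing a batched coordinate tensor with a batched mask collapses the batch dimension into one flat tensor, so dropping missing atoms per sample requires ragged per-sample tensors. A minimal sketch with toy shapes (the sizes and values below are illustrative, not from the repository):

```python
import torch

# Toy shapes only - illustrates why the commit switches from tensor indexing
# to a per-sample list. Boolean-indexing a (b, m, 3) tensor with a (b, m)
# mask flattens every sample into one (l, 3) tensor, losing batch boundaries.
sampled_atom_pos = torch.randn(2, 5, 3)            # (batch, atoms, xyz)
missing_atom_mask = torch.tensor([
    [False, False, True, True, True],              # sample 0 keeps 2 atoms
    [False, True, True, True, True],               # sample 1 keeps 1 atom
])

flattened = sampled_atom_pos[~missing_atom_mask]   # shape (3, 3): samples merged

# Indexing each sample separately preserves boundaries, one ragged (l, 3)
# tensor per batch element - hence the List[Float['l 3']] annotation
per_sample = [pos[~mask] for pos, mask in zip(sampled_atom_pos, missing_atom_mask)]
assert [t.shape[0] for t in per_sample] == [2, 1]
```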

alphafold3_pytorch/inputs.py

Lines changed: 2 additions & 3 deletions
@@ -707,7 +707,8 @@ def alphafold3_input_to_molecule_input(alphafold3_input: Alphafold3Input) -> Mol
        (mol_from_smile(ligand) if isinstance(ligand, str) else ligand) for ligand in ligands
    ]
 
-    molecule_ids.append(tensor([ligand_id] * len(mol_ligands)))
+    for mol_ligand in mol_ligands:
+        molecule_ids.append(tensor([ligand_id] * mol_ligand.GetNumAtoms()))
 
    # create the molecule input
 

@@ -819,8 +820,6 @@ def alphafold3_input_to_molecule_input(alphafold3_input: Alphafold3Input) -> Mol
    # handle molecule ids
 
    molecule_ids = torch.cat(molecule_ids).long()
-    # TODO: do not pad this with zeros anymore, as it will mistakenly treat padded tokens as `ALA`
-    molecule_ids = pad_to_len(molecule_ids, num_tokens)
 
    # handle atom_parent_ids
    # this governs in the atom encoder / decoder, which atom attends to which
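The fix named in the commit message lives in the first hunk: ligands are tokenized with one token per atom, so appending a single id per ligand under-fills `molecule_ids`; the loop repeats the id once per atom. The second hunk drops the zero-padding, resolving the removed TODO: id 0 denotes `ALA`, so padded positions would have been mistaken for alanine tokens. A small illustration of the per-atom counting, using RDKit directly (the SMILES strings and `ligand_id` value are made up for the example):

```python
from rdkit import Chem
from torch import tensor

# Hypothetical inputs for illustration - each ligand atom is its own token,
# so the id must be repeated once per atom, not once per ligand
ligand_id = 4
mol_ligands = [Chem.MolFromSmiles('CCO'), Chem.MolFromSmiles('c1ccccc1')]

molecule_ids = []
for mol_ligand in mol_ligands:
    # GetNumAtoms() counts heavy atoms: 3 for ethanol, 6 for benzene
    molecule_ids.append(tensor([ligand_id] * mol_ligand.GetNumAtoms()))

assert [len(ids) for ids in molecule_ids] == [3, 6]
```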

alphafold3_pytorch/utils/model_utils.py

Lines changed: 1 addition & 2 deletions
@@ -1,5 +1,5 @@
 from functools import wraps
-from typing import List, Tuple, Union
+from typing import List, Tuple, Union, Any
 
 import einx
 import torch

@@ -18,7 +18,6 @@
 
 # default scheduler used in paper w/ warmup
 
-
 def default_lambda_lr_fn(steps: int) -> float:
     """Default lambda learning rate function.
 
alphafold3_pytorch/utils/utils.py

Lines changed: 10 additions & 1 deletion
@@ -1,6 +1,15 @@
 import numpy as np
 
-from typing import Any
+from typing import Any, List
+
+def first(arr: List) -> Any:
+    """
+    Returns first element of list
+
+    :param arr: the list
+    :return: the element
+    """
+    return arr[0]
 
 
 def exists(val: Any) -> bool:
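A quick usage note for the new helper, presumably a convenience for the list-valued returns introduced elsewhere in this commit (the import path is inferred from the file location above; the list values are made up):

```python
from alphafold3_pytorch.utils.utils import first

# `first` is a readable alias for head-of-list indexing
assert first([3, 1, 2]) == 3
```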

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "alphafold3-pytorch"
-version = "0.1.134"
+version = "0.1.135"
 description = "Alphafold 3 - Pytorch"
 authors = [
     { name = "Phil Wang", email = "[email protected]" }

tests/test_input.py

Lines changed: 2 additions & 1 deletion
@@ -184,7 +184,8 @@ def test_pdbinput_input():
     batched_eval_atom_input = pdb_inputs_to_batched_atom_input(eval_pdb_input, atoms_per_window=27)
 
     alphafold3.eval()
-    sampled_atom_pos = alphafold3(
+
+    sampled_atom_pos, = alphafold3(
         **batched_eval_atom_input.dict(), return_loss=False, return_present_sampled_atoms=True
     )
 
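The trailing comma in the test does real work: with `return_present_sampled_atoms=True` the model now returns a list of per-sample tensors, and for a batch of one the comma unpacks the single element back into a bare tensor. A toy illustration (the shape is made up):

```python
import torch

# Stand-in for the model's new List[Float['l 3']] return value
returned = [torch.randn(10, 3)]

# Trailing-comma unpacking: equivalent to returned[0], but raises if the
# list does not contain exactly one element
sampled_atom_pos, = returned
assert sampled_atom_pos.shape == (10, 3)
```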
