address #90: add a fifth (metal ion / misc) column to is_molecule_types and index its columns through named constants

lucidrains · lucidrains · commit 1be7cf41fc26 · 2024-07-18T05:55:12.000-07:00
diff --git a/README.md b/README.md
@@ -62,7 +62,7 @@ atompair_inputs = torch.randn(2, atom_seq_len, atom_seq_len, 5)
 
 additional_molecule_feats = torch.randint(0, 2, (2, seq_len, 5))
 additional_token_feats = torch.randn(2, seq_len, 2)
-is_molecule_types = torch.randint(0, 2, (2, seq_len, 4)).bool()
+is_molecule_types = torch.randint(0, 2, (2, seq_len, 5)).bool()
 molecule_ids = torch.randint(0, 32, (2, seq_len))
 
 template_feats = torch.randn(2, 2, seq_len, seq_len, 44)
diff --git a/alphafold3_pytorch/alphafold3.py b/alphafold3_pytorch/alphafold3.py
@@ -40,6 +40,9 @@
 
 from alphafold3_pytorch.inputs import (
     IS_MOLECULE_TYPES,
+    IS_PROTEIN_INDEX,
+    IS_LIGAND_INDEX,
+    IS_BIOMOLECULE_INDICES,
     ADDITIONAL_MOLECULE_FEATS
 )
 
@@ -104,12 +107,13 @@
 """
 
 """
-is_molecule_types: [*, 4]
+is_molecule_types: [*, 5]
 
 0: is_protein
 1: is_rna
 2: is_dna
 3: is_ligand
+4: is_metal_ions_or_misc
 """
 
 # constants
@@ -2269,7 +2273,7 @@ def forward(
             is_nucleotide_or_ligand_fields = tuple(repeat_consecutive_with_lens(t, molecule_atom_lens) for t in is_nucleotide_or_ligand_fields)
             is_nucleotide_or_ligand_fields = tuple(pad_or_slice_to(t, length = align_weights.shape[-1], dim = -1) for t in is_nucleotide_or_ligand_fields)
 
-            _, atom_is_dna, atom_is_rna, atom_is_ligand = is_nucleotide_or_ligand_fields
+            _, atom_is_dna, atom_is_rna, atom_is_ligand, _ = is_nucleotide_or_ligand_fields
 
             # section 3.7.1 equation 4
 
@@ -3493,7 +3497,7 @@ def forward(
         # only apply relative positional encodings to biomolecules that are chained
         # not to ligands + metal ions
 
-        is_chained_biomol = is_molecule_types[..., :3].any(dim = -1) # first three types are chained biomolecules (protein, rna, dna)
+        is_chained_biomol = is_molecule_types[..., IS_BIOMOLECULE_INDICES].any(dim = -1) # first three types are chained biomolecules (protein, rna, dna)
         paired_is_chained_biomol = einx.logical_and('b i, b j -> b i j', is_chained_biomol, is_chained_biomol)
 
         relative_position_encoding = einx.where(
@@ -3531,7 +3535,7 @@ def forward(
         # prepare mask for msa module and template embedder
-        # which is equivalent to the `is_protein` of the `is_molecular_types` input
+        # which is equivalent to the `is_protein` of the `is_molecule_types` input
 
-        is_protein_mask = is_molecule_types[..., 0]
+        is_protein_mask = is_molecule_types[..., IS_PROTEIN_INDEX]
 
         # init recycled single and pairwise
 
diff --git a/alphafold3_pytorch/inputs.py b/alphafold3_pytorch/inputs.py
@@ -47,7 +47,12 @@
 
 # constants
 
-IS_MOLECULE_TYPES = 4
+IS_MOLECULE_TYPES = 5
+IS_PROTEIN_INDEX = 0
+IS_LIGAND_INDEX = -2
+IS_METAL_ION_INDEX = -1
+IS_BIOMOLECULE_INDICES = slice(0, 3)
+
 ADDITIONAL_MOLECULE_FEATS = 5
 
 CCD_COMPONENTS_FILEPATH = os.path.join('data', 'ccd_data', 'components.cif')
@@ -243,7 +248,7 @@ def molecule_to_atom_input(mol_input: MoleculeInput) -> AtomInput:
     if not exists(atom_lens):
         atom_lens = []
 
-        for mol, is_ligand in zip(molecules, i.is_molecule_types[:, -1]):
+        for mol, is_ligand in zip(molecules, i.is_molecule_types[:, IS_LIGAND_INDEX]):
             num_atoms = mol.GetNumAtoms()
 
             if is_ligand:
@@ -347,7 +352,7 @@ def molecule_to_atom_input(mol_input: MoleculeInput) -> AtomInput:
         asym_ids = F.pad(asym_ids, (1, 0), value=-1)
         is_first_mol_in_chains = (asym_ids[1:] - asym_ids[:-1]) == 1
 
-        is_chainable_biomolecules = i.is_molecule_types[..., :3].any(dim=-1)
+        is_chainable_biomolecules = i.is_molecule_types[..., IS_BIOMOLECULE_INDICES].any(dim=-1)
 
         # for every molecule, build the bonds id matrix and add to `atompair_ids`
 
@@ -746,9 +751,10 @@ def alphafold3_input_to_molecule_input(alphafold3_input: Alphafold3Input) -> Mol
         len(all_rna_mols),
         len(all_dna_mols),
         total_ligand_tokens,
+        num_metal_ions
     ]
 
-    num_tokens = sum(molecule_type_token_lens) + num_metal_ions
+    num_tokens = sum(molecule_type_token_lens)
 
     assert num_tokens > 0, "you have an empty alphafold3 input"
 
@@ -757,7 +763,6 @@ def alphafold3_input_to_molecule_input(alphafold3_input: Alphafold3Input) -> Mol
     molecule_types_lens_cumsum = tensor([0, *molecule_type_token_lens]).cumsum(dim=-1)
     left, right = molecule_types_lens_cumsum[:-1], molecule_types_lens_cumsum[1:]
 
-    # TODO: fix bug that may leave molecules with no assigned type
     is_molecule_types = (arange >= left) & (arange < right)
 
     # all molecules, layout is
@@ -950,7 +955,7 @@ def get_num_atoms_per_chain(chains: List[List[Mol]]) -> List[int]:
         for mol_index, (mol_miss_atom_indices, mol) in enumerate(
             zip(i.missing_atom_indices, molecules)
         ):
-            is_ligand_residue = is_molecule_types[mol_index, -1].item()
+            is_ligand_residue = is_molecule_types[mol_index, IS_LIGAND_INDEX].item()
             mol_miss_atom_indices = default(mol_miss_atom_indices, [])
             mol_miss_atom_indices = tensor(mol_miss_atom_indices, dtype=torch.long)
 
@@ -1427,9 +1432,9 @@ def pdb_input_to_molecule_input(pdb_input: PDBInput, training: bool = True) -> M
     molecule_ids = torch.from_numpy(biomol.restype)
 
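-    # retrieve is_molecule_types from the `Biomolecule` object, which is a boolean tensor of shape [*, 4]
+    # retrieve is_molecule_types from the `Biomolecule` object, which is a boolean tensor of shape [*, 5]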
-    # is_protein | is_rna | is_dna | is_ligand
+    # is_protein | is_rna | is_dna | is_ligand | is_metal_ion
     # this is needed for their special diffusion loss
-    n_one_hot = 4
+    n_one_hot = 5
     is_molecule_types = F.one_hot(torch.from_numpy(biomol.chemtype), num_classes=n_one_hot).bool()
 
     # manually derive remaining features using the `Biomolecule` object
@@ -1464,11 +1469,20 @@ def pdb_input_to_molecule_input(pdb_input: PDBInput, training: bool = True) -> M
             molecule_atom_types.extend([mol_type] * num_atoms)
             # ensure modified polymer residues are one-hot encoded as ligands
             # TODO: double-check whether this handling of modified polymer residues makes sense
-            is_molecule_types[molecule_idx : molecule_idx + num_atoms, : n_one_hot - 1] = False
-            is_molecule_types[molecule_idx : molecule_idx + num_atoms, n_one_hot - 1] = True
+
+            molecule_type_row_idx = slice(molecule_idx, molecule_idx + num_atoms)
+
+            is_molecule_types[molecule_type_row_idx, IS_BIOMOLECULE_INDICES] = False
+
             if num_atoms == 1:
                 # NOTE: we manually set the molecule ID of ions to the `gap` ID
-                molecule_ids[molecule_idx] = gap_id
+                molecule_ids[molecule_type_row_idx] = gap_id
+                is_mol_type_index = IS_METAL_ION_INDEX
+            else:
+                is_mol_type_index = IS_LIGAND_INDEX
+
+            is_molecule_types[molecule_type_row_idx, is_mol_type_index] = True
+
             molecule_idx += num_atoms
         else:
             token_pool_lens.append(num_atoms)
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "alphafold3-pytorch"
-version = "0.2.0"
+version = "0.2.1"
 description = "Alphafold 3 - Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
diff --git a/tests/test_af3.py b/tests/test_af3.py
@@ -444,7 +444,7 @@ def test_alphafold3(
 
     additional_molecule_feats = torch.randint(0, 2, (2, seq_len, 5))
     additional_token_feats = torch.randn(2, 16, 2)
-    is_molecule_types = torch.randint(0, 2, (2, seq_len, 4)).bool()
+    is_molecule_types = torch.randint(0, 2, (2, seq_len, 5)).bool()
     molecule_ids = torch.randint(0, 32, (2, seq_len))
 
     is_molecule_mod = None
@@ -556,7 +556,7 @@ def test_alphafold3_without_msa_and_templates():
     atompair_inputs = torch.randn(2, atom_seq_len, atom_seq_len, 5)
     additional_molecule_feats = torch.randint(0, 2, (2, seq_len, 5))
     additional_token_feats = torch.randn(2, seq_len, 2)
-    is_molecule_types = torch.randint(0, 2, (2, seq_len, 4)).bool()
+    is_molecule_types = torch.randint(0, 2, (2, seq_len, 5)).bool()
     molecule_ids = torch.randint(0, 32, (2, seq_len))
 
     atom_pos = torch.randn(2, atom_seq_len, 3)
@@ -716,7 +716,7 @@ def test_alphafold3_with_atom_and_bond_embeddings():
 
     additional_molecule_feats = torch.randint(0, 2, (2, seq_len, 5))
     additional_token_feats = torch.randn(2, seq_len, 2)
-    is_molecule_types = torch.randint(0, 2, (2, seq_len, 4)).bool()
+    is_molecule_types = torch.randint(0, 2, (2, seq_len, 5)).bool()
     molecule_ids = torch.randint(0, 32, (2, seq_len))
 
     template_feats = torch.randn(2, 2, seq_len, seq_len, 44)