break out a boolean tensor (is_molecule_types) from additional_molecule_feats

lucidrains · lucidrains · commit cb4acf2b0e10 · 2024-06-25T14:23:42.000-07:00
diff --git a/README.md b/README.md
@@ -54,7 +54,8 @@ atom_seq_len = molecule_atom_lens.sum(dim = -1).amax()
 atom_inputs = torch.randn(2, atom_seq_len, 77)
 atompair_inputs = torch.randn(2, atom_seq_len, atom_seq_len, 5)
 
-additional_molecule_feats = torch.randn(2, seq_len, 9)
+additional_molecule_feats = torch.randn(2, seq_len, 5)
+is_molecule_types = torch.randint(0, 2, (2, seq_len, 4)).bool()
 molecule_ids = torch.randint(0, 32, (2, seq_len))
 
 template_feats = torch.randn(2, 2, seq_len, seq_len, 44)
@@ -83,6 +84,7 @@ loss = alphafold3(
     molecule_ids = molecule_ids,
     molecule_atom_lens = molecule_atom_lens,
     additional_molecule_feats = additional_molecule_feats,
+    is_molecule_types = is_molecule_types,
     msa = msa,
     msa_mask = msa_mask,
     templates = template_feats,
diff --git a/alphafold3_pytorch/alphafold3.py b/alphafold3_pytorch/alphafold3.py
@@ -78,22 +78,30 @@
 """
 
 """
-additional_molecule_feats: [*, 9]:
+additional_molecule_feats: [*, 5]:
 
 0: molecule_index
 1: token_index
 2: asym_id
 3: entity_id
 4: sym_id
-5: is_protein
-6: is_rna
-7: is_dna
-8: is_ligand
+"""
+
+"""
+is_molecule_types: [*, 4]:
+
+0: is_protein
+1: is_rna
+2: is_dna
+3: is_ligand
 """
 
 # constants
 
-ADDITIONAL_MOLECULE_FEATS = 9
+from alphafold3_pytorch.inputs import (
+    IS_MOLECULE_TYPES,
+    ADDITIONAL_MOLECULE_FEATS
+)
 
 LinearNoBias = partial(Linear, bias = False)
 
@@ -1169,9 +1177,8 @@ def forward(
     ) -> Float['b n n dp']:
 
         device = additional_molecule_feats.device
-        assert additional_molecule_feats.shape[-1] >= 5
 
-        res_idx, token_idx, asym_id, entity_id, sym_id = additional_molecule_feats[..., :5].unbind(dim = -1)
+        res_idx, token_idx, asym_id, entity_id, sym_id = additional_molecule_feats.unbind(dim = -1)
         
         diff_res_idx = einx.subtract('b i, b j -> b i j', res_idx, res_idx)
         diff_token_idx = einx.subtract('b i, b j -> b i j', token_idx, token_idx)
@@ -2173,6 +2180,7 @@ def forward(
         molecule_atom_lens: Int['b n'],
         atom_parent_ids: Int['b m'] | None = None,
         return_denoised_pos = False,
+        is_molecule_types: Bool[f'b n {IS_MOLECULE_TYPES}'] | None = None,
         additional_molecule_feats: Float[f'b n {ADDITIONAL_MOLECULE_FEATS}'] | None = None,
         add_smooth_lddt_loss = False,
         add_bond_loss = False,
@@ -2218,13 +2226,13 @@ def forward(
 
         align_weights = atom_pos_ground_truth.new_ones(atom_pos_ground_truth.shape[:2])
 
-        if exists(additional_molecule_feats):
-            is_nucleotide_or_ligand_fields = (additional_molecule_feats[..., -3:] != 0.).unbind(dim = -1)
+        if exists(is_molecule_types):
+            is_nucleotide_or_ligand_fields = is_molecule_types.unbind(dim = -1)
 
             is_nucleotide_or_ligand_fields = tuple(repeat_consecutive_with_lens(t, molecule_atom_lens) for t in is_nucleotide_or_ligand_fields)
             is_nucleotide_or_ligand_fields = tuple(pad_or_slice_to(t, length = align_weights.shape[-1], dim = -1) for t in is_nucleotide_or_ligand_fields)
 
-            atom_is_dna, atom_is_rna, atom_is_ligand = is_nucleotide_or_ligand_fields
+            _, atom_is_dna, atom_is_rna, atom_is_ligand = is_nucleotide_or_ligand_fields
 
             # section 3.7.1 equation 4
 
@@ -2281,7 +2289,7 @@ def forward(
         smooth_lddt_loss = self.zero
 
         if add_smooth_lddt_loss:
-            assert exists(additional_molecule_feats)
+            assert exists(is_molecule_types)
 
             smooth_lddt_loss = self.smooth_lddt_loss(
                 denoised_atom_pos,
@@ -2651,7 +2659,7 @@ def __init__(
             dim_out = dim_token
         )
 
-        dim_single_input = dim_token + ADDITIONAL_MOLECULE_FEATS
+        dim_single_input = dim_token + ADDITIONAL_MOLECULE_FEATS + IS_MOLECULE_TYPES
 
         self.single_input_to_single_init = LinearNoBias(dim_single_input, dim_single)
         self.single_input_to_pairwise_init = LinearNoBiasThenOuterSum(dim_single_input, dim_pairwise)
@@ -2668,6 +2676,7 @@ def forward(
         atom_inputs: Float['b m dai'],
         atompair_inputs: Float['b m m dapi'] | Float['b nw w1 w2 dapi'],
         atom_mask: Bool['b m'],
+        is_molecule_types: Bool[f'b n {IS_MOLECULE_TYPES}'],
         additional_molecule_feats: Float[f'b n {ADDITIONAL_MOLECULE_FEATS}'],
         molecule_atom_lens: Int['b n'],
         molecule_ids: Int['b n']
@@ -2716,7 +2725,11 @@ def forward(
             molecule_atom_lens = molecule_atom_lens
         )
 
-        single_inputs = torch.cat((single_inputs, additional_molecule_feats), dim = -1)
+        single_inputs = torch.cat((
+            single_inputs,
+            additional_molecule_feats,
+            is_molecule_types.float()
+        ), dim = -1)
 
         single_init = self.single_input_to_single_init(single_inputs)
         pairwise_init = self.single_input_to_pairwise_init(single_inputs)
@@ -3046,7 +3059,7 @@ def __init__(
             **input_embedder_kwargs
         )
 
-        dim_single_inputs = dim_input_embedder_token + ADDITIONAL_MOLECULE_FEATS
+        dim_single_inputs = dim_input_embedder_token + ADDITIONAL_MOLECULE_FEATS + IS_MOLECULE_TYPES
 
         # relative positional encoding
         # used by pairwise in main alphafold2 trunk
@@ -3236,6 +3249,7 @@ def forward(
         atom_inputs: Float['b m dai'],
         atompair_inputs: Float['b m m dapi'] | Float['b nw w1 w2 dapi'],
         additional_molecule_feats: Float[f'b n {ADDITIONAL_MOLECULE_FEATS}'],
+        is_molecule_types: Bool[f'b n {IS_MOLECULE_TYPES}'],
         molecule_atom_lens: Int['b n'],
         molecule_ids: Int['b n'],
         atom_ids: Int['b m'] | None = None,
@@ -3311,6 +3325,7 @@ def forward(
             atom_inputs = atom_inputs,
             atompair_inputs = atompair_inputs,
             atom_mask = atom_mask,
+            is_molecule_types = is_molecule_types,
             additional_molecule_feats = additional_molecule_feats,
             molecule_atom_lens = molecule_atom_lens,
             molecule_ids = molecule_ids
@@ -3513,6 +3528,7 @@ def forward(
                     pairwise,
                     relative_position_encoding,
                     additional_molecule_feats,
+                    is_molecule_types,
                     molecule_atom_indices,
                     molecule_atom_lens,
                     pae_labels,
@@ -3535,6 +3551,7 @@ def forward(
                         pairwise,
                         relative_position_encoding,
                         additional_molecule_feats,
+                        is_molecule_types,
                         molecule_atom_indices,
                         molecule_atom_lens,
                         pae_labels,
@@ -3566,6 +3583,7 @@ def forward(
             diffusion_loss, denoised_atom_pos, diffusion_loss_breakdown, _ = self.edm(
                 atom_pos,
                 additional_molecule_feats = additional_molecule_feats,
+                is_molecule_types = is_molecule_types,
                 add_smooth_lddt_loss = diffusion_add_smooth_lddt_loss,
                 add_bond_loss = diffusion_add_bond_loss,
                 atom_feats = atom_feats,
diff --git a/alphafold3_pytorch/inputs.py b/alphafold3_pytorch/inputs.py
@@ -8,6 +8,11 @@
     Int, Bool, Float
 )
 
+# constants
+
+IS_MOLECULE_TYPES = 4
+ADDITIONAL_MOLECULE_FEATS = 5
+
 # atom level, what Alphafold3 accepts
 
 @typecheck
@@ -16,7 +21,8 @@ class AtomInput(TypedDict):
     molecule_ids:               Int['n']
     molecule_atom_lens:         Int['n']
     atompair_inputs:            Float['m m dapi'] | Float['nw w (w*2) dapi']
-    additional_molecule_feats:  Float['n 9']
+    additional_molecule_feats:  Float[f'n {ADDITIONAL_MOLECULE_FEATS}']
+    is_molecule_types:          Bool[f'n {IS_MOLECULE_TYPES}']
     templates:                  Float['t n n dt']
     msa:                        Float['s n dm']
     token_bonds:                Bool['n n'] | None
@@ -38,7 +44,8 @@ class BatchedAtomInput(TypedDict):
     molecule_ids:               Int['b n']
     molecule_atom_lens:         Int['b n']
     atompair_inputs:            Float['b m m dapi'] | Float['b nw w (w*2) dapi']
-    additional_molecule_feats:  Float['b n 9']
+    additional_molecule_feats:  Float[f'b n {ADDITIONAL_MOLECULE_FEATS}']
+    is_molecule_types:          Bool[f'b n {IS_MOLECULE_TYPES}']
     templates:                  Float['b t n n dt']
     msa:                        Float['b s n dm']
     token_bonds:                Bool['b n n'] | None
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "alphafold3-pytorch"
-version = "0.1.67"
+version = "0.1.68"
 description = "Alphafold 3 - Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
diff --git a/tests/test_af3.py b/tests/test_af3.py
@@ -330,7 +330,7 @@ def test_diffusion_module():
     assert sampled_atom_pos.shape == noised_atom_pos.shape
     
 def test_relative_position_encoding():
-    additional_molecule_feats = torch.randn(8, 100, 9)
+    additional_molecule_feats = torch.randn(8, 100, 5)
 
     embedder = RelativePositionEncoding()
 
@@ -387,7 +387,8 @@ def test_input_embedder():
     atompair_inputs = torch.randn(2, atom_seq_len, atom_seq_len, 5)
 
     atom_mask = torch.ones((2, atom_seq_len)).bool()
-    additional_molecule_feats = torch.randn(2, 16, 9)
+    additional_molecule_feats = torch.randn(2, 16, 5)
+    is_molecule_types = torch.randint(0, 2, (2, 16, 4)).bool()
     molecule_ids = torch.randint(0, 32, (2, 16))
 
     embedder = InputFeatureEmbedder(
@@ -400,6 +401,7 @@ def test_input_embedder():
         atompair_inputs = atompair_inputs,
         molecule_atom_lens = molecule_atom_lens,
         molecule_ids = molecule_ids,
+        is_molecule_types = is_molecule_types,
         additional_molecule_feats = additional_molecule_feats
     )
 
@@ -433,7 +435,8 @@ def test_alphafold3(
     if window_atompair_inputs:
         atompair_inputs = full_pairwise_repr_to_windowed(atompair_inputs, window_size = atoms_per_window)
 
-    additional_molecule_feats = torch.randn(2, seq_len, 9)
+    additional_molecule_feats = torch.randn(2, seq_len, 5)
+    is_molecule_types = torch.randint(0, 2, (2, seq_len, 4)).bool()
     molecule_ids = torch.randint(0, 32, (2, seq_len))
 
     atom_parent_ids = None
@@ -487,6 +490,7 @@ def test_alphafold3(
         molecule_atom_lens = molecule_atom_lens,
         atom_parent_ids = atom_parent_ids,
         atompair_inputs = atompair_inputs,
+        is_molecule_types = is_molecule_types,
         additional_molecule_feats = additional_molecule_feats,
         token_bonds = token_bonds,
         msa = msa,
@@ -511,6 +515,7 @@ def test_alphafold3(
         molecule_ids = molecule_ids,
         molecule_atom_lens = molecule_atom_lens,
         atompair_inputs = atompair_inputs,
+        is_molecule_types = is_molecule_types,
         additional_molecule_feats = additional_molecule_feats,
         msa = msa,
         templates = template_feats,
@@ -526,7 +531,8 @@ def test_alphafold3_without_msa_and_templates():
 
     atom_inputs = torch.randn(2, atom_seq_len, 77)
     atompair_inputs = torch.randn(2, atom_seq_len, atom_seq_len, 5)
-    additional_molecule_feats = torch.randn(2, seq_len, 9)
+    additional_molecule_feats = torch.randn(2, seq_len, 5)
+    is_molecule_types = torch.randint(0, 2, (2, seq_len, 4)).bool()
     molecule_ids = torch.randint(0, 32, (2, seq_len))
 
     atom_pos = torch.randn(2, atom_seq_len, 3)
@@ -567,6 +573,7 @@ def test_alphafold3_without_msa_and_templates():
         molecule_ids = molecule_ids,
         molecule_atom_lens = molecule_atom_lens,
         atompair_inputs = atompair_inputs,
+        is_molecule_types = is_molecule_types,
         additional_molecule_feats = additional_molecule_feats,
         atom_pos = atom_pos,
         molecule_atom_indices = molecule_atom_indices,
@@ -587,7 +594,8 @@ def test_alphafold3_force_return_loss():
 
     atom_inputs = torch.randn(2, atom_seq_len, 77)
     atompair_inputs = torch.randn(2, atom_seq_len, atom_seq_len, 5)
-    additional_molecule_feats = torch.randn(2, seq_len, 9)
+    additional_molecule_feats = torch.randn(2, seq_len, 5)
+    is_molecule_types = torch.randint(0, 2, (2, seq_len, 4)).bool()
     molecule_ids = torch.randint(0, 32, (2, seq_len))
 
     atom_pos = torch.randn(2, atom_seq_len, 3)
@@ -628,6 +636,7 @@ def test_alphafold3_force_return_loss():
         molecule_ids = molecule_ids,
         molecule_atom_lens = molecule_atom_lens,
         atompair_inputs = atompair_inputs,
+        is_molecule_types = is_molecule_types,
         additional_molecule_feats = additional_molecule_feats,
         atom_pos = atom_pos,
         molecule_atom_indices = molecule_atom_indices,
@@ -648,6 +657,7 @@ def test_alphafold3_force_return_loss():
         molecule_ids = molecule_ids,
         molecule_atom_lens = molecule_atom_lens,
         atompair_inputs = atompair_inputs,
+        is_molecule_types = is_molecule_types,
         additional_molecule_feats = additional_molecule_feats,
         return_loss_breakdown = True,
         return_loss = True # force returning loss even if no labels given
@@ -676,7 +686,8 @@ def test_alphafold3_with_atom_and_bond_embeddings():
     atom_inputs = torch.randn(2, atom_seq_len, 77)
     atompair_inputs = torch.randn(2, atom_seq_len, atom_seq_len, 5)
 
-    additional_molecule_feats = torch.randn(2, seq_len, 9)
+    additional_molecule_feats = torch.randn(2, seq_len, 5)
+    is_molecule_types = torch.randint(0, 2, (2, seq_len, 4)).bool()
     molecule_ids = torch.randint(0, 32, (2, seq_len))
 
     template_feats = torch.randn(2, 2, seq_len, seq_len, 44)
@@ -706,6 +717,7 @@ def test_alphafold3_with_atom_and_bond_embeddings():
         atompair_inputs = atompair_inputs,
         molecule_ids = molecule_ids,
         molecule_atom_lens = molecule_atom_lens,
+        is_molecule_types = is_molecule_types,
         additional_molecule_feats = additional_molecule_feats,
         msa = msa,
         msa_mask = msa_mask,
diff --git a/tests/test_trainer.py b/tests/test_trainer.py
@@ -47,7 +47,8 @@ def __getitem__(self, idx):
         atompair_inputs = torch.randn(atom_seq_len, atom_seq_len, 5)
 
         molecule_atom_lens = torch.randint(1, self.atoms_per_window, (seq_len,))
-        additional_molecule_feats = torch.randn(seq_len, 9)
+        additional_molecule_feats = torch.randn(seq_len, 5)
+        is_molecule_types = torch.randint(0, 2, (seq_len, 4)).bool()
         molecule_ids = torch.randint(0, 32, (seq_len,))
         token_bonds = torch.randint(0, 2, (seq_len, seq_len)).bool()
 
@@ -78,6 +79,7 @@ def __getitem__(self, idx):
             token_bonds = token_bonds,
             molecule_atom_lens = molecule_atom_lens,
             additional_molecule_feats = additional_molecule_feats,
+            is_molecule_types = is_molecule_types,
             templates = templates,
             template_mask = template_mask,
             msa = msa,