handle "restypes" explicitly as "molecule_ids", removing them from "additional_molecule_feats"

lucidrains · lucidrains · commit 5b43671805b6 · 2024-06-17T13:15:34.000-07:00
diff --git a/README.md b/README.md
@@ -50,7 +50,8 @@ atom_seq_len = molecule_atom_lens.sum(dim = -1).amax()
 atom_inputs = torch.randn(2, atom_seq_len, 77)
 atompair_inputs = torch.randn(2, atom_seq_len, atom_seq_len, 5)
 
-additional_molecule_feats = torch.randn(2, seq_len, 10)
+additional_molecule_feats = torch.randn(2, seq_len, 9)
+molecule_ids = torch.randint(0, 32, (2, seq_len))
 
 template_feats = torch.randn(2, 2, seq_len, seq_len, 44)
 template_mask = torch.ones((2, 2)).bool()
@@ -75,6 +76,7 @@ loss = alphafold3(
     num_recycling_steps = 2,
     atom_inputs = atom_inputs,
     atompair_inputs = atompair_inputs,
+    molecule_ids = molecule_ids,
     molecule_atom_lens = molecule_atom_lens,
     additional_molecule_feats = additional_molecule_feats,
     msa = msa,
@@ -99,6 +101,7 @@ sampled_atom_pos = alphafold3(
     num_sample_steps = 16,
     atom_inputs = atom_inputs,
     atompair_inputs = atompair_inputs,
+    molecule_ids = molecule_ids,
     molecule_atom_lens = molecule_atom_lens,
     additional_molecule_feats = additional_molecule_feats,
     msa = msa,
diff --git a/alphafold3_pytorch/alphafold3.py b/alphafold3_pytorch/alphafold3.py
@@ -77,23 +77,22 @@
 """
 
 """
-additional_molecule_feats: [*, 10]:
+additional_molecule_feats: [*, 9]:
 
 0: molecule_index
 1: token_index
 2: asym_id
 3: entity_id
 4: sym_id
-5: restype (must be one hot encoded to 32)
-6: is_protein
-7: is_rna
-8: is_dna
-9: is_ligand
+5: is_protein
+6: is_rna
+7: is_dna
+8: is_ligand
 """
 
 # constants
 
-ADDITIONAL_MOLECULE_FEATS = 10
+ADDITIONAL_MOLECULE_FEATS = 9
 
 LinearNoBias = partial(Linear, bias = False)
 
@@ -2196,7 +2195,7 @@ def forward(
         align_weights = atom_pos_ground_truth.new_ones(atom_pos_ground_truth.shape[:2])
 
         if exists(additional_molecule_feats):
-            is_nucleotide_or_ligand_fields = (additional_molecule_feats[..., 7:] != 0.).unbind(dim = -1)
+            is_nucleotide_or_ligand_fields = (additional_molecule_feats[..., -3:] != 0.).unbind(dim = -1)
 
             is_nucleotide_or_ligand_fields = tuple(repeat_consecutive_with_lens(t, molecule_atom_lens) for t in is_nucleotide_or_ligand_fields)
             is_nucleotide_or_ligand_fields = tuple(pad_or_slice_to(t, length = align_weights.shape[-1], dim = -1) for t in is_nucleotide_or_ligand_fields)
@@ -2587,6 +2586,7 @@ def __init__(
         dim_token = 384,
         dim_single = 384,
         dim_pairwise = 128,
+        num_molecule_types = 32,
         atom_transformer_blocks = 3,
         atom_transformer_heads = 4,
         atom_transformer_kwargs: dict = dict(),
@@ -2632,6 +2632,11 @@ def __init__(
         self.single_input_to_single_init = LinearNoBias(dim_single_input, dim_single)
         self.single_input_to_pairwise_init = LinearNoBiasThenOuterSum(dim_single_input, dim_pairwise)
 
+        # this accounts for the `restypes` in the additional molecule features
+
+        self.single_molecule_embed = nn.Embedding(num_molecule_types, dim_single)
+        self.pairwise_molecule_embed = nn.Embedding(num_molecule_types, dim_pairwise)
+
     @typecheck
     def forward(
         self,
@@ -2641,6 +2646,7 @@ def forward(
         atom_mask: Bool['b m'],
         additional_molecule_feats: Float[f'b n {ADDITIONAL_MOLECULE_FEATS}'],
         molecule_atom_lens: Int['b n'],
+        molecule_ids: Int['b n']
 
     ) -> EmbeddedInputs:
 
@@ -2691,6 +2697,20 @@ def forward(
         single_init = self.single_input_to_single_init(single_inputs)
         pairwise_init = self.single_input_to_pairwise_init(single_inputs)
 
+        # account for molecule id (restypes)
+
+        molecule_ids = torch.where(molecule_ids >= 0, molecule_ids, 0) # account for padding
+
+        single_molecule_embed = self.single_molecule_embed(molecule_ids)
+
+        pairwise_molecule_embed = self.pairwise_molecule_embed(molecule_ids)
+        pairwise_molecule_embed = einx.add('b i dp, b j dp -> b i j dp', pairwise_molecule_embed, pairwise_molecule_embed)
+
+        # sum to single init and pairwise init, equivalent to one-hot in additional residue features
+
+        single_init = single_init + single_molecule_embed
+        pairwise_init = pairwise_init + pairwise_molecule_embed
+
         return EmbeddedInputs(single_inputs, single_init, pairwise_init, atom_feats, atompair_feats)
 
 # distogram head
@@ -2872,6 +2892,7 @@ def __init__(
         dim_single = 384,
         dim_pairwise = 128,
         dim_token = 768,
+        num_molecule_types: int = 32,  # restype in additional residue information, apparently 32 (must be human amino acids + nucleotides + something else)
         num_atom_embeds: int | None = None,
         num_atompair_embeds: int | None = None,
         distance_bins: List[float] = torch.linspace(3, 20, 38).float().tolist(),
@@ -3192,6 +3213,7 @@ def forward(
         atompair_inputs: Float['b m m dapi'] | Float['b nw w1 w2 dapi'],
         additional_molecule_feats: Float[f'b n {ADDITIONAL_MOLECULE_FEATS}'],
         molecule_atom_lens: Int['b n'],
+        molecule_ids: Int['b n'],
         atom_ids: Int['b m'] | None = None,
         atompair_ids: Int['b m m'] | Int['b nw w1 w2'] | None = None,
         atom_mask: Bool['b m'] | None = None,
@@ -3265,7 +3287,8 @@ def forward(
             atompair_inputs = atompair_inputs,
             atom_mask = atom_mask,
             additional_molecule_feats = additional_molecule_feats,
-            molecule_atom_lens = molecule_atom_lens
+            molecule_atom_lens = molecule_atom_lens,
+            molecule_ids = molecule_ids
         )
 
         # handle maybe atom and atompair embeddings
diff --git a/alphafold3_pytorch/inputs.py b/alphafold3_pytorch/inputs.py
@@ -10,6 +10,7 @@
 @typecheck
 class AtomInput(TypedDict):
     atom_inputs:                Float['*b m dai']
+    molecule_ids:               Int['*b n']
     molecule_atom_lens:         Int['*b n']
     atompair_inputs:            Float['*b m m dapi'] | Float['*b nw w (w*2) dapi']
     additional_molecule_feats:  Float['*b n 10']
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "alphafold3-pytorch"
-version = "0.1.52"
+version = "0.1.53"
 description = "Alphafold 3 - Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
diff --git a/tests/test_af3.py b/tests/test_af3.py
@@ -330,7 +330,7 @@ def test_diffusion_module():
     assert sampled_atom_pos.shape == noised_atom_pos.shape
     
 def test_relative_position_encoding():
-    additional_molecule_feats = torch.randn(8, 100, 10)
+    additional_molecule_feats = torch.randn(8, 100, 9)
 
     embedder = RelativePositionEncoding()
 
@@ -387,7 +387,8 @@ def test_input_embedder():
     atompair_inputs = torch.randn(2, atom_seq_len, atom_seq_len, 5)
 
     atom_mask = torch.ones((2, atom_seq_len)).bool()
-    additional_molecule_feats = torch.randn(2, 16, 10)
+    additional_molecule_feats = torch.randn(2, 16, 9)
+    molecule_ids = torch.randint(0, 32, (2, 16))
 
     embedder = InputFeatureEmbedder(
         dim_atom_inputs = 77,
@@ -398,6 +399,7 @@ def test_input_embedder():
         atom_mask = atom_mask,
         atompair_inputs = atompair_inputs,
         molecule_atom_lens = molecule_atom_lens,
+        molecule_ids = molecule_ids,
         additional_molecule_feats = additional_molecule_feats
     )
 
@@ -429,7 +431,8 @@ def test_alphafold3(
     if window_atompair_inputs:
         atompair_inputs = full_pairwise_repr_to_windowed(atompair_inputs, window_size = atoms_per_window)
 
-    additional_molecule_feats = torch.randn(2, seq_len, 10)
+    additional_molecule_feats = torch.randn(2, seq_len, 9)
+    molecule_ids = torch.randint(0, 32, (2, seq_len))
 
     template_feats = torch.randn(2, 2, seq_len, seq_len, 44)
     template_mask = torch.ones((2, 2)).bool()
@@ -473,6 +476,7 @@ def test_alphafold3(
     loss, breakdown = alphafold3(
         num_recycling_steps = 2,
         atom_inputs = atom_inputs,
+        molecule_ids = molecule_ids,
         molecule_atom_lens = molecule_atom_lens,
         atompair_inputs = atompair_inputs,
         additional_molecule_feats = additional_molecule_feats,
@@ -496,6 +500,7 @@ def test_alphafold3(
     sampled_atom_pos = alphafold3(
         num_sample_steps = 16,
         atom_inputs = atom_inputs,
+        molecule_ids = molecule_ids,
         molecule_atom_lens = molecule_atom_lens,
         atompair_inputs = atompair_inputs,
         additional_molecule_feats = additional_molecule_feats,
@@ -513,7 +518,8 @@ def test_alphafold3_without_msa_and_templates():
 
     atom_inputs = torch.randn(2, atom_seq_len, 77)
     atompair_inputs = torch.randn(2, atom_seq_len, atom_seq_len, 5)
-    additional_molecule_feats = torch.randn(2, seq_len, 10)
+    additional_molecule_feats = torch.randn(2, seq_len, 9)
+    molecule_ids = torch.randint(0, 32, (2, seq_len))
 
     atom_pos = torch.randn(2, atom_seq_len, 3)
     molecule_atom_indices = molecule_atom_lens - 1
@@ -550,6 +556,7 @@ def test_alphafold3_without_msa_and_templates():
     loss, breakdown = alphafold3(
         num_recycling_steps = 2,
         atom_inputs = atom_inputs,
+        molecule_ids = molecule_ids,
         molecule_atom_lens = molecule_atom_lens,
         atompair_inputs = atompair_inputs,
         additional_molecule_feats = additional_molecule_feats,
@@ -572,7 +579,8 @@ def test_alphafold3_force_return_loss():
 
     atom_inputs = torch.randn(2, atom_seq_len, 77)
     atompair_inputs = torch.randn(2, atom_seq_len, atom_seq_len, 5)
-    additional_molecule_feats = torch.randn(2, seq_len, 10)
+    additional_molecule_feats = torch.randn(2, seq_len, 9)
+    molecule_ids = torch.randint(0, 32, (2, seq_len))
 
     atom_pos = torch.randn(2, atom_seq_len, 3)
     molecule_atom_indices = molecule_atom_lens - 1
@@ -609,6 +617,7 @@ def test_alphafold3_force_return_loss():
     sampled_atom_pos = alphafold3(
         num_recycling_steps = 2,
         atom_inputs = atom_inputs,
+        molecule_ids = molecule_ids,
         molecule_atom_lens = molecule_atom_lens,
         atompair_inputs = atompair_inputs,
         additional_molecule_feats = additional_molecule_feats,
@@ -628,6 +637,7 @@ def test_alphafold3_force_return_loss():
     loss, _ = alphafold3(
         num_recycling_steps = 2,
         atom_inputs = atom_inputs,
+        molecule_ids = molecule_ids,
         molecule_atom_lens = molecule_atom_lens,
         atompair_inputs = atompair_inputs,
         additional_molecule_feats = additional_molecule_feats,
@@ -658,7 +668,8 @@ def test_alphafold3_with_atom_and_bond_embeddings():
     atom_inputs = torch.randn(2, atom_seq_len, 77)
     atompair_inputs = torch.randn(2, atom_seq_len, atom_seq_len, 5)
 
-    additional_molecule_feats = torch.randn(2, seq_len, 10)
+    additional_molecule_feats = torch.randn(2, seq_len, 9)
+    molecule_ids = torch.randint(0, 32, (2, seq_len))
 
     template_feats = torch.randn(2, 2, seq_len, seq_len, 44)
     template_mask = torch.ones((2, 2)).bool()
@@ -685,6 +696,7 @@ def test_alphafold3_with_atom_and_bond_embeddings():
         atompair_ids = atompair_ids,
         atom_inputs = atom_inputs,
         atompair_inputs = atompair_inputs,
+        molecule_ids = molecule_ids,
         molecule_atom_lens = molecule_atom_lens,
         additional_molecule_feats = additional_molecule_feats,
         msa = msa,
diff --git a/tests/test_trainer.py b/tests/test_trainer.py
@@ -46,7 +46,8 @@ def __getitem__(self, idx):
         atompair_inputs = torch.randn(atom_seq_len, atom_seq_len, 5)
 
         molecule_atom_lens = torch.randint(1, self.atoms_per_window, (seq_len,))
-        additional_molecule_feats = torch.randn(seq_len, 10)
+        additional_molecule_feats = torch.randn(seq_len, 9)
+        molecule_ids = torch.randint(0, 32, (seq_len,))
 
         templates = torch.randn(2, seq_len, seq_len, 44)
         template_mask = torch.ones((2,)).bool()
@@ -71,6 +72,7 @@ def __getitem__(self, idx):
         return AtomInput(
             atom_inputs = atom_inputs,
             atompair_inputs = atompair_inputs,
+            molecule_ids = molecule_ids,
             molecule_atom_lens = molecule_atom_lens,
             additional_molecule_feats = additional_molecule_feats,
             templates = templates,