completely wire up the diffusion module, and show end to end in readme

lucidrains · lucidrains · commit fdfe72b29103 · 2024-05-20T06:25:19.000-07:00
diff --git a/README.md b/README.md
@@ -43,6 +43,11 @@ template_mask = torch.ones((2, 2)).bool()
 
 msa = torch.randn(2, 7, seq_len, 64)
 
+# required for training, but omitted at inference
+
+atom_pos = torch.randn(2, atom_seq_len, 3)
+distance_labels = torch.randint(0, 37, (2, seq_len, seq_len))
+
 # train
 
 loss = alphafold3(
@@ -53,11 +58,28 @@ loss = alphafold3(
     additional_residue_feats = additional_residue_feats,
     msa = msa,
     templates = template_feats,
-    template_mask = template_mask
+    template_mask = template_mask,
+    atom_pos = atom_pos,
+    distance_labels = distance_labels
 )
 
 loss.backward()
 
+# after much training ...
+
+sampled_atom_pos = alphafold3(
+    num_recycling_steps = 4,
+    num_sample_steps = 16,
+    atom_inputs = atom_inputs,
+    atom_mask = atom_mask,
+    atompair_feats = atompair_feats,
+    additional_residue_feats = additional_residue_feats,
+    msa = msa,
+    templates = template_feats,
+    template_mask = template_mask
+)
+
+sampled_atom_pos.shape # (2, 16 * 27, 3)
 ```
 
 ## Citations
@@ -83,7 +105,7 @@ loss.backward()
               {\v Z}{\'\i}dek, Augustin and Bapst, Victor and Kohli, Pushmeet
               and Jaderberg, Max and Hassabis, Demis and Jumper, John M",
   journal  = "Nature",
-  month    =  may,
+  month    = "May",
   year     =  2024
 }
 ```
diff --git a/alphafold3_pytorch/alphafold3.py b/alphafold3_pytorch/alphafold3.py
@@ -1982,12 +1982,14 @@ def __init__(
         dim_input_embedder_token = 384,
         dim_single = 384,
         dim_pairwise = 128,
+        dim_token = 768,
         atompair_dist_bins: Float[' dist_bins'] = torch.linspace(3, 20, 37),
         ignore_index = -1,
         num_dist_bins = 38,
         num_plddt_bins = 50,
         num_pde_bins = 64,
         num_pae_bins = 64,
+        sigma_data = 16,
         loss_confidence_weight = 1e-4,
         loss_distogram_weight = 1e-2,
         loss_diffusion_weight = 4.,
@@ -2023,6 +2025,32 @@ def __init__(
         relative_position_encoding_kwargs: dict = dict(
             r_max = 32,
             s_max = 2,
+        ),
+        diffusion_module_kwargs: dict = dict(
+            single_cond_kwargs = dict(
+                num_transitions = 2,
+                transition_expansion_factor = 2,
+            ),
+            pairwise_cond_kwargs = dict(
+                num_transitions = 2
+            ),
+            atom_encoder_depth = 3,
+            atom_encoder_heads = 4,
+            token_transformer_depth = 24,
+            token_transformer_heads = 16,
+            atom_decoder_depth = 3,
+            atom_decoder_heads = 4
+        ),
+        edm_kwargs: dict = dict(
+            sigma_min = 0.002,
+            sigma_max = 80,
+            rho = 7,
+            P_mean = -1.2,
+            P_std = 1.2,
+            S_churn = 80,
+            S_tmin = 0.05,
+            S_tmax = 50,
+            S_noise = 1.003,
         )
     ):
         super().__init__()
@@ -2091,6 +2119,27 @@ def __init__(
             LinearNoBias(dim_pairwise, dim_pairwise)
         )
 
+        # diffusion
+
+        self.diffusion_module = DiffusionModule(
+            dim_pairwise_trunk = dim_pairwise,
+            dim_pairwise_rel_pos_feats = dim_pairwise,
+            atoms_per_window = atoms_per_window,
+            dim_pairwise = dim_pairwise,
+            sigma_data = sigma_data,
+            dim_atom = dim_atom,
+            dim_atompair = dim_atompair,
+            dim_token = dim_token,
+            dim_single = dim_single + dim_single_inputs,
+            **diffusion_module_kwargs
+        )
+
+        self.edm = ElucidatedAtomDiffusion(
+            self.diffusion_module,
+            sigma_data = sigma_data,
+            **edm_kwargs
+        )
+
         # logit heads
 
         self.distogram_head = DistogramHead(
@@ -2116,11 +2165,11 @@ def __init__(
         self.loss_confidence_weight = loss_confidence_weight
         self.loss_diffusion_weight = loss_diffusion_weight
 
-        self.register_buffer('dummy', torch.tensor(0), persistent = False)
+        self.register_buffer('zero', torch.tensor(0.), persistent = False)
 
     @property
     def device(self):
-        return self.dummy.device
+        return self.zero.device
 
     @typecheck
     def forward(
@@ -2134,6 +2183,8 @@ def forward(
         templates: Float['b t n n dt'],
         template_mask: Bool['b t'],
         num_recycling_steps: int = 1,
+        num_sample_steps: int | None = None,
+        atom_pos: Float['b m 3'] | None = None,
         distance_labels: Int['b n n'] | None = None,
         pae_labels: Int['b n n'] | None = None,
         pde_labels: Int['b n n'] | None = None,
@@ -2228,23 +2279,52 @@ def forward(
         # determine whether to return loss if any labels were to be passed in
         # otherwise will sample the atomic coordinates
 
+        atom_pos_given = exists(atom_pos)
+
         labels = (distance_labels, pae_labels, pde_labels, plddt_labels, resolved_labels)
-        return_loss = any([*map(exists, labels)])
+        has_labels = any([*map(exists, labels)])
+
+        return_loss = atom_pos_given or has_labels
+
+        # setup all the data necessary for conditioning the diffusion module
+
+        diffusion_cond = dict(
+            atom_feats = atom_feats,
+            atompair_feats = atompair_feats,
+            atom_mask = atom_mask,
+            mask = mask,
+            single_trunk_repr = single,
+            single_inputs_repr = single_inputs,
+            pairwise_trunk = pairwise,
+            pairwise_rel_pos_feats = relative_position_encoding
+        )
+
+        # if neither atom positions nor any labels are passed in, sample a structure and return
 
         if not return_loss:
-            return torch.randn((*atom_inputs.shape[:2], 3), device = self.device)
+            return self.edm.sample(num_sample_steps = num_sample_steps, **diffusion_cond)
+
+        # otherwise, noise and make it learn to denoise
+
+        diffusion_loss = self.zero
+
+        if exists(atom_pos):
+            diffusion_loss = self.edm(atom_pos, **diffusion_cond)
 
         # calculate all logits and losses
 
         ignore = self.ignore_index
 
+        distogram_loss = self.zero
+
         if exists(distance_labels):
             distance_labels = torch.where(pairwise_mask, distance_labels, ignore)
             distogram_logits = self.distogram_head(pairwise)
             distogram_loss = F.cross_entropy(distogram_logits, distance_labels, ignore_index = ignore)
 
         loss = (
-            distogram_loss * self.loss_distogram_weight
+            distogram_loss * self.loss_distogram_weight +
+            diffusion_loss * self.loss_diffusion_weight
         )
 
         return loss
diff --git a/tests/test_readme.py b/tests/test_readme.py
@@ -256,6 +256,7 @@ def test_alphafold3():
 
     msa = torch.randn(2, 7, seq_len, 64)
 
+    atom_pos = torch.randn(2, atom_seq_len, 3)
     distance_labels = torch.randint(0, 38, (2, seq_len, seq_len))
 
     alphafold3 = Alphafold3(
@@ -274,7 +275,21 @@ def test_alphafold3():
         msa = msa,
         templates = template_feats,
         template_mask = template_mask,
+        atom_pos = atom_pos,
         distance_labels = distance_labels
     )
 
     loss.backward()
+
+    sampled_atom_pos = alphafold3(
+        num_sample_steps = 16,
+        atom_inputs = atom_inputs,
+        atom_mask = atom_mask,
+        atompair_feats = atompair_feats,
+        additional_residue_feats = additional_residue_feats,
+        msa = msa,
+        templates = template_feats,
+        template_mask = template_mask,
+    )
+
+    assert sampled_atom_pos.ndim == 3