@@ -2130,6 +2130,7 @@ def __init__(
        atom_decoder_kwargs: dict = dict(),
        token_transformer_kwargs: dict = dict(),
        use_linear_attn = False,
+       checkpoint_token_transformer = False,
        linear_attn_kwargs: dict = dict(
            heads = 8,
            dim_head = 16
@@ -2222,6 +2223,10 @@ def __init__(

        self.attended_token_norm = nn.LayerNorm(dim_token)

+       # checkpointing
+
+       self.checkpoint_token_transformer = checkpoint_token_transformer
+
        # atom attention decoding related modules

        self.tokens_to_atom_decoder_input_cond = LinearNoBias(dim_token, dim_atom)
@@ -2378,11 +2383,18 @@ def forward(
            molecule_atom_lens = molecule_atom_lens
        )

+       # maybe checkpoint token transformer
+
+       token_transformer = self.token_transformer
+
+       if should_checkpoint(self, tokens, 'checkpoint_token_transformer'):
+           token_transformer = partial(checkpoint, token_transformer, use_reentrant = False)
+
        # token transformer

        tokens = self.cond_tokens_with_cond_single(conditioned_single_repr) + tokens

-       tokens = self.token_transformer(
+       tokens = token_transformer(
            tokens,
            mask = mask,
            single_repr = conditioned_single_repr,
@@ -4300,7 +4312,10 @@ def __init__(
        ),
        augment_kwargs: dict = dict(),
        stochastic_frame_average = False,
-       confidence_head_atom_resolution = False
+       confidence_head_atom_resolution = False,
+       checkpoint_input_embedding = False,
+       checkpoint_trunk_pairformer = False,
+       checkpoint_diffusion_token_transformer = False,
    ):
        super().__init__()
@@ -4447,6 +4462,7 @@ def __init__(
            dim_atompair = dim_atompair,
            dim_token = dim_token,
            dim_single = dim_single + dim_single_inputs,
+           checkpoint_token_transformer = checkpoint_diffusion_token_transformer,
            **diffusion_module_kwargs
        )
44524468
@@ -4484,6 +4500,11 @@ def __init__(
44844500 ** confidence_head_kwargs
44854501 )
44864502
4503+ # checkpointing related
4504+
4505+ self .checkpoint_trunk_pairformer = checkpoint_trunk_pairformer
4506+ self .checkpoint_diffusion_token_transformer = checkpoint_diffusion_token_transformer
4507+
44874508 # loss related
44884509
44894510 self .ignore_index = ignore_index
@@ -4817,9 +4838,16 @@ def forward(

        pairwise = embedded_msa + pairwise

+       # maybe checkpoint trunk pairformer
+
+       pairformer = self.pairformer
+
+       if should_checkpoint(self, (single, pairwise), 'checkpoint_trunk_pairformer'):
+           pairformer = partial(checkpoint, pairformer, use_reentrant = False)
+
        # main attention trunk (pairformer)

-       single, pairwise = self.pairformer(
+       single, pairwise = pairformer(
            single_repr = single,
            pairwise_repr = pairwise,
            mask = mask
@@ -4877,7 +4905,7 @@ def forward(
            pred_atom_pos = confidence_head_atom_pos_input.detach(),
            molecule_atom_indices = molecule_atom_indices,
            molecule_atom_lens = molecule_atom_lens,
-           atom_feats = atom_feats,
+           atom_feats = atom_feats.detach(),
            mask = mask,
            return_pae_logits = True
        )
@@ -5056,7 +5084,7 @@ def forward(
            molecule_atom_indices = molecule_atom_indices,
            molecule_atom_lens = molecule_atom_lens,
            mask = mask,
-           atom_feats = atom_feats,
+           atom_feats = atom_feats.detach(),
            return_pae_logits = return_pae_logits
        )