149 | 149 |
150 | 150 | LinearNoBias = partial(Linear, bias = False) |
151 | 151 |
| 152 | +# always use non reentrant checkpointing |
| 153 | + |
| 154 | +checkpoint = partial(checkpoint, use_reentrant = False) |
| 155 | +checkpoint_sequential = partial(checkpoint_sequential, use_reentrant = False) |
| 156 | + |
152 | 157 | # helper functions |
153 | 158 |
154 | 159 | def exists(v): |
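The two `partial` lines above rebind `checkpoint` and `checkpoint_sequential` so every later call in the file is non-reentrant by default, which is what lets the explicit `use_reentrant = False` argument be dropped at each call site in the hunks below. A minimal sketch of the same pattern, with a toy block standing in for the repository's modules:

```python
from functools import partial

import torch
from torch import nn
from torch.utils.checkpoint import checkpoint

# bake the non-reentrant flag in once, so call sites stay clean
checkpoint = partial(checkpoint, use_reentrant = False)

# illustrative block, not from the repository
block = nn.Sequential(nn.Linear(64, 64), nn.GELU(), nn.Linear(64, 64))

x = torch.randn(2, 16, 64, requires_grad = True)

# activations inside `block` are recomputed during backward instead of stored
out = checkpoint(block, x)
out.sum().backward()
```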
@@ -1179,7 +1184,7 @@ def inner(inputs): |
1179 | 1184 | wrapped_layers.append(msa_transition_wrapper(msa_transition)) |
1180 | 1185 | wrapped_layers.append(pairwise_block_wrapper(pairwise_block)) |
1181 | 1186 |
1182 | | - pairwise_repr, *_ = checkpoint_sequential(wrapped_layers, self.checkpoint_segments, inputs, use_reentrant = False) |
| 1187 | + pairwise_repr, *_ = checkpoint_sequential(wrapped_layers, self.checkpoint_segments, inputs) |
1183 | 1188 |
1184 | 1189 | return pairwise_repr |
1185 | 1190 |
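With the non-reentrant default baked in, this hunk (and the similar ones at 1399, 1618, and 1971 below) can call `checkpoint_sequential` without repeating `use_reentrant = False`. Each wrapper consumes and returns the packed `inputs` tuple, so heterogeneous layers can be chained and split into `checkpoint_segments` segments. A hedged sketch of that wrapping pattern; the tiny layers, names, and shapes below are illustrative, not from the repository:

```python
from functools import partial

import torch
from torch import nn
from torch.utils.checkpoint import checkpoint_sequential

checkpoint_sequential = partial(checkpoint_sequential, use_reentrant = False)

def wrapper(fn):
    # every wrapped layer takes and returns the whole packed state,
    # so checkpoint_sequential can thread one object through the chain
    def inner(inputs):
        pairwise_repr, mask = inputs
        pairwise_repr = fn(pairwise_repr) + pairwise_repr
        return pairwise_repr, mask
    return inner

wrapped_layers = [
    wrapper(nn.Sequential(nn.LayerNorm(32), nn.Linear(32, 32))),
    wrapper(nn.Linear(32, 32)),
]

pairwise_repr = torch.randn(2, 16, 32, requires_grad = True)
mask = torch.ones(2, 16, dtype = torch.bool)

checkpoint_segments = 2  # one checkpointed segment per layer in this toy case

pairwise_repr, *_ = checkpoint_sequential(wrapped_layers, checkpoint_segments, (pairwise_repr, mask))
pairwise_repr.sum().backward()
```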
@@ -1399,7 +1404,7 @@ def inner(inputs, *args, **kwargs): |
1399 | 1404 | wrapped_layers.append(pair_bias_attn_wrapper(pair_bias_attn)) |
1400 | 1405 | wrapped_layers.append(single_transition_wrapper(single_transition)) |
1401 | 1406 |
1402 | | - single_repr, pairwise_repr, _ = checkpoint_sequential(wrapped_layers, self.checkpoint_segments, inputs, use_reentrant = False) |
| 1407 | + single_repr, pairwise_repr, _ = checkpoint_sequential(wrapped_layers, self.checkpoint_segments, inputs) |
1403 | 1408 |
1404 | 1409 | return single_repr, pairwise_repr |
1405 | 1410 |
@@ -1618,7 +1623,7 @@ def inner(inputs): |
1618 | 1623 | for block in self.pairformer_stack: |
1619 | 1624 | wrapped_layers.append(block_wrapper(block)) |
1620 | 1625 |
1621 | | - templates, _ = checkpoint_sequential(wrapped_layers, self.checkpoint_segments, inputs, use_reentrant = False) |
| 1626 | + templates, _ = checkpoint_sequential(wrapped_layers, self.checkpoint_segments, inputs) |
1622 | 1627 |
1623 | 1628 | return templates |
1624 | 1629 |
@@ -1858,6 +1863,7 @@ def __init__( |
1858 | 1863 | dim = dim, |
1859 | 1864 | prenorm = True, |
1860 | 1865 | gate_value_heads = True, |
| 1866 | + remove_even_power_dups = True, |
1861 | 1867 | **linear_attn_kwargs |
1862 | 1868 | ) |
1863 | 1869 |
@@ -1971,7 +1977,7 @@ def inner(inputs): |
1971 | 1977 | wrapped_layers.append(attn_wrapper(attn)) |
1972 | 1978 | wrapped_layers.append(transition_wrapper(transition)) |
1973 | 1979 |
1974 | | - out = checkpoint_sequential(wrapped_layers, self.checkpoint_segments, inputs, use_reentrant = False) |
| 1980 | + out = checkpoint_sequential(wrapped_layers, self.checkpoint_segments, inputs) |
1975 | 1981 |
1976 | 1982 | noised_repr, *_ = out |
1977 | 1983 | return noised_repr |
@@ -2439,7 +2445,7 @@ def forward( |
2439 | 2445 | token_transformer = self.token_transformer |
2440 | 2446 |
2441 | 2447 | if should_checkpoint(self, tokens, 'checkpoint_token_transformer'): |
2442 | | - token_transformer = partial(checkpoint, token_transformer, use_reentrant = False) |
| 2448 | + token_transformer = partial(checkpoint, token_transformer) |
2443 | 2449 |
2444 | 2450 | # token transformer |
2445 | 2451 |
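The `should_checkpoint` gate swaps the module for a `partial(checkpoint, module)` when checkpointing is active, so the call site stays identical either way; the same pattern is reused for the trunk pairformer and, further down, the distogram head. The helper's exact logic isn't shown in this diff, so the sketch below assumes it checks training mode, a grad-requiring input, and the named flag, and the toy module is illustrative:

```python
from functools import partial

import torch
from torch import nn
from torch.utils.checkpoint import checkpoint

checkpoint = partial(checkpoint, use_reentrant = False)

def should_checkpoint_sketch(module: nn.Module, inputs, attr_name: str) -> bool:
    # assumed stand-in for the repository's `should_checkpoint` helper
    tensors = inputs if isinstance(inputs, tuple) else (inputs,)
    return (
        module.training
        and any(t.requires_grad for t in tensors)
        and getattr(module, attr_name, False)
    )

class TinyDiffusionModule(nn.Module):
    # illustrative stand-in, not the repository's diffusion module
    def __init__(self):
        super().__init__()
        self.token_transformer = nn.Sequential(nn.Linear(64, 64), nn.GELU(), nn.Linear(64, 64))
        self.checkpoint_token_transformer = True

    def forward(self, tokens):
        token_transformer = self.token_transformer

        # identical call signature whether or not checkpointing is applied
        if should_checkpoint_sketch(self, tokens, 'checkpoint_token_transformer'):
            token_transformer = partial(checkpoint, token_transformer)

        return token_transformer(tokens)

model = TinyDiffusionModule().train()
tokens = torch.randn(2, 16, 64, requires_grad = True)
model(tokens).sum().backward()
```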
@@ -2917,9 +2923,7 @@ def forward( |
2917 | 2923 | mask = mask & paired_coords_mask |
2918 | 2924 |
2919 | 2925 | # Calculate masked averaging |
2920 | | - lddt_sum = (eps * mask).sum(dim=(-1, -2)) |
2921 | | - lddt_count = mask.sum(dim=(-1, -2)) |
2922 | | - lddt = lddt_sum / lddt_count.clamp(min=1) |
| 2926 | + lddt = masked_average(eps, mask = mask, dim = (-1, -2), eps = 1) |
2923 | 2927 |
2924 | 2928 | return 1. - lddt.mean() |
2925 | 2929 |
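The three-line masked reduction in the smooth lDDT loss is folded into a `masked_average` helper. Note the naming at the call site: the first positional argument is the per-pair term that the surrounding code calls `eps`, while the keyword `eps = 1` clamps the denominator. The helper itself isn't shown in this hunk, so the version below is a minimal sketch consistent with that call, not necessarily the repository's implementation:

```python
import torch

def masked_average(t, mask, *, dim, eps = 1.):
    # sketch only: masked sum over `dim`, divided by the clamped count of valid entries
    num = (t * mask).sum(dim = dim)
    den = mask.sum(dim = dim).type_as(num)
    return num / den.clamp(min = eps)

scores = torch.rand(2, 8, 8)          # illustrative per-pair terms
mask = torch.rand(2, 8, 8) > 0.5      # boolean validity mask

avg = masked_average(scores, mask = mask, dim = (-1, -2), eps = 1)
print(avg.shape)  # torch.Size([2])
```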
@@ -4994,6 +4998,7 @@ def __init__( |
4994 | 4998 | distogram_atom_resolution = False, |
4995 | 4999 | checkpoint_input_embedding = False, |
4996 | 5000 | checkpoint_trunk_pairformer = False, |
| 5001 | + checkpoint_distogram_head = True, |
4997 | 5002 | checkpoint_diffusion_token_transformer = False, |
4998 | 5003 | detach_when_recycling = True, |
4999 | 5004 | pdb_training_set=True, |
@@ -5213,6 +5218,7 @@ def __init__( |
5213 | 5218 |
5214 | 5219 | self.checkpoint_trunk_pairformer = checkpoint_trunk_pairformer |
5215 | 5220 | self.checkpoint_diffusion_token_transformer = checkpoint_diffusion_token_transformer |
| 5221 | + self.checkpoint_distogram_head = checkpoint_distogram_head |
5216 | 5222 |
5217 | 5223 | # loss related |
5218 | 5224 |
@@ -5566,7 +5572,7 @@ def forward( |
5566 | 5572 | pairformer = self.pairformer |
5567 | 5573 |
5568 | 5574 | if should_checkpoint(self, (single, pairwise), 'checkpoint_trunk_pairformer'): |
5569 | | - pairformer = partial(checkpoint, pairformer, use_reentrant = False) |
| 5575 | + pairformer = partial(checkpoint, pairformer) |
5570 | 5576 |
5571 | 5577 | # main attention trunk (pairformer) |
5572 | 5578 |
@@ -5693,7 +5699,12 @@ def forward( |
5693 | 5699 |
5694 | 5700 | distance_labels = torch.where(distogram_mask, distance_labels, ignore) |
5695 | 5701 |
5696 | | - distogram_logits = self.distogram_head( |
| 5702 | + distogram_head_fn = self.distogram_head |
| 5703 | + |
| 5704 | + if should_checkpoint(self, pairwise, 'checkpoint_distogram_head'): |
| 5705 | + distogram_head_fn = partial(checkpoint, distogram_head_fn) |
| 5706 | + |
| 5707 | + distogram_logits = distogram_head_fn( |
5697 | 5708 | pairwise, |
5698 | 5709 | molecule_atom_lens = molecule_atom_lens, |
5699 | 5710 | atom_feats = atom_feats |
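The final hunk threads the new `checkpoint_distogram_head` flag (default `True`, added in the `__init__` hunks above) through to the distogram head by swapping `self.distogram_head` for a `partial(checkpoint, ...)` wrapper. One detail the non-reentrant mode makes possible: the head is called with keyword arguments (`molecule_atom_lens`, `atom_feats`), and `torch.utils.checkpoint.checkpoint` only forwards extra keyword arguments when `use_reentrant = False`. A hedged sketch with a stand-in head; the class, shapes, and gating condition below are illustrative, not the repository's distogram head or `should_checkpoint` logic:

```python
from functools import partial

import torch
from torch import nn
from torch.utils.checkpoint import checkpoint

checkpoint = partial(checkpoint, use_reentrant = False)

class ToyDistogramHead(nn.Module):
    # illustrative stand-in, not the repository's distogram head
    def __init__(self, dim = 32, num_bins = 64):
        super().__init__()
        self.to_logits = nn.Linear(dim, num_bins)

    def forward(self, pairwise, *, scale = 1.):
        return self.to_logits(pairwise) * scale

head = ToyDistogramHead().train()
pairwise = torch.randn(1, 8, 8, 32, requires_grad = True)

# stands in for: should_checkpoint(self, pairwise, 'checkpoint_distogram_head')
head_fn = head
if head.training and pairwise.requires_grad:
    head_fn = partial(checkpoint, head_fn)

# keyword arguments pass straight through the non-reentrant checkpoint wrapper
logits = head_fn(pairwise, scale = 0.5)
logits.sum().backward()
```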