@@ -178,7 +178,8 @@ def __init__(
         num_memory_kv: int = 0,
         enable_attn_softclamp = False,
         attn_softclamp_value = 50.,
-        init_gate_bias = -2.
+        init_gate_bias = -2.,
+        softmax_full_precision = False
     ):
         super().__init__()
         """
@@ -201,6 +202,7 @@ def __init__(
             window_size = window_size,
             enable_attn_softclamp = enable_attn_softclamp,
             attn_softclamp_value = attn_softclamp_value,
+            softmax_full_precision = softmax_full_precision
         )

         self.split_heads = Rearrange('b n (h d) -> b h n d', h = heads)
@@ -279,7 +281,8 @@ def __init__(
         window_size = None,
         scale: float | None = None,
         enable_attn_softclamp = False,
-        attn_softclamp_value = 50.
+        attn_softclamp_value = 50.,
+        softmax_full_precision = False
     ):
         super().__init__()
         """
@@ -309,6 +312,9 @@ def __init__(
         self.enable_attn_softclamp = enable_attn_softclamp
         self.attn_softclamp_value = attn_softclamp_value

+        # whether to use full precision for softmax
+        self.softmax_full_precision = softmax_full_precision
+
     @typecheck
     def local_attn(
         self,
@@ -505,9 +511,16 @@ def forward(
                 mask, sim, max_neg_value(sim)
             )

+        # attention cast float32 - in case there are instabilities with float16
+
+        softmax_kwargs = dict()
+
+        if self.softmax_full_precision:
+            softmax_kwargs.update(dtype = torch.float32)
+
         # attention

-        attn = sim.softmax(dim = -1, dtype = torch.float32)
+        attn = sim.softmax(dim = -1, **softmax_kwargs)
         attn = attn.to(dtype)

         attn = self.attn_dropout(attn)
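
Below is a minimal standalone sketch of the pattern the last hunk introduces, for readers skimming the diff. The sim / dtype / softmax_kwargs names mirror the commit; the tensor shape and the plain boolean flag are illustrative assumptions (in the commit the flag lives on the module as self.softmax_full_precision).

import torch

softmax_full_precision = True  # the new flag; False leaves softmax in the logits' own dtype

sim = torch.randn(1, 4, 128, 128, dtype = torch.float16)  # attention logits in half precision
dtype = sim.dtype

# only request float32 accumulation when the flag is set
softmax_kwargs = dict()

if softmax_full_precision:
    softmax_kwargs.update(dtype = torch.float32)

attn = sim.softmax(dim = -1, **softmax_kwargs)  # softmax computed in float32 when enabled
attn = attn.to(dtype)                           # cast back so downstream matmuls stay in the input dtype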