@@ -165,6 +165,7 @@ def __init__(
         query_bias = True,
         flash = True,
         window_size = None,
+        num_memory_kv: int = 0,
         efficient_attn_config: Config = Config(True, True, True)
     ):
         super().__init__()
@@ -178,6 +179,7 @@ def __init__(
         e - dimension (pairwise rep)
         i - source sequence
         j - context sequence
+        m - memory key / value seq
         """

         dim_inner = dim_head * heads
@@ -196,6 +198,12 @@ def __init__(
         self.to_kv = nn.Linear(dim, dim_inner * 2, bias = False)
         self.to_out = nn.Linear(dim_inner, dim, bias = False)

+        self.memory_kv = None
+
+        if num_memory_kv > 0:
+            self.memory_kv = nn.Parameter(torch.zeros(2, heads, num_memory_kv, dim_head))
+            nn.init.normal_(self.memory_kv, std = 0.02)
+
         # gating of value
         # allows attention to attend to nothing
@@ -230,7 +238,8 @@ def forward(
         out = self.attend(
             q, k, v,
             attn_bias = attn_bias,
-            mask = mask
+            mask = mask,
+            memory_kv = self.memory_kv
         )

         # merge heads
@@ -315,7 +324,8 @@ def local_attn(
         k: Float['b h n d'],
         v: Float['b h n d'],
         mask: Bool['b n'] | None = None,
-        attn_bias: Float['... n n'] | Float['... nw w (w*2)'] | None = None
+        attn_bias: Float['... n n'] | Float['... nw w (w*2)'] | None = None,
+        memory_kv: Float['2 h m d'] | None = None
     ) -> Float['b h n d']:
         """
         simple local attention with a radius of 1 window size
@@ -363,6 +373,24 @@ def local_attn(

         q = q * scale

+        # append memory key / values for local attention windows
+
+        if exists(memory_kv):
+            batch, seq, num_mem_kv = k.shape[0], k.shape[2], memory_kv.shape[-2]
+
+            mk, mv = memory_kv
+            mk, mv = tuple(repeat(t, 'h m d -> b h n m d', b = batch, n = seq) for t in (mk, mv))
+            k = torch.cat((mk, k), dim = -2)
+            v = torch.cat((mv, v), dim = -2)
+
+            if exists(attn_bias):
+                attn_bias = pad_at_dim(attn_bias, (num_mem_kv, 0), value = 0.)
+
+            if exists(mask):
+                mask = pad_at_dim(mask, (num_mem_kv, 0), value = True)
+
+        # similarity
+
         sim = einsum(q, k, "... i d, ... j d -> ... i j")

         if exists(attn_bias):
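
Shape note (editorial, not part of the diff): a minimal trace of the windowed memory key / value concatenation added above. The layout is inferred from the repeat pattern 'h m d -> b h n m d' and the cat along dim -2; the exact per-window key count j depends on how local_attn buckets the sequence and is an assumption here.

# memory_kv           : (2, h, m, d)          learned parameter, shared across batch and windows
# k before concat     : (b, h, nw, j, d)      keys already grouped into nw windows (n = nw = k.shape[2])
# mk after repeat     : (b, h, nw, m, d)      broadcast over the batch and over every window
# k after cat(dim=-2) : (b, h, nw, m + j, d)  m memory slots prepended to each window's keys
# attn_bias and mask are left-padded by m along the key dimension so the memory slots stay attendable
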
@@ -399,6 +427,7 @@ def forward(
         v: Float['b h j d'],
         mask: Bool['b j'] | None = None,
         attn_bias: Float['... i j'] | Float['... nw w (w*2)'] | None = None,
+        memory_kv: Float['2 h m d'] | None = None
     ) -> Float['b h i d']:

         is_windowed_attn_bias = None
@@ -410,10 +439,26 @@ def forward(
         # todo (handle attn bias efficiently)

         if self.is_local_attn:
-            return self.local_attn(q, k, v, mask = mask, attn_bias = attn_bias)
+            return self.local_attn(q, k, v, mask = mask, attn_bias = attn_bias, memory_kv = memory_kv)

         assert not exists(is_windowed_attn_bias) or not is_windowed_attn_bias

+        # append memory key / values
+
+        if exists(memory_kv):
+            batch, num_mem_kv = q.shape[0], memory_kv.shape[-2]
+
+            mk, mv = memory_kv
+            mk, mv = tuple(repeat(t, 'h m d -> b h m d', b = batch) for t in (mk, mv))
+            k = torch.cat((mk, k), dim = -2)
+            v = torch.cat((mv, v), dim = -2)
+
+            if exists(attn_bias):
+                attn_bias = pad_at_dim(attn_bias, (num_mem_kv, 0), value = 0.)
+
+            if exists(mask):
+                mask = pad_at_dim(mask, (num_mem_kv, 0), value = True)
+
         # forward to using flash attention if applicable

         can_use_flash = self.flash and not exists(attn_bias) # flash attention does not support attention bias with gradients
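
Usage sketch (editorial, not part of the diff): how the new num_memory_kv option could be exercised. This assumes the hunks above belong to the repository's Attention module and that its forward accepts a token sequence plus an optional mask; everything outside the shown hunks (class name, remaining constructor defaults, call signature) is an assumption, not confirmed by the diff.

import torch

# hypothetical construction; dim, dim_head, heads, window_size and the new
# num_memory_kv all appear in the hunks above, the rest is assumed
attn = Attention(
    dim = 64,
    dim_head = 16,
    heads = 4,
    window_size = None,   # global attention path; local_attn also receives the memory kv
    num_memory_kv = 4     # allocates a learned (2, heads, 4, dim_head) parameter
)

x = torch.randn(2, 128, 64)
mask = torch.ones(2, 128).bool()

out = attn(x, mask = mask)   # every query can additionally attend to the 4 memory slots
assert out.shape == x.shape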