Commit 115279f (parent: f1b9c2d)

leverage einmix for a kv compress mlp with separate parameters per head
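
Background on the pattern (not part of the commit): EinMix with a head axis in its weight_shape learns a separate projection matrix and bias per head, instead of a single nn.Linear shared across heads. An illustrative standalone sketch, with made-up sizes:

# illustrative sketch, not from the commit: a leading head axis in weight_shape
# gives every head its own (dim_in -> dim_out) projection
import torch
from einops.layers.torch import EinMix as Mix

heads, dim_in, dim_out = 8, 256, 128  # example sizes only

per_head_proj = Mix(
    'b h w i -> b h w o',
    weight_shape = 'h i o',   # one (i, o) weight matrix per head
    bias_shape = 'h o',       # one bias vector per head
    h = heads, i = dim_in, o = dim_out
)

x = torch.randn(2, heads, 16, dim_in)
out = per_head_proj(x)
assert out.shape == (2, heads, 16, dim_out)

This is what lets the GroupedMLP added below keep per-head compression parameters without looping over heads in Python.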

4 files changed (+113 lines, -54 lines)

native_sparse_attention_pytorch/compress_networks.py (32 additions, 0 deletions)

@@ -3,6 +3,7 @@
 from torch.nn import Module, ModuleList
 
 from einops import einsum, rearrange
+from einops.layers.torch import EinMix as Mix
 
 # helpers
 
@@ -66,3 +67,34 @@ def forward(
 
         return compressed
 
+# mlp per head
+
+class GroupedMLP(Module):
+    def __init__(
+        self,
+        dim_head,
+        compress_block_size,
+        heads,
+        expand_factor = 1.,
+    ):
+        super().__init__()
+
+        dim = dim_head * compress_block_size
+        dim_hidden = int(dim * expand_factor)
+        dim_out = dim_head
+
+        self.net = nn.Sequential(
+            Mix('b h w i -> b h w o', weight_shape = 'h i o', bias_shape = 'h o', h = heads, i = dim, o = dim_hidden),
+            nn.ReLU(),
+            Mix('b h w i -> b h w o', weight_shape = 'h i o', bias_shape = 'h o', h = heads, i = dim_hidden, o = dim_out),
+        )
+
+    def forward(
+        self,
+        kv
+    ):
+        kv = rearrange(kv, 'b h w n d -> b h w (n d)')
+
+        compressed = self.net(kv)
+
+        return compressed
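
For reference, a usage sketch of the new module (not from the commit). The kv shape of (batch, heads, number of compressed blocks, block size, dim per head) is assumed from the 'b h w n d' rearrange pattern above, and the sizes are illustrative:

# usage sketch under assumed shapes; each compressed block of key/value tokens
# is flattened and projected down to a single dim_head vector, per head
import torch
from native_sparse_attention_pytorch.compress_networks import GroupedMLP

mlp = GroupedMLP(dim_head = 64, compress_block_size = 4, heads = 8)

kv = torch.randn(2, 8, 16, 4, 64)          # 16 blocks of 4 tokens per head
compressed = mlp(kv)

assert compressed.shape == (2, 8, 16, 64)  # one compressed token per block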

pyproject.toml (2 additions, 2 deletions)

@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.0.27"
+version = "0.0.28"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "[email protected]" }
@@ -24,7 +24,7 @@ classifiers=[
 
 dependencies = [
     "einx>=0.3.0",
-    "einops>=0.8.0",
+    "einops>=0.8.1",
     "local-attention>=1.11.1",
     "rotary-embedding-torch",
     "torch>=2.5",

tests/test_custom_compress_mlp.py (79 additions, 0 deletions)

@@ -0,0 +1,79 @@
+import pytest
+
+import torch
+from torch import nn
+from einops.layers.torch import Rearrange
+
+from native_sparse_attention_pytorch import SparseAttention
+
+def test_alternative_compress_mlp():
+
+    dim_head = 64
+    compress_dim = dim_head * 4
+
+    compress_mlp = nn.Sequential(
+        Rearrange('b h w n d -> b h w (n d)'),
+        nn.Linear(compress_dim, compress_dim),
+        nn.SiLU(),
+        nn.Linear(compress_dim, compress_dim),
+        nn.SiLU(),
+        nn.Linear(compress_dim, dim_head),
+    )
+
+    attn = SparseAttention(
+        dim = 512,
+        dim_head = 64,
+        heads = 8,
+        sliding_window_size = 2,
+        compress_block_size = 4,
+        selection_block_size = 4,
+        num_selected_blocks = 2,
+        compress_mlp = compress_mlp
+    )
+
+    tokens = torch.randn(2, 31, 512)
+
+    attended = attn(tokens)
+
+    assert tokens.shape == attended.shape
+
+
+def test_compress_networks():
+    from native_sparse_attention_pytorch.compress_networks import AttentionPool
+
+    attn = SparseAttention(
+        dim = 512,
+        dim_head = 64,
+        heads = 8,
+        sliding_window_size = 2,
+        compress_block_size = 4,
+        selection_block_size = 4,
+        num_selected_blocks = 2,
+        compress_mlp = AttentionPool(64, 4)
+    )
+
+    tokens = torch.randn(2, 31, 512)
+
+    attended = attn(tokens)
+
+    assert tokens.shape == attended.shape
+
+def test_group_mlp():
+    from native_sparse_attention_pytorch.compress_networks import GroupedMLP
+
+    attn = SparseAttention(
+        dim = 512,
+        dim_head = 64,
+        heads = 8,
+        sliding_window_size = 2,
+        compress_block_size = 4,
+        selection_block_size = 4,
+        num_selected_blocks = 2,
+        compress_mlp = GroupedMLP(64, 4, 8)
+    )
+
+    tokens = torch.randn(2, 31, 512)
+
+    attended = attn(tokens)
+
+    assert tokens.shape == attended.shape

tests/test_sparse_attn.py (0 additions, 52 deletions)

@@ -33,55 +33,3 @@ def test_sparse_attn(
     attended = attn(tokens)
 
     assert tokens.shape == attended.shape
-
-def test_alternative_compress_mlp():
-
-    dim_head = 64
-    compress_dim = dim_head * 4
-
-    compress_mlp = nn.Sequential(
-        Rearrange('b h w n d -> b h w (n d)'),
-        nn.Linear(compress_dim, compress_dim),
-        nn.SiLU(),
-        nn.Linear(compress_dim, compress_dim),
-        nn.SiLU(),
-        nn.Linear(compress_dim, dim_head),
-    )
-
-    attn = SparseAttention(
-        dim = 512,
-        dim_head = 64,
-        heads = 8,
-        sliding_window_size = 2,
-        compress_block_size = 4,
-        selection_block_size = 4,
-        num_selected_blocks = 2,
-        compress_mlp = compress_mlp
-    )
-
-    tokens = torch.randn(2, 31, 512)
-
-    attended = attn(tokens)
-
-    assert tokens.shape == attended.shape
-
-
-def test_compress_networks():
-    from native_sparse_attention_pytorch.compress_networks import AttentionPool
-
-    attn = SparseAttention(
-        dim = 512,
-        dim_head = 64,
-        heads = 8,
-        sliding_window_size = 2,
-        compress_block_size = 4,
-        selection_block_size = 4,
-        num_selected_blocks = 2,
-        compress_mlp = AttentionPool(64, 4)
-    )
-
-    tokens = torch.randn(2, 31, 512)
-
-    attended = attn(tokens)
-
-    assert tokens.shape == attended.shape
