Commit d531216

small test for customizable compress mlp

1 parent 8165138

3 files changed (+40, -6 lines)

native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 3 additions & 3 deletions
@@ -135,15 +135,15 @@ def __init__(
         compress_dim = compress_block_size * dim_head
         compress_mlp_dim_hidden = int(compress_mlp_expand_factor * compress_dim)
 
-        mlp = nn.Sequential(
+        compress_mlp = nn.Sequential(
             Rearrange('b h w n d -> b h w (n d)'),
             nn.Linear(compress_dim, compress_mlp_dim_hidden),
             nn.SiLU(),
             nn.Linear(compress_mlp_dim_hidden, dim_head),
         )
 
-        self.k_compress = deepcopy(mlp)
-        self.v_compress = deepcopy(mlp)
+        self.k_compress = deepcopy(compress_mlp)
+        self.v_compress = deepcopy(compress_mlp)
 
         # selection related

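For context, the compress MLP renamed here flattens each block of compress_block_size token embeddings and projects the block down to a single dim_head vector. Below is a minimal, self-contained shape sketch of that default MLP, assuming compress_block_size = 4, dim_head = 64, and an expand factor of 1 (the batch, head, and window sizes are arbitrary):

# Shape sketch of the default compress MLP (assumed values:
# compress_block_size = 4, dim_head = 64, expand factor = 1)
import torch
from torch import nn
from einops.layers.torch import Rearrange

compress_block_size = 4
dim_head = 64
compress_dim = compress_block_size * dim_head    # 256
compress_mlp_dim_hidden = compress_dim           # expand factor of 1

compress_mlp = nn.Sequential(
    Rearrange('b h w n d -> b h w (n d)'),       # flatten each block of n tokens into one vector
    nn.Linear(compress_dim, compress_mlp_dim_hidden),
    nn.SiLU(),
    nn.Linear(compress_mlp_dim_hidden, dim_head) # one dim_head vector per compressed block
)

# (batch, heads, num windows, block size, head dim) -> (batch, heads, num windows, head dim)
blocks = torch.randn(2, 8, 16, compress_block_size, dim_head)
assert compress_mlp(blocks).shape == (2, 8, 16, dim_head)

The Rearrange merges the per-block token and feature dimensions so a plain Linear can mix information across the whole block before projecting back down to dim_head.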
pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.0.9"
+version = "0.0.11"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "[email protected]" }

tests/test_sparse_attn.py

Lines changed: 36 additions & 2 deletions
@@ -1,12 +1,15 @@
 import pytest
+
 import torch
+from torch import nn
+from einops.layers.torch import Rearrange
+
+from native_sparse_attention_pytorch import SparseAttention
 
 @pytest.mark.parametrize('use_diff_topk', (False, True))
 def test_sparse_attn(
     use_diff_topk
 ):
-    from native_sparse_attention_pytorch import SparseAttention
-
     attn = SparseAttention(
         dim = 512,
         dim_head = 64,
@@ -23,3 +26,34 @@ def test_sparse_attn(
     attended = attn(tokens)
 
     assert tokens.shape == attended.shape
+
+def test_alternative_compress_mlp():
+
+    dim_head = 64
+    compress_dim = dim_head * 4
+
+    compress_mlp = nn.Sequential(
+        Rearrange('b h w n d -> b h w (n d)'),
+        nn.Linear(compress_dim, compress_dim),
+        nn.SiLU(),
+        nn.Linear(compress_dim, compress_dim),
+        nn.SiLU(),
+        nn.Linear(compress_dim, dim_head),
+    )
+
+    attn = SparseAttention(
+        dim = 512,
+        dim_head = 64,
+        heads = 8,
+        sliding_window_size = 2,
+        compress_block_size = 4,
+        selection_block_size = 4,
+        num_selected_blocks = 2,
+        compress_mlp = compress_mlp
+    )
+
+    tokens = torch.randn(2, 31, 512)
+
+    attended = attn(tokens)
+
+    assert tokens.shape == attended.shape
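Judging by this test, any module with the same contract can be passed through the compress_mlp keyword: it receives blocks shaped (batch, heads, num windows, compress_block_size, dim_head) and must return (batch, heads, num windows, dim_head). Here the drop-in replacement is a deeper MLP with two SiLU hidden layers, and its input width compress_dim = dim_head * 4 lines up with compress_block_size = 4 in the SparseAttention constructor.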