
Commit 6aa5fd8 (parent 14beb73)

deviate from the paper and allow for interpolation of the compressed scores for better selected fine blocks, when compress block size > fine block size

3 files changed: +31 -5 lines

native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 28 additions & 3 deletions
@@ -15,7 +15,7 @@
 # einstein notation
 
 import einx
-from einops import einsum, repeat, rearrange, reduce
+from einops import einsum, repeat, rearrange, reduce, pack, unpack
 from einops.layers.torch import Rearrange
 
 # b - batch
@@ -109,13 +109,27 @@ def round_up_mult(n, mult):
 def divisible_by(num, den):
     return (num % den) == 0
 
+def pack_one_with_inverse(t, pattern):
+    packed, ps = pack([t], pattern)
+    def inverse(out):
+        return unpack(out, ps, pattern)[0]
+
+    return packed, inverse
+
 # tensor helpers
 
 def pad_at_dim(t, pad, dim = -1, value = 0.):
     dims_from_right = (- dim - 1) if dim < 0 else (t.ndim - dim - 1)
     zeros = ((0, 0) * dims_from_right)
     return F.pad(t, (*zeros, *pad), value = value)
 
+def interpolate_1d(x, length, mode = 'bilinear'):
+    x, inverse_pack = pack_one_with_inverse(x, '* n')
+    x = rearrange(x, 'b n -> b 1 n 1')
+    x = F.interpolate(x, (length, 1), mode = mode)
+    x = rearrange(x, 'b 1 n 1 -> b n')
+    return inverse_pack(x)
+
 def straight_through(t, target):
     return t + (target - t).detach()
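Since the two new helpers above are self-contained, their behavior is easy to sanity check in isolation. The sketch below is not part of the commit: it copies the helpers verbatim (with explanatory comments added) and confirms that interpolate_1d stretches only the last dimension, with pack_one_with_inverse preserving any leading batch or head dimensions; the tensor shape and target length are arbitrary choices for illustration.

import torch
import torch.nn.functional as F
from einops import rearrange, pack, unpack

def pack_one_with_inverse(t, pattern):
    packed, ps = pack([t], pattern)
    def inverse(out):
        return unpack(out, ps, pattern)[0]

    return packed, inverse

def interpolate_1d(x, length, mode = 'bilinear'):
    x, inverse_pack = pack_one_with_inverse(x, '* n')  # flatten all leading dims into one batch dim
    x = rearrange(x, 'b n -> b 1 n 1')                 # bilinear F.interpolate expects (b, c, h, w)
    x = F.interpolate(x, (length, 1), mode = mode)     # resample the length dimension to `length`
    x = rearrange(x, 'b 1 n 1 -> b n')
    return inverse_pack(x)                             # restore the original leading dims

scores = torch.randn(2, 8, 6)            # (batch, heads, num compressed blocks) - made-up shape
print(interpolate_1d(scores, 24).shape)  # torch.Size([2, 8, 24])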

@@ -135,6 +149,7 @@ def __init__(
         num_compressed_mem_kv = 4,
         norm = True,
         use_diff_topk = False,
+        interpolated_importance_score = False,
         compress_mlp: Module | None = None,
         compress_mlp_expand_factor = 1.,
         strategy_combine_mlp: Module | None = None
@@ -216,6 +231,8 @@ def __init__(
 
         self.use_diff_topk = use_diff_topk
 
+        self.interpolated_importance_score = interpolated_importance_score # in the case fine block size < compressed block size, will weigh space better when selecting
+
         self.selection_block_size = selection_block_size
 
         assert num_selected_blocks > 0
@@ -326,10 +343,18 @@ def forward(
         # first we expand all the compressed scores to the full sequence length, then average within each fine / selection block size - pad on the right with 0s, which should be fine as the sliding window covers the local tokens anyways
 
         if self.compress_block_size != self.selection_block_size:
-            importance_scores = repeat(importance_scores, '... j -> ... (j block_size)', block_size = self.compress_block_size)
-            padding = fine_divisible_seq_len - importance_scores.shape[-1]
 
+            score_len = importance_scores.shape[-1]
+            compress_seq_len = score_len * self.compress_block_size
+
+            if self.interpolated_importance_score:
+                importance_scores = interpolate_1d(importance_scores, compress_seq_len)
+            else:
+                importance_scores = repeat(importance_scores, '... j -> ... (j block_size)', block_size = self.compress_block_size)
+
+            padding = fine_divisible_seq_len - compress_seq_len
             importance_scores = F.pad(importance_scores, (0, padding))
+
             importance_scores = reduce(importance_scores, '... (j block_size) -> ... j', 'mean', block_size = self.selection_block_size)
 
         # handle if number of total blocks is less than number to select for fine attention
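To see what the new branch buys when the compressed block is coarser than the selection block, here is a toy comparison, not from the repository, of the two paths above; it assumes a compress_block_size of 8 and a selection_block_size of 4. The repeat path gives every selection block inside a compressed block an identical score, so ties between them are arbitrary, while the interpolated path produces a ramp between neighboring compressed blocks, so selection blocks nearer a high-scoring neighbor rank higher.

import torch
import torch.nn.functional as F
from einops import rearrange, repeat, reduce

compress_block_size = 8      # assumed for illustration
selection_block_size = 4     # assumed for illustration

# two compressed blocks: the first scored low, the second high
importance_scores = torch.tensor([[0., 1.]])

# previous behavior: nearest-neighbor style expansion via repeat
repeated = repeat(importance_scores, '... j -> ... (j b)', b = compress_block_size)

# new optional behavior: same reshape as interpolate_1d, then bilinear resampling
interpolated = F.interpolate(
    rearrange(importance_scores, 'b n -> b 1 n 1'),
    (importance_scores.shape[-1] * compress_block_size, 1),
    mode = 'bilinear'
)
interpolated = rearrange(interpolated, 'b 1 n 1 -> b n')

# both paths are then mean-pooled per selection block before the top-k block selection
print(reduce(repeated, '... (j b) -> ... j', 'mean', b = selection_block_size))      # tensor([[0., 0., 1., 1.]])
print(reduce(interpolated, '... (j b) -> ... j', 'mean', b = selection_block_size))  # tensor([[0.0000, 0.2500, 0.7500, 1.0000]])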

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.0.34"
+version = "0.0.35"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "[email protected]" }

train.py

Lines changed: 2 additions & 1 deletion
@@ -104,7 +104,8 @@ def base_decoding(
         compress_block_size = 32,
         selection_block_size = 32,
         num_selected_blocks = 2,
-        use_diff_topk = False
+        use_diff_topk = False,
+        interpolated_importance_score = True
     )
 ).cuda()
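As configured in train.py, both block sizes are 32, so the interpolation branch in forward() is skipped; per the diff above, it only runs when compress_block_size differs from selection_block_size. For a case where the flag actually matters, below is a hypothetical standalone usage sketch, assuming the SparseAttention class exported by native_sparse_attention_pytorch and its usual constructor kwargs; all values are illustrative, not taken from the commit.

import torch
from native_sparse_attention_pytorch import SparseAttention

# hypothetical configuration where compression is coarser than selection,
# which is the case the interpolated importance scores are meant to improve
attn = SparseAttention(
    dim = 512,
    dim_head = 64,
    heads = 8,
    sliding_window_size = 32,
    compress_block_size = 64,
    selection_block_size = 32,
    num_selected_blocks = 2,
    interpolated_importance_score = True   # new flag introduced by this commit
)

tokens = torch.randn(1, 1024, 512)
attended = attn(tokens)                    # same shape as the input tokens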
