@@ -243,83 +243,97 @@ def forward(
 
         importance_scores = cattn[..., num_mem_compress_kv:]
 
-        topk = min(self.num_selected_blocks, importance_scores.shape[-1])
-
-        selected_importance_values, selected_block_indices = importance_scores.topk(topk, dim = -1)
-
-        if self.use_diff_topk:
-            gates = selected_importance_values + (1. - selected_importance_values).detach()
-
-        fmask = selected_importance_values > 1e-10
+        num_selected = min(self.num_selected_blocks, importance_scores.shape[-1])
 
         fq = rotated_q
         fk = rotated_k
         fv = v
 
-        if seq_len < fine_divisible_seq_len:
-            remainder = fine_divisible_seq_len - seq_len
-            fk = pad_at_dim(fk, (0, remainder), value = 0., dim = -2)
-            fv = pad_at_dim(fv, (0, remainder), value = 0., dim = -2)
-            fq = pad_at_dim(fq, (0, remainder), value = 0., dim = -2)
+        if num_selected > 0:
+            selected_importance_values, selected_block_indices = importance_scores.topk(num_selected, dim = -1)
+
+            if self.use_diff_topk:
+                gates = selected_importance_values + (1. - selected_importance_values).detach()
 
-            fmask = pad_at_dim(fmask, (0, remainder), value = False, dim = -2)
+            fmask = selected_importance_values > 1e-10
 
-            selected_block_indices = pad_at_dim(selected_block_indices, (0, remainder), value = 0, dim = -2)
+            if seq_len < fine_divisible_seq_len:
+                remainder = fine_divisible_seq_len - seq_len
+                fk = pad_at_dim(fk, (0, remainder), value = 0., dim = -2)
+                fv = pad_at_dim(fv, (0, remainder), value = 0., dim = -2)
+                fq = pad_at_dim(fq, (0, remainder), value = 0., dim = -2)
 
-            if self.use_diff_topk:
-                gates = pad_at_dim(gates, (0, remainder), value = 1., dim = -2)
+                fmask = pad_at_dim(fmask, (0, remainder), value = False, dim = -2)
 
-        # handle block causal diagonal in the diagram, but run experiments without to see
+                selected_block_indices = pad_at_dim(selected_block_indices, (0, remainder), value = 0, dim = -2)
 
-        fine_window_seq = arange(fine_divisible_seq_len, device = device) // self.selection_block_size
-        fine_window_seq = repeat(fine_window_seq, 'n -> b h n 1', b = batch, h = heads)
-        selected_block_indices = cat((selected_block_indices, fine_window_seq), dim = -1) # for the block causal diagonal in fig2
+                if self.use_diff_topk:
+                    gates = pad_at_dim(gates, (0, remainder), value = 1., dim = -2)
 
-        fmask = repeat(fmask, 'b h i w -> b h i w j', j = self.selection_block_size)
+            # handle block causal diagonal in the diagram, but run experiments without to see
 
-        causal_mask = torch.ones((self.selection_block_size,) * 2, device = device, dtype = torch.bool).tril()
-        causal_mask = repeat(causal_mask, 'i j -> b h (w i) 1 j', w = num_fine_blocks, b = batch, h = heads)
+            fine_window_seq = arange(fine_divisible_seq_len, device = device) // self.selection_block_size
+            fine_window_seq = repeat(fine_window_seq, 'n -> b h n 1', b = batch, h = heads)
+            selected_block_indices = cat((selected_block_indices, fine_window_seq), dim = -1) # for the block causal diagonal in fig2
 
-        fmask = cat((fmask, causal_mask), dim = -2)
-        fmask = rearrange(fmask, 'b h i w j -> b h i (w j)')
+            fmask = repeat(fmask, 'b h i w -> b h i w j', j = self.selection_block_size)
 
-        # select out the spatial crops of keys / values for fine attention
+            causal_mask = torch.ones((self.selection_block_size,) * 2, device = device, dtype = torch.bool).tril()
+            causal_mask = repeat(causal_mask, 'i j -> b h (w i) 1 j', w = num_fine_blocks, b = batch, h = heads)
 
-        fk = rearrange(fk, 'b h (w n) d -> b h w n d', w = num_fine_blocks)
-        fv = rearrange(fv, 'b h (w n) d -> b h w n d', w = num_fine_blocks)
+            fmask = cat((fmask, causal_mask), dim = -2)
+            fmask = rearrange(fmask, 'b h i w j -> b h i (w j)')
 
-        # get_at("b h [w] j d, b h i selected -> b h i selected j d", fkv, selected_block_indices)
+            # select out the spatial crops of keys / values for fine attention
 
-        fk = repeat(fk, 'b h w j d -> b h i w j d', i = selected_block_indices.shape[2])
-        fv = repeat(fv, 'b h w j d -> b h i w j d', i = selected_block_indices.shape[2])
+            fk = rearrange(fk, 'b h (w n) d -> b h w n d', w = num_fine_blocks)
+            fv = rearrange(fv, 'b h (w n) d -> b h w n d', w = num_fine_blocks)
 
-        selected_block_indices = repeat(selected_block_indices, 'b h i sel -> b h i sel j d', j = fk.shape[-2], d = fk.shape[-1])
+            # get_at("b h [w] j d, b h i selected -> b h i selected j d", fkv, selected_block_indices)
 
-        fk = fk.gather(3, selected_block_indices)
-        fv = fv.gather(3, selected_block_indices)
+            fk = repeat(fk, 'b h w j d -> b h i w j d', i = selected_block_indices.shape[2])
+            fv = repeat(fv, 'b h w j d -> b h i w j d', i = selected_block_indices.shape[2])
 
-        # handle maybe gating
+            selected_block_indices = repeat(selected_block_indices, 'b h i sel -> b h i sel j d', j = fk.shape[-2], d = fk.shape[-1])
 
-        if self.use_diff_topk:
-            gates = F.pad(gates, (0, 1), value = 1.)
+            fk = fk.gather(3, selected_block_indices)
+            fv = fv.gather(3, selected_block_indices)
 
-            fk = einx.multiply('b h i w, b h i w j d -> b h i w j d', gates, fk)
-            fv = einx.multiply('b h i w, b h i w j d -> b h i w j d', gates, fv)
+            # handle maybe gating
+
+            if self.use_diff_topk:
+                gates = F.pad(gates, (0, 1), value = 1.)
 
-        fk = rearrange(fk, 'b h i w j d -> b h i (w j) d')
-        fv = rearrange(fv, 'b h i w j d -> b h i (w j) d')
+                fk = einx.multiply('b h i w, b h i w j d -> b h i w j d', gates, fk)
+                fv = einx.multiply('b h i w, b h i w j d -> b h i w j d', gates, fv)
 
-        # fine attention
+            fk = rearrange(fk, 'b h i w j d -> b h i (w j) d')
+            fv = rearrange(fv, 'b h i w j d -> b h i (w j) d')
+
+            # fine attention
+
+            fsim = einsum(fq, fk, 'b h i d, b h i j d -> b h i j') * self.scale
+
+            fsim = fsim.masked_fill(~fmask, mask_value)
+
+            fattn = fsim.softmax(dim = -1)
+
+            fine_attn_out = einsum(fattn, fv, 'b h i j, b h i j d -> b h i d')
+
+            fine_attn_out = fine_attn_out[..., :seq_len, :]
+        else:
+            # if only first block, just do a simple block causal
 
-        fsim = einsum(fq, fk, 'b h i d, b h i j d -> b h i j') * self.scale
+            seq_len = fk.shape[-2]
+            fmask = causal_mask = torch.ones((seq_len, seq_len), device = device, dtype = torch.bool).tril()
 
-        fsim = fsim.masked_fill(~fmask, mask_value)
+            fsim = einsum(fq, fk, 'b h i d, b h j d -> b h i j') * self.scale
 
-        fattn = fsim.softmax(dim = -1)
+            fsim = fsim.masked_fill(~fmask, mask_value)
 
-        fine_attn_out = einsum(fattn, fv, 'b h i j, b h i j d -> b h i d')
+            fattn = fsim.softmax(dim = -1)
 
-        fine_attn_out = fine_attn_out[..., :seq_len, :]
+            fine_attn_out = einsum(fattn, fv, 'b h i j, b h j d -> b h i d')
 
         # 3. overlapping sliding window, this is unsurprising and expected
 
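What the hunk changes, in short: the fine block-selection branch now only runs when at least one block can be selected (`num_selected > 0`), and otherwise falls back to plain block-causal attention over all keys; inside the selection branch, the differentiable top-k path still gates the gathered key/value blocks with a straight-through estimator. The sketch below illustrates just that gating trick in isolation. It is a minimal example with toy shapes and a hypothetical helper name, not the module's API; only the `gates = ... .detach()` line is taken directly from the code above.

```python
import torch

def straight_through_topk_gates(importance_scores, k):
    # importance_scores: (batch, heads, seq, num_blocks), assumed to lie in [0, 1]
    selected_values, selected_indices = importance_scores.topk(k, dim = -1)

    # same trick as `gates = selected_importance_values + (1. - selected_importance_values).detach()`:
    # the forward value is exactly 1, but d(gates)/d(selected_values) is 1, so gradient
    # still reaches the importance scores of the selected blocks
    gates = selected_values + (1. - selected_values).detach()
    return gates, selected_indices

scores = torch.rand(1, 2, 8, 4, requires_grad = True)
gates, indices = straight_through_topk_gates(scores, k = 2)

assert torch.allclose(gates, torch.ones_like(gates))
gates.sum().backward()    # scores.grad is nonzero only at the selected blocks
```

In the forward pass every selected gate is exactly 1, so multiplying the gathered `fk` / `fv` by the gates leaves them unchanged, while the backward pass still routes gradient into the importance scores, which is what lets the block selection itself be trained.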