Support varlen mean pooling compression

yzhangcs · yzhangcs · commit a60d29120967 · 2025-03-15T07:26:46.000-07:00
diff --git a/3rdparty/flash-linear-attention b/3rdparty/flash-linear-attention
@@ -1 +1 @@
-Subproject commit 80e5b0c0331c734789328ceb117f47e344584308
+Subproject commit d262316b96872e4b5b4112cbc639a46b48930292
diff --git a/native_sparse_attention/modeling_nsa.py b/native_sparse_attention/modeling_nsa.py
@@ -98,7 +98,6 @@ def forward(
         g = rearrange(self.g_proj(hidden_states), '... (h d) -> ... h d', d=3)
         g_cmp, g_slc, g_swa = g.sigmoid().unbind(-1)
 
-        # equivalent to cu_seqlens in `flash_attn`
         cu_seqlens = kwargs.get('cu_seqlens', None)
 
         seqlen_offset, max_seqlen = 0, seq_len
@@ -138,6 +137,7 @@ def forward(
             block_size=self.block_size,
             block_counts=self.block_counts,
             window_size=self.window_size,
+            cu_seqlens=cu_seqlens,
             head_first=False
         )
         o = o.reshape(batch_size, seq_len, -1)
diff --git a/native_sparse_attention/ops/naive.py b/native_sparse_attention/ops/naive.py
@@ -5,9 +5,25 @@
 from typing import Optional, Union
 
 import torch
+import torch.nn.functional as F
 from einops import rearrange, repeat
 
-from native_sparse_attention.ops.parallel import compression
+
+@torch.compile
+def compression(
+    k: torch.Tensor,
+    v: torch.Tensor,
+    block_size: int
+) -> torch.Tensor:
+    # Compression is currently plain mean pooling over blocks of `block_size` positions along dim 1.
+    B, T, H = k.shape[:3]
+    num_block = math.ceil(T / block_size)  # rounds up so a partial tail block gets its own slot
+    if k.shape[1] % block_size != 0:  # T is not a multiple of block_size
+        k = F.pad(k, (0, 0, 0, 0, 0, num_block * block_size - T))  # pads the end of the third-from-last axis
+        v = F.pad(v, (0, 0, 0, 0, 0, num_block * block_size - T))  # assumes v has the same leading shape as k
+    k_cmp = k.view(B, num_block, block_size, H, -1).mean(dim=2)  # padded entries count toward the tail mean
+    v_cmp = v.view(B, num_block, block_size, H, -1).mean(dim=2)  # divisor is always block_size
+    return k_cmp, v_cmp  # a (k_cmp, v_cmp) pair, although the annotation names a single Tensor
 
 
 def naive_nsa(
diff --git a/native_sparse_attention/ops/parallel.py b/native_sparse_attention/ops/parallel.py
@@ -1,19 +1,18 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 
-import math
 import warnings
 from typing import Optional, Union
 
 import torch
-import torch.nn.functional as F
 import triton
 import triton.language as tl
 import triton.language.core as core
 from einops import rearrange
 
 from fla.ops.common.utils import (prepare_chunk_indices, prepare_chunk_offsets,
                                   prepare_lens, prepare_token_indices)
+from fla.ops.utils import mean_pooling
 from fla.utils import autocast_custom_bwd, autocast_custom_fwd, contiguous
 from native_sparse_attention.ops.utils import _bitonic_merge
 
@@ -812,23 +811,6 @@ def parallel_nsa_bwd_kernel_dkv(
     tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1))
 
 
-@torch.compile
-def compression(
-    k: torch.Tensor,
-    v: torch.Tensor,
-    block_size: int
-) -> torch.Tensor:
-    # Currently, we set mean pooling as our basic compression function.
-    B, T, H = k.shape[:3]
-    num_block = math.ceil(T / block_size)
-    if k.shape[1] % block_size != 0:
-        k = F.pad(k, (0, 0, 0, 0, 0, num_block * block_size - T))
-        v = F.pad(v, (0, 0, 0, 0, 0, num_block * block_size - T))
-    k_cmp = k.view(B, num_block, block_size, H, -1).mean(dim=2)
-    v_cmp = v.view(B, num_block, block_size, H, -1).mean(dim=2)
-    return k_cmp, v_cmp
-
-
 def parallel_nsa_compression_fwd(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -1411,7 +1393,7 @@ def parallel_nsa(
             block_counts = rearrange(block_counts, 'b h t -> b t h')
     assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA"
 
-    k_cmp, v_cmp = compression(k, v, block_size)
+    k_cmp, v_cmp = mean_pooling(k, block_size, cu_seqlens), mean_pooling(v, block_size, cu_seqlens)
     o_cmp, lse_cmp = None, None
     if g_cmp is not None:
         o_cmp, lse_cmp = parallel_nsa_compression(
@@ -1439,7 +1421,7 @@ def parallel_nsa(
         o = torch.addcmul(o, o_cmp, g_cmp.unsqueeze(-1))
     if window_size > 0:
         if cu_seqlens is not None:
-            max_seqlen = prepare_lens(cu_seqlens)
+            max_seqlen = q.shape[1]
             o_swa = flash_attn_varlen_func(
                 q.squeeze(0), k.squeeze(0), v.squeeze(0),
                 cu_seqlens_q=cu_seqlens,