Add sliding window logic for the backward pass

lucidrains · lucidrains · commit 1d3712ccae53 · 2025-03-03T15:10:23.000Z
diff --git a/native_sparse_attention_pytorch/triton_native_sparse_attention.py b/native_sparse_attention_pytorch/triton_native_sparse_attention.py
@@ -180,112 +180,121 @@ def forward_kernel_causal_and_sparse(
                 other = 0.0
             )
 
+    q = q.reshape(BLOCK * QUERY_HEAD_GROUPS, BLOCK_HEADDIM)
+
     if INCLUDE_BLOCK_CAUSAL:
 
-        offs_n = start_m * BLOCK + tl.arange(0, BLOCK)
+        if SLIDING:
+            num_kv_blocks = 2
+            offset = -BLOCK
+        else:
+            num_kv_blocks = 1
+            offset = 0
 
-        k_ptrs = (
-            K +
-            off_b * stride_kb +
-            off_h * stride_kh +
-            offs_n[:, None] * stride_kn +
-            offs_d[None, :]
-        )
+        offs_n = start_m * BLOCK + tl.arange(0, BLOCK) + offset
 
-        v_ptrs = (
-            V +
-            off_b * stride_vb +
-            off_h * stride_vh +
-            offs_n[:, None] * stride_vn +
-            offs_d[None, :]
-        )
+        for _ in range(num_kv_blocks):
 
-        if EVEN_N & EVEN_M:
-            if EVEN_HEADDIM:
-                k = tl.load(k_ptrs)
-            else:
-                k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
-        else:
-            if EVEN_HEADDIM:
-                k = tl.load(
-                    k_ptrs,
-                    mask = offs_n[:, None] < seqlen_k,
-                    other = 0.0,
-                )
-            else:
-                k = tl.load(
-                    k_ptrs,
-                    mask = (offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
-                    other = 0.0,
-                )
+            k_ptrs = (
+                K +
+                off_b * stride_kb +
+                off_h * stride_kh +
+                offs_n[:, None] * stride_kn +
+                offs_d[None, :]
+            )
 
-        qk = tl.zeros([BLOCK * QUERY_HEAD_GROUPS, BLOCK], dtype=tl.float32)
+            v_ptrs = (
+                V +
+                off_b * stride_vb +
+                off_h * stride_vh +
+                offs_n[:, None] * stride_vn +
+                offs_d[None, :]
+            )
 
-        q = q.reshape(BLOCK * QUERY_HEAD_GROUPS, BLOCK_HEADDIM)
+            if EVEN_N & EVEN_M:
+                if EVEN_HEADDIM:
+                    k = tl.load(k_ptrs)
+                else:
+                    k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
+            else:
+                if EVEN_HEADDIM:
+                    k = tl.load(
+                        k_ptrs,
+                        mask = offs_n[:, None] < seqlen_k,
+                        other = 0.0,
+                    )
+                else:
+                    k = tl.load(
+                        k_ptrs,
+                        mask = (offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
+                        other = 0.0,
+                    )
 
-        qk += tl.dot(q, tl.trans(k))
+            qk = tl.zeros([BLOCK * QUERY_HEAD_GROUPS, BLOCK], dtype=tl.float32)
 
-        qk = qk.reshape(BLOCK, QUERY_HEAD_GROUPS, BLOCK)
+            qk += tl.dot(q, tl.trans(k))
 
-        if not EVEN_N:
-            within_range_mask = offs_n[None, :] < seqlen_k
+            qk = qk.reshape(BLOCK, QUERY_HEAD_GROUPS, BLOCK)
 
-            if SLIDING:
-                within_range_mask &= offs_n[None, :] >= 0.
+            if not EVEN_N:
+                within_range_mask = offs_n[None, :] < seqlen_k
 
-            qk += tl.where(within_range_mask, 0, float("-inf"))
+                if SLIDING:
+                    within_range_mask &= offs_n[None, :] >= 0.
 
-        qk = qk.reshape(BLOCK, QUERY_HEAD_GROUPS, BLOCK)
+                qk += tl.where(within_range_mask, 0, float("-inf"))
 
-        causal_mask = offs_m[:, None, None] >= offs_n[None, None, :]
+            qk = qk.reshape(BLOCK, QUERY_HEAD_GROUPS, BLOCK)
 
-        if SLIDING:
-            causal_mask &= (offs_n[None, None, :] - offs_m[:, None, None]) <= BLOCK
+            causal_mask = offs_m[:, None, None] >= offs_n[None, None, :]
 
-        qk += tl.where(causal_mask, 0, float("-inf"))
+            if SLIDING:
+                causal_mask &= (offs_n[None, None, :] - offs_m[:, None, None]) <= BLOCK
 
-        m_ij = tl.maximum(tl.max(qk, 2) * softmax_scale, lse_i)
-        p = tl.exp(qk * softmax_scale - m_ij[:, :, None])
+            qk += tl.where(causal_mask, 0, float("-inf"))
 
-        l_ij = tl.sum(p, 2)
+            m_ij = tl.maximum(tl.max(qk, 2) * softmax_scale, lse_i)
+            p = tl.exp(qk * softmax_scale - m_ij[:, :, None])
 
-        acc_o_scale = tl.exp(m_i - m_ij)
-        acc_o *= acc_o_scale[:, :, None]
+            l_ij = tl.sum(p, 2)
 
-        if EVEN_N & EVEN_M:
-            if EVEN_HEADDIM:
-                v = tl.load(v_ptrs)
-            else:
-                v = tl.load(
-                    v_ptrs,
-                    mask = offs_d[None, :] < headdim,
-                    other = 0.0
-                )
-        else:
-            if EVEN_HEADDIM:
-                v = tl.load(
-                    v_ptrs,
-                    mask = offs_n[:, None] < seqlen_k,
-                    other = 0.0,
-                )
+            acc_o_scale = tl.exp(m_i - m_ij)
+            acc_o *= acc_o_scale[:, :, None]
+
+            if EVEN_N & EVEN_M:
+                if EVEN_HEADDIM:
+                    v = tl.load(v_ptrs)
+                else:
+                    v = tl.load(
+                        v_ptrs,
+                        mask = offs_d[None, :] < headdim,
+                        other = 0.0
+                    )
             else:
-                v = tl.load(
-                    v_ptrs,
-                    mask = (offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
-                    other = 0.0,
-                )
+                if EVEN_HEADDIM:
+                    v = tl.load(
+                        v_ptrs,
+                        mask = offs_n[:, None] < seqlen_k,
+                        other = 0.0,
+                    )
+                else:
+                    v = tl.load(
+                        v_ptrs,
+                        mask = (offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
+                        other = 0.0,
+                    )
 
-        p = p.reshape(BLOCK * QUERY_HEAD_GROUPS, BLOCK).to(v.dtype)
+            p = p.reshape(BLOCK * QUERY_HEAD_GROUPS, BLOCK).to(v.dtype)
 
-        causal_o = tl.dot(p, v)
+            causal_o = tl.dot(p, v)
 
-        acc_o += causal_o.reshape(BLOCK, QUERY_HEAD_GROUPS, BLOCK_HEADDIM)
+            acc_o += causal_o.reshape(BLOCK, QUERY_HEAD_GROUPS, BLOCK_HEADDIM)
 
-        # -- update statistics
+            # -- update statistics
 
-        m_i = m_ij
-        l_i_new = tl.exp(lse_i - m_ij) + l_ij
-        lse_i = m_ij + tl.log(l_i_new)
+            m_i = m_ij
+            l_i_new = tl.exp(lse_i - m_ij) + l_ij
+            lse_i = m_ij + tl.log(l_i_new)
 
     # # take care of the selected kv blocks
 
@@ -1029,6 +1038,7 @@ def backward_kernel_one_col_block_causal(
     BLOCK: tl.constexpr,
     QUERY_HEAD_GROUPS: tl.constexpr,
     QUERY_EXPAND_DIM: tl.constexpr,
+    SLIDING: tl.constexpr
 ):
     # We need to make sure begin_m is a multiple of BLOCK_M (not BLOCK_N)
 
@@ -1143,11 +1153,16 @@ def backward_kernel_one_col_block_causal(
 
     qk = qk.reshape(QUERY_HEAD_GROUPS, BLOCK, BLOCK)
 
+    mask = offs_m[:, None] >= offs_n[None, :]
+
     # Trying to combine the two masks seem to make the result wrong
     if not EVEN_N:  # Need to mask out otherwise the softmax is wrong
-        qk = tl.where(offs_n[None, :] < seqlen_k, qk, float("-inf"))
+        mask &= offs_n[None, :] < seqlen_k
+
+    if SLIDING:
+        mask &= (offs_n[None, :] - offs_m[:, None]) < BLOCK
 
-    qk = tl.where(offs_m[:, None] >= (offs_n[None, :]), qk, float("-inf"))
+    qk = tl.where(mask, qk, float("-inf"))
 
     qk = qk.reshape(QUERY_HEAD_GROUPS * BLOCK, BLOCK)
 
@@ -1315,7 +1330,8 @@ def backward_kernel(
     QUERY_HEAD_GROUPS: tl.constexpr,
     QUERY_EXPAND_DIM: tl.constexpr,
     RETURN_SEL_GRADS: tl.constexpr,
-    INCLUDE_BLOCK_CAUSAL: tl.constexpr
+    INCLUDE_BLOCK_CAUSAL: tl.constexpr,
+    SLIDING: tl.constexpr,
 ):
     off_hb = tl.program_id(1)
     off_b = off_hb // kv_heads
@@ -1393,6 +1409,7 @@ def backward_kernel(
                 BLOCK = BLOCK,
                 QUERY_HEAD_GROUPS = QUERY_HEAD_GROUPS,
                 QUERY_EXPAND_DIM = QUERY_EXPAND_DIM,
+                SLIDING = SLIDING
             )
     else:
         for start_n in range(0, num_block_n):
@@ -1448,7 +1465,8 @@ def native_sparse_attn_backward(
     dq, dk, dv,
     block_size = 128,
     include_block_causal = True,
-    return_sel_grads = False
+    return_sel_grads = False,
+    sliding = False
 ):
     device = do.device
 
@@ -1563,6 +1581,7 @@ def native_sparse_attn_backward(
         EVEN_HEADDIM = BLOCK_HEADDIM == dim,
         RETURN_SEL_GRADS = return_sel_grads,
         INCLUDE_BLOCK_CAUSAL = include_block_causal,
+        SLIDING = sliding
         # BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
         # num_warps=num_warps,
         # num_stages=1,
@@ -1600,7 +1619,7 @@ def forward(
             selected_block_indices,
             fmask,
             block_size = block_size,
-            include_block_causal = include_block_causal
+            include_block_causal = include_block_causal,
         )
 
         ctx.save_for_backward(fq, fk, fv, selected_block_indices, fmask, out, lse)