
Commit af261a6

dk down, dq to go
1 parent 9007ac0 commit af261a6

2 files changed: +24 -1 lines changed

native_sparse_attention_pytorch/triton_native_sparse_attention.py

Lines changed: 22 additions & 0 deletions
@@ -773,11 +773,33 @@ def backward_kernel_one_col_block(
 
         p = tl.exp(qk * softmax_scale - lse_i[:, None])
 
+        # take care of block dv
+
         block_dv = p.to(do.dtype)[:, :, None] * do[:, None, :]
         block_dv = tl.where(block_masks[:, None, None], block_dv, 0.)
 
         tl.atomic_add(block_dv_ptrs, block_dv, sem = 'relaxed')
 
+        # get dp
+
+        do_expanded = tl.expand_dims(do, 1)
+        do_expanded = tl.broadcast_to(do_expanded, (BLOCK, 16, BLOCK_HEADDIM))
+        block_v = tl.permute(block_v, (0, 2, 1))
+
+        dp = tl.dot(do_expanded, block_v)
+        dp = tl.sum(dp, 1) / 16.
+
+        # ds
+
+        ds = (p * (dp - Di[:, None]) * softmax_scale)
+        ds = ds.to(q.dtype)
+
+        # block dk
+
+        block_dk = ds[:, :, None] * q[:, None, :]
+
+        tl.atomic_add(block_dk_ptrs, block_dk, sem = 'relaxed')
+
         # # increment pointers
         # dq_ptrs += BLOCK * stride_dqm
         # q_ptrs += BLOCK * stride_qm
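For orientation, the quantities these added lines accumulate follow the standard attention backward: dV = Pᵀ·dO, dP = dO·Vᵀ, dS = P ∘ (dP − D)·scale, dK = dSᵀ·Q, with dQ = dS·K still missing per the commit title. The expand_dims / broadcast_to / sum-divide-by-16 dance around tl.dot appears to be there because tl.dot needs every matmul dimension to be at least 16, so the per-row dot product between dO and the selected value block is computed as a 16-way replicated matmul and then averaged back down. Below is a minimal dense-attention sketch of the same math in plain PyTorch; the helper name, shapes, and dense layout are illustrative assumptions, not the repo's API, and the real kernel applies these formulas per selected key block and folds them in with tl.atomic_add.

import torch

# hypothetical dense reference; q, do: (n, d), k, v: (m, d)
def attention_backward_reference(q, k, v, do, softmax_scale):
    s = (q @ k.transpose(-1, -2)) * softmax_scale
    p = s.softmax(dim = -1)                        # what the kernel rebuilds as exp(qk * scale - lse)
    o = p @ v

    dv = p.transpose(-1, -2) @ do                  # block_dv: P^T @ dO
    dp = do @ v.transpose(-1, -2)                  # dp: dO @ V^T (the broadcast + tl.dot trick)
    Di = (do * o).sum(dim = -1, keepdim = True)    # the Di rowsum term
    ds = p * (dp - Di) * softmax_scale             # ds
    dk = ds.transpose(-1, -2) @ q                  # block_dk: dS^T @ Q
    dq = ds @ k                                    # dq, still "to go" per the commit message
    return dq, dk, dv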

test_triton_nsa.py

Lines changed: 2 additions & 1 deletion
@@ -105,5 +105,6 @@ def regular_attend(
 assert torch.allclose(out, nsa_out, atol = 1e-2)
 
 assert torch.allclose(nv.grad, rv.grad, atol = 1e-2)
-assert torch.allclose(nq.grad, rq.grad, atol = 1e-2)
+print((nk.grad - rk.grad).abs().amax())
 assert torch.allclose(nk.grad, rk.grad, atol = 1e-2)
+assert torch.allclose(nq.grad, rq.grad, atol = 1e-2)
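The test change moves the nq.grad assert behind the nk.grad one, presumably because dq is not wired up yet, and adds a print of the worst-case dk deviation for debugging. As a self-contained illustration of the same check pattern, here is a hedged sketch that validates the manual formulas above against torch.autograd on dense attention; all names, shapes, and tolerances are illustrative, not the test's actual setup.

import torch

# illustrative shapes only
n, m, d, scale = 32, 64, 16, 16 ** -0.5

q, k, v = (torch.randn(s, d, requires_grad = True) for s in (n, m, m))
do = torch.randn(n, d)

# autograd reference
o = ((q @ k.t()) * scale).softmax(dim = -1) @ v
o.backward(do)

# manual formulas, matching what the kernel accumulates
with torch.no_grad():
    p = ((q @ k.t()) * scale).softmax(dim = -1)
    dp = do @ v.t()
    Di = (do * (p @ v)).sum(dim = -1, keepdim = True)
    ds = p * (dp - Di) * scale
    dk = ds.t() @ q
    dv = p.t() @ do

print((dk - k.grad).abs().amax())   # mirrors the debug print added to the test
assert torch.allclose(dv, v.grad, atol = 1e-4)
assert torch.allclose(dk, k.grad, atol = 1e-4)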
