
Commit af4317a (1 parent: 66ef3e8)

Implement MQA and GQA support and add tests for it.

File tree: 9 files changed, +228 -170 lines
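For orientation (my summary, not text from the commit): with multi-query (MQA) and grouped-query attention (GQA), the query tensor keeps hq heads while K and V carry only hk heads, hq a multiple of hk, and each group of hq // hk query heads shares one K/V head. A toy shape sketch in JAX (names and sizes are illustrative):

# Toy GQA shapes: hq = 8 query heads share hk = 2 key/value heads (group size 4).
# MQA is the special case hk == 1.
import jax.numpy as jnp

n, lq, lk, hq, hk, d = 2, 128, 128, 8, 2, 64
q = jnp.zeros([n, lq, hq, d], jnp.float16)   # queries keep the full head count
k = jnp.zeros([n, lk, hk, d], jnp.float16)   # keys/values carry only hk heads
v = jnp.zeros([n, lk, hk, d], jnp.float16)
assert hq % hk == 0                          # query head i reads kv head i // (hq // hk)
# Forward: o has q's layout [n, lq, hq, d]; backward: dq is [n, lq, hq, d],
# while dk/dv come back at k/v's layout [n, lk, hk, d].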

csrc/flash_attn/mha_bwd.cpp

Lines changed: 19 additions & 13 deletions

@@ -2,6 +2,7 @@
 #include <cutlass/numeric_types.h>
 #include <cuda_runtime_api.h>
 #include <pybind11/pybind11.h>
+#include <cute/layout.hpp>

 #include "flash.h"
 #include "exception.h"
@@ -65,18 +66,24 @@ void set_params_dgrad(Flash_bwd_params &params,
     params.dq_ptr = dq_ptr;
     params.dk_ptr = dk_ptr;
     params.dv_ptr = dv_ptr;
-    params.dq_row_stride = params.q_row_stride;
-    params.dk_row_stride = params.k_row_stride;
-    params.dv_row_stride = params.v_row_stride;
-    params.dq_head_stride = params.q_head_stride;
-    params.dk_head_stride = params.k_head_stride;
-    params.dv_head_stride = params.v_head_stride;
+
+    // dk&dv is expanded to the same h as dq for MQA, we sum it later
+    auto dq = cute::compact_row_major(cute::make_shape(b, seqlen_q, h, d));
+    auto dk = cute::compact_row_major(cute::make_shape(b, seqlen_k, h, d));
+    auto dv = cute::compact_row_major(cute::make_shape(b, seqlen_k, h, d));
+
+    params.dq_row_stride = cute::get<1>(dq);
+    params.dk_row_stride = cute::get<1>(dk);
+    params.dv_row_stride = cute::get<1>(dv);
+    params.dq_head_stride = cute::get<2>(dq);
+    params.dk_head_stride = cute::get<2>(dk);
+    params.dv_head_stride = cute::get<2>(dv);

     if (cu_seqlens_q_d == nullptr) {
         params.do_batch_stride = params.o_batch_stride;
-        params.dq_batch_stride = params.q_batch_stride;
-        params.dk_batch_stride = params.k_batch_stride;
-        params.dv_batch_stride = params.v_batch_stride;
+        params.dq_batch_stride = cute::get<0>(dq);
+        params.dk_batch_stride = cute::get<0>(dk);
+        params.dv_batch_stride = cute::get<0>(dv);
     }

     params.dq_accum_ptr = dq_accum_d;
@@ -273,9 +280,8 @@ mha_bwd(cudaStream_t stream, void **buffers, const char* opaque, size_t opaque_l
     }


-    // Not sure what this is about. It needs extra scratch space for dk and dv when hk > h?
-    // Maybe because it's partitioning by n and h.
-    // disabled for now and figure out how to handle it later
+    // For MQA, dk and dv are expanded to the same n_heads as dq (handled in xla).
+    // After returning the result, it gets reduced to the original size by summing, so we don't need to do anything here.
     void* dk_expanded = dk;
     void* dv_expanded = dv;
     // at::Tensor dk_expanded, dv_expanded;
@@ -376,7 +382,7 @@ mha_bwd(cudaStream_t stream, void **buffers, const char* opaque, size_t opaque_l

     // For MQA/GQA we need to sum dK and dV across the groups
     if (num_heads_k != num_heads) {
-        CHECK(false, "don't handle MQA yet");
+        // CHECK(false, "don't handle MQA yet");
         // at::sum_out(dk, at::reshape(dk_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size}), {3});
         // at::sum_out(dv, at::reshape(dv_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size}), {3});
     }
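The stride change above configures the kernel for dk/dv buffers that are expanded to h (the number of query heads) rather than hk, so the strides can no longer be copied from k/v. As I read it, cute::compact_row_major(make_shape(b, s, h, d)) just yields the strides of a contiguous row-major tensor of that shape; a small NumPy sketch of the same computation (illustrative only, not the CUTLASS code):

def compact_row_major_strides(shape):
    # Strides, in elements, of a contiguous row-major tensor of `shape`
    # (rightmost dimension fastest), mirroring what cute::compact_row_major returns.
    strides = [1] * len(shape)
    for i in reversed(range(len(shape) - 1)):
        strides[i] = strides[i + 1] * shape[i + 1]
    return tuple(strides)

b, seqlen_k, h, d = 2, 128, 8, 64   # toy sizes; h counts *query* heads here
batch, row, head, elem = compact_row_major_strides((b, seqlen_k, h, d))
assert (row, head, elem) == (h * d, d, 1)   # what get<1> and get<2> pick out in the patch
assert batch == seqlen_k * h * d            # get<0>, used for the dk/dv batch stride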

src/flash_attn_jax/flash_hlo.py

Lines changed: 74 additions & 41 deletions

@@ -10,7 +10,6 @@
 from jax.interpreters import xla
 from jax.interpreters.mlir import ir
 from jax.lib import xla_client
-from jaxlib.hlo_helpers import custom_call
 from jax.experimental.custom_partitioning import custom_partitioning

 from jax.sharding import PartitionSpec as P
@@ -19,6 +18,7 @@
 from jax.sharding import PositionalSharding

 from einops import rearrange
+import einops
 import math

 import flash_attn_jax_lib.flash_api as flash_api
@@ -33,6 +33,10 @@
 _flash_mha_bwd_hlo_p.multiple_results = True
 _flash_mha_bwd_hlo_p.def_impl(partial(xla.apply_primitive, _flash_mha_bwd_hlo_p))

+_custom_call_p = core.Primitive("custom_call")
+_custom_call_p.multiple_results = True
+_custom_call_p.def_impl(partial(xla.apply_primitive, _custom_call_p))
+
 # ==== Primitive wrapper ====

 def _flash_mha_fwd_hlo(q, k, v, softmax_scale, is_causal, window_size):
@@ -43,6 +47,9 @@ def _flash_mha_bwd_hlo(dout, q, k, v, out, lse, softmax_scale, is_causal, window
     dq, dk, dv = _flash_mha_bwd_hlo_p.bind(dout, q, k, v, out, lse, softmax_scale=softmax_scale, is_causal=is_causal, window_size=window_size)
     return dq, dk, dv

+def custom_call(*args, call_target_name, result_types, backend_config, operand_layouts, result_layouts):
+    return _custom_call_p.bind(*args, call_target_name=call_target_name, result_types=result_types, backend_config=backend_config, operand_layouts=operand_layouts, result_layouts=result_layouts)
+
 # ==== HLO lowerings ====

 # Register functions defined in gpu_ops as custom call target for GPUs
@@ -112,7 +119,7 @@ def _flash_mha_fwd_hlo_lowering(ctx, q, k, v, softmax_scale=None, is_causal=Fals

         out_types = [ir.RankedTensorType.get(o_shape, element_type), lse_type]

-        (o, lse) = custom_call(
+        (o, lse) = mlir.custom_call(
             b"flash_mha_fwd",
             result_types=out_types,
             operands=[q_padded, k_padded, v_padded],
@@ -125,7 +132,7 @@ def _flash_mha_fwd_hlo_lowering(ctx, q, k, v, softmax_scale=None, is_causal=Fals
         return (o,lse)
     else:
         out_types = [ir.RankedTensorType.get([n, l, h, d], element_type), lse_type]
-        out = custom_call(
+        out = mlir.custom_call(
            b"flash_mha_fwd",
            result_types=out_types,
            operands=[q, k, v],
@@ -155,6 +162,7 @@ def _flash_mha_bwd_hlo_lowering(ctx, dout, q, k, v, out, lse, softmax_scale=None
     assert q_type == v_type
     assert q_type == out_type
     assert type(lse_type) in [ir.F32Type]
+    dtype = q_type

     dout_shape = ir.RankedTensorType(dout.type).shape
     q_shape = ir.RankedTensorType(q.type).shape
@@ -184,49 +192,45 @@
         flash_api.BF16 if type(q_type) == ir.BF16Type else flash_api.FP16,
         0)

-    if d % 8 != 0:
-        # We need padding. It's better to let xla's allocator handle it here than directly call cudaMalloc.
-        dpad = 8 - d%8
-
-        z = np.array(0.0, dtype=ir_type_to_dtype(q_type))
-        z = mlir.ir_constant(z)
-        q_padded = mlir.hlo.PadOp(q,z,[0,0,0,0],[0,0,0,dpad],[0,0,0,0]).result
-        k_padded = mlir.hlo.PadOp(k,z,[0,0,0,0],[0,0,0,dpad],[0,0,0,0]).result
-        v_padded = mlir.hlo.PadOp(v,z,[0,0,0,0],[0,0,0,dpad],[0,0,0,0]).result
-        out_padded = mlir.hlo.PadOp(out,z,[0,0,0,0],[0,0,0,dpad],[0,0,0,0]).result
-        dout_padded = mlir.hlo.PadOp(dout,z,[0,0,0,0],[0,0,0,dpad],[0,0,0,0]).result
-
-        # Outputs are the same shape as the q,k,v (including padding)
-        out_types = [q_padded.type, k_padded.type, v_padded.type]
+    def fwd(dout, q, k, v, out, lse):
+        dpad = (8 - d%8) % 8
+        if dpad > 0:
+            # We need padding. It's better to let xla's allocator handle it here than directly call cudaMalloc.
+            q = jnp.pad(q, ((0,0),(0,0),(0,0),(0,dpad)), 'constant')
+            k = jnp.pad(k, ((0,0),(0,0),(0,0),(0,dpad)), 'constant')
+            v = jnp.pad(v, ((0,0),(0,0),(0,0),(0,dpad)), 'constant')
+            out = jnp.pad(out, ((0,0),(0,0),(0,0),(0,dpad)), 'constant')
+            dout = jnp.pad(dout, ((0,0),(0,0),(0,0),(0,dpad)), 'constant')
+
+        # For MQA/GQA, hq != hk, but we pass a hq sized output tensor to the kernel and sum over it afterwards to reduce the size.
+        out_types = [ir.RankedTensorType.get([n, lq, hq, d+dpad], dtype),
+                     ir.RankedTensorType.get([n, lk, hq, d+dpad], dtype),
+                     ir.RankedTensorType.get([n, lk, hq, d+dpad], dtype)]
+        out_layouts = default_layouts([n, lq, hq, d+dpad], [n, lk, hq, d+dpad], [n, lk, hq, d+dpad])

         dq, dk, dv = custom_call(
-            b"flash_mha_bwd",
-            result_types=out_types,
-            operands=[dout_padded, q_padded, k_padded, v_padded, out_padded, lse],
+            dout, q, k, v, out, lse,
+            call_target_name=b"flash_mha_bwd",
+            operand_layouts=default_layouts(dout.shape, q.shape, k.shape, v.shape, out.shape, lse.shape),
             backend_config=opaque,
-            operand_layouts=value_layouts(dout_padded, q_padded, k_padded, v_padded, out_padded, lse),
-            result_layouts=value_layouts(q_padded, k_padded, v_padded), # dq, dk, dv
-        ).results
-
-        dq = mlir.hlo.SliceOp(dq, [0,0,0,0], tuple(q_shape), [1,1,1,1]).result
-        dk = mlir.hlo.SliceOp(dk, [0,0,0,0], tuple(k_shape), [1,1,1,1]).result
-        dv = mlir.hlo.SliceOp(dv, [0,0,0,0], tuple(v_shape), [1,1,1,1]).result
+            result_types=out_types,
+            result_layouts=out_layouts,
+        )
+
+        if hq != hk:
+            assert hq > hk and hq % hk == 0
+            m = hq // hk
+            dk = einops.reduce(dk, 'n l (h m) d -> n l h d', reduction='sum', h=hk)
+            dv = einops.reduce(dv, 'n l (h m) d -> n l h d', reduction='sum', h=hk)
+
+        if dpad > 0:
+            dq = dq[:,:,:,:d]
+            dk = dk[:,:,:,:d]
+            dv = dv[:,:,:,:d]

         return dq, dk, dv
-    else:
-        out_types = [ir.RankedTensorType.get(q_shape, q_type),
-                     ir.RankedTensorType.get(k_shape, k_type),
-                     ir.RankedTensorType.get(v_shape, v_type)]
-
-        out = custom_call(
-            b"flash_mha_bwd",
-            result_types=out_types,
-            operands=[dout, q, k, v, out, lse],
-            backend_config=opaque,
-            operand_layouts=default_layouts(dout_shape, q_shape, k_shape, v_shape, out_shape, lse_shape),
-            result_layouts=default_layouts(*[o.shape for o in out_types]),
-        ).results
-        return out
+
+    return mlir.lower_fun(fwd, multiple_results=True)(ctx, dout, q, k, v, out, lse)

 mlir.register_lowering(
     _flash_mha_bwd_hlo_p,
@@ -266,3 +270,32 @@ def _flash_mha_bwd_abstract(dout, q, k, v, out, lse, softmax_scale=None, is_caus
         ShapedArray(v.shape, v_dtype, named_shape=v.named_shape),
     )
 _flash_mha_bwd_hlo_p.def_abstract_eval(_flash_mha_bwd_abstract)
+
+# ==== Custom Call ====
+
+def _custom_call_abstract_eval(*args, call_target_name, result_types, backend_config, operand_layouts, result_layouts):
+    def convert(ty):
+        ty = ir.RankedTensorType(ty)
+        shape = tuple(ty.shape)
+        dtype = ir_type_to_dtype(ty.element_type)
+        return ShapedArray(shape, dtype)
+    out_types = [convert(o) for o in result_types]
+    return tuple(out_types)
+
+_custom_call_p.def_abstract_eval(_custom_call_abstract_eval)
+
+def _custom_call_hlo_lowering(ctx, *args, call_target_name, result_types, backend_config, operand_layouts, result_layouts):
+    out = mlir.custom_call(
+        call_target_name,
+        operands=args,
+        result_types=result_types,
+        backend_config=backend_config,
+        operand_layouts=operand_layouts,
+        result_layouts=result_layouts,
+    ).results
+    return out
+
+mlir.register_lowering(
+    _custom_call_p,
+    _custom_call_hlo_lowering
+)
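As I read this file, two things are going on: the backward lowering is now written as an ordinary jnp-level function (fwd) and turned into an MLIR lowering with mlir.lower_fun, with the kernel reached through a small custom_call primitive, so the padding, the MQA/GQA head-group sum, and the final slice stay in plain JAX; and the group sum is the same "reshape into (hk, group) and sum over the group axis" reduction that the commented-out at::sum_out call in mha_bwd.cpp describes. A standalone check of that reduction with toy sizes (illustrative, not part of the commit):

import jax.numpy as jnp
import einops

n, lk, hq, hk, d = 2, 16, 8, 2, 32
dk_expanded = jnp.arange(n * lk * hq * d, dtype=jnp.float32).reshape(n, lk, hq, d)

# The pattern used in fwd(): fold each group of hq // hk heads into its shared kv head.
dk = einops.reduce(dk_expanded, 'n l (h m) d -> n l h d', reduction='sum', h=hk)

# Same reduction spelled out with reshape + sum, as in the at::sum_out comment.
dk_manual = dk_expanded.reshape(n, lk, hk, hq // hk, d).sum(axis=3)
assert dk.shape == (n, lk, hk, d)
assert jnp.allclose(dk, dk_manual)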

src/flash_attn_jax/ring_attention.py

Lines changed: 3 additions & 2 deletions

@@ -79,14 +79,15 @@ def f(c, a):

 def ring_bwd(do,q,k,v,o,lse, axis_name, axis_size, mha_bwd, softmax_scale=None, is_causal=False):
     [n,l,h,d] = q.shape
+    [n,lk,hk,d] = k.shape
     if softmax_scale is None:
         softmax_scale = 1/math.sqrt(d)

     ix = jax.lax.axis_index(axis_name)

     dq = jnp.zeros([n,l,h,d], jnp.float32)
-    dk = jnp.zeros([n,l,h,d], jnp.float32)
-    dv = jnp.zeros([n,l,h,d], jnp.float32)
+    dk = jnp.zeros([n,lk,hk,d], jnp.float32)
+    dv = jnp.zeros([n,lk,hk,d], jnp.float32)

     # scan :: (c -> a -> (c, b)) -> c -> [a] -> (c, [b])
     def f(acc, _):
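Since the backward pass now returns dk/dv with K/V's head count, the scan accumulators in ring_bwd have to follow k's shape. A minimal shape sketch with toy sizes (names are illustrative; the real mha_bwd is not invoked here):

import jax.numpy as jnp

n, l, lk, h, hk, d = 2, 32, 32, 8, 2, 64
k = jnp.zeros([n, lk, hk, d], jnp.float16)

dk_acc = jnp.zeros([n, lk, hk, d], jnp.float32)   # follows k.shape, as in the patch
dk_step = jnp.zeros(k.shape, jnp.float32)         # one ring step's partial dk, already at hk heads
dk_acc = dk_acc + dk_step                         # a [n, l, h, d] accumulator would not match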

tests/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+0

tests/ref_mha.py

Lines changed: 82 additions & 0 deletions

@@ -0,0 +1,82 @@
+import glob
+import sys, os
+
+import jax
+import jax.numpy as jnp
+import numpy as np
+from functools import partial
+import einops
+
+def make_mask(R, C, is_causal, window_size):
+    mask = jnp.ones([R,C], dtype=jnp.int32)
+    if is_causal:
+        mask = jnp.tril(mask)
+    if window_size[0] != -1:
+        mask = jnp.triu(mask, -window_size[0])
+    if window_size[1] != -1:
+        mask = jnp.tril(mask, window_size[1])
+    return mask
+
+def ref_mha(q,k,v, is_causal=False, window_size=(-1,-1), softmax_scale=None):
+    return ref_fwd(q,k,v, is_causal=is_causal, window_size=window_size, softmax_scale=softmax_scale)[0]
+
+def ref_fwd(q,k,v, is_causal=False, window_size=(-1,-1), softmax_scale=None):
+    [n, l, h, d] = q.shape
+    [n, lk, hk, d] = k.shape
+    if softmax_scale is None:
+        softmax_scale = 1/np.sqrt(d)
+    mask = make_mask(l,lk,is_causal,window_size)
+    if h != hk:
+        assert h > hk and h % hk == 0
+        q = einops.rearrange(q, 'n L (h x) d -> n L h x d', h=hk)
+        S = jnp.einsum('nlhxd,nLhd->nhxlL',q,k) * softmax_scale
+        S = jnp.where(mask, S, float('-inf'))
+        lse = jax.nn.logsumexp(S, axis=-1) # n h x l
+        P = jnp.exp(S - lse[...,None]) # n h x l L
+        o = jnp.einsum('nhxlL,nLhd->nlhxd',P,v)
+        o = einops.rearrange(o, 'n l h x d -> n l (h x) d')
+        lse = einops.rearrange(lse, 'n h x l -> n (h x) l')
+        return o.astype(q.dtype), lse.astype(jnp.float32)
+    else:
+        att = jnp.einsum('nlhd,nLhd->nhlL',q,k)*softmax_scale
+        [_, _, l, L] = att.shape
+        mask = make_mask(l,L,is_causal,window_size)
+        att = jnp.where(mask, att, float('-inf'))
+        lse = jax.nn.logsumexp(att, axis=-1) # n h l
+        att = jnp.exp(att - lse[...,None])
+        o = jnp.einsum('nhlL,nLhd->nlhd',att,v)
+        return o.astype(q.dtype), lse.astype(jnp.float32)
+
+def ref_bwd(do,q,k,v,o,lse, is_causal=False, window_size=(-1,-1), softmax_scale=None):
+    [n, l, h, d] = q.shape
+    [n, lk, hk, d] = k.shape
+    if softmax_scale is None:
+        softmax_scale = 1/np.sqrt(d)
+    mask = make_mask(l,lk,is_causal,window_size)
+    if h != hk:
+        assert h > hk and h % hk == 0
+        q = einops.rearrange(q, 'n l (h x) d -> n l h x d', h=hk)
+        lse = einops.rearrange(lse, 'n (h x) l -> n h x l', h=hk)
+        S = jnp.einsum('nlhxd,nLhd->nhxlL',q,k) * softmax_scale
+        D = einops.reduce(do * o, 'n l (h x) d -> n h x l', reduction='sum', h=hk)
+        do = einops.rearrange(do, 'n l (h x) d -> n l h x d', h=hk)
+        S = jnp.where(mask, S, float('-inf'))
+        P = jnp.exp(S - lse[...,None]) # n h x l L
+        dP = jnp.einsum('nlhxd,nLhd->nhxlL',do,v)
+        dv = jnp.einsum('nlhxd,nhxlL->nLhd',do,P)
+        dS = P * (dP - D[...,None])
+        dq = softmax_scale*jnp.einsum('nLhd,nhxlL->nlhxd',k,dS)
+        dk = softmax_scale*jnp.einsum('nlhxd,nhxlL->nLhd',q,dS)
+        dq = einops.rearrange(dq, 'n l h x d -> n l (h x) d')
+        return dq.astype(q.dtype),dk.astype(q.dtype),dv.astype(q.dtype)
+    else:
+        S = jnp.einsum('nlhd,nLhd->nhlL',q,k)*softmax_scale
+        D = einops.reduce(do * o, 'n l h d -> n h l', reduction='sum')
+        S = jnp.where(mask, S, float('-inf'))
+        P = jnp.exp(S - lse[...,None]) # n h l L
+        dP = jnp.einsum('nlhd,nLhd->nhlL',do,v)
+        dv = jnp.einsum('nlhd,nhlL->nLhd',do,P)
+        dS = P * (dP - D[...,None])
+        dq = softmax_scale*jnp.einsum('nLhd,nhlL->nlhd',k,dS)
+        dk = softmax_scale*jnp.einsum('nlhd,nhlL->nLhd',q,dS)
+        return dq.astype(q.dtype),dk.astype(q.dtype),dv.astype(q.dtype)
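A usage sketch for the reference above (illustrative, not part of the commit; it assumes ref_mha.py is importable as tests.ref_mha with the repo root on PYTHONPATH): the GQA branch of ref_fwd should agree with the plain-MHA branch run on K/V explicitly repeated across each group of query heads.

import jax
import jax.numpy as jnp
import einops
from tests.ref_mha import ref_fwd

n, l, lk, h, hk, d = 1, 16, 16, 8, 2, 32
q = jax.random.normal(jax.random.PRNGKey(0), [n, l, h, d], jnp.float32)
k = jax.random.normal(jax.random.PRNGKey(1), [n, lk, hk, d], jnp.float32)
v = jax.random.normal(jax.random.PRNGKey(2), [n, lk, hk, d], jnp.float32)

o_gqa, lse_gqa = ref_fwd(q, k, v, is_causal=True)

# Repeat each kv head h // hk times; this ordering matches the '(h x)' grouping above.
k_full = einops.repeat(k, 'n L h d -> n L (h x) d', x=h // hk)
v_full = einops.repeat(v, 'n L h d -> n L (h x) d', x=h // hk)
o_mha, lse_mha = ref_fwd(q, k_full, v_full, is_causal=True)

assert jnp.allclose(o_gqa, o_mha, atol=1e-4)
assert jnp.allclose(lse_gqa, lse_mha, atol=1e-4)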
