
Commit ec6b435

[Kernels] Force mxfp4->bf16 conversion to use mul.bf16x2 for scaling (#8967)
LLVM doesn't auto-vectorize this very well and ends up with a mix of vector and scalar muls. I think the cost heuristic gets tripped up by the scale broadcasting, which requires unpacking and duplicating the scales; for that we generate PTX like

```
mov.b32 {%rs0, %rs1}, %packed_scales
mov.b32 %r1, {%rs0, %rs0}
mov.b32 %r2, {%rs1, %rs1}
```

However, ptxas can fuse this into the multiply, e.g.

```
HMUL2.BF16_V2 R90, R90, R100.H0_H0
HMUL2.BF16_V2 R91, R91, R100.H1_H1
```

where the movs have become the register modifier in the instruction. This gives a modest 1% speedup on non-persistent bf16xmxfp4 MoE.
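For reference, the fix routes the scale multiply through `tl.inline_asm_elementwise` so the packed bf16x2 multiply is emitted explicitly instead of being left to LLVM's vectorizer. A minimal sketch of that mechanism (illustrative only; the helper name is made up here, and the code actually added by this commit is in the diff further down):

```python
import triton
import triton.language as tl


@triton.jit
def _mul_bf16x2_sketch(a, b):
    # pack=2 maps two bf16 lanes onto each 32-bit "r" operand, so a single
    # mul.bf16x2 multiplies both lanes at once (mul.bf16x2 needs sm_90+).
    return tl.inline_asm_elementwise(
        asm="mul.bf16x2 $0, $1, $2;",
        constraints="=r,r,r",
        args=[a, b],
        dtype=tl.bfloat16,
        is_pure=True,
        pack=2,
    )
```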
1 parent 81526ff commit ec6b435

File tree

1 file changed: +17 -1 lines changed

  • python/triton_kernels/triton_kernels/tensor_details/layout_details


python/triton_kernels/triton_kernels/tensor_details/layout_details/hopper_value.py

Lines changed: 17 additions & 1 deletion
@@ -291,6 +291,22 @@ def _unpack_fp4_to_bf16_triton(x):
     return x


+@triton.jit
+def mul_bf16x2(a, b):
+    use_mul: tl.constexpr = cuda_capability_geq(9)
+    op_instr: tl.constexpr = "mul.bf16x2" if use_mul else "fma.rn.bf16x2"
+    op_suffix: tl.constexpr = "" if use_mul else ", z"
+
+    return tl.inline_asm_elementwise(
+        asm=f"{op_instr} $0, $1, $2{op_suffix};",
+        constraints="=r,r,r",
+        args=[a, b],
+        dtype=tl.bfloat16,
+        is_pure=True,
+        pack=2,
+    )
+
+
 @triton.jit
 def mxfp4_to_bf16_triton(x, scale, mx_axis: tl.constexpr):
     """
@@ -345,5 +361,5 @@ def mxfp4_to_bf16_triton(x, scale, mx_axis: tl.constexpr):
     scale = scale.reshape(x.shape)

     # Combine scale and x
-    x = x * scale
+    x = mul_bf16x2(x, scale)
     return x
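A standalone usage sketch, not part of the commit: a toy kernel that applies the same inline-asm multiply and checks it against a plain bf16 multiply. It assumes an sm_90+ GPU and CUDA-enabled PyTorch; the kernel and tensor names are made up for illustration.

```python
import torch
import triton
import triton.language as tl


@triton.jit
def _scaled_mul_kernel(x_ptr, s_ptr, out_ptr, n, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    x = tl.load(x_ptr + offs, mask=mask)
    s = tl.load(s_ptr + offs, mask=mask)
    # Same packed bf16x2 multiply the commit forces for the mxfp4->bf16 scaling.
    y = tl.inline_asm_elementwise(
        asm="mul.bf16x2 $0, $1, $2;",
        constraints="=r,r,r",
        args=[x, s],
        dtype=tl.bfloat16,
        is_pure=True,
        pack=2,
    )
    tl.store(out_ptr + offs, y, mask=mask)


x = torch.randn(1024, device="cuda", dtype=torch.bfloat16)
s = torch.rand(1024, device="cuda", dtype=torch.bfloat16)
out = torch.empty_like(x)
grid = (triton.cdiv(x.numel(), 256),)
_scaled_mul_kernel[grid](x, s, out, x.numel(), BLOCK=256)
torch.testing.assert_close(out, x * s)
```

Since the multiply is emitted via inline asm, the compiled kernel's PTX contains `mul.bf16x2` directly rather than the scalar/vector mix described in the commit message.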

0 commit comments