 import triton
 import triton.language as tl
 from .base import Layout
+from triton_kernels.target_info import cuda_capability_geq


 def right_shift_unsigned(x, shift):
@@ -228,23 +229,25 @@ def _unshuffle_triton(x, mma_version: tl.constexpr):

 @triton.jit
 def _unpack_fp4_to_bf16_triton(x):
-    # For now we implement just H100 support (mul.bf16x2)
-    # A100 support is possible via fma
+    # Use fma on A100 as there is no mul.bf16x2.
+    use_mul: tl.constexpr = cuda_capability_geq(9)
+    op_instr: tl.constexpr = "mul.bf16x2" if use_mul else "fma.rn.bf16x2"
+    op_suffix: tl.constexpr = "" if use_mul else ", z"
     r0, r1 = tl.inline_asm_elementwise(
-        r"""
-        {
-            .reg .b32 b, c, d<7>, scale;
+        asm=f"""{{
+            .reg .b32 b, c, z, d<7>, scale;
             .reg .b32 bias;
+            mov.b32 z, 0;
             mov.b32 bias, 0x7e807e80; // 2 ** 126 == 2 ** (bias_bf16 - bias_fp2)
             // We add the missing bias to the scale directly
             and.b32 $0, $4, 0b10000001110000001000000111000000;
-            mul.bf16x2 $0, $0, bias;
+            {op_instr} $0, $0, bias{op_suffix};
             shl.b32 b, $4, 3;
             and.b32 $1, b, 0b10000001110000001000000111000000;
-            mul.bf16x2 $1, $1, bias;
+            {op_instr} $1, $1, bias{op_suffix};
             shl.b32 c, $4, 6;
             and.b32 $2, c, 0b10000001110000001000000111000000;
-            mul.bf16x2 $2, $2, bias;
+            {op_instr} $2, $2, bias{op_suffix};
             // Unpack last two elements
             shl.b32 d0, $4, 1;
             and.b32 d1, d0, 0b10000000000000001000000000000000;
@@ -254,9 +257,8 @@ def _unpack_fp4_to_bf16_triton(x):
             shr.b32 d5, $4, 7;
             and.b32 d6, d5, 0b00000000010000000000000001000000;
             or.b32 $3, d4, d6;
-            mul.bf16x2 $3, $3, bias;
-        }
-        """,
+            {op_instr} $3, $3, bias{op_suffix};
+        }}""",
         constraints="=r,=r,=r,=r,r",
         args=[x],
         dtype=(tl.bfloat16, tl.bfloat16),
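
For reference, here is a small standalone Python sketch (not part of the patch; the helper names `decode_e2m1_nibble` and `e2m1_reference` are made up for illustration) of the bit trick the PTX relies on: the fp4 sign/exponent/mantissa bits are masked into the bf16 sign bit, the low exponent bits, and the top mantissa bit, and the result is scaled by 0x7e80 (2 ** 126 as bf16) to fix up the exponent bias. The fma substitution is safe because with a zero addend `fma.rn.bf16x2` computes the same product as `mul.bf16x2`, and (to my understanding) it is available on sm_80 parts such as A100, whereas `mul.bf16x2` requires sm_90.

```python
import struct


def bf16_bits_to_float(bits16: int) -> float:
    # A bf16 pattern is the top 16 bits of an IEEE-754 float32.
    return struct.unpack("<f", struct.pack("<I", (bits16 & 0xFFFF) << 16))[0]


def decode_e2m1_nibble(nibble: int) -> float:
    # Place the fp4 fields where the PTX masks expect them:
    # sign -> bit 15, 2-bit exponent -> bits 8..7, mantissa -> bit 6.
    sign = (nibble >> 3) & 0x1
    exp = (nibble >> 1) & 0x3
    man = nibble & 0x1
    raw = (sign << 15) | (exp << 7) | (man << 6)
    # Reinterpreted as bf16 this is 2**(exp - 127) * 1.man (or a subnormal
    # when exp == 0). Multiplying by 2**126 == 2**(bias_bf16 - bias_fp4)
    # restores the e2m1 value. The kernel does this multiply with
    # mul.bf16x2, or with fma.rn.bf16x2 and a zero addend on pre-Hopper
    # targets, since x * bias + 0 == x * bias.
    return bf16_bits_to_float(raw) * 2.0 ** 126


def e2m1_reference(nibble: int) -> float:
    # Direct e2m1 decode: 1 sign bit, 2 exponent bits (bias 1), 1 mantissa bit.
    sign = -1.0 if nibble & 0x8 else 1.0
    exp = (nibble >> 1) & 0x3
    man = nibble & 0x1
    if exp == 0:
        return sign * man * 0.5                      # subnormal: 0.m * 2**0
    return sign * (1.0 + man * 0.5) * 2.0 ** (exp - 1)


for n in range(16):
    assert decode_e2m1_nibble(n) == e2m1_reference(n), n
print("bias trick matches direct e2m1 decode for all 16 nibbles")
```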