Commit 7088c64

[MXFP] Fix packing for mxfp4 type (#5197)
When packing, element 0 should go in the lower bits; until this PR it was placed in the higher bits.
1 parent 1214ac7 commit 7088c64
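As context for the fix (a minimal standalone sketch, not code from this commit; `pack_mxfp4` and `unpack_mxfp4` are hypothetical helpers), the convention being enforced is: two e2m1 (mxfp4) codes share one uint8, with element 0 in the low nibble and element 1 in the high nibble.

```python
import numpy as np

def pack_mxfp4(codes: np.ndarray) -> np.ndarray:
    # Hypothetical helper: pack pairs of 4-bit e2m1 codes into uint8,
    # element 0 of each pair in the lower bits (the convention fixed here).
    codes = codes.astype(np.uint8) & 0xF
    lo = codes[..., 0::2]   # even-indexed elements -> bits 0-3
    hi = codes[..., 1::2]   # odd-indexed elements  -> bits 4-7
    return lo | (hi << 4)

def unpack_mxfp4(packed: np.ndarray) -> np.ndarray:
    # Inverse of pack_mxfp4: the low nibble comes out first.
    lo = packed & 0xF
    hi = packed >> 4
    return np.stack([lo, hi], axis=-1).reshape(*packed.shape[:-1], -1)

# Codes [0x1, 0x7] pack into the byte 0x71: element 0 sits in bits 0-3.
assert pack_mxfp4(np.array([0x1, 0x7]))[0] == 0x71
assert list(unpack_mxfp4(np.array([0x71], dtype=np.uint8))) == [0x1, 0x7]
```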

4 files changed, +29 -29 lines changed

4 files changed

+29
-29
lines changed

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 9 additions & 9 deletions
@@ -752,26 +752,26 @@ SmallVector<Value> convertMxfp4x2ToBf16x2(RewriterBase &rewriter, Location loc,
                                           ArrayRef<Value> values) {
   SmallVector<Value> results;
   for (auto v : values) {
-    auto em0 = and_(v, i8_val(0x70));
-    auto em1 = and_(v, i8_val(0x7));
-    Value v0 = or_(shl(zext(i16_ty, em0), i16_val(2)),
-                   shl(zext(i16_ty, and_(v, i8_val(0x80))), i16_val(8)));
-    Value v1 = or_(shl(zext(i16_ty, em1), i16_val(6)),
+    auto em0 = and_(v, i8_val(0x7));
+    auto em1 = and_(v, i8_val(0x70));
+    Value v0 = or_(shl(zext(i16_ty, em0), i16_val(6)),
                    shl(zext(i16_ty, and_(v, i8_val(0x8))), i16_val(12)));
+    Value v1 = or_(shl(zext(i16_ty, em1), i16_val(2)),
+                   shl(zext(i16_ty, and_(v, i8_val(0x80))), i16_val(8)));

     // Three cases:
     // 1) x is normal and non-zero: Correct bias
-    v0 = select(icmp_ne(and_(em0, i8_val(0x60)), i8_val(0)),
+    v0 = select(icmp_ne(and_(em0, i8_val(0x6)), i8_val(0)),
                 add(v0, i16_val((127 - 1) << 7)), v0);
-    v1 = select(icmp_ne(and_(em1, i8_val(0x6)), i8_val(0)),
+    v1 = select(icmp_ne(and_(em1, i8_val(0x60)), i8_val(0)),
                 add(v1, i16_val((127 - 1) << 7)), v1);

     // 2) x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in
     // bf16
-    v0 = bitcast(select(icmp_eq(em0, i8_val(0x10)),
+    v0 = bitcast(select(icmp_eq(em0, i8_val(0x1)),
                         or_(i16_val(16128), and_(v0, i16_val(0x8000))), v0),
                  bf16_ty);
-    v1 = bitcast(select(icmp_eq(em1, i8_val(0x1)),
+    v1 = bitcast(select(icmp_eq(em1, i8_val(0x10)),
                         or_(i16_val(16128), and_(v1, i16_val(0x8000))), v1),
                  bf16_ty);
     // 3) x is zero, nothing to do
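To sanity-check the new ordering, the same e2m1 → bf16 decode can be written out in plain Python for a single packed byte. This is a standalone sketch mirroring the logic above, not code from the repository; `decode_e2m1_pair` is a hypothetical helper.

```python
import struct

def bf16_bits_to_float(bits: int) -> float:
    # A bf16 pattern is the top 16 bits of the fp32 with the same value.
    return struct.unpack(">f", struct.pack(">I", bits << 16))[0]

def decode_e2m1_nibble(n: int) -> float:
    # One 4-bit e2m1 code: bit 3 = sign, bits 2-1 = exponent (bias 1), bit 0 = mantissa.
    em = n & 0x7
    bits = (em << 6) | ((n & 0x8) << 12)   # exponent/mantissa into bf16 bits 8-6, sign into bit 15
    if em & 0x6:                           # 1) normal: rebias the exponent (1 -> 127)
        bits += (127 - 1) << 7
    elif em == 0x1:                        # 2) subnormal 0bs001: map to +-0.5 (0x3F00 == 16128)
        bits = 16128 | (bits & 0x8000)
    # 3) zero: only the sign bit remains
    return bf16_bits_to_float(bits)

def decode_e2m1_pair(byte: int) -> tuple:
    # Element 0 comes from the low nibble, element 1 from the high nibble.
    return decode_e2m1_nibble(byte & 0xF), decode_e2m1_nibble(byte >> 4)

# 0xE2 packs 0x2 (= 1.0) in the low nibble and 0xE (= -4.0) in the high nibble.
assert decode_e2m1_pair(0xE2) == (1.0, -4.0)
```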

python/test/unit/language/test_core.py

Lines changed: 8 additions & 8 deletions
@@ -3469,17 +3469,17 @@ def mxfp_to_bf16_kernel(
         x_bf16 = x_f8.to(tl.bfloat16)
     else:
         # e2m1
-        em0 = x & 0x70
-        em1 = x & 0x7
-        x0 = (em0.to(tl.uint16) << 2) | ((x & 0x80).to(tl.uint16) << 8)
-        x1 = (em1.to(tl.uint16) << (2 + 4)) | ((x & 0x8).to(tl.uint16) << (8 + 4))
+        em0 = x & 0x7
+        em1 = x & 0x70
+        x0 = (em0.to(tl.uint16) << 2 + 4) | ((x & 0x8).to(tl.uint16) << 8 + 4)
+        x1 = (em1.to(tl.uint16) << 2) | ((x & 0x80).to(tl.uint16) << (8))
         # Three cases:
         # 1) x is normal and non-zero: Correct bias
-        x0 = tl.where((em0 & 0x60) != 0, x0 + ((127 - 1) << 7), x0)
-        x1 = tl.where((em1 & 0x6) != 0, x1 + ((127 - 1) << 7), x1)
+        x0 = tl.where((em0 & 0x6) != 0, x0 + ((127 - 1) << 7), x0)
+        x1 = tl.where((em1 & 0x60) != 0, x1 + ((127 - 1) << 7), x1)
         # 2) x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in bf16
-        x0 = tl.where(em0 == 0x10, 16128 | (x0 & 0x8000), x0)
-        x1 = tl.where(em1 == 0x1, 16128 | (x1 & 0x8000), x1)
+        x0 = tl.where(em0 == 0x1, 16128 | (x0 & 0x8000), x0)
+        x1 = tl.where(em1 == 0x10, 16128 | (x1 & 0x8000), x1)
         # 3) x is zero, do nothing
         x_bf16 = tl.interleave(x0, x1).to(tl.bfloat16, bitcast=True)
     # Multiplication preserves infs and NaNs in x_bf16

python/test/unit/language/test_pipeliner.py

Lines changed: 8 additions & 8 deletions
@@ -160,17 +160,17 @@ def mxfp_to_bf16_kernel(
         x_bf16 = x_f8.to(tl.bfloat16)
     else:
         # e2m1
-        em0 = x & 0x70
-        em1 = x & 0x7
-        x0 = (em0.to(tl.uint16) << 2) | ((x & 0x80).to(tl.uint16) << 8)
-        x1 = (em1.to(tl.uint16) << (2 + 4)) | ((x & 0x8).to(tl.uint16) << (8 + 4))
+        em0 = x & 0x7
+        em1 = x & 0x70
+        x0 = (em0.to(tl.uint16) << 2 + 4) | ((x & 0x8).to(tl.uint16) << 8 + 4)
+        x1 = (em1.to(tl.uint16) << (2)) | ((x & 0x80).to(tl.uint16) << (8))
         # Three cases:
         # 1) x is normal and non-zero: Correct bias
-        x0 = tl.where((em0 & 0x60) != 0, x0 + ((127 - 1) << 7), x0)
-        x1 = tl.where((em1 & 0x6) != 0, x1 + ((127 - 1) << 7), x1)
+        x0 = tl.where((em0 & 0x6) != 0, x0 + ((127 - 1) << 7), x0)
+        x1 = tl.where((em1 & 0x60) != 0, x1 + ((127 - 1) << 7), x1)
         # 2) x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in bf16
-        x0 = tl.where(em0 == 0x10, 16128 | (x0 & 0x8000), x0)
-        x1 = tl.where(em1 == 0x1, 16128 | (x1 & 0x8000), x1)
+        x0 = tl.where(em0 == 0x1, 16128 | (x0 & 0x8000), x0)
+        x1 = tl.where(em1 == 0x10, 16128 | (x1 & 0x8000), x1)
         # 3) x is zero, do nothing
         x_bf16 = tl.interleave(x0, x1).to(tl.bfloat16, bitcast=True)
     # Multiplication preserves infs and NaNs in x_bf16

python/triton/language/core.py

Lines changed: 4 additions & 4 deletions
@@ -1647,16 +1647,16 @@ def dot_scaled(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, acc=None,
     lhs and rhs use microscaling formats described here:
     https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
     :param lhs: The first tensor to be multiplied.
-    :type lhs: 2D tensor representing fp4 or fp8 elements packed into uint8 for fp4 inputs, or in uint8 or the corresponding fp8 type for fp8 inputs.
+    :type lhs: 2D tensor representing fp4, fp8 or bf16 elements. Fp4 elements are packed into uint8 inputs with the first element in lower bits. Fp8 are stored as uint8 or the corresponding fp8 type.
     :param lhs_scale: Scale factor for lhs tensor.
     :type lhs_scale: e8m0 type represented as an uint8 tensor.
-    :param lhs_format: format of the lhs tensor. Available formats: {:code:`e2m1`, :code:`e4m3`, :code:`e5m2`}.
+    :param lhs_format: format of the lhs tensor. Available formats: {:code:`e2m1`, :code:`e4m3`, :code:`e5m2`, :code:`bf16`}.
     :type lhs_format: str
     :param rhs: The second tensor to be multiplied.
-    :type rhs: 2D tensor representing fp8 or bf16 elements in uint8 or the corresponding fp8 type for fp8 inputs or bf16 for bf16 inputs.
+    :type rhs: 2D tensor representing fp4, fp8 or bf16 elements. Fp4 elements are packed into uint8 inputs with the first element in lower bits. Fp8 are stored as uint8 or the corresponding fp8 type.
     :param rhs_scale: Scale factor for rhs tensor.
     :type rhs_scale: e8m0 type represented as an uint8 tensor.
-    :param rhs_format: format of the rhs tensor. Available formats: {:code:`e4m3`, :code:`e5m2`, :code:`bf16`}.
+    :param rhs_format: format of the rhs tensor. Available formats: {:code:`e2m1`, :code:`e4m3`, :code:`e5m2`, :code:`bf16`}.
     :type rhs_format: str
     :param acc: The accumulator tensor. If not None, the result is added to this tensor.
     """
