Skip to content

Commit 0a58d8e

Browse files
committed
Fix e2m1
1 parent cfabc03 commit 0a58d8e

File tree

4 files changed

+55
-35
lines changed

4 files changed

+55
-35
lines changed

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 41 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "triton/Dialect/Triton/IR/Utility.h"
55
#include "triton/Dialect/TritonGPU/IR/Attributes.h"
66
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
7+
#include "triton/Tools/Sys/GetEnv.hpp"
78

89
#define GET_OP_CLASSES
910
#include "triton/Dialect/TritonGPU/IR/Ops.cpp.inc"
@@ -109,6 +110,8 @@ LogicalResult UpcastMXFPOp::inferReturnTypes(
109110
auto xShape = xTy.getShape();
110111

111112
auto encoding = xTy.getEncoding();
113+
bool upcastMXFPUseDotOpEnc =
114+
mlir::triton::tools::getBoolEnv("TRITON_INTEL_UPCASTMXFP_DOTOP_ENCODING");
112115

113116
if (typeEncoded == ScaleDotElemType::E2M1) {
114117
RankedTensorType retTy;
@@ -118,34 +121,47 @@ LogicalResult UpcastMXFPOp::inferReturnTypes(
118121
newShape.back() *= 2;
119122
retTy = RankedTensorType::get(xShape, FloatType::getBF16(ctx));
120123
} else {
121-
auto oldEncoding = cast<DotOperandEncodingAttr>(encoding);
122-
123-
const int opIdx = oldEncoding.getOpIdx();
124-
const bool hasBatch = xShape.size() == 3;
125-
const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
126-
newShape[kIdx] *= 2;
127124
Type elemType = FloatType::getBF16(ctx);
128-
129-
// Note: For Intel the dot operands layout's kWidth parameter must match
130-
// the parent's DPAS layout opsPerChannel so we need to materialize a new
131-
// DPAS layout.
132125
Attribute newVEncoding;
133-
if (auto dpasEncoding =
134-
dyn_cast<intel::DpasEncodingAttr>(oldEncoding.getParent())) {
135-
auto newDpasEncoding = intel::DpasEncodingAttr::get(
136-
ctx, dpasEncoding.getRepeatCount(), dpasEncoding.getSystolicDepth(),
137-
dpasEncoding.getExecutionSize(),
138-
intel::DpasEncodingAttr::getOpsPerChannel(elemType),
139-
dpasEncoding.getWarpsPerCTA(), dpasEncoding.getRepCluster(),
140-
dpasEncoding.getSubGroupSize());
141-
newVEncoding = DotOperandEncodingAttr::get(
142-
ctx, opIdx, newDpasEncoding, newDpasEncoding.getOpsPerChannel());
126+
if (upcastMXFPUseDotOpEnc) {
127+
auto oldEncoding = cast<DotOperandEncodingAttr>(encoding);
128+
129+
const int opIdx = oldEncoding.getOpIdx();
130+
const bool hasBatch = xShape.size() == 3;
131+
const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
132+
newShape[kIdx] *= 2;
133+
134+
// Note: For Intel the dot operands layout's kWidth parameter must match
135+
// the parent's DPAS layout opsPerChannel so we need to materialize a
136+
// new DPAS layout.
137+
if (auto dpasEncoding =
138+
dyn_cast<intel::DpasEncodingAttr>(oldEncoding.getParent())) {
139+
auto newDpasEncoding = intel::DpasEncodingAttr::get(
140+
ctx, dpasEncoding.getRepeatCount(),
141+
dpasEncoding.getSystolicDepth(), dpasEncoding.getExecutionSize(),
142+
intel::DpasEncodingAttr::getOpsPerChannel(elemType),
143+
dpasEncoding.getWarpsPerCTA(), dpasEncoding.getRepCluster(),
144+
dpasEncoding.getSubGroupSize());
145+
newVEncoding = DotOperandEncodingAttr::get(
146+
ctx, opIdx, newDpasEncoding, newDpasEncoding.getOpsPerChannel());
147+
} else {
148+
// Figure out the K dimension for the input A/B, given that the return
149+
// type is the upcasted A/B type, so we need to update the proper dim size.
150+
newVEncoding = DotOperandEncodingAttr::get(
151+
ctx, oldEncoding.getOpIdx(), oldEncoding.getParent(),
152+
oldEncoding.getKWidth() * 2);
153+
}
143154
} else {
144-
// Figure out the K dimension for the input A/B, given that the return
145-
// type is upcasted A/B type so we need to update the proper dim size.
146-
newVEncoding = DotOperandEncodingAttr::get(ctx, oldEncoding.getOpIdx(),
147-
oldEncoding.getParent(),
148-
oldEncoding.getKWidth() * 2);
155+
auto oldEncoding = dyn_cast<BlockedEncodingAttr>(encoding);
156+
assert(oldEncoding &&
157+
"Expected a blocked encoding for UpcastMXFP op result.");
158+
newShape.back() *= 2;
159+
SmallVector<unsigned> sizePerThread = oldEncoding.getSizePerThread();
160+
sizePerThread.back() *= 2;
161+
newVEncoding = BlockedEncodingAttr::get(
162+
ctx, sizePerThread, oldEncoding.getThreadsPerWarp(),
163+
oldEncoding.getWarpsPerCTA(), oldEncoding.getCTAOrder(),
164+
oldEncoding.getCTALayout());
149165
}
150166
retTy = RankedTensorType::get(newShape, elemType, newVEncoding);
151167
}

python/test/unit/language/test_core.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3441,8 +3441,8 @@ def test_scaled_dot(M, N, K, col_a, col_b, rhs_scale, normal_type, mxfp_type, nu
34413441
if mma == 16 and K == 64:
34423442
pytest.skip(f"K == {K} too small for mfma {mma} in scaled_dot")
34433443
if is_xpu():
3444-
if "e2m1" in (normal_type, mxfp_type):
3445-
pytest.skip("scaled_dot e2m1 isn't supported on XPU")
3444+
if rhs_scale:
3445+
pytest.skip("scaled_dot with rhs_scale not supported on XPU")
34463446

34473447
@triton.jit
34483448
def dot_scale_kernel(a_base, stride_a0, stride_a1, a_scale, b_base, stride_b0, stride_b1, b_scale, out,

third_party/intel/lib/Analysis/DPAS.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include "mlir/IR/BuiltinTypes.h"
44
#include "triton/Dialect/Triton/IR/Dialect.h"
55
#include "llvm/Support/Casting.h"
6+
#include <iostream>
67

78
namespace mlir::triton::gpu::intel {
89

@@ -150,9 +151,10 @@ DPASAnalysis::DPASEngineType DPASAnalysis::getDPASType(Operation *op) {
150151
if ((aElemTy.isFloat8E4M3FN() || aElemTy.isFloat8E5M2()) &&
151152
bElemTy.isFloat4E2M1FN())
152153
return DPASEngineType::FP32_FP32_FP8_FP4;
153-
if (aElemTy.isFloat4E2M1FN() && bElemTy.isBF16())
154+
// 2 E2M1 values are packed into 1 int8
155+
if (aElemTy.isInteger(8) && bElemTy.isBF16())
154156
return DPASEngineType::FP32_FP32_FP4_BF16;
155-
if (aElemTy.isFloat4E2M1FN() &&
157+
if (aElemTy.isInteger(8) &&
156158
(bElemTy.isFloat8E4M3FN() || bElemTy.isFloat8E5M2()))
157159
return DPASEngineType::FP32_FP32_FP4_FP8;
158160
}

third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -252,8 +252,7 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
252252

253253
private:
254254
bool upcastMXFPUseDotOpEnc =
255-
mlir::triton::tools::getBoolEnv(
256-
"TRITON_INTEL_UPCASTMXFP_DOTOP_ENCODING") == 1;
255+
mlir::triton::tools::getBoolEnv("TRITON_INTEL_UPCASTMXFP_DOTOP_ENCODING");
257256

258257
struct OpDescriptor {
259258
TensorValue op;
@@ -294,11 +293,12 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
294293
assert(opDesc.scale && "Expecting valid operand & scale");
295294

296295
unsigned opsPerChannel = dpasEnc.getOpsPerChannel();
297-
if (opDesc.elemType == tt::ScaleDotElemType::E2M1)
298-
opsPerChannel *= 2;
299296

300297
MLIRContext *ctx = opDesc.op.getContext();
298+
unsigned rank = retType.getRank();
301299
if (upcastMXFPUseDotOpEnc) {
300+
if (opDesc.elemType == tt::ScaleDotElemType::E2M1)
301+
opsPerChannel *= 2;
302302
auto opEncoding = ttg::intel::DpasEncodingAttr::get(
303303
ctx, dpasEnc.getRepeatCount(), dpasEnc.getSystolicDepth(),
304304
dpasEnc.getExecutionSize(), opsPerChannel, dpasEnc.getWarpsPerCTA(),
@@ -313,7 +313,6 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
313313
unsigned instrShapeM = dpasEnc.getDPASInstShapeA()[1];
314314
SmallVector<unsigned, 2> threadsPerWarp{instrShapeM,
315315
warpSize / instrShapeM};
316-
unsigned rank = retType.getRank();
317316
int numWarps = ttg::TritonGPUDialect::getNumWarps(mod);
318317
SmallVector<unsigned, 2> warpsPerCTA(rank, 1);
319318
warpsPerCTA[0] = numWarps;
@@ -334,10 +333,13 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
334333
// https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
335334
// the scalingBlockSize should be 32 for E5M2, E4M3 and E2M1
336335
unsigned scalingBlockSize = 32;
336+
// 2 FP4E2M1 values are packed into 1 I8
337337
if (opDesc.elemType == tt::ScaleDotElemType::E2M1)
338338
scalingBlockSize = 16;
339+
SmallVector<unsigned, 2> sizePerThread(rank, 1);
340+
sizePerThread[rank - 1 - opIdx] = scalingBlockSize;
339341
auto newOpEncoding = ttg::BlockedEncodingAttr::get(
340-
ctx, {1, scalingBlockSize}, scaleEncoding.getThreadsPerWarp(),
342+
ctx, sizePerThread, scaleEncoding.getThreadsPerWarp(),
341343
scaleEncoding.getWarpsPerCTA(), scaleEncoding.getCTAOrder(),
342344
scaleEncoding.getCTALayout());
343345

0 commit comments

Comments
 (0)