Commit 108e8a1

Fix mxfp8 mxfp4 matmul shape mismatch (#4060)
This PR fixes a shape mismatch caused by the float4 pack dimension differing after the `B_TRANS` and `PACK_B_ALONG_K` changes.
1 parent 1112516 commit 108e8a1
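
For intuition, here is a minimal Python sketch of the shape check the fix performs (the function name infer_pack_dim and the example shapes are illustrative assumptions, not backend code): an fp4 tensor stores two e2m1 values per byte, so whichever dimension it is packed along appears at half its logical size, and comparing the operand's non-K dimension against the dot result's shape reveals which dimension that was.

def infer_pack_dim(packed_shape, result_shape, op_idx, k_dim):
    """Pick the fp4 pack dimension by comparing shapes.

    Two e2m1 values share one byte, so the packed dim is half its
    logical size. If the operand's non-K dim no longer matches the
    dot result, the operand must be packed along that dim, not K.
    """
    rank = len(result_shape)
    pack_dim = k_dim
    if op_idx == 0:  # A: its M dim must match the result's M dim
        mismatch = result_shape[rank - 2] != packed_shape[rank - 2]
    else:            # B: its N dim must match the result's N dim
        mismatch = result_shape[rank - 1] != packed_shape[rank - 1]
    if mismatch:
        pack_dim = (pack_dim + 1) % 2  # flip to the other dimension
    return pack_dim

# B has logical shape (K=64, N=32); the dot result is (M=16, N=32).
print(infer_pack_dim((32, 32), (16, 32), op_idx=1, k_dim=0))  # 0: packed along K
print(infer_pack_dim((64, 16), (16, 32), op_idx=1, k_dim=0))  # 1: packed along N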

File tree

2 files changed: +9 -4 lines

python/test/unit/intel/test_mxfp_matmul.py

Lines changed: 0 additions & 2 deletions

@@ -111,8 +111,6 @@ def test_mxfp_matmul(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, B_TRANS, PA
         pytest.skip("Float4 for both A and B has [ZE]0x78000011 error")
     if not PACK_B_ALONG_K and B_DATA_TYPE != "float4":
         pytest.xfail("Pack along K can only be False for float4")
-    if not PACK_B_ALONG_K and B_DATA_TYPE == "float4":
-        pytest.skip("Pack along K fix depends on https://github.com/intel/intel-xpu-backend-for-triton/pull/4060")
 
     if BLOCK_N == 256 and BLOCK_K == 256:
         NUM_STAGES = 2

third_party/intel/lib/TritonIntelGPUTransforms/DecomposeScaledBlocked.cpp

Lines changed: 9 additions & 2 deletions

@@ -185,6 +185,7 @@ class DecomposeScaledBlocked : public OpRewritePattern<DotScaledOp> {
                                         DotScaledOp scaledDotOp, int opIdx,
                                         FloatType computeType) const {
     auto v = opIdx == 0 ? scaledDotOp.getA() : scaledDotOp.getB();
+    auto res = scaledDotOp.getD();
     auto scale = opIdx == 0 ? scaledDotOp.getAScale() : scaledDotOp.getBScale();
     auto isFp4 =
         ScaleDotElemType::E2M1 ==
@@ -199,8 +200,14 @@ class DecomposeScaledBlocked : public OpRewritePattern<DotScaledOp> {
 
     // 0) Upcast value to computeType (fp16/bf16)
     if (isFp4) {
-      // We always pack along the fastest moving dimension, kDim
-      v = rewriter.create<Fp4ToFpOp>(loc, v, computeType, kDim);
+      auto resShape = res.getType().getShape();
+      auto vShape = v.getType().getShape();
+      auto packDim = kDim;
+      if ((opIdx == 0 && resShape[rank - 2] != vShape[rank - 2]) ||
+          (opIdx == 1 && resShape[rank - 1] != vShape[rank - 1])) {
+        packDim = (packDim + 1) % 2;
+      }
+      v = rewriter.create<Fp4ToFpOp>(loc, v, computeType, packDim);
     } else {
       auto vType16 = v.getType().clone(computeType);
       v = cast<TypedValue<RankedTensorType>>(
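
As a complement, a tiny Python sketch (an illustrative helper, not Fp4ToFpOp itself) of how unpacking doubles the chosen dimension, which is why picking the wrong packDim previously produced a shape mismatch at the dot:

def unpacked_shape(packed_shape, pack_dim):
    """Shape after an Fp4ToFp-style upcast: each byte expands into
    two fp16/bf16 values, so the pack dimension doubles."""
    shape = list(packed_shape)
    shape[pack_dim] *= 2
    return tuple(shape)

# B stored as (64, 16) because it was packed along N (logical (64, 32)):
print(unpacked_shape((64, 16), pack_dim=0))  # (128, 16) -- wrong, K doubled
print(unpacked_shape((64, 16), pack_dim=1))  # (64, 32)  -- matches logical (K, N)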
