@@ -337,6 +337,23 @@ class BlockedToMFMA : public OpRewritePattern<tt::DotOp> {
       : OpRewritePattern(context, benefit), mfmaVersion(mfmaVersion),
         nonKDim(nonKDim), kPack(kPack) {}
 
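+  // Check whether this tt::DotOp is part of a chain of dots, i.e. whether
+  // another tt::DotOp appears in its backward or forward slice within the
+  // same region (e.g. the two back-to-back dots of a flash-attention kernel).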
+  bool isChainDot(tt::DotOp &dotOp) const {
+    auto filter = [&dotOp](Operation *op) {
+      return op->getParentRegion() == dotOp->getParentRegion();
+    };
+    ForwardSliceOptions fwdOpt;
+    fwdOpt.filter = filter;
+    BackwardSliceOptions bwdOpt;
+    bwdOpt.omitBlockArguments = true;
+    bwdOpt.filter = filter;
+    auto slices = getSlice(dotOp, bwdOpt, fwdOpt);
+    for (Operation *op : slices) {
+      if (isa<tt::DotOp>(op) && (op != dotOp))
+        return true;
+    }
+    return false;
+  }
+
   bool isSecondDot(tt::DotOp &dotOp) const {
     auto filter = [&dotOp](Operation *op) {
       return op->getParentRegion() == dotOp->getParentRegion();
@@ -391,12 +408,16 @@ class BlockedToMFMA : public OpRewritePattern<tt::DotOp> {
     auto warpsPerTile =
         warpsPerTileMFMA(dotOp, retShape, numWarps, {mDim, nDim});
 
-    // Always use transposed mfma layout. This enables larger vectorization
-    // for global store instructions
+    // Use transposed mfma layout to enable larger vectorization for global
+    // store instructions, except for fp8 matmul kernels due to a regression.
+    // TODO(lixun): investigate the regression and enable this feature again
+    auto aElemTy = mfmaInstr.getElementTypeA();
+    bool isFP8 = aElemTy.isFloat8E5M2FNUZ() || aElemTy.isFloat8E4M3FNUZ();
+    bool isTransposed = isChainDot(dotOp) || !isFP8;
     mfmaEnc = ttg::AMDMfmaEncodingAttr::get(
         oldRetType.getContext(),
         /*versionMajor*/ mfmaVersion, /*versionMinor*/ 0, warpsPerTile,
-        /*instrShape*/ mDim, nDim, /*isTransposed*/ true, CTALayout);
+        /*instrShape*/ mDim, nDim, isTransposed, CTALayout);
 
     Type mfmaAccType;
     if (oldRetType.getElementType().isIntOrIndex())