Skip to content

Commit 7f94609 — [AMD] Support splatted scale in MFMA (#7270)
Parent: dfbea72 (1 parent)

File tree

2 files changed: +4 additions, −4 deletions

python/test/unit/language/test_matmul.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -885,8 +885,6 @@ def test_mxfp8_mxfp4_matmul(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, B_TR
885885
pytest.skip("Pack along M/N is not enabled on AMD backend")
886886
if not is_hip_cdna4():
887887
pytest.skip("Scaled mxfp4 & mxfp8 matmul is only natively supported on CDNA4")
888-
if CONST_SCALE:
889-
pytest.skip("Constant scale is not supported in AMD backend for now")
890888
if (nonKDim == 16 and BLOCK_K < 128) or (nonKDim == 32 and BLOCK_K < 64):
891889
pytest.skip(f"CDNA4 does not support {BLOCK_K=} for scaled mfma {nonKDim=} variants")
892890
if (A_DATA_TYPE == 'float4' and not WITH_A_SCALE) or (B_DATA_TYPE == 'float4' and not WITH_B_SCALE):

third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/MFMA.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -558,8 +558,10 @@ struct ScaledDotOpMFMAConversionHelper : DotOpMFMAConversionHelper {
558558
}
559559

560560
bool existBothScales = aScale && bScale;
561-
bool isAScaleConstant = aScale && aScale.getDefiningOp<arith::ConstantOp>();
562-
bool isBScaleConstant = bScale && bScale.getDefiningOp<arith::ConstantOp>();
561+
bool isAScaleConstant = aScale && isa<arith::ConstantOp, triton::SplatOp>(
562+
aScale.getDefiningOp());
563+
bool isBScaleConstant = bScale && isa<arith::ConstantOp, triton::SplatOp>(
564+
bScale.getDefiningOp());
563565
Value d = op.getD();
564566
auto aTensorTy = cast<RankedTensorType>(a.getType());
565567
auto bTensorTy = cast<RankedTensorType>(b.getType());

Comments (0)