PR review round 3

Muzammiluddin-Syed-ECE · Muzammiluddin-Syed-ECE · commit 02f5d9800898 · 2025-04-30T14:35:55.000-05:00
Signed-off-by: Muzammiluddin Syed &lt;muzasyed@amd.com&gt;
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -688,10 +688,9 @@ def MFMAOutTypes : AnyTypeOf<[F64,
                               VectorOfLengthAndType<[4, 16, 32], [I32]>,
                               VectorOfLengthAndType<[4], [F64]>]>;
 // scaled_mfma
-def ScaledMFMAInTypes : AnyTypeOf<[VectorOfLengthAndType<[8], [F8E5M2FNUZ, F8E4M3FNUZ]>,
-                             VectorOfLengthAndType<[8, 32], [F8E5M2, F8E4M3FN]>,
-                             VectorOfLengthAndType<[32], [F6E2M3FN, F6E3M2FN, F4E2M1FN]>]>;
-def ScaledMFMAOutTypes : AnyTypeOf<[VectorOfLengthAndType<[4, 16, 32], [F32]>]>;
+def ScaledMFMAInTypes : AnyTypeOf<[VectorOfLengthAndType<[32], [F8E5M2, F8E4M3FN]>,
+                                   VectorOfLengthAndType<[32], [F6E2M3FN, F6E3M2FN, F4E2M1FN]>]>;
+def ScaledMFMAOutTypes : AnyTypeOf<[VectorOfLengthAndType<[4, 16], [F32]>]>;
 // wmma
 def WMMAInTypes : AnyTypeOf<[VectorOfLengthAndType<
                              [4, 8, 16],
@@ -847,8 +846,9 @@ def AMDGPU_ScaledMFMAOp :
                    ScaledMFMAOutTypes:$destC,
                    AnyTypeOf<[I8, FixedVectorOfLengthAndType<[4], [I8]>]>:$scalesA,
                    AnyTypeOf<[I8, FixedVectorOfLengthAndType<[4], [I8]>]>:$scalesB,
-                   I32Attr:$scalesIdxA,
-                   I32Attr:$scalesIdxB)>,
+                   ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<3>]>:$scalesIdxA,
+                   ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<3>]>:$scalesIdxB
+                   )>,
     Results<(outs ScaledMFMAOutTypes: $destD)> {
   let summary = "MLIR wrapper for CDNA scaled mfma instructions";
   let description = [{
@@ -879,8 +879,7 @@ def AMDGPU_ScaledMFMAOp :
   let assemblyFormat = [{
     `(` $scalesA `[` $scalesIdxA `]` `*` $sourceA `)` `*` `(` $scalesB `[` $scalesIdxB `]` `*` $sourceB `)` `+` $destC
     attr-dict
-    `:` type($sourceA) `,` type($scalesA) `,` type($sourceB) `,` type($scalesB) `,` type($destC)
+    `:` type($scalesA) `,` type($sourceA) `,` type($scalesB) `,` type($sourceB) `,` type($destC)
   }];
-  let hasVerifier = 1;
 }
 #endif // AMDGPU
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -529,6 +529,25 @@ static Value convertMFMAVectorOperand(ConversionPatternRewriter &rewriter,
   return input;
 }
 
+/// Converts the scaled MFMA operands, `scalesA` and `scalesB`, from MLIR AMDGPU
+/// dialect convention to ROCDL and LLVM AMDGPU intrinsics convention.
+///
+/// Specifically:
+/// 1. If `input` is a i8 value, zero extend it to i32
+/// 2. If `input` is a vector of length 4 and type i8, cast it to i32
+///
+/// Note that the type of `input` has already been LLVM type converted:
+/// therefore 8-bit and smaller floats are represented as their corresponding
+/// `iN` integers.
+static Value castScaledMFMAVectorOperand(ConversionPatternRewriter &rewriter,
+                                         Location loc, Value input) {
+  Type inputType = input.getType();
+  Type outputType = rewriter.getI32Type();
+  if (auto intType = dyn_cast<IntegerType>(inputType))
+    return rewriter.create<LLVM::ZExtOp>(loc, outputType, input);
+  return rewriter.create<LLVM::BitcastOp>(loc, outputType, input);
+}
+
 /// Push an input operand. If it is a float type, nothing to do. If it is
 /// an integer type, then we need to also push its signdness (1 for signed, 0
 /// for unsigned) and we need to pack the input 16xi8 vector into a 4xi32
@@ -827,20 +846,19 @@ mfmaOpToScaledIntrinsic(Type aType, Type bType, Type destType, uint32_t m,
 }
 
 static std::optional<std::tuple<StringRef, uint32_t, uint32_t>>
-mfmaOpToScaledIntrinsic(Operation *op, Chipset chipset) {
-  if (auto mfma = llvm::dyn_cast_or_null<MFMAOp>(op)) {
-    return mfmaOpToScaledIntrinsic(
-        mfma.getSourceA().getType(), mfma.getSourceB().getType(),
-        mfma.getDestC().getType(), mfma.getM(), mfma.getN(), mfma.getK(),
-        mfma.getBlocks(), chipset);
-  }
-  if (auto smfma = llvm::dyn_cast_or_null<ScaledMFMAOp>(op)) {
-    return mfmaOpToScaledIntrinsic(smfma.getSourceA().getType(),
-                                   smfma.getSourceB().getType(),
-                                   smfma.getDestC().getType(), smfma.getM(),
-                                   smfma.getN(), smfma.getK(), 1u, chipset);
-  }
-  return std::nullopt;
+mfmaOpToScaledIntrinsic(MFMAOp mfma, Chipset chipset) {
+  return mfmaOpToScaledIntrinsic(
+      mfma.getSourceA().getType(), mfma.getSourceB().getType(),
+      mfma.getDestC().getType(), mfma.getM(), mfma.getN(), mfma.getK(),
+      mfma.getBlocks(), chipset);
+}
+
+static std::optional<std::tuple<StringRef, uint32_t, uint32_t>>
+mfmaOpToScaledIntrinsic(ScaledMFMAOp smfma, Chipset chipset) {
+  return mfmaOpToScaledIntrinsic(smfma.getSourceA().getType(),
+                                 smfma.getSourceB().getType(),
+                                 smfma.getDestC().getType(), smfma.getM(),
+                                 smfma.getN(), smfma.getK(), 1u, chipset);
 }
 
 /// Return the `rocdl` intrinsic corresponding to a WMMA operation `wmma`
@@ -991,17 +1009,19 @@ struct ScaledMFMAOpLowering : public ConvertOpToLLVMPattern<ScaledMFMAOp> {
         {convertMFMAVectorOperand(rewriter, loc, adaptor.getSourceA()),
          convertMFMAVectorOperand(rewriter, loc, adaptor.getSourceB()),
          adaptor.getDestC()});
-    Value scalesIdxA = createI32Constant(rewriter, loc, adaptor.getScalesIdxA());
-    Value scalesIdxB = createI32Constant(rewriter, loc, adaptor.getScalesIdxB());
+    Value scalesIdxA =
+        createI32Constant(rewriter, loc, adaptor.getScalesIdxA());
+    Value scalesIdxB =
+        createI32Constant(rewriter, loc, adaptor.getScalesIdxB());
     loweredOp.addOperands(
         {createI32Constant(rewriter, loc, aTypeCode),
          createI32Constant(rewriter, loc, bTypeCode),
+         /*scales idx A=*/scalesIdxA,
          /*scales A*/
-         convertMFMAVectorOperand(rewriter, loc, adaptor.getScalesA()),
+         castScaledMFMAVectorOperand(rewriter, loc, adaptor.getScalesA()),
+         /*scales idx B=*/scalesIdxB,
          /*scales B*/
-         convertMFMAVectorOperand(rewriter, loc, adaptor.getScalesB()),
-         /*scales idx A=*/scalesIdxA,
-         /*scales idx B=*/scalesIdxB});
+         castScaledMFMAVectorOperand(rewriter, loc, adaptor.getScalesB())});
     Value lowered = rewriter.create(loweredOp)->getResult(0);
     rewriter.replaceOp(op, lowered);
     return success();
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -506,19 +506,6 @@ LogicalResult GatherToLDSOp::verify() {
   return success();
 }
 
-LogicalResult ScaledMFMAOp::verify() {
-  unsigned scalesIdxA = getScalesIdxA();
-  unsigned scalesIdxB = getScalesIdxB();
-
-  if (scalesIdxA > 3)
-    return emitOpError("scales idx A must be a value from 0 to 3 inclusive");
-
-  if (scalesIdxB > 3)
-    return emitOpError("scales idx B must be a value from 0 to 3 inclusive");
-
-  return success();
-}
-
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUEnums.cpp.inc"
 
 #define GET_ATTRDEF_CLASSES
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir b/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir
@@ -60,42 +60,43 @@ func.func @scaled_mfma_to_rocdl(%arg0 : vector<16xf32>,
   
   // CHECK: %[[c0:.+]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK: %[[c1:.+]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK: llvm.bitcast
+  // CHECK: %[[c2:.+]] = llvm.bitcast{{.*}} : vector<4xi8> to i32
+  // CHECK: %[[c3:.+]] = llvm.zext{{.*}} : i8 to i32
 
-  // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i8, i32, i32) -> vector<16xf32>
-  amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg2 ) * ( %arg8 [ 1 ] * %arg2 ) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<32xf8E4M3FN>, vector<4xi8>, vector<32xf8E4M3FN>, i8, vector<16xf32>
-  // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i8, i32, i32) -> vector<4xf32>
-  amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg2 ) * ( %arg8 [ 1 ] * %arg2 ) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<32xf8E4M3FN>, vector<4xi8>, vector<32xf8E4M3FN>, i8, vector<4xf32>
+  // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>
+  amdgpu.scaled_mfma(%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xi8>, vector<32xf8E4M3FN>, i8, vector<32xf8E4M3FN>, vector<16xf32>
+  // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
+  amdgpu.scaled_mfma(%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xi8>, vector<32xf8E4M3FN>, i8, vector<32xf8E4M3FN>, vector<4xf32>
   
   // CHECK: llvm.bitcast
   
-  // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i8, i32, i32) -> vector<16xf32>
-  amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg3 ) * ( %arg8 [ 1 ] * %arg3 ) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<32xf8E5M2>, vector<4xi8>, vector<32xf8E5M2>, i8, vector<16xf32>
-  // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i8, i32, i32) -> vector<4xf32>
-  amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg3 ) * ( %arg8 [ 1 ] * %arg3 ) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<32xf8E5M2>, vector<4xi8>, vector<32xf8E5M2>, i8, vector<4xf32>
+  // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>
+  amdgpu.scaled_mfma(%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xi8>, vector<32xf8E5M2>, i8, vector<32xf8E5M2>, vector<16xf32>
+  // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
+  amdgpu.scaled_mfma(%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xi8>, vector<32xf8E5M2>, i8, vector<32xf8E5M2>, vector<4xf32>
   
   // CHECK: llvm.bitcast
   
-  // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i8, i32, i32) -> vector<16xf32>
-  amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg4 ) * ( %arg8 [ 1 ] * %arg4 ) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<32xf6E2M3FN>, vector<4xi8>, vector<32xf6E2M3FN>, i8, vector<16xf32>
-  // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i8, i32, i32) -> vector<4xf32>
-  amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg4 ) * ( %arg8 [ 1 ] * %arg4 ) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<32xf6E2M3FN>, vector<4xi8>, vector<32xf6E2M3FN>, i8, vector<4xf32>
+  // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>
+  amdgpu.scaled_mfma(%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xi8>, vector<32xf6E2M3FN>, i8, vector<32xf6E2M3FN>, vector<16xf32>
+  // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
+  amdgpu.scaled_mfma(%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xi8>, vector<32xf6E2M3FN>, i8, vector<32xf6E2M3FN>, vector<4xf32>
   
   // CHECK: llvm.bitcast
-  // CHECK: %[[c3:.+]] = llvm.mlir.constant(3 : i32) : i32
+  // CHECK: llvm.mlir.constant(3 : i32) : i32
 
-  // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i8, i32, i32) -> vector<16xf32>
-  amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg5 ) * ( %arg8 [ 1 ] * %arg5 ) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<32xf6E3M2FN>, vector<4xi8>, vector<32xf6E3M2FN>, i8, vector<16xf32>
-  // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i8, i32, i32) -> vector<4xf32>
-  amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg5 ) * ( %arg8 [ 1 ] * %arg5 ) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<32xf6E3M2FN>, vector<4xi8>, vector<32xf6E3M2FN>, i8, vector<4xf32>
+  // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>
+  amdgpu.scaled_mfma(%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xi8>, vector<32xf6E3M2FN>, i8, vector<32xf6E3M2FN>, vector<16xf32>
+  // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
+  amdgpu.scaled_mfma(%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xi8>, vector<32xf6E3M2FN>, i8, vector<32xf6E3M2FN>, vector<4xf32>
   
   // CHECK: llvm.bitcast
-  // CHECK: %[[c4:.+]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: llvm.mlir.constant(4 : i32) : i32
   
-  // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<4xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i8, i32, i32) -> vector<16xf32>
-  amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg6 ) * ( %arg8 [ 1 ] * %arg6 ) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<32xf4E2M1FN>, vector<4xi8>, vector<32xf4E2M1FN>, i8, vector<16xf32>
-  // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<4xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i8, i32, i32) -> vector<4xf32>
-  amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg6 ) * ( %arg8 [ 1 ] * %arg6 ) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<32xf4E2M1FN>, vector<4xi8>, vector<32xf4E2M1FN>, i8, vector<4xf32>
+  // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<4xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>
+  amdgpu.scaled_mfma(%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xi8>, vector<32xf4E2M1FN>, i8, vector<32xf4E2M1FN>, vector<16xf32>
+  // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<4xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
+  amdgpu.scaled_mfma(%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xi8>, vector<32xf4E2M1FN>, i8, vector<32xf4E2M1FN>, vector<4xf32>
 
   func.return
 }
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -164,3 +164,10 @@ func.func @swizzle_bitmode(%arg0 : f32) -> f32 {
   %0 = amdgpu.swizzle_bitmode %arg0 1 2 4 : f32
   func.return %0 : f32
 }
+
+// CHECK-LABEL: func @scaled_mfma
+func.func @scaled_mfma(%arg0 : i8, %arg1 : vector<32xf6E2M3FN>, %arg2 : vector<16xf32>) -> vector<16xf32> {
+  // CHECK: amdgpu.scaled_mfma
+  %0 = amdgpu.scaled_mfma(%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : i8, vector<32xf6E2M3FN>, i8, vector<32xf6E2M3FN>, vector<16xf32>
+  func.return %0 : vector<16xf32>
+}