llvm
diff --git a/‎mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td‎
Lines changed: 93 additions & 0 deletions b/‎mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td‎
Lines changed: 93 additions & 0 deletions
diff --git a/‎mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp‎
Lines changed: 172 additions & 30 deletions b/‎mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp‎
Lines changed: 172 additions & 30 deletions
@@ -962,6 +962,15 @@ def MFMAOutTypes : AnyTypeOf<[F64,
 def ScaledMFMAInTypes : AnyTypeOf<[VectorOfLengthAndType<[32], [F8E5M2, F8E4M3FN]>,
                                    VectorOfLengthAndType<[32], [F6E2M3FN, F6E3M2FN, F4E2M1FN]>]>;
 def ScaledMFMAOutTypes : AnyTypeOf<[VectorOfLengthAndType<[4, 16], [F32]>]>;
+
+// scaled_wmma
+// Source operand types: 64-element vectors of fp8 (E5M2, E4M3FN) or fp6
+// (E2M3FN, E3M2FN), or 64- or 128-element vectors of fp4 (E2M1FN).
+def ScaledWMMAInTypes
+    : AnyTypeOf<[VectorOfLengthAndType<[64], [F8E5M2, F8E4M3FN]>,
+                 VectorOfLengthAndType<[64], [F6E2M3FN, F6E3M2FN]>,
+                 VectorOfLengthAndType<[64, 128], [F4E2M1FN]>]>;
+
+// Accumulator/result types: f32 vectors of length 8 or 16.
+def ScaledWMMAOutTypes : AnyTypeOf<[VectorOfLengthAndType<[8, 16], [F32]>]>;
+
 // wmma
 def WMMAInTypes : AnyTypeOf<[VectorOfLengthAndType<[2], [F32]>,
                              VectorOfLengthAndType<[4, 8, 16], [F16, BF16]>,
@@ -1229,6 +1238,90 @@ def AMDGPU_ScaledMFMAOp :
   let hasCanonicalizer = 1;
 }
 
+def AMDGPU_ScaledWMMAOp
+    : AMDGPU_Op<"scaled_wmma", [AllTypesMatch<["destC", "destD"]>, Pure]>,
+      Arguments<(ins ConfinedAttr<I32Attr, [IntIsOneOf<[16, 32]>]>:$m,
+          ConfinedAttr<I32Attr, [IntIsOneOf<[16]>]>:$n,
+          ConfinedAttr<I32Attr, [IntIsOneOf<[128]>]>:$k,
+          ScaledWMMAInTypes:$sourceA, ScaledWMMAInTypes:$sourceB,
+          ScaledWMMAOutTypes:$destC,
+          VectorOfLengthAndType<[4, 8], [F8E8M0FNU, F8E4M3FN]>:$scaleA,
+          ConfinedAttr<I32Attr, [IntIsOneOf<[0, 16]>]>:$a_first_scale_lane,
+          VectorOfLengthAndType<[4, 8], [F8E8M0FNU, F8E4M3FN]>:$scaleB,
+          ConfinedAttr<I32Attr, [IntIsOneOf<[0, 16]>]>:$b_first_scale_lane)>,
+      Results<(outs ScaledWMMAOutTypes:$destD)> {
+  // TODO: E5M3FNU scales are supported, but there is not yet MLIR support for
+  // this datatype. Once we have support for that, update the scaleA and scaleB
+  // types here.
+  let summary = "MLIR wrapper for scaled wmma instructions";
+  let description = [{
+    The `amdgpu.scaled_wmma` op is an MLIR wrapper around intrinsics for scaled
+    `wmma` instructions. These instructions perform matrix multiplication with
+    per-block scaling of inputs, supporting fp4, fp6, and fp8 data formats.
+
+    The scale instructions support a block size of 16 or 32 and two tile sizes:
+    - 16x16x128 with mixed f8/f6/f4 formats (output: vector<8xf32>)
+    - 32x16x128 with f4 format only (output: vector<16xf32>)
+
+    Scale parameters (`scaleA`, `scaleB`) are small vectors of f8 scale values
+    (either f8E8M0FNU, or f8E4M3FN) that are packed into i32/i64 values during
+    lowering. Each lane can operate on 4 bytes (4 scale values), and the
+    number of scales required for each matrix is determined by:
+      num_scales_A = (M × K) / block_size
+      num_scales_B = (N × K) / block_size
+      
+    The index attributes (`a_first_scale_lane`, `b_first_scale_lane`) select
+    which lane to start reading scale values from (0 or 16):
+    - For block size 32, 32 lanes across a single wave are used for the scale
+    values. If the number of scales (num_scales_A or num_scales_B) can fit
+    into half of the available lanes
+    (i.e., num_scales / scales_per_lane == 16 (num_lanes)),
+    then the first_scale_lane can be either 0 or 16. If all lanes are required
+    for storing the scale values (num_scales / scales_per_lane == 32 (num_lanes)),
+    then the first_scale_lane must be 0.
+    - For block size 16, the same rules apply as above except that there are 64
+    lanes across two waves that are used for the scale values. When
+    num_scales / scales_per_lane == 32 (num_lanes), then 16 lanes from each wave are used.
+    first_scale_lane of 0 or 16 will decide which lanes are used for this. When
+    num_scales / scales_per_lane == 64 (num_lanes), then first_scale_lane must
+    be set to 0.
+    
+    - Block size 32: For a tile size of 16x16x128, each matrix gets 64 scales
+      stored in 16 lanes, with `a_first_scale_lane`/`b_first_scale_lane`
+      selecting lanes 0-15 (index=0) or lanes 16-31 (index=16). For 32x16x128,
+      matrix A gets 128 scales in a full VGPR (`a_first_scale_lane` is unused),
+      while matrix B gets 64 scales in half a VGPR.
+    - Block size 16: For a tile size of 16x16x128, each matrix gets
+      128 scales stored in half of two VGPRs, with `a_first_scale_lane`/`b_first_scale_lane`
+      selecting lanes 0-15 (index=0) or 16-31 (index=16) for each of the VGPRs.
+      For 32x16x128, matrix A gets 256 scales in two VGPRs (`a_first_scale_lane` is unused),
+      while matrix B gets 128 scales stored in half of two VGPRs.
+
+    Example:
+    ```mlir
+      // 16x16x128: fp8 inputs
+      %0 = amdgpu.scaled_wmma 16x16x128 (%scaleVecA * %matA) * (%scaleVecB * %matB) + %matC
+        {a_first_scale_lane = 0 : i32, b_first_scale_lane = 0 : i32}
+        : vector<4xf8E8M0FNU>, vector<64xf8E4M3FN>,
+        vector<4xf8E8M0FNU>, vector<64xf8E4M3FN>, vector<8xf32>
+
+      // 32x16x128: fp4 inputs with different scale lanes
+      %1 = amdgpu.scaled_wmma 32x16x128 (%scaleVecD * %matD) * (%scaleVecE * %matE) + %matF
+        {a_first_scale_lane = 0 : i32, b_first_scale_lane = 16 : i32}
+        : vector<8xf8E4M3FN>, vector<128xf4E2M1FN>,
+        vector<8xf8E4M3FN>, vector<64xf4E2M1FN>, vector<16xf32>
+    ```
+  }];
+  let assemblyFormat = [{
+    custom<MNKDimensionList>($m, $n, $k) ` `
+    `(` $scaleA `*` $sourceA `)` `*`
+    `(` $scaleB `*` $sourceB `)` `+` $destC
+    attr-dict
+    `:` type($scaleA) `,` type($sourceA) `,` type($scaleB) `,` type($sourceB) `,` type($destC)
+  }];
+  let hasVerifier = 1;
+}
+
 def AMDGPU_MakeDmaBaseOp :
     AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments, AllElementTypesMatch<["global", "lds"]>]>,
     Arguments<(ins Arg<AnyMemRef>:$global,
 
@@ -619,8 +619,8 @@ struct SchedBarrierOpLowering : public ConvertOpToLLVMPattern<SchedBarrierOp> {
 
 } // namespace
 
-/// Converts a MFMA vector operand from MLIR AMDGPU dialect convention to ROCDL
-/// and LLVM AMDGPU intrinsics convention.
+/// Pack small float vector operands (fp4/fp6/fp8/bf16) into the format
+/// expected by matrix multiply intrinsics (MFMA and scaled MFMA/WMMA).
 ///
 /// Specifically:
 /// 1. If the element type is bfloat16, bitcast it to i16 unless rocdl intrinsic
@@ -634,9 +634,9 @@ struct SchedBarrierOpLowering : public ConvertOpToLLVMPattern<SchedBarrierOp> {
 /// Note that the type of `input` has already been LLVM type converted:
 /// therefore 8-bit and smaller floats are represented as their corresponding
 /// `iN` integers.
-static Value convertMFMAVectorOperand(ConversionPatternRewriter &rewriter,
-                                      Location loc, Value input,
-                                      bool allowBf16 = true) {
+static Value packSmallFloatVectorOperand(ConversionPatternRewriter &rewriter,
+                                         Location loc, Value input,
+                                         bool allowBf16 = true) {
   Type inputType = input.getType();
   if (auto vectorType = dyn_cast<VectorType>(inputType)) {
     if (vectorType.getElementType().isBF16() && !allowBf16)
@@ -660,23 +660,60 @@ static Value convertMFMAVectorOperand(ConversionPatternRewriter &rewriter,
   return input;
 }
 
-/// Converts the scaled MFMA operands, `scalesA` and `scalesB`, from MLIR AMDGPU
-/// dialect convention to ROCDL and LLVM AMDGPU intrinsics convention.
+/// Converts the scaled MFMA/WMMA operands, `scalesA` and `scalesB`, from MLIR
+/// AMDGPU dialect convention to ROCDL and LLVM AMDGPU intrinsics convention.
 ///
 /// Specifically:
 /// 1. If `input` is a i8 value, zero extend it to i32
-/// 2. If `input` is a vector of length 4 and type i8, cast it to i32
+/// 2. If `input` is vector<4xi8> or vector<8xi8>, bitcast it to i32 or i64
 ///
 /// Note that the type of `input` has already been LLVM type converted:
 /// therefore 8-bit and smaller floats are represented as their corresponding
 /// `iN` integers.
-static Value castMFMAScaleOperand(ConversionPatternRewriter &rewriter,
-                                  Location loc, Value input) {
-  Type inputType = input.getType();
-  Type outputType = rewriter.getI32Type();
-  if (auto intType = dyn_cast<IntegerType>(inputType))
-    return LLVM::ZExtOp::create(rewriter, loc, outputType, input);
-  return LLVM::BitcastOp::create(rewriter, loc, outputType, input);
+static Value castScaleOperand(ConversionPatternRewriter &rewriter, Location loc,
+                              Value input) {
+  Type scaleType = input.getType();
+
+  // A lone integer scale is widened to i32 with a zero extension.
+  if (isa<IntegerType>(scaleType))
+    return LLVM::ZExtOp::create(rewriter, loc, rewriter.getI32Type(), input);
+
+  // Anything else has to be a packed scale vector: four elements are
+  // reinterpreted as i32, eight elements as i64.
+  auto packedType = dyn_cast<VectorType>(scaleType);
+  if (!packedType)
+    llvm_unreachable("unexpected input type for scale operand");
+
+  int64_t scaleCount = packedType.getNumElements();
+  assert((scaleCount == 4 || scaleCount == 8) &&
+         "scale operand must be a vector of length 4 or 8");
+  IntegerType packedIntType;
+  if (scaleCount == 4)
+    packedIntType = rewriter.getI32Type();
+  else
+    packedIntType = rewriter.getI64Type();
+  return LLVM::BitcastOp::create(rewriter, loc, packedIntType, input);
+}
+
+/// Maps the element type of a scale vector to the scale format code passed to
+/// the scaled WMMA intrinsics via `fmtScaleA`/`fmtScaleB`: 0 for f8E8M0FNU and
+/// 2 for f8E4M3FN. Returns std::nullopt for any other type.
+static std::optional<uint32_t> getWmmaScaleFormat(Type elemType) {
+  // Unsigned literals keep the lambdas' return type in line with the
+  // `uint32_t` result, matching `smallFloatTypeToFormatCode`.
+  return TypeSwitch<Type, std::optional<uint32_t>>(elemType)
+      .Case([](Float8E8M0FNUType) { return 0u; })
+      .Case([](Float8E4M3FNType) { return 2u; })
+      .Default(std::nullopt);
+}
+
+/// Selects the ROCDL scaled WMMA intrinsic for an MxNxK tile. `isScale16`
+/// picks the `scale16` flavor (scale block size 16) over the plain `scale`
+/// flavor (scale block size 32). Returns std::nullopt for unsupported tiles.
+static std::optional<StringRef>
+getScaledWmmaIntrinsicName(int64_t m, int64_t n, int64_t k, bool isScale16) {
+  // Both supported tiles share N = 16 and K = 128; only M tells them apart.
+  if (n != 16 || k != 128)
+    return std::nullopt;
+
+  if (m == 16) {
+    if (isScale16)
+      return ROCDL::wmma_scale16_f32_16x16x128_f8f6f4::getOperationName();
+    return ROCDL::wmma_scale_f32_16x16x128_f8f6f4::getOperationName();
+  }
+
+  if (m == 32) {
+    if (isScale16)
+      return ROCDL::wmma_scale16_f32_32x16x128_f4::getOperationName();
+    return ROCDL::wmma_scale_f32_32x16x128_f4::getOperationName();
+  }
+
+  return std::nullopt;
 }
 
 /// Push an input operand. If it is a float type, nothing to do. If it is
@@ -925,7 +962,7 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
   return std::nullopt;
 }
 
-static std::optional<uint32_t> mfmaTypeSelectCode(Type mlirElemType) {
+static std::optional<uint32_t> smallFloatTypeToFormatCode(Type mlirElemType) {
   return llvm::TypeSwitch<Type, std::optional<uint32_t>>(mlirElemType)
       .Case([](Float8E4M3FNType) { return 0u; })
       .Case([](Float8E5M2Type) { return 1u; })
@@ -954,8 +991,8 @@ mfmaOpToScaledIntrinsic(Type aType, Type bType, Type destType, uint32_t m,
   if (!isa<Float32Type>(destType))
     return std::nullopt;
 
-  std::optional<uint32_t> aTypeCode = mfmaTypeSelectCode(aType);
-  std::optional<uint32_t> bTypeCode = mfmaTypeSelectCode(bType);
+  std::optional<uint32_t> aTypeCode = smallFloatTypeToFormatCode(aType);
+  std::optional<uint32_t> bTypeCode = smallFloatTypeToFormatCode(bType);
   if (!aTypeCode || !bTypeCode)
     return std::nullopt;
 
@@ -1219,9 +1256,9 @@ struct MFMAOpLowering : public ConvertOpToLLVMPattern<MFMAOp> {
     }();
     OperationState loweredOp(loc, intrinsicName);
     loweredOp.addTypes(intrinsicOutType);
-    loweredOp.addOperands({convertMFMAVectorOperand(
+    loweredOp.addOperands({packSmallFloatVectorOperand(
                                rewriter, loc, adaptor.getSourceA(), allowBf16),
-                           convertMFMAVectorOperand(
+                           packSmallFloatVectorOperand(
                                rewriter, loc, adaptor.getSourceB(), allowBf16),
                            adaptor.getDestC()});
     if (isScaled) {
@@ -1268,8 +1305,8 @@ struct ScaledMFMAOpLowering : public ConvertOpToLLVMPattern<ScaledMFMAOp> {
     OperationState loweredOp(loc, intrinsicName);
     loweredOp.addTypes(intrinsicOutType);
     loweredOp.addOperands(
-        {convertMFMAVectorOperand(rewriter, loc, adaptor.getSourceA()),
-         convertMFMAVectorOperand(rewriter, loc, adaptor.getSourceB()),
+        {packSmallFloatVectorOperand(rewriter, loc, adaptor.getSourceA()),
+         packSmallFloatVectorOperand(rewriter, loc, adaptor.getSourceB()),
          adaptor.getDestC()});
     Value scalesIdxA =
         createI32Constant(rewriter, loc, adaptor.getScalesIdxA());
@@ -1280,10 +1317,10 @@ struct ScaledMFMAOpLowering : public ConvertOpToLLVMPattern<ScaledMFMAOp> {
          createI32Constant(rewriter, loc, bTypeCode),
          /*scales idx A=*/scalesIdxA,
          /*scales A*/
-         castMFMAScaleOperand(rewriter, loc, adaptor.getScalesA()),
+         castScaleOperand(rewriter, loc, adaptor.getScalesA()),
          /*scales idx B=*/scalesIdxB,
          /*scales B*/
-         castMFMAScaleOperand(rewriter, loc, adaptor.getScalesB())});
+         castScaleOperand(rewriter, loc, adaptor.getScalesB())});
     Value lowered = rewriter.create(loweredOp)->getResult(0);
     rewriter.replaceOp(op, lowered);
     return success();
@@ -1370,6 +1407,111 @@ struct WMMAOpLowering : public ConvertOpToLLVMPattern<WMMAOp> {
   }
 };
 
+/// Lowers `amdgpu.scaled_wmma` to the matching ROCDL scaled WMMA intrinsic.
+///
+/// The source vectors are packed with `packSmallFloatVectorOperand`, the scale
+/// vectors are bitcast to integers with `castScaleOperand`, and the data
+/// formats, scale formats and first-scale-lane selectors are attached to the
+/// intrinsic op as attributes. Chipsets older than gfx1250 are diagnosed.
+struct ScaledWMMAOpLowering : public ConvertOpToLLVMPattern<ScaledWMMAOp> {
+  ScaledWMMAOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
+      : ConvertOpToLLVMPattern<ScaledWMMAOp>(converter), chipset(chipset) {}
+
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(ScaledWMMAOp op, ScaledWMMAOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    auto outType =
+        typeConverter->convertType<VectorType>(op.getDestD().getType());
+    if (!outType)
+      return rewriter.notifyMatchFailure(op, "type conversion failed");
+
+    if (chipset < kGfx1250)
+      return op.emitOpError("WMMA scale only supported on gfx1250+");
+
+    int64_t m = op.getM();
+    int64_t n = op.getN();
+    int64_t k = op.getK();
+
+    Type aElemType = getElementTypeOrSelf(op.getSourceA().getType());
+    Type bElemType = getElementTypeOrSelf(op.getSourceB().getType());
+
+    std::optional<uint32_t> aFmtCode = smallFloatTypeToFormatCode(aElemType);
+    std::optional<uint32_t> bFmtCode = smallFloatTypeToFormatCode(bElemType);
+
+    if (!aFmtCode || !bFmtCode)
+      return op.emitOpError("unsupported element types for scaled_wmma");
+
+    // Get scale vector types and determine variant (scale vs scale16).
+    auto scaleAVecType = cast<VectorType>(op.getScaleA().getType());
+    auto scaleBVecType = cast<VectorType>(op.getScaleB().getType());
+
+    if (scaleAVecType.getNumElements() != scaleBVecType.getNumElements())
+      return op.emitOpError("scaleA and scaleB must have equal vector length");
+
+    // Extract scale format from element types.
+    Type scaleAElemType = scaleAVecType.getElementType();
+    Type scaleBElemType = scaleBVecType.getElementType();
+
+    std::optional<uint32_t> scaleAFmt = getWmmaScaleFormat(scaleAElemType);
+    std::optional<uint32_t> scaleBFmt = getWmmaScaleFormat(scaleBElemType);
+
+    if (!scaleAFmt || !scaleBFmt)
+      return op.emitOpError("unsupported scale element types");
+
+    // Determine which intrinsic to use based on dimensions. Eight scales per
+    // operand select the scale16 variant, four select the plain scale variant.
+    bool isScale16 = (scaleAVecType.getNumElements() == 8);
+    std::optional<StringRef> intrinsicName =
+        getScaledWmmaIntrinsicName(m, n, k, isScale16);
+    if (!intrinsicName)
+      return op.emitOpError("unsupported scaled_wmma dimensions: ")
+             << m << "x" << n << "x" << k;
+
+    // Up to nine attributes are attached below (fmtA, fmtB, modC, two scale
+    // selectors, two scale formats, two reuse flags), so size the inline
+    // storage to hold all of them without a heap allocation.
+    SmallVector<NamedAttribute, 9> attrs;
+
+    // The f4 variant does not have fmtA and fmtB attributes.
+    bool is32x16 = (m == 32 && n == 16 && k == 128);
+    if (!is32x16) {
+      attrs.emplace_back("fmtA", rewriter.getI32IntegerAttr(*aFmtCode));
+      attrs.emplace_back("fmtB", rewriter.getI32IntegerAttr(*bFmtCode));
+    }
+
+    // modC uses default value of 0.
+    attrs.emplace_back("modC", rewriter.getI16IntegerAttr(0));
+
+    // Scale attributes. Convert user-facing firstScaleLane (0 or 16) to the
+    // half of the wave that is being selected (0 or 1).
+    attrs.emplace_back(
+        "scaleAType", rewriter.getI32IntegerAttr(op.getAFirstScaleLane() / 16));
+    attrs.emplace_back("fmtScaleA", rewriter.getI32IntegerAttr(*scaleAFmt));
+    attrs.emplace_back(
+        "scaleBType", rewriter.getI32IntegerAttr(op.getBFirstScaleLane() / 16));
+    attrs.emplace_back("fmtScaleB", rewriter.getI32IntegerAttr(*scaleBFmt));
+
+    // Reuse flags use default value of false.
+    attrs.emplace_back("reuseA", rewriter.getBoolAttr(false));
+    attrs.emplace_back("reuseB", rewriter.getBoolAttr(false));
+
+    // Convert typed float vectors to packed format.
+    Value sourceA =
+        packSmallFloatVectorOperand(rewriter, loc, adaptor.getSourceA());
+    Value sourceB =
+        packSmallFloatVectorOperand(rewriter, loc, adaptor.getSourceB());
+
+    // Pack scale vectors into i32/i64.
+    Value packedScaleA = castScaleOperand(rewriter, loc, adaptor.getScaleA());
+    Value packedScaleB = castScaleOperand(rewriter, loc, adaptor.getScaleB());
+
+    // Create the intrinsic call.
+    OperationState loweredOp(loc, *intrinsicName);
+    loweredOp.addTypes(outType);
+    loweredOp.addOperands(
+        {sourceA, sourceB, adaptor.getDestC(), packedScaleA, packedScaleB});
+    loweredOp.addAttributes(attrs);
+
+    Operation *lowered = rewriter.create(loweredOp);
+    rewriter.replaceOp(op, lowered->getResults());
+
+    return success();
+  }
+};
+
 struct TransposeLoadOpLowering
     : public ConvertOpToLLVMPattern<TransposeLoadOp> {
   TransposeLoadOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
@@ -2780,11 +2922,11 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
                           ROCDL::RawPtrBufferAtomicCmpSwap>,
       AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering,
       SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering,
-      WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPackedMatrixOpLowering,
-      ScaledExtPackedOpLowering, PackedScaledTruncOpLowering,
-      PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering,
-      GatherToLDSOpLowering, TransposeLoadOpLowering, AMDGPUPermlaneLowering,
-      AMDGPUMakeDmaBaseLowering, AMDGPUMakeDmaDescriptorLowering>(converter,
-                                                                  chipset);
+      WMMAOpLowering, ScaledWMMAOpLowering, ExtPackedFp8OpLowering,
+      ScaledExtPackedMatrixOpLowering, ScaledExtPackedOpLowering,
+      PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
+      PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
+      TransposeLoadOpLowering, AMDGPUPermlaneLowering,AMDGPUMakeDmaBaseLowering,
+      AMDGPUMakeDmaDescriptorLowering>(converter, chipset);
   patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
 }