Skip to content

Commit 9c09c35

Browse files
committed
[mlir][amdgpu] Add scaled_ext_packed{8,16} operations
1 parent 251ae55 commit 9c09c35

File tree

3 files changed

+174
-1
lines changed

3 files changed

+174
-1
lines changed

mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,54 @@ def AMDGPU_ExtPackedFp8Op :
112112
}];
113113
}
114114

115+
def IsValidBlockSize: AttrConstraint<
116+
CPred<"::llvm::cast<::mlir::IntegerAttr>($_self).getInt() == 16 || ::llvm::cast<::mlir::IntegerAttr>($_self).getInt() == 32">,
117+
"whose value is 16 or 32">;
118+
119+
def AMDGPU_ScaledExtPacked816Op
120+
: AMDGPU_Op<"scaled_ext_packed816", [Pure]>,
121+
Arguments<(
122+
ins AnyTypeOf<[VectorOfLengthAndType<[8], [F4E2M1FN,F8E4M3FN,F8E5M2]>,
123+
VectorOfLengthAndType<[16], [F6E2M3FN, F6E3M2FN]>]>:$source,
124+
FixedVectorOfLengthAndType<[4], [F8E8M0FNU]>:$scale,
125+
ConfinedAttr<I32Attr, [IsValidBlockSize]>:$blockSize,
126+
ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<1>]>:$firstScaleLane,
127+
ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<2>]>:$firstScaleByte)>,
128+
Results<(
129+
outs AnyTypeOf<[FixedVectorOfLengthAndType<[8], [F32]>,
130+
FixedVectorOfLengthAndType<[8], [F16]>,
131+
FixedVectorOfLengthAndType<[8], [BF16]>,
132+
FixedVectorOfLengthAndType<[16], [F32]>,
133+
FixedVectorOfLengthAndType<[16], [F16]>,
134+
FixedVectorOfLengthAndType<[16], [BF16]>]>:$res)> {
135+
136+
let summary = "Extend a vector of packed floating point values";
137+
138+
let description = [{
139+
The scales applied to the input microfloats are stored in two bytes which
140+
come from the `scales` input provided in a *half* of the wave identified
141+
by `firstScaleLane`. The pair of bytes used is selected by
142+
`firstScaleByte`. The 16 vectors in consecutive lanes starting from
143+
`firstScaleLane` (which we'll call the scale vectors) will be used by both
144+
halves of the wave (with lane L reading from L % 16'th scale vector), but
145+
each half will use a different byte.
146+
147+
When the block size is 32, `firstScaleByte` can be either 0 or 2,
148+
selecting halves of the scale vectors. Lanes 0-15 will read from
149+
`firstScaleByte` and lanes 16-31 will read from `firstScaleByte` + 1.
150+
151+
However, when the block size is 16, `firstScaleByte` can be 0 or 1.
152+
Lanes 0-15 read from the `firstScaleByte`th element of the scale vectors,
153+
while lanes 16-31 read from `firstScaleByte` + 2.
154+
155+
Note: the layout for the scales generally mirrors how the WMMA
156+
instructions use matrix scales. These selection operands allow
157+
one to choose portions of the matrix to convert.
158+
}];
159+
160+
let hasCustomAssemblyFormat = 1;
161+
}
162+
115163
def AMDGPU_ScaledExtPackedOp
116164
: AMDGPU_Op<"scaled_ext_packed", [Pure]>,
117165
Arguments<(
@@ -860,7 +908,7 @@ def AMDGPU_MFMAOp :
860908
based on the provided `m`, `k`, `n`, and `nBlks` attributes, along with the
861909
types of the source and destination arguments.
862910

863-
For information on the layouts of the input and output matrces (which are stored
911+
For information on the layouts of the input and output matrices (which are stored
864912
in `sourceA`, `sourceB`, `destC`, and `destD`), see the CDNA ISA documentation.
865913

866914
The `cbsz`, `abid`, and `blgp` parameters control how the lanes of the wave

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,76 @@ void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns(
338338
context);
339339
}
340340

341+
//===----------------------------------------------------------------------===//
342+
// ScaledExtPacked816Op
343+
//===----------------------------------------------------------------------===//
344+
mlir::ParseResult ScaledExtPacked816Op::parse(mlir::OpAsmParser &parser,
345+
mlir::OperationState &result) {
346+
// Parse attributes
347+
if (parser.parseOptionalAttrDict(result.attributes))
348+
return failure();
349+
350+
// Parse source operand
351+
OpAsmParser::UnresolvedOperand source;
352+
if (parser.parseOperand(source))
353+
return failure();
354+
355+
if (parser.parseKeyword("scale") || parser.parseLParen())
356+
return failure();
357+
OpAsmParser::UnresolvedOperand scale;
358+
if (parser.parseOperand(scale) || parser.parseRParen())
359+
return failure();
360+
361+
// Parse attributes
362+
IntegerAttr blockSize, firstScaleLane, firstScaleByte;
363+
if (parser.parseKeyword("blockSize") || parser.parseLParen() ||
364+
parser.parseAttribute(blockSize, parser.getBuilder().getI32Type()) ||
365+
parser.parseRParen())
366+
return failure();
367+
368+
if (parser.parseKeyword("firstScaleLane") || parser.parseLParen() ||
369+
parser.parseAttribute(firstScaleLane, parser.getBuilder().getI32Type()) ||
370+
parser.parseRParen())
371+
return failure();
372+
373+
if (parser.parseKeyword("firstScaleByte") || parser.parseLParen() ||
374+
parser.parseAttribute(firstScaleByte, parser.getBuilder().getI32Type()) ||
375+
parser.parseRParen())
376+
return failure();
377+
378+
Type sourceType, resultType;
379+
if (parser.parseColon() || parser.parseType(sourceType) ||
380+
parser.parseKeyword("to") || parser.parseType(resultType))
381+
return failure();
382+
383+
// Resolve operands with types
384+
Type scaleType =
385+
VectorType::get({4}, Float8E8M0FNUType::get(parser.getContext()));
386+
if (parser.resolveOperand(source, sourceType, result.operands) ||
387+
parser.resolveOperand(scale, scaleType, result.operands))
388+
return failure();
389+
390+
result.addAttribute("blockSize", blockSize);
391+
result.addAttribute("firstScaleLane", firstScaleLane);
392+
result.addAttribute("firstScaleByte", firstScaleByte);
393+
394+
result.addTypes(resultType);
395+
return success();
396+
}
397+
398+
void ScaledExtPacked816Op::print(OpAsmPrinter &p) {
399+
p << " ";
400+
p.printOptionalAttrDict(
401+
(*this)->getAttrs(),
402+
/*elideAttrs=*/{"blockSize", "firstScaleLane", "firstScaleByte"});
403+
p << " " << getSource();
404+
p << " scale(" << getScale() << ")";
405+
p << " blockSize(" << getBlockSize() << ")";
406+
p << " firstScaleLane(" << getFirstScaleLane() << ")";
407+
p << " firstScaleByte(" << getFirstScaleByte() << ")";
408+
p << " : " << getSource().getType() << " to " << getRes().getType();
409+
}
410+
341411
//===----------------------------------------------------------------------===//
342412
// WMMAOp
343413
//===----------------------------------------------------------------------===//

mlir/test/Dialect/AMDGPU/ops.mlir

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,61 @@ func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) ->
221221
func.return %ret : vector<2xbf16>
222222
}
223223

224+
// CHECK-LABEL: func.func @scaled_ext_packed816_fp4
225+
func.func @scaled_ext_packed816_fp4(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
226+
// CHECK: amdgpu.scaled_ext_packed816
227+
%ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN> to vector<8xf16>
228+
// CHECK: amdgpu.scaled_ext_packed816
229+
%ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN> to vector<8xbf16>
230+
// CHECK: amdgpu.scaled_ext_packed816
231+
%ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN> to vector<8xf32>
232+
func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
233+
}
234+
235+
// CHECK-LABEL: func.func @scaled_ext_packed816_fp8
236+
func.func @scaled_ext_packed816_fp8(%v: vector<8xf8E4M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
237+
// CHECK: amdgpu.scaled_ext_packed816
238+
%ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN> to vector<8xf16>
239+
// CHECK: amdgpu.scaled_ext_packed816
240+
%ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN> to vector<8xbf16>
241+
// CHECK: amdgpu.scaled_ext_packed816
242+
%ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN> to vector<8xf32>
243+
func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
244+
}
245+
246+
// CHECK-LABEL: func.func @scaled_ext_packed816_bf8
247+
func.func @scaled_ext_packed816_bf8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
248+
// CHECK: amdgpu.scaled_ext_packed816
249+
%ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2> to vector<8xf16>
250+
// CHECK: amdgpu.scaled_ext_packed816
251+
%ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2> to vector<8xbf16>
252+
// CHECK: amdgpu.scaled_ext_packed816
253+
%ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2> to vector<8xf32>
254+
func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
255+
}
256+
257+
// CHECK-LABEL: func.func @scaled_ext_packed816_fp6
258+
func.func @scaled_ext_packed816_fp6(%v: vector<16xf6E2M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) {
259+
// CHECK: amdgpu.scaled_ext_packed816
260+
%ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN> to vector<16xf16>
261+
// CHECK: amdgpu.scaled_ext_packed816
262+
%ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN> to vector<16xbf16>
263+
// CHECK: amdgpu.scaled_ext_packed816
264+
%ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN> to vector<16xf32>
265+
func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32>
266+
}
267+
268+
// CHECK-LABEL: func.func @scaled_ext_packed816_bf16
269+
func.func @scaled_ext_packed816_bf16(%v: vector<16xf6E3M2FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) {
270+
// CHECK: amdgpu.scaled_ext_packed816
271+
%ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN> to vector<16xf16>
272+
// CHECK: amdgpu.scaled_ext_packed816
273+
%ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN> to vector<16xbf16>
274+
// CHECK: amdgpu.scaled_ext_packed816
275+
%ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN> to vector<16xf32>
276+
func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32>
277+
}
278+
224279
// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f32
225280
// CHECK: amdgpu.packed_scaled_trunc
226281
func.func @packed_scaled_trunc_f8e4m3_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E4M3FN> {

0 commit comments

Comments
 (0)