diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 8370d350afd1e..7184de93bfacb 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -112,6 +112,97 @@ def AMDGPU_ExtPackedFp8Op : }]; } +def IsValidBlockSize: AttrConstraint< + CPred<"::llvm::is_contained({16, 32}, ::llvm::cast<::mlir::IntegerAttr>($_self).getInt())">, + "whose value is 16 or 32">; + +def AMDGPU_ScaledExtPacked816Op + : AMDGPU_Op<"scaled_ext_packed816", [Pure, AllShapesMatch<["source", "res"]>]>, + Arguments<( + ins AnyTypeOf<[FixedVectorOfShapeAndType<[8], F4E2M1FN>, + FixedVectorOfShapeAndType<[8], F8E4M3FN>, + FixedVectorOfShapeAndType<[8], F8E5M2>, + FixedVectorOfShapeAndType<[16], F6E2M3FN>, + FixedVectorOfShapeAndType<[16], F6E3M2FN>]>:$source, + FixedVectorOfShapeAndType<[4], F8E8M0FNU>:$scale, + ConfinedAttr:$blockSize, + ConfinedAttr, IntMaxValue<1>]>:$firstScaleLane, + ConfinedAttr, IntMaxValue<2>]>:$firstScaleByte)>, + Results<( + outs AnyTypeOf<[FixedVectorOfShapeAndType<[8], F32>, + FixedVectorOfShapeAndType<[8], F16>, + FixedVectorOfShapeAndType<[8], BF16>, + FixedVectorOfShapeAndType<[16], F32>, + FixedVectorOfShapeAndType<[16], F16>, + FixedVectorOfShapeAndType<[16], BF16>]>:$res)> { + + let summary = "Extend a vector of packed floating point values"; + + let description = [{ + The scales applied to the input microfloats are stored in two bytes which + come from the `scales` input provided in a *half* of the wave identified + by `firstScaleLane`. The pair of bytes used is selected by + `firstScaleByte`. The 16 vectors in consecutive lanes starting from + `firstScaleLane` (which we'll call the scale vectors) will be used by both + halves of the wave (with lane L reading from L % 16'th scale vector), but + each half will use a different byte. + + When the block size is 32, `firstScaleByte` can be either 0 or 2, + selecting halves of the scale vectors. 
Lanes 0-15 will read from + `firstScaleByte` and lanes 16-31 will read from `firstScaleByte` + 1. + For example: + ```mlir + // Input: 8-element vector of F8E4M3FN, converting to F32 + // Lanes 0-15 read from byte 0, lanes 16-31 read from byte 1 + %result = amdgpu.scaled_ext_packed816 %source scale(%scales) + blockSize(32) firstScaleLane(0) firstScaleByte(0) + : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf32> + + // Input: 16-element vector of F6E2M3FN, converting to F16 + // Lanes 0-15 read from byte 2, lanes 16-31 read from byte 3 + %result = amdgpu.scaled_ext_packed816 %source scale(%scales) + blockSize(32) firstScaleLane(1) firstScaleByte(2) + : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf16> + ``` + + However, when the block size is 16, `firstScaleByte` can be 0 or 1. + Lanes 0-15 read from the `firstScaleByte`th element of the scale vectors, + while lanes 16-31 read from `firstScaleByte` + 2. + For example: + ```mlir + // Input: 8-element vector of F8E5M2, converting to BF16 + // Lanes 0-15 read from byte 0, lanes 16-31 read from byte 2 (0+2) + %result = amdgpu.scaled_ext_packed816 %source scale(%scales) + blockSize(16) firstScaleLane(0) firstScaleByte(0) + : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xbf16> + + // Input: 16-element vector of F6E3M2FN, converting to F32 + // Lanes 0-15 read from byte 1, lanes 16-31 read from byte 3 (1+2) + %result = amdgpu.scaled_ext_packed816 %source scale(%scales) + blockSize(16) firstScaleLane(1) firstScaleByte(1) + : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf32> + ``` + + Note: the layout of the scales generally mirrors the one the WMMA + instructions use for matrix scales. These selection operands allow + one to choose portions of the matrix to convert. + + Available on gfx1250+. &#13;
+ }]; + + let assemblyFormat = [{ + attr-dict $source + `scale` `(` $scale `)` + `blockSize` `(` $blockSize `)` + `firstScaleLane` `(` $firstScaleLane`)` + `firstScaleByte` `(` $firstScaleByte `)` + `:` type($source) `,` type($scale) `->` type($res) + }]; + + let hasVerifier = 1; + +} + def AMDGPU_ScaledExtPackedOp : AMDGPU_Op<"scaled_ext_packed", [Pure]>, Arguments<( @@ -860,7 +951,7 @@ def AMDGPU_MFMAOp : based on the provided `m`, `k`, `n`, and `nBlks` attributes, along with the types of the source and destination arguments. - For information on the layouts of the input and output matrces (which are stored + For information on the layouts of the input and output matrices (which are stored in `sourceA`, `sourceB`, `destC`, and `destD`), see the CDNA ISA documentation. The `cbsz`, `abid`, and `blgp` parameters control how the lanes of the wave diff --git a/mlir/include/mlir/IR/CommonTypeConstraints.td b/mlir/include/mlir/IR/CommonTypeConstraints.td index 6b4e3dd603198..8427ba560c8aa 100644 --- a/mlir/include/mlir/IR/CommonTypeConstraints.td +++ b/mlir/include/mlir/IR/CommonTypeConstraints.td @@ -623,6 +623,14 @@ class VectorOfLengthAndType allowedLengths, VectorOfNonZeroRankOf.summary # VectorOfLength.summary, "::mlir::VectorType">; +class FixedVectorOfShapeAndType shape, Type elType>: ShapedContainerType< + [elType], + And<[IsVectorOfShape, IsFixedVectorOfAnyRankTypePred]>, + "vector<" # !interleave(shape, "x") # "x" # elType # ">", + "::mlir::VectorType">, + BuildableType<"::mlir::VectorType::get({" # !interleave(shape, " ,") # "} , " # elType.builderCall # " );">; + + // Any fixed-length vector where the number of elements is from the given // `allowedLengths` list and the type is from the given `allowedTypes` list class FixedVectorOfLengthAndType allowedLengths, diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index f405d0cc7aa02..1c1794d5a1826 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ 
b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -338,6 +338,25 @@ void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns( context); } +//===----------------------------------------------------------------------===// +// ScaledExtPacked816Op +//===----------------------------------------------------------------------===// +LogicalResult ScaledExtPacked816Op::verify() { + int blockSize = getBlockSize(); + assert((blockSize == 16 || blockSize == 32) && "invalid block size"); + int firstScaleByte = getFirstScaleByte(); + if (blockSize == 16 && !llvm::is_contained({0, 1}, firstScaleByte)) { + return emitOpError( + "blockSize of 16 can only have firstScaleByte be 0 or 1."); + } + if (blockSize == 32 && !llvm::is_contained({0, 2}, firstScaleByte)) { + return emitOpError( + "blockSize of 32 can only have firstScaleByte be 0 or 2."); + } + + return success(); +} + //===----------------------------------------------------------------------===// // WMMAOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index 66e7dd4014af9..a8256b16ed8a1 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -238,3 +238,27 @@ func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 : amdgpu.gather_to_lds %mem1[%idx1], %mem2[%idx1] : vector<2xf16>, memref<32xf16>, memref<32xf16, strided<[?]>, #gpu.address_space> func.return } + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_16(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 1.}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(2) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16> + func.return +} + +// ----- + +func.func 
@amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_32(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 32 can only have firstScaleByte be 0 or 2.}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(1) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16> + func.return +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_input_output_sizes(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op failed to verify that all of {source, res} have same shape}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<16xf16> + func.return +} diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 8f427e9d56f45..f9c6899dadfc1 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -221,6 +221,61 @@ func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) -> func.return %ret : vector<2xbf16> } +// CHECK-LABEL: func.func @scaled_ext_packed816_fp4 +func.func @scaled_ext_packed816_fp4(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { + // CHECK: amdgpu.scaled_ext_packed816 + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xbf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> 
vector<8xf32> + func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> +} + +// CHECK-LABEL: func.func @scaled_ext_packed816_fp8 +func.func @scaled_ext_packed816_fp8(%v: vector<8xf8E4M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { + // CHECK: amdgpu.scaled_ext_packed816 + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xbf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf32> + func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> +} + +// CHECK-LABEL: func.func @scaled_ext_packed816_bf8 +func.func @scaled_ext_packed816_bf8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { + // CHECK: amdgpu.scaled_ext_packed816 + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xbf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf32> + func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> +} + +// CHECK-LABEL: func.func @scaled_ext_packed816_fp6 +func.func @scaled_ext_packed816_fp6(%v: vector<16xf6E2M3FN>, 
%scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { + // CHECK: amdgpu.scaled_ext_packed816 + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xbf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf32> + func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32> +} + +// CHECK-LABEL: func.func @scaled_ext_packed816_bf16 +func.func @scaled_ext_packed816_bf16(%v: vector<16xf6E3M2FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { + // CHECK: amdgpu.scaled_ext_packed816 + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xbf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf32> + func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32> +} + // CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f32 // CHECK: amdgpu.packed_scaled_trunc func.func @packed_scaled_trunc_f8e4m3_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E4M3FN> {