-
Notifications
You must be signed in to change notification settings - Fork 15k
[mlir][amdgpu] Add scaled_ext_packed{8,16} operations #159830
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[mlir][amdgpu] Add scaled_ext_packed{8,16} operations #159830
Conversation
|
@llvm/pr-subscribers-mlir-core @llvm/pr-subscribers-backend-amdgpu Author: Erick Ochoa Lopez (amd-eochoalo) Changes: Full diff: https://github.com/llvm/llvm-project/pull/159830.diff 2 Files Affected:
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index a24a918357f2d..d5ea737e229ff 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -112,6 +112,48 @@ def AMDGPU_ExtPackedFp8Op :
}];
}
+def AMDGPU_ScaledExtPacked8Op
+ : AMDGPU_Op<"scaled_ext_packed8", [Pure]>,
+ Arguments<(
+ ins VectorOfLengthAndType<[8], [F4E2M1FN,F8E4M3FN,F8E5M2]>:$source,
+ F32:$scale,
+ ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<7>]>:$index)>,
+ Results<(
+ outs AnyTypeOf<[FixedVectorOfLengthAndType<[8], [F32]>,
+ FixedVectorOfLengthAndType<[8], [F16]>,
+ FixedVectorOfLengthAndType<[8], [BF16]>]>:$res)> {
+ let summary = "Extend a vector of packed floating point values";
+
+ let description = [{
+ Extend and scale eight packed floats in to eight floats and return them.
+ }];
+
+ let assemblyFormat = [{
+ attr-dict $source `,` $scale `[` $index `]` `:` type($source) `to` type($res)
+ }];
+}
+
+def AMDGPU_ScaledExtPacked16Op
+ : AMDGPU_Op<"scaled_ext_packed16", [Pure]>,
+ Arguments<(
+ ins VectorOfLengthAndType<[16], [F6E2M3FN, F6E3M2FN]>:$source,
+ F32:$scale,
+ ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<7>]>:$index)>,
+ Results<(
+ outs AnyTypeOf<[FixedVectorOfLengthAndType<[16], [F32]>,
+ FixedVectorOfLengthAndType<[16], [F16]>,
+ FixedVectorOfLengthAndType<[16], [BF16]>]>:$res)> {
+ let summary = "Extend a vector of packed floating point values";
+
+ let description = [{
+ Extend and scale 16 packed floats to 16 floats and return them.
+ }];
+
+ let assemblyFormat = [{
+ attr-dict $source `,` $scale `[` $index `]` `:` type($source) `to` type($res)
+ }];
+}
+
def AMDGPU_ScaledExtPackedOp
: AMDGPU_Op<"scaled_ext_packed", [Pure]>,
Arguments<(
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 369e0fff538e1..1841c0815b435 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -221,6 +221,61 @@ func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) ->
func.return %ret : vector<2xbf16>
}
+// CHECK-LABEL: func.func @scaled_ext_packed8_fp4
+func.func @scaled_ext_packed8_fp4(%v: vector<8xf4E2M1FN>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xbf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf32>
+ func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed8_fp8
+func.func @scaled_ext_packed8_fp8(%v: vector<8xf8E4M3FN>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xbf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf32>
+ func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed8_bf8
+func.func @scaled_ext_packed8_bf8(%v: vector<8xf8E5M2>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xbf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xf32>
+ func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed16_fp6
+func.func @scaled_ext_packed16_fp6(%v: vector<16xf6E2M3FN>, %scale: f32) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret0 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf16>
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret1 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xbf16>
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret2 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf32>
+ func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed16_bf16
+func.func @scaled_ext_packed16_bf16(%v: vector<16xf6E3M2FN>, %scale: f32) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret0 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf16>
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret1 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xbf16>
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret2 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf32>
+ func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32>
+}
+
// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f32
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_f8e4m3_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E4M3FN> {
|
|
@llvm/pr-subscribers-mlir Author: Erick Ochoa Lopez (amd-eochoalo) Changes: Full diff: https://github.com/llvm/llvm-project/pull/159830.diff 2 Files Affected:
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index a24a918357f2d..d5ea737e229ff 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -112,6 +112,48 @@ def AMDGPU_ExtPackedFp8Op :
}];
}
+def AMDGPU_ScaledExtPacked8Op
+ : AMDGPU_Op<"scaled_ext_packed8", [Pure]>,
+ Arguments<(
+ ins VectorOfLengthAndType<[8], [F4E2M1FN,F8E4M3FN,F8E5M2]>:$source,
+ F32:$scale,
+ ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<7>]>:$index)>,
+ Results<(
+ outs AnyTypeOf<[FixedVectorOfLengthAndType<[8], [F32]>,
+ FixedVectorOfLengthAndType<[8], [F16]>,
+ FixedVectorOfLengthAndType<[8], [BF16]>]>:$res)> {
+ let summary = "Extend a vector of packed floating point values";
+
+ let description = [{
+ Extend and scale eight packed floats in to eight floats and return them.
+ }];
+
+ let assemblyFormat = [{
+ attr-dict $source `,` $scale `[` $index `]` `:` type($source) `to` type($res)
+ }];
+}
+
+def AMDGPU_ScaledExtPacked16Op
+ : AMDGPU_Op<"scaled_ext_packed16", [Pure]>,
+ Arguments<(
+ ins VectorOfLengthAndType<[16], [F6E2M3FN, F6E3M2FN]>:$source,
+ F32:$scale,
+ ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<7>]>:$index)>,
+ Results<(
+ outs AnyTypeOf<[FixedVectorOfLengthAndType<[16], [F32]>,
+ FixedVectorOfLengthAndType<[16], [F16]>,
+ FixedVectorOfLengthAndType<[16], [BF16]>]>:$res)> {
+ let summary = "Extend a vector of packed floating point values";
+
+ let description = [{
+ Extend and scale 16 packed floats to 16 floats and return them.
+ }];
+
+ let assemblyFormat = [{
+ attr-dict $source `,` $scale `[` $index `]` `:` type($source) `to` type($res)
+ }];
+}
+
def AMDGPU_ScaledExtPackedOp
: AMDGPU_Op<"scaled_ext_packed", [Pure]>,
Arguments<(
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 369e0fff538e1..1841c0815b435 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -221,6 +221,61 @@ func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) ->
func.return %ret : vector<2xbf16>
}
+// CHECK-LABEL: func.func @scaled_ext_packed8_fp4
+func.func @scaled_ext_packed8_fp4(%v: vector<8xf4E2M1FN>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xbf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf32>
+ func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed8_fp8
+func.func @scaled_ext_packed8_fp8(%v: vector<8xf8E4M3FN>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xbf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf32>
+ func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed8_bf8
+func.func @scaled_ext_packed8_bf8(%v: vector<8xf8E5M2>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xbf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xf32>
+ func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed16_fp6
+func.func @scaled_ext_packed16_fp6(%v: vector<16xf6E2M3FN>, %scale: f32) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret0 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf16>
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret1 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xbf16>
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret2 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf32>
+ func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed16_bf16
+func.func @scaled_ext_packed16_bf16(%v: vector<16xf6E3M2FN>, %scale: f32) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret0 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf16>
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret1 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xbf16>
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret2 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf32>
+ func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32>
+}
+
// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f32
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_f8e4m3_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E4M3FN> {
|
| }]; | ||
| } | ||
|
|
||
| def AMDGPU_ScaledExtPacked16Op |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't make distinct operations here. Instead, loosen the definition of scaled_ext_packed and add checks for chip compatibility to the lowering.
If that's not feasible, get back to me.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I did it here: c3832b0. Is the assembly format acceptable to you?
|
On further offline discussion, I'll need more context / we'll want to see if two separate ops are actually the better design here. |
|
Between f92db34 and c3832b0, I prefer f92db34. By having two optional attributes which are in an XOR relationship, the constructors for this operation will always require a nullptr, and getting the attributes will always get a […]. Between f92db34 and merging these two operations into their own distinct operation, one thing to notice is that we will need a verifier to make sure the types are correctly matched (not a big deal). We also need to choose a name for this operation since […] |
0d09fc6 to
9c09c35
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM but wait for @krzysz00 before merging
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM after one comment
| }]; | ||
|
|
||
| let extraClassDeclaration = [{ | ||
| static Type getScaleType(MLIRContext *ctx) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What's this for?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would like the assembly format to only have type($source) to type($res).
Without using
TypesMatchWith<"scale type is fixed",
"source", "scale",
"ScaledExtPacked816Op::getScaleType($_self.getContext())">]
// (which requires the definition of this extra class declaration)
The assembly format parser generator gives an error stating
error: type of operand #1, named 'scale', is not buildable and a buildable type cannot be inferred
attr-dict $source
I can inline this function like this:
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 05525d3a061d..210097138807 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -119,7 +119,7 @@ def IsValidBlockSize: AttrConstraint<
def AMDGPU_ScaledExtPacked816Op
: AMDGPU_Op<"scaled_ext_packed816", [Pure, TypesMatchWith<"scale type is fixed",
"source", "scale",
- "ScaledExtPacked816Op::getScaleType($_self.getContext())">]>,
+ "VectorType::get(4, Float8E8M0FNUType::get($_self.getContext()))">]>,
Arguments<(
ins AnyTypeOf<[VectorOfLengthAndType<[8], [F4E2M1FN,F8E4M3FN,F8E5M2]>,
VectorOfLengthAndType<[16], [F6E2M3FN, F6E3M2FN]>]>:$source,
@@ -170,12 +170,6 @@ def AMDGPU_ScaledExtPacked816Op
`:` type($source) `to` type($res)
}];
- let extraClassDeclaration = [{
- static Type getScaleType(MLIRContext *ctx) {
- return VectorType::get(4, Float8E8M0FNUType::get(ctx));
- }
- }];
-
}
def AMDGPU_ScaledExtPackedOp
or, if you prefer, add the type declaration, or maybe another solution?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
- I don't think you want TypesMatchWith here - that implies a relationship between two types.
- What you're looking for goes something like this:
def Vector4Scales :
AllOfType<[IsVectorOfShape<[4]>, IsVectorOfType<[F8E8M0FNU]>,
"vector of 4 F8E8M0FNU scales",
"::mlir::VectorType">,
BuildableType<"::mlir::VectorType::get($_builder.getType<::mlir::Float8E8M0FNUType>, {4});">;
and then use that in the op definition
(See mlir/include/mlir/IR/CommonTypeConstraints.td for where all those tablegen bits came from.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(The top half of that is defining the constraint the type has to obey. The BuildableType bit is a tag that tells the tablegen bits "Hey, this is one very specific type, you can just ... create it yourself and don't have to parse it".)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you! I was looking for exactly this!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for pointing me to BuildableType. I had some issues using it exactly as you described above, but in the end I learned a little bit more about Predicates and Types. I ended up using the following:
def Vector4Scales :
AllOfType<[FixedVectorOfLengthAndType<[4], [F8E8M0FNU]>],
"vector of 4 F8E8M0FNU scales",
"::mlir::VectorType">,
BuildableType<"::mlir::VectorType::get({4}, $_builder.getType<::mlir::Float8E8M0FNUType>());">;
If I understand correctly, FixedVectorOfLengthAndType should already take care of the constraints we need (instead of using/defining the constraints [IsVectorOfShape<[4]>, IsVectorOfType<[F8E8M0FNU]>]), and this adds the necessary BuildableType which I was missing.
| `firstScaleByte` `(` $firstScaleByte `)` | ||
| `:` type($source) `to` type($res) | ||
| }]; | ||
|
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a verifier that errors out on invalid block size / firstScaleByte combinations?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the review! 4f83cd9
|
|
||
|
|
||
| def Vector4Scales : | ||
| AllOfType<[FixedVectorOfLengthAndType<[4], [F8E8M0FNU]>], |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This technically allows vector<2x2xf8E8M0FNU>
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM % IR example in the description
No description provided.