From ece99af0a9c42390ad77ffb59d6e4473f2fd3644 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 31 Jan 2025 22:55:50 +0000 Subject: [PATCH 1/3] add mem side effects interface --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 393 +++++++++--------- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 26 +- 2 files changed, 228 insertions(+), 191 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index c2335eecc3781..d98aa9ffb26f1 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -276,97 +276,103 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { } -def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "TensorDesc"]>]> { - let summary = "loads a n-D block from memory (represented by TensorDesc)" - "to registers (represented by vector)"; - let description = [{ - LoadNdOp essentially mimics the hardware block read instruction to read - a block of data from memory to register. It takes a set of optional cache - hints for each level of cache, L1, L2 and L3. If hardware does not have a - correspoding cache, Corresponding cache hint attribute will be masked. - VNNI transformation is an hardware feature for Intel GPU, which is used to - do data packing during the load for B operand of matrix operation, if - the bit width of the data type is less then 32 bits, e.g., fp16. And - transpose is another Intel hardware feature, which will do transpose - operation when loading the data if the bit width of the data type is - fp32 or fp64. It implies that vnni and transpose cannot exit at the - same time. - - Example: - ```mlir - xegpu.load_nd %1 {transpose = [1, 0], - l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} - : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32> - ``` - - - }]; - - let arguments = (ins XeGPU_TensorDesc: $TensorDesc, - OptionalAttr: $packed, - OptionalAttr: $transpose, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint); - - let results = (outs XeGPU_ValueType: $value); - - let extraClassDeclaration = extraBaseClassDeclaration # [{ - VectorType getType() { - return llvm::dyn_cast(getValue().getType()); - } +def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ + AllElementTypesMatch<["value", "TensorDesc"]>, + DeclareOpInterfaceMethods + ]> { + let summary = "loads a n-D block from memory (represented by TensorDesc)" + "to registers (represented by vector)"; + let description = [{ + LoadNdOp essentially mimics the hardware block read instruction to read + a block of data from memory to register. It takes a set of optional cache + hints for each level of cache, L1, L2 and L3. If hardware does not have a + correspoding cache, Corresponding cache hint attribute will be masked. + VNNI transformation is an hardware feature for Intel GPU, which is used to + do data packing during the load for B operand of matrix operation, if + the bit width of the data type is less then 32 bits, e.g., fp16. And + transpose is another Intel hardware feature, which will do transpose + operation when loading the data if the bit width of the data type is + fp32 or fp64. It implies that vnni and transpose cannot exit at the + same time. + + Example: + ```mlir + xegpu.load_nd %1 {transpose = [1, 0], + l1_hint = #xegpu.cache_hint, + l2_hint = #xegpu.cache_hint, + l3_hint = #xegpu.cache_hint} + : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32> + ``` + + + }]; + + let arguments = (ins XeGPU_TensorDesc: $TensorDesc, + OptionalAttr: $packed, + OptionalAttr: $transpose, + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint); + + let results = (outs XeGPU_ValueType: $value); + + let extraClassDeclaration = extraBaseClassDeclaration # [{ + VectorType getType() { + return llvm::dyn_cast(getValue().getType()); + } - xegpu::TensorDescType getTensorDescType() { - return getTensorDesc().getType(); - } - }]; + xegpu::TensorDescType getTensorDescType() { + return getTensorDesc().getType(); + } + }]; - let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)"; - let hasVerifier = 1; + let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)"; + let hasVerifier = 1; } -def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [AllElementTypesMatch<["value", "TensorDesc"]>]> { - let summary = "stores a n-D block register region back to memory, currently only supports 2D"; - - let description = [{ - StoreNdOp essentially mimics the hardware block write instruction io - write a block of data from register into the memory region as described - by the TensorDesc. It takes a set of optional cache hints for each level - of cache, L1, L2 and L3. If hardware does not have a correspoding cache, - Corresponding cache hint attribute will be masked. - - Example: - ```mlir - xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} - : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> - ``` - - - }]; - - let arguments = (ins XeGPU_ValueType: $value, - XeGPU_TensorDesc: $TensorDesc, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint); - - let extraClassDeclaration = extraBaseClassDeclaration # [{ - VectorType getValueType() { - return llvm::dyn_cast(getValue().getType()); - } +def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ + AllElementTypesMatch<["value", "TensorDesc"]>, + DeclareOpInterfaceMethods + ]> { + let summary = "stores a n-D block register region back to memory, currently only supports 2D"; + + let description = [{ + StoreNdOp essentially mimics the hardware block write instruction io + write a block of data from register into the memory region as described + by the TensorDesc. It takes a set of optional cache hints for each level + of cache, L1, L2 and L3. If hardware does not have a correspoding cache, + Corresponding cache hint attribute will be masked. + + Example: + ```mlir + xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint, + l2_hint = #xegpu.cache_hint, + l3_hint = #xegpu.cache_hint} + : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> + ``` + + + }]; + + let arguments = (ins XeGPU_ValueType: $value, + XeGPU_TensorDesc: $TensorDesc, + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint); + + let extraClassDeclaration = extraBaseClassDeclaration # [{ + VectorType getValueType() { + return llvm::dyn_cast(getValue().getType()); + } - xegpu::TensorDescType getTensorDescType() { - return getTensorDesc().getType(); - } - }]; + xegpu::TensorDescType getTensorDescType() { + return getTensorDesc().getType(); + } + }]; - let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict - `:` type($value) `,` qualified(type($TensorDesc))}]; - let hasVerifier = 1; + let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict + `:` type($value) `,` qualified(type($TensorDesc))}]; + let hasVerifier = 1; } def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset", @@ -548,131 +554,138 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { let hasVerifier = 1; } -def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllElementTypesMatch<["value", "TensorDesc"]>]> { - let summary = "load a set of scattered data points from memory."; - - let description = [{ It (aka. load) load data per each work-item. The output - describes the data being loaded at the subgroup level, so its size is - consistent with the number of work-items in a subgroup. When the chunk size - is larger than 2, the output vector is a 2D vector, with dim-1 correspoding - to work-items, and dim-0 corresponding to the chunk size loaded by each work-item. - Specially, there is a transpose effect on the result (as compared to the TensorDesc) - due to the hardware implementation. Therefore, a transpose attribute is introduced - on purpose, making sure users are aware of this implicit transformation. - - The mask operand masks out memory access so that it is safe to pass out-of-boundary - addresses/offsets as long as they are masked. It applies to slots of SIMD lanes. - - Example 1: - ```mlir - %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} - : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, - vector<16xi1> -> vector<16xf32> - ``` +def XeGPU_LoadGatherOp : XeGPU_Op<"load", [ + AllElementTypesMatch<["value", "TensorDesc"]>, + DeclareOpInterfaceMethods + ]> { + let summary = "load a set of scattered data points from memory."; + + let description = [{ It (aka. load) load data per each work-item. The output + describes the data being loaded at the subgroup level, so its size is + consistent with the number of work-items in a subgroup. When the chunk size + is larger than 2, the output vector is a 2D vector, with dim-1 correspoding + to work-items, and dim-0 corresponding to the chunk size loaded by each work-item. + Specially, there is a transpose effect on the result (as compared to the TensorDesc) + due to the hardware implementation. Therefore, a transpose attribute is introduced + on purpose, making sure users are aware of this implicit transformation. + + The mask operand masks out memory access so that it is safe to pass out-of-boundary + addresses/offsets as long as they are masked. It applies to slots of SIMD lanes. + + Example 1: + ```mlir + %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint, + l2_hint = #xegpu.cache_hint, + l3_hint = #xegpu.cache_hint} + : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, + vector<16xi1> -> vector<16xf32> + ``` - Example 2: - ```mlir - %2 = xegpu.load %1, %0 {transpose, - l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} - : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>, - vector<16xi1> -> vector<8x16xf32> - ``` + Example 2: + ```mlir + %2 = xegpu.load %1, %0 {transpose, + l1_hint = #xegpu.cache_hint, + l2_hint = #xegpu.cache_hint, + l3_hint = #xegpu.cache_hint} + : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>, + vector<16xi1> -> vector<8x16xf32> + ``` - }]; + }]; - let arguments = (ins XeGPU_TensorDesc: $TensorDesc, - XeGPU_MaskType: $mask, - OptionalAttr: $transpose, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint); - let results = (outs XeGPU_ValueType: $value); + let arguments = (ins XeGPU_TensorDesc: $TensorDesc, + XeGPU_MaskType: $mask, + OptionalAttr: $transpose, + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint); + let results = (outs XeGPU_ValueType: $value); - let extraClassDeclaration = extraBaseClassDeclaration # [{ - xegpu::TensorDescType getTensorDescType() { - return getTensorDesc().getType(); - } + let extraClassDeclaration = extraBaseClassDeclaration # [{ + xegpu::TensorDescType getTensorDescType() { + return getTensorDesc().getType(); + } - mlir::Type getElementType() { - auto type = getValue().getType(); - return getElementTypeOrSelf(type); - } + mlir::Type getElementType() { + auto type = getValue().getType(); + return getElementTypeOrSelf(type); + } - Type getValueType() { - return getValue().getType(); - } + Type getValueType() { + return getValue().getType(); + } - Type getMaskType() { - return getMask().getType(); - } + Type getMaskType() { + return getMask().getType(); + } - }]; + }]; - let assemblyFormat = [{$TensorDesc `,` $mask prop-dict attr-dict - `:` qualified(type($TensorDesc)) `,` type($mask) `->` type($value)}]; + let assemblyFormat = [{$TensorDesc `,` $mask prop-dict attr-dict + `:` qualified(type($TensorDesc)) `,` type($mask) `->` type($value)}]; - let hasVerifier = 1; + let hasVerifier = 1; } -def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementTypesMatch<["value", "TensorDesc"]>]> { - let summary = "store data to scattered memory locations."; - let description = [{ It (aka. store) stores data to scattered memory locations. The value is - typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be - a 2D vector instead. For the later case, dim-1 of the value correspods to the simd lanes - and the dim-0 of the value corresponds to the chunk size stored per lane. So `store_scatter` - has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is - introduced on purpose, making sure users are aware of this implicit transformation. - - Example 1: - ```mlir - %3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} - : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1> - ``` +def XeGPU_StoreScatterOp : XeGPU_Op<"store", + [ + AllElementTypesMatch<["value", "TensorDesc"]>, + DeclareOpInterfaceMethods + ]> { + let summary = "store data to scattered memory locations."; + let description = [{ It (aka. store) stores data to scattered memory locations. The value is + typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be + a 2D vector instead. For the later case, dim-1 of the value correspods to the simd lanes + and the dim-0 of the value corresponds to the chunk size stored per lane. So `store_scatter` + has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is + introduced on purpose, making sure users are aware of this implicit transformation. + + Example 1: + ```mlir + %3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint, + l2_hint = #xegpu.cache_hint, + l3_hint = #xegpu.cache_hint} + : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1> + ``` - Example 2: - ```mlir - %3 = xegpu.store %0, %1, %2 {transpose, - l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} - : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr>, vector<16xi1> - ``` + Example 2: + ```mlir + %3 = xegpu.store %0, %1, %2 {transpose, + l1_hint = #xegpu.cache_hint, + l2_hint = #xegpu.cache_hint, + l3_hint = #xegpu.cache_hint} + : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr>, vector<16xi1> + ``` - }]; + }]; - let arguments = (ins - XeGPU_ValueType: $value, - XeGPU_TensorDesc: $TensorDesc, - XeGPU_MaskType: $mask, - OptionalAttr: $transpose, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint); + let arguments = (ins + XeGPU_ValueType: $value, + XeGPU_TensorDesc: $TensorDesc, + XeGPU_MaskType: $mask, + OptionalAttr: $transpose, + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint); - let extraClassDeclaration = extraBaseClassDeclaration # [{ - xegpu::TensorDescType getTensorDescType() { - return getTensorDesc().getType(); - } + let extraClassDeclaration = extraBaseClassDeclaration # [{ + xegpu::TensorDescType getTensorDescType() { + return getTensorDesc().getType(); + } - Type getValueType() { - return getValue().getType(); - } + Type getValueType() { + return getValue().getType(); + } - Type getMaskType() { - return getMask().getType(); - } - }]; + Type getMaskType() { + return getMask().getType(); + } + }]; - let assemblyFormat = [{$value `,` $TensorDesc `,` $mask prop-dict attr-dict - `:` type($value) `,` qualified(type($TensorDesc)) `,` type($mask)}]; + let assemblyFormat = [{$value `,` $TensorDesc `,` $mask prop-dict attr-dict + `:` type($value) `,` qualified(type($TensorDesc)) `,` type($mask)}]; - let hasVerifier = 1; + let hasVerifier = 1; } def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset", diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index cd883baa986b8..d015e5772a94f 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -324,6 +324,12 @@ LogicalResult LoadNdOp::verify() { return success(); } +void LoadNdOp::getEffects( + SmallVectorImpl> + &effects) { + effects.emplace_back(MemoryEffects::Read::get()); +} + //===----------------------------------------------------------------------===// // XeGPU_StoreNdOp //===----------------------------------------------------------------------===// @@ -361,6 +367,12 @@ LogicalResult StoreNdOp::verify() { return success(); } +void StoreNdOp::getEffects( + SmallVectorImpl> + &effects) { + effects.emplace_back(MemoryEffects::Write::get()); +} + //===----------------------------------------------------------------------===// // XeGPU_UpdateNDOffsetOp //===----------------------------------------------------------------------===// @@ -494,7 +506,7 @@ LogicalResult PrefetchOp::verify() { } //===----------------------------------------------------------------------===// -// XeGPU_LoadGatherOp +// XeGPU_jrOp //===----------------------------------------------------------------------===// LogicalResult LoadGatherOp::verify() { auto tdescTy = getTensorDescType(); @@ -553,6 +565,12 @@ LogicalResult LoadGatherOp::verify() { return success(); } +void LoadGatherOp::getEffects( + SmallVectorImpl> + &effects) { + effects.emplace_back(MemoryEffects::Read::get()); +} + //===----------------------------------------------------------------------===// // XeGPU_StoreScatterOp //===----------------------------------------------------------------------===// @@ -605,6 +623,12 @@ LogicalResult StoreScatterOp::verify() { return success(); } +void StoreScatterOp::getEffects( + SmallVectorImpl> + &effects) { + effects.emplace_back(MemoryEffects::Write::get()); +} + //===----------------------------------------------------------------------===// // XeGPU_UpdateOffsetOp //===----------------------------------------------------------------------===// From 1be0ae30b85e6270756e41fbc939edc3f42ba7e9 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 31 Jan 2025 23:18:30 +0000 Subject: [PATCH 2/3] add mem side effects interface --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 390 +++++++++--------- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 2 +- 2 files changed, 194 insertions(+), 198 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index d98aa9ffb26f1..0ff723005d435 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -277,102 +277,101 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ - AllElementTypesMatch<["value", "TensorDesc"]>, - DeclareOpInterfaceMethods + AllElementTypesMatch<["value", "TensorDesc"]>, + DeclareOpInterfaceMethods ]> { - let summary = "loads a n-D block from memory (represented by TensorDesc)" - "to registers (represented by vector)"; - let description = [{ - LoadNdOp essentially mimics the hardware block read instruction to read - a block of data from memory to register. It takes a set of optional cache - hints for each level of cache, L1, L2 and L3. If hardware does not have a - correspoding cache, Corresponding cache hint attribute will be masked. - VNNI transformation is an hardware feature for Intel GPU, which is used to - do data packing during the load for B operand of matrix operation, if - the bit width of the data type is less then 32 bits, e.g., fp16. And - transpose is another Intel hardware feature, which will do transpose - operation when loading the data if the bit width of the data type is - fp32 or fp64. It implies that vnni and transpose cannot exit at the - same time. - - Example: - ```mlir - xegpu.load_nd %1 {transpose = [1, 0], - l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} - : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32> - ``` - - - }]; - - let arguments = (ins XeGPU_TensorDesc: $TensorDesc, - OptionalAttr: $packed, - OptionalAttr: $transpose, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint); - - let results = (outs XeGPU_ValueType: $value); - - let extraClassDeclaration = extraBaseClassDeclaration # [{ - VectorType getType() { - return llvm::dyn_cast(getValue().getType()); - } + let summary = "loads a n-D block from memory (represented by TensorDesc)" + "to registers (represented by vector)"; + let description = [{ + LoadNdOp essentially mimics the hardware block read instruction to read + a block of data from memory to register. It takes a set of optional cache + hints for each level of cache, L1, L2 and L3. If hardware does not have a + correspoding cache, Corresponding cache hint attribute will be masked. + VNNI transformation is an hardware feature for Intel GPU, which is used to + do data packing during the load for B operand of matrix operation, if + the bit width of the data type is less then 32 bits, e.g., fp16. And + transpose is another Intel hardware feature, which will do transpose + operation when loading the data if the bit width of the data type is + fp32 or fp64. It implies that vnni and transpose cannot exit at the + same time. - xegpu::TensorDescType getTensorDescType() { - return getTensorDesc().getType(); - } - }]; + Example: + ```mlir + xegpu.load_nd %1 {transpose = [1, 0], + l1_hint = #xegpu.cache_hint, + l2_hint = #xegpu.cache_hint, + l3_hint = #xegpu.cache_hint} + : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32> + ``` + + + }]; + + let arguments = (ins XeGPU_TensorDesc: $TensorDesc, + OptionalAttr: $packed, + OptionalAttr: $transpose, + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint); + + let results = (outs XeGPU_ValueType: $value); + + let extraClassDeclaration = extraBaseClassDeclaration # [{ + VectorType getType() { + return llvm::dyn_cast(getValue().getType()); + } - let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)"; - let hasVerifier = 1; + xegpu::TensorDescType getTensorDescType() { + return getTensorDesc().getType(); + } + }]; + + let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)"; + let hasVerifier = 1; } def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ - AllElementTypesMatch<["value", "TensorDesc"]>, - DeclareOpInterfaceMethods + AllElementTypesMatch<["value", "TensorDesc"]>, DeclareOpInterfaceMethods, ]> { - let summary = "stores a n-D block register region back to memory, currently only supports 2D"; - - let description = [{ - StoreNdOp essentially mimics the hardware block write instruction io - write a block of data from register into the memory region as described - by the TensorDesc. It takes a set of optional cache hints for each level - of cache, L1, L2 and L3. If hardware does not have a correspoding cache, - Corresponding cache hint attribute will be masked. - - Example: - ```mlir - xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} - : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> - ``` - - - }]; - - let arguments = (ins XeGPU_ValueType: $value, - XeGPU_TensorDesc: $TensorDesc, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint); - - let extraClassDeclaration = extraBaseClassDeclaration # [{ - VectorType getValueType() { - return llvm::dyn_cast(getValue().getType()); - } + let summary = "stores a n-D block register region back to memory, currently only supports 2D"; - xegpu::TensorDescType getTensorDescType() { - return getTensorDesc().getType(); - } - }]; + let description = [{ + StoreNdOp essentially mimics the hardware block write instruction io + write a block of data from register into the memory region as described + by the TensorDesc. It takes a set of optional cache hints for each level + of cache, L1, L2 and L3. If hardware does not have a correspoding cache, + Corresponding cache hint attribute will be masked. + + Example: + ```mlir + xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint, + l2_hint = #xegpu.cache_hint, + l3_hint = #xegpu.cache_hint} + : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> + ``` + + + }]; + + let arguments = (ins XeGPU_ValueType: $value, + XeGPU_TensorDesc: $TensorDesc, + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint); + + let extraClassDeclaration = extraBaseClassDeclaration # [{ + VectorType getValueType() { + return llvm::dyn_cast(getValue().getType()); + } + + xegpu::TensorDescType getTensorDescType() { + return getTensorDesc().getType(); + } + }]; - let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict - `:` type($value) `,` qualified(type($TensorDesc))}]; - let hasVerifier = 1; + let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict + `:` type($value) `,` qualified(type($TensorDesc))}]; + let hasVerifier = 1; } def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset", @@ -555,137 +554,134 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { } def XeGPU_LoadGatherOp : XeGPU_Op<"load", [ - AllElementTypesMatch<["value", "TensorDesc"]>, - DeclareOpInterfaceMethods + AllElementTypesMatch<["value", "TensorDesc"]>, + DeclareOpInterfaceMethods ]> { - let summary = "load a set of scattered data points from memory."; - - let description = [{ It (aka. load) load data per each work-item. The output - describes the data being loaded at the subgroup level, so its size is - consistent with the number of work-items in a subgroup. When the chunk size - is larger than 2, the output vector is a 2D vector, with dim-1 correspoding - to work-items, and dim-0 corresponding to the chunk size loaded by each work-item. - Specially, there is a transpose effect on the result (as compared to the TensorDesc) - due to the hardware implementation. Therefore, a transpose attribute is introduced - on purpose, making sure users are aware of this implicit transformation. - - The mask operand masks out memory access so that it is safe to pass out-of-boundary - addresses/offsets as long as they are masked. It applies to slots of SIMD lanes. - - Example 1: - ```mlir - %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} - : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, - vector<16xi1> -> vector<16xf32> - ``` + let summary = "load a set of scattered data points from memory."; + + let description = [{ It (aka. load) load data per each work-item. The output + describes the data being loaded at the subgroup level, so its size is + consistent with the number of work-items in a subgroup. When the chunk size + is larger than 2, the output vector is a 2D vector, with dim-1 correspoding + to work-items, and dim-0 corresponding to the chunk size loaded by each work-item. + Specially, there is a transpose effect on the result (as compared to the TensorDesc) + due to the hardware implementation. Therefore, a transpose attribute is introduced + on purpose, making sure users are aware of this implicit transformation. + + The mask operand masks out memory access so that it is safe to pass out-of-boundary + addresses/offsets as long as they are masked. It applies to slots of SIMD lanes. + + Example 1: + ```mlir + %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint, + l2_hint = #xegpu.cache_hint, + l3_hint = #xegpu.cache_hint} + : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, + vector<16xi1> -> vector<16xf32> + ``` - Example 2: - ```mlir - %2 = xegpu.load %1, %0 {transpose, - l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} - : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>, - vector<16xi1> -> vector<8x16xf32> - ``` + Example 2: + ```mlir + %2 = xegpu.load %1, %0 {transpose, + l1_hint = #xegpu.cache_hint, + l2_hint = #xegpu.cache_hint, + l3_hint = #xegpu.cache_hint} + : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>, + vector<16xi1> -> vector<8x16xf32> + ``` - }]; + }]; - let arguments = (ins XeGPU_TensorDesc: $TensorDesc, - XeGPU_MaskType: $mask, - OptionalAttr: $transpose, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint); - let results = (outs XeGPU_ValueType: $value); + let arguments = (ins XeGPU_TensorDesc: $TensorDesc, + XeGPU_MaskType: $mask, + OptionalAttr: $transpose, + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint); + let results = (outs XeGPU_ValueType: $value); - let extraClassDeclaration = extraBaseClassDeclaration # [{ - xegpu::TensorDescType getTensorDescType() { - return getTensorDesc().getType(); - } + let extraClassDeclaration = extraBaseClassDeclaration # [{ + xegpu::TensorDescType getTensorDescType() { + return getTensorDesc().getType(); + } - mlir::Type getElementType() { - auto type = getValue().getType(); - return getElementTypeOrSelf(type); - } + mlir::Type getElementType() { + auto type = getValue().getType(); + return getElementTypeOrSelf(type); + } - Type getValueType() { - return getValue().getType(); - } + Type getValueType() { + return getValue().getType(); + } - Type getMaskType() { - return getMask().getType(); - } + Type getMaskType() { + return getMask().getType(); + } - }]; + }]; - let assemblyFormat = [{$TensorDesc `,` $mask prop-dict attr-dict - `:` qualified(type($TensorDesc)) `,` type($mask) `->` type($value)}]; + let assemblyFormat = [{$TensorDesc `,` $mask prop-dict attr-dict + `:` qualified(type($TensorDesc)) `,` type($mask) `->` type($value)}]; - let hasVerifier = 1; + let hasVerifier = 1; } -def XeGPU_StoreScatterOp : XeGPU_Op<"store", - [ - AllElementTypesMatch<["value", "TensorDesc"]>, - DeclareOpInterfaceMethods - ]> { - let summary = "store data to scattered memory locations."; - let description = [{ It (aka. store) stores data to scattered memory locations. The value is - typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be - a 2D vector instead. For the later case, dim-1 of the value correspods to the simd lanes - and the dim-0 of the value corresponds to the chunk size stored per lane. So `store_scatter` - has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is - introduced on purpose, making sure users are aware of this implicit transformation. - - Example 1: - ```mlir - %3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} - : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1> - ``` +def XeGPU_StoreScatterOp : XeGPU_Op<"store", [ + AllElementTypesMatch<["value", "TensorDesc"]>, DeclareOpInterfaceMethods]> { + let summary = "store data to scattered memory locations."; + let description = [{ It (aka. store) stores data to scattered memory locations. The value is + typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be + a 2D vector instead. For the later case, dim-1 of the value correspods to the simd lanes + and the dim-0 of the value corresponds to the chunk size stored per lane. So `store_scatter` + has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is + introduced on purpose, making sure users are aware of this implicit transformation. + + Example 1: + ```mlir + %3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint, + l2_hint = #xegpu.cache_hint, + l3_hint = #xegpu.cache_hint} + : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1> + ``` - Example 2: - ```mlir - %3 = xegpu.store %0, %1, %2 {transpose, - l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} - : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr>, vector<16xi1> - ``` + Example 2: + ```mlir + %3 = xegpu.store %0, %1, %2 {transpose, + l1_hint = #xegpu.cache_hint, + l2_hint = #xegpu.cache_hint, + l3_hint = #xegpu.cache_hint} + : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr>, vector<16xi1> + ``` - }]; + }]; - let arguments = (ins - XeGPU_ValueType: $value, - XeGPU_TensorDesc: $TensorDesc, - XeGPU_MaskType: $mask, - OptionalAttr: $transpose, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint); + let arguments = (ins + XeGPU_ValueType: $value, + XeGPU_TensorDesc: $TensorDesc, + XeGPU_MaskType: $mask, + OptionalAttr: $transpose, + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint); - let extraClassDeclaration = extraBaseClassDeclaration # [{ - xegpu::TensorDescType getTensorDescType() { - return getTensorDesc().getType(); - } + let extraClassDeclaration = extraBaseClassDeclaration # [{ + xegpu::TensorDescType getTensorDescType() { + return getTensorDesc().getType(); + } - Type getValueType() { - return getValue().getType(); - } + Type getValueType() { + return getValue().getType(); + } - Type getMaskType() { - return getMask().getType(); - } - }]; + Type getMaskType() { + return getMask().getType(); + } + }]; - let assemblyFormat = [{$value `,` $TensorDesc `,` $mask prop-dict attr-dict - `:` type($value) `,` qualified(type($TensorDesc)) `,` type($mask)}]; + let assemblyFormat = [{$value `,` $TensorDesc `,` $mask prop-dict attr-dict + `:` type($value) `,` qualified(type($TensorDesc)) `,` type($mask)}]; - let hasVerifier = 1; + let hasVerifier = 1; } def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset", diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index d015e5772a94f..443a1347334e2 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -506,7 +506,7 @@ LogicalResult PrefetchOp::verify() { } //===----------------------------------------------------------------------===// -// XeGPU_jrOp +// XeGPU_LoadGatherOp //===----------------------------------------------------------------------===// LogicalResult LoadGatherOp::verify() { auto tdescTy = getTensorDescType(); From ffae0295ea78ce2ae00af83227063edeb04e4f20 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 3 Feb 2025 16:48:54 +0000 Subject: [PATCH 3/3] add mem side effects interface --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 11 ++++----- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 24 ------------------- 2 files changed, 5 insertions(+), 30 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 0ff723005d435..7560ede058faa 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -277,8 +277,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ - AllElementTypesMatch<["value", "TensorDesc"]>, - DeclareOpInterfaceMethods + AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemRead]> ]> { let summary = "loads a n-D block from memory (represented by TensorDesc)" "to registers (represented by vector)"; @@ -331,7 +330,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ } def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ - AllElementTypesMatch<["value", "TensorDesc"]>, DeclareOpInterfaceMethods, + AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemWrite]> ]> { let summary = "stores a n-D block register region back to memory, currently only supports 2D"; @@ -554,8 +553,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { } def XeGPU_LoadGatherOp : XeGPU_Op<"load", [ - AllElementTypesMatch<["value", "TensorDesc"]>, - DeclareOpInterfaceMethods + AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemRead]> ]> { let summary = "load a set of scattered data points from memory."; @@ -627,7 +625,8 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [ } def XeGPU_StoreScatterOp : XeGPU_Op<"store", [ - AllElementTypesMatch<["value", "TensorDesc"]>, DeclareOpInterfaceMethods]> { + AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemWrite]> + ]> { let summary = "store data to scattered memory locations."; let description = [{ It (aka. store) stores data to scattered memory locations. The value is typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 443a1347334e2..cd883baa986b8 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -324,12 +324,6 @@ LogicalResult LoadNdOp::verify() { return success(); } -void LoadNdOp::getEffects( - SmallVectorImpl> - &effects) { - effects.emplace_back(MemoryEffects::Read::get()); -} - //===----------------------------------------------------------------------===// // XeGPU_StoreNdOp //===----------------------------------------------------------------------===// @@ -367,12 +361,6 @@ LogicalResult StoreNdOp::verify() { return success(); } -void StoreNdOp::getEffects( - SmallVectorImpl> - &effects) { - effects.emplace_back(MemoryEffects::Write::get()); -} - //===----------------------------------------------------------------------===// // XeGPU_UpdateNDOffsetOp //===----------------------------------------------------------------------===// @@ -565,12 +553,6 @@ LogicalResult LoadGatherOp::verify() { return success(); } -void LoadGatherOp::getEffects( - SmallVectorImpl> - &effects) { - effects.emplace_back(MemoryEffects::Read::get()); -} - //===----------------------------------------------------------------------===// // XeGPU_StoreScatterOp //===----------------------------------------------------------------------===// @@ -623,12 +605,6 @@ LogicalResult StoreScatterOp::verify() { return success(); } -void StoreScatterOp::getEffects( - SmallVectorImpl> - &effects) { - effects.emplace_back(MemoryEffects::Write::get()); -} - //===----------------------------------------------------------------------===// // XeGPU_UpdateOffsetOp //===----------------------------------------------------------------------===//