diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 7ac0cf77fe59b..d6c51d20571fd 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -11,6 +11,7 @@
 #include "mlir/Bytecode/BytecodeOpInterface.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/TypeUtilities.h"
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index e24a056de2caf..239ce0aa8e003 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -424,9 +424,9 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
     It accepts the following parameters:

     * source: a 1D memref or pointer (uint64_t) represents the flattened memory object.
-    * offsets: a array containing offsets of each access point. Its size
+    * offsets: a vector containing offsets of each access point. Its size
       is fixed to the hardware supportted subgroup size, e.g., 16 on PVC,
-      implying each element in the array corresponds to a work-item (SIMT lane)
+      implying each element in the vector corresponds to a work-item (SIMT lane)
       in the subgroup.

     The first dimension of the result TensorDesc corresponds to work-items, so it should
@@ -436,56 +436,59 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
    Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
    ```mlir
    %a = memref.alloc() : memref<1024xf32>
-   %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32>
+   %0 = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
+   %1 = xegpu.create_tdesc %a, %0: memref<1024xf32>, vector<4xindex> -> TensorDesc<4xf32>
    ```

    Example 2. It assumes subgroup size is 4, and each workitem access 8 elements.
               It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71]
    ```mlir
    %0 = memref.alloc() : memref<1024xf32>
-   %1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>
+   %off = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
+   %1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
+         -> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>>
    ```

    Example 3. It is similar to Example 2, but there is some overlaps among workitems.
               It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
    ```mlir
    %0 = memref.alloc() : memref<1024xf32>
-   %1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>>
+   %off = arith.constant dense<[0, 4, 8, 12]> : vector<4xindex>
+   %1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
+         -> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>>
    ```
   }];

   let arguments = (ins XeGPU_BaseAddrType: $source,
-                       Variadic<Index>: $offsets,
-                       DenseI64ArrayAttr: $const_offsets);
+                       XeGPU_OffsetType: $offsets);
   let results = (outs XeGPU_TensorDesc:$TensorDesc);

+  let builders = [
+    OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "mlir::Value": $source,
+                   "llvm::ArrayRef<OpFoldResult>": $offsets)>,
+    OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "mlir::Value": $source,
+                   "llvm::ArrayRef<int64_t>": $offsets)>,
+  ];
+
   let assemblyFormat = [{
-    $source
-    custom<DynamicIndexList>($offsets, $const_offsets)
-    attr-dict `:` type($source) `->` qualified(type($TensorDesc))
+    $source `,` $offsets attr-dict `:` type($source) `,` type($offsets) `->` qualified(type($TensorDesc))
   }];

-  let extraClassDeclaration = extraBaseClassDeclaration # [{
+  let extraClassDeclaration = [{
     xegpu::TensorDescType getTensorDescType() {
       return getTensorDesc().getType();
     }

-    SmallVector<OpFoldResult> getMixedOffsets() {
-      Builder b(getContext());
-      return getMixedValues(getConstOffsets(), getOffsets(), b);
+    mlir::VectorType getOffsetsType() {
+      return getOffsets().getType();
     }

     size_t getNumOffsets() {
-      return getMixedOffsets().size();
+      return getOffsetsType().getNumElements();
     }

     mlir::Value getViewSource() { return getSource(); }

-    OpFoldResult getOffset(unsigned idx) {
-      assert(idx < getNumOffsets() && "Invalid out of bound access.");
-      return getMixedOffsets()[idx];
-    }
-
     unsigned getSourceMemorySpace() {
       auto srcTy = getSource().getType();
       if (auto memrefTy = llvm::dyn_cast<MemRefType>(srcTy)) {
@@ -550,7 +553,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]
     describes the data being loaded at the subgroup level, so its size is
     consistent with the number of work-items in a subgroup. When the chunk size
     is larger than 2, the output vector is a 2D vector, with dim-1 correspoding
-    to work-items, and dim-0 corresponding to the chunk_size loaded by each work-item.
+    to work-items, and dim-0 corresponding to the chunk size loaded by each work-item.
     Specially, there is a transpose effect on the result (as compared to the TensorDesc)
     due to the hardware implementation. Therefore, a transpose attribute is introduced
     on purpose, making sure users are aware of this implicit transformation.
@@ -558,16 +561,25 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]
     The mask operand masks out memory access so that it is safe to pass out-of-boundary
     addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
-    Example:
+    Example 1:
   ```mlir
-    %2 = xegpu.load %1, %0 {transpose,
-                            l1_hint = #xegpu.cache_hint,
+    %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint,
                             l2_hint = #xegpu.cache_hint,
                             l3_hint = #xegpu.cache_hint}
           : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>,
             vector<16xi1> -> vector<16xf32>
   ```

+    Example 2:
+  ```mlir
+    %2 = xegpu.load %1, %0 {transpose,
+                            l1_hint = #xegpu.cache_hint,
+                            l2_hint = #xegpu.cache_hint,
+                            l3_hint = #xegpu.cache_hint}
+          : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>,
+            vector<16xi1> -> vector<8x16xf32>
+  ```
+
   }];

   let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
@@ -610,17 +622,27 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementCountsMatch<["value", "T
   let description = [{
     It (aka. store) stores data to scattered memory locations. The value is
     typically a 1D vector. But when the chunk size of the TensorDesc is larger
     than 1, it will be a 2D vector instead. For the later case, dim-1 of the value correspods to the simd lanes
-    and the dim-0 of the value corresponds to the chunk_size stored per lane. So `store_scatter`
+    and the dim-0 of the value corresponds to the chunk size stored per lane. So `store_scatter`
     has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
     introduced on purpose, making sure users are aware of this implicit transformation.

-    Example:
+    Example 1:
   ```mlir
     %3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint,
                                  l2_hint = #xegpu.cache_hint,
                                  l3_hint = #xegpu.cache_hint}
-          : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1>
+          : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1>
+  ```
+
+    Example 2:
+  ```mlir
+    %3 = xegpu.store %0, %1, %2 {transpose,
+                                 l1_hint = #xegpu.cache_hint,
+                                 l2_hint = #xegpu.cache_hint,
+                                 l3_hint = #xegpu.cache_hint}
+          : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr>, vector<16xi1>
   ```
+
   }];

   let arguments = (ins
@@ -666,40 +688,39 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
     Example:
   ```mlir
-      %2 = xegpu.update_offset %1, [32, 32, 32, 32]
-          : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>
+      %off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
+      %2 = xegpu.update_offset %1, %off :
+          !xegpu.tensor_desc<4x2xf32, #xegpu.scattered_tdesc_attr<>>, vector<4xindex>
   ```
   }];

   let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
-                       Variadic<Index>: $offsets,
-                       DenseI64ArrayAttr: $const_offsets);
+                       XeGPU_OffsetType: $offsets);
   let results = (outs XeGPU_TensorDesc: $result);

-  let extraClassDeclaration = extraBaseClassDeclaration # [{
+  let builders = [
+    OpBuilder<(ins "mlir::Value": $TensorDesc,
+                   "llvm::ArrayRef<OpFoldResult>": $offsets)>,
+    OpBuilder<(ins "mlir::Value": $TensorDesc,
+                   "llvm::ArrayRef<int64_t>": $offsets)>
+  ];
+
+  let extraClassDeclaration = [{
     xegpu::TensorDescType getTensorDescType() {
       return getTensorDesc().getType();
     }

-    SmallVector<OpFoldResult> getMixedOffsets() {
-      Builder b(getContext());
-      return getMixedValues(getConstOffsets(), getOffsets(), b);
+    mlir::VectorType getOffsetsType() {
+      return getOffsets().getType();
     }

     size_t getNumOffsets() {
-      return getMixedOffsets().size();
-    }
-
-    OpFoldResult getOffset(unsigned idx) {
-      assert(idx < getNumOffsets() && "Invalid out of bound access.");
-      return getMixedOffsets()[idx];
+      return getOffsetsType().getNumElements();
     }
   }];

   let assemblyFormat = [{
-    $TensorDesc `,`
-    custom<DynamicIndexList>($offsets, $const_offsets)
-    attr-dict `:` qualified(type($TensorDesc))
+    $TensorDesc `,` $offsets attr-dict `:` qualified(type($TensorDesc)) `,`
+    type($offsets)
   }];
 }
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 1a7a6b3478409..5bd3c370e3859 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//

+#include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/IR/Builders.h"
@@ -308,6 +309,24 @@ LogicalResult UpdateNdOffsetOp::verify() {
 // XeGPU_CreateDescOp
 //===----------------------------------------------------------------------===//

+void CreateDescOp::build(OpBuilder &builder, OperationState &state,
+                         TensorDescType TensorDesc, Value source,
+                         llvm::ArrayRef<OpFoldResult> offsets) {
+  auto loc = source.getLoc();
+  int64_t size = static_cast<int64_t>(offsets.size());
+  auto type = VectorType::get(size, builder.getIndexType());
+  auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets);
+  auto offset = builder.create<vector::FromElementsOp>(loc, type, values);
+  build(builder, state, TensorDesc, source, offset);
+}
+
+void CreateDescOp::build(OpBuilder &builder, OperationState &state,
+                         TensorDescType TensorDesc, Value source,
+                         llvm::ArrayRef<int64_t> offsets) {
+  auto ofrs = getAsIndexOpFoldResult(builder.getContext(), offsets);
+  build(builder, state, TensorDesc, source, ofrs);
+}
+
 LogicalResult CreateDescOp::verify() {
   auto tdescTy = getTensorDescType();

@@ -473,6 +492,29 @@ LogicalResult StoreScatterOp::verify() {
   return success();
 }
+
+//===----------------------------------------------------------------------===//
+// XeGPU_UpdateOffsetOp
+//===----------------------------------------------------------------------===//
+void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
+                           mlir::Value tensorDesc,
+                           llvm::ArrayRef<OpFoldResult> offsets) {
+  auto tdescTy = mlir::dyn_cast<TensorDescType>(tensorDesc.getType());
+  assert(tdescTy && "Expecting the source is a TensorDescType value.");
+  auto loc = tensorDesc.getLoc();
+  int64_t size = static_cast<int64_t>(offsets.size());
+  auto type = VectorType::get({size}, builder.getIndexType());
+  auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets);
+  auto offset = builder.create<vector::FromElementsOp>(loc, type, values);
+  build(builder, state, tdescTy, tensorDesc, offset);
+}
+
+void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
+                           Value tensorDesc, llvm::ArrayRef<int64_t> offsets) {
+  auto ofrs = getAsIndexOpFoldResult(builder.getContext(), offsets);
+  build(builder, state, tensorDesc, ofrs);
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_DpasOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
index c1126efb6046d..6db57aad773aa 100644
--- a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
+++ b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
@@ -104,22 +104,28 @@ gpu.func @test_create_update_nd_tdesc_vc(%src: memref<24x32xf32>) {

 // CHECK: gpu.func @test_create_tdesc_vc(%[[arg0:.*]]: ui64) {
 gpu.func @test_create_tdesc_vc(%src: ui64) {
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
-  %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
+  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
+  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
   gpu.return
 }

 // CHECK: gpu.func @test_create_tdesc_vc_1(%[[arg0:.*]]: memref) {
 gpu.func @test_create_tdesc_vc_1(%src: memref) {
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] : memref -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
-  %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : memref -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
+  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
+  %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
   gpu.return
 }

 // CHECK: gpu.func @test_prefetch_vc(%[[arg0:.*]]: ui64) {
 gpu.func @test_prefetch_vc(%src: ui64) {
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
-  %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
+  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
+  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
   // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
   xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
   gpu.return
@@ -129,8 +135,10 @@ gpu.func @test_prefetch_vc(%src: ui64) {
 gpu.func @test_load_gather_vc(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<true> : vector<4xi1>
   %0 = arith.constant dense<1>: vector<4xi1>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
-  %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
+  //CHECK: %[[c2:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+  %c = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[c2]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
+  %1 = xegpu.create_tdesc %src, %c : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
   //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}>
   //CHECK-SAME: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<2x4xf32>
   %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}>
@@ -144,8 +152,10 @@ gpu.func @test_store_scatter_vc(%src: ui64) {
   %0 = arith.constant dense<1>: vector<4xi1>
   //CHECK: %[[c1:.*]] = arith.constant dense<2.900000e+00> : vector<2x4xf32>
   %1 = arith.constant dense<2.9>: vector<2x4xf32>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
-  %2 = xegpu.create_tdesc %src[0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
+  //CHECK: %[[c2:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+  %c = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[c2]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
+  %2 = xegpu.create_tdesc %src, %c : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
   //CHECK: xegpu.store %[[c1]], %[[R0]], %[[c0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}>
   //CHECK-SAME: vector<2x4xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1>
   xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}>
@@ -155,10 +165,14 @@ gpu.func @test_store_scatter_vc(%src: ui64) {

 // CHECK: gpu.func @test_create_update_tdesc_vc(%[[arg0:.*]]: ui64) {
 gpu.func @test_create_update_tdesc_vc(%src: ui64) {
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
-  %1 = xegpu.create_tdesc %src[0, 8, 16, 24]: ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
-  //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], [32, 32, 32, 32] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
-  %2 = xegpu.update_offset %1, [32, 32, 32, 32] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
+  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
+  //CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
+  //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xindex>
+  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
+  %s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
+  %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xindex>
   gpu.return
 }

@@ -179,8 +193,10 @@ gpu.func @test_dpas_vc_with_packed_b(%a : vector<8x16xf16>, %b: vector<8x16x2xf1

 // CHECK: gpu.func @test_atomic_rmw(%[[arg0:.*]]: ui64, %[[arg1:.*]]: vector<16xf32>, %[[arg2:.*]]: vector<16xi1>)
 gpu.func @test_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) {
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-  %1 = xegpu.create_tdesc %src[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]: ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+  //CHECK: %[[c:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
+  %c = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[c]] : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+  %1 = xegpu.create_tdesc %src, %c: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
   //CHECK: %[[R1:.*]] = xegpu.atomic_rmw addf %[[R0]], %[[arg2]], %[[arg1]] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
   xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
   gpu.return
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 193dae352e370..f8a0d95bd70a2 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -39,8 +39,9 @@ func.func @test_prefetch_nd_vc_1(%src: memref<24x32xf16>) {

 // -----
 func.func @test_prefetch_nd_vc_2(%src: memref<24xf16>) {
-  %1 = xegpu.create_tdesc %src[0, 1, 2, 3, 4, 5, 6, 7]
-          : memref<24xf16> -> !xegpu.tensor_desc<8xf16, #xegpu.scatter_tdesc_attr<>>
+  %0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex>
+  %1 = xegpu.create_tdesc %src, %0 : memref<24xf16>, vector<8xindex>
+          -> !xegpu.tensor_desc<8xf16, #xegpu.scatter_tdesc_attr<>>
   // expected-error@+1 {{Expects a non-scattered TensorDesc}}
   xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint}>
         : !xegpu.tensor_desc<8xf16, #xegpu.scatter_tdesc_attr<>>
@@ -58,8 +59,9 @@ func.func @test_load_nd_vc_1(%src: memref<8x16xf16>) {

 // -----
 func.func @test_load_nd_vc_2(%src: memref<16xf16>) {
-  %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14]
-          : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr>
+  %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
+  %1 = xegpu.create_tdesc %src, %0 : memref<16xf16>, vector<8xindex>
+          -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr>
   // expected-error@+1 {{Expects a non-scattered TensorDesc.}}
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint}>
       : !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> -> vector<8x2xf16>
@@ -86,9 +88,10 @@ func.func @test_store_nd_vc_1(%dst: memref<24x32xf16>) {

 // -----
 func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
+  %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
   %1 = arith.constant dense<1.0>: vector<8x2xf16>
-  %2 = xegpu.create_tdesc %dst[0, 2, 4, 6, 8, 10, 12, 14]
-          : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr>
+  %2 = xegpu.create_tdesc %dst, %0 : memref<16xf16>, vector<8xindex>
+          -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr>
   // expected-error@+1 {{Expects a non-scattered TensorDesc}}
   xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint}>
         : vector<8x2xf16>, !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr>
@@ -97,8 +100,9 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {

 // -----
 func.func @test_update_nd_offset_1(%dst: memref<16xf16>) {
-  %1 = xegpu.create_tdesc %dst[0, 2, 4, 6, 8, 10, 12, 14]
-          : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr>
+  %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
+  %1 = xegpu.create_tdesc %dst, %0 : memref<16xf16>, vector<8xindex>
+          -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr>
   // expected-error@+1 {{Expects a non-scattered TensorDesc}}
   xegpu.update_nd_offset %1, [0, 2] : !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr>
   return
@@ -106,24 +110,27 @@ func.func @test_update_nd_offset_1(%dst: memref<16xf16>) {

 // -----
 func.func @test_create_tdesc_vc_1(%src: ui64) {
+  %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
   // expected-error@+1 {{Expects a scattered TensorDesc}}
-  %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14]
-          : ui64 -> !xegpu.tensor_desc<8xf16>
+  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<8xindex> -> !xegpu.tensor_desc<8xf16>
   return
 }

 // -----
 func.func @test_create_tdesc_vc_2(%src: ui64) {
+  %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
   // expected-error@+1 {{Incorrect TensorDesc shape}}
-  %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2}
-          : ui64 -> !xegpu.tensor_desc<8x4xf16, #xegpu.scatter_tdesc_attr<>>
+  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<8xindex>
+          -> !xegpu.tensor_desc<8x4xf16, #xegpu.scatter_tdesc_attr<>>
   return
 }

 // -----
 func.func @test_create_tdesc_vc_1(%src: memref) {
+  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   // expected-error@+1 {{Memory space mismatch}}
-  %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : memref -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
+  %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex>
+          -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
   return
 }

@@ -137,7 +144,9 @@ func.func @test_prefetch_vc_1(%src: memref<24x32xf16>) {

 // -----
 func.func @test_prefetch_vc_2(%src: ui64) {
-  %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
+  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex>
+          -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
   // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}}
   xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
   return
@@ -155,8 +164,9 @@ func.func @test_load_gather_vc_1(%src: memref<24x32xf16>) {

 // -----
 func.func @test_load_gather_vc_2(%src: ui64) {
+  %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<1>: vector<4xi1>
-  %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : ui64
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex>
           -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
   // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}}
   %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint}>
@@ -178,10 +188,11 @@ func.func @test_store_scatter_vc_1(%src: memref<24x32xf32>) {

 // -----
 func.func @test_store_scatter_vc_2(%src: ui64) {
+  %cst = arith.constant dense<[0, 8, 16, 24]>: vector<4xindex>
   %0 = arith.constant dense<1>: vector<4xi1>
   %1 = arith.constant dense<2.9>: vector<4x2xf32>
-  %2 = xegpu.create_tdesc %src[0, 8, 16, 24]
-          : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
+  %2 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex>
+          -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
   // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}}
   xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint}>
         : vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1>
@@ -204,7 +215,8 @@ func.func @test_dpas_vc_2(%a : vector<8x8x2xf16>, %b: vector<8x16x2xf16>) {

 // -----
 func.func @test_atomic_rmw(%src: ui64, %value : vector<16x4xf32>, %mask : vector<16xi1>) {
-  %1 = xegpu.create_tdesc %src[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : ui64 -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>
+  %0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
+  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>
   // expected-error@+1 {{failed to verify that all of {tensorDesc, value, result} have same shape}}
   xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1>, vector<16x4xf32> -> vector<16x8xf32>
   return
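For reference, the end-to-end usage implied by this change looks roughly as follows. This is a minimal sketch and not part of the patch: it assumes a subgroup size of 4 and a `chunk_size = 2` scattered descriptor, omits cache hints, and the function name `@scatter_roundtrip` is illustrative only.

```mlir
gpu.func @scatter_roundtrip(%src: ui64) {
  // Per-lane offsets are now an explicit vector<4xindex> operand.
  %off  = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
  %mask = arith.constant dense<1> : vector<4xi1>
  // Build a scattered descriptor from the base address and the offset vector.
  %td = xegpu.create_tdesc %src, %off : ui64, vector<4xindex>
        -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
  // Gather two elements per lane; the result is transposed to vector<2x4xf32>.
  %v = xegpu.load %td, %mask <{transpose}>
        : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1> -> vector<2x4xf32>
  // Advance every lane by 32 elements, then scatter the data back.
  %step = arith.constant dense<32> : vector<4xindex>
  %td2 = xegpu.update_offset %td, %step
        : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xindex>
  xegpu.store %v, %td2, %mask <{transpose}>
        : vector<2x4xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
  gpu.return
}
```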