Skip to content

Commit 769bf19

Browse files
committed
add tests
1 parent 0b91942 commit 769bf19

File tree

4 files changed

+147
-43
lines changed

4 files changed

+147
-43
lines changed

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -676,9 +676,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
676676
let hasVerifier = 1;
677677
}
678678

679-
def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
680-
AllElementTypesMatch<["value", "source"]>, MemoryEffects<[MemRead]>
681-
]> {
679+
def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
682680
let summary = "load a set of scattered data points from memory.";
683681

684682
let description = [{ It (aka. load) load data per each work-item. The output
@@ -721,8 +719,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
721719
}];
722720

723721
let arguments = (ins XeGPU_TensorDesc_or_MemRef: $source,
724-
Variadic<Index>: $offsets,
725-
OptionalAttr<DenseI64ArrayAttr>: $const_offsets,
722+
Optional<XeGPU_OffsetType>: $offsets,
726723
XeGPU_MaskType: $mask,
727724
OptionalAttr<I64Attr>: $chunk_size,
728725
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
@@ -760,11 +757,15 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
760757
}];
761758

762759
let assemblyFormat = [{
763-
$source ``
764-
custom<OptionalDynamicIndexList>($offsets, $const_offsets) `,`
760+
$source
761+
(`[` $offsets^ `]`)? `,`
765762
$mask prop-dict
766-
attr-dict `:` qualified(type($source)) `,` type($mask) `->` type($value)
763+
attr-dict `:` type(operands) `->` type($value)
767764
}];
765+
766+
// functional-type(operands, results)
767+
// type($source) (type($offsets)^ )? `,` type($mask) `->` type($value)
768+
768769

769770
let builders = [
770771
OpBuilder<(ins "Type": $value, "Value": $source, "Value": $mask,
@@ -776,9 +777,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
776777
let hasVerifier = 1;
777778
}
778779

779-
def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
780-
AllElementTypesMatch<["value", "dest"]>, MemoryEffects<[MemWrite]>
781-
]> {
780+
def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
782781
let summary = "store data to scattered memory locations.";
783782
let description = [{ It (aka. store) stores data to scattered memory locations. The value is
784783
typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
@@ -818,8 +817,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
818817
let arguments = (ins
819818
XeGPU_ValueType: $value,
820819
XeGPU_TensorDesc_or_MemRef: $dest,
821-
Variadic<Index>: $offsets,
822-
OptionalAttr<DenseI64ArrayAttr>: $const_offsets,
820+
Optional<XeGPU_OffsetType>: $offsets,
823821
XeGPU_MaskType: $mask,
824822
OptionalAttr<I64Attr>: $chunk_size,
825823
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
@@ -850,12 +848,13 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
850848

851849
let assemblyFormat = [{
852850
$value `,`
853-
$dest ``
854-
custom<OptionalDynamicIndexList>($offsets, $const_offsets) `,`
851+
$dest
852+
(`[` $offsets^ `]`)? `,`
855853
$mask
856854
prop-dict
857-
attr-dict `:` type($value) `,` qualified(type($dest)) `,` type($mask)
855+
attr-dict `:` type(operands)
858856
}];
857+
// type($value) `,` qualified(type($dest)) (type($offsets)^)? `,` type($mask)
859858

860859
let builders = [
861860
OpBuilder<(ins "Value": $value, "Value": $dest, "Value": $mask,

mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp

Lines changed: 89 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,66 @@ isValidGatherScatterParams(Type maskTy, VectorType valueTy,
110110
return success();
111111
}
112112

113+
/// Verifies the value/mask/memref triple of a gather/scatter op that addresses
/// raw memory through a MemRef (no TensorDesc).
///
/// \param maskTy     type of the per-lane mask operand.
/// \param valueTy    vector type of the loaded/stored value (null if the
///                   result is not a vector, which is rejected).
/// \param memTy      the MemRef being gathered from / scattered to; only its
///                   element type participates in verification here.
/// \param chunkSize  number of contiguous elements accessed per lane
///                   (defaults to 1 at the call sites).
/// \param emitError  callback producing the op's diagnostic stream.
static LogicalResult
isValidGatherScatterMemRefParams(Type maskTy, VectorType valueTy,
                                 MemRefType memTy, int64_t chunkSize,
                                 function_ref<InFlightDiagnostic()> emitError) {

  if (!valueTy)
    return emitError() << "Expecting a vector type result.";

  auto maskShape = getShapeOf(maskTy);
  auto valueShape = getShapeOf(valueTy);

  // The value's element type must agree with the memory's element type.
  if (valueTy.getElementType() != memTy.getElementType())
    return emitError() << "Value should have the same element type as MemRef.";

  // SIMT case: a rank-1 value holds exactly one lane's chunk, so its element
  // count must equal the chunk size; the mask shape is not checked here.
  if (valueTy.getRank() == 1) {
    if (valueTy.getNumElements() != chunkSize)
      return emitError() << "value elements must match chunk size " << chunkSize
                         << " for SIMT code.";
    return success();
  }

  // SIMD case: the mask must match the value shape, minus the trailing
  // chunk-size dimension when chunking is in effect.
  llvm::SmallVector<int64_t> expectedMaskShape(valueShape);
  if (chunkSize > 1)
    expectedMaskShape.pop_back();
  if (expectedMaskShape != maskShape)
    return emitError() << "Mask should match value except the chunk size dim.";

  return success();
}
144+
145+
/// Verifies the value/mask pair of a gather/scatter op that addresses memory
/// through a raw pointer (e.g. ui64), where no element-type cross-check
/// against the memory operand is possible.
///
/// \param maskTy     type of the per-lane mask operand.
/// \param valueTy    vector type of the loaded/stored value (null is rejected).
/// \param chunkSize  number of contiguous elements accessed per lane.
/// \param emitError  callback producing the op's diagnostic stream.
static LogicalResult
isValidGatherScatterRawptrParams(Type maskTy, VectorType valueTy,
                                 int64_t chunkSize,
                                 function_ref<InFlightDiagnostic()> emitError) {

  // A non-vector result is rejected up front.
  if (!valueTy)
    return emitError() << "Expecting a vector type result.";

  // SIMT case: a rank-1 value carries a single lane's chunk, so its element
  // count must be exactly the chunk size.
  if (valueTy.getRank() == 1) {
    if (valueTy.getNumElements() != chunkSize)
      return emitError() << "value elements must match chunk size " << chunkSize
                         << " for SIMT code.";
    return success();
  }

  // SIMD case: derive the mask shape the value implies — the value's shape
  // with the trailing chunk dimension dropped when chunking is active — and
  // require the actual mask to match it.
  auto maskDims = getShapeOf(maskTy);
  auto valueDims = getShapeOf(valueTy);
  llvm::SmallVector<int64_t> impliedMaskDims(valueDims);
  if (chunkSize > 1)
    impliedMaskDims.pop_back();
  if (impliedMaskDims != maskDims)
    return emitError() << "Mask should match value except the chunk size dim.";

  return success();
}
172+
113173
//===----------------------------------------------------------------------===//
114174
// XeGPU_CreateNdDescOp
115175
//===----------------------------------------------------------------------===//
@@ -683,17 +743,27 @@ LogicalResult LoadGatherOp::verify() {
683743
if (!isReadHintOrNone(getL3HintAttr()))
684744
return emitOpError("invalid l3_hint: ") << getL3HintAttr();
685745

686-
return isValidGatherScatterParams(maskTy, valueTy, tdescTy,
687-
[&]() { return emitOpError(); });
746+
if (tdescTy)
747+
return isValidGatherScatterParams(maskTy, valueTy, tdescTy,
748+
[&]() { return emitOpError(); });
749+
auto srcTy = getSourceType();
750+
uint64_t chunkSize = static_cast<int64_t>(getChunkSize().value_or(1));
751+
auto memTy = dyn_cast<MemRefType>(srcTy);
752+
753+
if (memTy)
754+
return isValidGatherScatterMemRefParams(maskTy, valueTy, memTy, chunkSize,
755+
[&]() { return emitOpError(); });
756+
return isValidGatherScatterRawptrParams(maskTy, valueTy, chunkSize,
757+
[&]() { return emitOpError(); });
688758
}
689759

690760
void LoadGatherOp::build(OpBuilder &builder, OperationState &state,
691761
Type valueType, Value source, Value mask,
692762
xegpu::CachePolicyAttr l1_hint,
693763
xegpu::CachePolicyAttr l2_hint,
694764
xegpu::CachePolicyAttr l3_hint) {
695-
build(builder, state, valueType, source, ValueRange(), DenseI64ArrayAttr(),
696-
mask, IntegerAttr(), l1_hint, l2_hint, l3_hint);
765+
build(builder, state, valueType, source, Value(), mask, IntegerAttr(),
766+
l1_hint, l2_hint, l3_hint);
697767
}
698768

699769
//===----------------------------------------------------------------------===//
@@ -713,17 +783,28 @@ LogicalResult StoreScatterOp::verify() {
713783
if (!isWriteHintOrNone(getL3HintAttr()))
714784
return emitOpError("invalid l3_hint: ") << getL3HintAttr();
715785

716-
return isValidGatherScatterParams(maskTy, valueTy, tdescTy,
717-
[&]() { return emitOpError(); });
786+
if (tdescTy)
787+
return isValidGatherScatterParams(maskTy, valueTy, tdescTy,
788+
[&]() { return emitOpError(); });
789+
790+
auto destTy = getDestType();
791+
uint64_t chunkSize = static_cast<int64_t>(getChunkSize().value_or(1));
792+
auto memTy = dyn_cast<MemRefType>(destTy);
793+
794+
if (memTy)
795+
return isValidGatherScatterMemRefParams(maskTy, valueTy, memTy, chunkSize,
796+
[&]() { return emitOpError(); });
797+
return isValidGatherScatterRawptrParams(maskTy, valueTy, chunkSize,
798+
[&]() { return emitOpError(); });
718799
}
719800

720801
void StoreScatterOp::build(OpBuilder &builder, OperationState &state,
721802
Value value, Value dest, Value mask,
722803
xegpu::CachePolicyAttr l1_hint,
723804
xegpu::CachePolicyAttr l2_hint,
724805
xegpu::CachePolicyAttr l3_hint) {
725-
build(builder, state, value, dest, ValueRange(), DenseI64ArrayAttr(), mask,
726-
IntegerAttr(), l1_hint, l2_hint, l3_hint);
806+
build(builder, state, value, dest, Value(), mask, IntegerAttr(), l1_hint,
807+
l2_hint, l3_hint);
727808
}
728809

729810
//===----------------------------------------------------------------------===//

mlir/test/Dialect/XeGPU/invalid.mlir

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,28 @@ func.func @load_gather_vc_3(%src: ui64) {
384384
return
385385
}
386386

387+
// -----
388+
func.func @load_offset(%src: ui64) {
389+
%offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
390+
%mask = arith.constant dense<1>: vector<8xi1>
391+
// expected-error@+1 {{Mask should match value except the chunk size dim}}
392+
%2 = xegpu.load %src[%offsets], %mask
393+
: ui64, vector<4xindex>, vector<8xi1>
394+
-> vector<4x2xf32>
395+
return
396+
}
397+
398+
// -----
399+
func.func @store_offset(%src: ui64) {
400+
%val = arith.constant dense<2.9>: vector<4x2xf16>
401+
%offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
402+
%mask = arith.constant dense<1>: vector<8xi1>
403+
// expected-error@+1 {{Mask should match value except the chunk size dim}}
404+
xegpu.store %val, %src[%offsets], %mask
405+
: vector<4x2xf16>, ui64, vector<4xindex>, vector<8xi1>
406+
return
407+
}
408+
387409
// -----
388410
func.func @store_scatter_vc_1(%src: memref<24x32xf32>) {
389411
%0 = arith.constant dense<1>: vector<4xi1>

mlir/test/Dialect/XeGPU/ops.mlir

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -130,15 +130,6 @@ gpu.func @prefetch_nd_offset_1(%src: memref<48x64xf16>, %x : index, %y : index)
130130
gpu.return
131131
}
132132

133-
// CHECK: gpu.func @prefetch_nd_offset_1(%[[arg0:.*]]: memref<8x24x32x48x64xf16>) {
134-
gpu.func @prefetch_nd_offset_1(%src: memref<8x24x32x48x64xf16>) {
135-
// CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0, 0, 0, 0] : memref<8x24x32x48x64xf16> -> !xegpu.tensor_desc<1x2x4x8x16xf16>
136-
%1 = xegpu.create_nd_tdesc %src[0, 0, 0, 0, 0] : memref<8x24x32x48x64xf16> -> !xegpu.tensor_desc<1x2x4x8x16xf16>
137-
// CHECK: xegpu.prefetch_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<1x2x4x8x16xf16>
138-
xegpu.prefetch_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<1x2x4x8x16xf16>
139-
gpu.return
140-
}
141-
142133
// CHECK: func @subgroup_load_nd(%[[arg0:.*]]: memref<8x16xf16>) {
143134
gpu.func @subgroup_load_nd(%src: memref<8x16xf16>) {
144135
// CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
@@ -339,16 +330,6 @@ gpu.func @subgroup_store_nd_2(%dst: memref<24x32xf16>, %x : index) {
339330
gpu.return
340331
}
341332

342-
// CHECK: func @subgroup_store_nd_3(%[[arg0:.*]]: memref<24x32xf16>) {
343-
gpu.func @subgroup_store_nd_3(%dst: memref<24x32xf16>) {
344-
// CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<32xf16>
345-
%1 = arith.constant dense<1.0>: vector<32xf16>
346-
// CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
347-
%2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
348-
// CHECK: xegpu.store_nd %[[C]], %[[R0]][0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<32xf16>, !xegpu.tensor_desc<32xf16>
349-
xegpu.store_nd %1, %2[0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<32xf16>, !xegpu.tensor_desc<32xf16>
350-
gpu.return
351-
}
352333

353334
// CHECK: func @subgroup_store_nd_offset_1(%[[arg0:.*]]: memref<24x32xf16>) {
354335
gpu.func @subgroup_store_nd_offset_1(%dst: memref<24x32xf16>) {
@@ -541,6 +522,16 @@ gpu.func @subgroup_load_4(%src: ui64) {
541522
gpu.return
542523
}
543524

525+
// CHECK: gpu.func @subgroup_load_offset_1(%arg0: memref<?xf16>) {
526+
gpu.func @subgroup_load_offset_1(%src: memref<?xf16>) {
527+
%offset = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
528+
%mask = arith.constant dense<1>: vector<4xi1>
529+
//CHECK: %[[R1:.*]] = xegpu.load %arg0[%cst], %cst_0 <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : memref<?xf16>, vector<4xindex>, vector<4xi1> -> vector<4x2xf16>
530+
%val = xegpu.load %src[%offset], %mask <{chunk_size=2, l1_hint = #xegpu.cache_hint<cached>}>
531+
: memref<?xf16>, vector<4xindex>, vector<4xi1> -> vector<4x2xf16>
532+
gpu.return
533+
}
534+
544535
// CHECK: gpu.func @subgroup_store(%[[arg0:.*]]: ui64) {
545536
gpu.func @subgroup_store(%src: ui64) {
546537
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
@@ -646,6 +637,17 @@ gpu.func @subgroup_store_4(%src: ui64) {
646637
gpu.return
647638
}
648639

640+
// CHECK: gpu.func @subgroup_store_offset_1(%arg0: memref<?xf16>) {
641+
gpu.func @subgroup_store_offset_1(%dest: memref<?xf16>) {
642+
%val = arith.constant dense<2.9>: vector<4x2xf16>
643+
%offset = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
644+
%mask = arith.constant dense<1>: vector<4xi1>
645+
//CHECK: xegpu.store %[[R0:.*]], %arg0[%cst_0], %cst_1 <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<4x2xf16>, memref<?xf16>, vector<4xindex>, vector<4xi1>
646+
xegpu.store %val, %dest[%offset], %mask <{chunk_size=2, l1_hint = #xegpu.cache_hint<cached>}>
647+
: vector<4x2xf16>, memref<?xf16>, vector<4xindex>, vector<4xi1>
648+
gpu.return
649+
}
650+
649651
// CHECK: gpu.func @prefetch(%[[arg0:.*]]: ui64) {
650652
gpu.func @prefetch(%src: ui64) {
651653
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>

0 commit comments

Comments
 (0)