[MLIR][XeGPU] Remove the transpose attribute from Gather/Scatter ops and clean up the documentation #145389
Changes from 9 commits
```diff
@@ -8,6 +8,7 @@
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "llvm/ADT/TypeSwitch.h"

@@ -309,11 +310,23 @@ LogicalResult TensorDescType::verify(
     llvm::ArrayRef<int64_t> shape, mlir::Type elementType,
     mlir::Attribute encoding, mlir::Attribute layout) {
   size_t rank = shape.size();
-  // Low-precision types are packed in 32-bit units.
-  int32_t packingFactor = 32 / elementType.getIntOrFloatBitWidth();
   if (rank != 1 && rank != 2)
     return emitError() << "expected 1D or 2D tensor";

+  auto blockAttr = mlir::dyn_cast_if_present<BlockTensorDescAttr>(encoding);
+  if (blockAttr) {
+    MemorySpaceAttr memorySpaceAttr = blockAttr.getMemorySpace();
+    if (rank == 2 && memorySpaceAttr &&
+        memorySpaceAttr.getValue() == MemorySpace::SLM)
+      return emitError() << "SLM is not supported for 2D block tensor";
+  }
+
+  // for gather and scatter ops, Low-precision types are packed in 32-bit units.
+  unsigned bitWidth = elementType.getIntOrFloatBitWidth();
+  int packingFactor =
+      bitWidth < targetinfo::packedSizeInBitsForGatherScatter
+          ? targetinfo::packedSizeInBitsForGatherScatter / bitWidth
+          : 1;
   auto scatterAttr = mlir::dyn_cast_if_present<ScatterTensorDescAttr>(encoding);
   if (scatterAttr) {
     // Expected tensor ranks for scattered data:

@@ -336,14 +349,6 @@ LogicalResult TensorDescType::verify(
     }
   }

-  auto blockAttr = mlir::dyn_cast_if_present<BlockTensorDescAttr>(encoding);
-  if (blockAttr) {
-    MemorySpaceAttr memorySpaceAttr = blockAttr.getMemorySpace();
-    if (rank == 2 && memorySpaceAttr &&
-        memorySpaceAttr.getValue() == MemorySpace::SLM)
-      return emitError() << "SLM is not supported for 2D block tensor";
-  }

   auto layoutAttr = llvm::dyn_cast_if_present<LayoutAttr>(layout);
   if (layoutAttr) {
     if (rank != (size_t)layoutAttr.getRank())
```
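To make the new packing rule concrete, here is a small illustrative sketch (not code from the PR); it assumes packedSizeInBitsForGatherScatter is 32, matching the "32-bit units" comment above, and the descriptor type and its alias name are only examples:

```mlir
// f16 has a bit width of 16 < 32, so packingFactor = 32 / 16 = 2: two f16
// values share one 32-bit unit. For f32 (bit width 32) the factor is 1.
// An example of a scattered descriptor the verifier's packing check applies to:
!example_tdesc = !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
```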
Are we assuming here that there will always be a transpose operation after the load?
I wonder how a user can understand the semantics of this op. What if the user does not want the transpose and wants to use the op in isolation (which is perfectly legal)?
There is no transpose. The semantics are that each row corresponds to a lane. In the SIMD lowering pipeline, the transpose will be added when we lower the load_gather to the corresponding intrinsic. For SIMT lowering, there is no transpose at all.
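A minimal sketch of that semantic, assuming 16 lanes and chunk_size = 8 (the shapes, values, and exact op syntax are illustrative, not taken from the PR; %src, %offsets, and %mask are assumed to be defined earlier):

```mlir
// 16 offsets: each one selects a chunk of 8 contiguous f16 values for one lane.
%tdesc = xegpu.create_tdesc %src, %offsets : memref<?xf16>, vector<16xindex>
    -> !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
// Subgroup (SIMD) view: row i of the result is the data owned by lane i;
// the op itself implies no transpose.
%val = xegpu.load %tdesc, %mask
    : !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<16xi1>
    -> vector<16x8xf16>
// SIMT view: each lane simply sees its own vector<8xf16>; still no transpose.
```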
I thought about it again.
It seems like now xegpu.load (with chunk > 1) is just a logical operation, meaning it does not have a matching HW instruction. Logically we can use it without an accompanying transpose operation; that is true.
In practice, it will always come with an accompanying transpose. It will mostly be useful for the A*BT case. In that case we always need an explicit vector.transpose after the xegpu.load. During lowering, the load + transpose are optimized away in both the SIMD and SIMT paths. Essentially we say "we have a HW instruction that can do both of these together, so the transpose here is a nop"; no need to do any shuffling for the transpose. For the A*B case, I think doing multiple loads may be cheaper than doing a load gather and then an in-register transpose; I am not sure about this case.
A*BT case
A*B case.
The A*BT case is clear to me, but I am not sure what we do with the A*B case here. Maybe I am still missing something. @Jianhui-Li can you also clarify these examples? I know that A*B is not a real use case, but I am still confused about how layout propagation works here.
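For reference, a rough sketch of the A*BT pattern discussed above, with the explicit vector.transpose between the gather load and the DPAS; the tile shapes, chunk_size, and exact op syntax are assumptions for illustration, not code from the PR:

```mlir
// Gather-load a 16x16 f16 tile of B: each of the 16 lanes owns one contiguous
// chunk of 16 elements (%b_src, %offsets, %mask, and %a assumed defined).
%b_desc = xegpu.create_tdesc %b_src, %offsets : memref<?xf16>, vector<16xindex>
    -> !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16>>
%b_raw = xegpu.load %b_desc, %mask
    : !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16>>, vector<16xi1>
    -> vector<16x16xf16>
// Explicit transpose required by the A*BT pattern; per the discussion, the
// load + transpose pair is folded away during both SIMD and SIMT lowering.
%b = vector.transpose %b_raw, [1, 0] : vector<16x16xf16> to vector<16x16xf16>
%c = xegpu.dpas %a, %b : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
```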
nvm. It is clear now after discussing with @Jianhui-Li. A*B case will need a convert_layout because the load is not giving us the layout needed for DPAS B.
For the A*B case: since load w/ chunk_size is used for B, which assumes a [16, 1] [1, 2] layout, the propagation needs to insert an xegpu.convert_layout to convert it to [1, 16] [2, 1] before it feeds the DPAS.
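A rough sketch of what that conversion might look like, assuming the layouts are expressed as #xegpu.layout with lane_layout/lane_data and the conversion uses xegpu.convert_layout; the attribute names and exact syntax are approximations, not code from the PR:

```mlir
// %b_loaded comes from the chunked gather load of B: lanes own rows,
// i.e. lane_layout = [16, 1] with lane_data = [1, 2] for f16.
// DPAS B instead wants lanes to own columns: lane_layout = [1, 16], lane_data = [2, 1].
%b_dpas = xegpu.convert_layout %b_loaded
    <{input_layout  = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>,
      target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}>
    : vector<16x16xf16>
%c = xegpu.dpas %a, %b_dpas : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
```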
So from a lowering perspective we expect two cases.
Yes.
One thing to note in the lowering: in your code example, the user specifies xegpu.load w/ chunk_size, which by default will be lowered to an XeVM load with a vector size (each lane loads contiguous data).
If the user overrides the layout of the xegpu.load w/ chunk_size, say forcing it to take a [1, 16] [2, 1] layout, it will need to be lowered to multiple regular XeVM loads, since the data loaded by each lane is no longer contiguous.
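Sketched in XeGPU terms (the result-layout attribute below is only a hypothetical notation to make the contrast visible; it is not an interface defined by this PR, and the XeVM-level output is described only in the comments):

```mlir
// Case 1, default layout ([16, 1] [1, 2]): each lane owns one contiguous chunk,
// so the op can lower to a single vector-sized per-lane load.
%v0 = xegpu.load %tdesc, %mask
    : !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<16xi1>
    -> vector<16x8xf16>

// Case 2, user-overridden layout ([1, 16] [2, 1]), written with a hypothetical
// result-layout attribute: the elements a lane needs are no longer contiguous,
// so the same op would have to lower to multiple regular per-lane loads.
%v1 = xegpu.load %tdesc, %mask
    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
    : !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<16xi1>
    -> vector<16x8xf16>
```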
Is the user allowed to do this? I would also like it if we keep it relaxed. But I can see that in this PR we have hard-coded the scattered load layout to [16, 1] [1, 2]. Check here:
https://github.com/llvm/llvm-project/pull/145389/files#diff-fcc9cdbf8bb4e5d37e661524b877082aee9b7badb0317f980c1881da564a926dR230-R237
I meant that the propagation pass will be improved to allow the user to set a layout that overrides the default decision.