From cce8abaa92703dea562536c02fee3a8fd00ef9e6 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen@intel.com>
Date: Fri, 8 Aug 2025 15:57:16 +0000
Subject: [PATCH 01/14] [mlir][xegpu] Add MatrixDesc type and create_matrix_desc op

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 96 +++++++++++--------
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       | 23 +++++
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 56 +++++++++++
 3 files changed, 134 insertions(+), 41 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 75b16a87e03c6..3b074a35e9cbd 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -29,7 +29,7 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
     void printProperties(::mlir::MLIRContext *ctx,
             ::mlir::OpAsmPrinter &p, const Properties &prop,
             ::mlir::ArrayRef<::llvm::StringRef> elidedProps) {
-      
+
       DictionaryAttr propAttr = dyn_cast_if_present<mlir::DictionaryAttr>(getPropertiesAsAttr(ctx, prop));
 
       // filter out the elidedProps from propAttr, and get the resultAttr
@@ -43,7 +43,7 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
       }
 
       if (!filteredAttrs.empty()) {
-        p << "<" << DictionaryAttr::get(ctx, filteredAttrs) << ">"; 
+        p << "<" << DictionaryAttr::get(ctx, filteredAttrs) << ">";
       }
     }
 
@@ -189,11 +189,11 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
     ArrayRef<int64_t> getStaticOffsets(){
       auto attr = getConstOffsetsAttr();
 
-      if (attr) 
+      if (attr)
         return attr;
 
       int64_t rank = getMixedSizes().size();
-      
+
       setConstOffsets(llvm::SmallVector<int64_t, 4>(rank, 0));
 
       attr = getConstOffsetsAttr();
@@ -233,7 +233,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
       auto attr = getConstStridesAttr();
       if (attr)
         return attr;
-      
+
       if (llvm::isa<IntegerType>(getSourceType()))
         return emptyStrides;
 
@@ -314,15 +314,15 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
   }];
 
   let assemblyFormat = [{
-    $TensorDesc `` 
-    custom<OptionalDynamicIndexList>($offsets, $const_offsets) 
+    $TensorDesc ``
+    custom<OptionalDynamicIndexList>($offsets, $const_offsets)
     prop-dict attr-dict `:` qualified(type($TensorDesc))
   }];
 
   let builders = [
-    OpBuilder<(ins "Value": $TensorDesc, 
-                   "xegpu::CachePolicyAttr": $l1_hint, 
-                   "xegpu::CachePolicyAttr": $l2_hint, 
+    OpBuilder<(ins "Value": $TensorDesc,
+                   "xegpu::CachePolicyAttr": $l1_hint,
+                   "xegpu::CachePolicyAttr": $l2_hint,
                    "xegpu::CachePolicyAttr": $l3_hint)>
   ];
 
@@ -370,7 +370,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
 
   let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
                        Variadic<Index>: $offsets,
-                       OptionalAttr<DenseI64ArrayAttr>: $const_offsets,  
+                       OptionalAttr<DenseI64ArrayAttr>: $const_offsets,
                        OptionalAttr<UnitAttr>: $packed,
                        OptionalAttr<DenseI64ArrayAttr>: $transpose,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
@@ -390,16 +390,16 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
   }];
 
   let assemblyFormat = [{
-    $TensorDesc `` 
-    custom<OptionalDynamicIndexList>($offsets, $const_offsets) 
+    $TensorDesc ``
+    custom<OptionalDynamicIndexList>($offsets, $const_offsets)
     prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)
   }];
 
   let builders = [
-    OpBuilder<(ins "Type": $value, "Value": $TensorDesc, 
+    OpBuilder<(ins "Type": $value, "Value": $TensorDesc,
                     "UnitAttr": $packed, "DenseI64ArrayAttr": $transpose,
-                    "xegpu::CachePolicyAttr": $l1_hint, 
-                    "xegpu::CachePolicyAttr": $l2_hint, 
+                    "xegpu::CachePolicyAttr": $l1_hint,
+                    "xegpu::CachePolicyAttr": $l2_hint,
                     "xegpu::CachePolicyAttr": $l3_hint)>
   ];
 
@@ -442,7 +442,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
   let arguments = (ins XeGPU_ValueType: $value,
                        XeGPU_TensorDesc: $TensorDesc,
                        Variadic<Index>: $offsets,
-                       OptionalAttr<DenseI64ArrayAttr>: $const_offsets,  
+                       OptionalAttr<DenseI64ArrayAttr>: $const_offsets,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
@@ -458,16 +458,16 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
   }];
 
    let assemblyFormat = [{
-    $value `,` 
-    $TensorDesc `` 
-    custom<OptionalDynamicIndexList>($offsets, $const_offsets) 
+    $value `,`
+    $TensorDesc ``
+    custom<OptionalDynamicIndexList>($offsets, $const_offsets)
     prop-dict attr-dict `:`  type($value) `,` qualified(type($TensorDesc))
   }];
 
   let builders = [
-    OpBuilder<(ins "Value": $value, "Value": $TensorDesc, 
-                   "xegpu::CachePolicyAttr": $l1_hint, 
-                   "xegpu::CachePolicyAttr": $l2_hint, 
+    OpBuilder<(ins "Value": $value, "Value": $TensorDesc,
+                   "xegpu::CachePolicyAttr": $l1_hint,
+                   "xegpu::CachePolicyAttr": $l2_hint,
                    "xegpu::CachePolicyAttr": $l3_hint)>
   ];
 
@@ -635,12 +635,12 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
                              l3_hint = #xegpu.cache_hint<cached>}
         : !xegpu.tensor_desc<16xf16>
     ```
-    
+
     Example 2:
     A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
     It combines "create scattered TensorTdesc" and "prefetch with scattered TensorTdesc".
     The source operand could be a raw pointer (uint64_t).
-    Please refer to create_tdesc for the restriction of memref. 
+    Please refer to create_tdesc for the restriction of memref.
     ```mlir
       %a = memref.alloc() : memref<1024xf32>
       %0 = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
@@ -676,16 +676,16 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
   }];
 
   let assemblyFormat = [{
-    $source 
+    $source
     (`[` $offsets^ `]`)?
     prop-dict
-    attr-dict `:` type(operands) 
+    attr-dict `:` type(operands)
   }];
-    
+
   let builders = [
     OpBuilder<(ins "Value": $source,
-                    "xegpu::CachePolicyAttr": $l1_hint, 
-                    "xegpu::CachePolicyAttr": $l2_hint, 
+                    "xegpu::CachePolicyAttr": $l1_hint,
+                    "xegpu::CachePolicyAttr": $l2_hint,
                     "xegpu::CachePolicyAttr": $l3_hint)>
   ];
 
@@ -723,7 +723,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
           : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
             vector<16xi1> -> vector<16x8xf32>
   ```
-  
+
   Example 3 (SIMT mode):
   ```mlir
     %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>,
@@ -732,12 +732,12 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
           : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>
             vector<16xi1> -> vector<8xf32>
   ```
-  
+
   Example 4:
   A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
   It combines "create scattered TensorTdesc" and "load with scattered TensorTdesc".
   The source operand could be a raw pointer (uint64_t). Please refer to create_tdesc
-  for the restriction of memref. 
+  for the restriction of memref.
   ```mlir
     %a = memref.alloc() : memref<1024xf32>
     %offsets = vector.step : vector<16xindex>
@@ -794,14 +794,14 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
   let assemblyFormat = [{
     $source
     (`[` $offsets^ `]`)? `,`
-    $mask prop-dict 
+    $mask prop-dict
     attr-dict `:` type(operands) `->` type($value)
   }];
 
   let builders = [
     OpBuilder<(ins "Type": $value, "Value": $source, "Value": $mask,
-                    "xegpu::CachePolicyAttr": $l1_hint, 
-                    "xegpu::CachePolicyAttr": $l2_hint, 
+                    "xegpu::CachePolicyAttr": $l1_hint,
+                    "xegpu::CachePolicyAttr": $l2_hint,
                     "xegpu::CachePolicyAttr": $l3_hint)>
    ];
 
@@ -848,7 +848,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
   A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
   It combines "create scattered TensorTdesc" and "store with scattered TensorTdesc".
   The dest operand could be a raw pointer (uint64_t).
-  Please refer to create_tdesc for the restriction of memref. 
+  Please refer to create_tdesc for the restriction of memref.
   ```mlir
     %a = memref.alloc() : memref<1024xf32>
     %val = arith.constant dense<0.0> : vector<16xf32>
@@ -901,15 +901,15 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
     $value `,`
     $dest
     (`[` $offsets^ `]`)? `,`
-    $mask 
-    prop-dict 
+    $mask
+    prop-dict
     attr-dict `:`  type(operands)
   }];
 
   let builders = [
     OpBuilder<(ins "Value": $value, "Value": $dest, "Value": $mask,
-                    "xegpu::CachePolicyAttr": $l1_hint, 
-                    "xegpu::CachePolicyAttr": $l2_hint, 
+                    "xegpu::CachePolicyAttr": $l1_hint,
+                    "xegpu::CachePolicyAttr": $l2_hint,
                     "xegpu::CachePolicyAttr": $l3_hint)>
    ];
 
@@ -1146,4 +1146,18 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
     let hasCanonicalizer = 1;
 }
 
+def XeGPU_CreateMatrixDescOp: XeGPU_Op<"create_matrix_desc"> {
+  let summary = "Create a matrix descriptor.";
+  let description = [{
+    Matrices are treated as 2D units.
+    In case the ROI rank is >2, the two fastest changing dimensions
+    represent a 2D unit and other dimensions specify the multiple
+    of these units that are stacked vertically.
+    Results:
+     - `matrix_desc` : a descriptor for SLM allocation.
+  }];
+  let results = (outs XeGPU_MatrixDesc:$matrix_desc);
+  let assemblyFormat = "attr-dict `:` type($matrix_desc)";
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index b268cabb5d266..6ac126a84d39c 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -201,4 +201,27 @@ def XeGPU_Nbarrier: XeGPUTypeDef<"Nbarrier", "nbarrier", [], "mlir::Type"> {
   }];
 }
 
+def XeGPU_MatrixDesc: XeGPUTypeDef<"MatrixDesc", "matrix_desc", [ShapedTypeInterface], "mlir::Type"> {
+  let summary = "MatrixDesc describing the data in SLM";
+  let description = [{
+    MatrixDesc describes the data stored in SLM. Unleass specified via
+    the the optional layout attribute, the data is stored in a continuous
+    SLM region in row-major order by default.
+  }];
+  let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
+                        "mlir::Type": $elementType,
+                        OptionalParameter<"mlir::Attribute">: $layout);
+
+  let extraClassDeclaration = [{
+    // using mlir::ShapedType::Trait<MatrixDescType>::getElementTypeBitWidth;
+    // using mlir::ShapedType::Trait<MatrixDescType>::getElementTypeBitWidth;
+    // using mlir::ShapedType::Trait<MatrixDescType>::getRank;
+    // using mlir::ShapedType::Trait<MatrixDescType>::getNumElements;
+    // using mlir::ShapedType::Trait<MatrixDescType>::isDynamicDim;
+    // using mlir::ShapedType::Trait<MatrixDescType>::hasStaticShape;
+  }];
+
+  let hasCustomAssemblyFormat = true;
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 3c0ca114a62d4..50eb90dbc1df9 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -394,6 +394,62 @@ LogicalResult TensorDescType::verify(
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// XeGPU_MatrixDescType
+//===----------------------------------------------------------------------===//
+mlir::Type MatrixDescType::parse(::mlir::AsmParser &parser) {
+  llvm::SmallVector<int64_t> shape;
+  mlir::Type elementType;
+  mlir::FailureOr<mlir::Attribute> layout;
+
+  // Parse literal '<'
+  if (parser.parseLess())
+    return {};
+
+  auto shapeLoc = parser.getCurrentLocation();
+  if (mlir::failed(parser.parseDimensionList(shape, false, true))) {
+    parser.emitError(shapeLoc, "failed to parse parameter 'shape'");
+    return {};
+  }
+
+  auto elemTypeLoc = parser.getCurrentLocation();
+  if (mlir::failed(parser.parseType(elementType))) {
+    parser.emitError(elemTypeLoc, "failed to parse parameter 'elementType'");
+    return {};
+  }
+
+  // parse optional attributes
+  if (mlir::succeeded(parser.parseOptionalComma())) {
+    mlir::Attribute attr;
+    ParseResult res = parser.parseAttribute(attr);
+    if (mlir::failed(res))
+      return {};
+    layout = attr;
+  }
+
+  // Parse literal '>'
+  if (parser.parseGreater())
+    return {};
+
+  MLIRContext *ctxt = parser.getContext();
+  return MatrixDescType::getChecked(
+      [&]() { return parser.emitError(parser.getNameLoc()); }, ctxt, shape,
+      elementType, layout.value_or(mlir::Attribute()));
+}
+
+void MatrixDescType::print(::mlir::AsmPrinter &printer) const {
+  printer << "<";
+
+  printer.printDimensionList(getShape());
+  printer << 'x';
+  printer << getElementType();
+
+  if (auto layout = getLayout())
+    printer << ", " << layout;
+
+  printer << ">";
+}
+
 } // namespace xegpu
 } // namespace mlir
 

From 76ccc39d6f3c599015d0d6d853cc20a4853fcb7f Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen@intel.com>
Date: Mon, 11 Aug 2025 18:48:38 +0000
Subject: [PATCH 02/14] [mlir][xegpu] Add load_matrix/store_matrix ops and a source operand for create_matrix_desc

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 87 ++++++++++++++++++-
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       | 16 ++--
 2 files changed, 92 insertions(+), 11 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 3b074a35e9cbd..59c1a432dce66 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1146,18 +1146,101 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
     let hasCanonicalizer = 1;
 }
 
-def XeGPU_CreateMatrixDescOp: XeGPU_Op<"create_matrix_desc"> {
+def XeGPU_CreateMatrixDescOp: XeGPU_Op<"create_matrix_desc", [Pure]>  {
   let summary = "Create a matrix descriptor.";
   let description = [{
     Matrices are treated as 2D units.
     In case the ROI rank is >2, the two fastest changing dimensions
     represent a 2D unit and other dimensions specify the multiple
     of these units that are stacked vertically.
+    Arguments:
+     - `source` : a base address of SLM allocation.
     Results:
      - `matrix_desc` : a descriptor for SLM allocation.
   }];
+  let arguments = (ins XeGPU_BaseAddrType:$source);
   let results = (outs XeGPU_MatrixDesc:$matrix_desc);
-  let assemblyFormat = "attr-dict `:` type($matrix_desc)";
+  let assemblyFormat = "$source prop-dict attr-dict `:` type($source) `->` type($matrix_desc)";
 }
 
+def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>]>  {
+  let arguments = (ins XeGPU_MatrixDesc:$matrix_desc,
+    Variadic<Index>: $offsets,
+    DenseI64ArrayAttr: $const_offsets,
+    OptionalAttr<XeGPU_LayoutAttr>:$layout
+  );
+  let results = (outs XeGPU_ValueType:$res);
+  let assemblyFormat = [{
+    $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
+    prop-dict attr-dict `:` functional-type(operands, results)
+  }];
+  let summary = "Load matrix from SLM.";
+  let description = [{
+    This operation loads a matrix from the SLM using the matrix descriptor.
+    There are additional parameters and attributes that support loading, but they must only
+    be specified for a work-item level operation.
+
+    General rules:
+    1. Non-WI-level code must not specify optional attributes.
+    2. If the load uses `vector` semantics, all of the vector attributes must be specified.
+    3. If the load uses `array` semantics, all of the array attributes must be specified.
+
+    Arguments:
+     - `matrix_desc` : a matrix descriptor (SLM allocation + matrix type).
+     - `offsets`     : Coordinates of the matrix to load.
+    Results:
+      - `res` : loaded matrix elements.
+  }];
+
+  let builders = [
+    // OpBuilder<(ins "Type":$res, "TypedValue<MatrixDescType>": $matrix_desc, "llvm::ArrayRef<OpFoldResult>": $offsets, "LayoutAttr": $layout)>,
+  ];
+  let extraClassDeclaration = [{
+    SmallVector<OpFoldResult> getMixedOffsets() {
+      return getMixedValues(getConstOffsets(), getOffsets(), getContext());
+    }
+  }];
+  // let hasVerifier = 1;
+}
+
+def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix"> {
+  let arguments = (ins
+    XeGPU_MatrixDesc:$matrix_desc,
+    XeGPU_ValueType:$data,
+    Variadic<Index>: $offsets,
+    DenseI64ArrayAttr: $const_offsets,
+    OptionalAttr<XeGPU_LayoutAttr>:$layout
+  );
+  let assemblyFormat = [{
+    $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets) `,` $data
+    prop-dict attr-dict `:` type(operands)
+  }];
+  let summary = "Store matrix to SLM.";
+  let description = [{
+    This operation stores workitem's `data` fragment of the matrix to the SLM (`matrix_desc`).
+    There are additional parameters and attributes that support storing, but they must only
+    be specified for a work-item level operation.
+
+    General rules:
+    1. Non-WI-level code must not specify optional attributes.
+    2. If the store uses `vector` semantics, all of the vector attributes must be specified.
+
+    Arguments:
+     - `matrix_desc` : a matrix descriptor.
+     - `data`        : data to be stored to the matrix.
+     - `offsets`     : Coordinates of the matrix where the data will be stored.
+  }];
+  let builders = [
+    // OpBuilder<(ins "TypedValue<MatrixDescType>": $matrix_desc,  "Value" : $data, "llvm::ArrayRef<OpFoldResult>": $offsets, "LayoutAttr": $layout)>,
+  ];
+  let extraClassDeclaration = [{
+    SmallVector<OpFoldResult> getMixedOffsets() {
+      Builder b(getContext());
+      return getMixedValues(getConstOffsets(), getOffsets(), b);
+    }
+  }];
+  // let hasVerifier = 1;
+}
+
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 6ac126a84d39c..f578fc8bc0735 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -204,21 +204,19 @@ def XeGPU_Nbarrier: XeGPUTypeDef<"Nbarrier", "nbarrier", [], "mlir::Type"> {
 def XeGPU_MatrixDesc: XeGPUTypeDef<"MatrixDesc", "matrix_desc", [ShapedTypeInterface], "mlir::Type"> {
   let summary = "MatrixDesc describing the data in SLM";
   let description = [{
-    MatrixDesc describes the data stored in SLM. Unleass specified via
-    the the optional layout attribute, the data is stored in a continuous
-    SLM region in row-major order by default.
+    MatrixDesc describes an SLM region. Unless otherwise specified via the optional layout attribute,
+    the data is stored contiguously in the region in row-major order by default.
   }];
   let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
                         "mlir::Type": $elementType,
                         OptionalParameter<"mlir::Attribute">: $layout);
 
   let extraClassDeclaration = [{
-    // using mlir::ShapedType::Trait<MatrixDescType>::getElementTypeBitWidth;
-    // using mlir::ShapedType::Trait<MatrixDescType>::getElementTypeBitWidth;
-    // using mlir::ShapedType::Trait<MatrixDescType>::getRank;
-    // using mlir::ShapedType::Trait<MatrixDescType>::getNumElements;
-    // using mlir::ShapedType::Trait<MatrixDescType>::isDynamicDim;
-    // using mlir::ShapedType::Trait<MatrixDescType>::hasStaticShape;
+    bool hasRank() const { return true; }
+
+    MatrixDescType cloneWith(std::optional<llvm::ArrayRef<int64_t>> shape, Type elementType) const {
+      return MatrixDescType::get(getContext(), shape.value_or(getShape()), elementType, getLayout());
+    }
   }];
 
   let hasCustomAssemblyFormat = true;

From cb0a195e340bac10e10b6d5cb9de0d925d39deeb Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen@intel.com>
Date: Tue, 12 Aug 2025 18:10:33 +0000
Subject: [PATCH 03/14] add unit tests for create_matrix_desc

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 24 +++++++++++++------
 mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt      |  1 +
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 10 ++++++++
 mlir/test/Dialect/XeGPU/invalid.mlir          | 16 +++++++++++++
 mlir/test/Dialect/XeGPU/ops.mlir              | 18 ++++++++++++++
 5 files changed, 62 insertions(+), 7 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 37e4c2c811155..e4ea0b27323ec 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1101,21 +1101,31 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
     let hasCanonicalizer = 1;
 }
 
-def XeGPU_CreateMatrixDescOp: XeGPU_Op<"create_matrix_desc", [Pure]>  {
+def isSharedPred : CPred<"isSharedMemory(llvm::cast<mlir::MemRefType>($_self))">;
+class StaticShared1DMemRefOf<list<Type> allowedTypes> :
+  ConfinedType<MemRefRankOf<allowedTypes, [1]>, [HasStaticShapePred, isSharedPred],
+     "statically shaped " # MemRefOf<allowedTypes>.summary # " for shared memory",
+     "mlir::MemRefType">;
+
+class SizeInBits<string name> :
+  StrFunc<"llvm::cast<mlir::ShapedType>($" # name # ".getType()).getNumElements()"
+          "*llvm::cast<mlir::ShapedType>($" # name # ".getType()).getElementTypeBitWidth()">;
+class AllMemSizesMatch<list<string> names> :
+    AllMatchSameOperatorTrait<names, SizeInBits<"_self">.result,
+                              "size in bits">;
+
+def XeGPU_CreateMatrixDescOp: XeGPU_Op<"create_matrix_desc", [Pure,
+      AllMemSizesMatch<["source", "matrix_desc"]>]>  {
   let summary = "Create a matrix descriptor.";
   let description = [{
-    Matrices are treated as 2D units.
-    In case the ROI rank is >2, the two fastest changing dimensions
-    represent a 2D unit and other dimensions specify the multiple
-    of these units that are stacked vertically.
     Arguments:
      - `source` : a base address of SLM allocation.
     Results:
      - `matrix_desc` : a descriptor for SLM allocation.
   }];
-  let arguments = (ins XeGPU_BaseAddrType:$source);
+  let arguments = (ins StaticShared1DMemRefOf<[I8]>:$source);
   let results = (outs XeGPU_MatrixDesc:$matrix_desc);
-  let assemblyFormat = "$source prop-dict attr-dict `:` type($source) `->` type($matrix_desc)";
+  let assemblyFormat = "$source prop-dict attr-dict `` `:` type($source) `->` qualified(type($matrix_desc))";
 }
 
 def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>]>  {
diff --git a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
index 7c6a4f37db9af..603fb5d237544 100644
--- a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
@@ -17,6 +17,7 @@ add_mlir_dialect_library(MLIRXeGPUDialect
   MLIRAffineUtils
   MLIRArithUtils
   MLIRDialectUtils
+  MLIRGPUDialect
   MLIRIR
   MLIRViewLikeInterface
   MLIRVectorDialect
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 2cd086feb5deb..ad4d8bd6e22cd 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
@@ -21,6 +22,20 @@
 namespace mlir {
 namespace xegpu {
 
+/// Returns true when `memrefTy` is placed in shared local memory (SLM): an
+/// integer memory space equal to 3, a MemorySpaceAttr whose value is
+/// MemorySpace::SLM, or any attribute the GPU dialect classifies as its
+/// workgroup address space.
+bool isSharedMemory(const MemRefType &memrefTy) {
+  Attribute attr = memrefTy.getMemorySpace();
+  // getMemorySpace() may return a null attribute; use the null-tolerant casts
+  // so that case falls through to the GPU dialect query below.
+  if (auto intAttr = llvm::dyn_cast_if_present<IntegerAttr>(attr))
+    return intAttr.getInt() == 3;
+  if (auto memrefSpace = llvm::dyn_cast_if_present<MemorySpaceAttr>(attr))
+    return memrefSpace.getValue() == MemorySpace::SLM;
+  return gpu::GPUDialect::isWorkgroupMemoryAddressSpace(attr);
+}
+
 template <typename T>
 static std::string makeString(T array, bool breakline = false) {
   std::string buf;
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 44e15dd7cbb38..1cd817918a772 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -762,3 +762,19 @@ func.func @slice_attr_repeat_dim() {
   return
 }
 
+// -----
+func.func @create_matrix_desc_non_slm() {
+  %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 1>
+  // expected-error@+1 {{operand #0 must be statically shaped memref of 8-bit signless integer values for shared memory}}
+  %matrix_desc = xegpu.create_matrix_desc %m : memref<2048xi8, 1> -> !xegpu.matrix_desc<16x64xf16>
+  return
+}
+
+// -----
+func.func @create_matrix_desc_mismatch_sizes() {
+  %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 3>
+  // expected-error@+1 {{failed to verify that all of {source, matrix_desc} have same size in bits}}
+  %matrix_desc = xegpu.create_matrix_desc %m : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x32xf16>
+  return
+}
+
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index 67c00f5a9cc2f..c224749031328 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -751,4 +751,22 @@ gpu.func @fence() {
   gpu.return
 }
 
+// CHECK-LABEL: gpu.func @create_matrix_desc({{.*}}) {
+gpu.func @create_matrix_desc() {
+  //CHECK: [[alloc:%.+]] = memref.alloca() {alignment = 1024 : i64} : memref<2048xi8, 3>
+  //CHECK: [[mdesc:%.+]] = xegpu.create_matrix_desc [[alloc]] : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16>
+  %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 3>
+  %matrix_desc = xegpu.create_matrix_desc %m : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @create_matrix_desc_with_stride({{.*}}) {
+gpu.func @create_matrix_desc_with_stride() {
+  //CHECK: [[alloc:%.+]] = memref.alloca() {alignment = 1024 : i64} : memref<2048xi8, 3>
+  //CHECK: [[mdesc:%.+]] = xegpu.create_matrix_desc [[alloc]] : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>
+  %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 3>
+  %matrix_desc = xegpu.create_matrix_desc %m : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>
+  gpu.return
+}
+
 }

From 98871ccb013229593e8d169533ab3b03b136f687 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen@intel.com>
Date: Tue, 12 Aug 2025 20:18:09 +0000
Subject: [PATCH 04/14] add unit test for load_matrix and store_matrix

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 31 ++++++-----
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 53 +++++++++++++++++++
 mlir/test/Dialect/XeGPU/invalid.mlir          | 28 ++++++++++
 mlir/test/Dialect/XeGPU/ops.mlir              | 29 ++++++++++
 4 files changed, 129 insertions(+), 12 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index e4ea0b27323ec..461df6efb8528 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1128,16 +1128,18 @@ def XeGPU_CreateMatrixDescOp: XeGPU_Op<"create_matrix_desc", [Pure,
   let assemblyFormat = "$source prop-dict attr-dict `` `:` type($source) `->` qualified(type($matrix_desc))";
 }
 
-def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>]>  {
+def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
+                              AllElementTypesMatch<["matrix_desc", "res"]>,
+                              AllRanksMatch<["matrix_desc", "res"]>]>  {
   let arguments = (ins XeGPU_MatrixDesc:$matrix_desc,
     Variadic<Index>: $offsets,
     DenseI64ArrayAttr: $const_offsets,
-    OptionalAttr<XeGPU_LayoutAttr>:$layout
+    OptionalAttr<LayoutTrait>:$layout
   );
   let results = (outs XeGPU_ValueType:$res);
   let assemblyFormat = [{
     $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
-    prop-dict attr-dict `:` functional-type(operands, results)
+    prop-dict attr-dict `` `:` type(operands) `->` type(results)
   }];
   let summary = "Load matrix from SLM.";
   let description = [{
@@ -1158,23 +1160,27 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>]>  {
   }];
 
   let builders = [
-    // OpBuilder<(ins "Type":$res, "TypedValue<MatrixDescType>": $matrix_desc, "llvm::ArrayRef<OpFoldResult>": $offsets, "LayoutAttr": $layout)>,
+    OpBuilder<(ins "Type":$res, "TypedValue<MatrixDescType>": $matrix_desc,
+                    "llvm::ArrayRef<OpFoldResult>": $offsets, "LayoutTrait": $layout)>,
   ];
   let extraClassDeclaration = [{
     SmallVector<OpFoldResult> getMixedOffsets() {
       return getMixedValues(getConstOffsets(), getOffsets(), getContext());
     }
   }];
-  // let hasVerifier = 1;
+
+  let hasVerifier = 1;
 }
 
-def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix"> {
+def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
+                              AllElementTypesMatch<["matrix_desc", "data"]>,
+                              AllRanksMatch<["matrix_desc", "data"]>]> {
   let arguments = (ins
     XeGPU_MatrixDesc:$matrix_desc,
-    XeGPU_ValueType:$data,
     Variadic<Index>: $offsets,
     DenseI64ArrayAttr: $const_offsets,
-    OptionalAttr<XeGPU_LayoutAttr>:$layout
+    XeGPU_ValueType:$data,
+    OptionalAttr<LayoutTrait>:$layout
   );
   let assemblyFormat = [{
     $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets) `,` $data
@@ -1196,15 +1202,16 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix"> {
      - `offsets`     : Coordinates of the matrix where the data will be stored.
   }];
   let builders = [
-    // OpBuilder<(ins "TypedValue<MatrixDescType>": $matrix_desc,  "Value" : $data, "llvm::ArrayRef<OpFoldResult>": $offsets, "LayoutAttr": $layout)>,
+    OpBuilder<(ins "TypedValue<MatrixDescType>": $matrix_desc, "llvm::ArrayRef<OpFoldResult>": $offsets,
+                   "Value" : $data, "LayoutTrait": $layout)>,
   ];
   let extraClassDeclaration = [{
     SmallVector<OpFoldResult> getMixedOffsets() {
-      Builder b(getContext());
-      return getMixedValues(getConstOffsets(), getOffsets(), b);
+      return getMixedValues(getConstOffsets(), getOffsets(), getContext());
     }
   }];
-  // let hasVerifier = 1;
+
+  let hasVerifier = 1;
 }
 
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index ad4d8bd6e22cd..2051d7030340e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -935,6 +935,59 @@ void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
   patterns.add<FoldConvertLayoutOp>(context);
 }
 
+//===----------------------------------------------------------------------===//
+// XeGPU_LoadMatrixOp
+//===----------------------------------------------------------------------===//
+void LoadMatrixOp::build(OpBuilder &builder, OperationState &state, Type res,
+                         TypedValue<MatrixDescType> matrixDesc,
+                         llvm::ArrayRef<OpFoldResult> offsets,
+                         LayoutTrait layout) {
+  llvm::SmallVector<Value> dynamicOffsets;
+  llvm::SmallVector<int64_t> staticOffsets;
+
+  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+  auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
+
+  build(builder, state, res, matrixDesc, dynamicOffsets, staticOffsetsAttr,
+        layout);
+}
+
+LogicalResult LoadMatrixOp::verify() {
+  ArrayRef<int64_t> valueShape = getRes().getType().getShape();
+  ArrayRef<int64_t> mdescShape = getMatrixDesc().getType().getShape();
+  if (llvm::any_of(llvm::zip_equal(valueShape, mdescShape),
+                   [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
+    return emitOpError("result shape must not exceed matrix desc shape.");
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_StoreMatrixOp
+//===----------------------------------------------------------------------===//
+void StoreMatrixOp::build(OpBuilder &builder, OperationState &state,
+                          TypedValue<MatrixDescType> matrixDesc,
+                          llvm::ArrayRef<OpFoldResult> offsets, Value data,
+                          LayoutTrait layout) {
+  llvm::SmallVector<Value> dynamicOffsets;
+  llvm::SmallVector<int64_t> staticOffsets;
+
+  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+  auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
+
+  build(builder, state, matrixDesc, dynamicOffsets, staticOffsetsAttr, data,
+        layout);
+}
+
+LogicalResult StoreMatrixOp::verify() {
+  ArrayRef<int64_t> dataShape = getData().getType().getShape();
+  ArrayRef<int64_t> mdescShape = getMatrixDesc().getType().getShape();
+  if (llvm::any_of(llvm::zip_equal(dataShape, mdescShape),
+                   [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
+    return emitOpError("data shape must not exceed matrix desc shape.");
+
+  return success();
+}
+
 } // namespace xegpu
 } // namespace mlir
 
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 1cd817918a772..2feb010d343a8 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -778,3 +778,31 @@ func.func @create_matrix_desc_mismatch_sizes() {
   return
 }
 
+// -----
+func.func @load_matrix_desc_mismatch_element_type(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+  // expected-error@+1 {{failed to verify that all of {matrix_desc, res} have same element type}}
+  %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> vector<8x16xf32>
+  return
+}
+
+// -----
+func.func @load_matrix_desc_invalid_result_size(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+  // expected-error@+1 {{result shape must not exceed matrix desc shape}}
+  %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> vector<32x16xf16>
+  return
+}
+
+// -----
+func.func @store_matrix_desc_mismatch_element_type(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<16x16xf32>) {
+  // expected-error@+1 {{failed to verify that all of {matrix_desc, data} have same element type}}
+  xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16>, vector<16x16xf32>
+  return
+}
+
+// -----
+func.func @store_matrix_desc_invalid_data_size(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<32x32xf16>) {
+  // expected-error@+1 {{data shape must not exceed matrix desc shape}}
+  xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16>, vector<32x32xf16>
+  return
+}
+
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index c224749031328..cda8f0ac1bb40 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -769,4 +769,33 @@ gpu.func @create_matrix_desc_with_stride() {
   gpu.return
 }
 
+// CHECK: gpu.func @load_matrix_desc([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>)
+gpu.func @load_matrix_desc(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+  // CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16> -> vector<8x16xf16>
+  %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> vector<8x16xf16>
+  gpu.return
+}
+
+// CHECK: gpu.func @load_matrix_desc_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>)
+gpu.func @load_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>) {
+  // CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> vector<8x16xf16>
+  %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> vector<8x16xf16>
+  gpu.return
+}
+
+
+// CHECK: gpu.func @store_matrix_desc([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>, [[ARG1:%.+]]: vector<16x16xf16>)
+gpu.func @store_matrix_desc(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<16x16xf16>) {
+  // CHECK: xegpu.store_matrix [[ARG0]][8, 8], [[ARG1]] : !xegpu.matrix_desc<16x64xf16>, vector<16x16xf16>
+  xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16>, vector<16x16xf16>
+  gpu.return
+}
+
+// CHECK: gpu.func @store_matrix_desc_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, [[ARG1:%.+]]: vector<16x16xf16>)
+gpu.func @store_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, %arg1: vector<16x16xf16>) {
+  // CHECK: xegpu.store_matrix [[ARG0]][8, 8], [[ARG1]] : !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, vector<16x16xf16>
+  xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, vector<16x16xf16>
+  gpu.return
+}
+
 }

From 06eec6e51b755cbb13b62cfaa3ba2320e8bc3cb6 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen@intel.com>
Date: Tue, 12 Aug 2025 20:33:56 +0000
Subject: [PATCH 05/14] refine description

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 42 ++++++++-----------
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       |  5 ++-
 2 files changed, 20 insertions(+), 27 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 461df6efb8528..f536650e9d872 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1118,10 +1118,14 @@ def XeGPU_CreateMatrixDescOp: XeGPU_Op<"create_matrix_desc", [Pure,
       AllMemSizesMatch<["source", "matrix_desc"]>]>  {
   let summary = "Create a matrix descriptor.";
   let description = [{
+    Creates a matrix descriptor from a shared local memory (SLM) buffer.
+    The resulting matrix descriptor has to have the same size as the underlying
+    shared local memory.
+
     Arguments:
-     - `source` : a base address of SLM allocation.
+     - `source` : a 1D statically shaped memref with element type i8, representing the raw SLM buffer.
     Results:
-     - `matrix_desc` : a descriptor for SLM allocation.
+     - `matrix_desc` : the matrix descriptor.
   }];
   let arguments = (ins StaticShared1DMemRefOf<[I8]>:$source);
   let results = (outs XeGPU_MatrixDesc:$matrix_desc);
@@ -1141,22 +1145,16 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
     $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
     prop-dict attr-dict `` `:` type(operands) `->` type(results)
   }];
-  let summary = "Load matrix from SLM.";
-  let description = [{
-    This operation loads a matrix from the SLM using the matrix descriptor.
-    There are additional parameters and attributes that support loading, but they must only
-    be specified for a work-item level operation.
 
-    General rules:
-    1. Non-WI-level code must not specify optional attributes.
-    2. If the load uses `vector` semantics, all of the vector attributes must be specified.
-    3. If the load uses `array` semantics, all of the array attributes must be specified.
+  let description = [{
+    This operation reads a block of data from shared local memory (SLM)
+    using the provided matrix descriptor.
 
     Arguments:
-     - `matrix_desc` : a matrix descriptor (SLM allocation + matrix type).
-     - `offsets`     : Coordinates of the matrix to load.
+     - `matrix_desc`: the matrix descriptor identifying the SLM region.
+     - `offsets`: the coordinates within the matrix to read from.
     Results:
-      - `res` : loaded matrix elements.
+     - `res`: the matrix elements loaded from SLM.
   }];
 
   let builders = [
@@ -1186,20 +1184,14 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
     $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets) `,` $data
     prop-dict attr-dict `:` type(operands)
   }];
-  let summary = "Store matrix from SLM.";
   let description = [{
-    This operation stores workitem's `data` fragment of the matrix to the SLM (`matrix_desc`).
-    There are additional parameters and attributes that support loading, but they must only
-    be specified for a work-item level operation.
-
-    General rules:
-    1. Non-WI-level code must not specify optional attributes.
-    2. If the store uses `vector` semantics, all of the vector attributes must be specified.
+    This operation writes the `data` fragment into the shared local memory region
+    identified by `matrix_desc`.
 
     Arguments:
-     - `matrix_desc` : a matrix descriptor.
-     - `data`        : data to be stored to the matrix.
-     - `offsets`     : Coordinates of the matrix where the data will be stored.
+     - `matrix_desc`: the matrix descriptor specifying the SLM region.
+     - `offsets`: the coordinates within the matrix where the data will be written.
+     - `data`: the values to be stored in the matrix.
   }];
   let builders = [
     OpBuilder<(ins "TypedValue<MatrixDescType>": $matrix_desc, "llvm::ArrayRef<OpFoldResult>": $offsets,
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index f578fc8bc0735..02cabce82398b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -204,8 +204,9 @@ def XeGPU_Nbarrier: XeGPUTypeDef<"Nbarrier", "nbarrier", [], "mlir::Type"> {
 def XeGPU_MatrixDesc: XeGPUTypeDef<"MatrixDesc", "matrix_desc", [ShapedTypeInterface], "mlir::Type"> {
   let summary = "MatrixDesc describing the data in SLM";
   let description = [{
-    MatrixDesc describes a SLM region. Unleass specified via the optional layout attribute,
-    the data is stored contiguously in the region in row-major order by default.
+    MatrixDesc represents a block of data stored in shared local memory.
+    By default, unless a layout attribute is provided, the data is stored
+    contiguously in row-major order within the region.
   }];
   let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
                         "mlir::Type": $elementType,

From 6df4291c7fcecccc233f0b9ffea67e5edaef5d9b Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen@intel.com>
Date: Wed, 13 Aug 2025 00:02:35 +0000
Subject: [PATCH 06/14] add subview op

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 31 ++++++++++++++++++
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 32 ++++++++++++++++---
 mlir/test/Dialect/XeGPU/invalid.mlir          | 20 ++++++++++++
 mlir/test/Dialect/XeGPU/ops.mlir              | 14 ++++++++
 4 files changed, 93 insertions(+), 4 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index f536650e9d872..0c8980bb04b2e 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1206,5 +1206,36 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
   let hasVerifier = 1;
 }
 
+def XeGPU_MatrixDescSubviewOp: XeGPU_Op<"matrix_desc_subview", [Pure, ViewLikeOpInterface,
+                                                                AllElementTypesMatch<["src", "res"]>,
+                                                                AllRanksMatch<["src", "res"]>]> {
+  let description = [{
+    Create a subview of a matrix descriptor.
+    Results:
+     - `src` : a matrix descriptor.
+     - `offsets` : the coordinates within the matrix the subview will be created from.
+  }];
+  let arguments = (ins XeGPU_MatrixDesc:$src,
+                       Variadic<Index>:$offsets,
+                       DenseI64ArrayAttr:$const_offsets,
+                       OptionalAttr<LayoutTrait>: $layout);
+  let results = (outs XeGPU_MatrixDesc:$res);
+  let assemblyFormat = [{$src `` custom<DynamicIndexList>($offsets, $const_offsets) prop-dict
+                         attr-dict `` `:` qualified(type($src)) `->` qualified(type($res))}];
+  let builders = [
+    OpBuilder<(ins "Type": $res, "Value":$src, "llvm::ArrayRef<OpFoldResult>": $offsets, "LayoutTrait": $layout)>
+  ];
+
+  let extraClassDeclaration = [{
+    mlir::Value getViewSource() { return getSrc(); }
+
+    SmallVector<OpFoldResult> getMixedOffsets() {
+      return getMixedValues(getConstOffsets(), getOffsets(), getContext());
+    }
+  }];
+
+  let hasVerifier = 1;
+}
+
 
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 2051d7030340e..a8ec058a12a93 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -944,10 +944,8 @@ void LoadMatrixOp::build(OpBuilder &builder, OperationState &state, Type res,
                          LayoutTrait layout) {
   llvm::SmallVector<Value> dynamicOffsets;
   llvm::SmallVector<int64_t> staticOffsets;
-
   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
   auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
-
   build(builder, state, res, matrixDesc, dynamicOffsets, staticOffsetsAttr,
         layout);
 }
@@ -970,10 +968,8 @@ void StoreMatrixOp::build(OpBuilder &builder, OperationState &state,
                           LayoutTrait layout) {
   llvm::SmallVector<Value> dynamicOffsets;
   llvm::SmallVector<int64_t> staticOffsets;
-
   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
   auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
-
   build(builder, state, matrixDesc, dynamicOffsets, staticOffsetsAttr, data,
         layout);
 }
@@ -988,6 +984,34 @@ LogicalResult StoreMatrixOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// XeGPU_MatrixDescSubviewOp
+//===----------------------------------------------------------------------===//
+
+void MatrixDescSubviewOp::build(OpBuilder &builder, OperationState &state,
+                                Type resTy, Value src,
+                                llvm::ArrayRef<OpFoldResult> offsets,
+                                LayoutTrait layout) {
+  llvm::SmallVector<Value> dynamicOffsets;
+  llvm::SmallVector<int64_t> staticOffsets;
+  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+  auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
+  build(builder, state, resTy, src, dynamicOffsets, staticOffsetsAttr, layout);
+}
+
+LogicalResult MatrixDescSubviewOp::verify() {
+  ArrayRef<int64_t> srcShape = getSrc().getType().getShape();
+  ArrayRef<int64_t> resShape = getRes().getType().getShape();
+  if (llvm::any_of(llvm::zip_equal(resShape, srcShape),
+                   [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
+    return emitOpError("result shape must not exceed source shape.");
+
+  if (getSrc().getType().getLayout() != getRes().getType().getLayout())
+    return emitOpError("result must inherit the source layout.");
+
+  return success();
+}
+
 } // namespace xegpu
 } // namespace mlir
 
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 2feb010d343a8..63945dab1ccc2 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -806,3 +806,23 @@ func.func @store_matrix_desc_invalid_data_size(%arg0: !xegpu.matrix_desc<16x64xf
   return
 }
 
+// -----
+func.func @matrix_desc_subview_size_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+  // expected-error@+1 {{result shape must not exceed source shape}}
+  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<32x16xf16>
+  return
+}
+
+// -----
+func.func @matrix_desc_subview_layout_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>) {
+  // expected-error@+1 {{result must inherit the source layout}}
+  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> !xegpu.matrix_desc<8x16xf16>
+  return
+}
+
+// -----
+func.func @matrix_desc_subview_rank_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+  // expected-error@+1 {{failed to verify that all of {src, res} have same element type}}
+  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf32>
+  return
+}
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index cda8f0ac1bb40..7bceda70dea9f 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -798,4 +798,18 @@ gpu.func @store_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, str
   gpu.return
 }
 
+// CHECK: gpu.func @matrix_desc_subview([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>)
+gpu.func @matrix_desc_subview(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+  //CHECK: xegpu.matrix_desc_subview [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf16>
+  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf16>
+  gpu.return
+}
+
+// CHECK: gpu.func @matrix_desc_subview_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>)
+gpu.func @matrix_desc_subview_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>) {
+  //CHECK: xegpu.matrix_desc_subview [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> !xegpu.matrix_desc<8x16xf16, strided<[1, 16]>>
+  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> !xegpu.matrix_desc<8x16xf16, strided<[1, 16]>>
+  gpu.return
+}
+
 }

From e11c88db66366d3c61b158959f5418230ce2abbb Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen@intel.com>
Date: Wed, 13 Aug 2025 13:57:59 +0000
Subject: [PATCH 07/14] address comments

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 6 ++++++
 mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt       | 1 +
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp         | 3 +++
 3 files changed, 10 insertions(+)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 0c8980bb04b2e..6d06464e204a6 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1153,6 +1153,9 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
     Arguments:
      - `matrix_desc`: the matrix descriptor identifying the SLM region.
      - `offsets`: the coordinates within the matrix to read from.
+     - `layout`: [optional] An attribute for guiding distributions among
+                 subgroups and/or work-items. It currently can accept either
+                 LayoutAttr or SliceAttr.
     Results:
      - `res`: the matrix elements loaded from SLM.
   }];
@@ -1192,6 +1195,9 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
      - `matrix_desc`: the matrix descriptor specifying the SLM region.
      - `offsets`: the coordinates within the matrix where the data will be written.
      - `data`: the values to be stored in the matrix.
+     - `layout`: [optional] An attribute for guiding distributions among
+                 subgroups and/or work-items. It currently can accept either
+                 LayoutAttr or SliceAttr.
   }];
   let builders = [
     OpBuilder<(ins "TypedValue<MatrixDescType>": $matrix_desc, "llvm::ArrayRef<OpFoldResult>": $offsets,
diff --git a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
index 603fb5d237544..7869a28dfed57 100644
--- a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
@@ -18,6 +18,7 @@ add_mlir_dialect_library(MLIRXeGPUDialect
   MLIRArithUtils
   MLIRDialectUtils
   MLIRGPUDialect
+  MLIRXeVMDialect
   MLIRIR
   MLIRViewLikeInterface
   MLIRVectorDialect
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index a8ec058a12a93..1157f21230485 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -8,6 +8,7 @@
 
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
@@ -28,6 +29,8 @@ bool isSharedMemory(const MemRefType &memrefTy) {
     return intAttr.getInt() == 3;
   if (auto memrefSpace = llvm::dyn_cast<MemorySpaceAttr>(attr))
     return memrefSpace.getValue() == MemorySpace::SLM;
+  if (auto xevmSpace = llvm::dyn_cast<xevm::AddrSpaceAttr>(attr))
+    return xevmSpace.getValue() == xevm::AddrSpace::SHARED;
   return gpu::GPUDialect::isWorkgroupMemoryAddressSpace(attr);
 }
 

From 23380a923cd2c2073a66fd31b70c3650869dcf3b Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen@intel.com>
Date: Wed, 13 Aug 2025 14:30:21 +0000
Subject: [PATCH 08/14] update doc

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 6d06464e204a6..112a18f0705ab 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1220,6 +1220,9 @@ def XeGPU_MatrixDescSubviewOp: XeGPU_Op<"matrix_desc_subview", [Pure, ViewLikeOp
     Results:
      - `src` : a matrix descriptor.
      - `offsets` : the coordinates within the matrix the subview will be created from.
+     - `layout`: [optional] An attribute for guiding distributions among
+                 subgroups and/or work-items. It currently can accept either
+                 LayoutAttr or SliceAttr.
   }];
   let arguments = (ins XeGPU_MatrixDesc:$src,
                        Variadic<Index>:$offsets,

From 9e3aa8d6631fe177fd17bfdb9fd48da2ef1d5072 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen@intel.com>
Date: Wed, 13 Aug 2025 21:25:18 +0000
Subject: [PATCH 09/14] remove the layout attribute from the subview op

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 8 ++------
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp         | 5 ++---
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 112a18f0705ab..9ae2eb0c2e178 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1220,19 +1220,15 @@ def XeGPU_MatrixDescSubviewOp: XeGPU_Op<"matrix_desc_subview", [Pure, ViewLikeOp
     Results:
      - `src` : a matrix descriptor.
      - `offsets` : the coordinates within the matrix the subview will be created from.
-     - `layout`: [optional] An attribute for guiding distributions among
-                 subgroups and/or work-items. It currently can accept either
-                 LayoutAttr or SliceAttr.
   }];
   let arguments = (ins XeGPU_MatrixDesc:$src,
                        Variadic<Index>:$offsets,
-                       DenseI64ArrayAttr:$const_offsets,
-                       OptionalAttr<LayoutTrait>: $layout);
+                       DenseI64ArrayAttr:$const_offsets);
   let results = (outs XeGPU_MatrixDesc:$res);
   let assemblyFormat = [{$src `` custom<DynamicIndexList>($offsets, $const_offsets) prop-dict
                          attr-dict `` `:` qualified(type($src)) `->` qualified(type($res))}];
   let builders = [
-    OpBuilder<(ins "Type": $res, "Value":$src, "llvm::ArrayRef<OpFoldResult>": $offsets, "LayoutTrait": $layout)>
+    OpBuilder<(ins "Type": $res, "Value":$src, "llvm::ArrayRef<OpFoldResult>": $offsets)>
   ];
 
   let extraClassDeclaration = [{
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 1157f21230485..27fd6797fed39 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -993,13 +993,12 @@ LogicalResult StoreMatrixOp::verify() {
 
 void MatrixDescSubviewOp::build(OpBuilder &builder, OperationState &state,
                                 Type resTy, Value src,
-                                llvm::ArrayRef<OpFoldResult> offsets,
-                                LayoutTrait layout) {
+                                llvm::ArrayRef<OpFoldResult> offsets) {
   llvm::SmallVector<Value> dynamicOffsets;
   llvm::SmallVector<int64_t> staticOffsets;
   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
   auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
-  build(builder, state, resTy, src, dynamicOffsets, staticOffsetsAttr, layout);
+  build(builder, state, resTy, src, dynamicOffsets, staticOffsetsAttr);
 }
 
 LogicalResult MatrixDescSubviewOp::verify() {

From af2c25f457f4a94a0e304196040c0484718d54ca Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen@intel.com>
Date: Thu, 14 Aug 2025 00:06:01 +0000
Subject: [PATCH 10/14] refine subview op

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 27 ++++++++++---------
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 23 ++++++++++------
 mlir/test/Dialect/XeGPU/invalid.mlir          | 14 +++++++---
 mlir/test/Dialect/XeGPU/ops.mlir              | 15 ++++++++---
 4 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 9ae2eb0c2e178..65f805d1efa93 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1177,16 +1177,14 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
                               AllElementTypesMatch<["matrix_desc", "data"]>,
                               AllRanksMatch<["matrix_desc", "data"]>]> {
   let arguments = (ins
+    XeGPU_ValueType:$data,
     XeGPU_MatrixDesc:$matrix_desc,
     Variadic<Index>: $offsets,
     DenseI64ArrayAttr: $const_offsets,
-    XeGPU_ValueType:$data,
     OptionalAttr<LayoutTrait>:$layout
   );
-  let assemblyFormat = [{
-    $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets) `,` $data
-    prop-dict attr-dict `:` type(operands)
-  }];
+  let assemblyFormat = [{ $data `,` $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
+                          prop-dict attr-dict `` `:` type(operands)}];
   let description = [{
     This operation writes the `data` fragment into the shared local memory region
     identified by `matrix_desc`.
@@ -1200,8 +1198,8 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
                  LayoutAttr or SliceAttr.
   }];
   let builders = [
-    OpBuilder<(ins "TypedValue<MatrixDescType>": $matrix_desc, "llvm::ArrayRef<OpFoldResult>": $offsets,
-                   "Value" : $data, "LayoutTrait": $layout)>,
+    OpBuilder<(ins "Value" : $data, "TypedValue<MatrixDescType>": $matrix_desc,
+                   "llvm::ArrayRef<OpFoldResult>": $offsets, "LayoutTrait": $layout)>,
   ];
   let extraClassDeclaration = [{
     SmallVector<OpFoldResult> getMixedOffsets() {
@@ -1212,14 +1210,19 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
   let hasVerifier = 1;
 }
 
-def XeGPU_MatrixDescSubviewOp: XeGPU_Op<"matrix_desc_subview", [Pure, ViewLikeOpInterface,
-                                                                AllElementTypesMatch<["src", "res"]>,
-                                                                AllRanksMatch<["src", "res"]>]> {
+def XeGPU_MatrixDescSubviewOp: XeGPU_Op<"matrix_desc_subview",
+          [Pure, ViewLikeOpInterface, AllElementTypesMatch<["src", "res"]>]> {
   let description = [{
-    Create a subview of a matrix descriptor.
-    Results:
+    Creates a subview of a matrix descriptor. The resulting matrix descriptor
+    may have a lower rank than the source, in which case the dimensions are left-aligned.
+
+    Arguments:
      - `src` : a matrix descriptor.
      - `offsets` : the coordinates within the matrix the subview will be created from.
+
+    Results:
+     - `res` : a matrix descriptor with a smaller size.
+
   }];
   let arguments = (ins XeGPU_MatrixDesc:$src,
                        Variadic<Index>:$offsets,
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 27fd6797fed39..27a652663190d 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -965,15 +965,15 @@ LogicalResult LoadMatrixOp::verify() {
 //===----------------------------------------------------------------------===//
 // XeGPU_StoreMatrixOp
 //===----------------------------------------------------------------------===//
-void StoreMatrixOp::build(OpBuilder &builder, OperationState &state,
+void StoreMatrixOp::build(OpBuilder &builder, OperationState &state, Value data,
                           TypedValue<MatrixDescType> matrixDesc,
-                          llvm::ArrayRef<OpFoldResult> offsets, Value data,
+                          llvm::ArrayRef<OpFoldResult> offsets,
                           LayoutTrait layout) {
   llvm::SmallVector<Value> dynamicOffsets;
   llvm::SmallVector<int64_t> staticOffsets;
   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
   auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
-  build(builder, state, matrixDesc, dynamicOffsets, staticOffsetsAttr, data,
+  build(builder, state, data, matrixDesc, dynamicOffsets, staticOffsetsAttr,
         layout);
 }
 
@@ -1002,13 +1002,20 @@ void MatrixDescSubviewOp::build(OpBuilder &builder, OperationState &state,
 }
 
 LogicalResult MatrixDescSubviewOp::verify() {
-  ArrayRef<int64_t> srcShape = getSrc().getType().getShape();
-  ArrayRef<int64_t> resShape = getRes().getType().getShape();
-  if (llvm::any_of(llvm::zip_equal(resShape, srcShape),
-                   [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
+  MatrixDescType srcTy = getSrc().getType();
+  MatrixDescType resTy = getRes().getType();
+  ArrayRef<int64_t> srcShape = srcTy.getShape();
+  ArrayRef<int64_t> resShape = resTy.getShape();
+
+  if (srcTy.getRank() < resTy.getRank())
+    return emitOpError("result rank must not exceed source rank.");
+
+  if (llvm::any_of(
+          llvm::zip_equal(resShape, srcShape.take_back(resShape.size())),
+          [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
     return emitOpError("result shape must not exceed source shape.");
 
-  if (getSrc().getType().getLayout() != getRes().getType().getLayout())
+  if (srcTy.getLayout() != resTy.getLayout())
     return emitOpError("result must inherit the source layout.");
 
   return success();
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 63945dab1ccc2..f2df1a3920e23 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -795,14 +795,14 @@ func.func @load_matrix_desc_invalid_result_size(%arg0: !xegpu.matrix_desc<16x64x
 // -----
 func.func @store_matrix_desc_mismatch_element_type(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<16x16xf32>) {
   // expected-error@+1 {{failed to verify that all of {matrix_desc, data} have same element type}}
-  xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16>, vector<16x16xf32>
+  xegpu.store_matrix %arg1, %arg0[8, 8] : vector<16x16xf32>, !xegpu.matrix_desc<16x64xf16>
   return
 }
 
 // -----
 func.func @store_matrix_desc_invalid_data_size(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<32x32xf16>) {
   // expected-error@+1 {{data shape must not exceed matrix desc shape}}
-  xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16>, vector<32x32xf16>
+  xegpu.store_matrix %arg1, %arg0[8, 8] : vector<32x32xf16>, !xegpu.matrix_desc<16x64xf16>
   return
 }
 
@@ -821,8 +821,16 @@ func.func @matrix_desc_subview_layout_mismatch(%arg0: !xegpu.matrix_desc<16x64xf
 }
 
 // -----
-func.func @matrix_desc_subview_rank_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+func.func @matrix_desc_subview_element_type_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16>) {
   // expected-error@+1 {{failed to verify that all of {src, res} have same element type}}
   %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf32>
   return
 }
+
+// -----
+func.func @matrix_desc_subview_rank_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+  // expected-error@+1 {{result rank must not exceed source rank}}
+  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<4x8x16xf16>
+  return
+}
+
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index 7bceda70dea9f..7a9657587070a 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -786,15 +786,15 @@ gpu.func @load_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, stri
 
 // CHECK: gpu.func @store_matrix_desc([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>, [[ARG1:%.+]]: vector<16x16xf16>)
 gpu.func @store_matrix_desc(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<16x16xf16>) {
-  // CHECK: xegpu.store_matrix [[ARG0]][8, 8], [[ARG1]] : !xegpu.matrix_desc<16x64xf16>, vector<16x16xf16>
-  xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16>, vector<16x16xf16>
+  // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][8, 8] : vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16>
+  xegpu.store_matrix %arg1, %arg0[8, 8]: vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16>
   gpu.return
 }
 
 // CHECK: gpu.func @store_matrix_desc_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, [[ARG1:%.+]]: vector<16x16xf16>)
 gpu.func @store_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, %arg1: vector<16x16xf16>) {
-  // CHECK: xegpu.store_matrix [[ARG0]][8, 8], [[ARG1]] : !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, vector<16x16xf16>
-  xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, vector<16x16xf16>
+  // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][8, 8] : vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>
+  xegpu.store_matrix %arg1, %arg0[8, 8]: vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>
   gpu.return
 }
 
@@ -805,6 +805,13 @@ gpu.func @matrix_desc_subview(%arg0: !xegpu.matrix_desc<16x64xf16>) {
   gpu.return
 }
 
+// CHECK: gpu.func @matrix_desc_subview_lower_rank([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>)
+gpu.func @matrix_desc_subview_lower_rank(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+  //CHECK: xegpu.matrix_desc_subview [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<16xf16>
+  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<16xf16>
+  gpu.return
+}
+
 // CHECK: gpu.func @matrix_desc_subview_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>)
 gpu.func @matrix_desc_subview_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>) {
   //CHECK: xegpu.matrix_desc_subview [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> !xegpu.matrix_desc<8x16xf16, strided<[1, 16]>>

From 0531abf5c0164f483d025dd3aa3c39223e4566a4 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen@intel.com>
Date: Thu, 14 Aug 2025 21:00:24 +0000
Subject: [PATCH 11/14] add MemLayoutAttr

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       | 31 +++++++
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       | 18 ++++-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 81 ++++++++++++++++---
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        |  4 +-
 mlir/test/Dialect/XeGPU/invalid.mlir          |  8 +-
 mlir/test/Dialect/XeGPU/ops.mlir              | 36 ++++-----
 6 files changed, 140 insertions(+), 38 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 1f420c13ebae0..59dcbafebc515 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -527,4 +527,35 @@ def XeGPU_RangeAttr : XeGPUAttr<"Range", "range"> {
   let genVerifyDecl = 1;
 }
 
+def XeGPU_MemLayoutAttr : XeGPUAttr<"MemLayout", "mem_layout"> {
+  let summary = [{Specifies memory layouts with named attributes.}];
+
+  let description = [{
+    This attribute stores a collection of named attributes that describe
+    memory layout properties such as stride, block, etc.
+  }];
+
+  let parameters = (ins "DictionaryAttr": $attrs);
+  let hasCustomAssemblyFormat = 1;
+
+  let extraClassDeclaration = [{
+    /// Get a specific attribute by name
+    Attribute getAttr(StringRef name) const {
+      return getAttrs().get(name);
+    }
+
+    /// Check if a specific attribute exists
+    bool hasAttr(StringRef name) const {
+      return getAttrs().contains(name);
+    }
+
+    /// Get the `stride` entry as an ArrayAttr; the result is null when the
+    /// entry is missing or is not an ArrayAttr.
+    ArrayAttr getStrides() const {
+      return getAttrs().getAs<ArrayAttr>("stride");
+    }
+  }];
+
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 02cabce82398b..f027f3f82c9f4 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -210,13 +210,27 @@ def XeGPU_MatrixDesc: XeGPUTypeDef<"MatrixDesc", "matrix_desc", [ShapedTypeInter
   }];
   let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
                         "mlir::Type": $elementType,
-                        OptionalParameter<"mlir::Attribute">: $layout);
+                        OptionalParameter<"MemLayoutAttr">: $mem_layout);
 
   let extraClassDeclaration = [{
     bool hasRank() const { return true; }
 
     MatrixDescType cloneWith(std::optional<llvm::ArrayRef<int64_t>> shape, Type elementType) const {
-      return MatrixDescType::get(getContext(), shape.value_or(getShape()), elementType, getLayout());
+      return MatrixDescType::get(getContext(), shape.value_or(getShape()), elementType, getMemLayout());
+    }
+
+    ArrayAttr getStrides() {
+      // An explicit `stride` entry in the memory layout takes precedence.
+      auto layout = getMemLayout();
+      if (layout && layout.hasAttr("stride"))
+        return layout.getStrides();
+      // Otherwise derive row-major strides: the innermost stride is 1 and
+      // each outer stride is the next stride times the next dimension size.
+      llvm::ArrayRef<int64_t> shape = getShape();
+      SmallVector<int64_t> defaultStrides(shape.size(), 1);
+      for (int64_t i = static_cast<int64_t>(shape.size()) - 2; i >= 0; --i)
+        defaultStrides[i] = defaultStrides[i + 1] * shape[i + 1];
+      return Builder(getContext()).getI64ArrayAttr(defaultStrides);
+    }
   }];
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index ac9e994d4872c..fe5640627114b 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -427,7 +427,7 @@ RangeAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
 // XeGPU_TensorDescType
 //===----------------------------------------------------------------------===//
 
-mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
+mlir::Type TensorDescType::parse(AsmParser &parser) {
   llvm::SmallVector<int64_t> shape;
   mlir::Type elementType;
   mlir::FailureOr<mlir::Attribute> encoding;
@@ -477,7 +477,7 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
       layout.value_or(mlir::Attribute()));
 }
 
-void TensorDescType::print(::mlir::AsmPrinter &printer) const {
+void TensorDescType::print(AsmPrinter &printer) const {
   printer << "<";
 
   auto shape = getShape();
@@ -522,10 +522,10 @@ TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
   return Base::get(context, shape, elementType, attr, layout);
 }
 
-LogicalResult TensorDescType::verify(
-    llvm::function_ref<::mlir::InFlightDiagnostic()> emitError,
-    llvm::ArrayRef<int64_t> shape, mlir::Type elementType,
-    mlir::Attribute encoding, mlir::Attribute layout) {
+LogicalResult
+TensorDescType::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
+                       llvm::ArrayRef<int64_t> shape, mlir::Type elementType,
+                       mlir::Attribute encoding, mlir::Attribute layout) {
   size_t rank = shape.size();
 
   if (rank == 0)
@@ -594,10 +594,10 @@ LogicalResult TensorDescType::verify(
 //===----------------------------------------------------------------------===//
 // XeGPU_MatrixDescType
 //===----------------------------------------------------------------------===//
-mlir::Type MatrixDescType::parse(::mlir::AsmParser &parser) {
+mlir::Type MatrixDescType::parse(AsmParser &parser) {
   llvm::SmallVector<int64_t> shape;
   mlir::Type elementType;
-  mlir::FailureOr<mlir::Attribute> layout;
+  mlir::FailureOr<MemLayoutAttr> layout;
 
   // Parse literal '<'
   if (parser.parseLess())
@@ -617,7 +617,7 @@ mlir::Type MatrixDescType::parse(::mlir::AsmParser &parser) {
 
   // parse optional attributes
   if (mlir::succeeded(parser.parseOptionalComma())) {
-    mlir::Attribute attr;
+    MemLayoutAttr attr;
     ParseResult res = parser.parseAttribute(attr);
     if (mlir::failed(res))
       return {};
@@ -631,22 +631,79 @@ mlir::Type MatrixDescType::parse(::mlir::AsmParser &parser) {
   MLIRContext *ctxt = parser.getContext();
   return MatrixDescType::getChecked(
       [&]() { return parser.emitError(parser.getNameLoc()); }, ctxt, shape,
-      elementType, layout.value_or(mlir::Attribute()));
+      elementType, layout.value_or(MemLayoutAttr()));
 }
 
-void MatrixDescType::print(::mlir::AsmPrinter &printer) const {
+void MatrixDescType::print(AsmPrinter &printer) const {
   printer << "<";
 
   printer.printDimensionList(getShape());
   printer << 'x';
   printer << getElementType();
 
-  if (auto layout = getLayout())
+  if (auto layout = getMemLayout())
     printer << ", " << layout;
 
   printer << ">";
 }
 
+//===----------------------------------------------------------------------===//
+// XeGPU_MatrixDescType
+//===----------------------------------------------------------------------===//
+
+Attribute MemLayoutAttr::parse(AsmParser &parser, Type type) {
+  // Parses `<key = attr, ...>`; yields a null attribute on any failure.
+  auto context = parser.getContext();
+  llvm::SMLoc loc = parser.getCurrentLocation();
+
+  llvm::SmallDenseSet<StringRef> seenKeys;
+  SmallVector<NamedAttribute> attributes;
+  // Parses a single `key = attr` entry, rejecting keys that were seen before.
+  auto parseElt = [&]() -> ParseResult {
+    StringRef nameId;
+    if (failed(parser.parseKeyword(&nameId)))
+      return parser.emitError(loc, "expected valid attribute name");
+
+    if (!seenKeys.insert(nameId).second)
+      return parser.emitError(loc, "duplicate key '")
+             << nameId << "' in mem layout attribute";
+
+    if (failed(parser.parseEqual()))
+      return failure();
+
+    Attribute attr;
+    if (failed(parser.parseAttribute(attr)))
+      return failure();
+    attributes.emplace_back(nameId, attr);
+    return success();
+  };
+
+  // Parse literal '<'
+  if (parser.parseLess())
+    return {};
+
+  if (failed(parser.parseCommaSeparatedList(parseElt)))
+    return {};
+
+  // Parse literal '>'
+  if (parser.parseGreater())
+    return {};
+
+  return parser.getChecked<MemLayoutAttr>(
+      loc, context, DictionaryAttr::get(context, attributes));
+}
+
+void MemLayoutAttr::print(AsmPrinter &printer) const {
+  // Emits `<key = value, key = value, ...>`; an empty dictionary gives `<>`.
+  printer << "<";
+  StringRef sep = "";
+  for (NamedAttribute entry : getAttrs().getValue()) {
+    printer << sep << entry.getName().str() << " = " << entry.getValue();
+    sep = ", ";
+  }
+  printer << ">";
+}
+
 } // namespace xegpu
 } // namespace mlir
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 27a652663190d..4465ecb25d922 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -1015,8 +1015,8 @@ LogicalResult MatrixDescSubviewOp::verify() {
           [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
     return emitOpError("result shape must not exceed source shape.");
 
-  if (srcTy.getLayout() != resTy.getLayout())
-    return emitOpError("result must inherit the source layout.");
+  if (srcTy.getStrides() != resTy.getStrides())
+    return emitOpError("result must inherit the source strides.");
 
   return success();
 }
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index f2df1a3920e23..79495c34abff8 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -814,16 +814,16 @@ func.func @matrix_desc_subview_size_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16
 }
 
 // -----
-func.func @matrix_desc_subview_layout_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>) {
-  // expected-error@+1 {{result must inherit the source layout}}
-  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> !xegpu.matrix_desc<8x16xf16>
+func.func @matrix_desc_subview_layout_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride =[1, 16]>>) {
+  // expected-error@+1 {{result must inherit the source strides}}
+  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride =[1, 16]>> -> !xegpu.matrix_desc<8x16xf16>
   return
 }
 
 // -----
 func.func @matrix_desc_subview_element_type_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16>) {
   // expected-error@+1 {{failed to verify that all of {src, res} have same element type}}
-  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf32>
+  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf32, #xegpu.mem_layout<stride =[64, 1]>>
   return
 }
 
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index 7a9657587070a..edd1fc844abeb 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -763,9 +763,9 @@ gpu.func @create_matrix_desc() {
 // CHECK-LABEL: gpu.func @create_matrix_desc_with_stride({{.*}}) {
 gpu.func @create_matrix_desc_with_stride() {
   //CHECK: [[alloc:%.+]] = memref.alloca() {alignment = 1024 : i64} : memref<2048xi8, 3>
-  //CHECK: [[mdesc:%.+]] = xegpu.create_matrix_desc [[alloc]] : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>
+  //CHECK: [[mdesc:%.+]] = xegpu.create_matrix_desc [[alloc]] : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
   %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 3>
-  %matrix_desc = xegpu.create_matrix_desc %m : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>
+  %matrix_desc = xegpu.create_matrix_desc %m : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
   gpu.return
 }
 
@@ -776,10 +776,10 @@ gpu.func @load_matrix_desc(%arg0: !xegpu.matrix_desc<16x64xf16>) {
   gpu.return
 }
 
-// CHECK: gpu.func @load_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>)
-gpu.func @load_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>) {
-  // CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> vector<8x16xf16>
-  %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> vector<8x16xf16>
+// CHECK: gpu.func @load_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>)
+gpu.func @load_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>) {
+  // CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> vector<8x16xf16>
+  %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> vector<8x16xf16>
   gpu.return
 }
 
@@ -791,31 +791,31 @@ gpu.func @store_matrix_desc(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<
   gpu.return
 }
 
-// CHECK: gpu.func @store_matrix_desc_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, [[ARG1:%.+]]: vector<16x16xf16>)
-gpu.func @store_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, %arg1: vector<16x16xf16>) {
-  // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][8, 8] : vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>
-  xegpu.store_matrix %arg1, %arg0[8, 8]: vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>
+// CHECK: gpu.func @store_matrix_desc_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, [[ARG1:%.+]]: vector<16x16xf16>)
+gpu.func @store_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, %arg1: vector<16x16xf16>) {
+  // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][8, 8] : vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
+  xegpu.store_matrix %arg1, %arg0[8, 8]: vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
   gpu.return
 }
 
 // CHECK: gpu.func @matrix_desc_subview([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>)
 gpu.func @matrix_desc_subview(%arg0: !xegpu.matrix_desc<16x64xf16>) {
-  //CHECK: xegpu.matrix_desc_subview [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf16>
-  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf16>
+  //CHECK: xegpu.matrix_desc_subview [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf16, #xegpu.mem_layout<stride = [64, 1]>>
+  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf16, #xegpu.mem_layout<stride = [64, 1]>>
   gpu.return
 }
 
 // CHECK: gpu.func @matrix_desc_subview_lower_rank([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>)
 gpu.func @matrix_desc_subview_lower_rank(%arg0: !xegpu.matrix_desc<16x64xf16>) {
-  //CHECK: xegpu.matrix_desc_subview [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<16xf16>
-  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<16xf16>
+  //CHECK: xegpu.matrix_desc_subview [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<16xf16, #xegpu.mem_layout<stride = [64, 1]>>
+  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<16xf16, #xegpu.mem_layout<stride = [64, 1]>>
   gpu.return
 }
 
-// CHECK: gpu.func @matrix_desc_subview_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>)
-gpu.func @matrix_desc_subview_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>) {
-  //CHECK: xegpu.matrix_desc_subview [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> !xegpu.matrix_desc<8x16xf16, strided<[1, 16]>>
-  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> !xegpu.matrix_desc<8x16xf16, strided<[1, 16]>>
+// CHECK: gpu.func @matrix_desc_subview_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>)
+gpu.func @matrix_desc_subview_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>) {
+  //CHECK: xegpu.matrix_desc_subview [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> !xegpu.matrix_desc<8x16xf16, #xegpu.mem_layout<stride = [1, 16]>>
+  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> !xegpu.matrix_desc<8x16xf16, #xegpu.mem_layout<stride = [1, 16]>>
   gpu.return
 }
 

From 03850886e6f39c16f364a7d6c377060936023215 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen@intel.com>
Date: Thu, 14 Aug 2025 21:31:14 +0000
Subject: [PATCH 12/14] refine

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td   |  5 +++--
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 13 +++++++++++++
 mlir/test/Dialect/XeGPU/ops.mlir                 |  4 ++--
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 65f805d1efa93..511dadd6c5b38 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1213,8 +1213,9 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
 def XeGPU_MatrixDescSubviewOp: XeGPU_Op<"matrix_desc_subview",
           [Pure, ViewLikeOpInterface, AllElementTypesMatch<["src", "res"]>]> {
   let description = [{
-    Creates a subview of a matrix descriptor. The resulting matrix descriptor
-    may have a lower rank than the source, in which case the dimensions are left-aligned.
+    Creates a subview of a matrix descriptor. The resulting matrix descriptor can have
+    a lower rank than the source; in this case, the result dimensions correspond to the
+    higher-order dimensions of the source matrix descriptor.
 
     Arguments:
      - `src` : a matrix descriptor.
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index f027f3f82c9f4..bf5cd4c5b070e 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -207,6 +207,19 @@ def XeGPU_MatrixDesc: XeGPUTypeDef<"MatrixDesc", "matrix_desc", [ShapedTypeInter
     MatrixDesc represents a block of data stored in shared local memory.
     By default, unless a layout attribute is provided, the data is stored
     contiguously in row-major order within the region.
+
+    Examples:
+    ```mlir
+    // A matrix of data stored in column-major order.
+    !xegpu.matrix_desc<128x128xf16, #xegpu.mem_layout<stride = [1, 128]>>
+
+    // A matrix of data stored in a blocked layout. Elements within the same block
+    // are stored contiguously in memory. Blocks are stored in row-major order.
+    !xegpu.matrix_desc<128x128xf16, #xegpu.mem_layout<block = [8, 8]>>
+
+    // A matrix of data stored in column-major order with blocked layout.
+    !xegpu.matrix_desc<128x128xf16, #xegpu.mem_layout<stride = [1, 128], block = [8, 8]>>
+    ```
   }];
   let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
                         "mlir::Type": $elementType,
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index edd1fc844abeb..7106b2667f5d0 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -793,8 +793,8 @@ gpu.func @store_matrix_desc(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<
 
 // CHECK: gpu.func @store_matrix_desc_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, [[ARG1:%.+]]: vector<16x16xf16>)
 gpu.func @store_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, %arg1: vector<16x16xf16>) {
-  // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][8, 8] : vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
-  xegpu.store_matrix %arg1, %arg0[8, 8]: vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
+  // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][0, 8] : vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
+  xegpu.store_matrix %arg1, %arg0[0, 8]: vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
   gpu.return
 }
 

From f6862faaf9554480cccce37de85061fdff548ddc Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen@intel.com>
Date: Fri, 15 Aug 2025 00:40:11 +0000
Subject: [PATCH 13/14] rename matrix_desc to mem_desc

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 42 +++++------
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       | 16 ++---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 10 +--
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 22 +++---
 .../Transforms/XeGPUWgToSgDistribute.cpp      |  4 +-
 mlir/test/Dialect/XeGPU/invalid.mlir          | 46 ++++++------
 mlir/test/Dialect/XeGPU/ops.mlir              | 72 +++++++++----------
 7 files changed, 106 insertions(+), 106 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 511dadd6c5b38..4e6c5a8b1a820 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1114,8 +1114,8 @@ class AllMemSizesMatch<list<string> names> :
     AllMatchSameOperatorTrait<names, SizeInBits<"_self">.result,
                               "size in bits">;
 
-def XeGPU_CreateMatrixDescOp: XeGPU_Op<"create_matrix_desc", [Pure,
-      AllMemSizesMatch<["source", "matrix_desc"]>]>  {
+def XeGPU_CreateMemDescOp: XeGPU_Op<"create_mem_desc", [Pure,
+      AllMemSizesMatch<["source", "mem_desc"]>]>  {
   let summary = "Create a matrix descriptor.";
   let description = [{
     Creates a matrix descriptor from a shared local memory (SLM) buffer.
@@ -1125,24 +1125,24 @@ def XeGPU_CreateMatrixDescOp: XeGPU_Op<"create_matrix_desc", [Pure,
     Arguments:
      - `source` : a 1D statically shaped memref with element type i8, representing the raw SLM buffer.
     Results:
-     - `matrix_desc` : the matrix descriptor.
+     - `mem_desc` : the matrix descriptor.
   }];
   let arguments = (ins StaticShared1DMemRefOf<[I8]>:$source);
-  let results = (outs XeGPU_MatrixDesc:$matrix_desc);
-  let assemblyFormat = "$source prop-dict attr-dict `` `:` type($source) `->` qualified(type($matrix_desc))";
+  let results = (outs XeGPU_MemDesc:$mem_desc);
+  let assemblyFormat = "$source prop-dict attr-dict `` `:` type($source) `->` qualified(type($mem_desc))";
 }
 
 def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
-                              AllElementTypesMatch<["matrix_desc", "res"]>,
-                              AllRanksMatch<["matrix_desc", "res"]>]>  {
-  let arguments = (ins XeGPU_MatrixDesc:$matrix_desc,
+                              AllElementTypesMatch<["mem_desc", "res"]>,
+                              AllRanksMatch<["mem_desc", "res"]>]>  {
+  let arguments = (ins XeGPU_MemDesc:$mem_desc,
     Variadic<Index>: $offsets,
     DenseI64ArrayAttr: $const_offsets,
     OptionalAttr<LayoutTrait>:$layout
   );
   let results = (outs XeGPU_ValueType:$res);
   let assemblyFormat = [{
-    $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
+    $mem_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
     prop-dict attr-dict `` `:` type(operands) `->` type(results)
   }];
 
@@ -1151,7 +1151,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
     using the provided matrix descriptor.
 
     Arguments:
-     - `matrix_desc`: the matrix descriptor identifying the SLM region.
+     - `mem_desc`: the matrix descriptor identifying the SLM region.
      - `offsets`: the coordinates within the matrix to read from.
      - `layout`: [optional] An attribute for guiding distributions among
                  subgroups and/or work-items. It currently can accept either
@@ -1161,7 +1161,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
   }];
 
   let builders = [
-    OpBuilder<(ins "Type":$res, "TypedValue<MatrixDescType>": $matrix_desc,
+    OpBuilder<(ins "Type":$res, "TypedValue<MemDescType>": $mem_desc,
                     "llvm::ArrayRef<OpFoldResult>": $offsets, "LayoutTrait": $layout)>,
   ];
   let extraClassDeclaration = [{
@@ -1174,23 +1174,23 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
 }
 
 def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
-                              AllElementTypesMatch<["matrix_desc", "data"]>,
-                              AllRanksMatch<["matrix_desc", "data"]>]> {
+                              AllElementTypesMatch<["mem_desc", "data"]>,
+                              AllRanksMatch<["mem_desc", "data"]>]> {
   let arguments = (ins
     XeGPU_ValueType:$data,
-    XeGPU_MatrixDesc:$matrix_desc,
+    XeGPU_MemDesc:$mem_desc,
     Variadic<Index>: $offsets,
     DenseI64ArrayAttr: $const_offsets,
     OptionalAttr<LayoutTrait>:$layout
   );
-  let assemblyFormat = [{ $data `,` $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
+  let assemblyFormat = [{ $data `,` $mem_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
                           prop-dict attr-dict `` `:` type(operands)}];
   let description = [{
     This operation writes the `data` fragment into the shared local memory region
-    identified by `matrix_desc`.
+    identified by `mem_desc`.
 
     Arguments:
-     - `matrix_desc`: the matrix descriptor specifying the SLM region.
+     - `mem_desc`: the matrix descriptor specifying the SLM region.
      - `offsets`: the coordinates within the matrix where the data will be written.
      - `data`: the values to be stored in the matrix.
      - `layout`: [optional] An attribute for guiding distributions among
@@ -1198,7 +1198,7 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
                  LayoutAttr or SliceAttr.
   }];
   let builders = [
-    OpBuilder<(ins "Value" : $data, "TypedValue<MatrixDescType>": $matrix_desc,
+    OpBuilder<(ins "Value" : $data, "TypedValue<MemDescType>": $mem_desc,
                    "llvm::ArrayRef<OpFoldResult>": $offsets, "LayoutTrait": $layout)>,
   ];
   let extraClassDeclaration = [{
@@ -1210,7 +1210,7 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
   let hasVerifier = 1;
 }
 
-def XeGPU_MatrixDescSubviewOp: XeGPU_Op<"matrix_desc_subview",
+def XeGPU_MemDescSubviewOp: XeGPU_Op<"mem_desc_subview",
           [Pure, ViewLikeOpInterface, AllElementTypesMatch<["src", "res"]>]> {
   let description = [{
     Creates a subview of a matrix descriptor. The resulting matrix descriptor can have
@@ -1225,10 +1225,10 @@ def XeGPU_MatrixDescSubviewOp: XeGPU_Op<"matrix_desc_subview",
     - `res` : a matrix descriptor with smaller size.
 
   }];
-  let arguments = (ins XeGPU_MatrixDesc:$src,
+  let arguments = (ins XeGPU_MemDesc:$src,
                        Variadic<Index>:$offsets,
                        DenseI64ArrayAttr:$const_offsets);
-  let results = (outs XeGPU_MatrixDesc:$res);
+  let results = (outs XeGPU_MemDesc:$res);
   let assemblyFormat = [{$src `` custom<DynamicIndexList>($offsets, $const_offsets) prop-dict
                          attr-dict `` `:` qualified(type($src)) `->` qualified(type($res))}];
   let builders = [
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index bf5cd4c5b070e..6602ff94d6ae3 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -201,24 +201,24 @@ def XeGPU_Nbarrier: XeGPUTypeDef<"Nbarrier", "nbarrier", [], "mlir::Type"> {
   }];
 }
 
-def XeGPU_MatrixDesc: XeGPUTypeDef<"MatrixDesc", "matrix_desc", [ShapedTypeInterface], "mlir::Type"> {
-  let summary = "MatrixDesc describing the data in SLM";
+def XeGPU_MemDesc: XeGPUTypeDef<"MemDesc", "mem_desc", [ShapedTypeInterface], "mlir::Type"> {
+  let summary = "MemDesc describing the data in SLM";
   let description = [{
-    MatrixDesc represents a block of data stored in shared local memory.
+    MemDesc represents a block of data stored in shared local memory.
     By default, unless a layout attribute is provided, the data is stored
     contiguously in row-major order within the region.
 
     Examples:
     ```mlir
     // A matrix of data stored in column-major order.
-    !xegpu.matrix_desc<128x128xf16, #xegpu.mem_layout<stride = [1, 128]>>
+    !xegpu.mem_desc<128x128xf16, #xegpu.mem_layout<stride = [1, 128]>>
 
     // A matrix of data stored in a blocked layout. Elements within the same block
     // are stored contiguously in memory. Blocks are stored in row-major order.
-    !xegpu.matrix_desc<128x128xf16, #xegpu.mem_layout<block = [8, 8]>>
+    !xegpu.mem_desc<128x128xf16, #xegpu.mem_layout<block = [8, 8]>>
 
     // A matrix of data stored in column-major order with blocked layout.
-    !xegpu.matrix_desc<128x128xf16, #xegpu.mem_layout<stride = [1, 128], block = [8, 8]>>
+    !xegpu.mem_desc<128x128xf16, #xegpu.mem_layout<stride = [1, 128], block = [8, 8]>>
     ```
   }];
   let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
@@ -228,8 +228,8 @@ def XeGPU_MatrixDesc: XeGPUTypeDef<"MatrixDesc", "matrix_desc", [ShapedTypeInter
   let extraClassDeclaration = [{
     bool hasRank() const { return true; }
 
-    MatrixDescType cloneWith(std::optional<llvm::ArrayRef<int64_t>> shape, Type elementType) const {
-      return MatrixDescType::get(getContext(), shape.value_or(getShape()), elementType, getMemLayout());
+    MemDescType cloneWith(std::optional<llvm::ArrayRef<int64_t>> shape, Type elementType) const {
+      return MemDescType::get(getContext(), shape.value_or(getShape()), elementType, getMemLayout());
     }
 
     ArrayAttr getStrides() {
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index fe5640627114b..1b26542ff65a3 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -592,9 +592,9 @@ TensorDescType::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
 }
 
 //===----------------------------------------------------------------------===//
-// XeGPU_MatrixDescType
+// XeGPU_MemDescType
 //===----------------------------------------------------------------------===//
-mlir::Type MatrixDescType::parse(AsmParser &parser) {
+mlir::Type MemDescType::parse(AsmParser &parser) {
   llvm::SmallVector<int64_t> shape;
   mlir::Type elementType;
   mlir::FailureOr<MemLayoutAttr> layout;
@@ -629,12 +629,12 @@ mlir::Type MatrixDescType::parse(AsmParser &parser) {
     return {};
 
   MLIRContext *ctxt = parser.getContext();
-  return MatrixDescType::getChecked(
+  return MemDescType::getChecked(
       [&]() { return parser.emitError(parser.getNameLoc()); }, ctxt, shape,
       elementType, layout.value_or(MemLayoutAttr()));
 }
 
-void MatrixDescType::print(AsmPrinter &printer) const {
+void MemDescType::print(AsmPrinter &printer) const {
   printer << "<";
 
   printer.printDimensionList(getShape());
@@ -648,7 +648,7 @@ void MatrixDescType::print(AsmPrinter &printer) const {
 }
 
 //===----------------------------------------------------------------------===//
-// XeGPU_MatrixDescType
+// XeGPU_MemLayoutAttr
 //===----------------------------------------------------------------------===//
 
 Attribute MemLayoutAttr::parse(AsmParser &parser, Type type) {
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 4465ecb25d922..1caa37d8353bc 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -942,7 +942,7 @@ void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
 // XeGPU_LoadMatrixOp
 //===----------------------------------------------------------------------===//
 void LoadMatrixOp::build(OpBuilder &builder, OperationState &state, Type res,
-                         TypedValue<MatrixDescType> matrixDesc,
+                         TypedValue<MemDescType> matrixDesc,
                          llvm::ArrayRef<OpFoldResult> offsets,
                          LayoutTrait layout) {
   llvm::SmallVector<Value> dynamicOffsets;
@@ -955,7 +955,7 @@ void LoadMatrixOp::build(OpBuilder &builder, OperationState &state, Type res,
 
 LogicalResult LoadMatrixOp::verify() {
   ArrayRef<int64_t> valueShape = getRes().getType().getShape();
-  ArrayRef<int64_t> mdescShape = getMatrixDesc().getType().getShape();
+  ArrayRef<int64_t> mdescShape = getMemDesc().getType().getShape();
   if (llvm::any_of(llvm::zip_equal(valueShape, mdescShape),
                    [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
     return emitOpError("result shape must not exceed matrix desc shape.");
@@ -966,7 +966,7 @@ LogicalResult LoadMatrixOp::verify() {
 // XeGPU_StoreMatrixOp
 //===----------------------------------------------------------------------===//
 void StoreMatrixOp::build(OpBuilder &builder, OperationState &state, Value data,
-                          TypedValue<MatrixDescType> matrixDesc,
+                          TypedValue<MemDescType> matrixDesc,
                           llvm::ArrayRef<OpFoldResult> offsets,
                           LayoutTrait layout) {
   llvm::SmallVector<Value> dynamicOffsets;
@@ -979,7 +979,7 @@ void StoreMatrixOp::build(OpBuilder &builder, OperationState &state, Value data,
 
 LogicalResult StoreMatrixOp::verify() {
   ArrayRef<int64_t> dataShape = getData().getType().getShape();
-  ArrayRef<int64_t> mdescShape = getMatrixDesc().getType().getShape();
+  ArrayRef<int64_t> mdescShape = getMemDesc().getType().getShape();
   if (llvm::any_of(llvm::zip_equal(dataShape, mdescShape),
                    [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
     return emitOpError("data shape must not exceed matrix desc shape.");
@@ -988,12 +988,12 @@ LogicalResult StoreMatrixOp::verify() {
 }
 
 //===----------------------------------------------------------------------===//
-// XeGPU_MatrixDescSubviewOp
+// XeGPU_MemDescSubviewOp
 //===----------------------------------------------------------------------===//
 
-void MatrixDescSubviewOp::build(OpBuilder &builder, OperationState &state,
-                                Type resTy, Value src,
-                                llvm::ArrayRef<OpFoldResult> offsets) {
+void MemDescSubviewOp::build(OpBuilder &builder, OperationState &state,
+                             Type resTy, Value src,
+                             llvm::ArrayRef<OpFoldResult> offsets) {
   llvm::SmallVector<Value> dynamicOffsets;
   llvm::SmallVector<int64_t> staticOffsets;
   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
@@ -1001,9 +1001,9 @@ void MatrixDescSubviewOp::build(OpBuilder &builder, OperationState &state,
   build(builder, state, resTy, src, dynamicOffsets, staticOffsetsAttr);
 }
 
-LogicalResult MatrixDescSubviewOp::verify() {
-  MatrixDescType srcTy = getSrc().getType();
-  MatrixDescType resTy = getRes().getType();
+LogicalResult MemDescSubviewOp::verify() {
+  MemDescType srcTy = getSrc().getType();
+  MemDescType resTy = getRes().getType();
   ArrayRef<int64_t> srcShape = srcTy.getShape();
   ArrayRef<int64_t> resShape = resTy.getShape();
 
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 4a5525c8abb30..5d5d698c88cba 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -475,8 +475,8 @@ struct WgToSgElementwiseOp : public ConversionPattern {
 // is lowered to:
 //   #a = #xegpu.layout<inst_data = [16, 16]>
 //   #b = #xegpu.layout<inst_data = [8, 16]>
-//   store_matrix %1, %slm <{layout_input_0 = #a}> : vector<32x16>, matrix_desc<32x64xf32>
-//   %d = load_matrix %slm <{layout_result_0 = #a}> : matrix_desc<32x64xf32> -> vector<16x32xf32>
+//   store_matrix %1, %slm <{layout_input_0 = #a}> : vector<32x16>, mem_desc<32x64xf32>
+//   %d = load_matrix %slm <{layout_result_0 = #a}> : mem_desc<32x64xf32> -> vector<16x32xf32>
 //   xegpu.convert_layout %d <{input_layout = #a, target_layout = #b}> : vector<16x32xf32>
 // clang-format on
 struct WgToSgConvertLayoutOp
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 79495c34abff8..e8ef57ca192a9 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -763,74 +763,74 @@ func.func @slice_attr_repeat_dim() {
 }
 
 // -----
-func.func @create_matrix_desc_non_slm() {
+func.func @create_mem_desc_non_slm() {
   %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 1>
   // expected-error@+1 {{operand #0 must be statically shaped memref of 8-bit signless integer values for shared memory}}
-  %matrix_desc = xegpu.create_matrix_desc %m : memref<2048xi8, 1> -> !xegpu.matrix_desc<16x64xf16>
+  %mem_desc = xegpu.create_mem_desc %m : memref<2048xi8, 1> -> !xegpu.mem_desc<16x64xf16>
   return
 }
 
 // -----
-func.func @create_matrix_desc_mismatch_sizes() {
+func.func @create_mem_desc_mismatch_sizes() {
   %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 3>
-  // expected-error@+1 {{failed to verify that all of {source, matrix_desc} have same size in bits}}
-  %matrix_desc = xegpu.create_matrix_desc %m : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x32xf16>
+  // expected-error@+1 {{failed to verify that all of {source, mem_desc} have same size in bits}}
+  %mem_desc = xegpu.create_mem_desc %m : memref<2048xi8, 3> -> !xegpu.mem_desc<16x32xf16>
   return
 }
 
 // -----
-func.func @load_matrix_desc_mismatch_element_type(%arg0: !xegpu.matrix_desc<16x64xf16>) {
-  // expected-error@+1 {{failed to verify that all of {matrix_desc, res} have same element type}}
-  %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> vector<8x16xf32>
+func.func @load_mem_desc_mismatch_element_type(%arg0: !xegpu.mem_desc<16x64xf16>) {
+  // expected-error@+1 {{failed to verify that all of {mem_desc, res} have same element type}}
+  %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> vector<8x16xf32>
   return
 }
 
 // -----
-func.func @load_matrix_desc_invalid_result_size(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+func.func @load_mem_desc_invalid_result_size(%arg0: !xegpu.mem_desc<16x64xf16>) {
   // expected-error@+1 {{result shape must not exceed matrix desc shape}}
-  %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> vector<32x16xf16>
+  %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> vector<32x16xf16>
   return
 }
 
 // -----
-func.func @store_matrix_desc_mismatch_element_type(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<16x16xf32>) {
-  // expected-error@+1 {{failed to verify that all of {matrix_desc, data} have same element type}}
-  xegpu.store_matrix %arg1, %arg0[8, 8] : vector<16x16xf32>, !xegpu.matrix_desc<16x64xf16>
+func.func @store_mem_desc_mismatch_element_type(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<16x16xf32>) {
+  // expected-error@+1 {{failed to verify that all of {mem_desc, data} have same element type}}
+  xegpu.store_matrix %arg1, %arg0[8, 8] : vector<16x16xf32>, !xegpu.mem_desc<16x64xf16>
   return
 }
 
 // -----
-func.func @store_matrix_desc_invalid_data_size(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<32x32xf16>) {
+func.func @store_mem_desc_invalid_data_size(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<32x32xf16>) {
   // expected-error@+1 {{data shape must not exceed matrix desc shape}}
-  xegpu.store_matrix %arg1, %arg0[8, 8] : vector<32x32xf16>, !xegpu.matrix_desc<16x64xf16>
+  xegpu.store_matrix %arg1, %arg0[8, 8] : vector<32x32xf16>, !xegpu.mem_desc<16x64xf16>
   return
 }
 
 // -----
-func.func @matrix_desc_subview_size_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+func.func @mem_desc_subview_size_mismatch(%arg0: !xegpu.mem_desc<16x64xf16>) {
   // expected-error@+1 {{result shape must not exceed source shape}}
-  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<32x16xf16>
+  %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<32x16xf16>
   return
 }
 
 // -----
-func.func @matrix_desc_subview_layout_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride =[1, 16]>>) {
+func.func @mem_desc_subview_layout_mismatch(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride =[1, 16]>>) {
   // expected-error@+1 {{result must inherit the source strides}}
-  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride =[1, 16]>> -> !xegpu.matrix_desc<8x16xf16>
+  %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride =[1, 16]>> -> !xegpu.mem_desc<8x16xf16>
   return
 }
 
 // -----
-func.func @matrix_desc_subview_element_type_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+func.func @mem_desc_subview_element_type_mismatch(%arg0: !xegpu.mem_desc<16x64xf16>) {
   // expected-error@+1 {{failed to verify that all of {src, res} have same element type}}
-  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf32, #xegpu.mem_layout<stride =[64, 1]>>
+  %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<8x16xf32, #xegpu.mem_layout<stride =[64, 1]>>
   return
 }
 
 // -----
-func.func @matrix_desc_subview_rank_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+func.func @mem_desc_subview_rank_mismatch(%arg0: !xegpu.mem_desc<16x64xf16>) {
   // expected-error@+1 {{result rank must not exceed source rank}}
-  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<4x8x16xf16>
+  %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<4x8x16xf16>
   return
 }
 
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index 7106b2667f5d0..35342eca1354c 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -751,71 +751,71 @@ gpu.func @fence() {
   gpu.return
 }
 
-// CHECK-LABEL: gpu.func @create_matrix_desc({{.*}}) {
-gpu.func @create_matrix_desc() {
+// CHECK-LABEL: gpu.func @create_mem_desc({{.*}}) {
+gpu.func @create_mem_desc() {
   //CHECK: [[alloc:%.+]] = memref.alloca() {alignment = 1024 : i64} : memref<2048xi8, 3>
-  //CHECK: [[mdesc:%.+]] = xegpu.create_matrix_desc [[alloc]] : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16>
+  //CHECK: [[mdesc:%.+]] = xegpu.create_mem_desc [[alloc]] : memref<2048xi8, 3> -> !xegpu.mem_desc<16x64xf16>
   %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 3>
-  %matrix_desc = xegpu.create_matrix_desc %m : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16>
+  %mem_desc = xegpu.create_mem_desc %m : memref<2048xi8, 3> -> !xegpu.mem_desc<16x64xf16>
   gpu.return
 }
 
-// CHECK-LABEL: gpu.func @create_matrix_desc_with_stride({{.*}}) {
-gpu.func @create_matrix_desc_with_stride() {
+// CHECK-LABEL: gpu.func @create_mem_desc_with_stride({{.*}}) {
+gpu.func @create_mem_desc_with_stride() {
   //CHECK: [[alloc:%.+]] = memref.alloca() {alignment = 1024 : i64} : memref<2048xi8, 3>
-  //CHECK: [[mdesc:%.+]] = xegpu.create_matrix_desc [[alloc]] : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
+  //CHECK: [[mdesc:%.+]] = xegpu.create_mem_desc [[alloc]] : memref<2048xi8, 3> -> !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
   %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 3>
-  %matrix_desc = xegpu.create_matrix_desc %m : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
+  %mem_desc = xegpu.create_mem_desc %m : memref<2048xi8, 3> -> !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
   gpu.return
 }
 
-// CHECK: gpu.func @load_matrix_desc([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>)
-gpu.func @load_matrix_desc(%arg0: !xegpu.matrix_desc<16x64xf16>) {
-  // CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16> -> vector<8x16xf16>
-  %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> vector<8x16xf16>
+// CHECK: gpu.func @load_mem_desc([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>)
+gpu.func @load_mem_desc(%arg0: !xegpu.mem_desc<16x64xf16>) {
+  // CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16> -> vector<8x16xf16>
+  %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> vector<8x16xf16>
   gpu.return
 }
 
-// CHECK: gpu.func @load_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>)
-gpu.func @load_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>) {
-  // CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> vector<8x16xf16>
-  %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> vector<8x16xf16>
+// CHECK: gpu.func @load_mem_desc_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>)
+gpu.func @load_mem_desc_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>) {
+  // CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> vector<8x16xf16>
+  %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> vector<8x16xf16>
   gpu.return
 }
 
 
-// CHECK: gpu.func @store_matrix_desc([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>, [[ARG1:%.+]]: vector<16x16xf16>)
-gpu.func @store_matrix_desc(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<16x16xf16>) {
-  // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][8, 8] : vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16>
-  xegpu.store_matrix %arg1, %arg0[8, 8]: vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16>
+// CHECK: gpu.func @store_mem_desc([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>, [[ARG1:%.+]]: vector<16x16xf16>)
+gpu.func @store_mem_desc(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<16x16xf16>) {
+  // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][8, 8] : vector<16x16xf16>, !xegpu.mem_desc<16x64xf16>
+  xegpu.store_matrix %arg1, %arg0[8, 8]: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16>
   gpu.return
 }
 
-// CHECK: gpu.func @store_matrix_desc_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, [[ARG1:%.+]]: vector<16x16xf16>)
-gpu.func @store_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, %arg1: vector<16x16xf16>) {
-  // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][0, 8] : vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
-  xegpu.store_matrix %arg1, %arg0[0, 8]: vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
+// CHECK: gpu.func @store_mem_desc_with_stride([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, [[ARG1:%.+]]: vector<16x16xf16>)
+gpu.func @store_mem_desc_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, %arg1: vector<16x16xf16>) {
+  // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][0, 8] : vector<16x16xf16>, !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
+  xegpu.store_matrix %arg1, %arg0[0, 8]: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
   gpu.return
 }
 
-// CHECK: gpu.func @matrix_desc_subview([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>)
-gpu.func @matrix_desc_subview(%arg0: !xegpu.matrix_desc<16x64xf16>) {
-  //CHECK: xegpu.matrix_desc_subview [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf16, #xegpu.mem_layout<stride = [64, 1]>>
-  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf16, #xegpu.mem_layout<stride = [64, 1]>>
+// CHECK: gpu.func @mem_desc_subview([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>)
+gpu.func @mem_desc_subview(%arg0: !xegpu.mem_desc<16x64xf16>) {
+  //CHECK: xegpu.mem_desc_subview [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<8x16xf16, #xegpu.mem_layout<stride = [64, 1]>>
+  %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<8x16xf16, #xegpu.mem_layout<stride = [64, 1]>>
   gpu.return
 }
 
-// CHECK: gpu.func @matrix_desc_subview_lower_rank([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>)
-gpu.func @matrix_desc_subview_lower_rank(%arg0: !xegpu.matrix_desc<16x64xf16>) {
-  //CHECK: xegpu.matrix_desc_subview [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<16xf16, #xegpu.mem_layout<stride = [64, 1]>>
-  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<16xf16, #xegpu.mem_layout<stride = [64, 1]>>
+// CHECK: gpu.func @mem_desc_subview_lower_rank([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>)
+gpu.func @mem_desc_subview_lower_rank(%arg0: !xegpu.mem_desc<16x64xf16>) {
+  //CHECK: xegpu.mem_desc_subview [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<16xf16, #xegpu.mem_layout<stride = [64, 1]>>
+  %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<16xf16, #xegpu.mem_layout<stride = [64, 1]>>
   gpu.return
 }
 
-// CHECK: gpu.func @matrix_desc_subview_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>)
-gpu.func @matrix_desc_subview_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>) {
-  //CHECK: xegpu.matrix_desc_subview [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> !xegpu.matrix_desc<8x16xf16, #xegpu.mem_layout<stride = [1, 16]>>
-  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> !xegpu.matrix_desc<8x16xf16, #xegpu.mem_layout<stride = [1, 16]>>
+// CHECK: gpu.func @mem_desc_subview_with_stride([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>)
+gpu.func @mem_desc_subview_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>) {
+  //CHECK: xegpu.mem_desc_subview [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> !xegpu.mem_desc<8x16xf16, #xegpu.mem_layout<stride = [1, 16]>>
+  %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> !xegpu.mem_desc<8x16xf16, #xegpu.mem_layout<stride = [1, 16]>>
   gpu.return
 }
 

From 552c8716df510b0a27eddcc3fb924d1c7d20f474 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen@intel.com>
Date: Fri, 15 Aug 2025 01:00:11 +0000
Subject: [PATCH 14/14] update docs

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 24 +++++++++----------
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       |  6 ++---
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 4e6c5a8b1a820..d5e2db0f7551d 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1116,16 +1116,16 @@ class AllMemSizesMatch<list<string> names> :
 
 def XeGPU_CreateMemDescOp: XeGPU_Op<"create_mem_desc", [Pure,
       AllMemSizesMatch<["source", "mem_desc"]>]>  {
-  let summary = "Create a matrix descriptor.";
+  let summary = "Create a memory descriptor.";
   let description = [{
-    Creates a matrix descriptor from a shared local memory (SLM) buffer.
-    The resulting matrix descriptor has to have the same size as the underlying
-    shared local memory.
+    Creates a memory descriptor from a shared local memory (SLM) buffer and an
+    XeGPU-specific memory layout. The resulting memory descriptor must have the
+    same size as the underlying shared local memory.
 
     Arguments:
      - `source` : a 1D statically shaped memref with element type i8, representing the raw SLM buffer.
     Results:
-     - `mem_desc` : the matrix descriptor.
+     - `mem_desc` : the memory descriptor.
   }];
   let arguments = (ins StaticShared1DMemRefOf<[I8]>:$source);
   let results = (outs XeGPU_MemDesc:$mem_desc);
@@ -1148,10 +1148,10 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
 
   let description = [{
     This operation reads a block of data from shared local memory (SLM)
-    using the provided matrix descriptor.
+    using the provided memory descriptor.
 
     Arguments:
-     - `mem_desc`: the matrix descriptor identifying the SLM region.
+     - `mem_desc`: the memory descriptor identifying the SLM region.
      - `offsets`: the coordinates within the matrix to read from.
      - `layout`: [optional] An attribute for guiding distributions among
                  subgroups and/or work-items. It currently can accept either
@@ -1190,7 +1190,7 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
     identified by `mem_desc`.
 
     Arguments:
-     - `mem_desc`: the matrix descriptor specifying the SLM region.
+     - `mem_desc`: the memory descriptor specifying the SLM region.
      - `offsets`: the coordinates within the matrix where the data will be written.
      - `data`: the values to be stored in the matrix.
      - `layout`: [optional] An attribute for guiding distributions among
@@ -1213,16 +1213,16 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
 def XeGPU_MemDescSubviewOp: XeGPU_Op<"mem_desc_subview",
           [Pure, ViewLikeOpInterface, AllElementTypesMatch<["src", "res"]>]> {
   let description = [{
-    Creates a subview of a matrix descriptor. The resulting matrix descriptor can have
+    Creates a subview of a memory descriptor. The resulting memory descriptor can have
     a lower rank than the source; in this case, the result dimensions correspond to the
-    higher-order dimensions of the source matrix descriptor.
+    higher-order dimensions of the source memory descriptor.
 
     Arguments:
-     - `src` : a matrix descriptor.
+     - `src` : a memory descriptor.
      - `offsets` : the coordinates within the matrix the subview will be created from.
 
     Results:
-    - `res` : a matrix descriptor with smaller size.
+    - `res` : a memory descriptor with a smaller size.
 
   }];
   let arguments = (ins XeGPU_MemDesc:$src,
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 6602ff94d6ae3..a4411ec8620da 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -210,14 +210,14 @@ def XeGPU_MemDesc: XeGPUTypeDef<"MemDesc", "mem_desc", [ShapedTypeInterface], "m
 
     Examples:
     ```mlir
-    // A matrix of data stored in column-major order.
+    // A block of data stored in column-major order.
     !xegpu.mem_desc<128x128xf16, #xegpu.mem_layout<stride = [1, 128]>>
 
-    // A matrix of data stored in a blocked layout. Elements within the same block
+    // A block of data stored in a blocked layout. Elements within the same block
     // are stored contiguously in memory. Blocks are stored in row-major order.
     !xegpu.mem_desc<128x128xf16, #xegpu.mem_layout<block = [8, 8]>>
 
-    // A matrix of data stored in column-major order with blocked layout.
+    // A block of data stored in column-major order with blocked layout.
     !xegpu.mem_desc<128x128xf16, #xegpu.mem_layout<stride = [1, 128], block = [8, 8]>>
     ```
   }];