
[LLVM Pulldown] Bump to LLVM rev 92164faf17d553359418b9f49c1a41d680d0… #1104


Merged: 2 commits, Aug 13, 2025
2 changes: 1 addition & 1 deletion build_tools/llvm_version.txt
@@ -1 +1 @@
228e96b28a84828e1720c387a339a7e68dbdc029
92164faf17d553359418b9f49c1a41d680d0de49

Large diffs are not rendered by default.

@@ -1,37 +1,37 @@
From 89e527e48b727a1479aa47fdbe3d2d178d8969a7 Mon Sep 17 00:00:00 2001
From 5900db1c91d40157c2724d324ea65e22936e3354 Mon Sep 17 00:00:00 2001
From: Garra1980 <[email protected]>
Date: Mon, 4 Aug 2025 17:50:56 +0200
Subject: [PATCH] Add serilialization and deserialization for spirv
Date: Tue, 12 Aug 2025 23:41:51 +0200
Subject: [PATCH] Add serialization and de-serialization support for spirv

---
mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp | 6 ++++++
mlir/lib/Target/SPIRV/Serialization/Serializer.cpp | 6 ++++++
2 files changed, 12 insertions(+)

diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
index 88931b53a688..f1c22d09cc8e 100644
index d8c54ec5f88c..3b539382dedd 100644
--- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
+++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
@@ -282,6 +282,7 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef<uint32_t> words) {
@@ -283,6 +283,7 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef<uint32_t> words) {
symbol, FPRoundingModeAttr::get(opBuilder.getContext(),
static_cast<FPRoundingMode>(words[2])));
break;
+ case spirv::Decoration::Alignment:
case spirv::Decoration::DescriptorSet:
case spirv::Decoration::Binding:
if (words.size() != 3) {
@@ -343,6 +344,10 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef<uint32_t> words) {
case spirv::Decoration::RestrictPointer:
case spirv::Decoration::NoContraction:
@@ -346,6 +347,10 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef<uint32_t> words) {
case spirv::Decoration::Constant:
case spirv::Decoration::Invariant:
case spirv::Decoration::Patch:
+ case spirv::Decoration::SingleElementVectorINTEL:
+ case spirv::Decoration::VectorComputeCallableFunctionINTEL:
+ case spirv::Decoration::VectorComputeFunctionINTEL:
+ case spirv::Decoration::VectorComputeVariableINTEL:
if (words.size() != 2) {
return emitError(unknownLoc, "OpDecoration with ")
<< decorationName << "needs a single target <id>";
@@ -351,6 +356,7 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef<uint32_t> words) {
@@ -354,6 +359,7 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef<uint32_t> words) {
break;
case spirv::Decoration::Location:
case spirv::Decoration::SpecId:
@@ -40,10 +40,10 @@ index 88931b53a688..f1c22d09cc8e 100644
return emitError(unknownLoc, "OpDecoration with ")
<< decorationName << "needs a single integer literal";
diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
index 737f29662f64..cd925b02b6a6 100644
index 7c007de31558..3aa26ab923a9 100644
--- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
+++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
@@ -283,8 +283,10 @@ LogicalResult Serializer::processDecorationAttr(Location loc, uint32_t resultID,
@@ -302,8 +302,10 @@ LogicalResult Serializer::processDecorationAttr(Location loc, uint32_t resultID,
}
return emitError(loc, "expected FPRoundingModeAttr attribute for ")
<< stringifyDecoration(decoration);
@@ -54,17 +54,16 @@ index 737f29662f64..cd925b02b6a6 100644
case spirv::Decoration::Location:
if (auto intAttr = dyn_cast<IntegerAttr>(attr)) {
args.push_back(intAttr.getValue().getZExtValue());
@@ -318,6 +320,10 @@ LogicalResult Serializer::processDecorationAttr(Location loc, uint32_t resultID,
case spirv::Decoration::RestrictPointer:
case spirv::Decoration::NoContraction:
case spirv::Decoration::Constant:
@@ -340,6 +342,10 @@ LogicalResult Serializer::processDecorationAttr(Location loc, uint32_t resultID,
case spirv::Decoration::Block:
case spirv::Decoration::Invariant:
case spirv::Decoration::Patch:
+ case spirv::Decoration::SingleElementVectorINTEL:
+ case spirv::Decoration::VectorComputeCallableFunctionINTEL:
+ case spirv::Decoration::VectorComputeFunctionINTEL:
+ case spirv::Decoration::VectorComputeVariableINTEL:
case spirv::Decoration::Block:
// For unit attributes and decoration attributes, the args list
// has no values so we do nothing.
--
if (isa<UnitAttr, DecorationAttr>(attr))
--
2.34.1

@@ -14,7 +14,7 @@ index 7f4d4f1381df..ebd4f1a3f66a 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -373,6 +373,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
OptionalAttr<DenseI64ArrayAttr>: $const_offsets,
OptionalAttr<DenseI64ArrayAttr>: $const_offsets,
OptionalAttr<UnitAttr>: $packed,
OptionalAttr<DenseI64ArrayAttr>: $transpose,
+ OptionalAttr<I32Attr>: $transpose_bit_width,
@@ -24,7 +24,7 @@ index 7f4d4f1381df..ebd4f1a3f66a 100644
@@ -1147,4 +1148,9 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
let hasCanonicalizer = 1;
}

+def XeGPU_CompileHintOp : XeGPU_Op<"compile_hint", []> {
+ let summary = "prevents the compiler from scheduling.";
+ let assemblyFormat = [{ attr-dict }];
@@ -68,27 +68,26 @@ index 33450f3fa229..528b9d55ee61 100644
+ kind == CachePolicy::STREAMING ||
kind == CachePolicy::WRITE_BACK || kind == CachePolicy::WRITE_THROUGH;
}

@@ -419,8 +420,8 @@ void LoadNdOp::build(OpBuilder &builder, OperationState &state, Type retType,
xegpu::CachePolicyAttr l3_hint) {

return build(builder, state, retType, tensorDesc, ValueRange(),
- DenseI64ArrayAttr(), packed, transpose, l1_hint, l2_hint,
- l3_hint);
+ DenseI64ArrayAttr(), packed, transpose, nullptr,
+ l1_hint, l2_hint, l3_hint);
}

LogicalResult LoadNdOp::verify() {
@@ -482,7 +483,7 @@ LogicalResult LoadNdOp::verify() {
mlir::emitWarning(getLoc()) << "Invalid transpose attr. It is ignored.";
}

- if (getPacked()) {
+ if (getPacked() || getTransposeBitWidth() == 32) {
if (tdescTy.getRank() == 2) {
const int axis = 0;
auto vnni_factor = valueShape.back();
--
--
2.34.1
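
For context, a rough sketch (not taken from this PR) of how the two additions in this patch might be written in XeGPU IR, assuming the printed form follows the declared assembly formats; the descriptor shape, attribute values, and SSA names are illustrative assumptions only:

// Hypothetical: a transposed load of f16 data at 32-bit granularity, using the
// new optional transpose_bit_width attribute to request the packed layout.
%v = xegpu.load_nd %tdesc <{transpose = array<i64: 1, 0>, transpose_bit_width = 32 : i32}>
       : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
// The new compile_hint op takes no operands and prints as a bare mnemonic.
xegpu.compile_hint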

6 changes: 3 additions & 3 deletions lib/Conversion/XeGPUToVC/LSCPatterns.cpp
@@ -1198,9 +1198,9 @@ class PrefetchPattern : public OpConversionPattern<PrefetchOp> {
// auto l2hint = op.getL2Hint();
auto l3hint = op.getL3Hint();

auto callOp = genPrefetchIntrinsicCall(rewriter, loc, simd_lanes, l1hint,
l3hint, elemTy, chunkSize, scope,
adaptor.getSource());
auto callOp =
genPrefetchIntrinsicCall(rewriter, loc, simd_lanes, l1hint, l3hint,
elemTy, chunkSize, scope, adaptor.getSource());

rewriter.replaceOp(op, callOp);
return success();
7 changes: 3 additions & 4 deletions lib/Conversion/XeTileToXeGPU/XeTileToXeGPU.cpp
@@ -491,10 +491,9 @@ class LoadOpPattern : public OpConversionPattern<xetile::LoadTileOp> {
auto packAttr = UnitAttr();
auto transAttr = DenseI64ArrayAttr();
auto bitWidthAttr = IntegerAttr();
auto ldOp = rewriter.create<xegpu::LoadNdOp>(loc, vecTy, adaptor.getTile(),
ValueRange(), DenseI64ArrayAttr(),
packAttr, transAttr,
bitWidthAttr, L1, L2, L3);
auto ldOp = rewriter.create<xegpu::LoadNdOp>(
loc, vecTy, adaptor.getTile(), ValueRange(), DenseI64ArrayAttr(),
packAttr, transAttr, bitWidthAttr, L1, L2, L3);

llvm::SmallVector<Value> results({ldOp.getResult()});
if (memSpace == xegpu::MemorySpace::SLM) {
12 changes: 6 additions & 6 deletions lib/Dialect/NDArray/Extensions/MeshShardingExtensions.cpp
@@ -100,7 +100,7 @@ static T getBaseShardDimOff(T shard, T numShards, T extend) {
}

static Sharding ShardingFromOption(const ShardingOption &option,
MLIRContext *ctxt) {
MLIRContext *ctxt) {
SmallVector<GridAxesAttr> res;
for (const auto &v : option.shardingArray) {
res.emplace_back(GridAxesAttr::get(ctxt, v));
@@ -141,7 +141,8 @@ getShardingWithShardedDimsOffs(Value ary, OffsetSizeAndStrideOpInterface op) {
ShapedType::isDynamicShape(strides))
return op->emitOpError("Dynamic offsets/sizes/strides are not supported");

auto arySharding = aryShardOp.getSharding().getDefiningOp<shard::ShardingOp>();
auto arySharding =
aryShardOp.getSharding().getDefiningOp<shard::ShardingOp>();
// currently no support for sharding dims sizes on input
if (!arySharding.getStaticShardedDimsOffsets().empty())
return op->emitOpError(
@@ -190,10 +191,9 @@ getShardingWithShardedDimsOffs(Value ary, OffsetSizeAndStrideOpInterface op) {
}
}

return Sharding::get(
arySharding.getGridAttr(), arySharding.getSplitAxes().getAxes(),
{}, // static halo
splitOffs, {}, {});
return Sharding::get(arySharding.getGridAttr(),
arySharding.getSplitAxes().getAxes(), {}, // static halo
splitOffs, {}, {});
}

static std::pair<Value, Value>
4 changes: 2 additions & 2 deletions lib/Dialect/XeTile/Transforms/Blocking.cpp
@@ -1042,8 +1042,8 @@ class RewriteTileReductionOp
for (auto v : intermediates) {
auto resultTy = VectorType::get({1, 1}, elemTy);
for (auto i = 0; i < blkSize[1]; i++) {
auto extractOp =
rewriter.create<vector::ExtractOp>(loc, v, rewriter.getIndexAttr(i));
auto extractOp = rewriter.create<vector::ExtractOp>(
loc, v, rewriter.getIndexAttr(i));
auto splatOp = rewriter.create<vector::SplatOp>(op.getLoc(), resultTy,
extractOp);
newOps.push_back(splatOp);
2 changes: 1 addition & 1 deletion lib/Target/CMakeLists.txt
@@ -1 +1 @@
add_subdirectory(LLVM)
add_subdirectory(LLVM)
24 changes: 12 additions & 12 deletions lib/Transforms/OptimizeTranspose.cpp
@@ -516,10 +516,10 @@ struct LoadNdOpPattern : public OpConversionPattern<xegpu::LoadNdOp> {
op.getType().getElementType());
for (auto source : tdescSources) {
auto loadNdOp = rewriter.create<xegpu::LoadNdOp>(
op.getLoc(), newLoadTy, source,
ValueRange(), DenseI64ArrayAttr(), op.getPackedAttr(),
op.getTransposeAttr(), op.getTransposeBitWidthAttr(),
op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr());
op.getLoc(), newLoadTy, source, ValueRange(), DenseI64ArrayAttr(),
op.getPackedAttr(), op.getTransposeAttr(),
op.getTransposeBitWidthAttr(), op.getL1HintAttr(), op.getL2HintAttr(),
op.getL3HintAttr());
loadNdOps.push_back(loadNdOp);
}
rewriter.replaceOpWithMultiple(op, {loadNdOps});
@@ -847,10 +847,10 @@ struct TransposeRewritePattern : public OpRewritePattern<vector::TransposeOp> {
rewriter.getIntegerType(32),
32); // need to do a 32 bit transpose to get the packed layout.
auto newLoadOp = rewriter.create<xegpu::LoadNdOp>(
loadOp.getLoc(), newVectorTy, loadOp.getTensorDesc(),
ValueRange(), DenseI64ArrayAttr(), packedAttr,
transposeAttr, transposeBitWidthAttr, loadOp.getL1HintAttr(),
loadOp.getL2HintAttr(), loadOp.getL3HintAttr());
loadOp.getLoc(), newVectorTy, loadOp.getTensorDesc(), ValueRange(),
DenseI64ArrayAttr(), packedAttr, transposeAttr, transposeBitWidthAttr,
loadOp.getL1HintAttr(), loadOp.getL2HintAttr(),
loadOp.getL3HintAttr());
// Replace the uses of the packed layout conversion with new load.
rewriter.replaceAllUsesWith(packedLayoutOps.back()->getResult(0),
newLoadOp.getResult());
@@ -872,10 +872,10 @@ struct TransposeRewritePattern : public OpRewritePattern<vector::TransposeOp> {
auto transposeAttr =
DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0});
auto newLoadOp = rewriter.create<xegpu::LoadNdOp>(
loadOp.getLoc(), newVectorTy, loadOp.getTensorDesc(),
ValueRange(), DenseI64ArrayAttr(), packedAttr,
transposeAttr, IntegerAttr(), loadOp.getL1HintAttr(),
loadOp.getL2HintAttr(), loadOp.getL3HintAttr());
loadOp.getLoc(), newVectorTy, loadOp.getTensorDesc(), ValueRange(),
DenseI64ArrayAttr(), packedAttr, transposeAttr, IntegerAttr(),
loadOp.getL1HintAttr(), loadOp.getL2HintAttr(),
loadOp.getL3HintAttr());
rewriter.replaceAllUsesWith(op.getResult(), newLoadOp.getResult());
}

18 changes: 8 additions & 10 deletions lib/Transforms/RemoveSingleElemVector.cpp
@@ -33,8 +33,7 @@ namespace {

struct VectorExtractOpConversion final
: public mlir::OpConversionPattern<mlir::vector::ExtractOp> {
using mlir::OpConversionPattern<
mlir::vector::ExtractOp>::OpConversionPattern;
using mlir::OpConversionPattern<mlir::vector::ExtractOp>::OpConversionPattern;

mlir::LogicalResult
matchAndRewrite(mlir::vector::ExtractOp extractOp, OpAdaptor adaptor,
@@ -84,8 +83,8 @@ struct VectorExtractStridedSliceConversion final

// We only convert ops extracting a single element from a 1D vector.
if (resType.getNumElements() == 1 && srcVector.getType().getRank() == 1) {
rewriter.replaceOpWithNewOp<mlir::vector::ExtractOp>(
extractOp, srcVector, offsets[0]);
rewriter.replaceOpWithNewOp<mlir::vector::ExtractOp>(extractOp, srcVector,
offsets[0]);
return mlir::success();
}
return mlir::failure();
@@ -122,9 +121,8 @@ struct VectorizableOpPattern final
};

template <typename OpTy>
static mlir::Value
createInsertOps(OpTy op, mlir::ValueRange operands,
mlir::ConversionPatternRewriter &rewriter) {
static mlir::Value createInsertOps(OpTy op, mlir::ValueRange operands,
mlir::ConversionPatternRewriter &rewriter) {
auto loc = op.getLoc();
auto type = op.getType();
auto elemType = type.getElementType();
@@ -139,8 +137,7 @@ createInsertOps(OpTy op, mlir::ValueRange operands,
mlir::Value newOp =
rewriter.create<mlir::arith::ConstantOp>(loc, type, denseAttr);
for (auto [i, opr] : llvm::enumerate(operands)) {
newOp =
rewriter.create<mlir::vector::InsertOp>(loc, opr, newOp, i);
newOp = rewriter.create<mlir::vector::InsertOp>(loc, opr, newOp, i);
}
return newOp;
}
@@ -267,7 +264,8 @@ struct RemoveSingleElemVectorPass final
return mlir::Value();

return builder
.create<mlir::vector::ExtractOp>(loc, inputs[0], builder.getIndexAttr(0))
.create<mlir::vector::ExtractOp>(loc, inputs[0],
builder.getIndexAttr(0))
.getResult();
};

Expand Down
25 changes: 12 additions & 13 deletions test/Conversion/XeTileToXeGPU/sg_scattered_ops.mlir
@@ -64,19 +64,18 @@ gpu.module @test {
//CHECK: %[[cast_1:.*]] = memref.cast %[[arg2]] : memref<*xf32> to memref<?xf32>
//CHECK: %[[block_id_x:.*]] = gpu.block_id x
//CHECK: %[[r0:.*]] = arith.muli %[[block_id_x]], %[[c1024]] : index
//CHECK: %[[r1:.*]] = vector.splat %[[r0]] : vector<1x16xindex>
//CHECK: %[[r2:.*]] = vector.shape_cast %[[r1]] : vector<1x16xindex> to vector<16xindex>
//CHECK: %[[r3:.*]] = xegpu.create_tdesc %[[cast]], %[[r2]] : memref<?xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
//CHECK: %[[r4:.*]] = xegpu.load %[[r3]], %[[cst]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
//CHECK: %[[r5:.*]] = vector.shape_cast %[[r4]] : vector<16xf32> to vector<1x16xf32>
//CHECK: %[[r6:.*]] = xegpu.create_tdesc %[[cast_0]], %[[r2]] : memref<?xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
//CHECK: %[[r7:.*]] = xegpu.load %[[r6]], %[[cst]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
//CHECK: %[[r8:.*]] = vector.shape_cast %[[r7]] : vector<16xf32> to vector<1x16xf32>
//CHECK: %[[r9:.*]] = arith.addf %[[r5]], %[[r8]] : vector<1x16xf32>
//CHECK: %[[r10:.*]] = xegpu.create_tdesc %[[cast_1]], %[[r2]] : memref<?xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
//CHECK: %[[r11:.*]] = vector.shape_cast %[[r9]] : vector<1x16xf32> to vector<16xf32>
//CHECK: xegpu.store %[[r11]], %[[r10]], %[[cst]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
//CHECK: xegpu.store %[[r11]], %[[r10]], %[[cst]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
//CHECK: %[[r1:.*]] = vector.broadcast %[[r0]] : index to vector<16xindex>
//CHECK: %[[r2:.*]] = xegpu.create_tdesc %[[cast]], %[[r1]] : memref<?xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
//CHECK: %[[r3:.*]] = xegpu.load %[[r2]], %[[cst]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
//CHECK: %[[r4:.*]] = vector.shape_cast %[[r3]] : vector<16xf32> to vector<1x16xf32>
//CHECK: %[[r5:.*]] = xegpu.create_tdesc %[[cast_0]], %[[r1]] : memref<?xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
//CHECK: %[[r6:.*]] = xegpu.load %[[r5]], %[[cst]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
//CHECK: %[[r7:.*]] = vector.shape_cast %[[r6]] : vector<16xf32> to vector<1x16xf32>
//CHECK: %[[r8:.*]] = arith.addf %[[r4]], %[[r7]] : vector<1x16xf32>
//CHECK: %[[r9:.*]] = xegpu.create_tdesc %[[cast_1]], %[[r1]] : memref<?xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
//CHECK: %[[r10:.*]] = vector.shape_cast %[[r8]] : vector<1x16xf32> to vector<16xf32>
//CHECK: xegpu.store %[[r10]], %[[r9]], %[[cst]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
//CHECK: xegpu.store %[[r10]], %[[r9]], %[[cst]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
%c1024 = arith.constant 1024 : index
%cst = arith.constant dense<true> : vector<1x32xi1>
%cast = memref.cast %arg0 : memref<*xf32> to memref<?xf32>
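
The rewritten CHECK lines above replace a vector.splat producing vector<1x16xindex> plus a shape_cast with a single 1-D vector.broadcast of the scalar offset. For reference, a minimal sketch of the two forms (scalar and vector types are illustrative):

// previously expected form
%a = vector.splat %idx : vector<16xindex>
// form now expected by the CHECK lines
%b = vector.broadcast %idx : index to vector<16xindex>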
2 changes: 1 addition & 1 deletion test/Dialect/NDArray/Extensions/lit.local.cfg
@@ -2,4 +2,4 @@
local_excludes = ['mesh-spmdization.mlir']

if(not config.imex_enable_excluded_tests):
config.excludes.update(local_excludes)
config.excludes.update(local_excludes)