Skip to content

Commit ab01f17

Browse files
authored
[Blocking] Emulate the unpack and pack on vectors using insert_strided_slice and extract_strided_slice (#1023)
1 parent 71e01d3 commit ab01f17

File tree

15 files changed

+705
-1049
lines changed

15 files changed

+705
-1049
lines changed

include/imex/Utils/XeCommon.h

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -84,15 +84,6 @@ applyVnniTransform(mlir::OpBuilder &builder,
8484
// 16, 32, and 64 are only available if simdLanes == 1.
8585
llvm::SmallVector<int> getSupportedChunkSizes(int simdlanes);
8686

87-
using PackFuncTy = std::function<mlir::TypedValue<mlir::VectorType>(
88-
mlir::Value, mlir::Value, mlir::Location, mlir::OpBuilder &)>;
89-
90-
// A wrapper function to merge small vectors into a big one. It takes a
91-
// range of mlir::Value objects with mlir::VectorType, and merge them
92-
// into a big vector using the provided transformation function.
93-
mlir::Value packVectorsWith(mlir::ValueRange ins, PackFuncTy op,
94-
mlir::Location loc, mlir::OpBuilder &builder);
95-
9687
// Combine vectors vertically while keeping the logical data layout.
9788
// As an example, given two vectors (2x4xf16) p and q, it will merge
9889
// them in to a 4x4xf16 vector.
@@ -105,18 +96,6 @@ mlir::TypedValue<mlir::VectorType> stack(mlir::Value vecUp, mlir::Value vecDown,
10596
mlir::Location loc,
10697
mlir::OpBuilder &builder);
10798

108-
// merge vectors horizontally while keep the logical data layout.
109-
// 1 2 3 4 + 10 11 12 = 1 2 3 4 10 11 12
110-
// 5 6 7 8 13 14 15 5 6 7 8 13 14 15
111-
// since there is no direct op in mlir exists, we will
112-
// using ShapeCast and Shuffle to mimic it. It comes with
113-
// cost of complex shuffle masks. the mask for the above one
114-
// will be like this: 0 1 2 3 8 9 10
115-
// 4 5 6 7 11 12 13
116-
mlir::TypedValue<mlir::VectorType> concat(mlir::Value lhs, mlir::Value rhs,
117-
mlir::Location loc,
118-
mlir::OpBuilder &builder);
119-
12099
// It checks each GPUFuncOp in the module to see
121100
// whether they have arguments and outputs with
122101
// xetile.TileType. They are currently not supported yet.

lib/Dialect/XeTile/Transforms/Blocking.cpp

Lines changed: 95 additions & 268 deletions
Large diffs are not rendered by default.

lib/Transforms/VectorLinearize.cpp

Lines changed: 42 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,17 @@ struct VectorExtractStridedSliceConversion final
322322
}
323323
};
324324

325+
// clang-format off
326+
// linearize InsertStridedSliceOp by extracting rows from the source vector
327+
// using extract_strided_slice and inserting them into the destination vector
328+
// using insert_strided_slice. For example:
329+
// vector.insert_strided_slice %s, %d {offsets=[0, 0]}: vector<2x4xf32> into vector<4x4xf32>
330+
// will be lowered into (both s and d are linearized to 1D):
331+
// %0 = vector.extract_strided_slice %s {offsets=[0], sizes=[4], strides=[1]} : vector<4xf32> from vector<8xf32>
332+
// %1 = vector.insert_strided_slice %0, %d {offsets=[0], strides=[1]} : vector<4xf32> into vector<16xf32>
333+
// %2 = vector.extract_strided_slice %s {offsets=[4], sizes=[4], strides=[1]} : vector<4xf32> from vector<8xf32>
334+
// %3 = vector.insert_strided_slice %2, %1 {offsets=[4], strides=[1]} : vector<4xf32> into vector<16xf32>
335+
// clang-format on
325336
struct VectorInsertStridedSliceConversion final
326337
: public mlir::OpConversionPattern<mlir::vector::InsertStridedSliceOp> {
327338
using mlir::OpConversionPattern<
@@ -330,31 +341,47 @@ struct VectorInsertStridedSliceConversion final
330341
mlir::LogicalResult
331342
matchAndRewrite(mlir::vector::InsertStridedSliceOp op, OpAdaptor adaptor,
332343
mlir::ConversionPatternRewriter &rewriter) const override {
344+
auto loc = op.getLoc();
333345
auto srcTy = op.getSourceVectorType();
334-
auto destTy = op.getDestVectorType();
346+
auto dstTy = op.getDestVectorType();
335347

336348
if (op.hasNonUnitStrides()) {
337349
return rewriter.notifyMatchFailure(
338-
op, "InsertStridedSliceOp only supports unit strides.");
350+
op, "InsertStridedSliceOp linearization only supports unit strides.");
339351
}
340352

341-
if (llvm::any_of(srcTy.getShape().drop_back(),
342-
[](int64_t dim) { return dim != 1; })) {
343-
return rewriter.notifyMatchFailure(op,
344-
"Only supports vectors with leading "
345-
"dims (except the last dim) as 1s.");
353+
if (srcTy.getRank() != 2) {
354+
return rewriter.notifyMatchFailure(
355+
op, "InsertStridedSliceOp linearization only supports 2D source.");
346356
}
347357

348-
auto strides = destTy.getShape().drop_front().vec();
349-
strides.push_back(1);
358+
if (!srcTy.hasStaticShape() || !dstTy.hasStaticShape()) {
359+
return rewriter.notifyMatchFailure(
360+
op, "InsertStridedSliceOp linearization only supports static shapes.");
361+
}
362+
363+
auto dstShape = dstTy.getShape();
364+
auto dstStrides = dstShape.drop_front().vec();
365+
dstStrides.push_back(1);
350366
int64_t linearizedOffset = 0;
351-
for (auto [off, stride] : llvm::zip_equal(op.getOffsets(), strides)) {
367+
for (auto [off, stride] : llvm::zip_equal(op.getOffsets(), dstStrides)) {
352368
linearizedOffset += mlir::getConstantIntValue(off).value() * stride;
353369
}
354370

355-
rewriter.replaceOpWithNewOp<mlir::vector::InsertStridedSliceOp>(
356-
op, adaptor.getSource(), adaptor.getDest(), linearizedOffset, 1);
371+
// extracts a row from source, and insert it into the destination
372+
auto srcShape = srcTy.getShape();
373+
mlir::Value dstValue = adaptor.getDest();
374+
for (auto i = 0; i < srcShape[0]; i++) {
375+
auto srcOffset = i * srcShape[1];
376+
auto value = rewriter.create<mlir::vector::ExtractStridedSliceOp>(
377+
loc, adaptor.getSource(), srcOffset, srcShape[1], 1);
378+
379+
auto dstOffset = linearizedOffset + i * dstShape.back();
380+
dstValue = rewriter.create<mlir::vector::InsertStridedSliceOp>(
381+
loc, value, dstValue, dstOffset, 1);
382+
}
357383

384+
rewriter.replaceOp(op, dstValue);
358385
return mlir::success();
359386
}
360387
};
@@ -672,9 +699,9 @@ struct VectorLinearizePass final
672699
target.addDynamicallyLegalOp<mlir::vector::InsertStridedSliceOp>(
673700
[&](mlir::vector::InsertStridedSliceOp op) {
674701
auto srcTy = op.getSourceVectorType();
675-
if (!op.hasNonUnitStrides() && srcTy.getRank() != 1 &&
676-
llvm::all_of(srcTy.getShape().drop_back(),
677-
[](int64_t dim) { return dim == 1; }))
702+
auto dstTy = op.getDestVectorType();
703+
if (!op.hasNonUnitStrides() && srcTy.getRank() == 2 &&
704+
srcTy.hasStaticShape() && dstTy.hasStaticShape())
678705
return false;
679706
return true;
680707
});

lib/Utils/XeCommon.cpp

Lines changed: 0 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -318,94 +318,6 @@ mlir::TypedValue<mlir::VectorType> stack(mlir::Value vecUp, mlir::Value vecDown,
318318
return op;
319319
}
320320

321-
// generate linearized shuffle mask for concat.
322-
static llvm::SmallVector<int64_t>
323-
getShuffleMask(llvm::ArrayRef<int64_t> shape1, llvm::ArrayRef<int64_t> shape2) {
324-
assert(shape1.size() == shape2.size() && shape1.size() <= 2 &&
325-
"only 1D/2D shape are supported.");
326-
assert(shape1.drop_back() == shape2.drop_back() &&
327-
"the row dim of the shapes should match.");
328-
int64_t size1 = std::accumulate(shape1.begin(), shape1.end(), 1,
329-
std::multiplies<int64_t>());
330-
int64_t size2 = std::accumulate(shape2.begin(), shape2.end(), 1,
331-
std::multiplies<int64_t>());
332-
llvm::SmallVector<int64_t> mask(size1 + size2);
333-
auto rows = shape1.size() == 1 ? 1 : shape1[0];
334-
auto cols1 = shape1.size() == 1 ? shape1[0] : shape1[1];
335-
auto cols2 = shape2.size() == 1 ? shape2[0] : shape2[1];
336-
for (int64_t i = 0; i < rows; i++) {
337-
int64_t s = i * (cols1 + cols2);
338-
int64_t m = s + cols1;
339-
int64_t e = m + cols2;
340-
int64_t v1 = i * cols1;
341-
int64_t v2 = size1 + i * cols2;
342-
std::iota(mask.begin() + s, mask.begin() + m, v1);
343-
std::iota(mask.begin() + m, mask.begin() + e, v2);
344-
}
345-
return mask;
346-
}
347-
348-
mlir::TypedValue<mlir::VectorType> concat(mlir::Value lhs, mlir::Value rhs,
349-
mlir::Location loc,
350-
mlir::OpBuilder &builder) {
351-
auto lhsTy = llvm::cast<mlir::VectorType>(lhs.getType());
352-
auto rhsTy = llvm::cast<mlir::VectorType>(rhs.getType());
353-
354-
assert(lhsTy.getShape()[0] == lhsTy.getShape()[0] &&
355-
"Operands of concat() do not have the same number of rows.");
356-
assert(lhsTy.getRank() <= 2 && rhsTy.getRank() == lhsTy.getRank() &&
357-
"Currently concat only works on 1D/2D vector.");
358-
359-
auto elemTy = lhsTy.getElementType();
360-
auto leftSize = lhsTy.getNumElements();
361-
auto leftShape = lhsTy.getShape();
362-
auto leftFlatTy = mlir::VectorType::get({lhsTy.getNumElements()}, elemTy);
363-
364-
auto rightSize = rhsTy.getNumElements();
365-
auto rightShape = rhsTy.getShape();
366-
auto rightFlatTy = mlir::VectorType::get({rhsTy.getNumElements()}, elemTy);
367-
368-
auto newShape = lhsTy.getRank() == 1
369-
? llvm::SmallVector<int64_t>({leftSize + rightSize})
370-
: llvm::SmallVector<int64_t>(
371-
{leftShape[0], leftShape[1] + rightShape[1]});
372-
auto castLeft =
373-
builder.create<mlir::vector::ShapeCastOp>(loc, leftFlatTy, lhs);
374-
auto castRight =
375-
builder.create<mlir::vector::ShapeCastOp>(loc, rightFlatTy, rhs);
376-
auto mask = getShuffleMask(leftShape, rightShape);
377-
auto shuffleOp =
378-
builder.create<mlir::vector::ShuffleOp>(loc, castLeft, castRight, mask);
379-
auto targetTy = mlir::VectorType::get(newShape, elemTy);
380-
auto newOp =
381-
builder.create<mlir::vector::ShapeCastOp>(loc, targetTy, shuffleOp);
382-
return newOp;
383-
}
384-
385-
// A wrapper function to merge small vectors into a big one. It takes a
386-
// range of mlir::Value objects with mlir::VectorType, and merge them
387-
// into a big vector using the provided transformation function.
388-
mlir::Value packVectorsWith(mlir::ValueRange ins, PackFuncTy op,
389-
mlir::Location loc, mlir::OpBuilder &builder) {
390-
llvm::SmallVector<mlir::Value> shuffleOps(ins.begin(), ins.end());
391-
while (shuffleOps.size() > 1) {
392-
auto curr = shuffleOps;
393-
shuffleOps.clear();
394-
size_t currPairStartIdx{0};
395-
while (currPairStartIdx < curr.size() - 1) {
396-
size_t leftIdx{currPairStartIdx++};
397-
size_t rightIdx{currPairStartIdx++};
398-
auto newOp = op(curr[leftIdx], curr[rightIdx], loc, builder);
399-
shuffleOps.push_back(newOp);
400-
}
401-
if (currPairStartIdx < curr.size()) {
402-
assert(currPairStartIdx == curr.size() - 1);
403-
shuffleOps.push_back(curr[curr.size() - 1]);
404-
}
405-
}
406-
return shuffleOps[0];
407-
}
408-
409321
/// Checks if the given `type` is a 1-D vector type that requires VectorAnyINTEL
410322
/// capability. In other words, the vector size is not supported by SPIR-V.
411323
/// SPIR-V only supports 2, 3, 4, 8, 16 elements (8 and 16 with Vector16

test/Conversion/XeTileToXeGPU/gemm_preop.mlir

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,14 +62,15 @@ module attributes {gpu.container_module} {
6262
%27 = xetile.load_tile %arg5 {padding = 0.000000e+00 : f32} : !xetile.tile<32x32xf16> -> vector<32x32xf16>
6363
%28 = xetile.load_tile %arg6 {padding = 0.000000e+00 : f32} : !xetile.tile<32x32xf16> -> vector<32x32xf16>
6464
xegpu.compile_hint
65-
//CHECK-COUNT-4: {{.*}} = vector.extract_strided_slice {{.*}} {offsets = [{{.*}}], sizes = [16, 16], strides = [1, 1]} : vector<32x16xf16> to vector<16x16xf16>
65+
//CHECK-COUNT-8: {{.*}} = vector.extract_strided_slice {{.*}} {offsets = [{{.*}}], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16>
6666
//CHECK-COUNT-8: {{.*}} = arith.addf {{.*}}, {{.*}} : vector<8x16xf16>
6767
%29 = arith.addf %27, %27 : vector<32x32xf16>
6868
xegpu.compile_hint
6969
%30 = xetile.update_tile_offset %arg5, [%c0, %c32] : !xetile.tile<32x32xf16>
7070
%31 = xetile.update_tile_offset %arg6, [%c0, %c32] : !xetile.tile<32x32xf16>
7171
xegpu.compile_hint
7272

73+
//CHECK-COUNT-4: {{.*}} = vector.extract_strided_slice {{.*}} {offsets = [{{.*}}], sizes = [16, 16], strides = [1, 1]} : vector<32x16xf16> to vector<16x16xf16>
7374
// CHECK-COUNT-16: {{.*}} = xegpu.dpas {{.*}}, {{.*}}, {{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
7475
%32 = xetile.tile_mma %29, %28, %arg7 : vector<32x32xf16>, vector<32x32xf16>, vector<32x32xf32> -> vector<32x32xf32>
7576
xegpu.compile_hint

test/Conversion/XeTileToXeGPU/sg_softmax.mlir

Lines changed: 6 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,7 @@ gpu.module @test_kernel {
1818
//CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr<array_length = 1 : i64, boundary_check = true>> -> vector<32x32xf16>
1919
%2 = xetile.load_tile %1: !xetile.tile<32x64xf16> -> vector<32x64xf16>
2020

21-
//CHECK-COUNT-4: {{.*}} = vector.extract_strided_slice %[[r2]] {offsets = [{{.*}}], sizes = [8, 32], strides = [1, 1]} : vector<32x32xf16> to vector<8x32xf16>
22-
//CHECK-COUNT-4: {{.*}} = vector.extract_strided_slice %[[r3]] {offsets = [{{.*}}], sizes = [8, 32], strides = [1, 1]} : vector<32x32xf16> to vector<8x32xf16>
21+
//CHECK-COUNT-8: {{.*}} = vector.extract_strided_slice %{{.*}} {offsets = [{{.*}}], sizes = [8, 32], strides = [1, 1]} : vector<32x32xf16> to vector<8x32xf16>
2322
//CHECK-COUNT-8: {{.*}} = math.exp %{{.*}} : vector<8x32xf16>
2423
%3 = math.exp %2: vector<32x64xf16>
2524
//CHECK-COUNT-62: arith.addf {{.*}}, {{.*}} : vector<1x32xf16>
@@ -42,8 +41,7 @@ gpu.module @test_kernel {
4241
//CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr<array_length = 1 : i64, boundary_check = true>> -> vector<32x32xf16>
4342
//CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr<array_length = 1 : i64, boundary_check = true>> -> vector<32x32xf16>
4443
%2 = xetile.load_tile %1: !xetile.tile<32x64xf16> -> vector<32x64xf16>
45-
//CHECK-COUNT-4: {{.*}} = vector.extract_strided_slice %[[r2]] {offsets = [{{.*}}], sizes = [8, 32], strides = [1, 1]} : vector<32x32xf16> to vector<8x32xf16>
46-
//CHECK-COUNT-4: {{.*}} = vector.extract_strided_slice %[[r3]] {offsets = [{{.*}}], sizes = [8, 32], strides = [1, 1]} : vector<32x32xf16> to vector<8x32xf16>
44+
//CHECK-COUNT-8: {{.*}} = vector.extract_strided_slice %{{.*}} {offsets = [{{.*}}], sizes = [8, 32], strides = [1, 1]} : vector<32x32xf16> to vector<8x32xf16>
4745
//CHECK-COUNT-8: {{.*}} = math.exp %{{.*}} : vector<8x32xf16>
4846
%3 = math.exp %2: vector<32x64xf16>
4947
//CHECK: {{.*}} = arith.addf {{.*}}, {{.*}} : vector<1x32xf16>
@@ -203,22 +201,13 @@ gpu.module @test_kernel {
203201
//CHECK: {{.*}} = vector.shuffle {{.*}}, {{.*}} [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62] : vector<32xf16>, vector<32xf16>
204202
//CHECK: {{.*}} = vector.shuffle {{.*}}, {{.*}} [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63] : vector<32xf16>, vector<32xf16>
205203
//CHECK: {{.*}} = arith.addf {{.*}}, {{.*}} : vector<32xf16>
206-
//CHECK-COUNT-32: {{.*}} = vector.extractelement {{.*}}[{{.*}} : i32] : vector<32xf16>
204+
//CHECK-COUNT-32: {{.*}} = vector.extractelement {{.*}}[{{.*}} : index] : vector<32xf16>
207205
//CHECK-COUNT-32: {{.*}} = vector.splat {{.*}} : vector<1x32xf16>
208206
%4 = xetile.reduction <add>, %3 [1]: vector<32x64xf16> -> vector<32x1xf16>
209207

210-
//CHECK-COUNT-4: %{{.*}} = vector.shuffle %{{.*}}, %{{.*}} [0, 1] : vector<1x32xf16>, vector<1x32xf16>
211-
//CHECK-COUNT-2: %{{.*}} = vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3] : vector<2x32xf16>, vector<2x32xf16>
212-
//CHECK: %{{.*}} = vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3, 4, 5, 6, 7] : vector<4x32xf16>, vector<4x32xf16>
213-
//CHECK-COUNT-4: %{{.*}} = vector.shuffle %{{.*}}, %{{.*}} [0, 1] : vector<1x32xf16>, vector<1x32xf16>
214-
//CHECK-COUNT-2: %{{.*}} = vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3] : vector<2x32xf16>, vector<2x32xf16>
215-
//CHECK: %{{.*}} = vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3, 4, 5, 6, 7] : vector<4x32xf16>, vector<4x32xf16>
216-
//CHECK-COUNT-4: %{{.*}} = vector.shuffle %{{.*}}, %{{.*}} [0, 1] : vector<1x32xf16>, vector<1x32xf16>
217-
//CHECK-COUNT-2: %{{.*}} = vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3] : vector<2x32xf16>, vector<2x32xf16>
218-
//CHECK: %{{.*}} = vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3, 4, 5, 6, 7] : vector<4x32xf16>, vector<4x32xf16>
219-
//CHECK-COUNT-4: %{{.*}} = vector.shuffle %{{.*}}, %{{.*}} [0, 1] : vector<1x32xf16>, vector<1x32xf16>
220-
//CHECK-COUNT-2: %{{.*}} = vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3] : vector<2x32xf16>, vector<2x32xf16>
221-
//CHECK: %{{.*}} = vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3, 4, 5, 6, 7] : vector<4x32xf16>, vector<4x32xf16>
208+
//CHECK-COUNT-64: %{{.*}} = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [{{.*}}], strides = [1, 1]} : vector<1x32xf16> into vector<32x64xf16>
209+
//CHECK-COUNT-8: %{{.*}} = vector.extract_strided_slice %{{.*}} {offsets = [{{.*}}], sizes = [8, 32], strides = [1, 1]} : vector<32x64xf16> to vector<8x32xf16>
210+
222211
%5 = xetile.broadcast %4 [1]: vector<32x1xf16> -> vector<32x64xf16>
223212
// CHECK-COUNT-8: {{.*}} = arith.divf {{.*}}, {{.*}} : vector<8x32xf16>
224213
%6 = arith.divf %3, %5: vector<32x64xf16>

test/Conversion/XeTileToXeGPU/sg_store_tile.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// RUN: imex-opt --split-input-file --xetile-init-duplicate --xetile-blocking \
2-
// RUN: --cse --convert-xetile-to-xegpu --cse %s -verify-diagnostics -o -| FileCheck %s
2+
// RUN: --cse --convert-xetile-to-xegpu --cse --canonicalize %s -verify-diagnostics -o -| FileCheck %s
33

44
gpu.module @test_kernel {
55
//CHECK: gpu.func @sg_tiled_store(%[[arg0:.*]]: memref<1024x1024xf32>) {

0 commit comments

Comments
 (0)