intel
diff --git a/include/gc/Transforms/Passes.td b/include/gc/Transforms/Passes.td
Lines changed: 18 additions & 9 deletions
diff --git a/lib/gc/Transforms/GPU/GpuLoopTiling.cpp b/lib/gc/Transforms/GPU/GpuLoopTiling.cpp
Lines changed: 16 additions & 21 deletions
diff --git a/lib/gc/Transforms/GPU/GpuTilingAndFusion.cpp b/lib/gc/Transforms/GPU/GpuTilingAndFusion.cpp
Lines changed: 111 additions & 66 deletions
@@ -125,28 +125,37 @@ def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
   let summary = "GPU tiling and fusion path.";
   let description = [{
     This path tiles linalg operations and wraps into foreach loops.
-    The tiles calculation is based on the Execution Unit cache size and the number of threads per EU.
+    The tiles calculation is based on the GPU device properties, retrieved from the DLTI attributes.
   }];
   let options = [
-    Option<"euMem", "eu-mem", "size_t",
+    Option<"numEus", "num-eus", "size_t",
+           /*default=*/"448",
+           "Number of Execution Units.">,
+    Option<"numEusPerSlice", "num-eus-per-slice", "size_t",
+           /*default=*/"8",
+           "Number of Execution Units per slice.">,
+    Option<"numThreadsPerEu", "num-threads-per-eu", "size_t",
+           /*default=*/"8",
+           "Number of threads per Execution Unit.">,
+    Option<"cacheSize", "cache-size", "size_t",
            /*default=*/"131072",
            "Execution Unit cache size.">,
-    Option<"euThreads", "eu-threads", "size_t",
-           /*default=*/"8",
-           "Number of threads per EU.">
+    Option<"vectorWidth", "vector-width", "size_t",
+           /*default=*/"512",
+           "The maximum width of EU's vector registers.">
     ];
 }
 
 def GpuLoopTiling : Pass<"gpu-loop-tiling", "func::FuncOp"> {
   let summary = "Create nested parallel loops to be mapped to GPU.";
   let description = [{
     This path tiles the loops created by the GpuTilingAndFusion pass and converted to parallel loops.
-    Each tile of the outer loop is divided by the number of threads per EU.
+    The tiles calculation is based on the workgroup size, retrieved from the DLTI attributes.
   }];
   let options = [
-    Option<"euThreads", "eu-threads", "size_t",
-           /*default=*/"8",
-           "Number of threads per Execution Unit.">
+    Option<"workGroupSize", "work-group-size", "size_t",
+           /*default=*/"64",
+           "The maximum workgroup size.">
     ];
 }
 #endif // GC_USE_IMEX
 
@@ -22,7 +22,7 @@
 #include "gc/Utils/Log.h"
 
 using namespace mlir;
-// using namespace mlir::gc::gpu;
+using namespace mlir::gc;
 
 namespace mlir::gc {
 #define GEN_PASS_DECL_GPULOOPTILING
@@ -41,34 +41,29 @@ struct GpuLoopTiling final : GpuPass<GpuLoopTiling>,
 
   void runOnOperation() override {
     IRRewriter rewriter(&getContext());
-    auto euThreads = static_cast<double>(getEuThreads(rewriter));
+    auto wgSize = getWorkGroupSize(rewriter);
     getOperation().walk<WalkOrder::PreOrder>([&](scf::ParallelOp loop) {
       if (!loop->getParentOfType<scf::ParallelOp>()) {
-        tile(loop, euThreads);
+        SmallVector<int64_t> tiles;
+        auto steps = loop.getStep();
+        tiles.reserve(steps.size());
+
+        for (auto step : steps) {
+          if (auto v = getConstIdxValue(step)) {
+            tiles.push_back(v);
+          } else {
+            tiles.push_back(32);
+          }
+        }
+
+        adjustTiles(wgSize, tiles);
+        tileParallelLoop(loop, tiles, false);
       }
       return WalkResult::skip();
     });
     if (failed(simplifyRegions(rewriter, getOperation()->getRegions()))) {
       gcLogD("Failed to simplify regions");
     }
   }
-
-private:
-  static void tile(scf::ParallelOp loop, double euThreads) {
-    SmallVector<int64_t> tileSizes;
-    auto steps = loop.getStep();
-    tileSizes.reserve(steps.size());
-
-    for (auto step : steps) {
-      if (auto v = getConstIdxValue(step)) {
-        tileSizes.push_back(static_cast<int64_t>(
-            std::ceil(static_cast<double>(v) / euThreads)));
-      } else {
-        tileSizes.push_back(32);
-      }
-    }
-
-    tileParallelLoop(loop, tileSizes, false);
-  }
 };
 } // namespace
@@ -24,7 +24,7 @@
 #include "gc/Utils/Log.h"
 
 using namespace mlir;
-// using namespace mlir::gc::gpu;
+using namespace mlir::gc;
 
 namespace mlir::gc {
 #define GEN_PASS_DECL_GPUTILINGANDFUSION
@@ -39,33 +39,39 @@ struct GpuTilingAndFusion final
       gc::impl::GpuTilingAndFusionBase<GpuTilingAndFusion> {
   friend struct GpuPass;
   explicit GpuTilingAndFusion()
-      : GpuTilingAndFusion(gc::GpuTilingAndFusionOptions{}) {}
-  explicit GpuTilingAndFusion(const gc::GpuTilingAndFusionOptions &opts)
+      : GpuTilingAndFusion(GpuTilingAndFusionOptions{}) {}
+  explicit GpuTilingAndFusion(const GpuTilingAndFusionOptions &opts)
       : GpuPass(), GpuTilingAndFusionBase(opts) {}
 
   void runOnOperation() override {
     IRRewriter rewriter(&getContext());
     scf::SCFTileAndFuseOptions opts;
+    opts.setFusionControlFn(
+        [&](tensor::ExtractSliceOp, OpResult originalProducer, bool)
+            -> std::optional<scf::SCFTileAndFuseOptions::ControlFnResult> {
+          Operation *op = originalProducer.getOwner();
+          if (!op) {
+            return std::nullopt;
+          }
+          if (auto linalgOp = dyn_cast<linalg::LinalgOp>(op)) {
+            if (!linalgOp.hasOnlyProjectedPermutations()) {
+              return std::nullopt;
+            }
+          }
+          return scf::SCFTileAndFuseOptions::ControlFnResult{};
+        });
     opts.tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
-    // The outer loop is converted to a GPU kernel and the tile sizes are mapped
-    // to the grid sizes.
+    auto numEus = getNumEus(rewriter);
+    auto numEusPerSlice = getNumEusPerSlice(rewriter);
+    auto numThreadsPerEu = getNumThreadsPerEu(rewriter);
+    auto cacheSize = getCacheSize(rewriter);
+    auto vectorWidth = getVectorWidth(rewriter);
+    auto cachePerThread =
+        std::max(cacheSize / numEusPerSlice / numThreadsPerEu, vectorWidth);
+    // The inner loop is converted to a GPU kernel and the tile sizes are mapped
+    // to the block sizes.
     opts.tilingOptions.setTileSizeComputationFunction(
-        // The tile sizes calculation is based on the following equation:
-        // n * TS0 * TS1 * ... * TSn = euMem
-        // where:
-        // n - an average number of bytes, processed by each iteration
-        // TS0, TS1, ... TSn - the tile sizes for each loop correspondingly
-        // euMem - the physical memory (cache) size of the GPU execution unit
-        //
-        // To calculate the tile size TS, we need to divide the total loop size
-        // S by the ratio r:
-        //
-        // n * (S0/r0) * (S1/r1) * ... * (Sn/rn) = euMem
-        // r0 * r1 * ... * rn = (n * S0 * S1 * ... * Sn) / euMem
-        // If all sizes are equal, then S0 = ... = Sn = S, r0 = ... = rn = r:
-        // r^n = (n * S^n) / euMem
-        // r = (n * S^n / euMem)^(1/n)
-        [euMem = getEuMem(rewriter), euThreads = getEuThreads(rewriter)](
+        [cachePerThread, vectorWidth, numThreads = numEus * numThreadsPerEu](
             OpBuilder &builder, Operation *op) -> SmallVector<OpFoldResult> {
           auto ti = dyn_cast<TilingInterface>(op);
           if (!ti) {
@@ -76,44 +82,49 @@ struct GpuTilingAndFusion final
           auto itDomains = ti.getIterationDomain(builder);
           assert(itTypes.size() == itDomains.size());
 
-          // TODO: Add a parameter to the options?
-          size_t totalSize = calcOperandsSize(op) * euThreads;
-          unsigned loopCount = 0;
-
+          SmallVector<int64_t> tiles;
+          int64_t numIterations = 1;
           for (auto [t, r] : zip(itTypes, itDomains)) {
             if (t == utils::IteratorType::parallel) {
               if (auto v = getConstantIntValue(r.size)) {
-                loopCount++;
-                totalSize *= *v;
+                numIterations *= *v;
+                tiles.emplace_back(*v);
               } else {
-                return calcDynamicSizes(builder, ti, euMem, euThreads);
+                return calcDynamicSizes(builder, ti, cachePerThread);
               }
             }
           }
 
-          if (loopCount == 0) {
+          if (tiles.empty()) {
             return {};
           }
 
-          // TODO: In case of different sizes, calculate the ratio for each loop
-          double ratio = std::pow(static_cast<double>(totalSize) /
-                                      static_cast<double>(euMem),
-                                  1.0 / loopCount);
-          ratio = std::max(1.0, ratio);
-          SmallVector<OpFoldResult> tiles;
-          tiles.reserve(itDomains.size());
+          auto elementSize = getElementSize(op);
+          auto sizePerThread = numIterations / numThreads * elementSize;
+          auto tilesSize = std::max(sizePerThread, cachePerThread);
+          tilesSize = std::max(tilesSize / elementSize, 64L);
+
+          // If the operation could be lowered to XeGPU, make the tiles
+          // proportional to the vector width.
+          if (canLowerToXeGPU(op)) {
+            tilesSize = std::max(tilesSize / vectorWidth, 1L) * vectorWidth;
+          }
+
+          adjustTiles(tilesSize, tiles);
+
+          unsigned counter = 0;
+          SmallVector<OpFoldResult> result;
+          result.reserve(itDomains.size());
 
           for (auto [t, r] : zip(itTypes, itDomains)) {
             if (t != utils::IteratorType::parallel) {
-              tiles.emplace_back(builder.getIndexAttr(1));
-            } else if (auto v = getConstantIntValue(r.size)) {
-              tiles.emplace_back(ceil(builder, *v, ratio));
+              result.emplace_back(builder.getIndexAttr(1));
             } else {
-              abort(); // Must never get here
+              result.emplace_back(builder.getIndexAttr(tiles[counter++]));
             }
           }
 
-          return tiles;
+          return result;
         });
 
     auto fn = getOperation();
@@ -174,7 +185,8 @@ struct GpuTilingAndFusion final
   static std::optional<TilingInterface> findTi(Operation *op) {
     std::optional<TilingInterface> last;
     op->walk<WalkOrder::PreOrder>([&](linalg::LinalgOp linalgOp) {
-      if (!linalgOp->getParentOfType<scf::ForallOp>()) {
+      if (linalgOp.hasOnlyProjectedPermutations() &&
+          !linalgOp->getParentOfType<scf::ForallOp>()) {
         if (auto ti = dyn_cast<TilingInterface>(linalgOp.getOperation())) {
           last = ti;
         }
@@ -184,17 +196,16 @@ struct GpuTilingAndFusion final
     return last;
   }
 
-  static SmallVector<OpFoldResult> calcDynamicSizes(OpBuilder &builder,
-                                                    TilingInterface ti,
-                                                    size_t euMem,
-                                                    size_t euThreads) {
+  // TODO: Use the adjustTiles() function from MLIR.
+  static SmallVector<OpFoldResult>
+  calcDynamicSizes(OpBuilder &builder, TilingInterface ti, int64_t cacheSize) {
     auto itTypes = ti.getLoopIteratorTypes();
     auto itDomains = ti.getIterationDomain(builder);
     assert(itTypes.size() == itDomains.size());
 
     auto loc = ti.getLoc();
     Value dynamicSize;
-    size_t staticSize = calcOperandsSize(ti.getOperation()) * euThreads;
+    int64_t staticSize = getElementSize(ti.getOperation());
     unsigned loopCount = 0;
 
     for (auto [t, r] : zip(itTypes, itDomains)) {
@@ -225,7 +236,7 @@ struct GpuTilingAndFusion final
                                            dynamicSize));
 
     auto memSize = builder.create<arith::ConstantFloatOp>(
-        loc, APFloat(static_cast<double>(euMem)), builder.getF64Type());
+        loc, APFloat(static_cast<double>(cacheSize)), builder.getF64Type());
     auto pow = builder.create<arith::ConstantFloatOp>(
         loc, APFloat(1.0 / loopCount), builder.getF64Type());
     Value ratio = builder.create<math::PowFOp>(
@@ -265,29 +276,63 @@ struct GpuTilingAndFusion final
     return tiles;
   }
 
-  static size_t calcOperandsSize(Operation *op) {
-    size_t size = 0;
-    auto typeSize = [](Type t) -> size_t {
-      Type et;
-      if (auto mt = dyn_cast<MemRefType>(t)) {
-        et = mt.getElementType();
-      } else if (auto tt = dyn_cast<TensorType>(t)) {
-        et = tt.getElementType();
+  /// Returns the size, in bytes, of one element of the first DPS init
+  /// (output) operand of `op`.
+  ///
+  /// Falls back to 1 when `op` is not a linalg operation, has no inits, or
+  /// the element type is neither an integer nor a float. The result is always
+  /// at least 1: the tile size computation divides by it, so sub-byte element
+  /// types such as i1 must not yield 0. Bit widths that are not a multiple of
+  /// 8 are rounded up to whole bytes.
+  static int64_t getElementSize(Operation *op) {
+    auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
+    if (!linalgOp) {
+      return 1;
+    }
+    auto inits = linalgOp.getDpsInits();
+    if (inits.empty()) {
+      return 1;
+    }
+    auto t = getElementTypeOrSelf(inits[0].getType());
+    if (!t.isIntOrFloat()) {
+      return 1;
+    }
+    // Ceiling division instead of truncation; the explicit floor of 1 also
+    // covers zero-width integer types.
+    int64_t bytes = (static_cast<int64_t>(t.getIntOrFloatBitWidth()) + 7) / 8;
+    return bytes > 0 ? bytes : 1;
+  }
+
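+  // Checks whether the operation is a candidate for lowering to XeGPU:
+  // it must be a linalg operation with static shapes, its first init (if any)
+  // must be a rank-2 memref/ranked tensor with at least 16 elements, and every
+  // input must be a memref/ranked tensor of rank 2 or less.
+  // TODO: Add more checks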
+  static bool canLowerToXeGPU(Operation *operation) {
+    auto op = dyn_cast<linalg::LinalgOp>(operation);
+    if (!op) {
+      return false;
+    }
+    if (op.hasDynamicShape()) {
+      return false;
+    }
+
+    auto checkOperand = [&](Value operand, bool isOutput = false) {
+      ShapedType type;
+      if (auto memref = dyn_cast<MemRefType>(operand.getType())) {
+        type = memref;
+      } else if (auto tensor = dyn_cast<RankedTensorType>(operand.getType())) {
+        type = tensor;
       } else {
-        return 0;
+        return false;
       }
-      return et.isIntOrFloat() ? et.getIntOrFloatBitWidth() / 8 : 1;
-    };
-    for (auto operand : op->getOperands()) {
-      if (auto defOp = operand.getDefiningOp()) {
-        for (auto t : defOp->getResultTypes()) {
-          size += typeSize(t);
+
+      auto shape = type.getShape();
+      if (isOutput) {
+        if (shape.size() != 2 || shape[0] * shape[1] < 16) {
+          return false;
         }
-      } else {
-        size += typeSize(operand.getType());
+      } else if (shape.size() > 2) {
+        return false;
       }
+
+      return true;
+    };
+
+    if (auto inits = op.getDpsInits();
+        !inits.empty() && !checkOperand(inits[0], true)) {
+      return false;
     }
-    return size == 0 ? 1 : size;
+
+    if (auto inputs = op.getDpsInputs();
+        !std::all_of(inputs.begin(), inputs.end(),
+                     [&](Value v) { return checkOperand(v); })) {
+      return false;
+    }
+
+    return true;
   }
 };
 } // namespace