test

AndreyPavlenko · AndreyPavlenko · commit 04326d831bd6 · 2024-10-24T04:36:53.000+02:00
diff --git a/lib/gc/Transforms/GPU/GpuLoopTiling.cpp b/lib/gc/Transforms/GPU/GpuLoopTiling.cpp
@@ -41,34 +41,30 @@ struct GpuLoopTiling final : GpuPass<GpuLoopTiling>,
 
   void runOnOperation() override {
     IRRewriter rewriter(&getContext());
-    auto euThreads = static_cast<double>(getEuThreads(rewriter));
+    size_t euThreads = getEuThreads(rewriter);
     getOperation().walk<WalkOrder::PreOrder>([&](scf::ParallelOp loop) {
       if (!loop->getParentOfType<scf::ParallelOp>()) {
-        tile(loop, euThreads);
+        SmallVector<int64_t> loopSizes;
+        auto steps = loop.getStep();
+        loopSizes.reserve(steps.size());
+
+        for (auto step : steps) {
+          if (auto v = getConstIdxValue(step)) {
+            loopSizes.push_back(v);
+          } else {
+            loopSizes.push_back(32);
+          }
+        }
+
+        SmallVector<int64_t> tileSizes;
+        normaliseTiles(euThreads, loopSizes, tileSizes);
+        tileParallelLoop(loop, tileSizes, false);
       }
       return WalkResult::skip();
     });
     if (failed(simplifyRegions(rewriter, getOperation()->getRegions()))) {
       gcLogD("Failed to simplify regions");
     }
   }
-
-private:
-  static void tile(scf::ParallelOp loop, double euThreads) {
-    SmallVector<int64_t> tileSizes;
-    auto steps = loop.getStep();
-    tileSizes.reserve(steps.size());
-
-    for (auto step : steps) {
-      if (auto v = getConstIdxValue(step)) {
-        tileSizes.push_back(static_cast<int64_t>(
-            std::ceil(static_cast<double>(v) / euThreads)));
-      } else {
-        tileSizes.push_back(32);
-      }
-    }
-
-    tileParallelLoop(loop, tileSizes, false);
-  }
 };
 } // namespace
diff --git a/lib/gc/Transforms/GPU/GpuTilingAndFusion.cpp b/lib/gc/Transforms/GPU/GpuTilingAndFusion.cpp
@@ -46,6 +46,21 @@ struct GpuTilingAndFusion final
   void runOnOperation() override {
     IRRewriter rewriter(&getContext());
     scf::SCFTileAndFuseOptions opts;
+    opts.setFusionControlFn(
+        [&](tensor::ExtractSliceOp candidateSliceOp, OpResult originalProducer,
+            bool isDestinationOperand)
+            -> std::optional<scf::SCFTileAndFuseOptions::ControlFnResult> {
+          Operation *op = originalProducer.getOwner();
+          if (!op) {
+            return std::nullopt;
+          }
+          if (auto linalgOp = dyn_cast<linalg::LinalgOp>(op)) {
+            if (!linalgOp.hasOnlyProjectedPermutations()) {
+              return std::nullopt;
+            }
+          }
+          return scf::SCFTileAndFuseOptions::ControlFnResult{};
+        });
     opts.tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
     // The outer loop is converted to a GPU kernel and the tile sizes are mapped
     // to the grid sizes.
@@ -77,13 +92,15 @@ struct GpuTilingAndFusion final
           assert(itTypes.size() == itDomains.size());
 
           // TODO: Add a parameter to the options?
-          size_t totalSize = calcOperandsSize(op) * euThreads;
+          size_t totalSize = calcOperandsSize(op);
           unsigned loopCount = 0;
+          SmallVector<int64_t> sizes;
 
           for (auto [t, r] : zip(itTypes, itDomains)) {
             if (t == utils::IteratorType::parallel) {
               if (auto v = getConstantIntValue(r.size)) {
                 loopCount++;
+                sizes.emplace_back(*v);
                 totalSize *= *v;
               } else {
                 return calcDynamicSizes(builder, ti, euMem, euThreads);
@@ -95,19 +112,25 @@ struct GpuTilingAndFusion final
             return {};
           }
 
-          // TODO: In case of different sizes, calculate the ratio for each loop
-          double ratio = std::pow(static_cast<double>(totalSize) /
-                                      static_cast<double>(euMem),
-                                  1.0 / loopCount);
-          ratio = std::max(1.0, ratio);
+          auto outerTileSize = static_cast<size_t>(
+              std::ceil(static_cast<double>(euMem) /
+                        static_cast<double>(calcOperandsSize(op))));
+          SmallVector<int64_t> outerTiles;
+          SmallVector<int64_t> innerTiles;
+          normaliseTiles(outerTileSize, sizes, outerTiles);
+          normaliseTiles(euThreads, sizes, innerTiles);
+
+          unsigned counter = 0;
           SmallVector<OpFoldResult> tiles;
           tiles.reserve(itDomains.size());
 
           for (auto [t, r] : zip(itTypes, itDomains)) {
             if (t != utils::IteratorType::parallel) {
               tiles.emplace_back(builder.getIndexAttr(1));
             } else if (auto v = getConstantIntValue(r.size)) {
-              tiles.emplace_back(ceil(builder, *v, ratio));
+              tiles.emplace_back(
+                  ceil(builder, outerTiles[counter], innerTiles[counter]));
+              counter++;
             } else {
               abort(); // Must never get here
             }
@@ -174,7 +197,8 @@ struct GpuTilingAndFusion final
   static std::optional<TilingInterface> findTi(Operation *op) {
     std::optional<TilingInterface> last;
     op->walk<WalkOrder::PreOrder>([&](linalg::LinalgOp linalgOp) {
-      if (!linalgOp->getParentOfType<scf::ForallOp>()) {
+      if (linalgOp.hasOnlyProjectedPermutations() &&
+          !linalgOp->getParentOfType<scf::ForallOp>()) {
         if (auto ti = dyn_cast<TilingInterface>(linalgOp.getOperation())) {
           last = ti;
         }
diff --git a/lib/gc/Transforms/GPU/GpuUtils.h b/lib/gc/Transforms/GPU/GpuUtils.h
@@ -8,6 +8,8 @@
 #ifndef GPUUTILS_H
 #define GPUUTILS_H
 
+#include <numeric>
+
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/Interfaces/DataLayoutInterfaces.h"
@@ -69,4 +71,28 @@ static int64_t getConstIdxValue(Value value) {
   }
   return 0;
 }
+
+// Splits the totalSize budget into one tile factor per loop. Loops are taken
+// in ascending size order; each gets the (loopCount - i)-th root of what is
+// left, capped by its own size and floored at 1, so a zero budget or a
+// non-positive loop size can neither produce a zero tile nor divide by zero.
+static void normaliseTiles(size_t totalSize, SmallVector<int64_t> &loopSizes,
+                           SmallVector<int64_t> &tiles) {
+  size_t loopCount = loopSizes.size();
+  assert(loopCount > 0);
+  std::vector<std::pair<int64_t, size_t>> sorted;
+  sorted.reserve(loopCount);
+  for (size_t i = 0; i < loopCount; ++i) {
+    sorted.emplace_back(loopSizes[i], i);
+  }
+  std::sort(sorted.begin(), sorted.end());
+  tiles.assign(loopCount, 1);
+  for (size_t i = 0; i < loopCount; ++i) {
+    auto root = static_cast<int64_t>(
+        std::pow(totalSize, 1.0 / static_cast<double>(loopCount - i)));
+    int64_t factor = std::max<int64_t>(1, std::min(root, sorted[i].first));
+    tiles[sorted[i].second] = factor;
+    totalSize /= factor;
+  }
+}
 #endif