test

AndreyPavlenko · AndreyPavlenko · commit 8d0da5b66eb1 · 2024-10-25T16:40:41.000+02:00
diff --git a/lib/gc/Transforms/GPU/GpuLoopTiling.cpp b/lib/gc/Transforms/GPU/GpuLoopTiling.cpp
@@ -41,34 +41,29 @@ struct GpuLoopTiling final : GpuPass<GpuLoopTiling>,
 
   void runOnOperation() override {
     IRRewriter rewriter(&getContext());
-    auto euThreads = static_cast<double>(getEuThreads(rewriter));
+    size_t euThreads = getEuThreads(rewriter);
     getOperation().walk<WalkOrder::PreOrder>([&](scf::ParallelOp loop) {
       if (!loop->getParentOfType<scf::ParallelOp>()) {
-        tile(loop, euThreads);
+        SmallVector<int64_t> tiles;
+        auto steps = loop.getStep();
+        tiles.reserve(steps.size());
+
+        for (auto step : steps) {
+          if (auto v = getConstIdxValue(step)) {
+            tiles.push_back(v);
+          } else {
+            tiles.push_back(32);
+          }
+        }
+
+        calcTiles(euThreads, tiles);
+        tileParallelLoop(loop, tiles, false);
       }
       return WalkResult::skip();
     });
     if (failed(simplifyRegions(rewriter, getOperation()->getRegions()))) {
       gcLogD("Failed to simplify regions");
     }
   }
-
-private:
-  static void tile(scf::ParallelOp loop, double euThreads) {
-    SmallVector<int64_t> tileSizes;
-    auto steps = loop.getStep();
-    tileSizes.reserve(steps.size());
-
-    for (auto step : steps) {
-      if (auto v = getConstIdxValue(step)) {
-        tileSizes.push_back(static_cast<int64_t>(
-            std::ceil(static_cast<double>(v) / euThreads)));
-      } else {
-        tileSizes.push_back(32);
-      }
-    }
-
-    tileParallelLoop(loop, tileSizes, false);
-  }
 };
 } // namespace
diff --git a/lib/gc/Transforms/GPU/GpuTilingAndFusion.cpp b/lib/gc/Transforms/GPU/GpuTilingAndFusion.cpp
@@ -46,26 +46,27 @@ struct GpuTilingAndFusion final
   void runOnOperation() override {
     IRRewriter rewriter(&getContext());
     scf::SCFTileAndFuseOptions opts;
+    opts.setFusionControlFn(
+        [&](tensor::ExtractSliceOp candidateSliceOp, OpResult originalProducer,
+            bool isDestinationOperand)
+            -> std::optional<scf::SCFTileAndFuseOptions::ControlFnResult> {
+          Operation *op = originalProducer.getOwner();
+          if (!op) {
+            return std::nullopt;
+          }
+          if (auto linalgOp = dyn_cast<linalg::LinalgOp>(op)) {
+            if (!linalgOp.hasOnlyProjectedPermutations()) {
+              return std::nullopt;
+            }
+          }
+          return scf::SCFTileAndFuseOptions::ControlFnResult{};
+        });
     opts.tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
     // The outer loop is converted to a GPU kernel and the tile sizes are mapped
     // to the grid sizes.
     opts.tilingOptions.setTileSizeComputationFunction(
-        // The tile sizes calculation is based on the following equation:
-        // n * TS0 * TS1 * ... * TSn = euMem
-        // where:
-        // n - an average number of bytes, processed by each iteration
-        // TS0, TS1, ... TSn - the tile sizes for each loop correspondingly
-        // euMem - the physical memory (cache) size of the GPU execution unit
-        //
-        // To calculate the tile size TS, we need to divide the total loop size
-        // S by the ratio r:
-        //
-        // n * (S0/r0) * (S1/r1) * ... * (Sn/rn) = euMem
-        // r0 * r1 * ... * rn = (n * S0 * S1 * ... * Sn) / euMem
-        // If all sizes are equal, then S0 = ... = Sn = S, r0 = ... = rn = r:
-        // r^n = (n * S^n) / euMem
-        // r = (n * S^n / euMem)^(1/n)
-        [euMem = getEuMem(rewriter), euThreads = getEuThreads(rewriter)](
+        [euMem = getEuMem(rewriter), euThreads = getEuThreads(rewriter),
+         vectorWidth = getVectorWidth(rewriter)](
             OpBuilder &builder, Operation *op) -> SmallVector<OpFoldResult> {
           auto ti = dyn_cast<TilingInterface>(op);
           if (!ti) {
@@ -76,44 +77,45 @@ struct GpuTilingAndFusion final
           auto itDomains = ti.getIterationDomain(builder);
           assert(itTypes.size() == itDomains.size());
 
-          // TODO: Add a parameter to the options?
-          size_t totalSize = calcOperandsSize(op) * euThreads;
-          unsigned loopCount = 0;
-
+          SmallVector<int64_t> tiles;
           for (auto [t, r] : zip(itTypes, itDomains)) {
             if (t == utils::IteratorType::parallel) {
               if (auto v = getConstantIntValue(r.size)) {
-                loopCount++;
-                totalSize *= *v;
+                tiles.emplace_back(*v);
               } else {
                 return calcDynamicSizes(builder, ti, euMem, euThreads);
               }
             }
           }
 
-          if (loopCount == 0) {
+          if (tiles.empty()) {
             return {};
           }
 
-          // TODO: In case of different sizes, calculate the ratio for each loop
-          double ratio = std::pow(static_cast<double>(totalSize) /
-                                      static_cast<double>(euMem),
-                                  1.0 / loopCount);
-          ratio = std::max(1.0, ratio);
-          SmallVector<OpFoldResult> tiles;
-          tiles.reserve(itDomains.size());
+          size_t elementSize = 1;
+          if (auto linalgOp = dyn_cast<linalg::LinalgOp>(op)) {
+            auto t = linalgOp.getDpsInits()[0].getType();
+            if (t.isIntOrFloat()) {
+              elementSize = t.getIntOrFloatBitWidth() / 8;
+            }
+          }
+          calcTiles(
+              std::max(euThreads, euThreads / 2 * vectorWidth / elementSize),
+              tiles);
+
+          unsigned counter = 0;
+          SmallVector<OpFoldResult> result;
+          result.reserve(itDomains.size());
 
           for (auto [t, r] : zip(itTypes, itDomains)) {
             if (t != utils::IteratorType::parallel) {
-              tiles.emplace_back(builder.getIndexAttr(1));
-            } else if (auto v = getConstantIntValue(r.size)) {
-              tiles.emplace_back(ceil(builder, *v, ratio));
+              result.emplace_back(builder.getIndexAttr(1));
             } else {
-              abort(); // Must never get here
+              result.emplace_back(builder.getIndexAttr(tiles[counter++]));
             }
           }
 
-          return tiles;
+          return result;
         });
 
     auto fn = getOperation();
@@ -174,7 +176,8 @@ struct GpuTilingAndFusion final
   static std::optional<TilingInterface> findTi(Operation *op) {
     std::optional<TilingInterface> last;
     op->walk<WalkOrder::PreOrder>([&](linalg::LinalgOp linalgOp) {
-      if (!linalgOp->getParentOfType<scf::ForallOp>()) {
+      if (linalgOp.hasOnlyProjectedPermutations() &&
+          !linalgOp->getParentOfType<scf::ForallOp>()) {
         if (auto ti = dyn_cast<TilingInterface>(linalgOp.getOperation())) {
           last = ti;
         }
diff --git a/lib/gc/Transforms/GPU/GpuUtils.h b/lib/gc/Transforms/GPU/GpuUtils.h
@@ -8,6 +8,8 @@
 #ifndef GPUUTILS_H
 #define GPUUTILS_H
 
+#include <numeric>
+
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/Interfaces/DataLayoutInterfaces.h"
@@ -42,6 +44,10 @@ template <typename DerivedT> struct GpuPass {
     return getDeviceProperty(builder, "threads_per_eu",
                              static_cast<DerivedT *>(this)->euThreads);
   }
+
+  size_t getVectorWidth(Builder &builder) {
+    return getDeviceProperty(builder, "max_vector_width", 512);
+  }
 };
 
 template <typename A, typename B>
@@ -69,4 +75,38 @@ static int64_t getConstIdxValue(Value value) {
   }
   return 0;
 }
+
+// Overwrites every entry of `tiles` with a power of two so that the product of
+// the new entries is close to totalSize and never exceeds max(totalSize, 1).
+//
+// The initial values of `tiles` only define the distribution order: the entries
+// are processed in ascending order of their initial value, so an entry that was
+// smaller initially never gets a larger tile. `tiles` must not be empty.
+static void calcTiles(size_t totalSize, SmallVector<int64_t> &tiles) {
+  size_t count = tiles.size();
+  assert(count > 0);
+  std::vector<std::pair<int64_t, size_t>> pairs;
+  pairs.reserve(count);
+  for (size_t i = 0; i < count; ++i) {
+    pairs.emplace_back(tiles[i], i);
+  }
+  std::sort(pairs.begin(), pairs.end());
+
+  // Distribute the totalSize among the tiles: each tile is the largest power of
+  // two that does not exceed the (count - i)-th root of the remaining size,
+  // i.e. 2^(floor(log2(totalSize)) / (count - i)). Integer arithmetic is used
+  // on purpose: std::pow(64, 1.0 / 3) is rounded to 3.99..., which truncates to
+  // 3 and would produce the tile 2 instead of 4.
+  for (size_t i = 0; i < count; ++i) {
+    // floor(log2(totalSize)); stays 0 for totalSize <= 1, so the tile is 1.
+    unsigned bits = 0;
+    for (size_t rest = totalSize; rest > 1; rest >>= 1) {
+      ++bits;
+    }
+    // 64-bit shift, capped to keep the result inside the int64_t range.
+    int64_t tile = int64_t{1} << std::min<size_t>(bits / (count - i), 62);
+    tiles[pairs[i].second] = tile;
+    totalSize /= tile;
+  }
+}
 #endif