Apply suggestions from code review

AndreyPavlenko · AndreyPavlenko · commit 6e16b8978cb5 · 2024-11-22T04:13:21.000+01:00
diff --git a/include/gc/Transforms/Passes.td b/include/gc/Transforms/Passes.td
@@ -124,7 +124,7 @@ def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
 def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
   let summary = "GPU tiling and fusion path.";
   let description = [{
-    This pass tiles linalg operations and creates two nested csf.forall loops. When converting to gpu.launch,
+    This pass tiles linalg operations and creates two nested scf.forall loops. When converting to gpu.launch,
     the inner loop is mapped to the block sizes and the outer - to grid sizes. The tiles calculation is based
     on the GPU device properties, retrieved from the DLTI attributes. If the DLTI attributes are not specified,
     defaults to the pass options.
@@ -139,9 +139,9 @@ def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
     Option<"numThreadsPerEu", "num-threads-per-eu", "size_t",
            /*default=*/"8",
            "Number of threads per Execution Unit.">,
-    Option<"cacheSize", "cache-size", "size_t",
+    Option<"localMemSize", "local-mem-size", "size_t",
            /*default=*/"131072",
-           "Execution Unit cache size.">,
+           "The size of the local memory, shared across a work-group.">,
     Option<"vectorWidth", "vector-width", "size_t",
            /*default=*/"512",
            "The maximum width of EU's vector registers.">,
diff --git a/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp b/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp
@@ -876,8 +876,7 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
         {CL_DEVICE_MAX_COMPUTE_UNITS, "num_exec_units"},
         {CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL, "num_exec_units_per_slice"},
         {CL_DEVICE_NUM_THREADS_PER_EU_INTEL, "num_threads_per_eu"},
-        // Assuming the cache size is equal to the local mem
-        {CL_DEVICE_LOCAL_MEM_SIZE, "L1_cache_size_in_bytes"},
+        {CL_DEVICE_LOCAL_MEM_SIZE, "local_mem_size"},
     };
 
     unsigned i = 0;
diff --git a/lib/gc/Transforms/GPU/GpuTilingAndFusion.cpp b/lib/gc/Transforms/GPU/GpuTilingAndFusion.cpp
@@ -51,20 +51,17 @@ struct GpuTilingAndFusion final
     OpRewriter rw(fn);
     tileAndFuseLinalgOps(rw, fn);
     tileForallOps(rw, fn);
-    if (failed(simplifyRegions(rw, fn->getRegions()))) {
-      // Not simplified
-    }
   }
 
 private:
   void tileAndFuseLinalgOps(OpRewriter &rw, func::FuncOp &fn) {
     auto numEus = getNumEus(rw);
     auto numEusPerSlice = getNumEusPerSlice(rw);
     auto numThreadsPerEu = getNumThreadsPerEu(rw);
-    auto cacheSize = getCacheSize(rw);
+    auto localMemSize = getLocalMemSize(rw);
     auto vectorWidth = getVectorWidth(rw);
     auto cachePerThread =
-        std::max(cacheSize / numEusPerSlice / numThreadsPerEu, vectorWidth);
+        std::max(localMemSize / numEusPerSlice / numThreadsPerEu, vectorWidth);
     SCFTileAndFuseOptions opts;
     opts.tilingOptions.setTileSizeComputationFunction(
         [&rw, cachePerThread, vectorWidth,
diff --git a/lib/gc/Transforms/GPU/GpuUtils.h b/lib/gc/Transforms/GPU/GpuUtils.h
@@ -58,9 +58,9 @@ template <typename DerivedT> struct GpuPass {
                                static_cast<DerivedT *>(this)->numThreadsPerEu);
   }
 
-  int64_t getCacheSize(Builder &builder) {
-    return getGpuPropertyAsInt(builder, "L1_cache_size_in_bytes",
-                               static_cast<DerivedT *>(this)->cacheSize);
+  int64_t getLocalMemSize(Builder &builder) {
+    return getGpuPropertyAsInt(builder, "local_mem_size",
+                               static_cast<DerivedT *>(this)->localMemSize);
   }
 
   int64_t getVectorWidth(Builder &builder) {
@@ -87,22 +87,9 @@ struct OpRewriter final : IRRewriter {
   arith::ConstantIndexOp getConstant(int64_t v) {
     if (func.empty()) {
       func.addEntryBlock();
-    } else {
-      // Try to find ConstantIndexOp with the same value in the function.
-      for (auto &op : func.getBody().getOps()) {
-        if (auto constOp = dyn_cast<arith::ConstantIndexOp>(op)) {
-          if (constOp.value() == v) {
-            return constOp;
-          }
-        }
-      }
+      setInsertionPointToStart(func.addEntryBlock());
     }
-
-    auto ip = saveInsertionPoint();
-    setInsertionPointToStart(&func.getBody().front());
-    auto op = create<arith::ConstantIndexOp>(v);
-    restoreInsertionPoint(ip);
-    return op;
+    return create<arith::ConstantIndexOp>(v);
   }
 
 private:
@@ -152,9 +139,11 @@ static void adjustTiles(T totalSize, T *begin, T *end,
     return;
   }
 
-  --end;
-  T a = *begin;
-  T b = *end;
+  // a and b are the initial tile sizes, x and y are the new sizes.
+  T *aPtr = begin;
+  T *bPtr = end - 1;
+  T a = *aPtr;
+  T b = *bPtr;
   bool swap = a < b;
   if (swap) {
     std::swap(a, b);
@@ -164,8 +153,8 @@ static void adjustTiles(T totalSize, T *begin, T *end,
   b = floorPow2(b);
 
   if (a * b <= total) {
-    *begin = swap ? b : a;
-    *end = swap ? a : b;
+    *aPtr = swap ? b : a;
+    *bPtr = swap ? a : b;
     return;
   }
 
@@ -196,8 +185,8 @@ static void adjustTiles(T totalSize, T *begin, T *end,
     }
   }
 
-  *begin = swap ? y : x;
-  *end = swap ? x : y;
+  *aPtr = swap ? y : x;
+  *bPtr = swap ? x : y;
 }
 
 template <typename T, unsigned N>
diff --git a/test/mlir/unittests/Transforms/GPU/GpuUtilsTest.cpp b/test/mlir/unittests/Transforms/GPU/GpuUtilsTest.cpp
@@ -13,30 +13,38 @@
 #include "gtest/gtest.h"
 
 TEST(testAdjustTiles, GputUtilsTest) {
-  auto testCalc = [](int64_t totalSize, SmallVector<int64_t> &tiles,
-                     const SmallVector<int64_t> &expected) {
-    std::cout << totalSize << ": [";
-    for (unsigned i = 0; i < tiles.size(); i++) {
-      std::cout << tiles[i] << (i + 1 < tiles.size() ? ", " : "");
+  bool print = false;
+  auto testAdjust = [print](int64_t totalSize, SmallVector<int64_t> &tiles,
+                            const SmallVector<int64_t> &expected) {
+    if (print) {
+      std::cout << totalSize << ": [";
+      for (unsigned i = 0; i < tiles.size(); i++) {
+        std::cout << tiles[i] << (i + 1 < tiles.size() ? ", " : "");
+      }
+      std::cout << "] -> [";
     }
-    std::cout << "] -> [";
+
     gc::adjustTiles(totalSize, tiles);
-    for (unsigned i = 0; i < tiles.size(); i++) {
-      std::cout << tiles[i] << (i + 1 < tiles.size() ? ", " : "");
+
+    if (print) {
+      for (unsigned i = 0; i < tiles.size(); i++) {
+        std::cout << tiles[i] << (i + 1 < tiles.size() ? ", " : "");
+      }
+      std::cout << "]" << std::endl;
     }
-    std::cout << "]" << std::endl;
+
     EXPECT_EQ(tiles, expected);
   };
-  auto test = [testCalc](int64_t totalSize, SmallVector<int64_t> tiles,
-                         SmallVector<int64_t> expected) {
+  auto test = [testAdjust](int64_t totalSize, SmallVector<int64_t> tiles,
+                           SmallVector<int64_t> expected) {
     if (tiles.size() != 2 || tiles[0] == tiles[1]) {
-      testCalc(totalSize, tiles, expected);
+      testAdjust(totalSize, tiles, expected);
       return;
     }
     SmallVector<int64_t> reversed(tiles.rbegin(), tiles.rend());
-    testCalc(totalSize, tiles, expected);
+    testAdjust(totalSize, tiles, expected);
     std::reverse(expected.begin(), expected.end());
-    testCalc(totalSize, reversed, expected);
+    testAdjust(totalSize, reversed, expected);
   };
 
   test(8, {1, 1}, {1, 1});