Skip to content

Commit 6e16b89

Browse files
Apply suggestions from code review
1 parent 95224e2 commit 6e16b89

File tree

5 files changed

+42
-49
lines changed

5 files changed

+42
-49
lines changed

include/gc/Transforms/Passes.td

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
124124
def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
125125
let summary = "GPU tiling and fusion path.";
126126
let description = [{
127-
This pass tiles linalg operations and creates two nested csf.forall loops. When converting to gpu.launch,
127+
This pass tiles linalg operations and creates two nested scf.forall loops. When converting to gpu.launch,
128128
the inner loop is mapped to the block sizes and the outer - to grid sizes. The tiles calculation is based
129129
on the GPU device properties, retrieved from the DLTI attributes. If the DLTI attributes are not specified,
130130
defaults to the pass options.
@@ -139,9 +139,9 @@ def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
139139
Option<"numThreadsPerEu", "num-threads-per-eu", "size_t",
140140
/*default=*/"8",
141141
"Number of threads per Execution Unit.">,
142-
Option<"cacheSize", "cache-size", "size_t",
142+
Option<"localMemSize", "local-mem-size", "size_t",
143143
/*default=*/"131072",
144-
"Execution Unit cache size.">,
144+
"The size of the local memory, shared across a work-group.">,
145145
Option<"vectorWidth", "vector-width", "size_t",
146146
/*default=*/"512",
147147
"The maximum width of EU's vector registers.">,

lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -876,8 +876,7 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
876876
{CL_DEVICE_MAX_COMPUTE_UNITS, "num_exec_units"},
877877
{CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL, "num_exec_units_per_slice"},
878878
{CL_DEVICE_NUM_THREADS_PER_EU_INTEL, "num_threads_per_eu"},
879-
// Assuming the cache size is equal to the local mem
880-
{CL_DEVICE_LOCAL_MEM_SIZE, "L1_cache_size_in_bytes"},
879+
{CL_DEVICE_LOCAL_MEM_SIZE, "local_mem_size"},
881880
};
882881

883882
unsigned i = 0;

lib/gc/Transforms/GPU/GpuTilingAndFusion.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -51,20 +51,17 @@ struct GpuTilingAndFusion final
5151
OpRewriter rw(fn);
5252
tileAndFuseLinalgOps(rw, fn);
5353
tileForallOps(rw, fn);
54-
if (failed(simplifyRegions(rw, fn->getRegions()))) {
55-
// Not simplified
56-
}
5754
}
5855

5956
private:
6057
void tileAndFuseLinalgOps(OpRewriter &rw, func::FuncOp &fn) {
6158
auto numEus = getNumEus(rw);
6259
auto numEusPerSlice = getNumEusPerSlice(rw);
6360
auto numThreadsPerEu = getNumThreadsPerEu(rw);
64-
auto cacheSize = getCacheSize(rw);
61+
auto localMemSize = getLocalMemSize(rw);
6562
auto vectorWidth = getVectorWidth(rw);
6663
auto cachePerThread =
67-
std::max(cacheSize / numEusPerSlice / numThreadsPerEu, vectorWidth);
64+
std::max(localMemSize / numEusPerSlice / numThreadsPerEu, vectorWidth);
6865
SCFTileAndFuseOptions opts;
6966
opts.tilingOptions.setTileSizeComputationFunction(
7067
[&rw, cachePerThread, vectorWidth,

lib/gc/Transforms/GPU/GpuUtils.h

Lines changed: 14 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,9 @@ template <typename DerivedT> struct GpuPass {
5858
static_cast<DerivedT *>(this)->numThreadsPerEu);
5959
}
6060

61-
int64_t getCacheSize(Builder &builder) {
62-
return getGpuPropertyAsInt(builder, "L1_cache_size_in_bytes",
63-
static_cast<DerivedT *>(this)->cacheSize);
61+
int64_t getLocalMemSize(Builder &builder) {
62+
return getGpuPropertyAsInt(builder, "local_mem_size",
63+
static_cast<DerivedT *>(this)->localMemSize);
6464
}
6565

6666
int64_t getVectorWidth(Builder &builder) {
@@ -87,22 +87,9 @@ struct OpRewriter final : IRRewriter {
8787
arith::ConstantIndexOp getConstant(int64_t v) {
8888
if (func.empty()) {
8989
func.addEntryBlock();
90-
} else {
91-
// Try to find ConstantIndexOp with the same value in the function.
92-
for (auto &op : func.getBody().getOps()) {
93-
if (auto constOp = dyn_cast<arith::ConstantIndexOp>(op)) {
94-
if (constOp.value() == v) {
95-
return constOp;
96-
}
97-
}
98-
}
90+
setInsertionPointToStart(func.addEntryBlock());
9991
}
100-
101-
auto ip = saveInsertionPoint();
102-
setInsertionPointToStart(&func.getBody().front());
103-
auto op = create<arith::ConstantIndexOp>(v);
104-
restoreInsertionPoint(ip);
105-
return op;
92+
return create<arith::ConstantIndexOp>(v);
10693
}
10794

10895
private:
@@ -152,9 +139,11 @@ static void adjustTiles(T totalSize, T *begin, T *end,
152139
return;
153140
}
154141

155-
--end;
156-
T a = *begin;
157-
T b = *end;
142+
// a and b are the initial tile sizes, x and y are the new sizes.
143+
T *aPtr = begin;
144+
T *bPtr = end - 1;
145+
T a = *aPtr;
146+
T b = *bPtr;
158147
bool swap = a < b;
159148
if (swap) {
160149
std::swap(a, b);
@@ -164,8 +153,8 @@ static void adjustTiles(T totalSize, T *begin, T *end,
164153
b = floorPow2(b);
165154

166155
if (a * b <= total) {
167-
*begin = swap ? b : a;
168-
*end = swap ? a : b;
156+
*aPtr = swap ? b : a;
157+
*bPtr = swap ? a : b;
169158
return;
170159
}
171160

@@ -196,8 +185,8 @@ static void adjustTiles(T totalSize, T *begin, T *end,
196185
}
197186
}
198187

199-
*begin = swap ? y : x;
200-
*end = swap ? x : y;
188+
*aPtr = swap ? y : x;
189+
*bPtr = swap ? x : y;
201190
}
202191

203192
template <typename T, unsigned N>

test/mlir/unittests/Transforms/GPU/GpuUtilsTest.cpp

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,30 +13,38 @@
1313
#include "gtest/gtest.h"
1414

1515
TEST(testAdjustTiles, GputUtilsTest) {
16-
auto testCalc = [](int64_t totalSize, SmallVector<int64_t> &tiles,
17-
const SmallVector<int64_t> &expected) {
18-
std::cout << totalSize << ": [";
19-
for (unsigned i = 0; i < tiles.size(); i++) {
20-
std::cout << tiles[i] << (i + 1 < tiles.size() ? ", " : "");
16+
bool print = false;
17+
auto testAdjust = [print](int64_t totalSize, SmallVector<int64_t> &tiles,
18+
const SmallVector<int64_t> &expected) {
19+
if (print) {
20+
std::cout << totalSize << ": [";
21+
for (unsigned i = 0; i < tiles.size(); i++) {
22+
std::cout << tiles[i] << (i + 1 < tiles.size() ? ", " : "");
23+
}
24+
std::cout << "] -> [";
2125
}
22-
std::cout << "] -> [";
26+
2327
gc::adjustTiles(totalSize, tiles);
24-
for (unsigned i = 0; i < tiles.size(); i++) {
25-
std::cout << tiles[i] << (i + 1 < tiles.size() ? ", " : "");
28+
29+
if (print) {
30+
for (unsigned i = 0; i < tiles.size(); i++) {
31+
std::cout << tiles[i] << (i + 1 < tiles.size() ? ", " : "");
32+
}
33+
std::cout << "]" << std::endl;
2634
}
27-
std::cout << "]" << std::endl;
35+
2836
EXPECT_EQ(tiles, expected);
2937
};
30-
auto test = [testCalc](int64_t totalSize, SmallVector<int64_t> tiles,
31-
SmallVector<int64_t> expected) {
38+
auto test = [testAdjust](int64_t totalSize, SmallVector<int64_t> tiles,
39+
SmallVector<int64_t> expected) {
3240
if (tiles.size() != 2 || tiles[0] == tiles[1]) {
33-
testCalc(totalSize, tiles, expected);
41+
testAdjust(totalSize, tiles, expected);
3442
return;
3543
}
3644
SmallVector<int64_t> reversed(tiles.rbegin(), tiles.rend());
37-
testCalc(totalSize, tiles, expected);
45+
testAdjust(totalSize, tiles, expected);
3846
std::reverse(expected.begin(), expected.end());
39-
testCalc(totalSize, reversed, expected);
47+
testAdjust(totalSize, reversed, expected);
4048
};
4149

4250
test(8, {1, 1}, {1, 1});

0 commit comments

Comments
 (0)