Skip to content

Commit 8d0da5b

Browse files
test
1 parent 7d157e7 commit 8d0da5b

File tree

3 files changed

+84
-56
lines changed

3 files changed

+84
-56
lines changed

lib/gc/Transforms/GPU/GpuLoopTiling.cpp

Lines changed: 15 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -41,34 +41,29 @@ struct GpuLoopTiling final : GpuPass<GpuLoopTiling>,
4141

4242
// Pass entry point: tiles every scf.parallel loop that is not nested inside
// another scf.parallel, then tries to simplify the regions of the operation.
// NOTE: this span is a rendered diff; lines following an `NN-` marker belong
// to the old revision and lines following an `NN+` marker to the new one.
void runOnOperation() override {
4343
IRRewriter rewriter(&getContext());
44-
auto euThreads = static_cast<double>(getEuThreads(rewriter));
44+
size_t euThreads = getEuThreads(rewriter);
4545
getOperation().walk<WalkOrder::PreOrder>([&](scf::ParallelOp loop) {
4646
// Only loops without an enclosing scf.parallel are tiled here.
if (!loop->getParentOfType<scf::ParallelOp>()) {
47-
tile(loop, euThreads);
47+
SmallVector<int64_t> tiles;
48+
auto steps = loop.getStep();
49+
tiles.reserve(steps.size());
50+
51+
// Seed one entry per loop step: the value reported by getConstIdxValue, or
// 32 when that value is 0 (the condition below is then false).
for (auto step : steps) {
52+
if (auto v = getConstIdxValue(step)) {
53+
tiles.push_back(v);
54+
} else {
55+
tiles.push_back(32);
56+
}
57+
}
58+
59+
// calcTiles rewrites `tiles` in place using euThreads as the size budget.
calcTiles(euThreads, tiles);
60+
// NOTE(review): the trailing `false` is presumably the noMinMaxBounds flag
// of tileParallelLoop — confirm against its declaration.
tileParallelLoop(loop, tiles, false);
4861
}
4962
// Returned for every visited scf.parallel, whether or not it was tiled.
return WalkResult::skip();
5063
});
5164
// A simplification failure is only logged; the pass itself is not failed.
if (failed(simplifyRegions(rewriter, getOperation()->getRegions()))) {
5265
gcLogD("Failed to simplify regions");
5366
}
5467
}
55-
56-
private:
57-
static void tile(scf::ParallelOp loop, double euThreads) {
58-
SmallVector<int64_t> tileSizes;
59-
auto steps = loop.getStep();
60-
tileSizes.reserve(steps.size());
61-
62-
for (auto step : steps) {
63-
if (auto v = getConstIdxValue(step)) {
64-
tileSizes.push_back(static_cast<int64_t>(
65-
std::ceil(static_cast<double>(v) / euThreads)));
66-
} else {
67-
tileSizes.push_back(32);
68-
}
69-
}
70-
71-
tileParallelLoop(loop, tileSizes, false);
72-
}
7368
};
7469
} // namespace

lib/gc/Transforms/GPU/GpuTilingAndFusion.cpp

Lines changed: 39 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -46,26 +46,27 @@ struct GpuTilingAndFusion final
4646
void runOnOperation() override {
4747
IRRewriter rewriter(&getContext());
4848
scf::SCFTileAndFuseOptions opts;
49+
opts.setFusionControlFn(
50+
[&](tensor::ExtractSliceOp candidateSliceOp, OpResult originalProducer,
51+
bool isDestinationOperand)
52+
-> std::optional<scf::SCFTileAndFuseOptions::ControlFnResult> {
53+
Operation *op = originalProducer.getOwner();
54+
if (!op) {
55+
return std::nullopt;
56+
}
57+
if (auto linalgOp = dyn_cast<linalg::LinalgOp>(op)) {
58+
if (!linalgOp.hasOnlyProjectedPermutations()) {
59+
return std::nullopt;
60+
}
61+
}
62+
return scf::SCFTileAndFuseOptions::ControlFnResult{};
63+
});
4964
opts.tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
5065
// The outer loop is converted to a GPU kernel and the tile sizes are mapped
5166
// to the grid sizes.
5267
opts.tilingOptions.setTileSizeComputationFunction(
53-
// The tile sizes calculation is based on the following equation:
54-
// n * TS0 * TS1 * ... * TSn = euMem
55-
// where:
56-
// n - an average number of bytes, processed by each iteration
57-
// TS0, TS1, ... TSn - the tile sizes for each loop correspondingly
58-
// euMem - the physical memory (cache) size of the GPU execution unit
59-
//
60-
// To calculate the tile size TS, we need to divide the total loop size
61-
// S by the ratio r:
62-
//
63-
// n * (S0/r0) * (S1/r1) * ... * (Sn/rn) = euMem
64-
// r0 * r1 * ... * rn = (n * S0 * S1 * ... * Sn) / euMem
65-
// If all sizes are equal, then S0 = ... = Sn = S, r0 = ... = rn = r:
66-
// r^n = (n * S^n) / euMem
67-
// r = (n * S^n / euMem)^(1/n)
68-
[euMem = getEuMem(rewriter), euThreads = getEuThreads(rewriter)](
68+
[euMem = getEuMem(rewriter), euThreads = getEuThreads(rewriter),
69+
vectorWidth = getVectorWidth(rewriter)](
6970
OpBuilder &builder, Operation *op) -> SmallVector<OpFoldResult> {
7071
auto ti = dyn_cast<TilingInterface>(op);
7172
if (!ti) {
@@ -76,44 +77,45 @@ struct GpuTilingAndFusion final
7677
auto itDomains = ti.getIterationDomain(builder);
7778
assert(itTypes.size() == itDomains.size());
7879

79-
// TODO: Add a parameter to the options?
80-
size_t totalSize = calcOperandsSize(op) * euThreads;
81-
unsigned loopCount = 0;
82-
80+
SmallVector<int64_t> tiles;
8381
for (auto [t, r] : zip(itTypes, itDomains)) {
8482
if (t == utils::IteratorType::parallel) {
8583
if (auto v = getConstantIntValue(r.size)) {
86-
loopCount++;
87-
totalSize *= *v;
84+
tiles.emplace_back(*v);
8885
} else {
8986
return calcDynamicSizes(builder, ti, euMem, euThreads);
9087
}
9188
}
9289
}
9390

94-
if (loopCount == 0) {
91+
if (tiles.empty()) {
9592
return {};
9693
}
9794

98-
// TODO: In case of different sizes, calculate the ratio for each loop
99-
double ratio = std::pow(static_cast<double>(totalSize) /
100-
static_cast<double>(euMem),
101-
1.0 / loopCount);
102-
ratio = std::max(1.0, ratio);
103-
SmallVector<OpFoldResult> tiles;
104-
tiles.reserve(itDomains.size());
95+
size_t elementSize = 1;
96+
if (auto linalgOp = dyn_cast<linalg::LinalgOp>(op)) {
97+
auto t = linalgOp.getDpsInits()[0].getType();
98+
if (t.isIntOrFloat()) {
99+
elementSize = t.getIntOrFloatBitWidth() / 8;
100+
}
101+
}
102+
calcTiles(
103+
std::max(euThreads, euThreads / 2 * vectorWidth / elementSize),
104+
tiles);
105+
106+
unsigned counter = 0;
107+
SmallVector<OpFoldResult> result;
108+
result.reserve(itDomains.size());
105109

106110
for (auto [t, r] : zip(itTypes, itDomains)) {
107111
if (t != utils::IteratorType::parallel) {
108-
tiles.emplace_back(builder.getIndexAttr(1));
109-
} else if (auto v = getConstantIntValue(r.size)) {
110-
tiles.emplace_back(ceil(builder, *v, ratio));
112+
result.emplace_back(builder.getIndexAttr(1));
111113
} else {
112-
abort(); // Must never get here
114+
result.emplace_back(builder.getIndexAttr(tiles[counter++]));
113115
}
114116
}
115117

116-
return tiles;
118+
return result;
117119
});
118120

119121
auto fn = getOperation();
@@ -174,7 +176,8 @@ struct GpuTilingAndFusion final
174176
static std::optional<TilingInterface> findTi(Operation *op) {
175177
std::optional<TilingInterface> last;
176178
op->walk<WalkOrder::PreOrder>([&](linalg::LinalgOp linalgOp) {
177-
if (!linalgOp->getParentOfType<scf::ForallOp>()) {
179+
if (linalgOp.hasOnlyProjectedPermutations() &&
180+
!linalgOp->getParentOfType<scf::ForallOp>()) {
178181
if (auto ti = dyn_cast<TilingInterface>(linalgOp.getOperation())) {
179182
last = ti;
180183
}

lib/gc/Transforms/GPU/GpuUtils.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
#ifndef GPUUTILS_H
99
#define GPUUTILS_H
1010

11+
#include <numeric>
12+
1113
#include "mlir/IR/Builders.h"
1214
#include "mlir/IR/BuiltinOps.h"
1315
#include "mlir/Interfaces/DataLayoutInterfaces.h"
@@ -42,6 +44,10 @@ template <typename DerivedT> struct GpuPass {
4244
return getDeviceProperty(builder, "threads_per_eu",
4345
static_cast<DerivedT *>(this)->euThreads);
4446
}
47+
48+
// Looks up the "max_vector_width" device property through getDeviceProperty,
// passing 512 as the default argument, and returns it as a size_t.
size_t getVectorWidth(Builder &builder) {
  const size_t vectorWidth =
      getDeviceProperty(builder, "max_vector_width", 512);
  return vectorWidth;
}
4551
};
4652

4753
template <typename A, typename B>
@@ -69,4 +75,28 @@ static int64_t getConstIdxValue(Value value) {
6975
}
7076
return 0;
7177
}
78+
79+
// Calculate tiles so that the product of the tiles is as close to totalSize as
80+
// possible and proportional to the initial tiles.
81+
static void calcTiles(size_t totalSize, SmallVector<int64_t> &tiles) {
82+
size_t count = tiles.size();
83+
assert(count > 0);
84+
int64_t max = 1;
85+
std::vector<std::pair<int64_t, size_t>> pairs;
86+
pairs.reserve(count);
87+
for (size_t i = 0; i < count; ++i) {
88+
max = std::max(max, tiles[i]);
89+
pairs.emplace_back(tiles[i], i);
90+
}
91+
std::sort(pairs.begin(), pairs.end());
92+
93+
// Distribute the totalSize among the tiles
94+
for (size_t i = 0; i < count; ++i) {
95+
auto tile = static_cast<int64_t>(
96+
std::pow(totalSize, 1.0 / static_cast<double>(count - i)));
97+
tile = 1 << static_cast<unsigned>(std::log2(tile));
98+
tiles[pairs[i].second] = tile;
99+
totalSize /= tile;
100+
}
101+
}
72102
#endif

0 commit comments

Comments
 (0)