Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 9 additions & 14 deletions lib/gc/Transforms/GPU/GpuTilingAndFusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,17 +101,16 @@ struct GpuTilingAndFusion final
auto sizePerThread = numIterations / numThreads * elementSize;
auto totalSize = std::max(sizePerThread, cachePerThread);
totalSize = std::max(totalSize / elementSize, 64L);
int64_t minTileSize = 1;
bool xeGpu = canLowerToXeGPU(op);

// If the operation could be lowered to XeGPU, make the tiles
// multiple of the vector width and the minimum tile size 8.
if (canLowerToXeGPU(op)) {
minTileSize = 8;
// multiple of the vector width.
if (xeGpu) {
totalSize = std::max(totalSize / vectorWidth, 1L) * vectorWidth;
}

SmallVector<int64_t> tiles = sizes;
adjustTiles(totalSize, tiles, minTileSize);
adjustTiles(totalSize, tiles, xeGpu);

// If the tiles are equal to the sizes, split the largest tile
// to prevent the canonicalizer pass from eliminating the loops.
Expand Down Expand Up @@ -356,16 +355,12 @@ struct GpuTilingAndFusion final
return false;
}

auto shape = type.getShape();
if (isOutput) {
if (shape.size() != 2 || shape[0] * shape[1] < 16) {
return false;
}
} else if (shape.size() > 2) {
return false;
if (auto shape = type.getShape(); shape.size() >= 2) {
return !isOutput ||
std::accumulate(shape.begin() + 1, shape.end(), shape[0],
std::multiplies<>()) >= 16;
}

return true;
return false;
};

if (auto inits = op.getDpsInits();
Expand Down
51 changes: 40 additions & 11 deletions lib/gc/Transforms/GPU/GpuUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,9 +139,20 @@ template <typename T> T findFactor(T number, T closeTo) {
return closeTo;
}

namespace impl {
// Selects how adjustTiles() / adjustTwoTiles() treat their input.
// Sort and First differ only when there are more than 2 tiles. XeGpu
// additionally raises the minimum tile size from 1 to 8 in the one-tile and
// two-tile paths.
enum class AdjustTilesMode {
// Default mode of adjustTiles(). With more than 2 tiles, work on a sorted
// copy of the input, proceed as in the First mode, and then write the
// results back to the original positions.
Sort,
// With more than 2 tiles, use the input range as is: adjust the first tile
// against the product of the remaining ones, then call adjustTiles()
// recursively (again in this mode) for the rest.
First,
// With more than 2 tiles, set all tiles except the last 2 to 1 (to allow
// for squeezing) and adjust only the last 2 as a pair.
XeGpu,
};

template <typename T>
static void adjustTwoTiles(T totalSize, T *aPtr, T *bPtr,
T minSize = static_cast<T>(1)) {
AdjustTilesMode mode) {
T a = *aPtr;
T b = *bPtr;
assert(a >= b);
Expand All @@ -150,6 +161,7 @@ static void adjustTwoTiles(T totalSize, T *aPtr, T *bPtr,
return;
}

T minSize = static_cast<T>(mode == AdjustTilesMode::XeGpu ? 8 : 1);
bool aPow2 = isPow2(a);
bool bPow2 = isPow2(b);
double ratio = static_cast<double>(a) / static_cast<double>(b);
Expand Down Expand Up @@ -208,14 +220,14 @@ static void adjustTwoTiles(T totalSize, T *aPtr, T *bPtr,
// and, if possible, is a power of 2.
template <typename T>
static void adjustTiles(T totalSize, T *begin, T *end,
T minSize = static_cast<T>(1), bool isSorted = false) {
assert((minSize & (minSize - 1)) == 0 && "minSize must be a power of 2");
AdjustTilesMode mode = AdjustTilesMode::Sort) {
auto count = end - begin;
if (count == 0) {
return;
}

if (count == 1) {
T minSize = static_cast<T>(mode == AdjustTilesMode::XeGpu ? 8 : 1);
if (T a = *begin; isPow2(a)) {
*begin = std::min(std::max(ceilPow2(a), minSize), floorPow2(totalSize));
} else {
Expand All @@ -225,15 +237,29 @@ static void adjustTiles(T totalSize, T *begin, T *end,
}

if (count > 2) {
if (mode == AdjustTilesMode::XeGpu) {
for (unsigned i = 0; i < count - 2; ++i) {
*(begin + i) = 1;
}
T *aPtr = end - 2;
T *bPtr = end - 1;
if (*aPtr < *bPtr) {
std::swap(aPtr, bPtr);
}
adjustTwoTiles(totalSize, aPtr, bPtr, mode);
return;
}

SmallVector<T> sorted;
SmallVector<unsigned> indices;
T *head;
T *tail;

if (isSorted) {
if (mode == AdjustTilesMode::First) {
head = begin;
tail = end;
} else {
assert(mode == AdjustTilesMode::Sort);
SmallVector<std::pair<T, unsigned>> pairs;
pairs.reserve(count);
for (unsigned i = 0; i < count; ++i) {
Expand All @@ -254,26 +280,29 @@ static void adjustTiles(T totalSize, T *begin, T *end,
// first one and the product of the rest. The second one is the rest.
T first[] = {*head, std::accumulate(head + 2, tail, *(head + 1),
std::multiplies<>())};
adjustTiles(totalSize, first, first + 2, minSize, true);
adjustTiles(totalSize / *first, head + 1, tail, minSize, true);
adjustTiles(totalSize, first, first + 2, AdjustTilesMode::First);
adjustTiles(totalSize / *first, head + 1, tail, AdjustTilesMode::First);
*head = *first;

if (!isSorted) {
if (mode == AdjustTilesMode::Sort) {
for (unsigned i = 0; i < count; ++i) {
*(begin + indices[i]) = sorted[i];
}
}
} else if (*begin >= *(end - 1)) {
adjustTwoTiles(totalSize, begin, end - 1, minSize);
adjustTwoTiles(totalSize, begin, end - 1, mode);
} else {
adjustTwoTiles(totalSize, end - 1, begin, minSize);
adjustTwoTiles(totalSize, end - 1, begin, mode);
}
}
} // namespace impl

template <typename T, unsigned N>
static void adjustTiles(T totalSize, SmallVector<T, N> &tiles,
T minSize = static_cast<T>(1)) {
adjustTiles(totalSize, tiles.begin(), tiles.end(), minSize);
bool xeGpuMode = false) {
impl::adjustTiles(totalSize, tiles.begin(), tiles.end(),
xeGpuMode ? impl::AdjustTilesMode::XeGpu
: impl::AdjustTilesMode::Sort);
}

// Check recursively if the specified operation has an operand that
Expand Down
17 changes: 11 additions & 6 deletions test/mlir/unittests/Transforms/GPU/GpuUtilsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
TEST(testAdjustTiles, GputUtilsTest) {
bool print = false;
auto testAdjust = [print](int64_t totalSize, SmallVector<int64_t> &tiles,
const SmallVector<int64_t> &expected) {
const SmallVector<int64_t> &expected,
bool xeGpu = false) {
if (print) {
std::cout << totalSize << ": [";
for (unsigned i = 0; i < tiles.size(); i++) {
Expand All @@ -24,7 +25,7 @@ TEST(testAdjustTiles, GputUtilsTest) {
std::cout << "] -> [";
}

gc::adjustTiles(totalSize, tiles);
gc::adjustTiles(totalSize, tiles, xeGpu);

if (print) {
for (unsigned i = 0; i < tiles.size(); i++) {
Expand All @@ -36,15 +37,15 @@ TEST(testAdjustTiles, GputUtilsTest) {
EXPECT_EQ(tiles, expected);
};
auto test = [testAdjust](int64_t totalSize, SmallVector<int64_t> tiles,
SmallVector<int64_t> expected) {
SmallVector<int64_t> expected, bool xeGpu = false) {
if (tiles.size() != 2 || tiles[0] == tiles[1]) {
testAdjust(totalSize, tiles, expected);
testAdjust(totalSize, tiles, expected, xeGpu);
return;
}
SmallVector<int64_t> reversed(tiles.rbegin(), tiles.rend());
testAdjust(totalSize, tiles, expected);
testAdjust(totalSize, tiles, expected, xeGpu);
std::reverse(expected.begin(), expected.end());
testAdjust(totalSize, reversed, expected);
testAdjust(totalSize, reversed, expected, xeGpu);
};

test(8, {1, 1}, {1, 1});
Expand Down Expand Up @@ -91,4 +92,8 @@ TEST(testAdjustTiles, GputUtilsTest) {
test(16384, {60, 128, 512}, {4, 32, 128});
test(16384, {119, 256, 512}, {7, 32, 64});
test(16384, {109, 256, 512}, {109, 8, 16});

test(16384, {8, 16, 32, 256, 512}, {1, 1, 1, 128, 128}, true);
test(16384, {8, 16, 32, 1024, 256}, {1, 1, 1, 256, 64}, true);
test(16384, {8, 16, 32, 16, 4096}, {1, 1, 1, 8, 2048}, true);
}