diff --git a/lib/gc/Transforms/GPU/GpuTilingAndFusion.cpp b/lib/gc/Transforms/GPU/GpuTilingAndFusion.cpp index 45299fa4..b41baf59 100644 --- a/lib/gc/Transforms/GPU/GpuTilingAndFusion.cpp +++ b/lib/gc/Transforms/GPU/GpuTilingAndFusion.cpp @@ -101,17 +101,16 @@ struct GpuTilingAndFusion final auto sizePerThread = numIterations / numThreads * elementSize; auto totalSize = std::max(sizePerThread, cachePerThread); totalSize = std::max(totalSize / elementSize, 64L); - int64_t minTileSize = 1; + bool xeGpu = canLowerToXeGPU(op); // If the operation could be lowered to XeGPU, make the tiles - // multiple of the vector width and the minimum tile size 8. - if (canLowerToXeGPU(op)) { - minTileSize = 8; + // multiple of the vector width. + if (xeGpu) { totalSize = std::max(totalSize / vectorWidth, 1L) * vectorWidth; } SmallVector tiles = sizes; - adjustTiles(totalSize, tiles, minTileSize); + adjustTiles(totalSize, tiles, xeGpu); // If the tiles are equal to the sizes, split the largest tile // to avoid loops elimination by the canonicalizer pass. @@ -356,16 +355,12 @@ struct GpuTilingAndFusion final return false; } - auto shape = type.getShape(); - if (isOutput) { - if (shape.size() != 2 || shape[0] * shape[1] < 16) { - return false; - } - } else if (shape.size() > 2) { - return false; + if (auto shape = type.getShape(); shape.size() >= 2) { + return !isOutput || + std::accumulate(shape.begin() + 1, shape.end(), shape[0], + std::multiplies<>()) >= 16; } - - return true; + return false; }; if (auto inits = op.getDpsInits(); diff --git a/lib/gc/Transforms/GPU/GpuUtils.h b/lib/gc/Transforms/GPU/GpuUtils.h index 2bc07997..a9e1aaa4 100644 --- a/lib/gc/Transforms/GPU/GpuUtils.h +++ b/lib/gc/Transforms/GPU/GpuUtils.h @@ -139,9 +139,20 @@ template T findFactor(T number, T closeTo) { return closeTo; } +namespace impl { +// Controls the adjustment in case of more than 2 tiles. +enum class AdjustTilesMode { + // Sort the input and switch to the First mode. + Sort, + // Adjust the first tile and call adjustTiles() recursively for the rest. + First, + // To allow for squeezing, set 1's for all tiles except the last 2. + XeGpu, +}; + template static void adjustTwoTiles(T totalSize, T *aPtr, T *bPtr, - T minSize = static_cast(1)) { + AdjustTilesMode mode) { T a = *aPtr; T b = *bPtr; assert(a >= b); @@ -150,6 +161,7 @@ static void adjustTwoTiles(T totalSize, T *aPtr, T *bPtr, return; } + T minSize = static_cast(mode == AdjustTilesMode::XeGpu ? 8 : 1); bool aPow2 = isPow2(a); bool bPow2 = isPow2(b); double ratio = static_cast(a) / static_cast(b); @@ -208,14 +220,14 @@ static void adjustTwoTiles(T totalSize, T *aPtr, T *bPtr, // and, if possible, is a power of 2. template static void adjustTiles(T totalSize, T *begin, T *end, - T minSize = static_cast(1), bool isSorted = false) { - assert((minSize & (minSize - 1)) == 0 && "minSize must be a power of 2"); + AdjustTilesMode mode = AdjustTilesMode::Sort) { auto count = end - begin; if (count == 0) { return; } if (count == 1) { + T minSize = static_cast(mode == AdjustTilesMode::XeGpu ? 8 : 1); if (T a = *begin; isPow2(a)) { *begin = std::min(std::max(ceilPow2(a), minSize), floorPow2(totalSize)); } else { @@ -225,15 +237,29 @@ static void adjustTiles(T totalSize, T *begin, T *end, } if (count > 2) { + if (mode == AdjustTilesMode::XeGpu) { + for (unsigned i = 0; i < count - 2; ++i) { + *(begin + i) = 1; + } + T *aPtr = end - 2; + T *bPtr = end - 1; + if (*aPtr < *bPtr) { + std::swap(aPtr, bPtr); + } + adjustTwoTiles(totalSize, aPtr, bPtr, mode); + return; + } + SmallVector sorted; SmallVector indices; T *head; T *tail; - if (isSorted) { + if (mode == AdjustTilesMode::First) { head = begin; tail = end; } else { + assert(mode == AdjustTilesMode::Sort); SmallVector> pairs; pairs.reserve(count); for (unsigned i = 0; i < count; ++i) { @@ -254,26 +280,29 @@ static void adjustTiles(T totalSize, T *begin, T *end, // first one and the product of the rest. The second one is the rest. T first[] = {*head, std::accumulate(head + 2, tail, *(head + 1), std::multiplies<>())}; - adjustTiles(totalSize, first, first + 2, minSize, true); - adjustTiles(totalSize / *first, head + 1, tail, minSize, true); + adjustTiles(totalSize, first, first + 2, AdjustTilesMode::First); + adjustTiles(totalSize / *first, head + 1, tail, AdjustTilesMode::First); *head = *first; - if (!isSorted) { + if (mode == AdjustTilesMode::Sort) { for (unsigned i = 0; i < count; ++i) { *(begin + indices[i]) = sorted[i]; } } } else if (*begin >= *(end - 1)) { - adjustTwoTiles(totalSize, begin, end - 1, minSize); + adjustTwoTiles(totalSize, begin, end - 1, mode); } else { - adjustTwoTiles(totalSize, end - 1, begin, minSize); + adjustTwoTiles(totalSize, end - 1, begin, mode); } } +} // namespace impl template static void adjustTiles(T totalSize, SmallVector &tiles, - T minSize = static_cast(1)) { - adjustTiles(totalSize, tiles.begin(), tiles.end(), minSize); + bool xeGpuMode = false) { + impl::adjustTiles(totalSize, tiles.begin(), tiles.end(), + xeGpuMode ? impl::AdjustTilesMode::XeGpu + : impl::AdjustTilesMode::Sort); } // Check recursively if the specified operation has an operand that diff --git a/test/mlir/unittests/Transforms/GPU/GpuUtilsTest.cpp b/test/mlir/unittests/Transforms/GPU/GpuUtilsTest.cpp index 0f886869..09b5e8aa 100644 --- a/test/mlir/unittests/Transforms/GPU/GpuUtilsTest.cpp +++ b/test/mlir/unittests/Transforms/GPU/GpuUtilsTest.cpp @@ -15,7 +15,8 @@ TEST(testAdjustTiles, GputUtilsTest) { bool print = false; auto testAdjust = [print](int64_t totalSize, SmallVector &tiles, - const SmallVector &expected) { + const SmallVector &expected, + bool xeGpu = false) { if (print) { std::cout << totalSize << ": ["; for (unsigned i = 0; i < tiles.size(); i++) { @@ -24,7 +25,7 @@ TEST(testAdjustTiles, GputUtilsTest) { std::cout << "] -> ["; } - gc::adjustTiles(totalSize, tiles); + gc::adjustTiles(totalSize, tiles, xeGpu); if (print) { for (unsigned i = 0; i < tiles.size(); i++) { @@ -36,15 +37,15 @@ TEST(testAdjustTiles, GputUtilsTest) { EXPECT_EQ(tiles, expected); }; auto test = [testAdjust](int64_t totalSize, SmallVector tiles, - SmallVector expected) { + SmallVector expected, bool xeGpu = false) { if (tiles.size() != 2 || tiles[0] == tiles[1]) { - testAdjust(totalSize, tiles, expected); + testAdjust(totalSize, tiles, expected, xeGpu); return; } SmallVector reversed(tiles.rbegin(), tiles.rend()); - testAdjust(totalSize, tiles, expected); + testAdjust(totalSize, tiles, expected, xeGpu); std::reverse(expected.begin(), expected.end()); - testAdjust(totalSize, reversed, expected); + testAdjust(totalSize, reversed, expected, xeGpu); }; test(8, {1, 1}, {1, 1}); @@ -91,4 +92,8 @@ TEST(testAdjustTiles, GputUtilsTest) { test(16384, {60, 128, 512}, {4, 32, 128}); test(16384, {119, 256, 512}, {7, 32, 64}); test(16384, {109, 256, 512}, {109, 8, 16}); + + test(16384, {8, 16, 32, 256, 512}, {1, 1, 1, 128, 128}, true); + test(16384, {8, 16, 32, 1024, 256}, {1, 1, 1, 256, 64}, true); + test(16384, {8, 16, 32, 16, 4096}, {1, 1, 1, 8, 2048}, true); }