From 8f28da901bb95a79f6267cb4c4116b162a759537 Mon Sep 17 00:00:00 2001 From: Hongtao Yu Date: Thu, 31 Oct 2024 08:39:59 -0700 Subject: [PATCH 1/3] [BACKEND] Do not pipeline epilog loops generated by loop unrolling --- lib/Dialect/Triton/Transforms/LoopUnroll.cpp | 63 ++++++++++++++++++-- test/Triton/loop-unroll.mlir | 1 + 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/lib/Dialect/Triton/Transforms/LoopUnroll.cpp b/lib/Dialect/Triton/Transforms/LoopUnroll.cpp index 257e734b7f88..b145a20307fc 100644 --- a/lib/Dialect/Triton/Transforms/LoopUnroll.cpp +++ b/lib/Dialect/Triton/Transforms/LoopUnroll.cpp @@ -1,6 +1,7 @@ #include #include "mlir/Dialect/SCF/Utils/Utils.h" +#include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" @@ -22,8 +23,6 @@ namespace mlir::triton { -static const char *loopUnrollFactorAttrName = "tt.loop_unroll_factor"; - namespace { class LoopUnrollPass : public TritonLoopUnrollBase { @@ -31,15 +30,56 @@ class LoopUnrollPass : public TritonLoopUnrollBase { int getUnrollFactorOrDefault(scf::ForOp forOp) { // Use the attribute attached to the loop if it exists otherwise set the // factor to 1 to suppress the unrolling. - if (auto factor = forOp->getAttrOfType( - mlir::triton::loopUnrollFactorAttrName)) + if (auto factor = + forOp->getAttrOfType(loopUnrollFactorAttrName)) return factor.getInt(); return 1; } + int getUnrollIdOrDefault(scf::ForOp forOp) { + // Use the attribute attached to the loop if it exists otherwise set the + // factor to 1 to suppress the unrolling. + if (auto factor = forOp->getAttrOfType(unrolledLoopIdAttrName)) + return factor.getInt(); + return 0; + } + + const char *loopUnrollFactorAttrName = "tt.loop_unroll_factor"; + const char *unrolledLoopIdAttrName = "tt.unrolled_loop_id"; + const char *pipelineStagesAttrName = "tt.num_stages"; + public: LoopUnrollPass() = default; LoopUnrollPass(const LoopUnrollPass &) {} + + SmallVector getUnrolledLoopsAndClearAttrs(unsigned loopId) { + SmallVector loops; + getOperation()->walk([&](scf::ForOp forOp) { + if (getUnrollIdOrDefault(forOp) == loopId) + loops.push_back(forOp); + }); + + // check which one is the unrolled loop and which one is the prolog/epilog + // loop. A simple heuristic is to check the number of instructions in the + // loop. The unrolled main loop should have the most instructions. + assert(loops.size() == 2 && "only support unrolling one loop at a time"); + SmallVector loopInstructionCounts; + for (auto loop : loops) { + loop->removeAttr(loopUnrollFactorAttrName); + loop->removeAttr(unrolledLoopIdAttrName); + int count = 0; + loop->walk([&](Operation *op) { count++; }); + loopInstructionCounts.push_back(count); + } + + // sort the loops by the number of instructions. The unrolled main loop + // should go first. + if (loopInstructionCounts[0] < loopInstructionCounts[1]) + std::swap(loops[0], loops[1]); + + return loops; + } + void runOnOperation() override { LDBG("Loop unroll pass"); SmallVector loops; @@ -49,11 +89,22 @@ class LoopUnrollPass : public TritonLoopUnrollBase { loops.push_back(forOp); }); - for (auto loop : loops) { + auto ctx = getOperation()->getContext(); + for (unsigned i = 0; i < loops.size(); i++) { + auto loop = loops[i]; auto unrollFactor = getUnrollFactorOrDefault(loop); - loop->removeAttr(mlir::triton::loopUnrollFactorAttrName); + loop->setAttr(unrolledLoopIdAttrName, + mlir::IntegerAttr::get(IntegerType::get(ctx, 32), i + 1)); LDBG("Unrolling loop by " << unrollFactor << " times\n" << loop); (void)loopUnrollByFactor(loop, unrollFactor); + auto unrolledLoops = getUnrolledLoopsAndClearAttrs(i + 1); + // Do not pipeline the prolog/epilog loop. + if (unrolledLoops.size() == 2) { + auto prologEpilogLoop = unrolledLoops[1]; + prologEpilogLoop->setAttr( + pipelineStagesAttrName, + mlir::IntegerAttr::get(IntegerType::get(ctx, 32), 1)); + } } } }; diff --git a/test/Triton/loop-unroll.mlir b/test/Triton/loop-unroll.mlir index 9166630281e6..531a14fffad3 100644 --- a/test/Triton/loop-unroll.mlir +++ b/test/Triton/loop-unroll.mlir @@ -13,6 +13,7 @@ tt.func @add_kernel_unroll(%arg0: tensor<256x!tt.ptr>, %arg1: i32) { // CHECK: scf.for // CHECK: tt.load // CHECK-NOT: tt.load + // CHECK: tt.num_stages = 1 : i32 %2:2 = scf.for %arg3 = %c1_i32 to %arg1 step %c1_i32 iter_args(%arg4 = %1, %arg5 = %arg0) -> (tensor<256xf32>, tensor<256x!tt.ptr>) : i32 { %3 = tt.load %arg5 : tensor<256x!tt.ptr> %4 = arith.addf %arg4, %3 : tensor<256xf32> From 3eb537b6e96a182a2e7d37553c27af902b18305b Mon Sep 17 00:00:00 2001 From: Hongtao Yu Date: Thu, 31 Oct 2024 10:54:21 -0700 Subject: [PATCH 2/3] Fix for constant-bound loop where no epilog will be generated. --- lib/Dialect/Triton/Transforms/LoopUnroll.cpp | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/lib/Dialect/Triton/Transforms/LoopUnroll.cpp b/lib/Dialect/Triton/Transforms/LoopUnroll.cpp index b145a20307fc..3d558568209b 100644 --- a/lib/Dialect/Triton/Transforms/LoopUnroll.cpp +++ b/lib/Dialect/Triton/Transforms/LoopUnroll.cpp @@ -59,10 +59,8 @@ class LoopUnrollPass : public TritonLoopUnrollBase { loops.push_back(forOp); }); - // check which one is the unrolled loop and which one is the prolog/epilog - // loop. A simple heuristic is to check the number of instructions in the - // loop. The unrolled main loop should have the most instructions. - assert(loops.size() == 2 && "only support unrolling one loop at a time"); + assert(loops.size() <= 2 && "Expect at most 2 loops, one for the main loop " + "and one for the prolog/epilog"); SmallVector loopInstructionCounts; for (auto loop : loops) { loop->removeAttr(loopUnrollFactorAttrName); @@ -71,11 +69,15 @@ class LoopUnrollPass : public TritonLoopUnrollBase { loop->walk([&](Operation *op) { count++; }); loopInstructionCounts.push_back(count); } - - // sort the loops by the number of instructions. The unrolled main loop - // should go first. - if (loopInstructionCounts[0] < loopInstructionCounts[1]) - std::swap(loops[0], loops[1]); + if (loops.size() == 2) { + // check which one is the unrolled loop and which one is the prolog/epilog + // loop. A simple heuristic is to check the number of instructions in the + // loop. The unrolled main loop should have the most instructions. + // sort the loops by the number of instructions. The unrolled main loop + // should go first. + if (loopInstructionCounts[0] < loopInstructionCounts[1]) + std::swap(loops[0], loops[1]); + } return loops; } From 6e16e018ca4b261ad54f56678f75a168f34fec5b Mon Sep 17 00:00:00 2001 From: Hongtao Yu Date: Mon, 4 Nov 2024 16:14:01 -0800 Subject: [PATCH 3/3] Simplified based on MLIR-side changes. --- lib/Dialect/Triton/Transforms/LoopUnroll.cpp | 61 +++----------------- 1 file changed, 8 insertions(+), 53 deletions(-) diff --git a/lib/Dialect/Triton/Transforms/LoopUnroll.cpp b/lib/Dialect/Triton/Transforms/LoopUnroll.cpp index 3d558568209b..cb25d41a2548 100644 --- a/lib/Dialect/Triton/Transforms/LoopUnroll.cpp +++ b/lib/Dialect/Triton/Transforms/LoopUnroll.cpp @@ -1,7 +1,6 @@ #include #include "mlir/Dialect/SCF/Utils/Utils.h" -#include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" @@ -36,52 +35,12 @@ class LoopUnrollPass : public TritonLoopUnrollBase { return 1; } - int getUnrollIdOrDefault(scf::ForOp forOp) { - // Use the attribute attached to the loop if it exists otherwise set the - // factor to 1 to suppress the unrolling. - if (auto factor = forOp->getAttrOfType(unrolledLoopIdAttrName)) - return factor.getInt(); - return 0; - } - const char *loopUnrollFactorAttrName = "tt.loop_unroll_factor"; - const char *unrolledLoopIdAttrName = "tt.unrolled_loop_id"; const char *pipelineStagesAttrName = "tt.num_stages"; public: LoopUnrollPass() = default; LoopUnrollPass(const LoopUnrollPass &) {} - - SmallVector getUnrolledLoopsAndClearAttrs(unsigned loopId) { - SmallVector loops; - getOperation()->walk([&](scf::ForOp forOp) { - if (getUnrollIdOrDefault(forOp) == loopId) - loops.push_back(forOp); - }); - - assert(loops.size() <= 2 && "Expect at most 2 loops, one for the main loop " - "and one for the prolog/epilog"); - SmallVector loopInstructionCounts; - for (auto loop : loops) { - loop->removeAttr(loopUnrollFactorAttrName); - loop->removeAttr(unrolledLoopIdAttrName); - int count = 0; - loop->walk([&](Operation *op) { count++; }); - loopInstructionCounts.push_back(count); - } - if (loops.size() == 2) { - // check which one is the unrolled loop and which one is the prolog/epilog - // loop. A simple heuristic is to check the number of instructions in the - // loop. The unrolled main loop should have the most instructions. - // sort the loops by the number of instructions. The unrolled main loop - // should go first. - if (loopInstructionCounts[0] < loopInstructionCounts[1]) - std::swap(loops[0], loops[1]); - } - - return loops; - } - void runOnOperation() override { LDBG("Loop unroll pass"); SmallVector loops; @@ -92,20 +51,16 @@ class LoopUnrollPass : public TritonLoopUnrollBase { }); auto ctx = getOperation()->getContext(); - for (unsigned i = 0; i < loops.size(); i++) { - auto loop = loops[i]; + for (auto loop : loops) { auto unrollFactor = getUnrollFactorOrDefault(loop); - loop->setAttr(unrolledLoopIdAttrName, - mlir::IntegerAttr::get(IntegerType::get(ctx, 32), i + 1)); + loop->removeAttr(loopUnrollFactorAttrName); LDBG("Unrolling loop by " << unrollFactor << " times\n" << loop); - (void)loopUnrollByFactor(loop, unrollFactor); - auto unrolledLoops = getUnrolledLoopsAndClearAttrs(i + 1); - // Do not pipeline the prolog/epilog loop. - if (unrolledLoops.size() == 2) { - auto prologEpilogLoop = unrolledLoops[1]; - prologEpilogLoop->setAttr( - pipelineStagesAttrName, - mlir::IntegerAttr::get(IntegerType::get(ctx, 32), 1)); + auto resultLoops = loopUnrollByFactor(loop, unrollFactor); + // Do not pipeline the epilog loop. + if (succeeded(resultLoops) && resultLoops->epilogueLoopOp) { + (*resultLoops->epilogueLoopOp) + ->setAttr(pipelineStagesAttrName, + mlir::IntegerAttr::get(IntegerType::get(ctx, 32), 1)); } } }