Skip to content

Commit 00cc5d0

Browse files
[PIPELINER] Cleanup of LoopScheduling.cpp, introduction of AssignLatencies (#5176)
This change breaks down LoopScheduling into two sub-passes: latency assignment and actual scheduling. Latency assignment is a transformation that analyzes the loop and, based on the requested number of stages, assigns "latencies" to the ops that are going to be converted to async ops by the pipeliner. Latencies are expressed in terms of the number of iterations of the loop and can be thought of as per-operation num_stages. The scheduling transformation takes these latencies and builds a pipeliner schedule based on them. The process of building a schedule was slightly rewritten to simplify the code and clean up logic that was no longer needed after recent refactoring. Breaking down scheduling into latency assignment and proper scheduling serves a number of purposes: 1. The code became more modular, with cleaner interfaces that help with maintenance. 2. Both parts can be tested in isolation; I have added lit tests for both pieces. We can finally test our pipeliner infrastructure in manageable chunks. 3. It opens up the opportunity to expose per-op "latencies" to the frontend, enabling the creation of user-defined schedules right from the language level. The next step in the cleanup process is to clearly separate the lowering and pipelining phases.
1 parent 712ac66 commit 00cc5d0

File tree

16 files changed

+1327
-391
lines changed

16 files changed

+1327
-391
lines changed

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,38 @@ def TritonGPUPipeline : Pass<"tritongpu-pipeline", "mlir::ModuleOp"> {
2323
];
2424
}
2525

26+
def TritonGPUTestPipelineAssignLatencies : Pass<"tritongpu-test-pipeline-assign-latencies", "mlir::ModuleOp"> {
27+
let summary = "test assigning latencies to interesting ops ahead of pipelining";
28+
29+
let description = [{
30+
This is a test pass that tests `assignLatencies` method of `TritonGPULoopScheduling`.
31+
}];
32+
33+
let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
34+
"mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
35+
"mlir::scf::SCFDialect",
36+
"mlir::arith::ArithDialect"];
37+
38+
let options = [
39+
Option<"numStages", "num-stages",
40+
"int32_t", /*default*/"3",
41+
"number of pipeline stages">
42+
];
43+
}
44+
45+
def TritonGPUTestPipelineScheduleLoop : Pass<"tritongpu-test-pipeline-schedule-loop", "mlir::ModuleOp"> {
46+
let summary = "test scheduling a loop for software pipelining";
47+
48+
let description = [{
49+
This is a test pass that tests `scheduleLoop` method of `TritonGPULoopScheduling`.
50+
}];
51+
52+
let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
53+
"mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
54+
"mlir::scf::SCFDialect",
55+
"mlir::arith::ArithDialect"];
56+
}
57+
2658
def TritonGPUF32DotTC : Pass<"tritongpu-F32DotTC", "mlir::ModuleOp"> {
2759
let summary = "3xTF32 trick";
2860

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ static const char *kNumStagesAttrName = "tt.num_stages";
1111
static const char *kLoopStageAttrName = "loop.stage";
1212
static const char *kLoopClusterAttrName = "loop.cluster";
1313

14+
bool loopHasDistGreaterThanOne(scf::ForOp forOp);
15+
bool isOuterLoop(scf::ForOp forOp);
16+
1417
/// Function to mask operations during scheduling.
1518
Operation *predicateOp(RewriterBase &rewriter, Operation *op, Value pred);
1619

include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,18 @@
1111
namespace mlir {
1212
namespace triton {
1313

14+
namespace gpu {
15+
16+
/// Discover operations that should become async and assign latencies to them
17+
/// based on the numStages value provided by the user.
18+
DenseMap<Operation *, int> assignLatencies(ModuleOp forOp, int numStages);
19+
20+
/// Schedule the loop based on the latencies assigned to the operations.
21+
void scheduleLoop(scf::ForOp forOp,
22+
const DenseMap<Operation *, int> &opLatency);
23+
24+
}; // namespace gpu
25+
1426
/// This fill out the pipelining options including schedule and annotations
1527
/// for wait ops. This also does pre-processing by converting some of the
1628
/// loads into async loads so that the IR is ready to be pipelined.
@@ -108,8 +120,7 @@ class CoarseSchedule {
108120

109121
// Add dependencies of anchor ops to the coarse schedule. Schedule them to
110122
// the same stage and ordering cluster as the anchor op.
111-
void scheduleDependencies(scf::ForOp forOp, CoarseSchedule &schedule,
112-
int numStages);
123+
void scheduleDependencies(scf::ForOp forOp, CoarseSchedule &schedule);
113124

114125
} // namespace triton
115126
} // namespace mlir

lib/Dialect/TritonGPU/Transforms/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,12 @@ add_triton_library(TritonGPUTransforms
88
OptimizeAccumulatorInit.cpp
99
OptimizeDotOperands.cpp
1010
OptimizeThreadLocality.cpp
11+
Pipeliner/AssignLatencies.cpp
1112
Pipeliner/MatmulLoopPipeline.cpp
1213
Pipeliner/OuterLoopPipeline.cpp
1314
Pipeliner/PipelineExpander.cpp
15+
Pipeliner/TestPipelineAssignLatencies.cpp
16+
Pipeliner/TestPipelineScheduleLoop.cpp
1417
Pipeliner/SoftwarePipeliner.cpp
1518
Pipeliner/TMAStoresPipeline.cpp
1619
Pipeliner/PipeliningUtility.cpp

0 commit comments

Comments
 (0)