
Commit 852c05f

[PIPELINER] Refactor pipeliner lowering. (#5989)
This change reworks the pipeliner flow in Triton. It systematizes the pipeliner transformations by making all of them part of the same SoftwarePipeliner pass, while keeping them modular and defining clear IR interfaces between them. It also introduces a new LowerLoop transformation that aims to be a more generic lowering of async operations, written with a minimal set of assumptions about the shape of the IR produced by the pipeline-scheduling sub-pass.
1 parent ef38bec commit 852c05f

38 files changed: +2865 / -2347 lines
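
For orientation, the three sub-stage entry points referenced throughout this diff (assignLatencies, scheduleLoops, lowerLoops, declared in Schedule.h below) are now chained inside one pass. A minimal sketch of that flow, assuming the driver simply runs the stages in order; the driver function itself is illustrative, not from the commit:

// Hedged sketch of the unified pipeliner flow. Only the three callees and
// their new ModuleOp-based signatures come from this commit (see Schedule.h
// below); the driver function and the namespace opening are assumptions.
void runPipelinerStages(mlir::ModuleOp moduleOp, int numStages) {
  using namespace mlir::triton::gpu; // namespace assumed from Schedule.h
  // 1. Decide which ops should become async and tag them with latencies.
  assignLatencies(moduleOp, numStages);
  // 2. Turn the latencies into a coarse schedule (stages and clusters).
  scheduleLoops(moduleOp);
  // 3. Lower async ops according to the schedule, ahead of expansion.
  lowerLoops(moduleOp);
  // 4. The pipeline expander then performs the actual loop rewriting.
}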

Makefile

Lines changed: 2 additions & 2 deletions
@@ -89,9 +89,9 @@ dev-install: dev-install-requires dev-install-triton
 
 .PHONY: golden-samples
 golden-samples: triton-opt
-	$(TRITON_OPT) test/TritonGPU/samples/simulated-grouped-gemm.mlir.in -tritongpu-loop-scheduling -tritongpu-pipeline -canonicalize | \
+	$(TRITON_OPT) test/TritonGPU/samples/simulated-grouped-gemm.mlir.in -tritongpu-pipeline -canonicalize | \
 	$(PYTHON) utils/generate-test-checks.py --source test/TritonGPU/samples/simulated-grouped-gemm.mlir.in --source_delim_regex="\bmodule" \
 	-o test/TritonGPU/samples/simulated-grouped-gemm.mlir
-	$(TRITON_OPT) test/TritonGPU/samples/descriptor-matmul-pipeline.mlir.in -tritongpu-loop-scheduling -tritongpu-pipeline -canonicalize | \
+	$(TRITON_OPT) test/TritonGPU/samples/descriptor-matmul-pipeline.mlir.in -tritongpu-pipeline -canonicalize | \
 	$(PYTHON) utils/generate-test-checks.py --source test/TritonGPU/samples/descriptor-matmul-pipeline.mlir.in --source_delim_regex="\bmodule" \
 	-o test/TritonGPU/samples/descriptor-matmul-pipeline.mlir

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 19 additions & 17 deletions
@@ -19,7 +19,10 @@ def TritonGPUPipeline : Pass<"tritongpu-pipeline", "mlir::ModuleOp"> {
   let options = [
     Option<"numStages", "num-stages",
            "int32_t", /*default*/"3",
-           "number of pipeline stages">
+           "number of pipeline stages">,
+    Option<"dumpIntermediateSteps", "dump-intermediate-steps",
+           "bool", /*default*/"false",
+           "Dump intermediate steps">
   ];
 }
 
@@ -45,7 +48,7 @@ def TritonGPUTestPipelineAssignLatencies : Pass<"tritongpu-test-pipeline-assign-
   let summary = "test assigning latencies to interesting ops ahead of pipelining";
 
   let description = [{
-    This is a test pass that tests `assignLatencies` method of `TritonGPULoopScheduling`.
+    This is a test pass that tests `assignLatencies` method of `TritonGPUPipeline`.
   }];
 
   let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
@@ -64,7 +67,20 @@ def TritonGPUTestPipelineScheduleLoop : Pass<"tritongpu-test-pipeline-schedule-l
   let summary = "test scheduling a loop for software pipelining";
 
   let description = [{
-    This is a test pass that tests `scheduleLoop` method of `TritonGPULoopScheduling`.
+    This is a test pass that tests `scheduleLoop` method of `TritonGPUPipeline`.
+  }];
+
+  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
+                           "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
+                           "mlir::scf::SCFDialect",
+                           "mlir::arith::ArithDialect"];
+}
+
+def TritonGPUTestPipelineLowerLoop : Pass<"tritongpu-test-pipeline-lower-loop", "mlir::ModuleOp"> {
+  let summary = "test lowering a loop for software pipelining";
+
+  let description = [{
+    This is a test pass that tests `lowerLoop` method of `TritonGPUPipeline`.
   }];
 
   let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
@@ -254,20 +270,6 @@ def TritonGPUOptimizeAccumulatorInit: Pass<"tritongpu-optimize-accumulator-init"
                            "mlir::triton::TritonDialect"];
 }
 
-def TritonGPULoopScheduling: Pass<"tritongpu-loop-scheduling", "mlir::ModuleOp"> {
-  let summary = "Generate loop scheduling for SWP";
-
-  let description = "This pass sets up stages and clustering for software pipelining.";
-
-  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
-                           "mlir::triton::TritonDialect"];
-  let options = [
-    Option<"numStages", "num-stages",
-           "int32_t", /*default*/"3",
-           "number of pipeline stages">
-  ];
-}
-
 def TritonGPUCoalesceAsyncCopy: Pass<"tritongpu-coalesce-async-copy", "mlir::ModuleOp"> {
   let summary = "Improve coalescing for async global to local copies";
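
The new test pass follows the same pattern as the existing ones: a thin wrapper that runs a single sub-stage so lit tests can exercise it in isolation. A hedged sketch of that shape; the TableGen-generated base class name and the registration boilerplate are assumptions, not shown in this commit:

// Illustrative only: the wrapper presumably just forwards to the lowerLoops
// sub-stage. The impl::...Base class name follows the usual MLIR pass
// tablegen convention but is an assumption here.
struct TestPipelineLowerLoopPass
    : public impl::TritonGPUTestPipelineLowerLoopBase<
          TestPipelineLowerLoopPass> {
  void runOnOperation() override {
    // Run only the lowering stage on IR whose latency/schedule attributes
    // were written by hand in the test input.
    mlir::triton::gpu::lowerLoops(getOperation());
  }
};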

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 22 additions & 17 deletions
@@ -2,6 +2,7 @@
 #define TRITON_TRITONGPU_TRANSFORMS_PIPELINER_PIPELINING_UTILITY_H_
 
 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include <optional>
 #include <utility>
 #include <vector>
@@ -14,25 +15,14 @@ static const char *kDisallowAccMultiBufferAttrName =
     "tt.disallow_acc_multi_buffer";
 static const char *kLoopStageAttrName = "loop.stage";
 static const char *kLoopClusterAttrName = "loop.cluster";
+static const char *kLatencyAttrName = "tt.latency";
 
 bool loopHasDistGreaterThanOne(scf::ForOp forOp);
 bool isOuterLoop(scf::ForOp forOp);
 
 /// Function to mask operations during scheduling.
 Operation *predicateOp(RewriterBase &rewriter, Operation *op, Value pred);
 
-/// Collect ssa dependencies of `op` in `deps`. if `includeArg` is true,
-/// continue looking through loop block arguments.
-void addDep(Operation *op, DenseSet<Operation *> &deps, bool includeArg = true,
-            DenseSet<Operation *> *filter = nullptr);
-
-/// Add operations from `forOp` into a pipeline schedule with the the given
-/// `stage` when filter is true. This will add operation in the original loop
-/// order.
-void addOps(scf::ForOp forOp, int stage,
-            std::vector<std::pair<Operation *, unsigned>> &schedule,
-            std::function<bool(Operation *)> filter);
-
 /// Replace all uses of `oldUse` with `val` and propagate the type if needed.
 /// This is useful when we need to change a memory descriptor from immutable to
 /// mutable.
@@ -50,11 +40,26 @@ void visitNestedOperands(Operation *op, function_ref<void(Value)> visitor);
 /// of `op`.
 SetVector<Value> getNestedOperands(Operation *op);
 
-// Return the minClusterId and maxClusterId for the given ForOp.
-std::pair<int, int> getMinMaxCluster(scf::ForOp &forOp);
-std::pair<int, int> getStageCluster(Operation *op);
-std::optional<std::pair<int, int>> maybeGetStageCluster(Operation *op);
-void setStageCluster(Operation *op, int stage, int cluster);
+// Return maximum length of the vectorized copy between registers and shared
+// memory for the given tensor type and shared encoding.
+int getCopyVecBytes(RankedTensorType registerTy,
+                    gpu::SharedEncodingTrait sharedEnc);
+
+// Serialize the latencies of the operations in the loops into the latency
+// attribute.
+void serializeLatencies(ModuleOp module, DenseMap<Operation *, int> &opLatency);
+
+// Deserialize the latencies of the operations in the loops from the attribute.
+DenseMap<Operation *, int> deserializeLatencies(ModuleOp module);
+
+// Given a result of MemDescSubview, or Alloca, create a MemDescSubview with a
+// single buffer slice (leading dimension equal to 1), at the given index.
+Value createSingleBufferView(OpBuilder &builder, Value alloc, Value idx);
+Value createSingleBufferView(OpBuilder &builder, Value alloc, int idx);
+
+// Create an allocation and init the mbarriers.
+Value createBarrierAlloc(scf::ForOp forOp, int numBarriers);
+
 } // namespace triton
 } // namespace mlir
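
The serialize/deserialize pair is what lets assignLatencies communicate with the later sub-stages through the IR rather than through an in-memory map. A sketch of one plausible implementation of the serializing half, assuming the latency is stored as an i32 attribute under kLatencyAttrName; only the signature and the attribute name come from this header:

// Sketch, not the committed implementation: attach each op's latency as an
// i32 attribute named "tt.latency" so later stages can rebuild the map.
void serializeLatencies(mlir::ModuleOp module,
                        llvm::DenseMap<mlir::Operation *, int> &opLatency) {
  mlir::Builder b(module.getContext());
  for (auto [op, latency] : opLatency)
    op->setAttr(kLatencyAttrName, b.getI32IntegerAttr(latency));
}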

include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 33 additions & 11 deletions
@@ -15,11 +15,13 @@ namespace gpu {
 
 /// Discover operations that should become async and assign latencies to them
 /// based on the numStages value provided by the user.
-DenseMap<Operation *, int> assignLatencies(ModuleOp forOp, int numStages);
+void assignLatencies(ModuleOp moduleOp, int numStages);
 
-/// Schedule the loop based on the latencies assigned to the operations.
-void scheduleLoop(scf::ForOp forOp,
-                  const DenseMap<Operation *, int> &opLatency);
+/// Schedule the loops based on the latencies assigned to the operations.
+void scheduleLoops(ModuleOp moduleOp);
+
+/// Lower the loops to prepare them for pipeline expansion.
+void lowerLoops(ModuleOp moduleOp);
 
 }; // namespace gpu
 
@@ -34,11 +36,10 @@ bool preProcessLoopAndGetSchedule(scf::ForOp &forOp, int numStages,
 bool getOuterLoopSchedule(scf::ForOp &forOp, int numStages,
                           mlir::triton::PipeliningOption &options);
 
-/// Pipeline the Tensor Core Gen 05 MMA ops in `forOps` with `numStages` stages.
-/// This will pre-process the loops, lowering the ops related to TG Gen5 MMA,
-/// and then pipeline the loops using expander.
-void pipelineTC05MMALoops(ModuleOp module,
-                          const SmallVector<scf::ForOp> &forOps, int numStages,
+/// Pipeline the Tensor Core Gen 05 MMA ops in the module with `numStages`
+/// stages. This will pre-process the loops, lowering the ops related to TG Gen5
+/// MMA, and then pipeline the loops using expander.
+void pipelineTC05MMALoops(ModuleOp module, int numStages,
                           bool disableExpander = false);
 
 /// Pipeline the TMA stores in the loop.
@@ -64,9 +65,12 @@ class CoarseSchedule {
 
   public:
     using iterator = decltype(orderClusters)::iterator;
+    using const_iterator = decltype(orderClusters)::const_iterator;
     ClusterList() = default;
    iterator begin() { return orderClusters.begin(); }
+    const_iterator begin() const { return orderClusters.begin(); }
     iterator end() { return orderClusters.end(); }
+    const_iterator end() const { return orderClusters.end(); }
     size_t size() { return orderClusters.size(); }
     iterator newAtBack() {
       orderClusters.push_back(orderClusters.size());
@@ -86,16 +90,31 @@ class CoarseSchedule {
       }
       return ret;
     }
+
+    bool isBefore(iterator a, iterator b) const {
+      for (auto it = begin(); it != end(); ++it) {
+        if (it == a)
+          return true;
+        if (it == b)
+          return false;
+      }
+      llvm::report_fatal_error(
+          "One or both clusters not found in clusters list!");
+    }
   };
 
+  CoarseSchedule() = default;
   CoarseSchedule(int numStages) : numStages(numStages) {}
-  int numStages;
   ClusterList clusters;
   using Cluster = decltype(clusters)::iterator;
 
   DenseMap<Operation *, std::pair<int, Cluster>> opToStageAndCluster;
 
+  void setNumStages(int numStages) { this->numStages = numStages; }
+  int getNumStages() { return numStages; }
+
   void insert(Operation *op, int stage, Cluster cluster) {
+    assert(stage < numStages && "Invalid stage");
     opToStageAndCluster[op] = {stage, cluster};
   }
@@ -133,9 +152,12 @@ class CoarseSchedule {
   // Set <stage, cluster> based on CoarseSchedule.
   void serialize(scf::ForOp &forOp);
   // Create a CoarseSchedule based on forOp's <stage, cluster>.
-  void deSerialize(scf::ForOp &forOp);
+  LogicalResult deSerialize(scf::ForOp &forOp);
 
   LLVM_DUMP_METHOD void dump();
+
+private:
+  int numStages = 0;
 };
 
 // Add dependencies of anchor ops to the coarse schedule. Schedule them to

lib/Dialect/TritonGPU/Transforms/CMakeLists.txt

Lines changed: 4 additions & 2 deletions
@@ -5,16 +5,18 @@ add_triton_library(TritonGPUTransforms
   FuseNestedLoops.cpp
   CombineTensorSelectAndIf.cpp
   DecomposeScaledBlocked.cpp
-  LoopScheduling.cpp
   ReduceDataDuplication.cpp
   OptimizeAccumulatorInit.cpp
   OptimizeDotOperands.cpp
   OptimizeThreadLocality.cpp
   Pipeliner/AssignLatencies.cpp
-  Pipeliner/MatmulLoopPipeline.cpp
+  Pipeliner/LowerLoops.cpp
+  Pipeliner/ScheduleLoops.cpp
+  Pipeliner/WGMMAPipeline.cpp
   Pipeliner/PipelineExpander.cpp
   Pipeliner/TestPipelineAssignLatencies.cpp
   Pipeliner/TestPipelineScheduleLoop.cpp
+  Pipeliner/TestPipelineLowerLoop.cpp
   Pipeliner/SoftwarePipeliner.cpp
   Pipeliner/TC05MMAPipeline.cpp
   Pipeliner/TMAStoresPipeline.cpp

lib/Dialect/TritonGPU/Transforms/Pipeliner/AssignLatencies.cpp

Lines changed: 4 additions & 17 deletions
@@ -7,7 +7,7 @@
 #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
 #include "llvm/Support/Debug.h"
 
-#define DEBUG_TYPE "triton-pipeline-schedule"
+#define DEBUG_TYPE "triton-loop-pipeline"
 #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
 #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
 
@@ -65,17 +65,6 @@ bool isSmallLoad(tt::LoadOp loadOp,
   return width < 32;
 }
 
-int getCopyVecBytes(RankedTensorType registerTy,
-                    ttg::SharedEncodingTrait sharedEnc) {
-  auto regLayout = triton::gpu::toLinearLayout(registerTy.getShape(),
-                                               registerTy.getEncoding());
-  auto sharedLayout =
-      triton::gpu::toLinearLayout(registerTy.getShape(), sharedEnc);
-  auto regToSharedLayout = regLayout.invertAndCompose(sharedLayout);
-  const int vecElems = regToSharedLayout.getNumConsecutiveInOut();
-  return vecElems * registerTy.getElementTypeBitWidth() / 8;
-}
-
 bool isPipeliningBeneficial(Operation *op, Operation *finalUser,
                             tt::ModuleAxisInfoAnalysis &axisInfoAnalysis) {
   if (auto loadOp = dyn_cast<tt::LoadOp>(op)) {
@@ -233,8 +222,7 @@ void assignUserProvidedLatencies(scf::ForOp forOp,
 // on the requested number of stages, assign the latencies in a way that
 // covers all the stages with the sum of latencies in the chain from the first
 // load to the final dot op.
-DenseMap<Operation *, int> assignLatencies(ModuleOp moduleOp,
-                                           int defaultNumStages) {
+void assignLatencies(ModuleOp moduleOp, int defaultNumStages) {
   auto getNumStagesOrDefault = [defaultNumStages](scf::ForOp forOp) -> int {
     // Use the attribute attached to the loop if it exists otherwise use the
     // global control.
@@ -252,7 +240,7 @@ DenseMap<Operation *, int> assignLatencies(ModuleOp moduleOp,
     loops.push_back(forOp);
   });
   if (loops.empty())
-    return DenseMap<Operation *, int>();
+    return;
 
   DenseMap<Operation *, int> opLatency;
   for (auto forOp : loops) {
@@ -291,9 +279,8 @@ DenseMap<Operation *, int> assignLatencies(ModuleOp moduleOp,
       opLatency[loadOp] = loadLatency;
     }
   }
-  return opLatency;
+  serializeLatencies(moduleOp, opLatency);
 }
-
 } // namespace gpu
 } // namespace triton
 } // namespace mlir
