
Commit 51021fb

[AMD] NFC: Split Loop Expansion from StreamPipeliner (#8307)
This PR refactors StreamPipeliner with the following changes:
- Exposed some structures in the header.
- Split StreamPipeliner into two passes:
  - `TritonAMDGPUScheduleLoops`: generates the schedule. It keeps most of the functionality of StreamPipeliner; a follow-up PR will trim it further so it only schedules global loads and compute.
  - `TritonAMDGPUPipeline`: will eventually contain `lowerLoops`, to allocate LDS and schedule async loads, and `expandLoops`, to mechanically expand loops according to the schedule. As of this PR it contains only `expandLoops`.
- Split `expandLoops` out of StreamPipeliner into the separate Pipeline pass.
- Renamed StreamPipeline.cpp to ScheduleLoops.cpp to better reflect its purpose.
- Updated the pass names in the lit tests accordingly.

This is part of a series of PRs refactoring StreamPipeliner. Previous PRs:
- triton-lang/triton#8295
1 parent 0532264 commit 51021fb
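
Concretely, the single stream-pipeline invocation is replaced by running the two new passes back to back, as shown by the lit-test RUN-line updates in this commit (a representative before/after, options abbreviated):

Before: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline="num_stages=2" -canonicalize
After:  triton-opt %s -split-input-file -tritonamdgpu-schedule-loops="num_stages=2" -tritonamdgpu-pipeline -canonicalize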

18 files changed: +269 −185 lines

bin/RegisterTritonDialects.h

Lines changed: 2 additions & 1 deletion
@@ -108,7 +108,8 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::registerTritonAMDGPUHoistLayoutConversions();
   mlir::registerTritonAMDGPUReorderInstructions();
   mlir::registerTritonAMDGPUBlockPingpong();
-  mlir::registerTritonAMDGPUStreamPipeline();
+  mlir::registerTritonAMDGPUPipeline();
+  mlir::registerTritonAMDGPUScheduleLoops();
   mlir::registerTritonAMDGPUCanonicalizePointers();
   mlir::registerTritonAMDGPUConvertToBufferOps();
   mlir::registerTritonAMDGPUInThreadTranspose();

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 2 additions & 0 deletions
@@ -182,6 +182,8 @@ getLastUseOfPipelinedOp(ArrayRef<Operation *> ops, scf::ForOp forOp,
                          CoarseSchedule &schedule,
                          std::function<bool(Operation *)> filterUse = nullptr);
 
+// Clean up attributes passing over schedules across stages in pipelining
+void removePipeliningAttributes(ModuleOp moduleOp);
 } // namespace triton
 } // namespace mlir

lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp

Lines changed: 8 additions & 0 deletions
@@ -925,3 +925,11 @@ triton::getLastUseOfPipelinedOp(ArrayRef<Operation *> ops, scf::ForOp forOp,
                        current->isBeforeInBlock(candidate));
   });
 }
+
+void triton::removePipeliningAttributes(ModuleOp moduleOp) {
+  moduleOp->walk([&](Operation *op) {
+    op->removeAttr(mlir::triton::kLoopStageAttrName);
+    op->removeAttr(mlir::triton::kLoopClusterAttrName);
+    op->removeAttr(mlir::triton::kScheduledMaxStageAttrName);
+  });
+}
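
For illustration, a minimal usage sketch of the now-shared helper (the wrapper function name below is hypothetical; the actual call site added in this commit is the SoftwarePipeliner.cpp change that follows):

#include "mlir/IR/BuiltinOps.h"
#include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"

// Hypothetical cleanup step a pipelining pass could run once loop expansion is
// done: strip the stage/cluster/max-stage scheduling attributes module-wide.
static void cleanupAfterExpansion(mlir::ModuleOp moduleOp) {
  mlir::triton::removePipeliningAttributes(moduleOp);
}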

lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp

Lines changed: 1 addition & 9 deletions
@@ -148,14 +148,6 @@ static void expandLoops(ModuleOp moduleOp) {
   resolveMaskOp(moduleOp, peeledMaskOps);
 }
 
-static void removeAttributes(ModuleOp moduleOp) {
-  moduleOp->walk([&](Operation *op) {
-    op->removeAttr(mlir::triton::kLoopStageAttrName);
-    op->removeAttr(mlir::triton::kLoopClusterAttrName);
-    op->removeAttr(mlir::triton::kScheduledMaxStageAttrName);
-  });
-}
-
 struct PipelinePass : public impl::TritonGPUPipelineBase<PipelinePass> {
 
   using impl::TritonGPUPipelineBase<PipelinePass>::TritonGPUPipelineBase;
@@ -180,7 +172,7 @@ struct PipelinePass : public impl::TritonGPUPipelineBase<PipelinePass> {
     }
 
     // Cleanup the IR from the pipeline attributes.
-    removeAttributes(moduleOp);
+    removePipeliningAttributes(moduleOp);
 
     pipelineWgmma(moduleOp, numStages);

test/TritonGPU/amd/amd-pipeline-chained-dots.mlir

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline="num_stages=4 use_async_copy=1" -canonicalize | FileCheck %s
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-schedule-loops="num_stages=4 use_async_copy=1" -tritonamdgpu-pipeline="use_async_copy=1" -canonicalize | FileCheck %s
 
 #blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}>
 #mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32, 16], isTransposed = true}>

test/TritonGPU/amd/amd-stream-lds-layout-selection.mlir

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline="num_stages=2" -canonicalize | FileCheck %s
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-schedule-loops="num_stages=2" -tritonamdgpu-pipeline -canonicalize | FileCheck %s
 
 // Pick a common shared memory layout with vec = max kWidth of all users.
 // CHECK{LITERAL}: #shared = #ttg.swizzled_shared<{vec = 8, perPhase = 2, maxPhase = 8, order = [0, 1]}>

test/TritonGPU/amd/amd-stream-loop-assume.mlir

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline="num_stages=2" -canonicalize | FileCheck %s
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-schedule-loops="num_stages=2" -tritonamdgpu-pipeline -canonicalize | FileCheck %s
 
 // matmul: 128x32 @ 32x128 -> 128x128
 #AL = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>

test/TritonGPU/loop-pipeline-combine-waits.mlir

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline="num_stages=3 use_async_copy=1 use_pingpong=1" | FileCheck %s
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-schedule-loops="num_stages=3 use_async_copy=1 use_pingpong=1" -tritonamdgpu-pipeline="use_async_copy=1" | FileCheck %s
 
 #blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}>
 #mma = #ttg.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}>

test/TritonGPU/loop-pipeline-hip.mlir

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline=num_stages=2 -canonicalize | FileCheck %s --check-prefixes=COMMON,SYNC
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline="num_stages=2 use_async_copy=1" -canonicalize | FileCheck %s --check-prefixes=COMMON,ASYNC
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-schedule-loops=num_stages=2 -tritonamdgpu-pipeline -canonicalize | FileCheck %s --check-prefixes=COMMON,SYNC
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-schedule-loops="num_stages=2 use_async_copy=1" -tritonamdgpu-pipeline="use_async_copy=1" -canonicalize | FileCheck %s --check-prefixes=COMMON,ASYNC
 
 #blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}>

test/TritonGPU/loop-pipeline.mlir

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 // RUN: triton-opt %s -split-input-file -tritongpu-assign-latencies -tritongpu-schedule-loops -tritongpu-pipeline=num-stages=3 -canonicalize | FileCheck %s --check-prefixes=COMMON,CHECK
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline=num_stages=2 -canonicalize | FileCheck %s --check-prefixes=COMMON,AMD
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline="num_stages=3" -canonicalize | FileCheck %s --check-prefixes=COMMON,AMD_3_STAGES
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-schedule-loops=num_stages=2 -tritonamdgpu-pipeline -canonicalize | FileCheck %s --check-prefixes=COMMON,AMD
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-schedule-loops="num_stages=3" -tritonamdgpu-pipeline -canonicalize | FileCheck %s --check-prefixes=COMMON,AMD_3_STAGES
 
 // 4 warps
 // matmul: 128x32 @ 32x128 -> 128x128
