Skip to content

Commit 76ed94d

Browse files
sjw36 and antiagainst authored
[AMD] Remove stream pipeliner v1 (#4845)
We have had stream pipeliner v2 enabled by default for quite some time, and all known issues have been fixed, so now remove the old v1 pipeliner. Note that this changes how `num_stages` is handled: previously we enabled pipelining if `num_stages` was `0`, which is not good behavior. We now follow the common practice where `0`/`1` won't trigger pipelining anymore; `2` or more is needed to trigger it. Given that downstream users might be using `0` in their codebases, for now we `assert` to give developers a clear indication of the change in behavior instead of silently dropping performance. The `assert` is expected to be removed sometime down the line. --------- Co-authored-by: Lei Zhang <[email protected]>
1 parent d4e5a78 commit 76ed94d

File tree

8 files changed

+8
-982
lines changed

8 files changed

+8
-982
lines changed

bin/RegisterTritonDialects.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
6060
mlir::registerTritonAMDGPUAccelerateMatmul();
6161
mlir::registerTritonAMDGPUOptimizeEpilogue();
6262
mlir::registerTritonAMDGPUReorderInstructions();
63-
mlir::registerTritonAMDGPUStreamPipeline();
6463
mlir::registerTritonAMDGPUStreamPipelineV2();
6564
mlir::registerTritonAMDGPUCanonicalizePointers();
6665

test/TritonGPU/amd/amd-loop-pipeline-v1.mlir

Lines changed: 0 additions & 31 deletions
This file was deleted.

third_party/amd/backend/compiler.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def min_dot_size(target: GPUTarget):
2929
class HIPOptions:
3030
num_warps: int = 4
3131
waves_per_eu: int = 1
32-
num_stages: int = 0
32+
num_stages: int = 2
3333
num_ctas: int = 1
3434
extern_libs: dict = None
3535
cluster_dims: tuple = (1, 1, 1)
@@ -215,23 +215,19 @@ def make_ttgir(mod, metadata, options):
215215
passes.ttgpuir.add_remove_layout_conversions(pm)
216216
amd.passes.ttgpuir.add_optimize_epilogue(pm)
217217
passes.ttgpuir.add_optimize_dot_operands(pm, True)
218-
use_new_pipeliner = os.getenv("TRITON_HIP_USE_NEW_STREAM_PIPELINE", "1") == "1"
219218
if amd.has_matrix_core_feature(options.arch):
220-
if use_new_pipeliner:
221-
# In the old pipeliner we only support num_stages = 0/1, which means something
222-
# different than the NVIDIA side. In the new pipeliner we unify the num_stages
223-
# interpretation. Default to use 2 stages if not explicitly set.
224-
num_stages = options.num_stages if options.num_stages != 0 else 2
225-
amd.passes.ttgpuir.add_stream_pipelinev2(pm, num_stages)
226-
else:
227-
if options.num_stages == 0:
228-
amd.passes.ttgpuir.add_stream_pipeline(pm)
219+
assert options.num_stages != 0, ("Triton AMD backend pipeliner has been updated. "
220+
"We used to trigger software pipelining with "
221+
"num_stages == 0. Now it will not happen anymore; "
222+
"please update to use num_stages == 2 for "
223+
"equivalent behavior in the past.")
224+
amd.passes.ttgpuir.add_stream_pipelinev2(pm, options.num_stages)
229225
passes.common.add_canonicalizer(pm)
230226
amd.passes.ttgpuir.insert_instruction_sched_hints(pm)
231227
passes.ttgpuir.add_optimize_dot_operands(pm, True)
232228
passes.ttgpuir.add_remove_layout_conversions(pm)
233229
passes.ttgpuir.add_reduce_data_duplication(pm)
234-
if use_new_pipeliner or options.num_stages != 0:
230+
if amd.has_matrix_core_feature(options.arch):
235231
amd.passes.ttgpuir.add_reorder_instructions(pm)
236232
amd.passes.ttgpuir.add_canonicalize_pointers(pm)
237233
passes.common.add_canonicalizer(pm)

third_party/amd/include/TritonAMDGPUTransforms/Passes.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@
66

77
namespace mlir {
88

9-
std::unique_ptr<Pass> createTritonAMDGPUStreamPipelinePass();
10-
119
std::unique_ptr<Pass> createTritonAMDGPUStreamPipelineV2Pass(int numStages = 2);
1210

1311
std::unique_ptr<Pass>

third_party/amd/include/TritonAMDGPUTransforms/Passes.td

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,6 @@
33

44
include "mlir/Pass/PassBase.td"
55

6-
def TritonAMDGPUStreamPipeline : Pass<"tritonamdgpu-stream-pipeline", "mlir::ModuleOp"> {
7-
let summary = "pipeline";
8-
9-
let description = [{
10-
Pipeline global loads through registers to shared memory while computing on previous
11-
tile
12-
}];
13-
14-
let constructor = "mlir::createTritonAMDGPUStreamPipelinePass()";
15-
16-
let dependentDialects = [];
17-
}
18-
196
def TritonAMDGPUStreamPipelineV2 : Pass<"tritonamdgpu-stream-pipeline-v2", "mlir::ModuleOp"> {
207
let summary = "pipeline";
218

third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ add_triton_library(TritonAMDGPUTransforms
33
CanonicalizePointers.cpp
44
OptimizeEpilogue.cpp
55
ReorderInstructions.cpp
6-
StreamPipeline.cpp
76
StreamPipelineV2.cpp
87
MfmaGroup.cpp
98

0 commit comments

Comments
 (0)