Skip to content

Commit 9a2d7a5

Browse files
Merge commit 'cea35daf3578767f45db9904f8437ed96d2dfaa8'
2 parents d5b7656 + cea35da commit 9a2d7a5

File tree

22 files changed: +2292 −91 lines

bin/RegisterTritonDialects.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
8585
// TritonAMDGPUTransforms passes
8686
mlir::registerTritonAMDGPUAccelerateMatmul();
8787
mlir::registerTritonAMDGPUOptimizeEpilogue();
88+
mlir::registerTritonAMDGPUBypassLDSForDotOperand();
8889
mlir::registerTritonAMDGPUReorderInstructions();
8990
mlir::registerTritonAMDGPUBlockPingpong();
9091
mlir::registerTritonAMDGPUStreamPipeline();

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,22 @@ def TritonGPUTestPipelineScheduleLoop : Pass<"tritongpu-test-pipeline-schedule-l
5555
"mlir::arith::ArithDialect"];
5656
}
5757

58+
def TritonGPUFuseNestedLoops : Pass<"tritongpu-fuse-nested-loops", "mlir::ModuleOp"> {
59+
let summary = "fuse nested loops for pipelining";
60+
61+
let description = [{
62+
The `tritongpu-fuse-nested-loops` pass will analyze loop nests in the module
63+
that need to be pipelined and fuse them into a single loop. This composes
64+
with the pipeliner to pipeline loop nests.
65+
}];
66+
67+
let dependentDialects = [
68+
"mlir::triton::gpu::TritonGPUDialect",
69+
"mlir::arith::ArithDialect",
70+
"mlir::ub::UBDialect",
71+
];
72+
}
73+
5874
def TritonGPUF32DotTC : Pass<"tritongpu-F32DotTC", "mlir::ModuleOp"> {
5975
let summary = "3xTF32 trick";
6076

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,8 @@ enum class MMALoadType {
205205
};
206206
MMALoadType getMMALoadType(Operation *loadOp);
207207

208+
// Convert \param op operands and results to layout \param encoding.
209+
void convertOpEncoding(Attribute encoding, Operation *op);
208210
} // namespace mlir
209211

210212
#endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
3131
"TRITON_ENABLE_LLVM_DEBUG",
3232
"TRITON_HIP_STREAM_PREFETCH",
3333
"TRITON_HIP_USE_BLOCK_PINGPONG",
34+
"TRITON_HIP_BYPASS_LDS_FOR_DOT",
3435
"TRITON_LLVM_DEBUG_ONLY",
3536
"TRITON_ENABLE_ASAN",
3637
"TRITON_OVERRIDE_ARCH",

lib/Conversion/TritonGPUToLLVM/CMakeLists.txt

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,25 @@
11
add_triton_library(TritonGPUToLLVM
22
ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp
33
DotOpToLLVM/FMA.cpp
4-
GlobalScratchMemoryAllocation.cpp
5-
TypeConverter.cpp
6-
Utility.cpp
7-
ElementwiseOpToLLVM.cpp
8-
MemoryOpToLLVM.cpp
4+
AllocateSharedMemory.cpp
95
AssertOpToLLVM.cpp
10-
ViewOpToLLVM.cpp
11-
MakeRangeOpToLLVM.cpp
6+
ControlFlowOpToLLVM.cpp
7+
ConvertLayoutOpToLLVM.cpp
8+
DecomposeUnsupportedConversions.cpp
9+
ElementwiseOpToLLVM.cpp
10+
FuncOpToLLVM.cpp
11+
GatherOpToLLVM.cpp
12+
GlobalScratchMemoryAllocation.cpp
1213
HistogramOpToLLVM.cpp
13-
AllocateSharedMemory.cpp
14+
MakeRangeOpToLLVM.cpp
15+
MemoryOpToLLVM.cpp
16+
PrintOpToLLVM.cpp
1417
ReduceOpToLLVM.cpp
1518
ScanOpToLLVM.cpp
16-
GatherOpToLLVM.cpp
17-
ConvertLayoutOpToLLVM.cpp
18-
ControlFlowOpToLLVM.cpp
19-
FuncOpToLLVM.cpp
2019
SPMDOpToLLVM.cpp
21-
DecomposeUnsupportedConversions.cpp
22-
PrintOpToLLVM.cpp
20+
TypeConverter.cpp
21+
Utility.cpp
22+
ViewOpToLLVM.cpp
2323

2424
DEPENDS
2525
TritonGPUConversionPassIncGen

lib/Dialect/TritonGPU/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ add_triton_library(TritonGPUTransforms
22
AccelerateMatmul.cpp
33
Coalesce.cpp
44
F32DotTC.cpp
5+
FuseNestedLoops.cpp
56
CombineTensorSelectAndIf.cpp
67
LoopScheduling.cpp
78
ReduceDataDuplication.cpp

lib/Dialect/TritonGPU/Transforms/Coalesce.cpp

Lines changed: 1 addition & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -104,55 +104,6 @@ struct CoalescePass : public impl::TritonGPUCoalesceBase<CoalescePass> {
104104
threadsPerWarp, CTALayout);
105105
}
106106

107-
static Type getNewType(Type type, Attribute encoding) {
108-
RankedTensorType tensorType = cast<RankedTensorType>(type);
109-
return RankedTensorType::get(tensorType.getShape(),
110-
tensorType.getElementType(), encoding);
111-
}
112-
113-
void coalesceOp(Attribute encoding, Operation *op) {
114-
OpBuilder builder(op);
115-
// Convert operands
116-
// For load/store with tensor pointers, we don't have to change the
117-
// operands' type, we do this by changing the outputs' type of
118-
// `make_tensor_ptr`
119-
SmallVector<Value, 4> newArgs;
120-
for (auto operand : op->getOperands()) {
121-
auto tensorType = dyn_cast<RankedTensorType>(operand.getType());
122-
if (tensorType &&
123-
!isa<triton::gpu::SharedEncodingAttr>(tensorType.getEncoding())) {
124-
Type newType = getNewType(tensorType, encoding);
125-
newArgs.push_back(builder.create<triton::gpu::ConvertLayoutOp>(
126-
op->getLoc(), newType, operand));
127-
} else {
128-
newArgs.push_back(operand);
129-
}
130-
}
131-
132-
// Convert output types
133-
SmallVector<Type, 4> newTypes;
134-
for (auto t : op->getResultTypes()) {
135-
bool isAsync = isa<triton::gpu::AsyncCopyGlobalToLocalOp>(op);
136-
newTypes.push_back(isAsync ? t : getNewType(t, encoding));
137-
}
138-
139-
// Construct new op with the new encoding
140-
Operation *newOp =
141-
builder.create(op->getLoc(), op->getName().getIdentifier(), newArgs,
142-
newTypes, op->getAttrs());
143-
144-
// Cast the results back to the original layout
145-
for (size_t i = 0; i < op->getNumResults(); i++) {
146-
Value newResult = newOp->getResult(i);
147-
if (newTypes[i] != op->getResultTypes()[i]) {
148-
newResult = builder.create<triton::gpu::ConvertLayoutOp>(
149-
op->getLoc(), op->getResult(i).getType(), newResult);
150-
}
151-
op->getResult(i).replaceAllUsesWith(newResult);
152-
}
153-
op->erase();
154-
}
155-
156107
void runOnOperation() override {
157108
// Run axis info analysis
158109
ModuleOp moduleOp = getOperation();
@@ -187,7 +138,7 @@ struct CoalescePass : public impl::TritonGPUCoalesceBase<CoalescePass> {
187138
// 4. Convert the output of this new memory op back to L1
188139
// 5. Replace all the uses of the original memory op by the new one
189140
for (auto &kv : layoutMap) {
190-
coalesceOp(kv.second, kv.first);
141+
convertOpEncoding(kv.second, kv.first);
191142
}
192143
}
193144
};

0 commit comments

Comments (0)