Commit 9b05fba

Merge commit '22e212b5433fb1b5b204b8a62277cbdb5bd343b3'
2 parents: d804502 + 22e212b

38 files changed: 168 additions & 89 deletions

CMakeLists.txt

Lines changed: 3 additions & 3 deletions
@@ -65,8 +65,8 @@ if(NOT MSVC)
   set(CMAKE_CXX_FLAGS_TRITONBUILDWITHO1 "-O1")
 else()
   set(CMAKE_CXX_STANDARD 20)
-  set(CMAKE_C_FLAGS_TRITONRELBUILDWITHASSERTS "/Zi /Ob0 /Od /RTC1 /bigobj /Zc:preprocessor")
-  set(CMAKE_CXX_FLAGS_TRITONRELBUILDWITHASSERTS "/Zi /Ob0 /Od /RTC1 /bigobj /Zc:preprocessor")
+  set(CMAKE_C_FLAGS_TRITONRELBUILDWITHASSERTS "/Zi /RTC1 /bigobj /Zc:preprocessor")
+  set(CMAKE_CXX_FLAGS_TRITONRELBUILDWITHASSERTS "/Zi /RTC1 /bigobj /Zc:preprocessor")
   set(CMAKE_EXE_LINKER_FLAGS_TRITONRELBUILDWITHASSERTS "/debug:fastlink /INCREMENTAL")
   set(CMAKE_MODULE_LINKER_FLAGS_TRITONRELBUILDWITHASSERTS "/debug:fastlink /INCREMENTAL")
   set(CMAKE_SHARED_LINKER_FLAGS_TRITONRELBUILDWITHASSERTS "/debug:fastlink /INCREMENTAL")
@@ -96,7 +96,7 @@ if(NOT MSVC)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__STDC_FORMAT_MACROS -Wno-deprecated")
   endif()
 else()
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__STDC_FORMAT_MACROS /wd4244 /wd4624 /wd4715 /wd4530")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__STDC_FORMAT_MACROS")
 endif()
102102

bin/RegisterTritonDialects.h

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::registerTritonAMDGPUAccelerateMatmul();
   mlir::registerTritonAMDGPUOptimizeEpilogue();
   mlir::registerTritonAMDGPUReorderInstructions();
-  mlir::registerTritonAMDGPUStreamPipelineV2();
+  mlir::registerTritonAMDGPUStreamPipeline();
   mlir::registerTritonAMDGPUCanonicalizePointers();
   mlir::registerTritonAMDGPUConvertToBufferOps();
   mlir::triton::registerTritonAMDGPUInsertInstructionSchedHints();

lib/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.cpp

Lines changed: 0 additions & 3 deletions
@@ -2,12 +2,9 @@

 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
-#include "triton/Analysis/Utility.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/Triton/IR/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"

lib/Dialect/TritonGPU/Transforms/LoopScheduling.cpp

Lines changed: 5 additions & 0 deletions
@@ -56,6 +56,11 @@ loadOpsToIndirectionLevelAndUse(scf::ForOp forOp) {
         distance++;
       }
       for (Value operand : op->getOperands()) {
+        if (op->hasTrait<OpTrait::DotLike>()) {
+          // Heuristic: only pipeline A and B operands of the dot op.
+          if (operand == op->getOperand(2))
+            continue;
+        }
         Value v = operand;
         Operation *defOp = v.getDefiningOp();
         if (defOp && defOp->getBlock() == op->getBlock()) {

lib/Dialect/TritonGPU/Transforms/Utility.cpp

Lines changed: 2 additions & 1 deletion
@@ -563,7 +563,8 @@ bool canFoldIntoConversion(Operation *op, Attribute targetEncoding) {
   }
   return isa<triton::gpu::ConvertLayoutOp, arith::ConstantOp,
              triton::MakeRangeOp, triton::SplatOp, triton::HistogramOp,
-             triton::gpu::LocalAllocOp, triton::gpu::LocalStoreOp>(op);
+             triton::gpu::LocalAllocOp, triton::gpu::LocalLoadOp,
+             triton::gpu::LocalStoreOp>(op);
 }

 scf::ForOp replaceForOpWithNewSignature(

python/setup.py

Lines changed: 1 addition & 0 deletions
@@ -299,6 +299,7 @@ def update_symlink(link_path, source_path):
         shutil.rmtree(link_path)

     print(f"creating symlink: {link_path} -> {source_path}", file=sys.stderr)
+    link_path.absolute().parent.mkdir(parents=True, exist_ok=True)  # Ensure link's parent directory exists
     link_path.symlink_to(source_path, target_is_directory=True)
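
Note on the setup.py change above: Path.symlink_to raises FileNotFoundError when the directory that should contain the link does not exist yet, so the parent is created first. A minimal standalone sketch of the same pattern, using only the standard library (the helper name and paths below are hypothetical, not from this commit):

    import sys
    from pathlib import Path

    def ensure_symlink(link_path: Path, source_path: Path) -> None:
        """Create link_path -> source_path, creating missing parent directories first."""
        print(f"creating symlink: {link_path} -> {source_path}", file=sys.stderr)
        # Without this, symlink_to fails when the link's parent directory
        # has not been created yet.
        link_path.absolute().parent.mkdir(parents=True, exist_ok=True)
        link_path.symlink_to(source_path, target_is_directory=True)

    # Hypothetical usage (illustrative paths only):
    # ensure_symlink(Path("build/links/backend"), Path("third_party/backend"))
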
python/src/ir.cc

Lines changed: 3 additions & 5 deletions
@@ -6,7 +6,6 @@
 #include "mlir/Bytecode/BytecodeWriter.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
-#include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h"
 #include "mlir/Dialect/UB/IR/UBOps.h"
@@ -245,10 +244,9 @@ void init_triton_ir(py::module &&m) {
   m.def("load_dialects", [](MLIRContext &context) {
     DialectRegistry registry;
     registry.insert<TritonDialect, ::mlir::triton::gpu::TritonGPUDialect,
-                    math::MathDialect, arith::ArithDialect, index::IndexDialect,
-                    scf::SCFDialect, ::mlir::gpu::GPUDialect,
-                    cf::ControlFlowDialect, LLVM::LLVMDialect,
-                    mlir::ub::UBDialect>();
+                    math::MathDialect, arith::ArithDialect, scf::SCFDialect,
+                    ::mlir::gpu::GPUDialect, cf::ControlFlowDialect,
+                    LLVM::LLVMDialect, mlir::ub::UBDialect>();
     mlir::LLVM::registerInlinerInterface(registry);
     registerBuiltinDialectTranslation(registry);
     registerLLVMDialectTranslation(registry);
test/TritonGPU/amd/amd-instruction-sched.mlir

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
// RUN: triton-opt %s -split-input-file -triton-amdgpu-insert-instruction-sched-hints -triton-amdgpu-lower-insert-instruction-sched-hints='variant=llvm-iglp-0' -verify-diagnostics | FileCheck %s -check-prefix=INSERT_IGLP0
22
// RUN: triton-opt %s -split-input-file -triton-amdgpu-insert-instruction-sched-hints -triton-amdgpu-lower-insert-instruction-sched-hints='variant=llvm-iglp-1' -verify-diagnostics | FileCheck %s -check-prefix=INSERT_IGLP1
3-
// RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu='target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64' -tritongpu-coalesce -tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942 matrix-instruction-size=32 kPack=1' -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline-v2='num_stages=1' -triton-amdgpu-insert-instruction-sched-hints -decompose-unsupported-amd-conversions -optimize-amd-lds-usage='target-arch=gfx942' -convert-scf-to-cf -convert-index-to-llvm -allocate-shared-memory -convert-triton-amdgpu-to-llvm='arch=gfx942' -verify-diagnostics | FileCheck %s -check-prefix=INSTR_COUNT_NS1
4-
// RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu='target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64' -tritongpu-coalesce -tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942 matrix-instruction-size=32 kPack=1' -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline-v2='num_stages=2' -triton-amdgpu-insert-instruction-sched-hints -decompose-unsupported-amd-conversions -optimize-amd-lds-usage='target-arch=gfx942' -convert-scf-to-cf -convert-index-to-llvm -allocate-shared-memory -convert-triton-amdgpu-to-llvm='arch=gfx942' -verify-diagnostics | FileCheck %s -check-prefix=INSTR_COUNT_NS2
5-
// RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu='target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64' -tritongpu-coalesce -tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942 matrix-instruction-size=32 kPack=1' -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline-v2='num_stages=2' -triton-amdgpu-insert-instruction-sched-hints -decompose-unsupported-amd-conversions -optimize-amd-lds-usage='target-arch=gfx942' -convert-scf-to-cf -convert-index-to-llvm -allocate-shared-memory -convert-triton-amdgpu-to-llvm='arch=gfx942' -triton-amdgpu-lower-insert-instruction-sched-hints='variant=local-prefetch' -debug-only='lower-insert-instruction-sched-hints' -verify-diagnostics 2>&1 | FileCheck %s -check-prefix=USE_LOCAL_PREFETCH_GLOBAL_LOAD
6-
// RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu='target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64' -tritongpu-coalesce -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline-v2='num_stages=1' | FileCheck %s -check-prefix=LABELING_PS_1
7-
// RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu='target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64' -tritongpu-coalesce -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline-v2='num_stages=2' | FileCheck %s -check-prefix=LABELING_PS_2
3+
// RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu='target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64' -tritongpu-coalesce -tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942 matrix-instruction-size=32 kPack=1' -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline='num_stages=1' -triton-amdgpu-insert-instruction-sched-hints -decompose-unsupported-amd-conversions -optimize-amd-lds-usage='target-arch=gfx942' -convert-scf-to-cf -convert-index-to-llvm -allocate-shared-memory -convert-triton-amdgpu-to-llvm='arch=gfx942' -verify-diagnostics | FileCheck %s -check-prefix=INSTR_COUNT_NS1
4+
// RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu='target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64' -tritongpu-coalesce -tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942 matrix-instruction-size=32 kPack=1' -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline='num_stages=2' -triton-amdgpu-insert-instruction-sched-hints -decompose-unsupported-amd-conversions -optimize-amd-lds-usage='target-arch=gfx942' -convert-scf-to-cf -convert-index-to-llvm -allocate-shared-memory -convert-triton-amdgpu-to-llvm='arch=gfx942' -verify-diagnostics | FileCheck %s -check-prefix=INSTR_COUNT_NS2
5+
// RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu='target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64' -tritongpu-coalesce -tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942 matrix-instruction-size=32 kPack=1' -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline='num_stages=2' -triton-amdgpu-insert-instruction-sched-hints -decompose-unsupported-amd-conversions -optimize-amd-lds-usage='target-arch=gfx942' -convert-scf-to-cf -convert-index-to-llvm -allocate-shared-memory -convert-triton-amdgpu-to-llvm='arch=gfx942' -triton-amdgpu-lower-insert-instruction-sched-hints='variant=local-prefetch' -debug-only='lower-insert-instruction-sched-hints' -verify-diagnostics 2>&1 | FileCheck %s -check-prefix=USE_LOCAL_PREFETCH_GLOBAL_LOAD
6+
// RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu='target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64' -tritongpu-coalesce -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline='num_stages=1' | FileCheck %s -check-prefix=LABELING_PS_1
7+
// RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu='target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64' -tritongpu-coalesce -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline='num_stages=2' | FileCheck %s -check-prefix=LABELING_PS_2
88

99
module {
1010
// INSERT_IGLP0-LABEL: @test_dot_op

test/TritonGPU/combine.mlir

Lines changed: 18 additions & 0 deletions
@@ -2685,3 +2685,21 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war
     tt.return
   }
 }
+
+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 1, 1, 4], threadsPerWarp = [2, 1, 16, 1, 1], warpsPerCTA = [1, 1, 2, 2, 1], order = [4, 0, 1, 2, 3]}>
+#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1, 1, 1, 4], threadsPerWarp = [1, 1, 32, 1, 1], warpsPerCTA = [1, 1, 1, 1, 4], order = [4, 3, 2, 1, 0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1, 1, 1, 4], threadsPerWarp = [2, 1, 16, 1, 1], warpsPerCTA = [1, 2, 2, 1, 1], order = [4, 0, 3, 2, 1]}>
+#shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [4, 0, 1, 2, 3], hasLeadingOffset = false}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:100", "triton_gpu.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: lift_convert_to_local_load
+  // CHECK-NOT: convert_layout
+  // CHECK: tt.return
+  tt.func public @lift_convert_to_local_load(%arg0 : !triton_gpu.memdesc<2x1x32x4x4xi8, #shared, #triton_gpu.shared_memory, mutable>) -> tensor<2x4x32x1x4xi8, #blocked2> {
+    %1 = triton_gpu.local_load %arg0 : !triton_gpu.memdesc<2x1x32x4x4xi8, #shared, #triton_gpu.shared_memory, mutable> -> tensor<2x1x32x4x4xi8, #blocked>
+    %2 = tt.trans %1 {order = array<i32: 0, 3, 2, 1, 4>} : tensor<2x1x32x4x4xi8, #blocked> -> tensor<2x4x32x1x4xi8, #blocked1>
+    %3 = triton_gpu.convert_layout %2 : tensor<2x4x32x1x4xi8, #blocked1> -> tensor<2x4x32x1x4xi8, #blocked2>
+    tt.return %3 : tensor<2x4x32x1x4xi8, #blocked2>
+  }
+}

test/TritonGPU/loop-pipeline-hip.mlir

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline-v2=num_stages=2 -canonicalize | FileCheck %s
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline=num_stages=2 -canonicalize | FileCheck %s

 #blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}>
 #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
