1 change: 1 addition & 0 deletions bin/RegisterTritonDialects.h
@@ -112,6 +112,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
mlir::registerTritonAMDGPUScheduleLoops();
mlir::registerTritonAMDGPUCanonicalizePointers();
mlir::registerTritonAMDGPUConvertToBufferOps();
mlir::registerTritonAMDGPUOptimizeBufferOpPtr();
mlir::registerTritonAMDGPUInThreadTranspose();
mlir::registerTritonAMDGPUCoalesceAsyncCopy();
mlir::registerTritonAMDGPUUpdateAsyncWaitCount();
5 changes: 5 additions & 0 deletions python/src/passes.h
@@ -1,6 +1,11 @@
#define ADD_PASS_WRAPPER_0(name, builder) \
m.def(name, [](mlir::PassManager &pm) { pm.addPass(builder()); })

#define ADD_FUNC_PASS_WRAPPER_0(name, builder) \
m.def(name, [](mlir::PassManager &pm) { \
pm.addNestedPass<mlir::triton::FuncOp>(builder()); \
});

#define ADD_PASS_WRAPPER_1(name, builder, ty0) \
m.def(name, \
[](mlir::PassManager &pm, ty0 val0) { pm.addPass(builder(val0)); })
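
Note (editorial, not part of the diff): the new wrapper is presumably consumed
by the AMD Python bindings to expose the function-anchored pass, along these
lines, assuming the conventional tablegen-generated builder name:

ADD_FUNC_PASS_WRAPPER_0("add_optimize_buffer_op_ptr",
                        mlir::createTritonAMDGPUOptimizeBufferOpPtr);

Unlike ADD_PASS_WRAPPER_0, this nests the pass under each mlir::triton::FuncOp
via addNestedPass rather than adding it at module scope, matching the pass
anchor declared in Passes.td below.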
266 changes: 266 additions & 0 deletions test/TritonGPU/amd/amd-convert-buffer-ops-base-ptr-increment.mlir
@@ -0,0 +1,266 @@
// RUN: triton-opt %s -split-input-file --tritonamdgpu-convert-buffer-ops="arch-generation-name=gfx950" --tritonamdgpu-optimize-buffer-op-ptr | FileCheck %s --check-prefixes=COMMON
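// These tests run the buffer-ops conversion first, so the tt.load-based test
// below is rewritten to amdgpu.buffer_load before the new pass runs. The pass
// itself moves a uniform, loop-carried offset increment from a buffer op's
// tensor offset onto its scalar base pointer via tt.addptr.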

// COMMON-LABEL: add_after_load
// COMMON-DAG: [[X_OFFSET_CST:%.*]] = arith.constant dense<123>
// COMMON-DAG: [[Y_OFFSET_CST:%.*]] = arith.constant dense<321>
// COMMON: scf.for {{.*}} iter_args({{.*}}, {{.*}}, [[X_BASE:%.*]] = {{.*}}, [[Y_BASE:%.*]] = {{.*}})
// COMMON: amdgpu.buffer_load [[X_BASE]]{{\[}}[[X_OFFSET_CST]]{{\]}}
// COMMON: amdgpu.buffer_load [[Y_BASE]]{{\[}}[[Y_OFFSET_CST]]{{\]}}
// COMMON: [[NEXT_X_BASE:%.*]] = tt.addptr [[X_BASE]], %c64_i32
// COMMON: [[NEXT_Y_BASE:%.*]] = tt.addptr [[Y_BASE]]
// COMMON: scf.yield {{.*}}, [[NEXT_X_BASE]], [[NEXT_Y_BASE]]

#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 64 : i32} {
tt.func public @add_after_load(%X: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %Y: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %stride: i32) attributes {noinline = false} {
%cst = arith.constant dense<64> : tensor<16x64xi32, #blocked>
%c0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
%c64_i32 = arith.constant 64 : i32
%c1 = arith.constant 1 : index

%Xoffset_init = arith.constant dense<123> : tensor<16x64xi32, #blocked>
%Yoffset_init = arith.constant dense<321> : tensor<64x32xi32, #blocked>

%x_dummy_buffer = ttg.local_alloc : () -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 16x64>
%y_dummy_buffer = ttg.local_alloc : () -> !ttg.memdesc<64x32xf16, #shared, #smem, mutable, 64x32>

%tmp = arith.muli %stride, %c64_i32 : i32
%step = tt.splat %tmp : i32 -> tensor<64x32xi32, #blocked>
%for:2 = scf.for %idx = %c0 to %c128 step %c1 iter_args(%Xoffset = %Xoffset_init, %Yoffset = %Yoffset_init) -> (tensor<16x64xi32, #blocked>, tensor<64x32xi32, #blocked>) {
%x = amdgpu.buffer_load %X[%Xoffset] : tensor<16x64xf16, #blocked>
%y = amdgpu.buffer_load %Y[%Yoffset] cacheModifier = cg : tensor<64x32xf16, #blocked>

ttg.local_store %x, %x_dummy_buffer : tensor<16x64xf16, #blocked> -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 16x64>
ttg.local_store %y, %y_dummy_buffer : tensor<64x32xf16, #blocked> -> !ttg.memdesc<64x32xf16, #shared, #smem, mutable, 64x32>

%Xoffset_next = arith.addi %Xoffset, %cst : tensor<16x64xi32, #blocked>
%Yoffset_next = arith.addi %Yoffset, %step : tensor<64x32xi32, #blocked>
scf.yield %Xoffset_next, %Yoffset_next : tensor<16x64xi32, #blocked>, tensor<64x32xi32, #blocked>
}
tt.return
}
}

// -----
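
// The rewrite applies to amdgpu.buffer_load_to_local as well: the base
// pointer becomes the loop-carried value while the offset stays constant.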

// COMMON-LABEL: buffer_load_to_local
// COMMON-DAG: [[X_OFFSET_CST:%.*]] = arith.constant dense<123>
// COMMON: scf.for {{.*}} iter_args({{.*}}, [[X_BASE:%.*]] = {{.*}}
// COMMON: amdgpu.buffer_load_to_local [[X_BASE]]{{\[}}[[X_OFFSET_CST]]{{\]}}
// COMMON: [[NEXT_X_BASE:%.*]] = tt.addptr [[X_BASE]], %c64_i32
// COMMON: scf.yield {{.*}}, [[NEXT_X_BASE]]

#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 64 : i32} {
tt.func public @buffer_load_to_local(%X: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) attributes {noinline = false} {
%cst = arith.constant dense<64> : tensor<16x64xi32, #blocked>
%c0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
%c1 = arith.constant 1 : index

%Xoffset_init = arith.constant dense<123> : tensor<16x64xi32, #blocked>

%x_dummy_buffer = ttg.local_alloc : () -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 16x64>

%for = scf.for %idx = %c0 to %c128 step %c1 iter_args(%Xoffset = %Xoffset_init) -> (tensor<16x64xi32, #blocked>) {
%x = amdgpu.buffer_load_to_local %X[%Xoffset] into %x_dummy_buffer : <f16>[tensor<16x64xi32, #blocked>] -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 16x64>

%Xoffset_next = arith.addi %Xoffset, %cst : tensor<16x64xi32, #blocked>
scf.yield %Xoffset_next : tensor<16x64xi32, #blocked>
}
tt.return
}
}

// -----
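
// The offset increment may also precede the load: the base pointer is then
// advanced first and the load reads through the already-incremented base.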

// COMMON-LABEL: add_before_load
// COMMON-DAG: [[X_OFFSET_CST:%.*]] = arith.constant dense<123>
// COMMON: scf.for {{.*}} iter_args({{.*}}, [[X_BASE:%.*]] = {{.*}})
// COMMON: [[NEXT_X_BASE:%.*]] = tt.addptr [[X_BASE]], %c64_i32
// COMMON: amdgpu.buffer_load [[NEXT_X_BASE]]{{\[}}[[X_OFFSET_CST]]{{\]}}
// COMMON: scf.yield {{.*}}, [[NEXT_X_BASE]]

#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 64 : i32} {
tt.func public @add_before_load(%X: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %Y: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) attributes {noinline = false} {
%cst = arith.constant dense<64> : tensor<16x64xi32, #blocked>
%c0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
%c1 = arith.constant 1 : index
%Xoffset_init = arith.constant dense<123> : tensor<16x64xi32, #blocked>
%x_dummy_buffer = ttg.local_alloc : () -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 16x64>
%for = scf.for %idx = %c0 to %c128 step %c1 iter_args(%Xoffset = %Xoffset_init) -> (tensor<16x64xi32, #blocked>) {
%Xoffset_next = arith.addi %Xoffset, %cst : tensor<16x64xi32, #blocked>
%x = amdgpu.buffer_load %X[%Xoffset_next] : tensor<16x64xf16, #blocked>
ttg.local_store %x, %x_dummy_buffer : tensor<16x64xf16, #blocked> -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 16x64>
scf.yield %Xoffset_next : tensor<16x64xi32, #blocked>
}
tt.return
}
}

// -----
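
// The candidate pattern lives entirely in the inner loop, so the rewrite is
// applied to the inner loop's iter_args.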

// COMMON-LABEL: isolated_pattern_nested_loop1
// COMMON: [[X_OFFSET_CST:%.*]] = arith.constant dense<123>
// COMMON: scf.for
// COMMON: scf.for {{.*}} iter_args({{.*}}, [[X_BASE:%.*]] = {{.*}})
// COMMON: amdgpu.buffer_load [[X_BASE]]{{\[}}[[X_OFFSET_CST]]{{\]}}
// COMMON: [[NEXT_X_BASE:%.*]] = tt.addptr [[X_BASE]]
// COMMON: scf.yield {{.*}}, [[NEXT_X_BASE]]

#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 64 : i32} {
tt.func public @isolated_pattern_nested_loop1(%X: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) attributes {noinline = false} {
%cst = arith.constant dense<64> : tensor<16x64xi32, #blocked>
%c0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
%c1 = arith.constant 1 : index

%Xoffset_init = arith.constant dense<123> : tensor<16x64xi32, #blocked>
%x_dummy_buffer = ttg.local_alloc : () -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 16x64>
scf.for %idx_outer = %c0 to %c128 step %c1 iter_args() -> () {
%for_inner = scf.for %idx_inner = %c0 to %c128 step %c1 iter_args(%Xoffset = %Xoffset_init) -> (tensor<16x64xi32, #blocked>) {
%x = amdgpu.buffer_load %X[%Xoffset] : tensor<16x64xf16, #blocked>
ttg.local_store %x, %x_dummy_buffer : tensor<16x64xf16, #blocked> -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 16x64>
%Xoffset_next = arith.addi %Xoffset, %cst : tensor<16x64xi32, #blocked>
scf.yield %Xoffset_next : tensor<16x64xi32, #blocked>
}
scf.yield
}
tt.return
}
}

// -----
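
// Here the offset is carried by the outer loop while the load sits in the
// inner loop; the new base pointer is threaded through the outer loop's
// iter_args.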

// COMMON-LABEL: isolated_pattern_nested_loop2
// COMMON: [[X_OFFSET_CST:%.*]] = arith.constant dense<123>
// COMMON: scf.for {{.*}} iter_args({{.*}}, [[X_BASE:%.*]] = {{.*}})
// COMMON: scf.for
// COMMON: amdgpu.buffer_load [[X_BASE]]{{\[}}[[X_OFFSET_CST]]{{\]}}
// COMMON: [[NEXT_X_BASE:%.*]] = tt.addptr [[X_BASE]]
// COMMON: scf.yield {{.*}}, [[NEXT_X_BASE]]

#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 64 : i32} {
tt.func public @isolated_pattern_nested_loop2(%X: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) attributes {noinline = false} {
%cst = arith.constant dense<64> : tensor<16x64xi32, #blocked>
%c0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
%c1 = arith.constant 1 : index

%Xoffset_init = arith.constant dense<123> : tensor<16x64xi32, #blocked>
%x_dummy_buffer = ttg.local_alloc : () -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 16x64>
%for_outer = scf.for %idx_outer = %c0 to %c128 step %c1 iter_args(%Xoffset = %Xoffset_init) -> (tensor<16x64xi32, #blocked>) {
scf.for %idx_inner = %c0 to %c128 step %c1 iter_args() -> () {
%x = amdgpu.buffer_load %X[%Xoffset] : tensor<16x64xf16, #blocked>
ttg.local_store %x, %x_dummy_buffer : tensor<16x64xf16, #blocked> -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 16x64>
scf.yield
}
%Xoffset_next = arith.addi %Xoffset, %cst : tensor<16x64xi32, #blocked>
scf.yield %Xoffset_next : tensor<16x64xi32, #blocked>
}
tt.return
}
}

// -----
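
// Starting from a plain tensor-of-pointers tt.load: convert-buffer-ops first
// rewrites it to amdgpu.buffer_load, and the pointer optimization then
// applies to the result.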

// COMMON-LABEL: convert_with_base_ptr_optimization
// COMMON: [[X_OFFSET_CST:%.*]] = arith.constant dense<123>
// COMMON: scf.for {{.*}} iter_args({{.*}}, [[X_BASE:%.*]] = {{.*}})
// COMMON: amdgpu.buffer_load [[X_BASE]]{{\[}}[[X_OFFSET_CST]]{{\]}}
// COMMON: [[NEXT_X_BASE:%.*]] = tt.addptr [[X_BASE]]
// COMMON: scf.yield {{.*}}, [[NEXT_X_BASE]]

#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 64 : i32} {
tt.func public @convert_with_base_ptr_optimization(%X: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) attributes {noinline = false} {
%step = arith.constant dense<64> : tensor<16x64xi32, #blocked>
%c0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
%c1 = arith.constant 1 : index
%x_base = tt.splat %X : !tt.ptr<f16> -> tensor<16x64x!tt.ptr<f16>, #blocked>
%offsets_init = arith.constant dense<123> : tensor<16x64xi32, #blocked>
%x_dummy_buffer = ttg.local_alloc : () -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 16x64>
%for = scf.for %idx_outer = %c0 to %c128 step %c1 iter_args(%offsets = %offsets_init) -> (tensor<16x64xi32, #blocked>) {
%X_ptr = tt.addptr %x_base, %offsets : tensor<16x64x!tt.ptr<f16>, #blocked>, tensor<16x64xi32, #blocked>
%x = tt.load %X_ptr : tensor<16x64x!tt.ptr<f16>, #blocked>
ttg.local_store %x, %x_dummy_buffer : tensor<16x64xf16, #blocked> -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 16x64>
%offsets_next = arith.addi %offsets, %step : tensor<16x64xi32, #blocked>
scf.yield %offsets_next : tensor<16x64xi32, #blocked>
}
tt.return
}
}

// -----
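
// Negative test: the base pointer is recomputed from the induction variable
// on every iteration, so it is not loop-invariant and the offset increment
// cannot be folded into a loop-carried base.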

// COMMON-LABEL: convert_with_dynamic_base_negative
// COMMON: [[X_BASE:%.*]] = tt.addptr
// COMMON: amdgpu.buffer_load [[X_BASE]]
// COMMON-NOT: tt.addptr

#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 64 : i32} {
tt.func public @convert_with_dynamic_base_negative(%X: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) attributes {noinline = false} {
%cst = arith.constant dense<64> : tensor<16x64xi32, #blocked>
%c0 = arith.constant 0 : i32
%c128 = arith.constant 128 : i32
%c1 = arith.constant 1 : i32

%Xoffset_init = arith.constant dense<123> : tensor<16x64xi32, #blocked>
%x_dummy_buffer = ttg.local_alloc : () -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 16x64>
%for = scf.for %idx = %c0 to %c128 step %c1 iter_args(%Xoffset = %Xoffset_init) -> (tensor<16x64xi32, #blocked>) : i32 {
%x_base = tt.addptr %X, %idx : !tt.ptr<f16>, i32
%x = amdgpu.buffer_load %x_base[%Xoffset] : tensor<16x64xf16, #blocked>
ttg.local_store %x, %x_dummy_buffer : tensor<16x64xf16, #blocked> -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 16x64>
%Xoffset_next = arith.addi %Xoffset, %cst : tensor<16x64xi32, #blocked>
scf.yield %Xoffset_next : tensor<16x64xi32, #blocked>
}
tt.return
}
}

// -----
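
// Negative test: the step is a function argument rather than a constant or
// splat, so no uniform scalar increment can be extracted and the IR is left
// unchanged.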

// COMMON-LABEL: convert_with_non_uniform_step_negative
// COMMON-NOT: tt.addptr

#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 64 : i32} {
tt.func public @convert_with_non_uniform_step_negative(%X: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %step : tensor<16x64xi32, #blocked>) attributes {noinline = false} {
%c0 = arith.constant 0 : i32
%c128 = arith.constant 128 : i32
%c1 = arith.constant 1 : i32

%Xoffset_init = arith.constant dense<123> : tensor<16x64xi32, #blocked>
%x_dummy_buffer = ttg.local_alloc : () -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 16x64>
%for = scf.for %idx = %c0 to %c128 step %c1 iter_args(%Xoffset = %Xoffset_init) -> (tensor<16x64xi32, #blocked>) : i32 {
%x = amdgpu.buffer_load %X[%Xoffset] : tensor<16x64xf16, #blocked>
ttg.local_store %x, %x_dummy_buffer : tensor<16x64xf16, #blocked> -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 16x64>
%Xoffset_next = arith.addi %Xoffset, %step : tensor<16x64xi32, #blocked>
scf.yield %Xoffset_next : tensor<16x64xi32, #blocked>
}
tt.return
}
}
1 change: 1 addition & 0 deletions third_party/amd/backend/compiler.py
@@ -251,6 +251,7 @@ def make_ttgir(mod, metadata, options):
knobs.amd.use_buffer_atomics,
knobs.amd.buffer_ops_analyze_small_tensor_range,
)
amd.passes.ttgpuir.add_optimize_buffer_op_ptr(pm)

amd.passes.ttgpuir.add_fold_true_cmpi(pm)
passes.common.add_canonicalizer(pm)
8 changes: 8 additions & 0 deletions third_party/amd/include/TritonAMDGPUTransforms/Passes.td
@@ -160,6 +160,14 @@ def TritonAMDGPUConvertToBufferOps : Pass<"tritonamdgpu-convert-buffer-ops", "ml
];
}

def TritonAMDGPUOptimizeBufferOpPtr : Pass<"tritonamdgpu-optimize-buffer-op-ptr", "mlir::triton::FuncOp"> {
let summary = "Optimize address operands of buffer operations";

let description = [{
This pass optimizes address computation for buffer operations. When a buffer
operation's tensor offset is a loop-carried value advanced by a uniform
(constant or splat) step on every iteration, the increment is moved onto the
scalar base pointer: the base becomes a new loop-carried value advanced with
tt.addptr, and the tensor offset used by the operation stays loop-invariant.
}];

let dependentDialects = ["mlir::triton::amdgpu::TritonAMDGPUDialect"];
}
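
// Editor's sketch (not part of the diff): distilled from the FileCheck
// patterns in the test file above, the pass rewrites loops of the form
//
//   scf.for %i = %lb to %ub step %c1 iter_args(%off = %cst_off) ... {
//     %x = amdgpu.buffer_load %base[%off] ...
//     %off_next = arith.addi %off, %uniform_step ...
//     scf.yield %off_next ...
//   }
//
// into loops that read through a loop-carried base pointer instead:
//
//   scf.for %i = %lb to %ub step %c1 iter_args(%off = %cst_off, %ptr = %base) ... {
//     %x = amdgpu.buffer_load %ptr[%cst_off] ...
//     %ptr_next = tt.addptr %ptr, %scalar_step : !tt.ptr<f16>, i32
//     scf.yield %off_next, %ptr_next ...
//   }
//
// The old offset chain is left in place; where otherwise unused it becomes
// dead, and the canonicalizer scheduled after this pass in compiler.py can
// remove it. All SSA names here are illustrative.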

def TritonAMDGPUBlockPingpong: Pass<"tritonamdgpu-block-pingpong", "mlir::ModuleOp"> {
let summary = "Interleaving instructions from two warps on the same SIMD to better utilize matrix core";

1 change: 1 addition & 0 deletions third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt
@@ -4,6 +4,7 @@ add_triton_library(TritonAMDGPUTransforms
CanonicalizePointers.cpp
CoalesceAsyncCopy.cpp
ConvertToBufferOps.cpp
OptimizeBufferOpPtr.cpp
OptimizeEpilogue.cpp
OptimizeDotOperands.cpp
HoistLayoutConversions.cpp