Skip to content

Commit 33b2823

Browse files
authored
[AMD][Gluon] support inline when amdgpu ops in the callee (#8041)
Before this change, inlining failed when the callee kernel contained amdgpu ops. The `gluon-inline` pass runs after ttgir is generated by the frontend, so `run_parser` in test_frontend does not trigger it. Therefore, a runtime test using `warmup` is added.
1 parent 9801a7a commit 33b2823

File tree

2 files changed

+26
-0
lines changed

2 files changed

+26
-0
lines changed

python/test/gluon/test_core.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -811,3 +811,26 @@ def kernel(N, out):
811811
out = torch.empty(1, dtype=torch.int32, device="cuda")
812812
compiled_kernel = kernel.warmup(N=100, out=out, grid=(1, ))
813813
assert compiled_kernel.asm["llir"].count("define") == 1
814+
815+
816+
@pytest.mark.skipif(not (is_hip_cdna3() or is_hip_cdna4()), reason="Requires CDNA3 or CDNA4")
def test_inline_with_amdgpu_dialect():
    """Check that a jitted callee containing AMD buffer ops is inlined into its caller.

    The kernel is compiled through ``warmup`` rather than only parsed, and the
    resulting ``ttgir`` text is inspected: it must contain no
    ``tt.func private`` definition.
    """

    @gluon.jit
    def buffer_load(x, offsets):
        # Callee whose body is a single AMD buffer op.
        return ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets)

    @gluon.jit
    def kernel(x, y):
        layout: ttgl.constexpr = ttgl.BlockedLayout(size_per_thread=[1], threads_per_warp=[64], warps_per_cta=[4],
                                                    order=[0])
        offsets = ttgl.arange(0, 64, layout=layout)

        # Going through the jitted helper creates the call that is expected to be inlined.
        a = buffer_load(x, offsets)
        ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets)

    src = torch.arange(64, device="cuda").to(torch.int32)
    dst = torch.empty_like(src)

    compiled_kernel = kernel.warmup(src, dst, grid=(1, ))
    ttgir = compiled_kernel.asm["ttgir"]
    # A leftover private function would presumably mean the callee was not inlined.
    assert ttgir.count("tt.func private") == 0

third_party/amd/lib/Dialect/TritonAMDGPU/IR/Dialect.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "mlir/IR/OpImplementation.h"
2828
#include "third_party/amd/include/Utils/Utility.h"
2929
#include "triton/Conversion/TritonGPUToLLVM/Utility.h"
30+
#include "triton/Dialect/Triton/IR/Interfaces.h"
3031
#include "triton/Tools/LayoutUtils.h"
3132
#include "llvm/ADT/TypeSwitch.h"
3233

@@ -50,6 +51,8 @@ void mlir::triton::amdgpu::TritonAMDGPUDialect::initialize() {
5051
#define GET_OP_LIST
5152
#include "Dialect/TritonAMDGPU/IR/Ops.cpp.inc"
5253
>();
54+
55+
addInterfaces<TritonInlinerInterface>();
5356
}
5457

5558
#include "Dialect/TritonAMDGPU/IR/TritonAMDGPUEnums.cpp.inc"

0 commit comments

Comments
 (0)