
Commit 084d620

[Triton][Gluon] Run the inliner after scf-to-cf (#8017)
The expectation is that functions with tensor-typed arguments will be inlined, and this is enforced in the frontend. However, if the callee has early returns, it doesn't get inlined, which later causes crashes in interprocedural dataflow analyses (AxisInfo). @ThomasRaoux's suggestion is just to run the inliner again after scf-to-cf.
1 parent 8a34c21 commit 084d620
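
To illustrate the failing pattern, here is a minimal sketch modeled on the test added below (the import path is an assumption, taken from how the Gluon tests are typically written): a callee with a data-dependent return has two exit points, so it survives the first inliner run and only becomes inlinable once scf-to-cf has lowered its structured control flow to plain branches.

from triton.experimental import gluon  # assumed import path, as in the Gluon tests

@gluon.jit
def callee_with_early_return(x):
    # Two exit points: until scf-to-cf runs, this body lowers to structured
    # control flow that the inliner declines to merge into the caller.
    if x.sum(0).sum(0):
        return x
    return x + x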

4 files changed (+32, -1)

lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp

Lines changed: 2 additions & 1 deletion
@@ -30,6 +30,7 @@
 #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOpInterfaces.cpp.inc"
 #include "triton/Dialect/TritonNvidiaGPU/Transforms/Utility.h"
+#include "llvm/Support/ErrorHandling.h"
 
 using namespace mlir::triton::gpu;
 
@@ -277,7 +278,7 @@ static std::string strMMADTypeKind(MMADTypeKind kind) {
   case MMADTypeKind::i8:
     return "i8";
   }
-  __builtin_unreachable();
+  llvm_unreachable("unknown mma dtype kind");
 }
 
 static std::optional<std::pair<MMADTypeKind, SmallVector<Type>>>

python/test/gluon/test_core.py

Lines changed: 28 additions & 0 deletions
@@ -783,3 +783,31 @@ def tmem_copy_no_scales(in_ptr, out_ptr, M: ttgl.constexpr, N: ttgl.constexpr, B
 
     tmem_copy_no_scales[(1, )](input, output, M, N, BLOCK_N, swizzle, num_warps=num_warps)
     assert (output == input).all()
+
+
+@gluon.jit
+def early_return_kernel(x):
+    if x.sum(0).sum(0):
+        return x
+    x = x + x
+    return x
+
+
+def test_2d_tensor_early_return():
+    warp_size = ttgl.constexpr(THREADS_PER_WARP)
+
+    @gluon.jit
+    def kernel(N, out):
+        layout: ttgl.constexpr = ttgl.BlockedLayout([1, 1], [1, warp_size], [1, 4], [1, 0])
+        BLOCK: ttgl.constexpr = 32
+
+        x0 = ttgl.arange(0, BLOCK, layout=ttgl.SliceLayout(1, layout))
+        x1 = ttgl.arange(0, BLOCK, layout=ttgl.SliceLayout(0, layout))
+        x = x0[:, None] * x1[None, :]
+        for i in range(N):
+            x += early_return_kernel(x)
+        ttgl.store(out, x.sum(0).sum(0))
+
+    out = torch.empty(1, dtype=torch.int32, device="cuda")
+    compiled_kernel = kernel.warmup(N=100, out=out, grid=(1, ))
+    assert compiled_kernel.asm["llir"].count("define") == 1

third_party/amd/backend/compiler.py

Lines changed: 1 addition & 0 deletions
@@ -288,6 +288,7 @@ def make_llir(src, metadata, options):
         custom_lds_size = 0
         amd.passes.ttgpuir.add_optimize_lds_usage(pm, options.arch, custom_lds_size)
         passes.convert.add_scf_to_cf(pm)
+        passes.gluon.add_inliner(pm)
         passes.convert.add_index_to_llvmir(pm)
 
         amd.passes.ttgpuir.add_allocate_shared_memory(pm)

third_party/nvidia/backend/compiler.py

Lines changed: 1 addition & 0 deletions
@@ -349,6 +349,7 @@ def make_llir(self, src, metadata, options, capability):
         passes.ttgpuir.add_combine_tensor_select_and_if(pm)
         passes.ttgpuir.add_allocate_warp_groups(pm)
         passes.convert.add_scf_to_cf(pm)
+        passes.gluon.add_inliner(pm)
         nvidia.passes.ttgpuir.add_allocate_shared_memory_nv(pm, capability, ptx_version)
         nvidia.passes.ttnvgpuir.add_allocate_tensor_memory(pm)
         if knobs.compilation.enable_experimental_consan:
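
Taken together, the two backend diffs above give the lowering order sketched below (a non-authoritative outline; `pm` and `passes` stand for the pass-manager and pass bindings these backends already use, and the surrounding passes are elided):

def run_late_inliner(pm, passes):
    # scf-to-cf rewrites structured scf.if/scf.for regions into plain
    # cf branches, so a multi-exit callee becomes an ordinary function...
    passes.convert.add_scf_to_cf(pm)
    # ...which a second inliner run can now merge into its caller,
    # keeping tensor-typed arguments out of the final LLVM IR.
    passes.gluon.add_inliner(pm)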
