#2531 fails to detect layout conversions with sliced dimensions as shuffles. As a result, code like the following:

```mlir
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 2], order = [0, 1]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 2], order = [0, 1]}>
#sliced = #triton_gpu.slice<{dim = 1, parent = #blocked}>
#sliced1 = #triton_gpu.slice<{dim = 1, parent = #blocked1}>
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
  tt.func @test_f32(%arg0: tensor<32xf32, #sliced>) -> tensor<32xf32, #sliced1> {
    %0 = triton_gpu.convert_layout %arg0 : tensor<32xf32, #sliced> -> tensor<32xf32, #sliced1>
    tt.return %0 : tensor<32xf32, #sliced1>
  }
}
```

won't be lowered to shuffles, but instead to SLM (shared local memory) accesses that require synchronization.
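
For context on why this matters, below is a minimal CUDA sketch of the two lowering strategies in general terms. The kernel names and the `lane ^ 1` exchange pattern are hypothetical and not what Triton actually emits; the point is only that a shuffle exchanges values lane-to-lane in registers, while the fallback stages the data through a shared/local-memory buffer and therefore needs a barrier.

```cuda
// Illustrative only: register exchange via shuffle vs. a round trip through
// shared/local memory. Kernel names and the lane pairing are hypothetical.

// Shuffle path: each lane reads another lane's register directly,
// so no memory traffic and no barrier are required.
__global__ void exchange_via_shuffle(const float* in, float* out) {
    int lane = threadIdx.x;  // assume a single 32-lane warp per block
    float v = in[lane];
    float peer = __shfl_sync(0xffffffffu, v, lane ^ 1);  // read neighbour lane's value
    out[lane] = peer;
}

// Shared-memory path: the same exchange staged through a scratch buffer
// (SLM on Intel GPUs) needs a store, a barrier, and a load.
__global__ void exchange_via_shared(const float* in, float* out) {
    __shared__ float buf[32];
    int lane = threadIdx.x;
    buf[lane] = in[lane];
    __syncthreads();  // the synchronization the shuffle lowering avoids
    out[lane] = buf[lane ^ 1];
}
```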