@@ -294,16 +294,13 @@ def TritonIntelGPUOptimizeReductionLocality
 `triton_gpu.convert_layout` operations, e.g.:
 ```mlir
 #mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "xpu", "triton_gpu.threads-per-warp" = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} {
-  tt.func @test.work(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>> {
-    %0 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
-    ^bb0(%arg1: f32, %arg2: f32):
-      %1 = arith.addf %arg1, %arg2 : f32
-      tt.reduce.return %1 : f32
-    }) : (tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>
-    tt.return %0 : tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>
-  }
+tt.func @test(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>> {
+  %0 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
+  ^bb0(%arg1: f32, %arg2: f32):
+    %1 = arith.addf %arg1, %arg2 : f32
+    tt.reduce.return %1 : f32
+  }) : (tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>
+  tt.return %0 : tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>
 }
 ```
 Is converted to:
@@ -312,29 +309,27 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
 #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16, 1], threadsPerWarp = [16, 1, 1], warpsPerCTA = [2, 1, 2], order = [2, 0, 1]}>
 #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [2, 2], order = [1, 0]}>
 #mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1], A = [8, 8], B = [8, 16], C = [8, 16]}>
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "xpu", "triton_gpu.threads-per-warp" = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} {
-  tt.func @test_two_warps_twice(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> {
-    %0 = tt.reshape %arg0 {allow_reorder = true} : tensor<32x32xf32, #mma> -> tensor<32x16x1x2x1xf32, #blocked>
-    %1 = "tt.reduce"(%0) <{axis = 4 : i32}> ({
-    ^bb0(%arg1: f32, %arg2: f32):
-      %7 = arith.addf %arg1, %arg2 : f32
-      tt.reduce.return %7 : f32
-    }) : (tensor<32x16x1x2x1xf32, #blocked>) -> tensor<32x16x1x2xf32, #triton_gpu.slice<{dim = 4, parent = #blocked}>>
-    %2 = "tt.reduce"(%1) <{axis = 2 : i32}> ({
-    ^bb0(%arg1: f32, %arg2: f32):
-      %7 = arith.addf %arg1, %arg2 : f32
-      tt.reduce.return %7 : f32
-    }) : (tensor<32x16x1x2xf32, #triton_gpu.slice<{dim = 4, parent = #blocked}>>) -> tensor<32x16x2xf32, #triton_gpu.slice<{dim = 2, parent = #triton_gpu.slice<{dim = 4, parent = #blocked}>}>>
-    %3 = triton_gpu.convert_layout %2 : tensor<32x16x2xf32, #triton_gpu.slice<{dim = 2, parent = #triton_gpu.slice<{dim = 4, parent = #blocked}>}>> -> tensor<32x16x2xf32, #blocked1>
-    %4 = tt.reshape %3 {allow_reorder = true} : tensor<32x16x2xf32, #blocked1> -> tensor<32x32xf32, #blocked2>
-    %5 = "tt.reduce"(%4) <{axis = 1 : i32}> ({
-    ^bb0(%arg1: f32, %arg2: f32):
-      %7 = arith.addf %arg1, %arg2 : f32
-      tt.reduce.return %7 : f32
-    }) : (tensor<32x32xf32, #blocked2>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
-    %6 = triton_gpu.convert_layout %5 : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-    tt.return %6 : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-  }
+tt.func @test(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> {
+  %0 = tt.reshape %arg0 {allow_reorder = true} : tensor<32x32xf32, #mma> -> tensor<32x16x1x2x1xf32, #blocked>
+  %1 = "tt.reduce"(%0) <{axis = 4 : i32}> ({
+  ^bb0(%arg1: f32, %arg2: f32):
+    %7 = arith.addf %arg1, %arg2 : f32
+    tt.reduce.return %7 : f32
+  }) : (tensor<32x16x1x2x1xf32, #blocked>) -> tensor<32x16x1x2xf32, #triton_gpu.slice<{dim = 4, parent = #blocked}>>
+  %2 = "tt.reduce"(%1) <{axis = 2 : i32}> ({
+  ^bb0(%arg1: f32, %arg2: f32):
+    %7 = arith.addf %arg1, %arg2 : f32
+    tt.reduce.return %7 : f32
+  }) : (tensor<32x16x1x2xf32, #triton_gpu.slice<{dim = 4, parent = #blocked}>>) -> tensor<32x16x2xf32, #triton_gpu.slice<{dim = 2, parent = #triton_gpu.slice<{dim = 4, parent = #blocked}>}>>
+  %3 = triton_gpu.convert_layout %2 : tensor<32x16x2xf32, #triton_gpu.slice<{dim = 2, parent = #triton_gpu.slice<{dim = 4, parent = #blocked}>}>> -> tensor<32x16x2xf32, #blocked1>
+  %4 = tt.reshape %3 {allow_reorder = true} : tensor<32x16x2xf32, #blocked1> -> tensor<32x32xf32, #blocked2>
+  %5 = "tt.reduce"(%4) <{axis = 1 : i32}> ({
+  ^bb0(%arg1: f32, %arg2: f32):
+    %7 = arith.addf %arg1, %arg2 : f32
+    tt.reduce.return %7 : f32
+  }) : (tensor<32x32xf32, #blocked2>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
+  %6 = triton_gpu.convert_layout %5 : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
+  tt.return %6 : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
 }
 ```
 The `tt.reshape` operation is a NOP so that the following `tt.reduce`