@@ -299,7 +299,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
   tt.func @test.work(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>> {
     %0 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
     ^bb0(%arg1: f32, %arg2: f32):
-      %1 = arith.maxnumf %arg1, %arg2 : f32
+      %1 = arith.addf %arg1, %arg2 : f32
       tt.reduce.return %1 : f32
     }) : (tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>
     tt.return %0 : tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>
@@ -308,26 +308,32 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
 ```
 Is converted to:
 ```mlir
-#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
-#blocked = #triton_gpu.blocked<{sizePerThread = [1, 16, 1], threadsPerWarp = [16, 1, 1], warpsPerCTA = [2, 2, 1], order = [0, 2, 1], CTAsPerCGA = [1, 1, 1], CTASplitNum = [1, 1, 1], CTAOrder = [0, 2, 1]}>
-#blocked2 = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
-
+#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1, 1, 1, 1], threadsPerWarp = [1, 16, 1, 1, 1], warpsPerCTA = [2, 1, 1, 2, 1], order = [4, 0, 1, 2, 3]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16, 1], threadsPerWarp = [16, 1, 1], warpsPerCTA = [2, 1, 2], order = [2, 0, 1]}>
+#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [2, 2], order = [1, 0]}>
+#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1], A = [8, 8], B = [8, 16], C = [8, 16]}>
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "xpu", "triton_gpu.threads-per-warp" = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} {
-  tt.func @test.work(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>> {
-    %0 = tt.reshape %arg0 {allow_reorder = true} : tensor<32x32xf32, #mma> -> tensor<32x32x1xf32, #blocked>
-    %1 = "tt.reduce"(%0) <{axis = 2 : i32}> ({
+  tt.func @test_two_warps_twice(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> {
+    %0 = tt.reshape %arg0 {allow_reorder = true} : tensor<32x32xf32, #mma> -> tensor<32x16x1x2x1xf32, #blocked>
+    %1 = "tt.reduce"(%0) <{axis = 4 : i32}> ({
+    ^bb0(%arg1: f32, %arg2: f32):
+      %7 = arith.addf %arg1, %arg2 : f32
+      tt.reduce.return %7 : f32
+    }) : (tensor<32x16x1x2x1xf32, #blocked>) -> tensor<32x16x1x2xf32, #triton_gpu.slice<{dim = 4, parent = #blocked}>>
+    %2 = "tt.reduce"(%1) <{axis = 2 : i32}> ({
+    ^bb0(%arg1: f32, %arg2: f32):
+      %7 = arith.addf %arg1, %arg2 : f32
+      tt.reduce.return %7 : f32
+    }) : (tensor<32x16x1x2xf32, #triton_gpu.slice<{dim = 4, parent = #blocked}>>) -> tensor<32x16x2xf32, #triton_gpu.slice<{dim = 2, parent = #triton_gpu.slice<{dim = 4, parent = #blocked}>}>>
+    %3 = triton_gpu.convert_layout %2 : tensor<32x16x2xf32, #triton_gpu.slice<{dim = 2, parent = #triton_gpu.slice<{dim = 4, parent = #blocked}>}>> -> tensor<32x16x2xf32, #blocked1>
+    %4 = tt.reshape %3 {allow_reorder = true} : tensor<32x16x2xf32, #blocked1> -> tensor<32x32xf32, #blocked2>
+    %5 = "tt.reduce"(%4) <{axis = 1 : i32}> ({
     ^bb0(%arg1: f32, %arg2: f32):
-      %2 = arith.maxnumf %arg1, %arg2 : f32
-      tt.reduce.return %2 : f32
-    }) : (tensor<32x32x1xf32, #blocked>) -> tensor<32x32xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>
-    %3 = triton_gpu.convert_layout %1 : tensor<32x32xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<32x32xf32, #blocked2>
-    %4 = "tt.reduce"(%3) <{axis = 0 : i32}> ({
-    ^bb0(%arg3: f32, %arg4: f32):
-      %5 = arith.maxnumf %arg3, %arg4 : f32
-      tt.reduce.return %5 : f32
-    }) : (tensor<32x32xf32, #blocked2>) -> tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
-    %6 = triton_gpu.convert_layout %4 : tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> -> tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>
-    tt.return %6 : tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>
+      %7 = arith.addf %arg1, %arg2 : f32
+      tt.reduce.return %7 : f32
+    }) : (tensor<32x32xf32, #blocked2>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
+    %6 = triton_gpu.convert_layout %5 : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
+    tt.return %6 : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
   }
 }
 ```
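For context, a source-level Triton kernel along the lines of the sketch below (hypothetical, not part of this commit) produces a `tt.reduce` with an `arith.addf` combiner region like the input IR in the example: `tl.sum` over a 2D `f32` tile lowers to this kind of reduction. The kernel name, pointer layout, and device string are illustrative assumptions.

```python
# Minimal sketch, assuming a Triton install with an XPU (or CUDA) backend.
import torch
import triton
import triton.language as tl

@triton.jit
def col_sum_kernel(x_ptr, out_ptr, BLOCK: tl.constexpr):
    rows = tl.arange(0, BLOCK)[:, None]       # row offsets of the 2D tile
    cols = tl.arange(0, BLOCK)[None, :]       # column offsets
    x = tl.load(x_ptr + rows * BLOCK + cols)  # load a BLOCK x BLOCK tile
    # tl.sum lowers to "tt.reduce" with an arith.addf combiner region,
    # as in the MLIR example above.
    s = tl.sum(x, axis=0)
    tl.store(out_ptr + tl.arange(0, BLOCK), s)

x = torch.randn(32, 32, device="xpu")         # "cuda" on NVIDIA hardware
out = torch.empty(32, device=x.device)
col_sum_kernel[(1,)](x, out, BLOCK=32)
```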