Commit 807676f

Update doc
1 parent 87e7af1 commit 807676f

1 file changed: +25 -19

third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td

@@ -299,7 +299,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
   tt.func @test.work(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>> {
     %0 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
     ^bb0(%arg1: f32, %arg2: f32):
-      %1 = arith.maxnumf %arg1, %arg2 : f32
+      %1 = arith.addf %arg1, %arg2 : f32
       tt.reduce.return %1 : f32
     }) : (tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>
     tt.return %0 : tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>
@@ -308,26 +308,32 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
 ```
 Is converted to:
 ```mlir
-#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
-#blocked = #triton_gpu.blocked<{sizePerThread = [1, 16, 1], threadsPerWarp = [16, 1, 1], warpsPerCTA = [2, 2, 1], order = [0, 2, 1], CTAsPerCGA = [1, 1, 1], CTASplitNum = [1, 1, 1], CTAOrder = [0, 2, 1]}>
-#blocked2 = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
-
+#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1, 1, 1, 1], threadsPerWarp = [1, 16, 1, 1, 1], warpsPerCTA = [2, 1, 1, 2, 1], order = [4, 0, 1, 2, 3]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16, 1], threadsPerWarp = [16, 1, 1], warpsPerCTA = [2, 1, 2], order = [2, 0, 1]}>
+#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [2, 2], order = [1, 0]}>
+#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1], A = [8, 8], B = [8, 16], C = [8, 16]}>
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "xpu", "triton_gpu.threads-per-warp" = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} {
-  tt.func @test.work(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>> {
-    %0 = tt.reshape %arg0 {allow_reorder = true} : tensor<32x32xf32, #mma> -> tensor<32x32x1xf32, #blocked>
-    %1 = "tt.reduce"(%0) <{axis = 2 : i32}> ({
+  tt.func @test_two_warps_twice(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> {
+    %0 = tt.reshape %arg0 {allow_reorder = true} : tensor<32x32xf32, #mma> -> tensor<32x16x1x2x1xf32, #blocked>
+    %1 = "tt.reduce"(%0) <{axis = 4 : i32}> ({
+    ^bb0(%arg1: f32, %arg2: f32):
+      %7 = arith.addf %arg1, %arg2 : f32
+      tt.reduce.return %7 : f32
+    }) : (tensor<32x16x1x2x1xf32, #blocked>) -> tensor<32x16x1x2xf32, #triton_gpu.slice<{dim = 4, parent = #blocked}>>
+    %2 = "tt.reduce"(%1) <{axis = 2 : i32}> ({
+    ^bb0(%arg1: f32, %arg2: f32):
+      %7 = arith.addf %arg1, %arg2 : f32
+      tt.reduce.return %7 : f32
+    }) : (tensor<32x16x1x2xf32, #triton_gpu.slice<{dim = 4, parent = #blocked}>>) -> tensor<32x16x2xf32, #triton_gpu.slice<{dim = 2, parent = #triton_gpu.slice<{dim = 4, parent = #blocked}>}>>
+    %3 = triton_gpu.convert_layout %2 : tensor<32x16x2xf32, #triton_gpu.slice<{dim = 2, parent = #triton_gpu.slice<{dim = 4, parent = #blocked}>}>> -> tensor<32x16x2xf32, #blocked1>
+    %4 = tt.reshape %3 {allow_reorder = true} : tensor<32x16x2xf32, #blocked1> -> tensor<32x32xf32, #blocked2>
+    %5 = "tt.reduce"(%4) <{axis = 1 : i32}> ({
     ^bb0(%arg1: f32, %arg2: f32):
-      %2 = arith.maxnumf %arg1, %arg2 : f32
-      tt.reduce.return %2 : f32
-    }) : (tensor<32x32x1xf32, #blocked>) -> tensor<32x32xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>
-    %3 = triton_gpu.convert_layout %1 : tensor<32x32xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<32x32xf32, #blocked2>
-    %4 = "tt.reduce"(%3) <{axis = 0 : i32}> ({
-    ^bb0(%arg3: f32, %arg4: f32):
-      %5 = arith.maxnumf %arg3, %arg4 : f32
-      tt.reduce.return %5 : f32
-    }) : (tensor<32x32xf32, #blocked2>) -> tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
-    %6 = triton_gpu.convert_layout %4 : tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> -> tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>
-    tt.return %6 : tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>
+      %7 = arith.addf %arg1, %arg2 : f32
+      tt.reduce.return %7 : f32
+    }) : (tensor<32x32xf32, #blocked2>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
+    %6 = triton_gpu.convert_layout %5 : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
+    tt.return %6 : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
   }
 }
 ```
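For context, the `tt.reduce` over a `#mma` (DPAS) tensor shown in the "before" IR is what the Intel backend produces for a Triton-level reduction of a `tl.dot` result. The sketch below is a hypothetical kernel that would yield IR of roughly this shape; the kernel name, pointer arguments, and addressing are assumptions for illustration and are not part of this commit.

```python
import triton
import triton.language as tl

@triton.jit
def dot_col_sum_kernel(a_ptr, b_ptr, out_ptr, BLOCK: tl.constexpr):
    # Load two BLOCK x BLOCK f32 tiles; BLOCK = 32 matches the
    # tensor<32x32xf32> in the example above.
    offs = tl.arange(0, BLOCK)
    a = tl.load(a_ptr + offs[:, None] * BLOCK + offs[None, :])
    b = tl.load(b_ptr + offs[:, None] * BLOCK + offs[None, :])
    # On the Intel XPU backend the tl.dot result carries the DPAS (#mma)
    # layout, so the reduction below operates on a #mma-layout tensor.
    c = tl.dot(a, b)
    # tl.sum over axis 0 lowers to "tt.reduce" <{axis = 0 : i32}> with an
    # arith.addf combiner, as in the "before" IR of this example.
    col_sums = tl.sum(c, axis=0)
    tl.store(out_ptr + offs, col_sums)
```

Launched as, e.g., `dot_col_sum_kernel[(1,)](a, b, out, BLOCK=32)`, the reduction over the DPAS-layout `c` is what the documented pass rewrites into the reshape / per-warp-reduce / convert_layout sequence shown in the "after" IR.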
