Skip to content

Commit 5ed11cb

Browse files
committed
Update doc
1 parent bc5471c commit 5ed11cb

File tree

2 files changed

+7
-7
lines changed

2 files changed

+7
-7
lines changed

third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -305,8 +305,8 @@ tt.func @test(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slic
305305
```
306306
Is converted to:
307307
```mlir
308-
#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1, 1, 1, 1], threadsPerWarp = [1, 16, 1, 1, 1], warpsPerCTA = [2, 1, 1, 2, 1], order = [4, 0, 1, 2, 3]}>
309-
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16, 1], threadsPerWarp = [16, 1, 1], warpsPerCTA = [2, 1, 2], order = [2, 0, 1]}>
308+
#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1, 1, 1, 1], threadsPerWarp = [1, 16, 1, 1, 1], warpsPerCTA = [2, 1, 1, 2, 1], order = [1, 2, 3, 4, 0]}>
309+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16, 1], threadsPerWarp = [16, 1, 1], warpsPerCTA = [2, 1, 2], order = [1, 2, 0]}>
310310
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [2, 2], order = [1, 0]}>
311311
#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1], A = [8, 8], B = [8, 16], C = [8, 16]}>
312312
tt.func @test(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> {

third_party/intel/lib/TritonIntelGPUTransforms/OptimizeReductionLocality.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ static Value createReshapeForReduction(PatternRewriter &rewriter, Location loc,
6969
/// | t0 t1 t2 t3 ... tn t0 t1 t2 t3 ... tn tn1 tn2 tn3 ... tnn tn1 tn2 tn3 tn4 ... tnn |
7070
/// v t0 t1 t2 t3 ... tn t0 t1 t2 t3 ... tn tn1 tn2 tn3 ... tnn tn1 tn2 tn3 tn4 ... tnn |
7171
/// ```
72-
/// Blocked (#triton_gpu.blocked<{sizePerThread = [repCluster[0]*repeatCount, 1, 1, 1, 1], threadsPerWarp = [1, executionSize, 1, 1, 1], warpsPerCTA = [warpsPerCTA[0], 1, 1, warpsPerCTA[1], 1], order = [4, 0, 1, 2, 3]}>):
72+
/// Blocked (#triton_gpu.blocked<{sizePerThread = [repCluster[0]*repeatCount, 1, 1, 1, 1], threadsPerWarp = [1, executionSize, 1, 1, 1], warpsPerCTA = [warpsPerCTA[0], 1, 1, warpsPerCTA[1], 1], order = [1, 2, 3, 4, 0]}>):
7373
/// ```
7474
/// warpsPerCTA[3]
7575
/// <------------------------------------------------------------------------------->
@@ -113,10 +113,10 @@ static Value createReshapeForReduction(PatternRewriter &rewriter, Location loc,
113113
/// <------------------------------------>
114114
/// sizePerThread[1]
115115
/// <------------------>
116-
/// ^ t0 t0 t0 t0 ... t0 tn1 tn1 tn1 ... tn1 ^
117-
/// | t1 t1 t1 t1 ... t1 tn2 tn2 tn2 ... tn2 |
118-
/// sizePerThread[0] | t2 t2 t2 t2 ... t2 tn3 tn3 tn3 ... tn3 | warpsPerCTA[0]
119-
/// | t3 t3 t3 t3 ... t3 tn4 tn4 tn4 ... tn4 |
116+
/// ^ t0 t0 t0 t0 ... t0 tn1 tn1 tn1 ... tn1 ^
117+
/// | t1 t1 t1 t1 ... t1 tn2 tn2 tn2 ... tn2 |
118+
/// threadsPerWarp[0] | t2 t2 t2 t2 ... t2 tn3 tn3 tn3 ... tn3 | warpsPerCTA[0]
119+
/// | t3 t3 t3 t3 ... t3 tn4 tn4 tn4 ... tn4 |
120120
/// ```
121121
/// And reducing on dimension 1 and converting the layout to the original one
122122
/// leads to the same output as the original operation.

0 commit comments

Comments
 (0)