Commit 2c1505e

Avoid redundant reduction step. (#4774)
A redundant reduction step over the partial accumulators could be executed, causing inefficiency and unguarded shared memory access.

Signed-off-by: Ilya Enkovich <[email protected]>
1 parent 83367a9 commit 2c1505e

File tree

2 files changed, +23 -2 lines changed

test/TritonIntelGPU/tritongpu_reduce_op_lowering.mlir

Lines changed: 22 additions & 1 deletion
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm
+// RUN: triton-opt %s --split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm
 
 // COM: Tests reduction when threads_per_warp < num_warps.
 
@@ -35,3 +35,24 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 64 : i32, "ttg.th
     tt.return
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [32, 1], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 32 : i32, ttig.min_sg_size = 32 : i32} {
+  tt.func public @test_reduce(%arg0: tensor<32x128xf32, #blocked>) -> tensor<128xf32, #ttg.slice<{dim = 0, parent = #blocked}>> {
+    // CHECK: llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFAddiif
+    // CHECK: llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFAddiif
+    // CHECK: llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFAddiif
+    // CHECK: llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFAddiif
+    // CHECK: llvm.store
+    // CHECK-NOT: llvm.load
+    // CHECK: llvm.call spir_funccc @_Z7barrierj
+    %1 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
+    ^bb0(%arg2: f32, %arg3: f32):
+      %2 = arith.addf %arg2, %arg3 : f32
+      tt.reduce.return %2 : f32
+    }) {allocation.offset = 0 : i32} : (tensor<32x128xf32, #blocked>) -> tensor<128xf32, #ttg.slice<{dim = 0, parent = #blocked}>>
+    tt.return %1 : tensor<128xf32, #ttg.slice<{dim = 0, parent = #blocked}>>
+  }
+}

third_party/intel/lib/TritonIntelGPUToLLVM/ReduceOpToLLVM.cpp

Lines changed: 1 addition & 1 deletion
@@ -292,7 +292,7 @@ struct ReduceOpConversion
     // size is [elems / sizeInterWarps, N] -> [elems / sizeInterWarps, ceil(N,
     // numLanes)] in each reduce iteration.
     unsigned problemBatchSize = elems / sizeInterWarps;
-    for (unsigned problemSize = sizeInterWarps; problemSize > 0;
+    for (unsigned problemSize = sizeInterWarps; problemSize > 1;
          problemSize = problemSize / numLanes) {
       unsigned reduceLaneNumber = std::min(problemSize, numLanes);
       unsigned totalProblemSizePerIter = problemSize * problemBatchSize;
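
To make the effect of the tightened loop bound concrete: in the configuration exercised by the new test (32 warps of 32 lanes, so sizeInterWarps = 32 and numLanes = 32), the old condition problemSize > 0 admits a second pass in which a single partial accumulator is "reduced" on its own, which is the redundant step the commit message blames for the inefficiency and the unguarded shared memory access. The following is a minimal standalone C++ sketch, not part of the Triton sources, that replays only the loop bounds; the variable names mirror the hunk above, and the 32/32 values are taken from the new test case.

#include <algorithm>
#include <cstdio>

// Replay only the loop bounds from the hunk above (no GPU work). `stopAt`
// stands in for the old (0) or new (1) exit condition.
static void replayLoop(unsigned sizeInterWarps, unsigned numLanes,
                       unsigned stopAt) {
  for (unsigned problemSize = sizeInterWarps; problemSize > stopAt;
       problemSize = problemSize / numLanes) {
    unsigned reduceLaneNumber = std::min(problemSize, numLanes);
    std::printf("  problemSize = %u, reduceLaneNumber = %u\n", problemSize,
                reduceLaneNumber);
  }
}

int main() {
  std::printf("old condition (problemSize > 0):\n");
  replayLoop(32, 32, 0); // two passes: 32 lanes, then a pointless pass over 1
  std::printf("new condition (problemSize > 1):\n");
  replayLoop(32, 32, 1); // a single pass over 32 lanes; the extra pass is gone
  return 0;
}

This matches the FileCheck expectations in the new test: four @_Z27__spirv_GroupNonUniformFAddiif calls and one llvm.store followed by the barrier, with no intervening llvm.load from the eliminated second pass.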
