Commit 0928f46
[MLIR][GPU] Ensure all lanes in cluster have final reduction value (#165764)
This is a fix for a cluster size of 32 when the subgroup size is 64. Previously, only lanes [16, 32) ∪ [48, 64) contained the correct clusterwise reduction value. This PR adds a swizzle instruction to broadcast the correct value down to lanes [0, 16) ∪ [32, 48).
1 parent 7eef868 commit 0928f46
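To make the broken case concrete, the lane bookkeeping can be replayed outside the compiler. Below is a small C++ model (illustrative only, not part of this patch) that tracks, as a bitmask, which source lanes each lane's partial result covers after the gfx9 row reduction and the masked row_bcast_15 step:

#include <array>
#include <cstdio>

// Model each lane's partial result as a bitmask of the source lanes it
// covers. Assumes a 64-lane subgroup (wave64); illustrative only.
using Mask = unsigned long long;

int main() {
  std::array<Mask, 64> acc;
  // After quad_perm x2, row_half_mirror, and row_mirror, every lane holds
  // the reduction over its own 16-lane row.
  for (int i = 0; i < 64; ++i)
    acc[i] = Mask(0xffff) << (i / 16 * 16);

  // row_bcast_15 with row_mask = 0xa writes only rows 1 and 3: row 1
  // accumulates lane 15 (last lane of row 0), row 3 accumulates lane 47
  // (last lane of row 2).
  std::array<Mask, 64> next = acc;
  for (int i = 16; i < 32; ++i) next[i] |= acc[15];
  for (int i = 48; i < 64; ++i) next[i] |= acc[47];

  // Lanes [16, 32) and [48, 64) now cover a full 32-lane cluster;
  // lanes [0, 16) and [32, 48) still cover only their own row.
  for (int i : {0, 16, 32, 48})
    std::printf("lane %2d covers %016llx\n", i, next[i]);
  return 0;
}

Lanes 0 and 32 still cover only their own 16-lane rows, which is exactly the gap the new swizzle closes.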

2 files changed: +109 -91 lines

mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp

Lines changed: 27 additions & 1 deletion
@@ -416,13 +416,39 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
   if (ci.clusterSize >= 32) {
     if (chipset.majorVersion <= 9) {
       // Broadcast last value from each row to next row.
-      // Use row mask to avoid polluting rows 1 and 3.
+      // Use row mask to avoid polluting row 0 (and row 2 if wave-64).
       dpp = amdgpu::DPPOp::create(rewriter, loc, res.getType(), res, res,
                                   amdgpu::DPPPerm::row_bcast_15,
                                   rewriter.getUnitAttr(), 0xa, allBanks,
                                   /*bound_ctrl*/ false);
       res = vector::makeArithReduction(
           rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
+
+      // For subgroupSize = 64, at this point lanes [16, 32) contain the full
+      // reduction over lanes [0, 32), but lanes [0, 16) do not. Similarly,
+      // lanes [48, 64) contain the full reduction over lanes [32, 64), but
+      // lanes [32, 48) do not.
+      //
+      // If subgroup size is 64 and cluster size is 64, we don't need lanes [0,
+      // 16) and [32, 48) to have the correct cluster-32 reduction values at
+      // this point, because only lane 63's value will ultimately be read in
+      // this full-cluster case.
+      //
+      // If subgroup size is 64 and cluster size is 32, we need to ensure that
+      // lanes [0, 16) and [32, 48) have the correct final cluster-32 reduction
+      // values (subgroup_reduce guarantees that all lanes within each cluster
+      // contain the final reduction value). We do this by broadcasting lane
+      // 31's value to lanes [0, 16) and lane 63's value to lanes [32, 48).
+      //
+      // See https://gpuopen.com/learn/amd-gcn-assembly-cross-lane-operations
+      // for an illustration of how this within-cluster broadcast works with a
+      // swizzle.
+      if (ci.subgroupSize == 64 && ci.clusterSize == 32) {
+        res =
+            amdgpu::SwizzleBitModeOp::create(rewriter, loc, res, /*and_mask=*/0,
+                                             /*or_mask=*/31,
+                                             /*xor_mask=*/0);
+      }
     } else if (chipset.majorVersion <= 12) {
       // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
       Value uint32Max = arith::ConstantOp::create(
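For reference, the lane selection performed by the added swizzle can be sketched as follows, assuming the standard ds_swizzle bit-mode formula applied independently to each 32-lane half of a wave-64 (the helper below is illustrative, not an MLIR or ROCm API):

#include <cstdio>

// Source lane selected by swizzle bit-mode, assuming the usual formula
// applied within each 32-lane group:
//   src = ((lane & and_mask) | or_mask) ^ xor_mask
int swizzleSrcLane(int lane, int andMask, int orMask, int xorMask) {
  int group = lane & ~31;  // 32-lane group base (a wave64 has two groups)
  int idx = lane & 31;
  return group | (((idx & andMask) | orMask) ^ xorMask);
}

int main() {
  // The patch uses and_mask = 0, or_mask = 31, xor_mask = 0: every lane
  // reads lane 31 of its half, so lanes [0, 32) receive lane 31's
  // cluster-32 result and lanes [32, 64) receive lane 63's.
  for (int lane : {0, 15, 16, 31, 32, 47, 48, 63})
    std::printf("lane %2d <- lane %2d\n", lane,
                swizzleSrcLane(lane, 0, 31, 0));
  return 0;
}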

mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir

Lines changed: 82 additions & 90 deletions
@@ -8,11 +8,11 @@
 
 // RUN: mlir-opt --allow-unregistered-dialect \
 // RUN:   --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx942" %s \
-// RUN:   | FileCheck %s --check-prefix=CHECK-GFX9
+// RUN:   | FileCheck %s --check-prefixes=CHECK-GFX,CHECK-GFX9
 
 // RUN: mlir-opt --allow-unregistered-dialect \
 // RUN:   --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx1030" %s \
-// RUN:   | FileCheck %s --check-prefix=CHECK-GFX10
+// RUN:   | FileCheck %s --check-prefixes=CHECK-GFX,CHECK-GFX10
 
 // CHECK-SUB: gpu.module @kernels {
 // CHECK-SHFL: gpu.module @kernels {
@@ -24,8 +24,7 @@ gpu.module @kernels {
   // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<5xf16>)
   //
   // CHECK-SHFL-LABEL: gpu.func @kernel0(
-  // CHECK-GFX9-LABEL: gpu.func @kernel0(
-  // CHECK-GFX10-LABEL: gpu.func @kernel0(
+  // CHECK-GFX-LABEL: gpu.func @kernel0(
   gpu.func @kernel0(%arg0: vector<5xf16>) kernel {
     // CHECK-SUB: %[[VZ:.+]] = arith.constant dense<0.0{{.*}}> : vector<5xf16>
     // CHECK-SUB: %[[E0:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [0], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16>
@@ -56,8 +55,7 @@ gpu.module @kernels {
 
     // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} cluster(size = 4)
     // CHECK-SUB: "test.consume"
-    // CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}}
-    // CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}}
+    // CHECK-GFX-COUNT-2: amdgpu.dpp {{.+}}
     %sum2 = gpu.subgroup_reduce mul %arg0 cluster(size = 4) : (vector<5xf16>) -> (vector<5xf16>)
     "test.consume"(%sum2) : (vector<5xf16>) -> ()
 
@@ -74,8 +72,7 @@ gpu.module @kernels {
   // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<1xf32>)
   //
   // CHECK-SHFL-LABEL: gpu.func @kernel1(
-  // CHECK-GFX9-LABEL: gpu.func @kernel1(
-  // CHECK-GFX10-LABEL: gpu.func @kernel1(
+  // CHECK-GFX-LABEL: gpu.func @kernel1(
   gpu.func @kernel1(%arg0: vector<1xf32>) kernel {
     // CHECK-SUB: %[[E0:.+]] = vector.extract %[[ARG0]][0] : f32 from vector<1xf32>
     // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32
@@ -100,17 +97,14 @@ gpu.module @kernels {
     // Note stride is dropped because it is == 1.
     // CHECK-SUB: gpu.subgroup_reduce add {{.+}} cluster(size = 8) : (f32) -> f32
     // CHECK-SUB: "test.consume"
-    // CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}} quad_perm
-    // CHECK-GFX9: amdgpu.dpp {{.+}} row_half_mirror
-    // CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}} quad_perm
-    // CHECK-GFX10: amdgpu.dpp {{.+}} row_half_mirror
+    // CHECK-GFX-COUNT-2: amdgpu.dpp {{.+}} quad_perm
+    // CHECK-GFX: amdgpu.dpp {{.+}} row_half_mirror
     %sum2 = gpu.subgroup_reduce add %arg0 cluster(size = 8, stride = 1) : (vector<1xf32>) -> (vector<1xf32>)
     "test.consume"(%sum2) : (vector<1xf32>) -> ()
 
     // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform cluster(size = 8, stride = 4) : (f32) -> f32
     // CHECK-SUB: "test.consume"
-    // CHECK-GFX9-NOT: amdgpu.dpp
-    // CHECK-GFX10-NOT: amdgpu.dpp
+    // CHECK-GFX-NOT: amdgpu.dpp
     // CHECK-GFX10-NOT: rocdl.permlanex16
     %sum3 = gpu.subgroup_reduce add %arg0 uniform cluster(size = 8, stride = 4) : (vector<1xf32>) -> (vector<1xf32>)
     "test.consume"(%sum3) : (vector<1xf32>) -> ()
@@ -126,11 +120,8 @@ gpu.module @kernels {
   //
   // CHECK-SHFL-LABEL: gpu.func @kernel2(
   //
-  // CHECK-GFX9-LABEL: gpu.func @kernel2(
-  // CHECK-GFX9-NOT: amdgpu.dpp
-  //
-  // CHECK-GFX10-LABEL: gpu.func @kernel2(
-  // CHECK-GFX10-NOT: amdgpu.dpp
+  // CHECK-GFX-LABEL: gpu.func @kernel2(
+  // CHECK-GFX-NOT: amdgpu.dpp
   gpu.func @kernel2(%arg0: vector<3xi8>, %arg1: vector<4xi8>) kernel {
     // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[ARG0]] : (vector<3xi8>) -> vector<3xi8>
     // CHECK-SUB: "test.consume"(%[[R0]]) : (vector<3xi8>) -> ()
@@ -148,8 +139,7 @@ gpu.module @kernels {
 
   // CHECK-SHFL-LABEL: gpu.func @kernel3(
   // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
-  // CHECK-GFX9-LABEL: gpu.func @kernel3(
-  // CHECK-GFX10-LABEL: gpu.func @kernel3(
+  // CHECK-GFX-LABEL: gpu.func @kernel3(
   gpu.func @kernel3(%arg0: i32) kernel {
     // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
    // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -169,9 +159,9 @@ gpu.module @kernels {
     // CHECK-SHFL: %[[S4:.+]], %{{.+}} = gpu.shuffle xor %[[A3]], %[[C16]], %[[C32]] : i32
     // CHECK-SHFL: %[[A4:.+]] = arith.addi %[[A3]], %[[S4]] : i32
     // CHECK-SHFL: "test.consume"(%[[A4]]) : (i32) -> ()
-
+
     // CHECK-GFX9-COUNT-6: amdgpu.dpp
-
+
     // CHECK-GFX10-COUNT-4: amdgpu.dpp
     // CHECK-GFX10: rocdl.permlanex16
     // CHECK-GFX10-COUNT-2: rocdl.readlane
@@ -185,11 +175,8 @@ gpu.module @kernels {
   // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered(
   // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
   //
-  // CHECK-GFX9-LABEL: gpu.func @kernel3_clustered(
-  // CHECK-GFX9-SAME: %[[ARG0:.+]]: i32)
-  //
-  // CHECK-GFX10-LABEL: gpu.func @kernel3_clustered(
-  // CHECK-GFX10-SAME: %[[ARG0:.+]]: i32)
+  // CHECK-GFX-LABEL: gpu.func @kernel3_clustered(
+  // CHECK-GFX-SAME: %[[ARG0:.+]]: i32)
   gpu.func @kernel3_clustered(%arg0: i32) kernel {
     // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
     // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -204,19 +191,13 @@ gpu.module @kernels {
     // CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32
     // CHECK-SHFL: "test.consume"(%[[A2]]) : (i32) -> ()
 
-    // CHECK-GFX9: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
-    // CHECK-GFX9: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
-    // CHECK-GFX9: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
-    // CHECK-GFX9: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
-    // CHECK-GFX9: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32
-    // CHECK-GFX9: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
-
-    // CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
-    // CHECK-GFX10: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
-    // CHECK-GFX10: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
-    // CHECK-GFX10: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
-    // CHECK-GFX10: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32
-    // CHECK-GFX10: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
+    // CHECK-GFX: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
+    // CHECK-GFX: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
+    // CHECK-GFX: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
+    // CHECK-GFX: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
+    // CHECK-GFX: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32
+    // CHECK-GFX: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
+
     // CHECK-GFX10: "test.consume"(%[[A2]]) : (i32) -> ()
     %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 8) : (i32) -> i32
     "test.consume"(%sum0) : (i32) -> ()
@@ -228,11 +209,8 @@ gpu.module @kernels {
   // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered_strided(
   // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
   //
-  // CHECK-GFX9-LABEL: gpu.func @kernel3_clustered_strided(
-  // CHECK-GFX9-NOT: amdgpu.dpp
-  //
-  // CHECK-GFX10-LABEL: gpu.func @kernel3_clustered_strided(
-  // CHECK-GFX10-NOT: amdgpu.dpp
+  // CHECK-GFX-LABEL: gpu.func @kernel3_clustered_strided(
+  // CHECK-GFX-NOT: amdgpu.dpp
   gpu.func @kernel3_clustered_strided(%arg0: i32) kernel {
     // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 4 : i32
     // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 8 : i32
@@ -256,11 +234,8 @@ gpu.module @kernels {
   // CHECK-SHFL-LABEL: gpu.func @kernel4(
   // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>)
   //
-  // CHECK-GFX9-LABEL: gpu.func @kernel4(
-  // CHECK-GFX9-NOT: amdgpu.dpp
-  //
-  // CHECK-GFX10-LABEL: gpu.func @kernel4(
-  // CHECK-GFX10-NOT: amdgpu.dpp
+  // CHECK-GFX-LABEL: gpu.func @kernel4(
+  // CHECK-GFX-NOT: amdgpu.dpp
   gpu.func @kernel4(%arg0: vector<2xf16>) kernel {
     // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
     // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -298,11 +273,8 @@ gpu.module @kernels {
   // CHECK-SHFL-LABEL: gpu.func @kernel4_clustered(
   // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>)
   //
-  // CHECK-GFX9-LABEL: gpu.func @kernel4_clustered(
-  // CHECK-GFX9-NOT: amdgpu.dpp
-  //
-  // CHECK-GFX10-LABEL: gpu.func @kernel4_clustered(
-  // CHECK-GFX10-NOT: amdgpu.dpp
+  // CHECK-GFX-LABEL: gpu.func @kernel4_clustered(
+  // CHECK-GFX-NOT: amdgpu.dpp
   gpu.func @kernel4_clustered(%arg0: vector<2xf16>) kernel {
     // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
     // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -319,10 +291,8 @@ gpu.module @kernels {
   // CHECK-SHFL-LABEL: gpu.func @kernel5(
   // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16)
   //
-  // CHECK-GFX9-LABEL: gpu.func @kernel5(
-  //
-  // CHECK-GFX10-LABEL: gpu.func @kernel5(
-  // CHECK-GFX10-SAME: %[[ARG0:.+]]: i16)
+  // CHECK-GFX-LABEL: gpu.func @kernel5(
+  // CHECK-GFX-SAME: %[[ARG0:.+]]: i16)
   gpu.func @kernel5(%arg0: i16) kernel {
     // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32
     // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32
@@ -334,7 +304,7 @@ gpu.module @kernels {
     // CHECK-SHFL: arith.trunci {{.+}} : i32 to i16
     // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
     // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()
-
+
     // CHECK-GFX9-COUNT-6: amdgpu.dpp
 
     // CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
@@ -361,11 +331,8 @@ gpu.module @kernels {
   // CHECK-SHFL-LABEL: gpu.func @kernel5_clustered(
   // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16)
   //
-  // CHECK-GFX9-LABEL: gpu.func @kernel5_clustered
-  // CHECK-GFX9-SAME: %[[ARG0:.+]]: i16)
-  //
-  // CHECK-GFX10-LABEL: gpu.func @kernel5_clustered
-  // CHECK-GFX10-SAME: %[[ARG0:.+]]: i16)
+  // CHECK-GFX-LABEL: gpu.func @kernel5_clustered
+  // CHECK-GFX-SAME: %[[ARG0:.+]]: i16)
   gpu.func @kernel5_clustered(%arg0: i16) kernel {
     // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32
     // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32
@@ -378,25 +345,15 @@ gpu.module @kernels {
     // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
     // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()
 
-    // CHECK-GFX9: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
-    // CHECK-GFX9: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
-    // CHECK-GFX9: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
-    // CHECK-GFX9: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
-    // CHECK-GFX9: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
-    // CHECK-GFX9: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
-    // CHECK-GFX9: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
-    // CHECK-GFX9: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
-    // CHECK-GFX9: "test.consume"(%[[VAR7]]) : (i16) -> ()
-
-    // CHECK-GFX10: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
-    // CHECK-GFX10: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
-    // CHECK-GFX10: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
-    // CHECK-GFX10: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
-    // CHECK-GFX10: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
-    // CHECK-GFX10: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
-    // CHECK-GFX10: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
-    // CHECK-GFX10: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
-    // CHECK-GFX10: "test.consume"(%[[VAR7]]) : (i16) -> ()
+    // CHECK-GFX: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
+    // CHECK-GFX: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
+    // CHECK-GFX: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
+    // CHECK-GFX: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
+    // CHECK-GFX: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
+    // CHECK-GFX: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
+    // CHECK-GFX: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
+    // CHECK-GFX: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
+    // CHECK-GFX: "test.consume"(%[[VAR7]]) : (i16) -> ()
     %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 16) : (i16) -> i16
     "test.consume"(%sum0) : (i16) -> ()
 
@@ -407,11 +364,8 @@ gpu.module @kernels {
   // CHECK-SHFL-LABEL: gpu.func @kernel6(
   // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>)
   //
-  // CHECK-GFX9-LABEL: gpu.func @kernel6(
-  // CHECK-GFX9-NOT: amdgpu.dpp
-  //
-  // CHECK-GFX10-LABEL: gpu.func @kernel6(
-  // CHECK-GFX10-NOT: amdgpu.dpp
+  // CHECK-GFX-LABEL: gpu.func @kernel6(
+  // CHECK-GFX-NOT: amdgpu.dpp
   gpu.func @kernel6(%arg0: vector<3xi8>) kernel {
     // CHECK-SHFL: %[[CZ:.+]] = arith.constant dense<0> : vector<4xi8>
     // CHECK-SHFL: %[[V0:.+]] = vector.insert_strided_slice %[[ARG0]], %[[CZ]] {offsets = [0], strides = [1]} : vector<3xi8> into vector<4xi8>
@@ -433,6 +387,44 @@ gpu.module @kernels {
     gpu.return
   }
 
+  // CHECK-GFX-LABEL: gpu.func @kernel7(
+  // CHECK-GFX-SAME: %[[ARG0:.+]]: f32)
+  //
+  // Checks, common to gfx942 and gfx1030, of
+  // (1) quad_perm, followed by reduction resulting in reduction over 2 consecutive lanes,
+  // (2) quad_perm, followed by reduction resulting in reduction over 4 consecutive lanes,
+  // (3) row_half_mirror, followed by reduction resulting in reduction over 8 consecutive lanes, and
+  // (4) row_mirror, followed by reduction resulting in reduction over 16 consecutive lanes.
+  // CHECK-GFX: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : f32
+  // CHECK-GFX: %[[A0:.+]] = arith.addf %[[ARG0]], %[[D0]] : f32
+  // CHECK-GFX: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : f32
+  // CHECK-GFX: %[[A1:.+]] = arith.addf %[[A0]], %[[D1]] : f32
+  // CHECK-GFX: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : f32
+  // CHECK-GFX: %[[A2:.+]] = arith.addf %[[A1]], %[[D2]] : f32
+  // CHECK-GFX: %[[D3:.+]] = amdgpu.dpp %[[A2]] %[[A2]] row_mirror(unit) {bound_ctrl = true} : f32
+  // CHECK-GFX: %[[A3:.+]] = arith.addf %[[A2]], %[[D3]] : f32
+  //
+  // Now, on gfx942:
+  // (1) Lane 15 gets broadcast to lanes [16, 32) and lane 47 gets broadcast to lanes [48, 64), after which
+  //     the reduction in lanes [16, 32) is over the full cluster of the first 32 lanes, and the reduction in
+  //     lanes [48, 64) is over the full cluster of the last 32 lanes.
+  // (2) Update the reduction value in lanes [0, 16) and [32, 48) with the final reduction result from
+  //     lanes [16, 32) and [48, 64), respectively.
+  // CHECK-GFX9: %[[BCAST15:.+]] = amdgpu.dpp %[[A3]] %[[A3]] row_bcast_15(unit) {row_mask = 10 : i32} : f32
+  // CHECK-GFX9: %[[SUM:.+]] = arith.addf %[[A3]], %[[BCAST15]] : f32
+  // CHECK-GFX9: %[[SWIZ:.+]] = amdgpu.swizzle_bitmode %[[SUM]] 0 31 0 : f32
+  // CHECK-GFX9: "test.consume"(%[[SWIZ]]) : (f32) -> ()
+  //
+  // On gfx1030, the final step is to permute the lanes and perform the final reduction:
+  // CHECK-GFX10: rocdl.permlanex16
+  // CHECK-GFX10: arith.addf
+  // CHECK-GFX10: "test.consume"
+  gpu.func @kernel7(%arg0: f32) kernel {
+    %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 32) : (f32) -> (f32)
+    "test.consume"(%sum0) : (f32) -> ()
+    gpu.return
+  }
+
   // CHECK-SHFL-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size(
   // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>)
   //
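To close the loop on kernel7's gfx9 expectations, here is a value-level replay (illustrative only, not test code) of the full sequence, reducing add over the lane indices and checking that every lane ends with its cluster-32 total:

#include <array>
#include <cstdio>

// Value-level replay of kernel7's gfx9 lowering on wave64, reducing "add"
// over the lane indices themselves; illustrative only.
int main() {
  std::array<long, 64> v;
  for (int i = 0; i < 64; ++i) v[i] = i;

  // quad_perm([1,0,3,2]) = partner i^1, quad_perm([2,3,0,1]) = i^2,
  // row_half_mirror = i^7, row_mirror = i^15; all stay within a 16-lane row.
  for (int off : {1, 2, 7, 15}) {
    auto s = v;
    for (int i = 0; i < 64; ++i) v[i] = s[i] + s[i ^ off];
  }

  // row_bcast_15 with row_mask = 0xa: rows 1 and 3 accumulate the last
  // lane of the row below them (lanes 15 and 47).
  auto b = v;
  for (int i = 16; i < 32; ++i) b[i] = v[i] + v[15];
  for (int i = 48; i < 64; ++i) b[i] = v[i] + v[47];

  // swizzle_bitmode 0 31 0: every lane reads lane 31 of its 32-lane half.
  std::array<long, 64> r;
  for (int i = 0; i < 64; ++i) r[i] = b[(i & ~31) | 31];

  std::printf("lane 0: %ld, lane 16: %ld, lane 32: %ld, lane 48: %ld\n",
              r[0], r[16], r[32], r[48]);
  return 0;
}

The printed values should be 496 (= 0 + ... + 31) for lanes 0 and 16 and 1520 (= 32 + ... + 63) for lanes 32 and 48, matching the guarantee that every lane of each 32-lane cluster holds the final reduction value.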
