88
99// RUN: mlir-opt --allow-unregistered-dialect \
1010// RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx942" %s \
11- // RUN: | FileCheck %s --check-prefix=CHECK-GFX9
11+ // RUN: | FileCheck %s --check-prefixes=CHECK-GFX,CHECK-GFX9
1212
1313// RUN: mlir-opt --allow-unregistered-dialect \
1414// RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx1030" %s \
15- // RUN: | FileCheck %s --check-prefix=CHECK-GFX10
15+ // RUN: | FileCheck %s --check-prefixes=CHECK-GFX,CHECK-GFX10
1616
1717// CHECK-SUB: gpu.module @kernels {
1818// CHECK-SHFL: gpu.module @kernels {
@@ -24,8 +24,7 @@ gpu.module @kernels {
2424 // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<5xf16>)
2525 //
2626 // CHECK-SHFL-LABEL: gpu.func @kernel0(
27- // CHECK-GFX9-LABEL: gpu.func @kernel0(
28- // CHECK-GFX10-LABEL: gpu.func @kernel0(
27+ // CHECK-GFX-LABEL: gpu.func @kernel0(
2928 gpu.func @kernel0 (%arg0: vector <5 xf16 >) kernel {
3029 // CHECK-SUB: %[[VZ:.+]] = arith.constant dense<0.0{{.*}}> : vector<5xf16>
3130 // CHECK-SUB: %[[E0:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [0], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16>
@@ -56,8 +55,7 @@ gpu.module @kernels {
5655
5756 // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} cluster(size = 4)
5857 // CHECK-SUB: "test.consume"
59- // CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}}
60- // CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}}
58+ // CHECK-GFX-COUNT-2: amdgpu.dpp {{.+}}
6159 %sum2 = gpu.subgroup_reduce mul %arg0 cluster (size = 4 ) : (vector <5 xf16 >) -> (vector <5 xf16 >)
6260 " test.consume" (%sum2 ) : (vector <5 xf16 >) -> ()
6361
@@ -74,8 +72,7 @@ gpu.module @kernels {
7472 // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<1xf32>)
7573 //
7674 // CHECK-SHFL-LABEL: gpu.func @kernel1(
77- // CHECK-GFX9-LABEL: gpu.func @kernel1(
78- // CHECK-GFX10-LABEL: gpu.func @kernel1(
75+ // CHECK-GFX-LABEL: gpu.func @kernel1(
7976 gpu.func @kernel1 (%arg0: vector <1 xf32 >) kernel {
8077 // CHECK-SUB: %[[E0:.+]] = vector.extract %[[ARG0]][0] : f32 from vector<1xf32>
8178 // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32
@@ -100,17 +97,14 @@ gpu.module @kernels {
10097 // Note stride is dropped because it is == 1.
10198 // CHECK-SUB: gpu.subgroup_reduce add {{.+}} cluster(size = 8) : (f32) -> f32
10299 // CHECK-SUB: "test.consume"
103- // CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}} quad_perm
104- // CHECK-GFX9: amdgpu.dpp {{.+}} row_half_mirror
105- // CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}} quad_perm
106- // CHECK-GFX10: amdgpu.dpp {{.+}} row_half_mirror
100+ // CHECK-GFX-COUNT-2: amdgpu.dpp {{.+}} quad_perm
101+ // CHECK-GFX: amdgpu.dpp {{.+}} row_half_mirror
107102 %sum2 = gpu.subgroup_reduce add %arg0 cluster (size = 8 , stride = 1 ) : (vector <1 xf32 >) -> (vector <1 xf32 >)
108103 " test.consume" (%sum2 ) : (vector <1 xf32 >) -> ()
109104
110105 // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform cluster(size = 8, stride = 4) : (f32) -> f32
111106 // CHECK-SUB: "test.consume"
112- // CHECK-GFX9-NOT: amdgpu.dpp
113- // CHECK-GFX10-NOT: amdgpu.dpp
107+ // CHECK-GFX-NOT: amdgpu.dpp
114108 // CHECK-GFX10-NOT: rocdl.permlanex16
115109 %sum3 = gpu.subgroup_reduce add %arg0 uniform cluster (size = 8 , stride = 4 ) : (vector <1 xf32 >) -> (vector <1 xf32 >)
116110 " test.consume" (%sum3 ) : (vector <1 xf32 >) -> ()
@@ -126,11 +120,8 @@ gpu.module @kernels {
126120 //
127121 // CHECK-SHFL-LABEL: gpu.func @kernel2(
128122 //
129- // CHECK-GFX9-LABEL: gpu.func @kernel2(
130- // CHECK-GFX9-NOT: amdgpu.dpp
131- //
132- // CHECK-GFX10-LABEL: gpu.func @kernel2(
133- // CHECK-GFX10-NOT: amdgpu.dpp
123+ // CHECK-GFX-LABEL: gpu.func @kernel2(
124+ // CHECK-GFX-NOT: amdgpu.dpp
134125 gpu.func @kernel2 (%arg0: vector <3 xi8 >, %arg1: vector <4 xi8 >) kernel {
135126 // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[ARG0]] : (vector<3xi8>) -> vector<3xi8>
136127 // CHECK-SUB: "test.consume"(%[[R0]]) : (vector<3xi8>) -> ()
@@ -148,8 +139,7 @@ gpu.module @kernels {
148139
149140 // CHECK-SHFL-LABEL: gpu.func @kernel3(
150141 // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
151- // CHECK-GFX9-LABEL: gpu.func @kernel3(
152- // CHECK-GFX10-LABEL: gpu.func @kernel3(
142+ // CHECK-GFX-LABEL: gpu.func @kernel3(
153143 gpu.func @kernel3 (%arg0: i32 ) kernel {
154144 // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
155145 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -169,9 +159,9 @@ gpu.module @kernels {
169159 // CHECK-SHFL: %[[S4:.+]], %{{.+}} = gpu.shuffle xor %[[A3]], %[[C16]], %[[C32]] : i32
170160 // CHECK-SHFL: %[[A4:.+]] = arith.addi %[[A3]], %[[S4]] : i32
171161 // CHECK-SHFL: "test.consume"(%[[A4]]) : (i32) -> ()
172-
162+
173163 // CHECK-GFX9-COUNT-6: amdgpu.dpp
174-
164+
175165 // CHECK-GFX10-COUNT-4: amdgpu.dpp
176166 // CHECK-GFX10: rocdl.permlanex16
177167 // CHECK-GFX10-COUNT-2: rocdl.readlane
@@ -185,11 +175,8 @@ gpu.module @kernels {
185175 // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered(
186176 // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
187177 //
188- // CHECK-GFX9-LABEL: gpu.func @kernel3_clustered(
189- // CHECK-GFX9-SAME: %[[ARG0:.+]]: i32)
190- //
191- // CHECK-GFX10-LABEL: gpu.func @kernel3_clustered(
192- // CHECK-GFX10-SAME: %[[ARG0:.+]]: i32)
178+ // CHECK-GFX-LABEL: gpu.func @kernel3_clustered(
179+ // CHECK-GFX-SAME: %[[ARG0:.+]]: i32)
193180 gpu.func @kernel3_clustered (%arg0: i32 ) kernel {
194181 // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
195182 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -204,19 +191,13 @@ gpu.module @kernels {
204191 // CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32
205192 // CHECK-SHFL: "test.consume"(%[[A2]]) : (i32) -> ()
206193
207- // CHECK-GFX9: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
208- // CHECK-GFX9: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
209- // CHECK-GFX9: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
210- // CHECK-GFX9: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
211- // CHECK-GFX9: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32
212- // CHECK-GFX9: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
213-
214- // CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
215- // CHECK-GFX10: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
216- // CHECK-GFX10: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
217- // CHECK-GFX10: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
218- // CHECK-GFX10: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32
219- // CHECK-GFX10: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
194+ // CHECK-GFX: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
195+ // CHECK-GFX: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
196+ // CHECK-GFX: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
197+ // CHECK-GFX: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
198+ // CHECK-GFX: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32
199+ // CHECK-GFX: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
200+
220201 // CHECK-GFX10: "test.consume"(%[[A2]]) : (i32) -> ()
221202 %sum0 = gpu.subgroup_reduce add %arg0 cluster (size = 8 ) : (i32 ) -> i32
222203 " test.consume" (%sum0 ) : (i32 ) -> ()
@@ -228,11 +209,8 @@ gpu.module @kernels {
228209 // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered_strided(
229210 // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
230211 //
231- // CHECK-GFX9-LABEL: gpu.func @kernel3_clustered_strided(
232- // CHECK-GFX9-NOT: amdgpu.dpp
233- //
234- // CHECK-GFX10-LABEL: gpu.func @kernel3_clustered_strided(
235- // CHECK-GFX10-NOT: amdgpu.dpp
212+ // CHECK-GFX-LABEL: gpu.func @kernel3_clustered_strided(
213+ // CHECK-GFX-NOT: amdgpu.dpp
236214 gpu.func @kernel3_clustered_strided (%arg0: i32 ) kernel {
237215 // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 4 : i32
238216 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 8 : i32
@@ -256,11 +234,8 @@ gpu.module @kernels {
256234 // CHECK-SHFL-LABEL: gpu.func @kernel4(
257235 // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>)
258236 //
259- // CHECK-GFX9-LABEL: gpu.func @kernel4(
260- // CHECK-GFX9-NOT: amdgpu.dpp
261- //
262- // CHECK-GFX10-LABEL: gpu.func @kernel4(
263- // CHECK-GFX10-NOT: amdgpu.dpp
237+ // CHECK-GFX-LABEL: gpu.func @kernel4(
238+ // CHECK-GFX-NOT: amdgpu.dpp
264239 gpu.func @kernel4 (%arg0: vector <2 xf16 >) kernel {
265240 // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
266241 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -298,11 +273,8 @@ gpu.module @kernels {
298273 // CHECK-SHFL-LABEL: gpu.func @kernel4_clustered(
299274 // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>)
300275 //
301- // CHECK-GFX9-LABEL: gpu.func @kernel4_clustered(
302- // CHECK-GFX9-NOT: amdgpu.dpp
303- //
304- // CHECK-GFX10-LABEL: gpu.func @kernel4_clustered(
305- // CHECK-GFX10-NOT: amdgpu.dpp
276+ // CHECK-GFX-LABEL: gpu.func @kernel4_clustered(
277+ // CHECK-GFX-NOT: amdgpu.dpp
306278 gpu.func @kernel4_clustered (%arg0: vector <2 xf16 >) kernel {
307279 // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
308280 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -319,10 +291,8 @@ gpu.module @kernels {
319291 // CHECK-SHFL-LABEL: gpu.func @kernel5(
320292 // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16)
321293 //
322- // CHECK-GFX9-LABEL: gpu.func @kernel5(
323- //
324- // CHECK-GFX10-LABEL: gpu.func @kernel5(
325- // CHECK-GFX10-SAME: %[[ARG0:.+]]: i16)
294+ // CHECK-GFX-LABEL: gpu.func @kernel5(
295+ // CHECK-GFX-SAME: %[[ARG0:.+]]: i16)
326296 gpu.func @kernel5 (%arg0: i16 ) kernel {
327297 // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32
328298 // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32
@@ -334,7 +304,7 @@ gpu.module @kernels {
334304 // CHECK-SHFL: arith.trunci {{.+}} : i32 to i16
335305 // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
336306 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()
337-
307+
338308 // CHECK-GFX9-COUNT-6: amdgpu.dpp
339309
340310 // CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
@@ -361,11 +331,8 @@ gpu.module @kernels {
361331 // CHECK-SHFL-LABEL: gpu.func @kernel5_clustered(
362332 // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16)
363333 //
364- // CHECK-GFX9-LABEL: gpu.func @kernel5_clustered
365- // CHECK-GFX9-SAME: %[[ARG0:.+]]: i16)
366- //
367- // CHECK-GFX10-LABEL: gpu.func @kernel5_clustered
368- // CHECK-GFX10-SAME: %[[ARG0:.+]]: i16)
334+ // CHECK-GFX-LABEL: gpu.func @kernel5_clustered
335+ // CHECK-GFX-SAME: %[[ARG0:.+]]: i16)
369336 gpu.func @kernel5_clustered (%arg0: i16 ) kernel {
370337 // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32
371338 // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32
@@ -378,25 +345,15 @@ gpu.module @kernels {
378345 // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
379346 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()
380347
381- // CHECK-GFX9: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
382- // CHECK-GFX9: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
383- // CHECK-GFX9: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
384- // CHECK-GFX9: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
385- // CHECK-GFX9: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
386- // CHECK-GFX9: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
387- // CHECK-GFX9: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
388- // CHECK-GFX9: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
389- // CHECK-GFX9: "test.consume"(%[[VAR7]]) : (i16) -> ()
390-
391- // CHECK-GFX10: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
392- // CHECK-GFX10: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
393- // CHECK-GFX10: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
394- // CHECK-GFX10: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
395- // CHECK-GFX10: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
396- // CHECK-GFX10: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
397- // CHECK-GFX10: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
398- // CHECK-GFX10: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
399- // CHECK-GFX10: "test.consume"(%[[VAR7]]) : (i16) -> ()
348+ // CHECK-GFX: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
349+ // CHECK-GFX: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
350+ // CHECK-GFX: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
351+ // CHECK-GFX: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
352+ // CHECK-GFX: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
353+ // CHECK-GFX: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
354+ // CHECK-GFX: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
355+ // CHECK-GFX: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
356+ // CHECK-GFX: "test.consume"(%[[VAR7]]) : (i16) -> ()
400357 %sum0 = gpu.subgroup_reduce add %arg0 cluster (size = 16 ) : (i16 ) -> i16
401358 " test.consume" (%sum0 ) : (i16 ) -> ()
402359
@@ -407,11 +364,8 @@ gpu.module @kernels {
407364 // CHECK-SHFL-LABEL: gpu.func @kernel6(
408365 // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>)
409366 //
410- // CHECK-GFX9-LABEL: gpu.func @kernel6(
411- // CHECK-GFX9-NOT: amdgpu.dpp
412- //
413- // CHECK-GFX10-LABEL: gpu.func @kernel6(
414- // CHECK-GFX10-NOT: amdgpu.dpp
367+ // CHECK-GFX-LABEL: gpu.func @kernel6(
368+ // CHECK-GFX-NOT: amdgpu.dpp
415369 gpu.func @kernel6 (%arg0: vector <3 xi8 >) kernel {
416370 // CHECK-SHFL: %[[CZ:.+]] = arith.constant dense<0> : vector<4xi8>
417371 // CHECK-SHFL: %[[V0:.+]] = vector.insert_strided_slice %[[ARG0]], %[[CZ]] {offsets = [0], strides = [1]} : vector<3xi8> into vector<4xi8>
@@ -433,6 +387,44 @@ gpu.module @kernels {
433387 gpu.return
434388 }
435389
390+ // CHECK-GFX-LABEL: gpu.func @kernel7(
391+ // CHECK-GFX-SAME: %[[ARG0:.+]]: f32)
392+ //
393+ // Checks, common to gfx942 and gfx1030, of
394+ // (1) quad_perm, followed by reduction resulting in reduction over 2 consecutive lanes,
395+ // (2) quad_perm, followed by reduction resulting in reduction over 4 consecutive lanes,
396+ // (3) row_half_mirror, followed by reduction resulting in reduction over 8 consecutive lanes, and
397+ // (4) row_mirror, followed by reduction resulting in reduction over 16 consecutive lanes.
398+ // CHECK-GFX: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : f32
399+ // CHECK-GFX: %[[A0:.+]] = arith.addf %[[ARG0]], %[[D0]] : f32
400+ // CHECK-GFX: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : f32
401+ // CHECK-GFX: %[[A1:.+]] = arith.addf %[[A0]], %[[D1]] : f32
402+ // CHECK-GFX: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : f32
403+ // CHECK-GFX: %[[A2:.+]] = arith.addf %[[A1]], %[[D2]] : f32
404+ // CHECK-GFX: %[[D3:.+]] = amdgpu.dpp %[[A2]] %[[A2]] row_mirror(unit) {bound_ctrl = true} : f32
405+ // CHECK-GFX: %[[A3:.+]] = arith.addf %[[A2]], %[[D3]] : f32
406+ //
407+ // Now, on gfx942:
408+ // (1) Lane 15 gets broadcast to lanes [16, 32) and lane 31 gets broadcast to lanes [48, 64), after which
409+ // the reduction in lanes [16, 32) is over the full cluster of the first 32 lanes, and the reduction in lanes
410+ // [48, 64) is over the full cluster of the last 32 lanes.
411+ // (2) Update the reduction value in lanes [0, 16) and [32, 48) with the final reduction result from
412+ // lanes [16, 32) and [48, 64), respectively.
413+ // CHECK-GFX9: %[[BCAST15:.+]] = amdgpu.dpp %[[A3]] %[[A3]] row_bcast_15(unit) {row_mask = 10 : i32} : f32
414+ // CHECK-GFX9: %[[SUM:.+]] = arith.addf %[[A3]], %[[BCAST15]] : f32
415+ // CHECK-GFX9: %[[SWIZ:.+]] = amdgpu.swizzle_bitmode %[[SUM]] 0 31 0 : f32
416+ // CHECK-GFX9: "test.consume"(%[[SWIZ]]) : (f32) -> ()
417+ //
418+ // On gfx1030, the final step is to permute the lanes and perform final reduction:
419+ // CHECK-GFX10: rocdl.permlanex16
420+ // CHECK-GFX10: arith.addf
421+ // CHECK-GFX10: "test.consume"
422+ gpu.func @kernel7 (%arg0: f32 ) kernel {
423+ %sum0 = gpu.subgroup_reduce add %arg0 cluster (size = 32 ) : (f32 ) -> (f32 )
424+ " test.consume" (%sum0 ) : (f32 ) -> ()
425+ gpu.return
426+ }
427+
436428 // CHECK-SHFL-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size(
437429 // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>)
438430 //
0 commit comments