
// RUN: mlir-opt --allow-unregistered-dialect \
// RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx942" %s \
- // RUN: | FileCheck %s --check-prefix=CHECK-DPP
+ // RUN: | FileCheck %s --check-prefix=CHECK-GFX9
+
+ // RUN: mlir-opt --allow-unregistered-dialect \
+ // RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx1030" %s \
+ // RUN: | FileCheck %s --check-prefix=CHECK-GFX10

// CHECK-SUB: gpu.module @kernels {
// CHECK-SHFL: gpu.module @kernels {
- // CHECK-DPP: gpu.module @kernels {
+ // CHECK-GFX9: gpu.module @kernels {
+ // CHECK-GFX10: gpu.module @kernels {
gpu.module @kernels {

  // CHECK-SUB-LABEL: gpu.func @kernel0(
  // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<5xf16>)
  //
  // CHECK-SHFL-LABEL: gpu.func @kernel0(
-  // CHECK-DPP-LABEL: gpu.func @kernel0(
+  // CHECK-GFX9-LABEL: gpu.func @kernel0(
+  // CHECK-GFX10-LABEL: gpu.func @kernel0(
  gpu.func @kernel0(%arg0: vector<5xf16>) kernel {
    // CHECK-SUB: %[[VZ:.+]] = arith.constant dense<0.0{{.*}}> : vector<5xf16>
    // CHECK-SUB: %[[E0:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [0], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16>
@@ -32,19 +38,26 @@ gpu.module @kernels {
    // CHECK-SUB: %[[R2:.+]] = gpu.subgroup_reduce add %[[E2]] : (f16) -> f16
    // CHECK-SUB: %[[V2:.+]] = vector.insert %[[R2]], %[[V1]] [4] : f16 into vector<5xf16>
    // CHECK-SUB: "test.consume"(%[[V2]]) : (vector<5xf16>) -> ()
-    // CHECK-DPP-COUNT-6: amdgpu.dpp
+    // CHECK-GFX9-COUNT-6: amdgpu.dpp
+    // CHECK-GFX10-COUNT-4: amdgpu.dpp
+    // CHECK-GFX10: rocdl.permlanex16
+    // CHECK-GFX10-COUNT-2: rocdl.readlane
    %sum0 = gpu.subgroup_reduce add %arg0 : (vector<5xf16>) -> (vector<5xf16>)
    "test.consume"(%sum0) : (vector<5xf16>) -> ()

    // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} uniform
    // CHECK-SUB: "test.consume"
-    // CHECK-DPP-COUNT-6: amdgpu.dpp
+    // CHECK-GFX9-COUNT-6: amdgpu.dpp
+    // CHECK-GFX10-COUNT-4: amdgpu.dpp
+    // CHECK-GFX10: rocdl.permlanex16
+    // CHECK-GFX10-COUNT-2: rocdl.readlane
    %sum1 = gpu.subgroup_reduce mul %arg0 uniform : (vector<5xf16>) -> (vector<5xf16>)
    "test.consume"(%sum1) : (vector<5xf16>) -> ()

    // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} cluster(size = 4)
    // CHECK-SUB: "test.consume"
-    // CHECK-DPP-COUNT-2: amdgpu.dpp {{.+}}
+    // CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}}
+    // CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}}
    %sum2 = gpu.subgroup_reduce mul %arg0 cluster(size = 4) : (vector<5xf16>) -> (vector<5xf16>)
    "test.consume"(%sum2) : (vector<5xf16>) -> ()

@@ -61,34 +74,44 @@ gpu.module @kernels {
  // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<1xf32>)
  //
  // CHECK-SHFL-LABEL: gpu.func @kernel1(
-  //
-  // CHECK-DPP-LABEL: gpu.func @kernel1(
+  // CHECK-GFX9-LABEL: gpu.func @kernel1(
+  // CHECK-GFX10-LABEL: gpu.func @kernel1(
  gpu.func @kernel1(%arg0: vector<1xf32>) kernel {
    // CHECK-SUB: %[[E0:.+]] = vector.extract %[[ARG0]][0] : f32 from vector<1xf32>
    // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32
    // CHECK-SUB: %[[V0:.+]] = vector.broadcast %[[R0]] : f32 to vector<1xf32>
    // CHECK-SUB: "test.consume"(%[[V0]]) : (vector<1xf32>) -> ()
-    // CHECK-DPP-COUNT-6: amdgpu.dpp
+    // CHECK-GFX9-COUNT-6: amdgpu.dpp
+    // CHECK-GFX10-COUNT-4: amdgpu.dpp
+    // CHECK-GFX10: rocdl.permlanex16
+    // CHECK-GFX10-COUNT-2: rocdl.readlane
    %sum0 = gpu.subgroup_reduce add %arg0 : (vector<1xf32>) -> (vector<1xf32>)
    "test.consume"(%sum0) : (vector<1xf32>) -> ()

    // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform : (f32) -> f32
    // CHECK-SUB: "test.consume"
-    // CHECK-DPP-COUNT-6: amdgpu.dpp
+    // CHECK-GFX9-COUNT-6: amdgpu.dpp
+    // CHECK-GFX10-COUNT-4: amdgpu.dpp
+    // CHECK-GFX10: rocdl.permlanex16
+    // CHECK-GFX10-COUNT-2: rocdl.readlane
    %sum1 = gpu.subgroup_reduce add %arg0 uniform : (vector<1xf32>) -> (vector<1xf32>)
    "test.consume"(%sum1) : (vector<1xf32>) -> ()

    // Note stride is dropped because it is == 1.
    // CHECK-SUB: gpu.subgroup_reduce add {{.+}} cluster(size = 8) : (f32) -> f32
    // CHECK-SUB: "test.consume"
-    // CHECK-DPP-COUNT-2: amdgpu.dpp {{.+}} quad_perm
-    // CHECK-DPP: amdgpu.dpp {{.+}} row_half_mirror
+    // CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}} quad_perm
+    // CHECK-GFX9: amdgpu.dpp {{.+}} row_half_mirror
+    // CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}} quad_perm
+    // CHECK-GFX10: amdgpu.dpp {{.+}} row_half_mirror
    %sum2 = gpu.subgroup_reduce add %arg0 cluster(size = 8, stride = 1) : (vector<1xf32>) -> (vector<1xf32>)
    "test.consume"(%sum2) : (vector<1xf32>) -> ()

    // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform cluster(size = 8, stride = 4) : (f32) -> f32
    // CHECK-SUB: "test.consume"
-    // CHECK-DPP-NOT: amdgpu.dpp
+    // CHECK-GFX9-NOT: amdgpu.dpp
+    // CHECK-GFX10-NOT: amdgpu.dpp
+    // CHECK-GFX10-NOT: rocdl.permlanex16
    %sum3 = gpu.subgroup_reduce add %arg0 uniform cluster(size = 8, stride = 4) : (vector<1xf32>) -> (vector<1xf32>)
    "test.consume"(%sum3) : (vector<1xf32>) -> ()

@@ -102,8 +125,12 @@ gpu.module @kernels {
  // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<3xi8>, %[[ARG1:.+]]: vector<4xi8>)
  //
  // CHECK-SHFL-LABEL: gpu.func @kernel2(
-  // CHECK-DPP-LABEL: gpu.func @kernel2(
-  // CHECK-DPP-NOT: amdgpu.dpp
+  //
+  // CHECK-GFX9-LABEL: gpu.func @kernel2(
+  // CHECK-GFX9-NOT: amdgpu.dpp
+  //
+  // CHECK-GFX10-LABEL: gpu.func @kernel2(
+  // CHECK-GFX10-NOT: amdgpu.dpp
  gpu.func @kernel2(%arg0: vector<3xi8>, %arg1: vector<4xi8>) kernel {
    // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[ARG0]] : (vector<3xi8>) -> vector<3xi8>
    // CHECK-SUB: "test.consume"(%[[R0]]) : (vector<3xi8>) -> ()
@@ -121,8 +148,8 @@ gpu.module @kernels {

  // CHECK-SHFL-LABEL: gpu.func @kernel3(
  // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
-  // CHECK-DPP-LABEL: gpu.func @kernel3(
-  // CHECK-DPP-NOT: amdgpu.dpp
+  // CHECK-GFX9-LABEL: gpu.func @kernel3(
+  // CHECK-GFX10-LABEL: gpu.func @kernel3(
  gpu.func @kernel3(%arg0: i32) kernel {
    // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
    // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -143,7 +170,11 @@ gpu.module @kernels {
    // CHECK-SHFL: %[[A4:.+]] = arith.addi %[[A3]], %[[S4]] : i32
    // CHECK-SHFL: "test.consume"(%[[A4]]) : (i32) -> ()

-    // CHECK-DPP-COUNT-6: amdgpu.dpp
+    // CHECK-GFX9-COUNT-6: amdgpu.dpp
+
+    // CHECK-GFX10-COUNT-4: amdgpu.dpp
+    // CHECK-GFX10: rocdl.permlanex16
+    // CHECK-GFX10-COUNT-2: rocdl.readlane
    %sum0 = gpu.subgroup_reduce add %arg0 : (i32) -> i32
    "test.consume"(%sum0) : (i32) -> ()

@@ -153,8 +184,12 @@ gpu.module @kernels {

  // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered(
  // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
-  // CHECK-DPP-LABEL: gpu.func @kernel3_clustered(
-  // CHECK-DPP-SAME: %[[ARG0:.+]]: i32)
+  //
+  // CHECK-GFX9-LABEL: gpu.func @kernel3_clustered(
+  // CHECK-GFX9-SAME: %[[ARG0:.+]]: i32)
+  //
+  // CHECK-GFX10-LABEL: gpu.func @kernel3_clustered(
+  // CHECK-GFX10-SAME: %[[ARG0:.+]]: i32)
  gpu.func @kernel3_clustered(%arg0: i32) kernel {
    // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
    // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -169,13 +204,20 @@ gpu.module @kernels {
    // CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32
    // CHECK-SHFL: "test.consume"(%[[A2]]) : (i32) -> ()

-    // CHECK-DPP: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
-    // CHECK-DPP: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
-    // CHECK-DPP: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
-    // CHECK-DPP: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
-    // CHECK-DPP: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32
-    // CHECK-DPP: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
-    // CHECK-DPP: "test.consume"(%[[A2]]) : (i32) -> ()
+    // CHECK-GFX9: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
+    // CHECK-GFX9: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
+    // CHECK-GFX9: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
+    // CHECK-GFX9: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
+    // CHECK-GFX9: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32
+    // CHECK-GFX9: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
+
+    // CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
+    // CHECK-GFX10: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
+    // CHECK-GFX10: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
+    // CHECK-GFX10: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
+    // CHECK-GFX10: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32
+    // CHECK-GFX10: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
+    // CHECK-GFX10: "test.consume"(%[[A2]]) : (i32) -> ()
    %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 8) : (i32) -> i32
    "test.consume"(%sum0) : (i32) -> ()

@@ -185,8 +227,12 @@ gpu.module @kernels {

  // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered_strided(
  // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
-  // CHECK-DPP-LABEL: gpu.func @kernel3_clustered_strided(
-  // CHECK-DPP-NOT: amdgpu.dpp
+  //
+  // CHECK-GFX9-LABEL: gpu.func @kernel3_clustered_strided(
+  // CHECK-GFX9-NOT: amdgpu.dpp
+  //
+  // CHECK-GFX10-LABEL: gpu.func @kernel3_clustered_strided(
+  // CHECK-GFX10-NOT: amdgpu.dpp
  gpu.func @kernel3_clustered_strided(%arg0: i32) kernel {
    // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 4 : i32
    // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 8 : i32
@@ -209,8 +255,12 @@ gpu.module @kernels {

  // CHECK-SHFL-LABEL: gpu.func @kernel4(
  // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>)
-  // CHECK-DPP-LABEL: gpu.func @kernel4(
-  // CHECK-DPP-NOT: amdgpu.dpp
+  //
+  // CHECK-GFX9-LABEL: gpu.func @kernel4(
+  // CHECK-GFX9-NOT: amdgpu.dpp
+  //
+  // CHECK-GFX10-LABEL: gpu.func @kernel4(
+  // CHECK-GFX10-NOT: amdgpu.dpp
  gpu.func @kernel4(%arg0: vector<2xf16>) kernel {
    // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
    // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -247,8 +297,12 @@ gpu.module @kernels {

  // CHECK-SHFL-LABEL: gpu.func @kernel4_clustered(
  // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>)
-  // CHECK-DPP-LABEL: gpu.func @kernel4_clustered(
-  // CHECK-DPP-NOT: amdgpu.dpp
+  //
+  // CHECK-GFX9-LABEL: gpu.func @kernel4_clustered(
+  // CHECK-GFX9-NOT: amdgpu.dpp
+  //
+  // CHECK-GFX10-LABEL: gpu.func @kernel4_clustered(
+  // CHECK-GFX10-NOT: amdgpu.dpp
  gpu.func @kernel4_clustered(%arg0: vector<2xf16>) kernel {
    // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
    // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -264,7 +318,11 @@ gpu.module @kernels {

  // CHECK-SHFL-LABEL: gpu.func @kernel5(
  // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16)
-  // CHECK-DPP-LABEL: gpu.func @kernel5(
+  //
+  // CHECK-GFX9-LABEL: gpu.func @kernel5(
+  //
+  // CHECK-GFX10-LABEL: gpu.func @kernel5(
+  // CHECK-GFX10-SAME: %[[ARG0:.+]]: i16)
  gpu.func @kernel5(%arg0: i16) kernel {
    // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32
    // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32
@@ -276,7 +334,23 @@ gpu.module @kernels {
    // CHECK-SHFL: arith.trunci {{.+}} : i32 to i16
    // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
    // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()
-    // CHECK-DPP-COUNT-6: amdgpu.dpp
+
+    // CHECK-GFX9-COUNT-6: amdgpu.dpp
+
+    // CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
+    // CHECK-GFX10: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i16
+    // CHECK-GFX10: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
+    // CHECK-GFX10: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i16
+    // CHECK-GFX10: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i16
+    // CHECK-GFX10: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i16
+    // CHECK-GFX10: %[[D3:.+]] = amdgpu.dpp %[[A2]] %[[A2]] row_mirror(unit) {bound_ctrl = true} : i16
+    // CHECK-GFX10: %[[A3:.+]] = arith.addi %[[A2]], %[[D3]] : i16
+    // CHECK-GFX10: %[[P0:.+]] = rocdl.permlanex16 %[[A3]], %[[A3]], %c-1_i32, %c-1_i32, true, false : i16, i32
+    // CHECK-GFX10: %[[A4:.+]] = arith.addi %[[A3]], %[[P0]] : i16
+    // CHECK-GFX10: %[[R0:.+]] = rocdl.readlane %[[A4]], %{{.+}} : (i16, i32) -> i16
+    // CHECK-GFX10: %[[R1:.+]] = rocdl.readlane %[[A4]], %{{.+}} : (i16, i32) -> i16
+    // CHECK-GFX10: %[[A5:.+]] = arith.addi %[[R1]], %[[R0]] : i16
+    // CHECK-GFX10: "test.consume"(%[[A5]]) : (i16) -> ()
    %sum0 = gpu.subgroup_reduce add %arg0 : (i16) -> i16
    "test.consume"(%sum0) : (i16) -> ()

@@ -286,8 +360,12 @@ gpu.module @kernels {

  // CHECK-SHFL-LABEL: gpu.func @kernel5_clustered(
  // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16)
-  // CHECK-DPP-LABEL: gpu.func @kernel5_clustered
-  // CHECK-DPP-SAME: %[[ARG0:.+]]: i16)
+  //
+  // CHECK-GFX9-LABEL: gpu.func @kernel5_clustered
+  // CHECK-GFX9-SAME: %[[ARG0:.+]]: i16)
+  //
+  // CHECK-GFX10-LABEL: gpu.func @kernel5_clustered
+  // CHECK-GFX10-SAME: %[[ARG0:.+]]: i16)
  gpu.func @kernel5_clustered(%arg0: i16) kernel {
    // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32
    // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32
@@ -300,15 +378,25 @@ gpu.module @kernels {
    // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
    // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()

-    // CHECK-DPP: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
-    // CHECK-DPP: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
-    // CHECK-DPP: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
-    // CHECK-DPP: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
-    // CHECK-DPP: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
-    // CHECK-DPP: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
-    // CHECK-DPP: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
-    // CHECK-DPP: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
-    // CHECK-DPP: "test.consume"(%[[VAR7]]) : (i16) -> ()
+    // CHECK-GFX9: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
+    // CHECK-GFX9: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
+    // CHECK-GFX9: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
+    // CHECK-GFX9: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
+    // CHECK-GFX9: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
+    // CHECK-GFX9: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
+    // CHECK-GFX9: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
+    // CHECK-GFX9: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
+    // CHECK-GFX9: "test.consume"(%[[VAR7]]) : (i16) -> ()
+
+    // CHECK-GFX10: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
+    // CHECK-GFX10: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
+    // CHECK-GFX10: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
+    // CHECK-GFX10: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
+    // CHECK-GFX10: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
+    // CHECK-GFX10: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
+    // CHECK-GFX10: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
+    // CHECK-GFX10: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
+    // CHECK-GFX10: "test.consume"(%[[VAR7]]) : (i16) -> ()
    %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 16) : (i16) -> i16
    "test.consume"(%sum0) : (i16) -> ()

@@ -318,8 +406,12 @@ gpu.module @kernels {

  // CHECK-SHFL-LABEL: gpu.func @kernel6(
  // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>)
-  // CHECK-DPP-LABEL: gpu.func @kernel6(
-  // CHECK-DPP-NOT: amdgpu.dpp
+  //
+  // CHECK-GFX9-LABEL: gpu.func @kernel6(
+  // CHECK-GFX9-NOT: amdgpu.dpp
+  //
+  // CHECK-GFX10-LABEL: gpu.func @kernel6(
+  // CHECK-GFX10-NOT: amdgpu.dpp
  gpu.func @kernel6(%arg0: vector<3xi8>) kernel {
    // CHECK-SHFL: %[[CZ:.+]] = arith.constant dense<0> : vector<4xi8>
    // CHECK-SHFL: %[[V0:.+]] = vector.insert_strided_slice %[[ARG0]], %[[CZ]] {offsets = [0], strides = [1]} : vector<3xi8> into vector<4xi8>
@@ -343,8 +435,12 @@ gpu.module @kernels {

  // CHECK-SHFL-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size(
  // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>)
-  // CHECK-DPP-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size(
-  // CHECK-DPP-NOT: amdgpu.dpp
+  //
+  // CHECK-GFX9-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size(
+  // CHECK-GFX9-NOT: amdgpu.dpp
+  //
+  // CHECK-GFX10-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size(
+  // CHECK-GFX10-NOT: amdgpu.dpp
  gpu.func @kernel_cluster_size_is_subgroup_size(%arg0: vector<3xi8>) kernel {
    // CHECK-SHFL-COUNT-5: gpu.shuffle xor
    %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 32) : (vector<3xi8>) -> (vector<3xi8>)