Skip to content

Commit 6da1653

Browse files
Addressing PR comments
Signed-off-by: Muzammiluddin Syed <[email protected]>
1 parent 848c6ba commit 6da1653

File tree

2 files changed

+149
-53
lines changed

2 files changed

+149
-53
lines changed

mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,7 @@ struct VectorSubgroupReduceToShuffles final
367367
bool matchClustered = false;
368368
};
369369

370-
FailureOr<Value>
370+
static FailureOr<Value>
371371
createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
372372
Value input, gpu::AllReduceOperation mode,
373373
const ClusterInfo &ci, amdgpu::Chipset chipset) {
@@ -501,13 +501,13 @@ struct ScalarSubgroupReduceToDPP final
501501

502502
if (ci->clusterStride != 1)
503503
return rewriter.notifyMatchFailure(
504-
op, "Supgroup reductions using DPP are currently only available for "
504+
op, "Subgroup reductions using DPP are currently only available for "
505505
"clusters of contiguous lanes.");
506506

507507
Type valueTy = op.getType();
508508
if (!valueTy.isIntOrFloat())
509509
return rewriter.notifyMatchFailure(
510-
op, "value type is not a compatible scalar");
510+
op, "Value type is not a compatible scalar.");
511511

512512
FailureOr<Value> dpp = createSubgroupDPPReduction(
513513
rewriter, op, op.getValue(), op.getOp(), *ci, chipset);

mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir

Lines changed: 146 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,24 @@
88

99
// RUN: mlir-opt --allow-unregistered-dialect \
1010
// RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx942" %s \
11-
// RUN: | FileCheck %s --check-prefix=CHECK-DPP
11+
// RUN: | FileCheck %s --check-prefix=CHECK-GFX9
12+
13+
// RUN: mlir-opt --allow-unregistered-dialect \
14+
// RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx1030" %s \
15+
// RUN: | FileCheck %s --check-prefix=CHECK-GFX10
1216

1317
// CHECK-SUB: gpu.module @kernels {
1418
// CHECK-SHFL: gpu.module @kernels {
15-
// CHECK-DPP: gpu.module @kernels {
19+
// CHECK-GFX9: gpu.module @kernels {
20+
// CHECK-GFX10: gpu.module @kernels {
1621
gpu.module @kernels {
1722

1823
// CHECK-SUB-LABEL: gpu.func @kernel0(
1924
// CHECK-SUB-SAME: %[[ARG0:.+]]: vector<5xf16>)
2025
//
2126
// CHECK-SHFL-LABEL: gpu.func @kernel0(
22-
// CHECK-DPP-LABEL: gpu.func @kernel0(
27+
// CHECK-GFX9-LABEL: gpu.func @kernel0(
28+
// CHECK-GFX10-LABEL: gpu.func @kernel0(
2329
gpu.func @kernel0(%arg0: vector<5xf16>) kernel {
2430
// CHECK-SUB: %[[VZ:.+]] = arith.constant dense<0.0{{.*}}> : vector<5xf16>
2531
// CHECK-SUB: %[[E0:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [0], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16>
@@ -32,19 +38,26 @@ gpu.module @kernels {
3238
// CHECK-SUB: %[[R2:.+]] = gpu.subgroup_reduce add %[[E2]] : (f16) -> f16
3339
// CHECK-SUB: %[[V2:.+]] = vector.insert %[[R2]], %[[V1]] [4] : f16 into vector<5xf16>
3440
// CHECK-SUB: "test.consume"(%[[V2]]) : (vector<5xf16>) -> ()
35-
// CHECK-DPP-COUNT-6: amdgpu.dpp
41+
// CHECK-GFX9-COUNT-6: amdgpu.dpp
42+
// CHECK-GFX10-COUNT-4: amdgpu.dpp
43+
// CHECK-GFX10: rocdl.permlanex16
44+
// CHECK-GFX10-COUNT-2: rocdl.readlane
3645
%sum0 = gpu.subgroup_reduce add %arg0 : (vector<5xf16>) -> (vector<5xf16>)
3746
"test.consume"(%sum0) : (vector<5xf16>) -> ()
3847

3948
// CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} uniform
4049
// CHECK-SUB: "test.consume"
41-
// CHECK-DPP-COUNT-6: amdgpu.dpp
50+
// CHECK-GFX9-COUNT-6: amdgpu.dpp
51+
// CHECK-GFX10-COUNT-4: amdgpu.dpp
52+
// CHECK-GFX10: rocdl.permlanex16
53+
// CHECK-GFX10-COUNT-2: rocdl.readlane
4254
%sum1 = gpu.subgroup_reduce mul %arg0 uniform : (vector<5xf16>) -> (vector<5xf16>)
4355
"test.consume"(%sum1) : (vector<5xf16>) -> ()
4456

4557
// CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} cluster(size = 4)
4658
// CHECK-SUB: "test.consume"
47-
// CHECK-DPP-COUNT-2: amdgpu.dpp {{.+}}
59+
// CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}}
60+
// CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}}
4861
%sum2 = gpu.subgroup_reduce mul %arg0 cluster(size = 4) : (vector<5xf16>) -> (vector<5xf16>)
4962
"test.consume"(%sum2) : (vector<5xf16>) -> ()
5063

@@ -61,34 +74,44 @@ gpu.module @kernels {
6174
// CHECK-SUB-SAME: %[[ARG0:.+]]: vector<1xf32>)
6275
//
6376
// CHECK-SHFL-LABEL: gpu.func @kernel1(
64-
//
65-
// CHECK-DPP-LABEL: gpu.func @kernel1(
77+
// CHECK-GFX9-LABEL: gpu.func @kernel1(
78+
// CHECK-GFX10-LABEL: gpu.func @kernel1(
6679
gpu.func @kernel1(%arg0: vector<1xf32>) kernel {
6780
// CHECK-SUB: %[[E0:.+]] = vector.extract %[[ARG0]][0] : f32 from vector<1xf32>
6881
// CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32
6982
// CHECK-SUB: %[[V0:.+]] = vector.broadcast %[[R0]] : f32 to vector<1xf32>
7083
// CHECK-SUB: "test.consume"(%[[V0]]) : (vector<1xf32>) -> ()
71-
// CHECK-DPP-COUNT-6: amdgpu.dpp
84+
// CHECK-GFX9-COUNT-6: amdgpu.dpp
85+
// CHECK-GFX10-COUNT-4: amdgpu.dpp
86+
// CHECK-GFX10: rocdl.permlanex16
87+
// CHECK-GFX10-COUNT-2: rocdl.readlane
7288
%sum0 = gpu.subgroup_reduce add %arg0 : (vector<1xf32>) -> (vector<1xf32>)
7389
"test.consume"(%sum0) : (vector<1xf32>) -> ()
7490

7591
// CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform : (f32) -> f32
7692
// CHECK-SUB: "test.consume"
77-
// CHECK-DPP-COUNT-6: amdgpu.dpp
93+
// CHECK-GFX9-COUNT-6: amdgpu.dpp
94+
// CHECK-GFX10-COUNT-4: amdgpu.dpp
95+
// CHECK-GFX10: rocdl.permlanex16
96+
// CHECK-GFX10-COUNT-2: rocdl.readlane
7897
%sum1 = gpu.subgroup_reduce add %arg0 uniform : (vector<1xf32>) -> (vector<1xf32>)
7998
"test.consume"(%sum1) : (vector<1xf32>) -> ()
8099

81100
// Note stride is dropped because it is == 1.
82101
// CHECK-SUB: gpu.subgroup_reduce add {{.+}} cluster(size = 8) : (f32) -> f32
83102
// CHECK-SUB: "test.consume"
84-
// CHECK-DPP-COUNT-2: amdgpu.dpp {{.+}} quad_perm
85-
// CHECK-DPP: amdgpu.dpp {{.+}} row_half_mirror
103+
// CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}} quad_perm
104+
// CHECK-GFX9: amdgpu.dpp {{.+}} row_half_mirror
105+
// CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}} quad_perm
106+
// CHECK-GFX10: amdgpu.dpp {{.+}} row_half_mirror
86107
%sum2 = gpu.subgroup_reduce add %arg0 cluster(size = 8, stride = 1) : (vector<1xf32>) -> (vector<1xf32>)
87108
"test.consume"(%sum2) : (vector<1xf32>) -> ()
88109

89110
// CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform cluster(size = 8, stride = 4) : (f32) -> f32
90111
// CHECK-SUB: "test.consume"
91-
// CHECK-DPP-NOT: amdgpu.dpp
112+
// CHECK-GFX9-NOT: amdgpu.dpp
113+
// CHECK-GFX10-NOT: amdgpu.dpp
114+
// CHECK-GFX10-NOT: rocdl.permlanex16
92115
%sum3 = gpu.subgroup_reduce add %arg0 uniform cluster(size = 8, stride = 4) : (vector<1xf32>) -> (vector<1xf32>)
93116
"test.consume"(%sum3) : (vector<1xf32>) -> ()
94117

@@ -102,8 +125,12 @@ gpu.module @kernels {
102125
// CHECK-SUB-SAME: %[[ARG0:.+]]: vector<3xi8>, %[[ARG1:.+]]: vector<4xi8>)
103126
//
104127
// CHECK-SHFL-LABEL: gpu.func @kernel2(
105-
// CHECK-DPP-LABEL: gpu.func @kernel2(
106-
// CHECK-DPP-NOT: amdgpu.dpp
128+
//
129+
// CHECK-GFX9-LABEL: gpu.func @kernel2(
130+
// CHECK-GFX9-NOT: amdgpu.dpp
131+
//
132+
// CHECK-GFX10-LABEL: gpu.func @kernel2(
133+
// CHECK-GFX10-NOT: amdgpu.dpp
107134
gpu.func @kernel2(%arg0: vector<3xi8>, %arg1: vector<4xi8>) kernel {
108135
// CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[ARG0]] : (vector<3xi8>) -> vector<3xi8>
109136
// CHECK-SUB: "test.consume"(%[[R0]]) : (vector<3xi8>) -> ()
@@ -121,8 +148,8 @@ gpu.module @kernels {
121148

122149
// CHECK-SHFL-LABEL: gpu.func @kernel3(
123150
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
124-
// CHECK-DPP-LABEL: gpu.func @kernel3(
125-
// CHECK-DPP-NOT: amdgpu.dpp
151+
// CHECK-GFX9-LABEL: gpu.func @kernel3(
152+
// CHECK-GFX10-LABEL: gpu.func @kernel3(
126153
gpu.func @kernel3(%arg0: i32) kernel {
127154
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
128155
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -143,7 +170,11 @@ gpu.module @kernels {
143170
// CHECK-SHFL: %[[A4:.+]] = arith.addi %[[A3]], %[[S4]] : i32
144171
// CHECK-SHFL: "test.consume"(%[[A4]]) : (i32) -> ()
145172

146-
// CHECK-DPP-COUNT-6: amdgpu.dpp
173+
// CHECK-GFX9-COUNT-6: amdgpu.dpp
174+
175+
// CHECK-GFX10-COUNT-4: amdgpu.dpp
176+
// CHECK-GFX10: rocdl.permlanex16
177+
// CHECK-GFX10-COUNT-2: rocdl.readlane
147178
%sum0 = gpu.subgroup_reduce add %arg0 : (i32) -> i32
148179
"test.consume"(%sum0) : (i32) -> ()
149180

@@ -153,8 +184,12 @@ gpu.module @kernels {
153184

154185
// CHECK-SHFL-LABEL: gpu.func @kernel3_clustered(
155186
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
156-
// CHECK-DPP-LABEL: gpu.func @kernel3_clustered(
157-
// CHECK-DPP-SAME: %[[ARG0:.+]]: i32)
187+
//
188+
// CHECK-GFX9-LABEL: gpu.func @kernel3_clustered(
189+
// CHECK-GFX9-SAME: %[[ARG0:.+]]: i32)
190+
//
191+
// CHECK-GFX10-LABEL: gpu.func @kernel3_clustered(
192+
// CHECK-GFX10-SAME: %[[ARG0:.+]]: i32)
158193
gpu.func @kernel3_clustered(%arg0: i32) kernel {
159194
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
160195
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -169,13 +204,20 @@ gpu.module @kernels {
169204
// CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32
170205
// CHECK-SHFL: "test.consume"(%[[A2]]) : (i32) -> ()
171206

172-
// CHECK-DPP: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
173-
// CHECK-DPP: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
174-
// CHECK-DPP: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
175-
// CHECK-DPP: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
176-
// CHECK-DPP: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32
177-
// CHECK-DPP: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
178-
// CHECK-DPP: "test.consume"(%[[A2]]) : (i32) -> ()
207+
// CHECK-GFX9: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
208+
// CHECK-GFX9: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
209+
// CHECK-GFX9: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
210+
// CHECK-GFX9: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
211+
// CHECK-GFX9: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32
212+
// CHECK-GFX9: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
213+
214+
// CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
215+
// CHECK-GFX10: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
216+
// CHECK-GFX10: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
217+
// CHECK-GFX10: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
218+
// CHECK-GFX10: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32
219+
// CHECK-GFX10: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
220+
// CHECK-GFX10: "test.consume"(%[[A2]]) : (i32) -> ()
179221
%sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 8) : (i32) -> i32
180222
"test.consume"(%sum0) : (i32) -> ()
181223

@@ -185,8 +227,12 @@ gpu.module @kernels {
185227

186228
// CHECK-SHFL-LABEL: gpu.func @kernel3_clustered_strided(
187229
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
188-
// CHECK-DPP-LABEL: gpu.func @kernel3_clustered_strided(
189-
// CHECK-DPP-NOT: amdgpu.dpp
230+
//
231+
// CHECK-GFX9-LABEL: gpu.func @kernel3_clustered_strided(
232+
// CHECK-GFX9-NOT: amdgpu.dpp
233+
//
234+
// CHECK-GFX10-LABEL: gpu.func @kernel3_clustered_strided(
235+
// CHECK-GFX10-NOT: amdgpu.dpp
190236
gpu.func @kernel3_clustered_strided(%arg0: i32) kernel {
191237
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 4 : i32
192238
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 8 : i32
@@ -209,8 +255,12 @@ gpu.module @kernels {
209255

210256
// CHECK-SHFL-LABEL: gpu.func @kernel4(
211257
// CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>)
212-
// CHECK-DPP-LABEL: gpu.func @kernel4(
213-
// CHECK-DPP-NOT: amdgpu.dpp
258+
//
259+
// CHECK-GFX9-LABEL: gpu.func @kernel4(
260+
// CHECK-GFX9-NOT: amdgpu.dpp
261+
//
262+
// CHECK-GFX10-LABEL: gpu.func @kernel4(
263+
// CHECK-GFX10-NOT: amdgpu.dpp
214264
gpu.func @kernel4(%arg0: vector<2xf16>) kernel {
215265
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
216266
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -247,8 +297,12 @@ gpu.module @kernels {
247297

248298
// CHECK-SHFL-LABEL: gpu.func @kernel4_clustered(
249299
// CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>)
250-
// CHECK-DPP-LABEL: gpu.func @kernel4_clustered(
251-
// CHECK-DPP-NOT: amdgpu.dpp
300+
//
301+
// CHECK-GFX9-LABEL: gpu.func @kernel4_clustered(
302+
// CHECK-GFX9-NOT: amdgpu.dpp
303+
//
304+
// CHECK-GFX10-LABEL: gpu.func @kernel4_clustered(
305+
// CHECK-GFX10-NOT: amdgpu.dpp
252306
gpu.func @kernel4_clustered(%arg0: vector<2xf16>) kernel {
253307
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
254308
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -264,7 +318,11 @@ gpu.module @kernels {
264318

265319
// CHECK-SHFL-LABEL: gpu.func @kernel5(
266320
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i16)
267-
// CHECK-DPP-LABEL: gpu.func @kernel5(
321+
//
322+
// CHECK-GFX9-LABEL: gpu.func @kernel5(
323+
//
324+
// CHECK-GFX10-LABEL: gpu.func @kernel5(
325+
// CHECK-GFX10-SAME: %[[ARG0:.+]]: i16)
268326
gpu.func @kernel5(%arg0: i16) kernel {
269327
// CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32
270328
// CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32
@@ -276,7 +334,23 @@ gpu.module @kernels {
276334
// CHECK-SHFL: arith.trunci {{.+}} : i32 to i16
277335
// CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
278336
// CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()
279-
// CHECK-DPP-COUNT-6: amdgpu.dpp
337+
338+
// CHECK-GFX9-COUNT-6: amdgpu.dpp
339+
340+
// CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
341+
// CHECK-GFX10: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i16
342+
// CHECK-GFX10: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
343+
// CHECK-GFX10: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i16
344+
// CHECK-GFX10: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i16
345+
// CHECK-GFX10: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i16
346+
// CHECK-GFX10: %[[D3:.+]] = amdgpu.dpp %[[A2]] %[[A2]] row_mirror(unit) {bound_ctrl = true} : i16
347+
// CHECK-GFX10: %[[A3:.+]] = arith.addi %[[A2]], %[[D3]] : i16
348+
// CHECK-GFX10: %[[P0:.+]] = rocdl.permlanex16 %[[A3]], %[[A3]], %c-1_i32, %c-1_i32, true, false : i16, i32
349+
// CHECK-GFX10: %[[A4:.+]] = arith.addi %[[A3]], %[[P0]] : i16
350+
// CHECK-GFX10: %[[R0:.+]] = rocdl.readlane %[[A4]], %{{.+}} : (i16, i32) -> i16
351+
// CHECK-GFX10: %[[R1:.+]] = rocdl.readlane %[[A4]], %{{.+}} : (i16, i32) -> i16
352+
// CHECK-GFX10: %[[A5:.+]] = arith.addi %[[R1]], %[[R0]] : i16
353+
// CHECK-GFX10: "test.consume"(%[[A5]]) : (i16) -> ()
280354
%sum0 = gpu.subgroup_reduce add %arg0 : (i16) -> i16
281355
"test.consume"(%sum0) : (i16) -> ()
282356

@@ -286,8 +360,12 @@ gpu.module @kernels {
286360

287361
// CHECK-SHFL-LABEL: gpu.func @kernel5_clustered(
288362
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i16)
289-
// CHECK-DPP-LABEL: gpu.func @kernel5_clustered
290-
// CHECK-DPP-SAME: %[[ARG0:.+]]: i16)
363+
//
364+
// CHECK-GFX9-LABEL: gpu.func @kernel5_clustered
365+
// CHECK-GFX9-SAME: %[[ARG0:.+]]: i16)
366+
//
367+
// CHECK-GFX10-LABEL: gpu.func @kernel5_clustered
368+
// CHECK-GFX10-SAME: %[[ARG0:.+]]: i16)
291369
gpu.func @kernel5_clustered(%arg0: i16) kernel {
292370
// CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32
293371
// CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32
@@ -300,15 +378,25 @@ gpu.module @kernels {
300378
// CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
301379
// CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()
302380

303-
// CHECK-DPP: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
304-
// CHECK-DPP: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
305-
// CHECK-DPP: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
306-
// CHECK-DPP: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
307-
// CHECK-DPP: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
308-
// CHECK-DPP: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
309-
// CHECK-DPP: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
310-
// CHECK-DPP: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
311-
// CHECK-DPP: "test.consume"(%[[VAR7]]) : (i16) -> ()
381+
// CHECK-GFX9: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
382+
// CHECK-GFX9: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
383+
// CHECK-GFX9: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
384+
// CHECK-GFX9: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
385+
// CHECK-GFX9: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
386+
// CHECK-GFX9: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
387+
// CHECK-GFX9: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
388+
// CHECK-GFX9: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
389+
// CHECK-GFX9: "test.consume"(%[[VAR7]]) : (i16) -> ()
390+
391+
// CHECK-GFX10: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
392+
// CHECK-GFX10: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
393+
// CHECK-GFX10: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
394+
// CHECK-GFX10: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
395+
// CHECK-GFX10: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
396+
// CHECK-GFX10: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
397+
// CHECK-GFX10: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
398+
// CHECK-GFX10: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
399+
// CHECK-GFX10: "test.consume"(%[[VAR7]]) : (i16) -> ()
312400
%sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 16) : (i16) -> i16
313401
"test.consume"(%sum0) : (i16) -> ()
314402

@@ -318,8 +406,12 @@ gpu.module @kernels {
318406

319407
// CHECK-SHFL-LABEL: gpu.func @kernel6(
320408
// CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>)
321-
// CHECK-DPP-LABEL: gpu.func @kernel6(
322-
// CHECK-DPP-NOT: amdgpu.dpp
409+
//
410+
// CHECK-GFX9-LABEL: gpu.func @kernel6(
411+
// CHECK-GFX9-NOT: amdgpu.dpp
412+
//
413+
// CHECK-GFX10-LABEL: gpu.func @kernel6(
414+
// CHECK-GFX10-NOT: amdgpu.dpp
323415
gpu.func @kernel6(%arg0: vector<3xi8>) kernel {
324416
// CHECK-SHFL: %[[CZ:.+]] = arith.constant dense<0> : vector<4xi8>
325417
// CHECK-SHFL: %[[V0:.+]] = vector.insert_strided_slice %[[ARG0]], %[[CZ]] {offsets = [0], strides = [1]} : vector<3xi8> into vector<4xi8>
@@ -343,8 +435,12 @@ gpu.module @kernels {
343435

344436
// CHECK-SHFL-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size(
345437
// CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>)
346-
// CHECK-DPP-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size(
347-
// CHECK-DPP-NOT: amdgpu.dpp
438+
//
439+
// CHECK-GFX9-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size(
440+
// CHECK-GFX9-NOT: amdgpu.dpp
441+
//
442+
// CHECK-GFX10-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size(
443+
// CHECK-GFX10-NOT: amdgpu.dpp
348444
gpu.func @kernel_cluster_size_is_subgroup_size(%arg0: vector<3xi8>) kernel {
349445
// CHECK-SHFL-COUNT-5: gpu.shuffle xor
350446
%sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 32) : (vector<3xi8>) -> (vector<3xi8>)

0 commit comments

Comments
 (0)