@@ -89,17 +89,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
89
89
; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare:
90
90
; CHECK: ; %bb.0:
91
91
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
92
- ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
93
- ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
94
- ; CHECK-NEXT: s_cmp_eq_u32 s0, 0
95
- ; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
96
- ; CHECK-NEXT: ; %bb.1: ; %true
97
92
; CHECK-NEXT: s_mov_b32 s0, 42
98
- ; CHECK-NEXT: s_branch .LBB7_3
99
- ; CHECK-NEXT: .LBB7_2: ; %false
93
+ ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
94
+ ; CHECK-NEXT: s_xor_b32 s2, vcc_lo, -1
95
+ ; CHECK-NEXT: s_and_saveexec_b32 s1, s2
96
+ ; CHECK-NEXT: ; %bb.1: ; %false
100
97
; CHECK-NEXT: s_mov_b32 s0, 33
101
- ; CHECK-NEXT: s_branch .LBB7_3
102
- ; CHECK-NEXT: .LBB7_3:
98
+ ; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
99
+ ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
100
+ ; CHECK-NEXT: ; return to shader part epilog
103
101
%c = trunc i32 %v to i1
104
102
%ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
105
103
%ballot_ne_zero = icmp ne i32 %ballot , 0
@@ -113,9 +111,9 @@ false:
113
111
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare (i32 inreg %v ) {
114
112
; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
115
113
; CHECK: ; %bb.0:
116
- ; CHECK-NEXT: s_and_b32 s0, 1, s0
117
- ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
118
- ; CHECK-NEXT: s_cmp_eq_u32 s0, 0
114
+ ; CHECK-NEXT: s_xor_b32 s0, s0, 1
115
+ ; CHECK-NEXT: s_and_b32 s0, s0, 1
116
+ ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
119
117
; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
120
118
; CHECK-NEXT: ; %bb.1: ; %true
121
119
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -135,20 +133,29 @@ false:
135
133
}
136
134
137
135
define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare (i32 %v ) {
138
- ; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare:
139
- ; CHECK: ; %bb.0:
140
- ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
141
- ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
142
- ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
143
- ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
144
- ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
145
- ; CHECK-NEXT: ; %bb.1: ; %false
146
- ; CHECK-NEXT: s_mov_b32 s0, 33
147
- ; CHECK-NEXT: s_branch .LBB9_3
148
- ; CHECK-NEXT: .LBB9_2: ; %true
149
- ; CHECK-NEXT: s_mov_b32 s0, 42
150
- ; CHECK-NEXT: s_branch .LBB9_3
151
- ; CHECK-NEXT: .LBB9_3:
136
+ ; GFX10-LABEL: branch_divergent_ballot_eq_zero_non_compare:
137
+ ; GFX10: ; %bb.0:
138
+ ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
139
+ ; GFX10-NEXT: s_mov_b32 s0, 42
140
+ ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
141
+ ; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
142
+ ; GFX10-NEXT: ; %bb.1: ; %false
143
+ ; GFX10-NEXT: s_mov_b32 s0, 33
144
+ ; GFX10-NEXT: ; %bb.2: ; %UnifiedReturnBlock
145
+ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
146
+ ; GFX10-NEXT: ; return to shader part epilog
147
+ ;
148
+ ; GFX11-LABEL: branch_divergent_ballot_eq_zero_non_compare:
149
+ ; GFX11: ; %bb.0:
150
+ ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
151
+ ; GFX11-NEXT: s_mov_b32 s0, 42
152
+ ; GFX11-NEXT: s_mov_b32 s1, exec_lo
153
+ ; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v0
154
+ ; GFX11-NEXT: ; %bb.1: ; %false
155
+ ; GFX11-NEXT: s_mov_b32 s0, 33
156
+ ; GFX11-NEXT: ; %bb.2: ; %UnifiedReturnBlock
157
+ ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
158
+ ; GFX11-NEXT: ; return to shader part epilog
152
159
%c = trunc i32 %v to i1
153
160
%ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
154
161
%ballot_eq_zero = icmp eq i32 %ballot , 0
@@ -162,16 +169,17 @@ false:
162
169
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare (i32 inreg %v ) {
163
170
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
164
171
; CHECK: ; %bb.0:
165
- ; CHECK-NEXT: s_and_b32 s0, 1, s0
166
- ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
172
+ ; CHECK-NEXT: s_xor_b32 s0, s0, 1
173
+ ; CHECK-NEXT: s_xor_b32 s0, s0, 1
174
+ ; CHECK-NEXT: s_and_b32 s0, s0, 1
167
175
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
168
- ; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
169
- ; CHECK-NEXT: ; %bb.1: ; %false
170
- ; CHECK-NEXT: s_mov_b32 s0, 33
171
- ; CHECK-NEXT: s_branch .LBB10_3
172
- ; CHECK-NEXT: .LBB10_2: ; %true
176
+ ; CHECK-NEXT: s_cbranch_scc1 .LBB10_2
177
+ ; CHECK-NEXT: ; %bb.1: ; %true
173
178
; CHECK-NEXT: s_mov_b32 s0, 42
174
179
; CHECK-NEXT: s_branch .LBB10_3
180
+ ; CHECK-NEXT: .LBB10_2: ; %false
181
+ ; CHECK-NEXT: s_mov_b32 s0, 33
182
+ ; CHECK-NEXT: s_branch .LBB10_3
175
183
; CHECK-NEXT: .LBB10_3:
176
184
%c = trunc i32 %v to i1
177
185
%ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
@@ -184,18 +192,27 @@ false:
184
192
}
185
193
186
194
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare (i32 %v ) {
187
- ; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
188
- ; CHECK: ; %bb.0:
189
- ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
190
- ; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
191
- ; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
192
- ; CHECK-NEXT: ; %bb.1: ; %true
193
- ; CHECK-NEXT: s_mov_b32 s0, 42
194
- ; CHECK-NEXT: s_branch .LBB11_3
195
- ; CHECK-NEXT: .LBB11_2: ; %false
196
- ; CHECK-NEXT: s_mov_b32 s0, 33
197
- ; CHECK-NEXT: s_branch .LBB11_3
198
- ; CHECK-NEXT: .LBB11_3:
195
+ ; GFX10-LABEL: branch_divergent_ballot_ne_zero_compare:
196
+ ; GFX10: ; %bb.0:
197
+ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 12, v0
198
+ ; GFX10-NEXT: s_mov_b32 s0, 42
199
+ ; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
200
+ ; GFX10-NEXT: ; %bb.1: ; %false
201
+ ; GFX10-NEXT: s_mov_b32 s0, 33
202
+ ; GFX10-NEXT: ; %bb.2: ; %UnifiedReturnBlock
203
+ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
204
+ ; GFX10-NEXT: ; return to shader part epilog
205
+ ;
206
+ ; GFX11-LABEL: branch_divergent_ballot_ne_zero_compare:
207
+ ; GFX11: ; %bb.0:
208
+ ; GFX11-NEXT: s_mov_b32 s0, 42
209
+ ; GFX11-NEXT: s_mov_b32 s1, exec_lo
210
+ ; GFX11-NEXT: v_cmpx_le_u32_e32 12, v0
211
+ ; GFX11-NEXT: ; %bb.1: ; %false
212
+ ; GFX11-NEXT: s_mov_b32 s0, 33
213
+ ; GFX11-NEXT: ; %bb.2: ; %UnifiedReturnBlock
214
+ ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
215
+ ; GFX11-NEXT: ; return to shader part epilog
199
216
%c = icmp ult i32 %v , 12
200
217
%ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
201
218
%ballot_ne_zero = icmp ne i32 %ballot , 0
@@ -209,11 +226,7 @@ false:
209
226
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare (i32 inreg %v ) {
210
227
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
211
228
; CHECK: ; %bb.0:
212
- ; CHECK-NEXT: s_cmp_lt_u32 s0, 12
213
- ; CHECK-NEXT: s_cselect_b32 s0, 1, 0
214
- ; CHECK-NEXT: s_and_b32 s0, 1, s0
215
- ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
216
- ; CHECK-NEXT: s_cmp_eq_u32 s0, 0
229
+ ; CHECK-NEXT: s_cmp_ge_u32 s0, 12
217
230
; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
218
231
; CHECK-NEXT: ; %bb.1: ; %true
219
232
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -233,18 +246,27 @@ false:
233
246
}
234
247
235
248
define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare (i32 %v ) {
236
- ; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
237
- ; CHECK: ; %bb.0:
238
- ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
239
- ; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
240
- ; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
241
- ; CHECK-NEXT: ; %bb.1: ; %false
242
- ; CHECK-NEXT: s_mov_b32 s0, 33
243
- ; CHECK-NEXT: s_branch .LBB13_3
244
- ; CHECK-NEXT: .LBB13_2: ; %true
245
- ; CHECK-NEXT: s_mov_b32 s0, 42
246
- ; CHECK-NEXT: s_branch .LBB13_3
247
- ; CHECK-NEXT: .LBB13_3:
249
+ ; GFX10-LABEL: branch_divergent_ballot_eq_zero_compare:
250
+ ; GFX10: ; %bb.0:
251
+ ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
252
+ ; GFX10-NEXT: s_mov_b32 s0, 42
253
+ ; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
254
+ ; GFX10-NEXT: ; %bb.1: ; %false
255
+ ; GFX10-NEXT: s_mov_b32 s0, 33
256
+ ; GFX10-NEXT: ; %bb.2: ; %UnifiedReturnBlock
257
+ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
258
+ ; GFX10-NEXT: ; return to shader part epilog
259
+ ;
260
+ ; GFX11-LABEL: branch_divergent_ballot_eq_zero_compare:
261
+ ; GFX11: ; %bb.0:
262
+ ; GFX11-NEXT: s_mov_b32 s0, 42
263
+ ; GFX11-NEXT: s_mov_b32 s1, exec_lo
264
+ ; GFX11-NEXT: v_cmpx_gt_u32_e32 12, v0
265
+ ; GFX11-NEXT: ; %bb.1: ; %false
266
+ ; GFX11-NEXT: s_mov_b32 s0, 33
267
+ ; GFX11-NEXT: ; %bb.2: ; %UnifiedReturnBlock
268
+ ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
269
+ ; GFX11-NEXT: ; return to shader part epilog
248
270
%c = icmp ult i32 %v , 12
249
271
%ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
250
272
%ballot_eq_zero = icmp eq i32 %ballot , 0
@@ -259,17 +281,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
259
281
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
260
282
; CHECK: ; %bb.0:
261
283
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
262
- ; CHECK-NEXT: s_cselect_b32 s0, 1, 0
263
- ; CHECK-NEXT: s_and_b32 s0, 1, s0
264
- ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
265
- ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
266
- ; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
267
- ; CHECK-NEXT: ; %bb.1: ; %false
268
- ; CHECK-NEXT: s_mov_b32 s0, 33
269
- ; CHECK-NEXT: s_branch .LBB14_3
270
- ; CHECK-NEXT: .LBB14_2: ; %true
284
+ ; CHECK-NEXT: s_cbranch_scc1 .LBB14_2
285
+ ; CHECK-NEXT: ; %bb.1: ; %true
271
286
; CHECK-NEXT: s_mov_b32 s0, 42
272
287
; CHECK-NEXT: s_branch .LBB14_3
288
+ ; CHECK-NEXT: .LBB14_2: ; %false
289
+ ; CHECK-NEXT: s_mov_b32 s0, 33
290
+ ; CHECK-NEXT: s_branch .LBB14_3
273
291
; CHECK-NEXT: .LBB14_3:
274
292
%c = icmp ult i32 %v , 12
275
293
%ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
@@ -284,18 +302,16 @@ false:
284
302
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and (i32 %v1 , i32 %v2 ) {
285
303
; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
286
304
; CHECK: ; %bb.0:
287
- ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
288
- ; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
289
- ; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
290
- ; CHECK-NEXT: s_cmp_eq_u32 s0, 0
291
- ; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
292
- ; CHECK-NEXT: ; %bb.1: ; %true
305
+ ; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, 12, v0
306
+ ; CHECK-NEXT: v_cmp_ge_u32_e64 s0, 34, v1
307
+ ; CHECK-NEXT: s_or_b32 s2, vcc_lo, s0
293
308
; CHECK-NEXT: s_mov_b32 s0, 42
294
- ; CHECK-NEXT: s_branch .LBB15_3
295
- ; CHECK-NEXT: .LBB15_2 : ; %false
309
+ ; CHECK-NEXT: s_and_saveexec_b32 s1, s2
310
+ ; CHECK-NEXT: ; %bb.1 : ; %false
296
311
; CHECK-NEXT: s_mov_b32 s0, 33
297
- ; CHECK-NEXT: s_branch .LBB15_3
298
- ; CHECK-NEXT: .LBB15_3:
312
+ ; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
313
+ ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
314
+ ; CHECK-NEXT: ; return to shader part epilog
299
315
%v1c = icmp ult i32 %v1 , 12
300
316
%v2c = icmp ugt i32 %v2 , 34
301
317
%c = and i1 %v1c , %v2c
@@ -311,14 +327,12 @@ false:
311
327
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and (i32 inreg %v1 , i32 inreg %v2 ) {
312
328
; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
313
329
; CHECK: ; %bb.0:
314
- ; CHECK-NEXT: s_cmp_lt_u32 s0, 12
330
+ ; CHECK-NEXT: s_cmp_ge_u32 s0, 12
315
331
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
316
- ; CHECK-NEXT: s_cmp_gt_u32 s1, 34
332
+ ; CHECK-NEXT: s_cmp_le_u32 s1, 34
317
333
; CHECK-NEXT: s_cselect_b32 s1, 1, 0
318
- ; CHECK-NEXT: s_and_b32 s0, s0, s1
319
- ; CHECK-NEXT: s_and_b32 s0, 1, s0
320
- ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
321
- ; CHECK-NEXT: s_cmp_eq_u32 s0, 0
334
+ ; CHECK-NEXT: s_or_b32 s0, s0, s1
335
+ ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
322
336
; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
323
337
; CHECK-NEXT: ; %bb.1: ; %true
324
338
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -344,16 +358,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
344
358
; CHECK: ; %bb.0:
345
359
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
346
360
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
347
- ; CHECK-NEXT: s_and_b32 s0 , vcc_lo, s0
348
- ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
349
- ; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
361
+ ; CHECK-NEXT: s_and_b32 s2 , vcc_lo, s0
362
+ ; CHECK-NEXT: s_mov_b32 s0, 42
363
+ ; CHECK-NEXT: s_and_saveexec_b32 s1, s2
350
364
; CHECK-NEXT: ; %bb.1: ; %false
351
365
; CHECK-NEXT: s_mov_b32 s0, 33
352
- ; CHECK-NEXT: s_branch .LBB17_3
353
- ; CHECK-NEXT: .LBB17_2: ; %true
354
- ; CHECK-NEXT: s_mov_b32 s0, 42
355
- ; CHECK-NEXT: s_branch .LBB17_3
356
- ; CHECK-NEXT: .LBB17_3:
366
+ ; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
367
+ ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
368
+ ; CHECK-NEXT: ; return to shader part epilog
357
369
%v1c = icmp ult i32 %v1 , 12
358
370
%v2c = icmp ugt i32 %v2 , 34
359
371
%c = and i1 %v1c , %v2c
@@ -374,16 +386,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
374
386
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
375
387
; CHECK-NEXT: s_cselect_b32 s1, 1, 0
376
388
; CHECK-NEXT: s_and_b32 s0, s0, s1
377
- ; CHECK-NEXT: s_and_b32 s0, 1, s0
378
- ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
379
389
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
380
- ; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
381
- ; CHECK-NEXT: ; %bb.1: ; %false
382
- ; CHECK-NEXT: s_mov_b32 s0, 33
383
- ; CHECK-NEXT: s_branch .LBB18_3
384
- ; CHECK-NEXT: .LBB18_2: ; %true
390
+ ; CHECK-NEXT: s_cbranch_scc1 .LBB18_2
391
+ ; CHECK-NEXT: ; %bb.1: ; %true
385
392
; CHECK-NEXT: s_mov_b32 s0, 42
386
393
; CHECK-NEXT: s_branch .LBB18_3
394
+ ; CHECK-NEXT: .LBB18_2: ; %false
395
+ ; CHECK-NEXT: s_mov_b32 s0, 33
396
+ ; CHECK-NEXT: s_branch .LBB18_3
387
397
; CHECK-NEXT: .LBB18_3:
388
398
%v1c = icmp ult i32 %v1 , 12
389
399
%v2c = icmp ugt i32 %v2 , 34
0 commit comments