@@ -89,17 +89,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
8989; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare:
9090; CHECK: ; %bb.0:
9191; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
92- ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
93- ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
94- ; CHECK-NEXT: s_cmp_eq_u32 s0, 0
95- ; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
96- ; CHECK-NEXT: ; %bb.1: ; %true
9792; CHECK-NEXT: s_mov_b32 s0, 42
98- ; CHECK-NEXT: s_branch .LBB7_3
99- ; CHECK-NEXT: .LBB7_2: ; %false
93+ ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
94+ ; CHECK-NEXT: s_xor_b32 s2, vcc_lo, -1
95+ ; CHECK-NEXT: s_and_saveexec_b32 s1, s2
96+ ; CHECK-NEXT: ; %bb.1: ; %false
10097; CHECK-NEXT: s_mov_b32 s0, 33
101- ; CHECK-NEXT: s_branch .LBB7_3
102- ; CHECK-NEXT: .LBB7_3:
98+ ; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
99+ ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
100+ ; CHECK-NEXT: ; return to shader part epilog
103101 %c = trunc i32 %v to i1
104102 %ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
105103 %ballot_ne_zero = icmp ne i32 %ballot , 0
@@ -113,9 +111,9 @@ false:
113111define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare (i32 inreg %v ) {
114112; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
115113; CHECK: ; %bb.0:
116- ; CHECK-NEXT: s_and_b32 s0, 1, s0
117- ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
118- ; CHECK-NEXT: s_cmp_eq_u32 s0, 0
114+ ; CHECK-NEXT: s_xor_b32 s0, s0, 1
115+ ; CHECK-NEXT: s_and_b32 s0, s0, 1
116+ ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
119117; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
120118; CHECK-NEXT: ; %bb.1: ; %true
121119; CHECK-NEXT: s_mov_b32 s0, 42
@@ -161,16 +159,17 @@ false:
161159define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare (i32 inreg %v ) {
162160; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
163161; CHECK: ; %bb.0:
164- ; CHECK-NEXT: s_and_b32 s0, 1, s0
165- ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
162+ ; CHECK-NEXT: s_xor_b32 s0, s0, 1
163+ ; CHECK-NEXT: s_xor_b32 s0, s0, 1
164+ ; CHECK-NEXT: s_and_b32 s0, s0, 1
166165; CHECK-NEXT: s_cmp_lg_u32 s0, 0
167- ; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
168- ; CHECK-NEXT: ; %bb.1: ; %false
169- ; CHECK-NEXT: s_mov_b32 s0, 33
170- ; CHECK-NEXT: s_branch .LBB10_3
171- ; CHECK-NEXT: .LBB10_2: ; %true
166+ ; CHECK-NEXT: s_cbranch_scc1 .LBB10_2
167+ ; CHECK-NEXT: ; %bb.1: ; %true
172168; CHECK-NEXT: s_mov_b32 s0, 42
173169; CHECK-NEXT: s_branch .LBB10_3
170+ ; CHECK-NEXT: .LBB10_2: ; %false
171+ ; CHECK-NEXT: s_mov_b32 s0, 33
172+ ; CHECK-NEXT: s_branch .LBB10_3
174173; CHECK-NEXT: .LBB10_3:
175174 %c = trunc i32 %v to i1
176175 %ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
@@ -183,18 +182,27 @@ false:
183182}
184183
185184define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare (i32 %v ) {
186- ; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
187- ; CHECK: ; %bb.0:
188- ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
189- ; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
190- ; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
191- ; CHECK-NEXT: ; %bb.1: ; %true
192- ; CHECK-NEXT: s_mov_b32 s0, 42
193- ; CHECK-NEXT: s_branch .LBB11_3
194- ; CHECK-NEXT: .LBB11_2: ; %false
195- ; CHECK-NEXT: s_mov_b32 s0, 33
196- ; CHECK-NEXT: s_branch .LBB11_3
197- ; CHECK-NEXT: .LBB11_3:
185+ ; GFX10-LABEL: branch_divergent_ballot_ne_zero_compare:
186+ ; GFX10: ; %bb.0:
187+ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 12, v0
188+ ; GFX10-NEXT: s_mov_b32 s0, 42
189+ ; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
190+ ; GFX10-NEXT: ; %bb.1: ; %false
191+ ; GFX10-NEXT: s_mov_b32 s0, 33
192+ ; GFX10-NEXT: ; %bb.2: ; %UnifiedReturnBlock
193+ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
194+ ; GFX10-NEXT: ; return to shader part epilog
195+ ;
196+ ; GFX11-LABEL: branch_divergent_ballot_ne_zero_compare:
197+ ; GFX11: ; %bb.0:
198+ ; GFX11-NEXT: s_mov_b32 s0, 42
199+ ; GFX11-NEXT: s_mov_b32 s1, exec_lo
200+ ; GFX11-NEXT: v_cmpx_le_u32_e32 12, v0
201+ ; GFX11-NEXT: ; %bb.1: ; %false
202+ ; GFX11-NEXT: s_mov_b32 s0, 33
203+ ; GFX11-NEXT: ; %bb.2: ; %UnifiedReturnBlock
204+ ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
205+ ; GFX11-NEXT: ; return to shader part epilog
198206 %c = icmp ult i32 %v , 12
199207 %ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
200208 %ballot_ne_zero = icmp ne i32 %ballot , 0
@@ -208,11 +216,7 @@ false:
208216define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare (i32 inreg %v ) {
209217; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
210218; CHECK: ; %bb.0:
211- ; CHECK-NEXT: s_cmp_lt_u32 s0, 12
212- ; CHECK-NEXT: s_cselect_b32 s0, 1, 0
213- ; CHECK-NEXT: s_and_b32 s0, 1, s0
214- ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
215- ; CHECK-NEXT: s_cmp_eq_u32 s0, 0
219+ ; CHECK-NEXT: s_cmp_ge_u32 s0, 12
216220; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
217221; CHECK-NEXT: ; %bb.1: ; %true
218222; CHECK-NEXT: s_mov_b32 s0, 42
@@ -232,18 +236,27 @@ false:
232236}
233237
234238define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare (i32 %v ) {
235- ; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
236- ; CHECK: ; %bb.0:
237- ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
238- ; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
239- ; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
240- ; CHECK-NEXT: ; %bb.1: ; %false
241- ; CHECK-NEXT: s_mov_b32 s0, 33
242- ; CHECK-NEXT: s_branch .LBB13_3
243- ; CHECK-NEXT: .LBB13_2: ; %true
244- ; CHECK-NEXT: s_mov_b32 s0, 42
245- ; CHECK-NEXT: s_branch .LBB13_3
246- ; CHECK-NEXT: .LBB13_3:
239+ ; GFX10-LABEL: branch_divergent_ballot_eq_zero_compare:
240+ ; GFX10: ; %bb.0:
241+ ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
242+ ; GFX10-NEXT: s_mov_b32 s0, 42
243+ ; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
244+ ; GFX10-NEXT: ; %bb.1: ; %false
245+ ; GFX10-NEXT: s_mov_b32 s0, 33
246+ ; GFX10-NEXT: ; %bb.2: ; %UnifiedReturnBlock
247+ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
248+ ; GFX10-NEXT: ; return to shader part epilog
249+ ;
250+ ; GFX11-LABEL: branch_divergent_ballot_eq_zero_compare:
251+ ; GFX11: ; %bb.0:
252+ ; GFX11-NEXT: s_mov_b32 s0, 42
253+ ; GFX11-NEXT: s_mov_b32 s1, exec_lo
254+ ; GFX11-NEXT: v_cmpx_gt_u32_e32 12, v0
255+ ; GFX11-NEXT: ; %bb.1: ; %false
256+ ; GFX11-NEXT: s_mov_b32 s0, 33
257+ ; GFX11-NEXT: ; %bb.2: ; %UnifiedReturnBlock
258+ ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
259+ ; GFX11-NEXT: ; return to shader part epilog
247260 %c = icmp ult i32 %v , 12
248261 %ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
249262 %ballot_eq_zero = icmp eq i32 %ballot , 0
@@ -258,17 +271,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
258271; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
259272; CHECK: ; %bb.0:
260273; CHECK-NEXT: s_cmp_lt_u32 s0, 12
261- ; CHECK-NEXT: s_cselect_b32 s0, 1, 0
262- ; CHECK-NEXT: s_and_b32 s0, 1, s0
263- ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
264- ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
265- ; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
266- ; CHECK-NEXT: ; %bb.1: ; %false
267- ; CHECK-NEXT: s_mov_b32 s0, 33
268- ; CHECK-NEXT: s_branch .LBB14_3
269- ; CHECK-NEXT: .LBB14_2: ; %true
274+ ; CHECK-NEXT: s_cbranch_scc1 .LBB14_2
275+ ; CHECK-NEXT: ; %bb.1: ; %true
270276; CHECK-NEXT: s_mov_b32 s0, 42
271277; CHECK-NEXT: s_branch .LBB14_3
278+ ; CHECK-NEXT: .LBB14_2: ; %false
279+ ; CHECK-NEXT: s_mov_b32 s0, 33
280+ ; CHECK-NEXT: s_branch .LBB14_3
272281; CHECK-NEXT: .LBB14_3:
273282 %c = icmp ult i32 %v , 12
274283 %ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
@@ -283,18 +292,16 @@ false:
283292define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and (i32 %v1 , i32 %v2 ) {
284293; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
285294; CHECK: ; %bb.0:
286- ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
287- ; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
288- ; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
289- ; CHECK-NEXT: s_cmp_eq_u32 s0, 0
290- ; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
291- ; CHECK-NEXT: ; %bb.1: ; %true
295+ ; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, 12, v0
296+ ; CHECK-NEXT: v_cmp_ge_u32_e64 s0, 34, v1
297+ ; CHECK-NEXT: s_or_b32 s2, vcc_lo, s0
292298; CHECK-NEXT: s_mov_b32 s0, 42
293- ; CHECK-NEXT: s_branch .LBB15_3
294- ; CHECK-NEXT: .LBB15_2 : ; %false
299+ ; CHECK-NEXT: s_and_saveexec_b32 s1, s2
300+ ; CHECK-NEXT: ; %bb.1 : ; %false
295301; CHECK-NEXT: s_mov_b32 s0, 33
296- ; CHECK-NEXT: s_branch .LBB15_3
297- ; CHECK-NEXT: .LBB15_3:
302+ ; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
303+ ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
304+ ; CHECK-NEXT: ; return to shader part epilog
298305 %v1c = icmp ult i32 %v1 , 12
299306 %v2c = icmp ugt i32 %v2 , 34
300307 %c = and i1 %v1c , %v2c
@@ -310,14 +317,12 @@ false:
310317define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and (i32 inreg %v1 , i32 inreg %v2 ) {
311318; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
312319; CHECK: ; %bb.0:
313- ; CHECK-NEXT: s_cmp_lt_u32 s0, 12
320+ ; CHECK-NEXT: s_cmp_ge_u32 s0, 12
314321; CHECK-NEXT: s_cselect_b32 s0, 1, 0
315- ; CHECK-NEXT: s_cmp_gt_u32 s1, 34
322+ ; CHECK-NEXT: s_cmp_le_u32 s1, 34
316323; CHECK-NEXT: s_cselect_b32 s1, 1, 0
317- ; CHECK-NEXT: s_and_b32 s0, s0, s1
318- ; CHECK-NEXT: s_and_b32 s0, 1, s0
319- ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
320- ; CHECK-NEXT: s_cmp_eq_u32 s0, 0
324+ ; CHECK-NEXT: s_or_b32 s0, s0, s1
325+ ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
321326; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
322327; CHECK-NEXT: ; %bb.1: ; %true
323328; CHECK-NEXT: s_mov_b32 s0, 42
@@ -347,11 +352,9 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
347352; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
348353; CHECK-NEXT: ; %bb.1: ; %false
349354; CHECK-NEXT: s_mov_b32 s0, 33
350- ; CHECK-NEXT: s_branch .LBB17_3
351- ; CHECK-NEXT: .LBB17_2: ; %true
352- ; CHECK-NEXT: s_mov_b32 s0, 42
353- ; CHECK-NEXT: s_branch .LBB17_3
354- ; CHECK-NEXT: .LBB17_3:
355+ ; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
356+ ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
357+ ; CHECK-NEXT: ; return to shader part epilog
355358 %v1c = icmp ult i32 %v1 , 12
356359 %v2c = icmp ugt i32 %v2 , 34
357360 %c = and i1 %v1c , %v2c
@@ -365,23 +368,15 @@ false:
365368}
366369
367370define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and (i32 inreg %v1 , i32 inreg %v2 ) {
368- ; CHECK-LABEL: branch_uniform_ballot_eq_zero_and:
369371; CHECK: ; %bb.0:
370372; CHECK-NEXT: s_cmp_lt_u32 s0, 12
371- ; CHECK-NEXT: s_cselect_b32 s0, 1, 0
372- ; CHECK-NEXT: s_cmp_gt_u32 s1, 34
373- ; CHECK-NEXT: s_cselect_b32 s1, 1, 0
374- ; CHECK-NEXT: s_and_b32 s0, s0, s1
375- ; CHECK-NEXT: s_and_b32 s0, 1, s0
376- ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
377- ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
378- ; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
379- ; CHECK-NEXT: ; %bb.1: ; %false
380- ; CHECK-NEXT: s_mov_b32 s0, 33
381- ; CHECK-NEXT: s_branch .LBB18_3
382- ; CHECK-NEXT: .LBB18_2: ; %true
373+ ; CHECK-NEXT: s_cbranch_scc1 .LBB18_2
374+ ; CHECK-NEXT: ; %bb.1: ; %true
383375; CHECK-NEXT: s_mov_b32 s0, 42
384376; CHECK-NEXT: s_branch .LBB18_3
377+ ; CHECK-NEXT: .LBB18_2: ; %false
378+ ; CHECK-NEXT: s_mov_b32 s0, 33
379+ ; CHECK-NEXT: s_branch .LBB18_3
385380; CHECK-NEXT: .LBB18_3:
386381 %v1c = icmp ult i32 %v1 , 12
387382 %v2c = icmp ugt i32 %v2 , 34
0 commit comments