@@ -89,17 +89,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
8989; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare:
9090; CHECK: ; %bb.0:
9191; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
92- ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
93- ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
94- ; CHECK-NEXT: s_cmp_eq_u32 s0, 0
95- ; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
96- ; CHECK-NEXT: ; %bb.1: ; %true
9792; CHECK-NEXT: s_mov_b32 s0, 42
98- ; CHECK-NEXT: s_branch .LBB7_3
99- ; CHECK-NEXT: .LBB7_2: ; %false
93+ ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
94+ ; CHECK-NEXT: s_xor_b32 s2, vcc_lo, -1
95+ ; CHECK-NEXT: s_and_saveexec_b32 s1, s2
96+ ; CHECK-NEXT: ; %bb.1: ; %false
10097; CHECK-NEXT: s_mov_b32 s0, 33
101- ; CHECK-NEXT: s_branch .LBB7_3
102- ; CHECK-NEXT: .LBB7_3:
98+ ; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
99+ ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
100+ ; CHECK-NEXT: ; return to shader part epilog
103101 %c = trunc i32 %v to i1
104102 %ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
105103 %ballot_ne_zero = icmp ne i32 %ballot , 0
@@ -113,9 +111,9 @@ false:
113111define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare (i32 inreg %v ) {
114112; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
115113; CHECK: ; %bb.0:
116- ; CHECK-NEXT: s_and_b32 s0, 1, s0
117- ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
118- ; CHECK-NEXT: s_cmp_eq_u32 s0, 0
114+ ; CHECK-NEXT: s_xor_b32 s0, s0, 1
115+ ; CHECK-NEXT: s_and_b32 s0, s0, 1
116+ ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
119117; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
120118; CHECK-NEXT: ; %bb.1: ; %true
121119; CHECK-NEXT: s_mov_b32 s0, 42
@@ -135,20 +133,29 @@ false:
135133}
136134
137135define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare (i32 %v ) {
138- ; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare:
139- ; CHECK: ; %bb.0:
140- ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
141- ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
142- ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
143- ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
144- ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
145- ; CHECK-NEXT: ; %bb.1: ; %false
146- ; CHECK-NEXT: s_mov_b32 s0, 33
147- ; CHECK-NEXT: s_branch .LBB9_3
148- ; CHECK-NEXT: .LBB9_2: ; %true
149- ; CHECK-NEXT: s_mov_b32 s0, 42
150- ; CHECK-NEXT: s_branch .LBB9_3
151- ; CHECK-NEXT: .LBB9_3:
136+ ; GFX10-LABEL: branch_divergent_ballot_eq_zero_non_compare:
137+ ; GFX10: ; %bb.0:
138+ ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
139+ ; GFX10-NEXT: s_mov_b32 s0, 42
140+ ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
141+ ; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
142+ ; GFX10-NEXT: ; %bb.1: ; %false
143+ ; GFX10-NEXT: s_mov_b32 s0, 33
144+ ; GFX10-NEXT: ; %bb.2: ; %UnifiedReturnBlock
145+ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
146+ ; GFX10-NEXT: ; return to shader part epilog
147+ ;
148+ ; GFX11-LABEL: branch_divergent_ballot_eq_zero_non_compare:
149+ ; GFX11: ; %bb.0:
150+ ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
151+ ; GFX11-NEXT: s_mov_b32 s0, 42
152+ ; GFX11-NEXT: s_mov_b32 s1, exec_lo
153+ ; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v0
154+ ; GFX11-NEXT: ; %bb.1: ; %false
155+ ; GFX11-NEXT: s_mov_b32 s0, 33
156+ ; GFX11-NEXT: ; %bb.2: ; %UnifiedReturnBlock
157+ ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
158+ ; GFX11-NEXT: ; return to shader part epilog
152159 %c = trunc i32 %v to i1
153160 %ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
154161 %ballot_eq_zero = icmp eq i32 %ballot , 0
@@ -162,16 +169,17 @@ false:
162169define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare (i32 inreg %v ) {
163170; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
164171; CHECK: ; %bb.0:
165- ; CHECK-NEXT: s_and_b32 s0, 1, s0
166- ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
172+ ; CHECK-NEXT: s_xor_b32 s0, s0, 1
173+ ; CHECK-NEXT: s_xor_b32 s0, s0, 1
174+ ; CHECK-NEXT: s_and_b32 s0, s0, 1
167175; CHECK-NEXT: s_cmp_lg_u32 s0, 0
168- ; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
169- ; CHECK-NEXT: ; %bb.1: ; %false
170- ; CHECK-NEXT: s_mov_b32 s0, 33
171- ; CHECK-NEXT: s_branch .LBB10_3
172- ; CHECK-NEXT: .LBB10_2: ; %true
176+ ; CHECK-NEXT: s_cbranch_scc1 .LBB10_2
177+ ; CHECK-NEXT: ; %bb.1: ; %true
173178; CHECK-NEXT: s_mov_b32 s0, 42
174179; CHECK-NEXT: s_branch .LBB10_3
180+ ; CHECK-NEXT: .LBB10_2: ; %false
181+ ; CHECK-NEXT: s_mov_b32 s0, 33
182+ ; CHECK-NEXT: s_branch .LBB10_3
175183; CHECK-NEXT: .LBB10_3:
176184 %c = trunc i32 %v to i1
177185 %ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
@@ -184,18 +192,27 @@ false:
184192}
185193
186194define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare (i32 %v ) {
187- ; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
188- ; CHECK: ; %bb.0:
189- ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
190- ; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
191- ; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
192- ; CHECK-NEXT: ; %bb.1: ; %true
193- ; CHECK-NEXT: s_mov_b32 s0, 42
194- ; CHECK-NEXT: s_branch .LBB11_3
195- ; CHECK-NEXT: .LBB11_2: ; %false
196- ; CHECK-NEXT: s_mov_b32 s0, 33
197- ; CHECK-NEXT: s_branch .LBB11_3
198- ; CHECK-NEXT: .LBB11_3:
195+ ; GFX10-LABEL: branch_divergent_ballot_ne_zero_compare:
196+ ; GFX10: ; %bb.0:
197+ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 12, v0
198+ ; GFX10-NEXT: s_mov_b32 s0, 42
199+ ; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
200+ ; GFX10-NEXT: ; %bb.1: ; %false
201+ ; GFX10-NEXT: s_mov_b32 s0, 33
202+ ; GFX10-NEXT: ; %bb.2: ; %UnifiedReturnBlock
203+ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
204+ ; GFX10-NEXT: ; return to shader part epilog
205+ ;
206+ ; GFX11-LABEL: branch_divergent_ballot_ne_zero_compare:
207+ ; GFX11: ; %bb.0:
208+ ; GFX11-NEXT: s_mov_b32 s0, 42
209+ ; GFX11-NEXT: s_mov_b32 s1, exec_lo
210+ ; GFX11-NEXT: v_cmpx_le_u32_e32 12, v0
211+ ; GFX11-NEXT: ; %bb.1: ; %false
212+ ; GFX11-NEXT: s_mov_b32 s0, 33
213+ ; GFX11-NEXT: ; %bb.2: ; %UnifiedReturnBlock
214+ ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
215+ ; GFX11-NEXT: ; return to shader part epilog
199216 %c = icmp ult i32 %v , 12
200217 %ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
201218 %ballot_ne_zero = icmp ne i32 %ballot , 0
@@ -209,11 +226,7 @@ false:
209226define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare (i32 inreg %v ) {
210227; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
211228; CHECK: ; %bb.0:
212- ; CHECK-NEXT: s_cmp_lt_u32 s0, 12
213- ; CHECK-NEXT: s_cselect_b32 s0, 1, 0
214- ; CHECK-NEXT: s_and_b32 s0, 1, s0
215- ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
216- ; CHECK-NEXT: s_cmp_eq_u32 s0, 0
229+ ; CHECK-NEXT: s_cmp_ge_u32 s0, 12
217230; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
218231; CHECK-NEXT: ; %bb.1: ; %true
219232; CHECK-NEXT: s_mov_b32 s0, 42
@@ -233,18 +246,27 @@ false:
233246}
234247
235248define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare (i32 %v ) {
236- ; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
237- ; CHECK: ; %bb.0:
238- ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
239- ; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
240- ; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
241- ; CHECK-NEXT: ; %bb.1: ; %false
242- ; CHECK-NEXT: s_mov_b32 s0, 33
243- ; CHECK-NEXT: s_branch .LBB13_3
244- ; CHECK-NEXT: .LBB13_2: ; %true
245- ; CHECK-NEXT: s_mov_b32 s0, 42
246- ; CHECK-NEXT: s_branch .LBB13_3
247- ; CHECK-NEXT: .LBB13_3:
249+ ; GFX10-LABEL: branch_divergent_ballot_eq_zero_compare:
250+ ; GFX10: ; %bb.0:
251+ ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
252+ ; GFX10-NEXT: s_mov_b32 s0, 42
253+ ; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
254+ ; GFX10-NEXT: ; %bb.1: ; %false
255+ ; GFX10-NEXT: s_mov_b32 s0, 33
256+ ; GFX10-NEXT: ; %bb.2: ; %UnifiedReturnBlock
257+ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
258+ ; GFX10-NEXT: ; return to shader part epilog
259+ ;
260+ ; GFX11-LABEL: branch_divergent_ballot_eq_zero_compare:
261+ ; GFX11: ; %bb.0:
262+ ; GFX11-NEXT: s_mov_b32 s0, 42
263+ ; GFX11-NEXT: s_mov_b32 s1, exec_lo
264+ ; GFX11-NEXT: v_cmpx_gt_u32_e32 12, v0
265+ ; GFX11-NEXT: ; %bb.1: ; %false
266+ ; GFX11-NEXT: s_mov_b32 s0, 33
267+ ; GFX11-NEXT: ; %bb.2: ; %UnifiedReturnBlock
268+ ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
269+ ; GFX11-NEXT: ; return to shader part epilog
248270 %c = icmp ult i32 %v , 12
249271 %ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
250272 %ballot_eq_zero = icmp eq i32 %ballot , 0
@@ -259,17 +281,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
259281; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
260282; CHECK: ; %bb.0:
261283; CHECK-NEXT: s_cmp_lt_u32 s0, 12
262- ; CHECK-NEXT: s_cselect_b32 s0, 1, 0
263- ; CHECK-NEXT: s_and_b32 s0, 1, s0
264- ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
265- ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
266- ; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
267- ; CHECK-NEXT: ; %bb.1: ; %false
268- ; CHECK-NEXT: s_mov_b32 s0, 33
269- ; CHECK-NEXT: s_branch .LBB14_3
270- ; CHECK-NEXT: .LBB14_2: ; %true
284+ ; CHECK-NEXT: s_cbranch_scc1 .LBB14_2
285+ ; CHECK-NEXT: ; %bb.1: ; %true
271286; CHECK-NEXT: s_mov_b32 s0, 42
272287; CHECK-NEXT: s_branch .LBB14_3
288+ ; CHECK-NEXT: .LBB14_2: ; %false
289+ ; CHECK-NEXT: s_mov_b32 s0, 33
290+ ; CHECK-NEXT: s_branch .LBB14_3
273291; CHECK-NEXT: .LBB14_3:
274292 %c = icmp ult i32 %v , 12
275293 %ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
@@ -284,18 +302,16 @@ false:
284302define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and (i32 %v1 , i32 %v2 ) {
285303; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
286304; CHECK: ; %bb.0:
287- ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
288- ; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
289- ; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
290- ; CHECK-NEXT: s_cmp_eq_u32 s0, 0
291- ; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
292- ; CHECK-NEXT: ; %bb.1: ; %true
305+ ; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, 12, v0
306+ ; CHECK-NEXT: v_cmp_ge_u32_e64 s0, 34, v1
307+ ; CHECK-NEXT: s_or_b32 s2, vcc_lo, s0
293308; CHECK-NEXT: s_mov_b32 s0, 42
294- ; CHECK-NEXT: s_branch .LBB15_3
295- ; CHECK-NEXT: .LBB15_2 : ; %false
309+ ; CHECK-NEXT: s_and_saveexec_b32 s1, s2
310+ ; CHECK-NEXT: ; %bb.1 : ; %false
296311; CHECK-NEXT: s_mov_b32 s0, 33
297- ; CHECK-NEXT: s_branch .LBB15_3
298- ; CHECK-NEXT: .LBB15_3:
312+ ; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
313+ ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
314+ ; CHECK-NEXT: ; return to shader part epilog
299315 %v1c = icmp ult i32 %v1 , 12
300316 %v2c = icmp ugt i32 %v2 , 34
301317 %c = and i1 %v1c , %v2c
@@ -311,14 +327,12 @@ false:
311327define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and (i32 inreg %v1 , i32 inreg %v2 ) {
312328; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
313329; CHECK: ; %bb.0:
314- ; CHECK-NEXT: s_cmp_lt_u32 s0, 12
330+ ; CHECK-NEXT: s_cmp_ge_u32 s0, 12
315331; CHECK-NEXT: s_cselect_b32 s0, 1, 0
316- ; CHECK-NEXT: s_cmp_gt_u32 s1, 34
332+ ; CHECK-NEXT: s_cmp_le_u32 s1, 34
317333; CHECK-NEXT: s_cselect_b32 s1, 1, 0
318- ; CHECK-NEXT: s_and_b32 s0, s0, s1
319- ; CHECK-NEXT: s_and_b32 s0, 1, s0
320- ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
321- ; CHECK-NEXT: s_cmp_eq_u32 s0, 0
334+ ; CHECK-NEXT: s_or_b32 s0, s0, s1
335+ ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
322336; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
323337; CHECK-NEXT: ; %bb.1: ; %true
324338; CHECK-NEXT: s_mov_b32 s0, 42
@@ -344,16 +358,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
344358; CHECK: ; %bb.0:
345359; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
346360; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
347- ; CHECK-NEXT: s_and_b32 s0 , vcc_lo, s0
348- ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
349- ; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
361+ ; CHECK-NEXT: s_and_b32 s2 , vcc_lo, s0
362+ ; CHECK-NEXT: s_mov_b32 s0, 42
363+ ; CHECK-NEXT: s_and_saveexec_b32 s1, s2
350364; CHECK-NEXT: ; %bb.1: ; %false
351365; CHECK-NEXT: s_mov_b32 s0, 33
352- ; CHECK-NEXT: s_branch .LBB17_3
353- ; CHECK-NEXT: .LBB17_2: ; %true
354- ; CHECK-NEXT: s_mov_b32 s0, 42
355- ; CHECK-NEXT: s_branch .LBB17_3
356- ; CHECK-NEXT: .LBB17_3:
366+ ; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
367+ ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
368+ ; CHECK-NEXT: ; return to shader part epilog
357369 %v1c = icmp ult i32 %v1 , 12
358370 %v2c = icmp ugt i32 %v2 , 34
359371 %c = and i1 %v1c , %v2c
@@ -374,16 +386,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
374386; CHECK-NEXT: s_cmp_gt_u32 s1, 34
375387; CHECK-NEXT: s_cselect_b32 s1, 1, 0
376388; CHECK-NEXT: s_and_b32 s0, s0, s1
377- ; CHECK-NEXT: s_and_b32 s0, 1, s0
378- ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
379389; CHECK-NEXT: s_cmp_lg_u32 s0, 0
380- ; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
381- ; CHECK-NEXT: ; %bb.1: ; %false
382- ; CHECK-NEXT: s_mov_b32 s0, 33
383- ; CHECK-NEXT: s_branch .LBB18_3
384- ; CHECK-NEXT: .LBB18_2: ; %true
390+ ; CHECK-NEXT: s_cbranch_scc1 .LBB18_2
391+ ; CHECK-NEXT: ; %bb.1: ; %true
385392; CHECK-NEXT: s_mov_b32 s0, 42
386393; CHECK-NEXT: s_branch .LBB18_3
394+ ; CHECK-NEXT: .LBB18_2: ; %false
395+ ; CHECK-NEXT: s_mov_b32 s0, 33
396+ ; CHECK-NEXT: s_branch .LBB18_3
387397; CHECK-NEXT: .LBB18_3:
388398 %v1c = icmp ult i32 %v1 , 12
389399 %v2c = icmp ugt i32 %v2 , 34
0 commit comments