@@ -178,6 +178,55 @@ exit:
178178 ret void
179179}
180180
; Test case: %neg.bool.counter is an i1 that is recomputed (inverted) on every
; iteration of %loop and is then read by two separate selects in %exit.
; The GFX10 lines below are the expected assembly for this function.
; NOTE(review): the check lines look auto-generated; regenerate them rather
; than hand-editing -- confirm against the header of the test file.
181+ define void @divergent_i1_xor_used_outside_loop_twice (float %val , float %pre.cond.val , ptr %addr , ptr %addr2 ) {
182+ ; GFX10-LABEL: divergent_i1_xor_used_outside_loop_twice:
183+ ; GFX10: ; %bb.0: ; %entry
184+ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
185+ ; GFX10-NEXT: s_mov_b32 s4, 0
186+ ; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1
187+ ; GFX10-NEXT: v_mov_b32_e32 v1, s4
188+ ; GFX10-NEXT: ; implicit-def: $sgpr6
189+ ; GFX10-NEXT: .LBB3_1: ; %loop
190+ ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
191+ ; GFX10-NEXT: v_cvt_f32_u32_e32 v6, v1
192+ ; GFX10-NEXT: s_xor_b32 s5, s5, -1
193+ ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
194+ ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v0
195+ ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
196+ ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
197+ ; GFX10-NEXT: s_and_b32 s7, exec_lo, s5
198+ ; GFX10-NEXT: s_or_b32 s6, s6, s7
199+ ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
200+ ; GFX10-NEXT: s_cbranch_execnz .LBB3_1
201+ ; GFX10-NEXT: ; %bb.2: ; %exit
202+ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
203+ ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
204+ ; GFX10-NEXT: v_cndmask_b32_e64 v1, -1.0, 2.0, s6
205+ ; GFX10-NEXT: flat_store_dword v[2:3], v0
206+ ; GFX10-NEXT: flat_store_dword v[4:5], v1
207+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
208+ ; GFX10-NEXT: s_setpc_b64 s[30:31]
209+ entry:
; Initial value of the toggled flag: ordered compare %pre.cond.val > 1.0.
210+ %pre.cond = fcmp ogt float %pre.cond.val , 1 .0
211+ br label %loop
212+ 
213+ loop:
; %counter starts at 0 and is incremented by 1 on each trip through %loop.
214+ %counter = phi i32 [ 0 , %entry ], [ %counter.plus.1 , %loop ]
; %bool.counter is %pre.cond on entry and the previous inverted value afterwards.
215+ %bool.counter = phi i1 [ %pre.cond , %entry ], [ %neg.bool.counter , %loop ]
; Invert the flag; this is the value consumed outside the loop.
216+ %neg.bool.counter = xor i1 %bool.counter , true
217+ %f.counter = uitofp i32 %counter to float
; Leave the loop once the counter, converted to float, is greater than %val.
218+ %cond = fcmp ogt float %f.counter , %val
219+ %counter.plus.1 = add i32 %counter , 1
220+ br i1 %cond , label %exit , label %loop
221+ 
222+ exit:
; First use outside the loop: store 1.0 if the flag is set, else 0.0, to %addr.
223+ %select = select i1 %neg.bool.counter , float 1 .000000e+00 , float 0 .000000e+00
224+ store float %select , ptr %addr
; Second use of the same i1: store 2.0 if the flag is set, else -1.0, to %addr2.
225+ %select2 = select i1 %neg.bool.counter , float 2 .000000e+00 , float -1 .000000e+00
226+ store float %select2 , ptr %addr2
227+ ret void
228+ }
229+
181230;void xor(int num_elts, int* a, int* addr) {
182231;for(int i=0; i<num_elts; ++i) {
183232; if(a[i]==0)
@@ -195,15 +244,15 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
195244; GFX10-NEXT: s_mov_b32 s5, 0
196245; GFX10-NEXT: s_mov_b32 s6, -1
197246; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
198- ; GFX10-NEXT: s_cbranch_execz .LBB3_6
247+ ; GFX10-NEXT: s_cbranch_execz .LBB4_6
199248; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader
200249; GFX10-NEXT: v_mov_b32_e32 v5, s5
201250; GFX10-NEXT: ; implicit-def: $sgpr6
202251; GFX10-NEXT: ; implicit-def: $sgpr7
203252; GFX10-NEXT: ; implicit-def: $sgpr8
204- ; GFX10-NEXT: s_branch .LBB3_3
205- ; GFX10-NEXT: .LBB3_2 : ; %Flow
206- ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
253+ ; GFX10-NEXT: s_branch .LBB4_3
254+ ; GFX10-NEXT: .LBB4_2 : ; %Flow
255+ ; GFX10-NEXT: ; in Loop: Header=BB4_3 Depth=1
207256; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
208257; GFX10-NEXT: s_xor_b32 s9, s8, -1
209258; GFX10-NEXT: s_and_b32 s10, exec_lo, s7
@@ -212,8 +261,8 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
212261; GFX10-NEXT: s_and_b32 s9, exec_lo, s9
213262; GFX10-NEXT: s_or_b32 s6, s6, s9
214263; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
215- ; GFX10-NEXT: s_cbranch_execz .LBB3_5
216- ; GFX10-NEXT: .LBB3_3 : ; %loop.start
264+ ; GFX10-NEXT: s_cbranch_execz .LBB4_5
265+ ; GFX10-NEXT: .LBB4_3 : ; %loop.start
217266; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
218267; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
219268; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo
@@ -228,9 +277,9 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
228277; GFX10-NEXT: s_waitcnt vmcnt(0)
229278; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
230279; GFX10-NEXT: s_and_saveexec_b32 s9, vcc_lo
231- ; GFX10-NEXT: s_cbranch_execz .LBB3_2
280+ ; GFX10-NEXT: s_cbranch_execz .LBB4_2
232281; GFX10-NEXT: ; %bb.4: ; %loop.cond
233- ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
282+ ; GFX10-NEXT: ; in Loop: Header=BB4_3 Depth=1
234283; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v5
235284; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, v5, v0
236285; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo
@@ -240,20 +289,20 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
240289; GFX10-NEXT: s_and_b32 s11, exec_lo, vcc_lo
241290; GFX10-NEXT: s_or_b32 s8, s8, s10
242291; GFX10-NEXT: s_or_b32 s7, s7, s11
243- ; GFX10-NEXT: s_branch .LBB3_2
244- ; GFX10-NEXT: .LBB3_5 : ; %loop.exit.guard
292+ ; GFX10-NEXT: s_branch .LBB4_2
293+ ; GFX10-NEXT: .LBB4_5 : ; %loop.exit.guard
245294; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
246295; GFX10-NEXT: s_andn2_b32 s5, -1, exec_lo
247296; GFX10-NEXT: s_and_b32 s6, exec_lo, s6
248297; GFX10-NEXT: s_or_b32 s6, s5, s6
249- ; GFX10-NEXT: .LBB3_6 : ; %Flow1
298+ ; GFX10-NEXT: .LBB4_6 : ; %Flow1
250299; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
251300; GFX10-NEXT: s_and_saveexec_b32 s4, s6
252- ; GFX10-NEXT: s_cbranch_execz .LBB3_8
301+ ; GFX10-NEXT: s_cbranch_execz .LBB4_8
253302; GFX10-NEXT: ; %bb.7: ; %block.after.loop
254303; GFX10-NEXT: v_mov_b32_e32 v0, 5
255304; GFX10-NEXT: flat_store_dword v[3:4], v0
256- ; GFX10-NEXT: .LBB3_8 : ; %exit
305+ ; GFX10-NEXT: .LBB4_8 : ; %exit
257306; GFX10-NEXT: s_waitcnt_depctr 0xffe3
258307; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
259308; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -299,51 +348,51 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
299348; GFX10-NEXT: s_mov_b32 s5, 0
300349; GFX10-NEXT: ; implicit-def: $sgpr6
301350; GFX10-NEXT: v_mov_b32_e32 v4, s5
302- ; GFX10-NEXT: s_branch .LBB4_2
303- ; GFX10-NEXT: .LBB4_1 : ; %Flow
304- ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
351+ ; GFX10-NEXT: s_branch .LBB5_2
352+ ; GFX10-NEXT: .LBB5_1 : ; %Flow
353+ ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
305354; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
306355; GFX10-NEXT: s_and_b32 s4, exec_lo, s7
307356; GFX10-NEXT: s_or_b32 s5, s4, s5
308357; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo
309358; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo
310359; GFX10-NEXT: s_or_b32 s6, s4, s6
311360; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
312- ; GFX10-NEXT: s_cbranch_execz .LBB4_6
313- ; GFX10-NEXT: .LBB4_2 : ; %cond.block.0
361+ ; GFX10-NEXT: s_cbranch_execz .LBB5_6
362+ ; GFX10-NEXT: .LBB5_2 : ; %cond.block.0
314363; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
315364; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
316365; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo
317- ; GFX10-NEXT: s_cbranch_execz .LBB4_4
366+ ; GFX10-NEXT: s_cbranch_execz .LBB5_4
318367; GFX10-NEXT: ; %bb.3: ; %if.block.0
319- ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
368+ ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
320369; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4
321370; GFX10-NEXT: v_lshlrev_b64 v[8:9], 2, v[4:5]
322371; GFX10-NEXT: v_add_co_u32 v8, s4, v2, v8
323372; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v3, v9, s4
324373; GFX10-NEXT: global_store_dword v[8:9], v4, off
325- ; GFX10-NEXT: .LBB4_4 : ; %loop.break.block
326- ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
374+ ; GFX10-NEXT: .LBB5_4 : ; %loop.break.block
375+ ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
327376; GFX10-NEXT: s_waitcnt_depctr 0xffe3
328377; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
329378; GFX10-NEXT: v_cmp_ne_u32_e64 s4, v1, v4
330379; GFX10-NEXT: s_mov_b32 s7, -1
331380; GFX10-NEXT: s_and_saveexec_b32 s8, s4
332- ; GFX10-NEXT: s_cbranch_execz .LBB4_1
381+ ; GFX10-NEXT: s_cbranch_execz .LBB5_1
333382; GFX10-NEXT: ; %bb.5: ; %loop.cond
334- ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
383+ ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
335384; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4
336385; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo
337386; GFX10-NEXT: s_and_b32 s7, exec_lo, 0
338387; GFX10-NEXT: s_or_b32 s7, s4, s7
339- ; GFX10-NEXT: s_branch .LBB4_1
340- ; GFX10-NEXT: .LBB4_6 : ; %cond.block.1
388+ ; GFX10-NEXT: s_branch .LBB5_1
389+ ; GFX10-NEXT: .LBB5_6 : ; %cond.block.1
341390; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
342391; GFX10-NEXT: s_and_saveexec_b32 s4, s6
343- ; GFX10-NEXT: s_cbranch_execz .LBB4_8
392+ ; GFX10-NEXT: s_cbranch_execz .LBB5_8
344393; GFX10-NEXT: ; %bb.7: ; %if.block.1
345394; GFX10-NEXT: global_store_dword v[6:7], v4, off
346- ; GFX10-NEXT: .LBB4_8 : ; %exit
395+ ; GFX10-NEXT: .LBB5_8 : ; %exit
347396; GFX10-NEXT: s_waitcnt_depctr 0xffe3
348397; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
349398; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -408,9 +457,9 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
408457; GFX10-NEXT: v_mov_b32_e32 v5, s0
409458; GFX10-NEXT: ; implicit-def: $sgpr1
410459; GFX10-NEXT: ; implicit-def: $sgpr2
411- ; GFX10-NEXT: s_branch .LBB5_2
412- ; GFX10-NEXT: .LBB5_1 : ; %loop.cond
413- ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
460+ ; GFX10-NEXT: s_branch .LBB6_2
461+ ; GFX10-NEXT: .LBB6_1 : ; %loop.cond
462+ ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
414463; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
415464; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, v5, v0
416465; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v5
@@ -421,16 +470,16 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
421470; GFX10-NEXT: s_or_b32 s3, s3, s4
422471; GFX10-NEXT: s_or_b32 s1, s1, s4
423472; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
424- ; GFX10-NEXT: s_cbranch_execz .LBB5_4
425- ; GFX10-NEXT: .LBB5_2 : ; %loop.start
473+ ; GFX10-NEXT: s_cbranch_execz .LBB6_4
474+ ; GFX10-NEXT: .LBB6_2 : ; %loop.start
426475; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
427476; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo
428477; GFX10-NEXT: s_and_b32 s4, exec_lo, s3
429478; GFX10-NEXT: s_or_b32 s2, s2, s4
430479; GFX10-NEXT: s_and_saveexec_b32 s4, s3
431- ; GFX10-NEXT: s_cbranch_execz .LBB5_1
480+ ; GFX10-NEXT: s_cbranch_execz .LBB6_1
432481; GFX10-NEXT: ; %bb.3: ; %is.eq.zero
433- ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
482+ ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
434483; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
435484; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo
436485; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6]
@@ -442,8 +491,8 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
442491; GFX10-NEXT: s_and_b32 s3, exec_lo, vcc_lo
443492; GFX10-NEXT: s_or_b32 s2, s2, s3
444493; GFX10-NEXT: ; implicit-def: $sgpr3
445- ; GFX10-NEXT: s_branch .LBB5_1
446- ; GFX10-NEXT: .LBB5_4 : ; %exit
494+ ; GFX10-NEXT: s_branch .LBB6_1
495+ ; GFX10-NEXT: .LBB6_4 : ; %exit
447496; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
448497; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s1
449498; GFX10-NEXT: flat_store_dword v[3:4], v0
@@ -484,9 +533,9 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
484533; GFX10-NEXT: ; implicit-def: $sgpr2
485534; GFX10-NEXT: ; implicit-def: $sgpr3
486535; GFX10-NEXT: v_mov_b32_e32 v6, s0
487- ; GFX10-NEXT: s_branch .LBB6_2
488- ; GFX10-NEXT: .LBB6_1 : ; %Flow
489- ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
536+ ; GFX10-NEXT: s_branch .LBB7_2
537+ ; GFX10-NEXT: .LBB7_1 : ; %Flow
538+ ; GFX10-NEXT: ; in Loop: Header=BB7_2 Depth=1
490539; GFX10-NEXT: s_waitcnt_depctr 0xffe3
491540; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
492541; GFX10-NEXT: s_and_b32 s4, exec_lo, s2
@@ -495,8 +544,8 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
495544; GFX10-NEXT: s_and_b32 s4, exec_lo, s3
496545; GFX10-NEXT: s_or_b32 s1, s1, s4
497546; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
498- ; GFX10-NEXT: s_cbranch_execz .LBB6_4
499- ; GFX10-NEXT: .LBB6_2 : ; %A
547+ ; GFX10-NEXT: s_cbranch_execz .LBB7_4
548+ ; GFX10-NEXT: .LBB7_2 : ; %A
500549; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
501550; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6
502551; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo
@@ -511,9 +560,9 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
511560; GFX10-NEXT: s_waitcnt vmcnt(0)
512561; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
513562; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
514- ; GFX10-NEXT: s_cbranch_execz .LBB6_1
563+ ; GFX10-NEXT: s_cbranch_execz .LBB7_1
515564; GFX10-NEXT: ; %bb.3: ; %loop.body
516- ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
565+ ; GFX10-NEXT: ; in Loop: Header=BB7_2 Depth=1
517566; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7
518567; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
519568; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6
@@ -529,16 +578,16 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
529578; GFX10-NEXT: s_waitcnt vmcnt(0)
530579; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9
531580; GFX10-NEXT: global_store_dword v[7:8], v9, off
532- ; GFX10-NEXT: s_branch .LBB6_1
533- ; GFX10-NEXT: .LBB6_4 : ; %loop.exit.guard
581+ ; GFX10-NEXT: s_branch .LBB7_1
582+ ; GFX10-NEXT: .LBB7_4 : ; %loop.exit.guard
534583; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
535584; GFX10-NEXT: s_and_saveexec_b32 s0, s1
536585; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
537- ; GFX10-NEXT: s_cbranch_execz .LBB6_6
586+ ; GFX10-NEXT: s_cbranch_execz .LBB7_6
538587; GFX10-NEXT: ; %bb.5: ; %break.body
539588; GFX10-NEXT: v_mov_b32_e32 v0, 10
540589; GFX10-NEXT: global_store_dword v[4:5], v0, off
541- ; GFX10-NEXT: .LBB6_6 : ; %exit
590+ ; GFX10-NEXT: .LBB7_6 : ; %exit
542591; GFX10-NEXT: s_endpgm
543592entry:
544593 br label %A
0 commit comments