@@ -33,7 +33,8 @@ define amdgpu_cs i32 @non_compare(i32 %x) {
3333; CHECK-LABEL: non_compare:
3434; CHECK: ; %bb.0:
3535; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
36- ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
36+ ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
37+ ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
3738; CHECK-NEXT: ; return to shader part epilog
3839 %trunc = trunc i32 %x to i1
3940 %ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %trunc )
@@ -45,7 +46,8 @@ define amdgpu_cs i32 @non_compare(i32 %x) {
4546define amdgpu_cs i32 @compare_ints (i32 %x , i32 %y ) {
4647; CHECK-LABEL: compare_ints:
4748; CHECK: ; %bb.0:
48- ; CHECK-NEXT: v_cmp_eq_u32_e64 s0, v0, v1
49+ ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
50+ ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
4951; CHECK-NEXT: ; return to shader part epilog
5052 %cmp = icmp eq i32 %x , %y
5153 %ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %cmp )
@@ -55,7 +57,8 @@ define amdgpu_cs i32 @compare_ints(i32 %x, i32 %y) {
5557define amdgpu_cs i32 @compare_int_with_constant (i32 %x ) {
5658; CHECK-LABEL: compare_int_with_constant:
5759; CHECK: ; %bb.0:
58- ; CHECK-NEXT: v_cmp_le_i32_e64 s0, 0x63, v0
60+ ; CHECK-NEXT: v_cmp_le_i32_e32 vcc_lo, 0x63, v0
61+ ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
5962; CHECK-NEXT: ; return to shader part epilog
6063 %cmp = icmp sge i32 %x , 99
6164 %ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %cmp )
@@ -65,7 +68,8 @@ define amdgpu_cs i32 @compare_int_with_constant(i32 %x) {
6568define amdgpu_cs i32 @compare_floats (float %x , float %y ) {
6669; CHECK-LABEL: compare_floats:
6770; CHECK: ; %bb.0:
68- ; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v0, v1
71+ ; CHECK-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
72+ ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
6973; CHECK-NEXT: ; return to shader part epilog
7074 %cmp = fcmp ogt float %x , %y
7175 %ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %cmp )
@@ -76,7 +80,8 @@ define amdgpu_cs i32 @ctpop_of_ballot(float %x, float %y) {
7680; CHECK-LABEL: ctpop_of_ballot:
7781; CHECK: ; %bb.0:
7882; CHECK-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
79- ; CHECK-NEXT: s_bcnt1_i32_b32 s0, vcc_lo
83+ ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
84+ ; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
8085; CHECK-NEXT: ; return to shader part epilog
8186 %cmp = fcmp ogt float %x , %y
8287 %ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %cmp )
@@ -89,7 +94,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
8994; CHECK: ; %bb.0:
9095; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
9196; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
92- ; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
97+ ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
98+ ; CHECK-NEXT: s_cmp_eq_u32 s0, 0
9399; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
94100; CHECK-NEXT: ; %bb.1: ; %true
95101; CHECK-NEXT: s_mov_b32 s0, 42
@@ -113,6 +119,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
113119; CHECK: ; %bb.0:
114120; CHECK-NEXT: s_and_b32 s0, 1, s0
115121; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
122+ ; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
116123; CHECK-NEXT: s_cmp_eq_u32 s0, 0
117124; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
118125; CHECK-NEXT: ; %bb.1: ; %true
@@ -137,7 +144,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
137144; CHECK: ; %bb.0:
138145; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
139146; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
140- ; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
147+ ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
148+ ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
141149; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
142150; CHECK-NEXT: ; %bb.1: ; %false
143151; CHECK-NEXT: s_mov_b32 s0, 33
@@ -161,6 +169,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
161169; CHECK: ; %bb.0:
162170; CHECK-NEXT: s_and_b32 s0, 1, s0
163171; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
172+ ; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
164173; CHECK-NEXT: s_cmp_lg_u32 s0, 0
165174; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
166175; CHECK-NEXT: ; %bb.1: ; %false
@@ -184,7 +193,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
184193; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
185194; CHECK: ; %bb.0:
186195; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
187- ; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
196+ ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
197+ ; CHECK-NEXT: s_cmp_eq_u32 s0, 0
188198; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
189199; CHECK-NEXT: ; %bb.1: ; %true
190200; CHECK-NEXT: s_mov_b32 s0, 42
@@ -210,6 +220,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
210220; CHECK-NEXT: s_cselect_b32 s0, 1, 0
211221; CHECK-NEXT: s_and_b32 s0, 1, s0
212222; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
223+ ; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
213224; CHECK-NEXT: s_cmp_eq_u32 s0, 0
214225; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
215226; CHECK-NEXT: ; %bb.1: ; %true
@@ -233,7 +244,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
233244; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
234245; CHECK: ; %bb.0:
235246; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
236- ; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
247+ ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
248+ ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
237249; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
238250; CHECK-NEXT: ; %bb.1: ; %false
239251; CHECK-NEXT: s_mov_b32 s0, 33
@@ -259,6 +271,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
259271; CHECK-NEXT: s_cselect_b32 s0, 1, 0
260272; CHECK-NEXT: s_and_b32 s0, 1, s0
261273; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
274+ ; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
262275; CHECK-NEXT: s_cmp_lg_u32 s0, 0
263276; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
264277; CHECK-NEXT: ; %bb.1: ; %false
@@ -284,6 +297,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
284297; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
285298; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
286299; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
300+ ; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
287301; CHECK-NEXT: s_cmp_eq_u32 s0, 0
288302; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
289303; CHECK-NEXT: ; %bb.1: ; %true
@@ -315,6 +329,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg
315329; CHECK-NEXT: s_and_b32 s0, s0, s1
316330; CHECK-NEXT: s_and_b32 s0, 1, s0
317331; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
332+ ; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
318333; CHECK-NEXT: s_cmp_eq_u32 s0, 0
319334; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
320335; CHECK-NEXT: ; %bb.1: ; %true
@@ -342,6 +357,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
342357; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
343358; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
344359; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
360+ ; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
345361; CHECK-NEXT: s_cmp_lg_u32 s0, 0
346362; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
347363; CHECK-NEXT: ; %bb.1: ; %false
@@ -373,6 +389,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
373389; CHECK-NEXT: s_and_b32 s0, s0, s1
374390; CHECK-NEXT: s_and_b32 s0, 1, s0
375391; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
392+ ; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
376393; CHECK-NEXT: s_cmp_lg_u32 s0, 0
377394; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
378395; CHECK-NEXT: ; %bb.1: ; %false
@@ -401,6 +418,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
401418; CHECK-NEXT: s_cselect_b32 s0, 1, 0
402419; CHECK-NEXT: s_and_b32 s0, 1, s0
403420; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
421+ ; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
404422; CHECK-NEXT: s_cmp_le_i32 s0, 22
405423; CHECK-NEXT: s_cbranch_scc1 .LBB19_2
406424; CHECK-NEXT: ; %bb.1: ; %true
0 commit comments