From 5fda6ec7064674fea17781e93f9b8c7c52a59e58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?=
Date: Mon, 6 Jan 2025 16:48:55 +0100
Subject: [PATCH] [AMDGPU][SIPreEmitPeephole] Missing condition in
 mustRetainExeczBranch

If the code in the "then" block modifies the exec mask, we must retain
the s_cbranch_execz branch.

Consider this example:

  s_cbranch_execz after
  s_or_b32 exec_lo, exec_lo, -1
after:
  ...

If the branch is removed, exec is never zero when control reaches
"after", because the s_or_b32 unconditionally sets every lane bit.
Before the transformation, exec could have been zero on that path.
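For reference, here is a rough sketch of where the new test sits in the
per-instruction scan performed by mustRetainExeczBranch. This is
illustrative only: the loop header and the ThenBlock name are
placeholders, not the committed code (see the diff below for the actual
change):

  // Scan the instructions that would stop being skipped if the
  // s_cbranch_execz were removed.
  for (const MachineInstr &MI : *ThenBlock) {
    // Meta instructions (debug info, CFI, ...) have no runtime effect.
    if (MI.isMetaInstruction())
      continue;

    // New: if the block writes EXEC, skipping the block (EXEC == 0)
    // and executing it leave EXEC in different states, so the branch
    // must stay.
    if (MI.modifiesRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
      return true;

    // Pre-existing: instructions with side effects even when EXEC is
    // zero also force the branch to be kept.
    if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
      return true;
  }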
---
 llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp  |  3 ++
 llvm/test/CodeGen/AMDGPU/cse-convergent.ll    |  3 +-
 .../AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll | 35 +++++++++++++------
 .../AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll |  7 ++--
 .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll |  3 +-
 llvm/test/CodeGen/AMDGPU/wwm-reserved.ll      |  6 ++--
 6 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 2bb70c138a50c..8c074f72fb02e 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -369,6 +369,9 @@ bool SIPreEmitPeephole::mustRetainExeczBranch(
       if (MI.isMetaInstruction())
         continue;
 
+      if (MI.modifiesRegister(AMDGPU::EXEC, nullptr))
+        return true;
+
       if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
         return true;
 
diff --git a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
index 7aca63d34f51b..52f1ed7e99116 100644
--- a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
+++ b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
@@ -19,6 +19,7 @@ define i32 @test(i32 %val, i32 %cond) {
 ; GCN-NEXT: v_mov_b32_e32 v4, v2
 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GCN-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GCN-NEXT: s_cbranch_execz .LBB0_2
 ; GCN-NEXT: ; %bb.1: ; %if
 ; GCN-NEXT: s_or_saveexec_b32 s5, -1
 ; GCN-NEXT: v_mov_b32_e32 v2, 0
@@ -26,7 +27,7 @@ define i32 @test(i32 %val, i32 %cond) {
 ; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GCN-NEXT: s_mov_b32 exec_lo, s5
 ; GCN-NEXT: v_mov_b32_e32 v5, v2
-; GCN-NEXT: ; %bb.2: ; %end
+; GCN-NEXT: .LBB0_2: ; %end
 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4
 ; GCN-NEXT: v_add_nc_u32_e32 v0, v4, v5
 ; GCN-NEXT: s_xor_saveexec_b32 s4, -1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
index 353f4d90cad1f..1d1b075bdac60 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
@@ -119,6 +119,7 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal
 ; GISEL12-NEXT: s_mov_b32 s7, s4
 ; GISEL12-NEXT: s_wait_alu 0xfffe
 ; GISEL12-NEXT: s_and_saveexec_b32 s3, s8
+; GISEL12-NEXT: s_cbranch_execz .LBB1_2
 ; GISEL12-NEXT: ; %bb.1: ; %shader
 ; GISEL12-NEXT: s_or_saveexec_b32 s4, -1
 ; GISEL12-NEXT: s_wait_alu 0xfffe
@@ -129,7 +130,8 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal
 ; GISEL12-NEXT: s_mov_b32 exec_lo, s4
 ; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL12-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_add_nc_u32 v10, 42, v10
-; GISEL12-NEXT: ; %bb.2: ; %tail
+; GISEL12-NEXT: .LBB1_2: ; %tail
+; GISEL12-NEXT: s_wait_alu 0xfffe
 ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3
 ; GISEL12-NEXT: s_mov_b32 exec_lo, s5
 ; GISEL12-NEXT: s_wait_alu 0xfffe
@@ -148,6 +150,7 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal
 ; DAGISEL12-NEXT: s_mov_b32 s6, s3
 ; DAGISEL12-NEXT: s_wait_alu 0xfffe
 ; DAGISEL12-NEXT: s_and_saveexec_b32 s3, s8
+; DAGISEL12-NEXT: s_cbranch_execz .LBB1_2
 ; DAGISEL12-NEXT: ; %bb.1: ; %shader
 ; DAGISEL12-NEXT: s_or_saveexec_b32 s4, -1
 ; DAGISEL12-NEXT: s_wait_alu 0xfffe
@@ -156,7 +159,8 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal
 ; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0
 ; DAGISEL12-NEXT: s_mov_b32 exec_lo, s4
 ; DAGISEL12-NEXT: v_dual_mov_b32 v11, s8 :: v_dual_add_nc_u32 v10, 42, v10
-; DAGISEL12-NEXT: ; %bb.2: ; %tail
+; DAGISEL12-NEXT: .LBB1_2: ; %tail
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
 ; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3
 ; DAGISEL12-NEXT: s_mov_b32 exec_lo, s5
 ; DAGISEL12-NEXT: s_wait_alu 0xfffe
@@ -171,6 +175,7 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal
 ; GISEL10-NEXT: s_mov_b32 s6, s3
 ; GISEL10-NEXT: s_mov_b32 s7, s4
 ; GISEL10-NEXT: s_and_saveexec_b32 s3, s8
+; GISEL10-NEXT: s_cbranch_execz .LBB1_2
 ; GISEL10-NEXT: ; %bb.1: ; %shader
 ; GISEL10-NEXT: s_or_saveexec_b32 s4, -1
 ; GISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v10, s4
@@ -179,7 +184,7 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal
 ; GISEL10-NEXT: s_mov_b32 exec_lo, s4
 ; GISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v10
 ; GISEL10-NEXT: v_mov_b32_e32 v11, v0
-; GISEL10-NEXT: ; %bb.2: ; %tail
+; GISEL10-NEXT: .LBB1_2: ; %tail
 ; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s3
 ; GISEL10-NEXT: s_mov_b32 exec_lo, s5
 ; GISEL10-NEXT: s_setpc_b64 s[6:7]
@@ -193,6 +198,7 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal
 ; DAGISEL10-NEXT: s_mov_b32 s7, s4
 ; DAGISEL10-NEXT: s_mov_b32 s6, s3
 ; DAGISEL10-NEXT: s_and_saveexec_b32 s3, s8
+; DAGISEL10-NEXT: s_cbranch_execz .LBB1_2
 ; DAGISEL10-NEXT: ; %bb.1: ; %shader
 ; DAGISEL10-NEXT: s_or_saveexec_b32 s4, -1
 ; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v10, s4
@@ -200,7 +206,7 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal
 ; DAGISEL10-NEXT: s_mov_b32 exec_lo, s4
 ; DAGISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v10
 ; DAGISEL10-NEXT: v_mov_b32_e32 v11, s8
-; DAGISEL10-NEXT: ; %bb.2: ; %tail
+; DAGISEL10-NEXT: .LBB1_2: ; %tail
 ; DAGISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s3
 ; DAGISEL10-NEXT: s_mov_b32 exec_lo, s5
 ; DAGISEL10-NEXT: s_setpc_b64 s[6:7]
@@ -240,6 +246,7 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg %
 ; GISEL12-NEXT: s_mov_b32 s7, s4
 ; GISEL12-NEXT: s_wait_alu 0xfffe
 ; GISEL12-NEXT: s_and_saveexec_b32 s3, s8
+; GISEL12-NEXT: s_cbranch_execz .LBB2_2
 ; GISEL12-NEXT: ; %bb.1: ; %shader
 ; GISEL12-NEXT: s_or_saveexec_b32 s4, -1
 ; GISEL12-NEXT: s_wait_alu 0xfffe
@@ -250,7 +257,8 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg %
 ; GISEL12-NEXT: s_mov_b32 exec_lo, s4
 ; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL12-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_add_nc_u32 v10, 42, v12
-; GISEL12-NEXT: ; %bb.2: ; %tail
+; GISEL12-NEXT: .LBB2_2: ; %tail
+; GISEL12-NEXT: s_wait_alu 0xfffe
 ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3
 ; GISEL12-NEXT: s_mov_b32 exec_lo, s5
 ; GISEL12-NEXT: s_wait_alu 0xfffe
@@ -268,6 +276,7 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg %
 ; DAGISEL12-NEXT: s_mov_b32 s6, s3
 ; DAGISEL12-NEXT: s_wait_alu 0xfffe
 ; DAGISEL12-NEXT: s_and_saveexec_b32 s3, s8
+; DAGISEL12-NEXT: s_cbranch_execz .LBB2_2
 ; DAGISEL12-NEXT: ; %bb.1: ; %shader
 ; DAGISEL12-NEXT: s_or_saveexec_b32 s4, -1
 ; DAGISEL12-NEXT: s_wait_alu 0xfffe
@@ -276,7 +285,8 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg %
 ; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0
 ; DAGISEL12-NEXT: s_mov_b32 exec_lo, s4
 ; DAGISEL12-NEXT: v_dual_mov_b32 v11, s8 :: v_dual_add_nc_u32 v10, 42, v12
-; DAGISEL12-NEXT: ; %bb.2: ; %tail
+; DAGISEL12-NEXT: .LBB2_2: ; %tail
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
 ; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3
 ; DAGISEL12-NEXT: s_mov_b32 exec_lo, s5
 ; DAGISEL12-NEXT: s_wait_alu 0xfffe
@@ -289,6 +299,7 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg %
 ; GISEL10-NEXT: s_mov_b32 s6, s3
 ; GISEL10-NEXT: s_mov_b32 s7, s4
 ; GISEL10-NEXT: s_and_saveexec_b32 s3, s8
+; GISEL10-NEXT: s_cbranch_execz .LBB2_2
 ; GISEL10-NEXT: ; %bb.1: ; %shader
 ; GISEL10-NEXT: s_or_saveexec_b32 s4, -1
 ; GISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v12, s4
@@ -297,7 +308,7 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg %
 ; GISEL10-NEXT: s_mov_b32 exec_lo, s4
 ; GISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v12
 ; GISEL10-NEXT: v_mov_b32_e32 v11, v0
-; GISEL10-NEXT: ; %bb.2: ; %tail
+; GISEL10-NEXT: .LBB2_2: ; %tail
 ; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s3
 ; GISEL10-NEXT: s_mov_b32 exec_lo, s5
 ; GISEL10-NEXT: s_setpc_b64 s[6:7]
@@ -309,6 +320,7 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg %
 ; DAGISEL10-NEXT: s_mov_b32 s7, s4
 ; DAGISEL10-NEXT: s_mov_b32 s6, s3
 ; DAGISEL10-NEXT: s_and_saveexec_b32 s3, s8
+; DAGISEL10-NEXT: s_cbranch_execz .LBB2_2
 ; DAGISEL10-NEXT: ; %bb.1: ; %shader
 ; DAGISEL10-NEXT: s_or_saveexec_b32 s4, -1
 ; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v12, s4
@@ -316,7 +328,7 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg %
 ; DAGISEL10-NEXT: s_mov_b32 exec_lo, s4
 ; DAGISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v12
 ; DAGISEL10-NEXT: v_mov_b32_e32 v11, s8
-; DAGISEL10-NEXT: ; %bb.2: ; %tail
+; DAGISEL10-NEXT: .LBB2_2: ; %tail
 ; DAGISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s3
 ; DAGISEL10-NEXT: s_mov_b32 exec_lo, s5
 ; DAGISEL10-NEXT: s_setpc_b64 s[6:7]
@@ -390,6 +402,7 @@ define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %call
 ; GISEL12-NEXT: v_cmpx_lt_i32_e64 v12, v13
 ; GISEL12-NEXT: s_wait_alu 0xfffe
 ; GISEL12-NEXT: s_xor_b32 s3, exec_lo, s3
+; GISEL12-NEXT: s_cbranch_execz .LBB3_6
 ; GISEL12-NEXT: ; %bb.5: ; %tail.else
 ; GISEL12-NEXT: s_or_saveexec_b32 s4, -1
 ; GISEL12-NEXT: v_mov_b32_e32 v0, 15
@@ -397,7 +410,8 @@ define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %call
 ; GISEL12-NEXT: s_mov_b32 exec_lo, s4
 ; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL12-NEXT: v_mov_b32_e32 v8, v0
-; GISEL12-NEXT: ; %bb.6: ; %Flow
+; GISEL12-NEXT: .LBB3_6: ; %Flow
+; GISEL12-NEXT: s_wait_alu 0xfffe
 ; GISEL12-NEXT: s_and_not1_saveexec_b32 s3, s3
 ; GISEL12-NEXT: ; %bb.7: ; %tail.then
 ; GISEL12-NEXT: s_mov_b32 s4, 44
@@ -501,12 +515,13 @@ define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %call
 ; GISEL10-NEXT: ; implicit-def: $vgpr8
 ; GISEL10-NEXT: v_cmpx_lt_i32_e64 v12, v13
 ; GISEL10-NEXT: s_xor_b32 s3, exec_lo, s3
+; GISEL10-NEXT: s_cbranch_execz .LBB3_6
 ; GISEL10-NEXT: ; %bb.5: ; %tail.else
 ; GISEL10-NEXT: s_or_saveexec_b32 s4, -1
 ; GISEL10-NEXT: v_mov_b32_e32 v0, 15
 ; GISEL10-NEXT: s_mov_b32 exec_lo, s4
 ; GISEL10-NEXT: v_mov_b32_e32 v8, v0
-; GISEL10-NEXT: ; %bb.6: ; %Flow
+; GISEL10-NEXT: .LBB3_6: ; %Flow
 ; GISEL10-NEXT: s_andn2_saveexec_b32 s3, s3
 ; GISEL10-NEXT: ; %bb.7: ; %tail.then
 ; GISEL10-NEXT: s_mov_b32 s4, 44
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll
index 1b1c89d9f5ad2..2fd7b07e265cf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll
@@ -57,6 +57,7 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64
 ; DAGISEL12-NEXT: s_mov_b32 s4, s3
 ; DAGISEL12-NEXT: s_wait_alu 0xfffe
 ; DAGISEL12-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
+; DAGISEL12-NEXT: s_cbranch_execz .LBB0_2
 ; DAGISEL12-NEXT: ; %bb.1: ; %shader
 ; DAGISEL12-NEXT: s_or_saveexec_b64 s[10:11], -1
 ; DAGISEL12-NEXT: s_wait_alu 0xfffe
@@ -68,7 +69,8 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64
 ; DAGISEL12-NEXT: v_add_nc_u32_e32 v10, 42, v13
 ; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_3)
 ; DAGISEL12-NEXT: v_mov_b32_e32 v12, s13
-; DAGISEL12-NEXT: ; %bb.2: ; %tail
+; DAGISEL12-NEXT: .LBB0_2: ; %tail
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
 ; DAGISEL12-NEXT: s_or_b64 exec, exec, s[8:9]
 ; DAGISEL12-NEXT: s_mov_b64 exec, s[6:7]
 ; DAGISEL12-NEXT: s_wait_alu 0xfffe
@@ -108,6 +110,7 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64
 ; DAGISEL10-NEXT: s_mov_b32 s5, s4
 ; DAGISEL10-NEXT: s_mov_b32 s4, s3
 ; DAGISEL10-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
+; DAGISEL10-NEXT: s_cbranch_execz .LBB0_2
 ; DAGISEL10-NEXT: ; %bb.1: ; %shader
 ; DAGISEL10-NEXT: s_or_saveexec_b64 s[10:11], -1
 ; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11]
@@ -116,7 +119,7 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64
 ; DAGISEL10-NEXT: v_mov_b32_e32 v11, s12
 ; DAGISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v13
 ; DAGISEL10-NEXT: v_mov_b32_e32 v12, s13
-; DAGISEL10-NEXT: ; %bb.2: ; %tail
+; DAGISEL10-NEXT: .LBB0_2: ; %tail
 ; DAGISEL10-NEXT: s_or_b64 exec, exec, s[8:9]
 ; DAGISEL10-NEXT: s_mov_b64 exec, s[6:7]
 ; DAGISEL10-NEXT: s_setpc_b64 s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 1089093ea691c..cc67a5fb2842b 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -264,6 +264,7 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-O3-NEXT: s_and_saveexec_b64 s[34:35], vcc
+; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2
 ; GFX9-O3-NEXT: ; %bb.1: ; %if
 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
@@ -273,7 +274,7 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
 ; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
 ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O3-NEXT: ; %bb.2: ; %merge
+; GFX9-O3-NEXT: .LBB1_2: ; %merge
 ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 08cc2e4ec7d79..f8005b3f256a7 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -230,6 +230,7 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
 ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2
 ; GFX9-O3-NEXT: ; %bb.1: ; %if
 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
@@ -239,7 +240,7 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
 ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
 ; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7]
 ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O3-NEXT: ; %bb.2: ; %merge
+; GFX9-O3-NEXT: .LBB1_2: ; %merge
 ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1082,6 +1083,7 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
 ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-O3-NEXT: s_cbranch_execz .LBB8_2
 ; GFX9-O3-NEXT: ; %bb.1: ; %if
 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
@@ -1091,7 +1093,7 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
 ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
 ; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7]
 ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O3-NEXT: ; %bb.2: ; %merge
+; GFX9-O3-NEXT: .LBB8_2: ; %merge
 ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
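Note: the CHECK lines in the tests above are in update_llc_test_checks
format; assuming the usual workflow for autogenerated AMDGPU tests, they
would be regenerated with the script rather than edited by hand, e.g.:

  llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/cse-convergent.ll

(the RUN lines and the full file list are not shown in this excerpt).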