diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 9051db0c01ed1..9e3011b05de65 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3123,6 +3123,17 @@ def IMMBitSelConst : SDNodeXForm> i32:$a)), -1)), + (V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 1), i32:$a), (i32 1)) +>; + +def : GCNPat < + (i1 (xor (i1 (DivergentUnaryFrag> i64:$a)), -1)), + (V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 1), + (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) +>; + def : GCNPat < (i1 (DivergentUnaryFrag (i32 (srl i32:$a, (i32 imm:$b))))), (V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 (IMMBitSelConst $b)), $a), diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll index 61c0b8b861d5b..41082821bafe3 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll @@ -6,12 +6,11 @@ define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GCN-NEXT: ; implicit-def: $vgpr5 ; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %F ; GCN-NEXT: s_mov_b32 s10, 0 @@ -101,11 +100,10 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %F ; GCN-NEXT: s_mov_b32 s10, 0 @@ -172,11 +170,10 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB2_2 ; GCN-NEXT: ; %bb.1: ; %F ; GCN-NEXT: s_mov_b32 s10, 0 @@ -249,11 +246,10 @@ define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %F ; GCN-NEXT: s_mov_b32 s10, 0 @@ -353,11 +349,10 @@ define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB4_2 ; GCN-NEXT: ; %bb.1: ; %F ; GCN-NEXT: s_mov_b32 s10, 0 @@ -424,11 +419,10 @@ define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB5_2 ; GCN-NEXT: ; %bb.1: ; %F ; GCN-NEXT: s_mov_b32 s10, 0 @@ -501,11 +495,10 @@ define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB6_2 ; GCN-NEXT: ; %bb.1: ; %F ; GCN-NEXT: s_mov_b32 s10, 0 diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll index 13184cf17a2e5..fd64ea3ae1c4b 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll @@ -6,8 +6,7 @@ define i32 @rocrand_regression(ptr addrspace(1) %arg, i32 %arg0, i1 %cmp7) { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v0, 1, v3 -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; CHECK-NEXT: s_xor_b64 s[4:5], vcc, -1 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: .LBB0_1: ; %do.body ; CHECK-NEXT: ; =>This Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 8ee52a828de65..d0a3811314029 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -102,9 +102,8 @@ define void @i1_arg_i1_use(i1 %arg) #0 { ; CIGFX89: ; %bb.0: ; %bb ; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CIGFX89-NEXT: v_and_b32_e32 v0, 1, v0 -; CIGFX89-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; CIGFX89-NEXT: s_xor_b64 s[6:7], vcc, -1 -; CIGFX89-NEXT: s_and_saveexec_b64 s[4:5], s[6:7] +; CIGFX89-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; CIGFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CIGFX89-NEXT: s_cbranch_execz .LBB3_2 ; CIGFX89-NEXT: ; %bb.1: ; %bb1 ; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 @@ -120,15 +119,14 @@ define void @i1_arg_i1_use(i1 %arg) #0 { ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: s_xor_b32 s1, vcc_lo, -1 -; GFX11-NEXT: s_and_saveexec_b32 s0, s1 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 1, v0 ; GFX11-NEXT: s_cbranch_execz .LBB3_2 ; GFX11-NEXT: ; %bb.1: ; %bb1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB3_2: ; %bb2 diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 8dbd6c5d133ea..56ceba258f471 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -11,37 +11,47 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: v_writelane_b32 v5, s30, 0 ; CHECK-NEXT: v_writelane_b32 v5, s31, 1 -; CHECK-NEXT: v_writelane_b32 v5, s34, 2 -; CHECK-NEXT: v_writelane_b32 v5, s35, 3 -; CHECK-NEXT: v_writelane_b32 v5, s36, 4 -; CHECK-NEXT: v_writelane_b32 v5, s37, 5 -; CHECK-NEXT: v_writelane_b32 v5, s38, 6 +; CHECK-NEXT: v_writelane_b32 v5, s36, 2 +; CHECK-NEXT: v_writelane_b32 v5, s37, 3 +; CHECK-NEXT: v_writelane_b32 v5, s38, 4 +; CHECK-NEXT: v_writelane_b32 v5, s39, 5 +; CHECK-NEXT: v_writelane_b32 v5, s48, 6 +; CHECK-NEXT: v_writelane_b32 v5, s49, 7 +; CHECK-NEXT: v_writelane_b32 v5, s50, 8 +; CHECK-NEXT: v_writelane_b32 v5, s51, 9 +; CHECK-NEXT: v_writelane_b32 v5, s52, 10 +; CHECK-NEXT: v_writelane_b32 v5, s53, 11 +; CHECK-NEXT: v_writelane_b32 v5, s54, 12 +; CHECK-NEXT: v_writelane_b32 v5, s55, 13 ; CHECK-NEXT: s_getpc_b64 s[24:25] -; CHECK-NEXT: v_writelane_b32 v5, s39, 7 -; CHECK-NEXT: s_movk_i32 s20, 0xf0 -; CHECK-NEXT: s_mov_b32 s21, s24 -; CHECK-NEXT: v_writelane_b32 v5, s48, 8 -; CHECK-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 -; CHECK-NEXT: s_mov_b64 s[20:21], 0 -; CHECK-NEXT: v_writelane_b32 v5, s49, 9 -; CHECK-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 -; CHECK-NEXT: v_writelane_b32 v5, s50, 10 +; CHECK-NEXT: v_writelane_b32 v5, s64, 14 +; CHECK-NEXT: s_movk_i32 s4, 0xf0 +; CHECK-NEXT: s_mov_b32 s5, s24 +; CHECK-NEXT: v_writelane_b32 v5, s65, 15 +; CHECK-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: v_writelane_b32 v5, s66, 16 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; CHECK-NEXT: v_writelane_b32 v5, s67, 17 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s22, 0x130 -; CHECK-NEXT: s_mov_b32 s23, s24 -; CHECK-NEXT: v_writelane_b32 v5, s51, 11 -; CHECK-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0 -; CHECK-NEXT: s_mov_b32 s28, 0 +; CHECK-NEXT: s_movk_i32 s6, 0x130 +; CHECK-NEXT: s_mov_b32 s7, s24 +; CHECK-NEXT: v_writelane_b32 v5, s68, 18 +; CHECK-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0 +; CHECK-NEXT: v_writelane_b32 v5, s69, 19 +; CHECK-NEXT: v_writelane_b32 v5, s70, 20 +; CHECK-NEXT: s_mov_b32 s68, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, s20 +; CHECK-NEXT: v_writelane_b32 v5, s71, 21 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, v1 -; CHECK-NEXT: s_mov_b32 s29, s28 -; CHECK-NEXT: s_mov_b32 s30, s28 -; CHECK-NEXT: s_mov_b32 s31, s28 -; CHECK-NEXT: image_sample_lz v3, v[2:3], s[12:19], s[28:31] dmask:0x1 +; CHECK-NEXT: s_mov_b32 s69, s68 +; CHECK-NEXT: s_mov_b32 s70, s68 +; CHECK-NEXT: s_mov_b32 s71, s68 +; CHECK-NEXT: image_sample_lz v3, v[2:3], s[16:23], s[68:71] dmask:0x1 ; CHECK-NEXT: v_mov_b32_e32 v2, v1 ; CHECK-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane -; CHECK-NEXT: v_writelane_b32 v5, s52, 12 +; CHECK-NEXT: s_mov_b32 s6, 48 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_writelane_b32 v6, s36, 0 ; CHECK-NEXT: v_writelane_b32 v6, s37, 1 @@ -49,57 +59,44 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v6, s39, 3 ; CHECK-NEXT: v_writelane_b32 v6, s40, 4 ; CHECK-NEXT: v_writelane_b32 v6, s41, 5 -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[28:31] dmask:0x1 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[68:71] dmask:0x1 ; CHECK-NEXT: v_writelane_b32 v6, s42, 6 ; CHECK-NEXT: v_writelane_b32 v6, s43, 7 ; CHECK-NEXT: v_writelane_b32 v6, s44, 8 ; CHECK-NEXT: v_writelane_b32 v6, s45, 9 -; CHECK-NEXT: v_writelane_b32 v5, s53, 13 ; CHECK-NEXT: v_writelane_b32 v6, s46, 10 -; CHECK-NEXT: v_writelane_b32 v5, s54, 14 ; CHECK-NEXT: v_writelane_b32 v6, s47, 11 -; CHECK-NEXT: v_writelane_b32 v5, s55, 15 ; CHECK-NEXT: v_writelane_b32 v6, s48, 12 -; CHECK-NEXT: v_writelane_b32 v5, s64, 16 ; CHECK-NEXT: v_writelane_b32 v6, s49, 13 -; CHECK-NEXT: v_writelane_b32 v5, s65, 17 ; CHECK-NEXT: v_writelane_b32 v6, s50, 14 -; CHECK-NEXT: v_writelane_b32 v5, s66, 18 -; CHECK-NEXT: v_writelane_b32 v6, s51, 15 -; CHECK-NEXT: s_mov_b32 s40, 48 ; CHECK-NEXT: s_movk_i32 s56, 0x1f0 -; CHECK-NEXT: s_movk_i32 s34, 0x2f0 -; CHECK-NEXT: s_mov_b32 s41, s24 +; CHECK-NEXT: s_movk_i32 s72, 0x2f0 ; CHECK-NEXT: s_mov_b32 s57, s24 -; CHECK-NEXT: s_mov_b32 s35, s24 -; CHECK-NEXT: v_writelane_b32 v5, s67, 19 -; CHECK-NEXT: s_load_dwordx8 s[20:27], s[40:41], 0x0 -; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_mov_b32 s73, s24 +; CHECK-NEXT: v_writelane_b32 v6, s51, 15 +; CHECK-NEXT: s_load_dwordx8 s[24:31], s[6:7], 0x0 ; CHECK-NEXT: s_load_dwordx16 s[36:51], s[56:57], 0x0 ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: s_load_dwordx16 s[52:67], s[34:35], 0x0 -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; CHECK-NEXT: v_writelane_b32 v5, s68, 20 -; CHECK-NEXT: s_xor_b64 s[72:73], vcc, -1 -; CHECK-NEXT: v_writelane_b32 v5, s69, 21 +; CHECK-NEXT: s_load_dwordx16 s[52:67], s[72:73], 0x0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mul_f32_e32 v0, v4, v3 -; CHECK-NEXT: s_and_saveexec_b64 vcc, s[72:73] -; CHECK-NEXT: s_xor_b64 s[34:35], exec, vcc +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %bb48 -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[12:19], s[28:31] dmask:0x1 +; CHECK-NEXT: image_sample_lz v3, v[1:2], s[16:23], s[68:71] dmask:0x1 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 ; CHECK-NEXT: .LBB0_2: ; %bb50 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_mov_b32 s29, s28 -; CHECK-NEXT: s_mov_b32 s30, s28 -; CHECK-NEXT: s_mov_b32 s31, s28 +; CHECK-NEXT: s_mov_b32 s69, s68 +; CHECK-NEXT: s_mov_b32 s70, s68 +; CHECK-NEXT: s_mov_b32 s71, s68 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[44:51], s[24:27] dmask:0x1 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[44:51], s[28:31] dmask:0x1 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[28:31] dmask:0x1 +; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_sub_f32_e32 v1, v1, v4 ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v0 @@ -107,11 +104,11 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 ; CHECK-NEXT: .LBB0_3: ; %Flow14 -; CHECK-NEXT: s_andn2_saveexec_b64 s[12:13], s[34:35] +; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.4: ; %bb32 -; CHECK-NEXT: s_and_saveexec_b64 s[14:15], s[72:73] -; CHECK-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; CHECK-NEXT: s_and_saveexec_b64 s[16:17], s[4:5] +; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[16:17] ; CHECK-NEXT: s_cbranch_execz .LBB0_6 ; CHECK-NEXT: ; %bb.5: ; %bb43 ; CHECK-NEXT: s_mov_b32 s16, 0 @@ -120,12 +117,12 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_mov_b32_e32 v3, s17 ; CHECK-NEXT: s_mov_b32 s18, s16 ; CHECK-NEXT: s_mov_b32 s19, s16 -; CHECK-NEXT: image_sample_lz v1, v[2:3], s[4:11], s[16:19] dmask:0x1 +; CHECK-NEXT: image_sample_lz v1, v[2:3], s[8:15], s[16:19] dmask:0x1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_mov_b64 s[4:5], s[36:37] -; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[40:41] -; CHECK-NEXT: s_mov_b64 s[10:11], s[42:43] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[38:39] +; CHECK-NEXT: s_mov_b64 s[12:13], s[40:41] +; CHECK-NEXT: s_mov_b64 s[14:15], s[42:43] ; CHECK-NEXT: v_readlane_b32 s36, v6, 0 ; CHECK-NEXT: v_readlane_b32 s44, v6, 8 ; CHECK-NEXT: v_readlane_b32 s45, v6, 9 @@ -140,32 +137,32 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s39, v6, 3 ; CHECK-NEXT: v_readlane_b32 s40, v6, 4 ; CHECK-NEXT: v_readlane_b32 s41, v6, 5 -; CHECK-NEXT: image_sample_lz v0, v[2:3], s[44:51], s[20:23] dmask:0x1 +; CHECK-NEXT: image_sample_lz v0, v[2:3], s[44:51], s[24:27] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s42, v6, 6 ; CHECK-NEXT: v_readlane_b32 s43, v6, 7 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: s_mov_b64 s[42:43], s[10:11] +; CHECK-NEXT: s_mov_b64 s[42:43], s[14:15] ; CHECK-NEXT: v_mov_b32_e32 v3, v2 -; CHECK-NEXT: s_mov_b64 s[40:41], s[8:9] -; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] -; CHECK-NEXT: s_mov_b64 s[36:37], s[4:5] +; CHECK-NEXT: s_mov_b64 s[40:41], s[12:13] +; CHECK-NEXT: s_mov_b64 s[38:39], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: buffer_store_dwordx3 v[1:3], off, s[16:19], 0 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; CHECK-NEXT: ; implicit-def: $vgpr0 ; CHECK-NEXT: .LBB0_6: ; %Flow12 -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[14:15] +; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB0_9 ; CHECK-NEXT: ; %bb.7: ; %bb33.preheader ; CHECK-NEXT: s_mov_b32 s8, 0 -; CHECK-NEXT: s_mov_b32 s6, s8 -; CHECK-NEXT: s_mov_b32 s7, s8 -; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: s_mov_b32 s12, s8 +; CHECK-NEXT: s_mov_b32 s13, s8 +; CHECK-NEXT: v_mov_b32_e32 v1, s12 ; CHECK-NEXT: s_mov_b32 s9, s8 ; CHECK-NEXT: s_mov_b32 s10, s8 ; CHECK-NEXT: s_mov_b32 s11, s8 -; CHECK-NEXT: v_mov_b32_e32 v2, s7 +; CHECK-NEXT: v_mov_b32_e32 v2, s13 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: image_sample_lz v3, v[1:2], s[36:43], s[8:11] dmask:0x1 ; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[8:11] dmask:0x1 @@ -183,28 +180,28 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: .LBB0_9: ; %Flow13 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock -; CHECK-NEXT: s_or_b64 exec, exec, s[12:13] -; CHECK-NEXT: v_readlane_b32 s69, v5, 21 -; CHECK-NEXT: v_readlane_b32 s68, v5, 20 +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: v_readlane_b32 s71, v5, 21 +; CHECK-NEXT: v_readlane_b32 s70, v5, 20 +; CHECK-NEXT: v_readlane_b32 s69, v5, 19 +; CHECK-NEXT: v_readlane_b32 s68, v5, 18 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_readlane_b32 s67, v5, 19 -; CHECK-NEXT: v_readlane_b32 s66, v5, 18 -; CHECK-NEXT: v_readlane_b32 s65, v5, 17 -; CHECK-NEXT: v_readlane_b32 s64, v5, 16 -; CHECK-NEXT: v_readlane_b32 s55, v5, 15 -; CHECK-NEXT: v_readlane_b32 s54, v5, 14 -; CHECK-NEXT: v_readlane_b32 s53, v5, 13 -; CHECK-NEXT: v_readlane_b32 s52, v5, 12 -; CHECK-NEXT: v_readlane_b32 s51, v5, 11 -; CHECK-NEXT: v_readlane_b32 s50, v5, 10 -; CHECK-NEXT: v_readlane_b32 s49, v5, 9 -; CHECK-NEXT: v_readlane_b32 s48, v5, 8 -; CHECK-NEXT: v_readlane_b32 s39, v5, 7 -; CHECK-NEXT: v_readlane_b32 s38, v5, 6 -; CHECK-NEXT: v_readlane_b32 s37, v5, 5 -; CHECK-NEXT: v_readlane_b32 s36, v5, 4 -; CHECK-NEXT: v_readlane_b32 s35, v5, 3 -; CHECK-NEXT: v_readlane_b32 s34, v5, 2 +; CHECK-NEXT: v_readlane_b32 s67, v5, 17 +; CHECK-NEXT: v_readlane_b32 s66, v5, 16 +; CHECK-NEXT: v_readlane_b32 s65, v5, 15 +; CHECK-NEXT: v_readlane_b32 s64, v5, 14 +; CHECK-NEXT: v_readlane_b32 s55, v5, 13 +; CHECK-NEXT: v_readlane_b32 s54, v5, 12 +; CHECK-NEXT: v_readlane_b32 s53, v5, 11 +; CHECK-NEXT: v_readlane_b32 s52, v5, 10 +; CHECK-NEXT: v_readlane_b32 s51, v5, 9 +; CHECK-NEXT: v_readlane_b32 s50, v5, 8 +; CHECK-NEXT: v_readlane_b32 s49, v5, 7 +; CHECK-NEXT: v_readlane_b32 s48, v5, 6 +; CHECK-NEXT: v_readlane_b32 s39, v5, 5 +; CHECK-NEXT: v_readlane_b32 s38, v5, 4 +; CHECK-NEXT: v_readlane_b32 s37, v5, 3 +; CHECK-NEXT: v_readlane_b32 s36, v5, 2 ; CHECK-NEXT: v_readlane_b32 s31, v5, 1 ; CHECK-NEXT: v_readlane_b32 s30, v5, 0 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 diff --git a/llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll b/llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll index 021c845d5ea6b..d75e9932bcd82 100644 --- a/llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll +++ b/llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll @@ -7,9 +7,8 @@ define amdgpu_ps float @kill_true(i1 %.not) { ; CHECK-NEXT: s_mov_b64 s[0:1], exec ; CHECK-NEXT: s_wqm_b64 exec, exec ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; CHECK-NEXT: s_xor_b64 s[4:5], vcc, -1 -; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc ; CHECK-NEXT: s_cbranch_execz .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %if1 ; CHECK-NEXT: s_mov_b32 s4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll index e70dc8f7a6576..e64ec9956860d 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll @@ -133,11 +133,10 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-SDAG-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GFX8-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX8-SDAG-NEXT: s_mov_b32 m0, -1 -; GFX8-SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX8-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GFX8-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_2 ; GFX8-SDAG-NEXT: ; %bb.1: ; %bb1 ; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 1 @@ -210,10 +209,9 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-SDAG-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb1 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 @@ -266,10 +264,9 @@ define void @func_uses_lds_multi(i1 %cond) { ; SDAG: ; %bb.0: ; %entry ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 -; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; SDAG-NEXT: s_xor_b64 s[4:5], vcc, -1 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB2_2 ; SDAG-NEXT: ; %bb.1: ; %bb1 ; SDAG-NEXT: v_mov_b32_e32 v0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll index 308ca34058f59..e37dcf60506be 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll @@ -100,26 +100,26 @@ define void @break_cond_is_arg(i32 %arg, i1 %breakcond) { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 ; GCN-NEXT: s_mov_b32 s10, 1 ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: s_branch .LBB2_2 ; GCN-NEXT: .LBB2_1: ; %endif ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5] -; GCN-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GCN-NEXT: s_and_b64 s[4:5], exec, vcc +; GCN-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GCN-NEXT: s_add_i32 s10, s10, 1 ; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN-NEXT: s_cbranch_execz .LBB2_4 ; GCN-NEXT: .LBB2_2: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_cmp_gt_u32_e32 vcc, s10, v0 -; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN-NEXT: v_cmp_gt_u32_e64 s[4:5], s10, v0 +; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB2_1 ; GCN-NEXT: ; %bb.3: ; %then ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], s4 ; GCN-NEXT: s_branch .LBB2_1 ; GCN-NEXT: .LBB2_4: ; %loopexit diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll index ba0f5cbf0a5f6..34a9624cb19eb 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll @@ -10,10 +10,9 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v1, 1, v1 ; CHECK-NEXT: v_and_b32_e32 v3, 1, v3 -; CHECK-NEXT: s_mov_b32 s5, 0 -; CHECK-NEXT: v_cmp_eq_u32_e64 s4, 1, v1 +; CHECK-NEXT: s_mov_b32 s6, 0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s4, 1, v1 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; CHECK-NEXT: s_xor_b32 s6, s4, -1 ; CHECK-NEXT: s_inst_prefetch 0x1 ; CHECK-NEXT: s_branch .LBB0_3 ; CHECK-NEXT: .p2align 6 @@ -24,19 +23,19 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: .LBB0_2: ; %Flow1 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 -; CHECK-NEXT: v_cmp_ne_u32_e64 s4, 0, v1 +; CHECK-NEXT: v_cmp_ne_u32_e64 s5, 0, v1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; j lastloop entry ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_or_b32 s5, s4, s5 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_or_b32 s6, s5, s6 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execz .LBB0_8 ; CHECK-NEXT: .LBB0_3: ; %for.body33 ; CHECK-NEXT: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB0_6 Depth 2 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_and_saveexec_b32 s7, s6 +; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_2 ; CHECK-NEXT: ; %bb.4: ; %for.body51.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 @@ -46,21 +45,21 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_5: ; %if.end118 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_add_i32 s9, s9, 4 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; backedge ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_add_nc_u32_e32 v3, s9, v2 -; CHECK-NEXT: v_cmp_ge_u32_e64 s4, v3, v0 -; CHECK-NEXT: s_or_b32 s8, s4, s8 +; CHECK-NEXT: v_cmp_ge_u32_e64 s5, v3, v0 +; CHECK-NEXT: s_or_b32 s8, s5, s8 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_cbranch_execz .LBB0_1 ; CHECK-NEXT: .LBB0_6: ; %for.body51 ; CHECK-NEXT: ; Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 -; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo +; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB0_5 ; CHECK-NEXT: ; %bb.7: ; %if.then112 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 @@ -71,7 +70,7 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: s_branch .LBB0_5 ; CHECK-NEXT: .LBB0_8: ; %for.body159.preheader ; CHECK-NEXT: s_inst_prefetch 0x2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_mov_b32 vcc_lo, exec_lo ; CHECK-NEXT: .LBB0_9: ; %for.body159 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll index 13f8eff94f86b..34de1e48bfb59 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -p simplifycfg,amdgpu-unify-divergent-exit-nodes %s -S -o - | FileCheck %s --check-prefix=OPT ; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA @@ -15,34 +15,39 @@ define void @nested_inf_loop(i1 %0, i1 %1) { ; OPT-NEXT: ret void ; ; ISA-LABEL: nested_inf_loop: -; ISA-NEXT: %bb.0: ; %BB -; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; ISA-NEXT: v_and_b32_e32 v1, 1, v1 -; ISA-NEXT: v_and_b32_e32 v0, 1, v0 -; ISA-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 -; ISA-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; ISA-NEXT: s_xor_b64 s[6:7], vcc, -1 -; ISA-NEXT: s_mov_b64 s[8:9], 0 -; ISA-NEXT: .LBB0_1: ; %BB1 -; ISA: s_and_b64 s[10:11], exec, s[6:7] -; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] -; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9] -; ISA-NEXT: s_cbranch_execnz .LBB0_1 -; ISA-NEXT: %bb.2: ; %BB2 -; ISA: s_or_b64 exec, exec, s[8:9] -; ISA-NEXT: s_mov_b64 s[8:9], 0 -; ISA-NEXT: .LBB0_3: ; %BB4 -; ISA: s_and_b64 s[10:11], exec, s[4:5] -; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] -; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9] -; ISA-NEXT: s_cbranch_execnz .LBB0_3 -; ISA-NEXT: %bb.4: ; %loop.exit.guard -; ISA: s_or_b64 exec, exec, s[8:9] -; ISA-NEXT: s_mov_b64 vcc, 0 -; ISA-NEXT: s_mov_b64 s[8:9], 0 -; ISA-NEXT: s_branch .LBB0_1 -; ISA-NEXT: %bb.5: ; %DummyReturnBlock -; ISA-NEXT: s_setpc_b64 s[30:31] +; ISA: ; %bb.0: ; %BB +; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ISA-NEXT: v_and_b32_e32 v1, 1, v1 +; ISA-NEXT: v_and_b32_e32 v0, 1, v0 +; ISA-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 +; ISA-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, v0 +; ISA-NEXT: s_mov_b64 s[8:9], 0 +; ISA-NEXT: .LBB0_1: ; %BB1 +; ISA-NEXT: ; =>This Loop Header: Depth=1 +; ISA-NEXT: ; Child Loop BB0_3 Depth 2 +; ISA-NEXT: s_and_b64 s[10:11], exec, s[6:7] +; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9] +; ISA-NEXT: s_cbranch_execnz .LBB0_1 +; ISA-NEXT: ; %bb.2: ; %BB2 +; ISA-NEXT: ; in Loop: Header=BB0_1 Depth=1 +; ISA-NEXT: s_or_b64 exec, exec, s[8:9] +; ISA-NEXT: s_mov_b64 s[8:9], 0 +; ISA-NEXT: .LBB0_3: ; %BB4 +; ISA-NEXT: ; Parent Loop BB0_1 Depth=1 +; ISA-NEXT: ; => This Inner Loop Header: Depth=2 +; ISA-NEXT: s_and_b64 s[10:11], exec, s[4:5] +; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9] +; ISA-NEXT: s_cbranch_execnz .LBB0_3 +; ISA-NEXT: ; %bb.4: ; %loop.exit.guard +; ISA-NEXT: ; in Loop: Header=BB0_1 Depth=1 +; ISA-NEXT: s_or_b64 exec, exec, s[8:9] +; ISA-NEXT: s_mov_b64 vcc, 0 +; ISA-NEXT: s_mov_b64 s[8:9], 0 +; ISA-NEXT: s_branch .LBB0_1 +; ISA-NEXT: ; %bb.5: ; %DummyReturnBlock +; ISA-NEXT: s_setpc_b64 s[30:31] BB: br label %BB1 diff --git a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll index d34769ad0fcf0..761ff7786b98e 100644 --- a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll +++ b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll @@ -14,10 +14,9 @@ define void @__omp_offloading_35_36570d3__ZN6openmc31process_advance_particle_ev ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: global_load_dwordx2 v[1:2], v[1:2], off ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: s_cbranch_execnz .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %Flow ; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]