diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index a4eab62f501ce..3160e38df5e3f 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -513,115 +513,117 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-LABEL: introduced_copy_to_sgpr: ; GFX908: ; %bb.0: ; %bb ; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX908-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 -; GFX908-NEXT: s_load_dword s7, s[8:9], 0x18 -; GFX908-NEXT: s_mov_b32 s6, 0 -; GFX908-NEXT: s_mov_b32 s9, s6 +; GFX908-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10 +; GFX908-NEXT: s_load_dword s0, s[8:9], 0x18 +; GFX908-NEXT: s_mov_b32 s12, 0 +; GFX908-NEXT: s_mov_b32 s9, s12 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX908-NEXT: s_sub_i32 s8, 0, s3 -; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s7 +; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX908-NEXT: s_sub_i32 s1, 0, s7 +; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s0 ; GFX908-NEXT: v_mov_b32_e32 v19, 0 ; GFX908-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: s_mul_i32 s8, s8, s10 -; GFX908-NEXT: s_mul_hi_u32 s8, s10, s8 -; GFX908-NEXT: s_add_i32 s10, s10, s8 -; GFX908-NEXT: s_mul_hi_u32 s8, s2, s10 -; GFX908-NEXT: s_mul_i32 s10, s8, s3 -; GFX908-NEXT: s_sub_i32 s2, s2, s10 -; GFX908-NEXT: s_add_i32 s11, s8, 1 -; GFX908-NEXT: s_sub_i32 s10, s2, s3 -; GFX908-NEXT: s_cmp_ge_u32 s2, s3 -; GFX908-NEXT: s_cselect_b32 s8, s11, s8 -; GFX908-NEXT: s_cselect_b32 s2, s10, s2 -; GFX908-NEXT: s_add_i32 s10, s8, 1 -; GFX908-NEXT: s_cmp_ge_u32 s2, s3 -; GFX908-NEXT: s_cselect_b32 s8, s10, s8 -; GFX908-NEXT: s_lshr_b32 s7, s7, 16 -; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s7 -; GFX908-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 -; GFX908-NEXT: s_lshl_b64 s[12:13], s[8:9], 5 -; GFX908-NEXT: s_lshl_b64 s[10:11], s[4:5], 5 -; GFX908-NEXT: s_or_b32 s10, s10, 28 +; GFX908-NEXT: v_readfirstlane_b32 s2, v2 +; GFX908-NEXT: s_mul_i32 s1, s1, s2 +; GFX908-NEXT: s_mul_hi_u32 s1, s2, s1 +; GFX908-NEXT: s_add_i32 s2, s2, s1 +; GFX908-NEXT: s_mul_hi_u32 s1, s6, s2 +; GFX908-NEXT: s_mul_i32 s2, s1, s7 +; GFX908-NEXT: s_sub_i32 s2, s6, s2 +; GFX908-NEXT: s_add_i32 s3, s1, 1 +; GFX908-NEXT: s_sub_i32 s6, s2, s7 +; GFX908-NEXT: s_cmp_ge_u32 s2, s7 +; GFX908-NEXT: s_cselect_b32 s1, s3, s1 +; GFX908-NEXT: s_cselect_b32 s2, s6, s2 +; GFX908-NEXT: s_add_i32 s3, s1, 1 +; GFX908-NEXT: s_cmp_ge_u32 s2, s7 +; GFX908-NEXT: s_cselect_b32 s8, s3, s1 +; GFX908-NEXT: s_lshr_b32 s2, s0, 16 +; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s2 +; GFX908-NEXT: s_lshl_b64 s[6:7], s[4:5], 5 +; GFX908-NEXT: s_lshl_b64 s[14:15], s[10:11], 5 +; GFX908-NEXT: s_and_b64 s[0:1], exec, s[0:1] +; GFX908-NEXT: s_or_b32 s14, s14, 28 +; GFX908-NEXT: s_lshl_b64 s[16:17], s[8:9], 5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s7, v16 -; GFX908-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX908-NEXT: s_mul_i32 s1, s1, s7 -; GFX908-NEXT: s_mul_hi_u32 s9, s0, s7 -; GFX908-NEXT: s_mul_i32 s0, s0, s7 -; GFX908-NEXT: s_add_i32 s1, s9, s1 -; GFX908-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 +; GFX908-NEXT: v_readfirstlane_b32 s2, v16 +; 
GFX908-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX908-NEXT: s_mul_i32 s3, s5, s2 +; GFX908-NEXT: s_mul_hi_u32 s5, s4, s2 +; GFX908-NEXT: s_mul_i32 s2, s4, s2 +; GFX908-NEXT: s_add_i32 s3, s5, s3 +; GFX908-NEXT: s_lshl_b64 s[4:5], s[2:3], 5 ; GFX908-NEXT: s_branch .LBB3_2 ; GFX908-NEXT: .LBB3_1: ; %Flow20 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX908-NEXT: s_cbranch_vccz .LBB3_12 ; GFX908-NEXT: .LBB3_2: ; %bb9 ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB3_5 Depth 2 -; GFX908-NEXT: s_mov_b64 s[16:17], -1 -; GFX908-NEXT: s_cbranch_scc0 .LBB3_10 +; GFX908-NEXT: s_mov_b64 s[18:19], -1 +; GFX908-NEXT: s_mov_b64 vcc, s[0:1] +; GFX908-NEXT: s_cbranch_vccz .LBB3_10 ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 -; GFX908-NEXT: s_mov_b32 s7, s6 -; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX908-NEXT: v_mov_b32_e32 v4, s6 -; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 -; GFX908-NEXT: v_mov_b32_e32 v9, s7 -; GFX908-NEXT: v_mov_b32_e32 v5, s7 -; GFX908-NEXT: v_mov_b32_e32 v7, s7 -; GFX908-NEXT: v_mov_b32_e32 v8, s6 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 +; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 +; GFX908-NEXT: s_mov_b32 s13, s12 +; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] +; GFX908-NEXT: v_mov_b32_e32 v4, s12 +; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v6 +; GFX908-NEXT: v_mov_b32_e32 v6, s12 +; GFX908-NEXT: v_mov_b32_e32 v8, s12 +; GFX908-NEXT: v_mov_b32_e32 v5, s13 +; GFX908-NEXT: v_mov_b32_e32 v7, s13 +; GFX908-NEXT: v_mov_b32_e32 v9, s13 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 -; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11] +; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15] ; GFX908-NEXT: v_mov_b32_e32 v10, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s7, v2 -; GFX908-NEXT: v_readfirstlane_b32 s9, v3 -; GFX908-NEXT: s_add_u32 s7, s7, 1 -; GFX908-NEXT: s_addc_u32 s9, s9, 0 -; GFX908-NEXT: s_mul_hi_u32 s20, s2, s7 -; GFX908-NEXT: s_mul_i32 s9, s2, s9 -; GFX908-NEXT: s_mul_i32 s21, s3, s7 -; GFX908-NEXT: s_add_i32 s9, s20, s9 -; GFX908-NEXT: s_mul_i32 s7, s2, s7 -; GFX908-NEXT: s_add_i32 s9, s9, s21 +; GFX908-NEXT: v_readfirstlane_b32 s9, v2 +; GFX908-NEXT: v_readfirstlane_b32 s13, v3 +; GFX908-NEXT: s_add_u32 s9, s9, 1 +; GFX908-NEXT: s_addc_u32 s13, s13, 0 +; GFX908-NEXT: s_mul_hi_u32 s22, s6, s9 +; GFX908-NEXT: s_mul_i32 s13, s6, s13 +; GFX908-NEXT: s_mul_i32 s23, s7, s9 +; GFX908-NEXT: s_add_i32 s13, s22, s13 +; GFX908-NEXT: s_mul_i32 s9, s6, s9 +; GFX908-NEXT: s_add_i32 s13, s13, s23 ; GFX908-NEXT: s_branch .LBB3_5 ; GFX908-NEXT: .LBB3_4: ; %bb58 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX908-NEXT: s_add_u32 s18, s18, s14 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[2:3] -; GFX908-NEXT: s_addc_u32 s19, s19, s15 -; GFX908-NEXT: s_mov_b64 s[20:21], 0 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23] +; GFX908-NEXT: s_add_u32 s20, s20, s4 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3] +; GFX908-NEXT: s_addc_u32 s21, s21, s5 +; GFX908-NEXT: s_mov_b64 s[22:23], 0 +; GFX908-NEXT: s_andn2_b64 vcc, exec, 
s[24:25] ; GFX908-NEXT: s_cbranch_vccz .LBB3_9 ; GFX908-NEXT: .LBB3_5: ; %bb16 ; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: s_add_u32 s20, s18, s7 -; GFX908-NEXT: s_addc_u32 s21, s19, s9 -; GFX908-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc +; GFX908-NEXT: s_add_u32 s22, s20, s9 +; GFX908-NEXT: s_addc_u32 s23, s21, s13 +; GFX908-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v20, v19, s[20:21] offset:-8 glc +; GFX908-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v12, v19, s[20:21] offset:-4 glc +; GFX908-NEXT: global_load_dword v12, v19, s[22:23] offset:-4 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v12, v19, s[20:21] glc +; GFX908-NEXT: global_load_dword v12, v19, s[22:23] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: ds_read_b64 v[12:13], v19 ; GFX908-NEXT: ds_read_b64 v[14:15], v0 -; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX908-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX908-NEXT: ; %bb.6: ; %bb51 @@ -648,28 +650,28 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_add_f32_e32 v11, v11, v13 ; GFX908-NEXT: s_branch .LBB3_4 ; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: s_mov_b64 s[20:21], s[16:17] -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21] +; GFX908-NEXT: s_mov_b64 s[22:23], s[18:19] +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23] ; GFX908-NEXT: s_cbranch_vccz .LBB3_4 ; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_mov_b64 s[20:21], -1 +; GFX908-NEXT: s_mov_b64 s[22:23], -1 ; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX908-NEXT: ; implicit-def: $sgpr18_sgpr19 +; GFX908-NEXT: ; implicit-def: $sgpr20_sgpr21 ; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_xor_b64 s[16:17], s[20:21], -1 +; GFX908-NEXT: s_xor_b64 s[18:19], s[22:23], -1 ; GFX908-NEXT: .LBB3_10: ; %Flow19 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_mov_b64 s[0:1], -1 -; GFX908-NEXT: s_and_b64 vcc, exec, s[16:17] +; GFX908-NEXT: s_mov_b64 s[2:3], -1 +; GFX908-NEXT: s_and_b64 vcc, exec, s[18:19] ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: ; %bb.11: ; %bb12 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_add_u32 s4, s4, s8 -; GFX908-NEXT: s_addc_u32 s5, s5, 0 -; GFX908-NEXT: s_add_u32 s10, s10, s12 -; GFX908-NEXT: s_addc_u32 s11, s11, s13 -; GFX908-NEXT: s_mov_b64 s[0:1], 0 +; GFX908-NEXT: s_add_u32 s10, s10, s8 +; GFX908-NEXT: s_addc_u32 s11, s11, 0 +; GFX908-NEXT: s_add_u32 s14, s14, s16 +; GFX908-NEXT: s_addc_u32 s15, s15, s17 +; GFX908-NEXT: s_mov_b64 s[2:3], 0 ; GFX908-NEXT: s_branch .LBB3_1 ; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock ; GFX908-NEXT: s_endpgm @@ -677,111 +679,113 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-LABEL: introduced_copy_to_sgpr: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: global_load_ushort v18, v[0:1], off glc -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 -; GFX90A-NEXT: s_load_dword s7, s[8:9], 0x18 -; GFX90A-NEXT: s_mov_b32 s6, 0 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GFX90A-NEXT: 
s_load_dwordx2 s[10:11], s[8:9], 0x10 +; GFX90A-NEXT: s_load_dword s0, s[8:9], 0x18 +; GFX90A-NEXT: s_mov_b32 s12, 0 +; GFX90A-NEXT: s_mov_b32 s9, s12 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX90A-NEXT: s_sub_i32 s8, 0, s3 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX90A-NEXT: s_sub_i32 s1, 0, s7 ; GFX90A-NEXT: v_mov_b32_e32 v19, 0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GFX90A-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v2 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s7 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v3 -; GFX90A-NEXT: s_mul_i32 s8, s8, s10 -; GFX90A-NEXT: s_mul_hi_u32 s8, s10, s8 -; GFX90A-NEXT: s_add_i32 s10, s10, s8 -; GFX90A-NEXT: s_mul_hi_u32 s8, s2, s10 -; GFX90A-NEXT: s_mul_i32 s10, s8, s3 -; GFX90A-NEXT: s_sub_i32 s2, s2, s10 -; GFX90A-NEXT: s_add_i32 s11, s8, 1 -; GFX90A-NEXT: s_sub_i32 s10, s2, s3 -; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s8, s11, s8 -; GFX90A-NEXT: s_cselect_b32 s2, s10, s2 -; GFX90A-NEXT: s_add_i32 s10, s8, 1 -; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s8, s10, s8 -; GFX90A-NEXT: s_lshr_b32 s7, s7, 16 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s7 -; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 -; GFX90A-NEXT: s_lshl_b64 s[12:13], s[8:9], 5 -; GFX90A-NEXT: s_lshl_b64 s[10:11], s[4:5], 5 -; GFX90A-NEXT: s_or_b32 s10, s10, 28 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s0 +; GFX90A-NEXT: v_readfirstlane_b32 s2, v3 +; GFX90A-NEXT: s_mul_i32 s1, s1, s2 +; GFX90A-NEXT: s_mul_hi_u32 s1, s2, s1 +; GFX90A-NEXT: s_add_i32 s2, s2, s1 +; GFX90A-NEXT: s_mul_hi_u32 s1, s6, s2 +; GFX90A-NEXT: s_mul_i32 s2, s1, s7 +; GFX90A-NEXT: s_sub_i32 s2, s6, s2 +; GFX90A-NEXT: s_add_i32 s3, s1, 1 +; GFX90A-NEXT: s_sub_i32 s6, s2, s7 +; GFX90A-NEXT: s_cmp_ge_u32 s2, s7 +; GFX90A-NEXT: s_cselect_b32 s1, s3, s1 +; GFX90A-NEXT: s_cselect_b32 s2, s6, s2 +; GFX90A-NEXT: s_add_i32 s3, s1, 1 +; GFX90A-NEXT: s_cmp_ge_u32 s2, s7 +; GFX90A-NEXT: s_cselect_b32 s8, s3, s1 +; GFX90A-NEXT: s_lshr_b32 s2, s0, 16 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s2 +; GFX90A-NEXT: s_lshl_b64 s[6:7], s[4:5], 5 +; GFX90A-NEXT: s_lshl_b64 s[14:15], s[10:11], 5 +; GFX90A-NEXT: s_and_b64 s[0:1], exec, s[0:1] +; GFX90A-NEXT: s_or_b32 s14, s14, 28 +; GFX90A-NEXT: s_lshl_b64 s[16:17], s[8:9], 5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s7, v18 -; GFX90A-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX90A-NEXT: s_mul_i32 s1, s1, s7 -; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s7 -; GFX90A-NEXT: s_mul_i32 s0, s0, s7 -; GFX90A-NEXT: s_add_i32 s1, s9, s1 -; GFX90A-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 +; GFX90A-NEXT: v_readfirstlane_b32 s2, v18 +; GFX90A-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX90A-NEXT: s_mul_i32 s3, s5, s2 +; GFX90A-NEXT: s_mul_hi_u32 s5, s4, s2 +; GFX90A-NEXT: s_mul_i32 s2, s4, s2 +; GFX90A-NEXT: s_add_i32 s3, s5, s3 +; GFX90A-NEXT: s_lshl_b64 s[4:5], s[2:3], 5 ; GFX90A-NEXT: s_branch .LBB3_2 ; GFX90A-NEXT: .LBB3_1: ; %Flow20 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_12 ; GFX90A-NEXT: .LBB3_2: ; %bb9 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB3_5 Depth 2 -; GFX90A-NEXT: s_mov_b64 s[16:17], -1 -; GFX90A-NEXT: s_cbranch_scc0 .LBB3_10 +; GFX90A-NEXT: s_mov_b64 s[18:19], -1 +; GFX90A-NEXT: s_mov_b64 vcc, s[0:1] +; GFX90A-NEXT: s_cbranch_vccz .LBB3_10 ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; 
GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 -; GFX90A-NEXT: s_mov_b32 s7, s6 -; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8 -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 -; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11] +; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 +; GFX90A-NEXT: s_mov_b32 s13, s12 +; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[2:3] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v8 +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0 +; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15] ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s7, v4 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v5 -; GFX90A-NEXT: s_add_u32 s7, s7, 1 -; GFX90A-NEXT: s_addc_u32 s9, s9, 0 -; GFX90A-NEXT: s_mul_hi_u32 s20, s2, s7 -; GFX90A-NEXT: s_mul_i32 s9, s2, s9 -; GFX90A-NEXT: s_mul_i32 s21, s3, s7 -; GFX90A-NEXT: s_add_i32 s9, s20, s9 -; GFX90A-NEXT: s_mul_i32 s7, s2, s7 -; GFX90A-NEXT: s_add_i32 s9, s9, s21 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v4 +; GFX90A-NEXT: v_readfirstlane_b32 s13, v5 +; GFX90A-NEXT: s_add_u32 s9, s9, 1 +; GFX90A-NEXT: s_addc_u32 s13, s13, 0 +; GFX90A-NEXT: s_mul_hi_u32 s22, s6, s9 +; GFX90A-NEXT: s_mul_i32 s13, s6, s13 +; GFX90A-NEXT: s_mul_i32 s23, s7, s9 +; GFX90A-NEXT: s_add_i32 s13, s22, s13 +; GFX90A-NEXT: s_mul_i32 s9, s6, s9 +; GFX90A-NEXT: s_add_i32 s13, s13, s23 ; GFX90A-NEXT: s_branch .LBB3_5 ; GFX90A-NEXT: .LBB3_4: ; %bb58 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: s_add_u32 s18, s18, s14 -; GFX90A-NEXT: s_addc_u32 s19, s19, s15 -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[4:5] -; GFX90A-NEXT: s_mov_b64 s[20:21], 0 -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23] +; GFX90A-NEXT: s_add_u32 s20, s20, s4 +; GFX90A-NEXT: s_addc_u32 s21, s21, s5 +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5] +; GFX90A-NEXT: s_mov_b64 s[22:23], 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_9 ; GFX90A-NEXT: .LBB3_5: ; %bb16 ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: s_add_u32 s20, s18, s7 -; GFX90A-NEXT: s_addc_u32 s21, s19, s9 -; GFX90A-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc +; GFX90A-NEXT: s_add_u32 s22, s20, s9 +; GFX90A-NEXT: s_addc_u32 s23, s21, s13 +; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v20, v19, s[20:21] offset:-8 glc +; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v14, v19, s[20:21] offset:-4 glc +; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] offset:-4 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v14, v19, 
s[20:21] glc +; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ds_read_b64 v[14:15], v19 ; GFX90A-NEXT: ds_read_b64 v[16:17], v0 -; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1] -; GFX90A-NEXT: ; kill: killed $sgpr20 killed $sgpr21 +; GFX90A-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX90A-NEXT: ; kill: killed $sgpr22 killed $sgpr23 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX90A-NEXT: ; %bb.6: ; %bb51 @@ -800,28 +804,28 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15] ; GFX90A-NEXT: s_branch .LBB3_4 ; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: s_mov_b64 s[20:21], s[16:17] -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[20:21] +; GFX90A-NEXT: s_mov_b64 s[22:23], s[18:19] +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_4 ; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_mov_b64 s[20:21], -1 +; GFX90A-NEXT: s_mov_b64 s[22:23], -1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $sgpr18_sgpr19 +; GFX90A-NEXT: ; implicit-def: $sgpr20_sgpr21 ; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_xor_b64 s[16:17], s[20:21], -1 +; GFX90A-NEXT: s_xor_b64 s[18:19], s[22:23], -1 ; GFX90A-NEXT: .LBB3_10: ; %Flow19 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_mov_b64 s[0:1], -1 -; GFX90A-NEXT: s_and_b64 vcc, exec, s[16:17] +; GFX90A-NEXT: s_mov_b64 s[2:3], -1 +; GFX90A-NEXT: s_and_b64 vcc, exec, s[18:19] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: ; %bb.11: ; %bb12 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_add_u32 s4, s4, s8 -; GFX90A-NEXT: s_addc_u32 s5, s5, 0 -; GFX90A-NEXT: s_add_u32 s10, s10, s12 -; GFX90A-NEXT: s_addc_u32 s11, s11, s13 -; GFX90A-NEXT: s_mov_b64 s[0:1], 0 +; GFX90A-NEXT: s_add_u32 s10, s10, s8 +; GFX90A-NEXT: s_addc_u32 s11, s11, 0 +; GFX90A-NEXT: s_add_u32 s14, s14, s16 +; GFX90A-NEXT: s_addc_u32 s15, s15, s17 +; GFX90A-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NEXT: s_branch .LBB3_1 ; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock ; GFX90A-NEXT: s_endpgm @@ -834,7 +838,8 @@ bb: bb9: ; preds = %bb12, %bb %i10 = phi i64 [ %arg3, %bb ], [ %i13, %bb12 ] - br i1 undef, label %bb14, label %bb12 + %undef = freeze i1 poison + br i1 %undef, label %bb14, label %bb12 bb12: ; preds = %bb58, %bb9 %i13 = add nuw nsw i64 %i10, %i8 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll index d506c8c4b8779..7fdc012d4f1b5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -55,7 +55,7 @@ define <2 x i32> @select_sdiv_lhs_const_v2i32(i1 %cond) { ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v1, 0x594 ; GCN-NEXT: s_setpc_b64 s[30:31] - %select = select i1 %cond, <2 x i32> , <2 x i32> + %select = select i1 %cond, <2 x i32> , <2 x i32> %op = sdiv <2 x i32> , %select ret <2 x i32> %op } diff --git a/llvm/test/CodeGen/AMDGPU/andorbitset.ll b/llvm/test/CodeGen/AMDGPU/andorbitset.ll index 13daedf987229..a60d14cd46573 100644 --- a/llvm/test/CodeGen/AMDGPU/andorbitset.ll +++ b/llvm/test/CodeGen/AMDGPU/andorbitset.ll @@ -106,8 +106,8 @@ define amdgpu_kernel void @s_set_midbit(ptr addrspace(1) %out, i32 %in) { @gv = 
external addrspace(1) global i32 ; Make sure there's no verifier error with an undef source. -define void @bitset_verifier_error() local_unnamed_addr #0 { -; SI-LABEL: bitset_verifier_error: +define void @bitset_verifier_error_freeze_poison() local_unnamed_addr #0 { +; SI-LABEL: bitset_verifier_error_freeze_poison: ; SI: ; %bb.0: ; %bb ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_getpc_b64 s[4:5] @@ -128,13 +128,40 @@ define void @bitset_verifier_error() local_unnamed_addr #0 { ; SI-NEXT: ; %bb.1: ; %bb5 ; SI-NEXT: .LBB6_2: ; %bb6 bb: - %i = call float @llvm.fabs.f32(float undef) #0 + %undef0 = freeze float poison + %i = call float @llvm.fabs.f32(float %undef0) #0 %i1 = bitcast float %i to i32 store i32 %i1, ptr addrspace(1) @gv br label %bb2 bb2: - %i3 = call float @llvm.fabs.f32(float undef) #0 + %undef1 = freeze float poison + %i3 = call float @llvm.fabs.f32(float %undef1) #0 + %i4 = fcmp fast ult float %i3, 0x3FEFF7CEE0000000 + br i1 %i4, label %bb5, label %bb6 + +bb5: + unreachable + +bb6: + unreachable +} + +define void @bitset_verifier_error_poison() local_unnamed_addr #0 { +; SI-LABEL: bitset_verifier_error_poison: +; SI: ; %bb.0: ; %bb +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cbranch_scc1 .LBB7_2 +; SI-NEXT: ; %bb.1: ; %bb5 +; SI-NEXT: .LBB7_2: ; %bb6 +bb: + %i = call float @llvm.fabs.f32(float poison) #0 + %i1 = bitcast float %i to i32 + store i32 %i1, ptr addrspace(1) @gv + br label %bb2 + +bb2: + %i3 = call float @llvm.fabs.f32(float poison) #0 %i4 = fcmp fast ult float %i3, 0x3FEFF7CEE0000000 br i1 %i4, label %bb5, label %bb6 diff --git a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll index f8a1604351d9e..29d929995bf88 100644 --- a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll +++ b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll @@ -37,11 +37,12 @@ bb2: define amdgpu_kernel void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) { bb0: %tmp = icmp sgt i32 %arg1, 4 - %undef = call i1 @llvm.amdgcn.class.f32(float poison, i32 undef) - %tmp4 = select i1 %undef, float %arg, float 1.000000e+00 + %mask = freeze i32 poison + %undef0 = call i1 @llvm.amdgcn.class.f32(float poison, i32 %mask) + %tmp4 = select i1 %undef0, float %arg, float 1.000000e+00 %tmp5 = fcmp ogt float %arg2, 0.000000e+00 %tmp6 = fcmp olt float %arg2, 1.000000e+00 - %tmp7 = fcmp olt float %arg, undef + %tmp7 = fcmp olt float %arg, poison %tmp8 = and i1 %tmp5, %tmp6 %tmp9 = and i1 %tmp8, %tmp7 br i1 %tmp9, label %bb1, label %bb2 diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll index 434fc764e1fa6..f0e7cba6924d8 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll @@ -4,29 +4,34 @@ ; Test that unused lanes in the s_xor result are masked out with v_cndmask. 
-define i32 @combine_add_zext_xor() { +define i32 @combine_add_zext_xor(i32 inreg %cond) { ; GFX1010-LABEL: combine_add_zext_xor: ; GFX1010: ; %bb.0: ; %.entry ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_cmp_lg_u32 s16, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX1010-NEXT: v_cmp_ne_u32_e64 s4, 1, v0 ; GFX1010-NEXT: s_branch .LBB0_2 ; GFX1010-NEXT: .LBB0_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX1010-NEXT: s_xor_b32 s4, s4, -1 +; GFX1010-NEXT: s_xor_b32 s5, s5, -1 ; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 -; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s5 ; GFX1010-NEXT: v_add_nc_u32_e32 v2, v1, v0 ; GFX1010-NEXT: v_mov_b32_e32 v1, v2 ; GFX1010-NEXT: s_cbranch_vccz .LBB0_4 ; GFX1010-NEXT: .LBB0_2: ; %.a ; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1010-NEXT: ; implicit-def: $sgpr4 -; GFX1010-NEXT: s_cbranch_scc1 .LBB0_1 +; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s4 +; GFX1010-NEXT: ; implicit-def: $sgpr5 +; GFX1010-NEXT: s_cbranch_vccnz .LBB0_1 ; GFX1010-NEXT: ; %bb.3: ; %bb ; GFX1010-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GFX1010-NEXT: buffer_load_dword v0, v1, s[4:7], 64 offen glc ; GFX1010-NEXT: s_waitcnt vmcnt(0) -; GFX1010-NEXT: v_cmp_eq_u32_e64 s4, 0, v0 +; GFX1010-NEXT: v_cmp_eq_u32_e64 s5, 0, v0 ; GFX1010-NEXT: s_branch .LBB0_1 ; GFX1010-NEXT: .LBB0_4: ; %.exit ; GFX1010-NEXT: s_setpc_b64 s[30:31] @@ -34,27 +39,32 @@ define i32 @combine_add_zext_xor() { ; GFX1100-LABEL: combine_add_zext_xor: ; GFX1100: ; %bb.0: ; %.entry ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1100-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 ; GFX1100-NEXT: s_branch .LBB0_2 ; GFX1100-NEXT: .LBB0_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX1100-NEXT: s_xor_b32 s0, s0, -1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-NEXT: s_xor_b32 s1, s1, -1 ; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 -; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_mov_b32_e32 v1, v2 ; GFX1100-NEXT: s_cbranch_vccz .LBB0_4 ; GFX1100-NEXT: .LBB0_2: ; %.a ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1100-NEXT: ; implicit-def: $sgpr0 -; GFX1100-NEXT: s_cbranch_scc1 .LBB0_1 +; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1100-NEXT: ; implicit-def: $sgpr1 +; GFX1100-NEXT: s_cbranch_vccnz .LBB0_1 ; GFX1100-NEXT: ; %bb.3: ; %bb ; GFX1100-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GFX1100-NEXT: buffer_load_b32 v0, v1, s[0:3], 64 offen glc ; GFX1100-NEXT: s_waitcnt vmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX1100-NEXT: v_cmp_eq_u32_e64 s1, 0, v0 ; GFX1100-NEXT: s_branch .LBB0_1 ; GFX1100-NEXT: .LBB0_4: ; %.exit ; GFX1100-NEXT: s_setpc_b64 s[30:31] @@ -63,7 +73,8 @@ define i32 @combine_add_zext_xor() { .a: ; preds = %bb9, %.entry %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ] - br i1 poison, label %bb9, label %bb + %cmp = icmp eq i32 
%cond, 0 + br i1 %cmp, label %bb9, label %bb bb: ; preds = %.a %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) poison, i32 %.2, i32 64, i32 1) @@ -84,29 +95,34 @@ bb9: ; preds = %bb, %.a ; Test that unused lanes in the s_xor result are masked out with v_cndmask. -define i32 @combine_sub_zext_xor() { +define i32 @combine_sub_zext_xor(i32 inreg %cond) { ; GFX1010-LABEL: combine_sub_zext_xor: ; GFX1010: ; %bb.0: ; %.entry ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_cmp_lg_u32 s16, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX1010-NEXT: v_cmp_ne_u32_e64 s4, 1, v0 ; GFX1010-NEXT: s_branch .LBB1_2 ; GFX1010-NEXT: .LBB1_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GFX1010-NEXT: s_xor_b32 s4, s4, -1 +; GFX1010-NEXT: s_xor_b32 s5, s5, -1 ; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 -; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s5 ; GFX1010-NEXT: v_sub_nc_u32_e32 v2, v1, v0 ; GFX1010-NEXT: v_mov_b32_e32 v1, v2 ; GFX1010-NEXT: s_cbranch_vccz .LBB1_4 ; GFX1010-NEXT: .LBB1_2: ; %.a ; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1010-NEXT: ; implicit-def: $sgpr4 -; GFX1010-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s4 +; GFX1010-NEXT: ; implicit-def: $sgpr5 +; GFX1010-NEXT: s_cbranch_vccnz .LBB1_1 ; GFX1010-NEXT: ; %bb.3: ; %bb ; GFX1010-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GFX1010-NEXT: buffer_load_dword v0, v1, s[4:7], 64 offen glc ; GFX1010-NEXT: s_waitcnt vmcnt(0) -; GFX1010-NEXT: v_cmp_eq_u32_e64 s4, 0, v0 +; GFX1010-NEXT: v_cmp_eq_u32_e64 s5, 0, v0 ; GFX1010-NEXT: s_branch .LBB1_1 ; GFX1010-NEXT: .LBB1_4: ; %.exit ; GFX1010-NEXT: s_setpc_b64 s[30:31] @@ -114,27 +130,32 @@ define i32 @combine_sub_zext_xor() { ; GFX1100-LABEL: combine_sub_zext_xor: ; GFX1100: ; %bb.0: ; %.entry ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1100-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 ; GFX1100-NEXT: s_branch .LBB1_2 ; GFX1100-NEXT: .LBB1_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GFX1100-NEXT: s_xor_b32 s0, s0, -1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-NEXT: s_xor_b32 s1, s1, -1 ; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 -; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_sub_nc_u32_e32 v2, v1, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_mov_b32_e32 v1, v2 ; GFX1100-NEXT: s_cbranch_vccz .LBB1_4 ; GFX1100-NEXT: .LBB1_2: ; %.a ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1100-NEXT: ; implicit-def: $sgpr0 -; GFX1100-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1100-NEXT: ; implicit-def: $sgpr1 +; GFX1100-NEXT: s_cbranch_vccnz .LBB1_1 ; GFX1100-NEXT: ; %bb.3: ; %bb ; GFX1100-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GFX1100-NEXT: buffer_load_b32 v0, v1, s[0:3], 64 offen glc ; GFX1100-NEXT: s_waitcnt vmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX1100-NEXT: v_cmp_eq_u32_e64 s1, 0, v0 ; 
GFX1100-NEXT: s_branch .LBB1_1 ; GFX1100-NEXT: .LBB1_4: ; %.exit ; GFX1100-NEXT: s_setpc_b64 s[30:31] @@ -143,7 +164,8 @@ define i32 @combine_sub_zext_xor() { .a: ; preds = %bb9, %.entry %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ] - br i1 undef, label %bb9, label %bb + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %bb9, label %bb bb: ; preds = %.a %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) poison, i32 %.2, i32 64, i32 1) @@ -164,60 +186,71 @@ bb9: ; preds = %bb, %.a ; Test that unused lanes in the s_or result are masked out with v_cndmask. -define i32 @combine_add_zext_or() { +define i32 @combine_add_zext_or(i32 inreg %cond) { ; GFX1010-LABEL: combine_add_zext_or: ; GFX1010: ; %bb.0: ; %.entry ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1010-NEXT: s_mov_b32 s4, 0 +; GFX1010-NEXT: s_cmp_lg_u32 s16, 0 +; GFX1010-NEXT: s_mov_b32 s5, 0 +; GFX1010-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX1010-NEXT: v_cmp_ne_u32_e64 s4, 1, v0 ; GFX1010-NEXT: s_branch .LBB2_2 ; GFX1010-NEXT: .LBB2_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX1010-NEXT: s_cmpk_gt_i32 s4, 0xfbe6 -; GFX1010-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1010-NEXT: s_add_i32 s4, s4, 1 -; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; GFX1010-NEXT: s_cmpk_gt_i32 s5, 0xfbe6 +; GFX1010-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1010-NEXT: s_add_i32 s5, s5, 1 +; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s7 ; GFX1010-NEXT: s_cbranch_vccz .LBB2_4 ; GFX1010-NEXT: .LBB2_2: ; %.a ; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1010-NEXT: ; implicit-def: $sgpr5 -; GFX1010-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s4 +; GFX1010-NEXT: ; implicit-def: $sgpr6 +; GFX1010-NEXT: s_cbranch_vccnz .LBB2_1 ; GFX1010-NEXT: ; %bb.3: ; %bb ; GFX1010-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX1010-NEXT: v_mov_b32_e32 v0, s4 +; GFX1010-NEXT: v_mov_b32_e32 v0, s5 ; GFX1010-NEXT: buffer_load_dword v0, v0, s[4:7], 64 offen glc ; GFX1010-NEXT: s_waitcnt vmcnt(0) -; GFX1010-NEXT: v_cmp_eq_u32_e64 s5, 0, v0 +; GFX1010-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 ; GFX1010-NEXT: s_branch .LBB2_1 ; GFX1010-NEXT: .LBB2_4: ; %.exit -; GFX1010-NEXT: s_or_b32 s4, s5, s6 +; GFX1010-NEXT: s_or_b32 s4, s6, s7 ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX1010-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: combine_add_zext_or: ; GFX1100: ; %bb.0: ; %.entry ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_mov_b32 s0, 0 +; GFX1100-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1100-NEXT: s_mov_b32 s1, 0 +; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 ; GFX1100-NEXT: s_branch .LBB2_2 ; GFX1100-NEXT: .LBB2_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX1100-NEXT: s_cmpk_gt_i32 s0, 0xfbe6 -; GFX1100-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1100-NEXT: s_add_i32 s0, s0, 1 -; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX1100-NEXT: s_cmpk_gt_i32 s1, 0xfbe6 +; GFX1100-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1100-NEXT: s_add_i32 s1, s1, 1 +; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s3 ; GFX1100-NEXT: s_cbranch_vccz .LBB2_4 ; GFX1100-NEXT: .LBB2_2: ; %.a ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1100-NEXT: ; implicit-def: $sgpr1 -; GFX1100-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1100-NEXT: ; implicit-def: $sgpr2 
+; GFX1100-NEXT: s_cbranch_vccnz .LBB2_1 ; GFX1100-NEXT: ; %bb.3: ; %bb ; GFX1100-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX1100-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-NEXT: v_mov_b32_e32 v0, s1 ; GFX1100-NEXT: buffer_load_b32 v0, v0, s[0:3], 64 offen glc ; GFX1100-NEXT: s_waitcnt vmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e64 s1, 0, v0 +; GFX1100-NEXT: v_cmp_eq_u32_e64 s2, 0, v0 ; GFX1100-NEXT: s_branch .LBB2_1 ; GFX1100-NEXT: .LBB2_4: ; %.exit -; GFX1100-NEXT: s_or_b32 s0, s1, s2 +; GFX1100-NEXT: s_or_b32 s0, s2, s3 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX1100-NEXT: s_setpc_b64 s[30:31] @@ -226,7 +259,8 @@ define i32 @combine_add_zext_or() { .a: ; preds = %bb9, %.entry %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ] - br i1 undef, label %bb9, label %bb + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %bb9, label %bb bb: ; preds = %.a %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) poison, i32 %.2, i32 64, i32 1) @@ -248,60 +282,71 @@ bb9: ; preds = %bb, %.a ; Test that unused lanes in the s_or result are masked out with v_cndmask. -define i32 @combine_sub_zext_or() { +define i32 @combine_sub_zext_or(i32 inreg %cond) { ; GFX1010-LABEL: combine_sub_zext_or: ; GFX1010: ; %bb.0: ; %.entry ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1010-NEXT: s_mov_b32 s4, 0 +; GFX1010-NEXT: s_cmp_lg_u32 s16, 0 +; GFX1010-NEXT: s_mov_b32 s5, 0 +; GFX1010-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX1010-NEXT: v_cmp_ne_u32_e64 s4, 1, v0 ; GFX1010-NEXT: s_branch .LBB3_2 ; GFX1010-NEXT: .LBB3_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX1010-NEXT: s_cmpk_gt_i32 s4, 0xfbe6 -; GFX1010-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1010-NEXT: s_add_i32 s4, s4, -1 -; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; GFX1010-NEXT: s_cmpk_gt_i32 s5, 0xfbe6 +; GFX1010-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1010-NEXT: s_add_i32 s5, s5, -1 +; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s7 ; GFX1010-NEXT: s_cbranch_vccz .LBB3_4 ; GFX1010-NEXT: .LBB3_2: ; %.a ; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1010-NEXT: ; implicit-def: $sgpr5 -; GFX1010-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s4 +; GFX1010-NEXT: ; implicit-def: $sgpr6 +; GFX1010-NEXT: s_cbranch_vccnz .LBB3_1 ; GFX1010-NEXT: ; %bb.3: ; %bb ; GFX1010-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX1010-NEXT: v_mov_b32_e32 v0, s4 +; GFX1010-NEXT: v_mov_b32_e32 v0, s5 ; GFX1010-NEXT: buffer_load_dword v0, v0, s[4:7], 64 offen glc ; GFX1010-NEXT: s_waitcnt vmcnt(0) -; GFX1010-NEXT: v_cmp_eq_u32_e64 s5, 0, v0 +; GFX1010-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 ; GFX1010-NEXT: s_branch .LBB3_1 ; GFX1010-NEXT: .LBB3_4: ; %.exit -; GFX1010-NEXT: s_or_b32 s4, s5, s6 +; GFX1010-NEXT: s_or_b32 s4, s6, s7 ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX1010-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: combine_sub_zext_or: ; GFX1100: ; %bb.0: ; %.entry ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_mov_b32 s0, 0 +; GFX1100-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1100-NEXT: s_mov_b32 s1, 0 +; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 ; GFX1100-NEXT: s_branch .LBB3_2 ; GFX1100-NEXT: .LBB3_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX1100-NEXT: s_cmpk_gt_i32 s0, 0xfbe6 -; GFX1100-NEXT: 
s_cselect_b32 s2, -1, 0 -; GFX1100-NEXT: s_add_i32 s0, s0, -1 -; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX1100-NEXT: s_cmpk_gt_i32 s1, 0xfbe6 +; GFX1100-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1100-NEXT: s_add_i32 s1, s1, -1 +; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s3 ; GFX1100-NEXT: s_cbranch_vccz .LBB3_4 ; GFX1100-NEXT: .LBB3_2: ; %.a ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1100-NEXT: ; implicit-def: $sgpr1 -; GFX1100-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1100-NEXT: ; implicit-def: $sgpr2 +; GFX1100-NEXT: s_cbranch_vccnz .LBB3_1 ; GFX1100-NEXT: ; %bb.3: ; %bb ; GFX1100-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX1100-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-NEXT: v_mov_b32_e32 v0, s1 ; GFX1100-NEXT: buffer_load_b32 v0, v0, s[0:3], 64 offen glc ; GFX1100-NEXT: s_waitcnt vmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e64 s1, 0, v0 +; GFX1100-NEXT: v_cmp_eq_u32_e64 s2, 0, v0 ; GFX1100-NEXT: s_branch .LBB3_1 ; GFX1100-NEXT: .LBB3_4: ; %.exit -; GFX1100-NEXT: s_or_b32 s0, s1, s2 +; GFX1100-NEXT: s_or_b32 s0, s2, s3 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX1100-NEXT: s_setpc_b64 s[30:31] @@ -310,7 +355,8 @@ define i32 @combine_sub_zext_or() { .a: ; preds = %bb9, %.entry %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ] - br i1 undef, label %bb9, label %bb + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %bb9, label %bb bb: ; preds = %.a %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) poison, i32 %.2, i32 64, i32 1) @@ -332,28 +378,33 @@ bb9: ; preds = %bb, %.a ; Test that unused lanes in the s_and result are masked out with v_cndmask. -define i32 @combine_add_zext_and() { +define i32 @combine_add_zext_and(i32 inreg %cond) { ; GFX1010-LABEL: combine_add_zext_and: ; GFX1010: ; %bb.0: ; %.entry ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_cmp_lg_u32 s16, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX1010-NEXT: v_cmp_ne_u32_e64 s4, 1, v0 ; GFX1010-NEXT: s_branch .LBB4_2 ; GFX1010-NEXT: .LBB4_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB4_2 Depth=1 ; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 -; GFX1010-NEXT: s_and_b32 s4, s4, vcc_lo -; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX1010-NEXT: s_and_b32 s5, s5, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s5 ; GFX1010-NEXT: v_add_nc_u32_e32 v1, v1, v0 ; GFX1010-NEXT: s_cbranch_vccz .LBB4_4 ; GFX1010-NEXT: .LBB4_2: ; %.a ; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1010-NEXT: ; implicit-def: $sgpr4 -; GFX1010-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s4 +; GFX1010-NEXT: ; implicit-def: $sgpr5 +; GFX1010-NEXT: s_cbranch_vccnz .LBB4_1 ; GFX1010-NEXT: ; %bb.3: ; %bb ; GFX1010-NEXT: ; in Loop: Header=BB4_2 Depth=1 ; GFX1010-NEXT: buffer_load_dword v0, v1, s[4:7], 64 offen glc ; GFX1010-NEXT: s_waitcnt vmcnt(0) -; GFX1010-NEXT: v_cmp_eq_u32_e64 s4, 0, v0 +; GFX1010-NEXT: v_cmp_eq_u32_e64 s5, 0, v0 ; GFX1010-NEXT: s_branch .LBB4_1 ; GFX1010-NEXT: .LBB4_4: ; %.exit ; GFX1010-NEXT: s_setpc_b64 s[30:31] @@ -361,26 +412,32 @@ define i32 @combine_add_zext_and() { ; GFX1100-LABEL: combine_add_zext_and: ; GFX1100: ; %bb.0: ; %.entry ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1100-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 +; GFX1100-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 ; GFX1100-NEXT: s_branch .LBB4_2 ; GFX1100-NEXT: .LBB4_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB4_2 Depth=1 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 -; GFX1100-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX1100-NEXT: s_and_b32 s1, s1, vcc_lo +; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_add_nc_u32_e32 v1, v1, v0 ; GFX1100-NEXT: s_cbranch_vccz .LBB4_4 ; GFX1100-NEXT: .LBB4_2: ; %.a ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1100-NEXT: ; implicit-def: $sgpr0 -; GFX1100-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1100-NEXT: ; implicit-def: $sgpr1 +; GFX1100-NEXT: s_cbranch_vccnz .LBB4_1 ; GFX1100-NEXT: ; %bb.3: ; %bb ; GFX1100-NEXT: ; in Loop: Header=BB4_2 Depth=1 ; GFX1100-NEXT: buffer_load_b32 v0, v1, s[0:3], 64 offen glc ; GFX1100-NEXT: s_waitcnt vmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX1100-NEXT: v_cmp_eq_u32_e64 s1, 0, v0 ; GFX1100-NEXT: s_branch .LBB4_1 ; GFX1100-NEXT: .LBB4_4: ; %.exit ; GFX1100-NEXT: s_setpc_b64 s[30:31] @@ -389,7 +446,8 @@ define i32 @combine_add_zext_and() { .a: ; preds = %bb9, %.entry %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ] - br i1 undef, label %bb9, label %bb + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %bb9, label %bb bb: ; preds = %.a %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) poison, i32 %.2, i32 64, i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll index 6a8594a168f03..1c6ab3c14da57 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll @@ -81,7 +81,8 @@ define float @fold_abs_in_branch_undef(float %arg1, float %arg2) { entry: %0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2 %1 = fadd reassoc nnan nsz arcp contract afn float %0, %arg2 - %2 = call reassoc nnan nsz arcp contract afn float @llvm.fabs.f32(float undef) + %undef = freeze float poison + %2 = call reassoc nnan nsz arcp contract afn float @llvm.fabs.f32(float %undef) %3 = fmul reassoc nnan nsz arcp contract afn float %2, 2.000000e+00 %4 = fcmp ule float %3, 1.000000e+00 br i1 %4, label %if, label %exit diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll index 338bea9d4f73f..cd0a15e4d7e2e 100644 --- a/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll @@ -2,12 +2,28 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; SILowerI1Copies was not handling IMPLICIT_DEF -; SI-LABEL: {{^}}br_implicit_def: +; SI-LABEL: {{^}}br_poison: ; SI: %bb.0: ; SI-NEXT: s_cbranch_scc1 -define amdgpu_kernel void @br_implicit_def(ptr addrspace(1) %out, i32 %arg) #0 { +define amdgpu_kernel void @br_poison(ptr addrspace(1) %out, i32 %arg) #0 { bb: - br i1 undef, label %bb1, label %bb2 + br i1 poison, label %bb1, label %bb2 + +bb1: + store volatile i32 123, ptr addrspace(1) %out + ret void + +bb2: + ret void +} + +; SI-LABEL: {{^}}br_freeze_poison: +; SI: %bb.0: +; SI-NEXT: s_cbranch_scc1 +define amdgpu_kernel void @br_freeze_poison(ptr addrspace(1) %out, i32 %arg) #0 { +bb: + %undef = freeze i1 
poison + br i1 %undef, label %bb1, label %bb2 bb1: store volatile i32 123, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir index 7a914c2322229..0251284696591 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir @@ -11,7 +11,7 @@ body: | $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3 $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $sgpr2_sgpr3, implicit $exec - renamable $vgpr2 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load syncscope("one-as") seq_cst (s32) from `ptr addrspace(42) undef`) + renamable $vgpr2 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load syncscope("one-as") seq_cst (s32) from `ptr addrspace(42) poison`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -30,7 +30,7 @@ body: | $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store syncscope("agent-one-as") seq_cst (s32) into `ptr addrspace(42) undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store syncscope("agent-one-as") seq_cst (s32) into `ptr addrspace(42) poison`) S_ENDPGM 0 ... @@ -47,7 +47,7 @@ body: | $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $exec $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_ATOMIC_CMPSWAP killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("workgroup-one-as") seq_cst seq_cst (s32) on `ptr addrspace(42) undef`) + FLAT_ATOMIC_CMPSWAP killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("workgroup-one-as") seq_cst seq_cst (s32) on `ptr addrspace(42) poison`) S_ENDPGM 0 ... @@ -63,7 +63,7 @@ body: | $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3 $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $sgpr2_sgpr3, implicit $exec $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - FLAT_ATOMIC_SWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("wavefront-one-as") seq_cst (s32) on `ptr addrspace(42) undef`) + FLAT_ATOMIC_SWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("wavefront-one-as") seq_cst (s32) on `ptr addrspace(42) poison`) S_ENDPGM 0 ... 
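The rewrite these tests apply follows from LangRef semantics: branching on `undef` or `poison` is immediate undefined behavior, so optimizers may legally delete the branch and with it the codegen path the test meant to exercise. `freeze` stops poison propagation and yields an arbitrary but fixed value, so branching on `freeze i1 poison` is well defined while the condition stays unknown to the compiler. A minimal sketch of the pattern, assuming only LangRef semantics (the value and label names are illustrative, not taken from any test above):

define void @branch_on_unknown() {
entry:
  ; Well defined: %cond.fr is some fixed i1, chosen arbitrarily,
  ; so both successors remain reachable as far as the compiler knows.
  %cond.fr = freeze i1 poison
  br i1 %cond.fr, label %then, label %else

then:
  ret void

else:
  ret void
}

Tests that instead keep a bare `br i1 poison` (such as @br_poison above) deliberately preserve the UB form to check that codegen does not crash on it.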
diff --git a/llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir index 2b3851c348d55..40c47f0e979fb 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir @@ -19,7 +19,7 @@ # GFX9-NEXT: DS_READ2_B32_gfx9 killed %10, 16, 24, --- | - @0 = internal unnamed_addr addrspace(3) global [256 x float] undef, align 4 + @0 = internal unnamed_addr addrspace(3) global [256 x float] poison, align 4 define amdgpu_kernel void @ds_combine_base_offset() { bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll index 9fc6af6f0dd6a..42c6589f417ba 100644 --- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -689,7 +689,7 @@ divergent.ret: ; IR: UnifiedReturnBlock: ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 ; IR-NEXT: ret void -define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 { +define amdgpu_kernel void @multi_divergent_unreachable_exit(i32 %switch) #0 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() switch i32 %tmp, label %bb3 [ @@ -704,7 +704,7 @@ bb2: ; preds = %bb unreachable bb3: ; preds = %bb - switch i32 undef, label %bb5 [ + switch i32 %switch, label %bb5 [ i32 2, label %bb4 ] diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll index e1c2bde99eed2..9a2d969f94e3e 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -252,10 +252,10 @@ define amdgpu_kernel void @nested_loop_conditions(ptr addrspace(1) captures(none ; IR: [[BB21]]: ; IR-NEXT: [[MY_TMP22:%.*]] = extractelement <2 x i32> [[MY_TMP17]], i64 1 ; IR-NEXT: [[MY_TMP23:%.*]] = lshr i32 [[MY_TMP22]], 16 -; IR-NEXT: [[MY_TMP24:%.*]] = select i1 undef, i32 undef, i32 [[MY_TMP23]] +; IR-NEXT: [[MY_TMP24:%.*]] = select i1 false, i32 0, i32 [[MY_TMP23]] ; IR-NEXT: [[MY_TMP25:%.*]] = uitofp i32 [[MY_TMP24]] to float ; IR-NEXT: [[MY_TMP26:%.*]] = fmul float [[MY_TMP25]], 0x3EF0001000000000 -; IR-NEXT: [[MY_TMP27:%.*]] = fsub float [[MY_TMP26]], undef +; IR-NEXT: [[MY_TMP27:%.*]] = fsub float [[MY_TMP26]], 0x7FF8000000000000 ; IR-NEXT: [[MY_TMP28:%.*]] = fcmp olt float [[MY_TMP27]], 5.000000e-01 ; IR-NEXT: [[MY_TMP29:%.*]] = select i1 [[MY_TMP28]], i64 1, i64 2 ; IR-NEXT: [[MY_TMP30:%.*]] = extractelement <4 x i32> [[MY_TMP936]], i64 [[MY_TMP29]] @@ -317,10 +317,10 @@ bb18: ; preds = %bb18, %bb16 bb21: ; preds = %bb18 %my.tmp22 = extractelement <2 x i32> %my.tmp17, i64 1 %my.tmp23 = lshr i32 %my.tmp22, 16 - %my.tmp24 = select i1 undef, i32 undef, i32 %my.tmp23 + %my.tmp24 = select i1 false, i32 0, i32 %my.tmp23 %my.tmp25 = uitofp i32 %my.tmp24 to float %my.tmp26 = fmul float %my.tmp25, 0x3EF0001000000000 - %my.tmp27 = fsub float %my.tmp26, undef + %my.tmp27 = fsub float %my.tmp26, 0x7FF8000000000000 %my.tmp28 = fcmp olt float %my.tmp27, 5.000000e-01 %my.tmp29 = select i1 %my.tmp28, i64 1, i64 2 %my.tmp30 = extractelement <4 x i32> %my.tmp936, i64 %my.tmp29 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll index 64a8f5484673f..c5732531f5423 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll @@ -93,7 +93,7 @@ entry: %conv = add i32 %i6, %i7 %conv.frozen = freeze i32 %conv %div = udiv i32 
%conv.frozen, 49 - %add.ptr22 = getelementptr inbounds float, ptr addrspace(4) %wei_ptr, i64 undef + %add.ptr22 = getelementptr inbounds float, ptr addrspace(4) %wei_ptr, i64 0 %in.ptr1 = getelementptr inbounds float, ptr addrspace(1) %in, i32 %i5 br label %for.cond28.preheader @@ -530,11 +530,11 @@ for.cond28.preheader: ; preds = %for.cond28.preheade br i1 %exitcond.not, label %for.cond.cleanup26, label %for.cond28.preheader for.cond.cleanup26: ; preds = %for.cond28.preheader - %mul119 = shl nuw nsw i32 undef, 1 + %mul119 = shl nuw nsw i32 0, 1 %mul120 = mul i32 %div, 200704 - %mul121 = mul i32 undef, 6272 + %mul121 = mul i32 0, 6272 %add122 = add i32 %mul120, %mul121 - %mul123 = mul nuw nsw i32 undef, 28 + %mul123 = mul nuw nsw i32 0, 28 %add124 = add i32 %add122, %mul123 %add126 = add i32 %add124, %mul119 %idx.ext127 = zext i32 %add126 to i64 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll index 1620e2778223c..522b46526f0b9 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll @@ -2,7 +2,6 @@ ; RUN: opt -mtriple=amdgcn-- -S -passes=structurizecfg,si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=OPT %s ; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s - ; OPT-LABEL: @annotate_unreachable_noloop( ; OPT-NOT: call i1 @llvm.amdgcn.loop @@ -19,7 +18,7 @@ bb1: ; preds = %bb %tmp2 = sext i32 %tmp to i64 %tmp3 = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i64 %tmp2 %tmp4 = load <4 x float>, ptr addrspace(1) %tmp3, align 16 - br i1 undef, label %bb5, label %bb3 + br i1 poison, label %bb5, label %bb3 bb3: ; preds = %bb1 %tmp6 = extractelement <4 x float> %tmp4, i32 2 @@ -84,7 +83,8 @@ bb1: ; preds = %bb %tmp2 = sext i32 %tmp to i64 %tmp3 = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i64 %tmp2 %tmp4 = load <4 x float>, ptr addrspace(1) %tmp3, align 16 - br i1 undef, label %bb5, label %bb3 + %undef = freeze i1 poison + br i1 %undef, label %bb5, label %bb3 bb3: ; preds = %bb1 %tmp6 = extractelement <4 x float> %tmp4, i32 2 diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-cf.ll b/llvm/test/CodeGen/AMDGPU/si-spill-cf.ll index 25592c8ac8072..88daad2bf6949 100644 --- a/llvm/test/CodeGen/AMDGPU/si-spill-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-cf.ll @@ -7,75 +7,75 @@ ; SI: s_or_b64 exec, exec, [[SAVED:s\[[0-9]+:[0-9]+\]|[a-z]+]] ; SI-NOT: v_readlane_b32 [[SAVED]] -define amdgpu_ps void @main() #0 { +define amdgpu_ps void @main(<4 x i32> inreg %rsrc) #0 { main_body: - %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 16, i32 0) - %tmp1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 32, i32 0) - %tmp2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 80, i32 0) - %tmp3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 84, i32 0) - %tmp4 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 88, i32 0) - %tmp5 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 96, i32 0) - %tmp6 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 100, i32 0) - %tmp7 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 104, i32 0) - %tmp8 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 112, i32 0) - %tmp9 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 116, i32 0) - %tmp10 = 
call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 120, i32 0) - %tmp11 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 128, i32 0) - %tmp12 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 132, i32 0) - %tmp13 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 136, i32 0) - %tmp14 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 144, i32 0) - %tmp15 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 148, i32 0) - %tmp16 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 152, i32 0) - %tmp17 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 160, i32 0) - %tmp18 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 164, i32 0) - %tmp19 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 168, i32 0) - %tmp20 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 176, i32 0) - %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 180, i32 0) - %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 184, i32 0) - %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 192, i32 0) - %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 196, i32 0) - %tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 200, i32 0) - %tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 208, i32 0) - %tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 212, i32 0) - %tmp28 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 216, i32 0) - %tmp29 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 224, i32 0) - %tmp30 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 228, i32 0) - %tmp31 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 232, i32 0) - %tmp32 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 240, i32 0) - %tmp33 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 244, i32 0) - %tmp34 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 248, i32 0) - %tmp35 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 256, i32 0) - %tmp36 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 260, i32 0) - %tmp37 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 264, i32 0) - %tmp38 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 272, i32 0) - %tmp39 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 276, i32 0) - %tmp40 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 280, i32 0) - %tmp41 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 288, i32 0) - %tmp42 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 292, i32 0) - %tmp43 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 296, i32 0) - %tmp44 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 304, i32 0) - %tmp45 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 308, i32 0) - %tmp46 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 312, i32 0) - %tmp47 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 320, i32 0) - %tmp48 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 324, i32 0) - %tmp49 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 328, i32 0) - %tmp50 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 336, i32 
0) - %tmp51 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 340, i32 0) - %tmp52 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 344, i32 0) - %tmp53 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 352, i32 0) - %tmp54 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 356, i32 0) - %tmp55 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 360, i32 0) - %tmp56 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 368, i32 0) - %tmp57 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 372, i32 0) - %tmp58 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 376, i32 0) - %tmp59 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 384, i32 0) - %tmp60 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 388, i32 0) - %tmp61 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 392, i32 0) - %tmp62 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 400, i32 0) - %tmp63 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 404, i32 0) - %tmp64 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 408, i32 0) - %tmp65 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 416, i32 0) - %tmp66 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 420, i32 0) + %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0) + %tmp1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0) + %tmp2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 80, i32 0) + %tmp3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 84, i32 0) + %tmp4 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 88, i32 0) + %tmp5 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 96, i32 0) + %tmp6 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 100, i32 0) + %tmp7 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 104, i32 0) + %tmp8 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 112, i32 0) + %tmp9 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 116, i32 0) + %tmp10 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 120, i32 0) + %tmp11 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 128, i32 0) + %tmp12 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 132, i32 0) + %tmp13 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 136, i32 0) + %tmp14 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 144, i32 0) + %tmp15 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 148, i32 0) + %tmp16 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 152, i32 0) + %tmp17 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 160, i32 0) + %tmp18 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 164, i32 0) + %tmp19 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 168, i32 0) + %tmp20 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 176, i32 0) + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 180, i32 0) + %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 184, i32 0) + %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 192, i32 0) + %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 196, i32 0) + %tmp25 = call float 
@llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 200, i32 0) + %tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 208, i32 0) + %tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 212, i32 0) + %tmp28 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 216, i32 0) + %tmp29 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 224, i32 0) + %tmp30 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 228, i32 0) + %tmp31 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 232, i32 0) + %tmp32 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 240, i32 0) + %tmp33 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 244, i32 0) + %tmp34 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 248, i32 0) + %tmp35 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 256, i32 0) + %tmp36 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 260, i32 0) + %tmp37 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 264, i32 0) + %tmp38 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 272, i32 0) + %tmp39 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 276, i32 0) + %tmp40 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 280, i32 0) + %tmp41 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 288, i32 0) + %tmp42 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 292, i32 0) + %tmp43 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 296, i32 0) + %tmp44 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 304, i32 0) + %tmp45 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 308, i32 0) + %tmp46 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 312, i32 0) + %tmp47 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 320, i32 0) + %tmp48 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 324, i32 0) + %tmp49 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 328, i32 0) + %tmp50 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 336, i32 0) + %tmp51 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 340, i32 0) + %tmp52 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 344, i32 0) + %tmp53 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 352, i32 0) + %tmp54 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 356, i32 0) + %tmp55 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 360, i32 0) + %tmp56 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 368, i32 0) + %tmp57 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 372, i32 0) + %tmp58 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 376, i32 0) + %tmp59 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 384, i32 0) + %tmp60 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 388, i32 0) + %tmp61 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 392, i32 0) + %tmp62 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 400, i32 0) + %tmp63 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 404, i32 0) + %tmp64 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 408, i32 0) + %tmp65 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 416, i32 0) + %tmp66 = call float 
@llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 420, i32 0) br label %LOOP LOOP: ; preds = %ENDIF2795, %main_body @@ -90,7 +90,7 @@ ENDLOOP: ; preds = %ELSE2566, %LOOP %one.sub.ac.i = fmul float %one.sub.a.i, 0x7FF8000000000000 %fmul = fmul float 0x7FF8000000000000, 0x7FF8000000000000 %result.i = fadd float %fmul, %one.sub.ac.i - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float poison, float %result.i, float 0x7FF8000000000000, float 1.000000e+00, i1 true, i1 true) #0 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float poison, float %result.i, float poison, float 1.000000e+00, i1 true, i1 true) #0 ret void ENDIF: ; preds = %LOOP @@ -107,9 +107,9 @@ ENDIF: ; preds = %LOOP %tmp78 = call float @llvm.minnum.f32(float %tmp73, float %tmp77) %tmp79 = call float @llvm.maxnum.f32(float %tmp71, float 0.000000e+00) %tmp80 = call float @llvm.maxnum.f32(float %tmp72, float %tmp76) - %tmp81 = call float @llvm.maxnum.f32(float poison, float %tmp78) + %tmp81 = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float %tmp78) %tmp82 = call float @llvm.minnum.f32(float %tmp79, float %tmp80) - %tmp83 = call float @llvm.minnum.f32(float %tmp82, float poison) + %tmp83 = call float @llvm.minnum.f32(float %tmp82, float 0x7FF8000000000000) %tmp84 = fsub float %tmp14, 0x7FF8000000000000 %tmp85 = fsub float %tmp15, 0x7FF8000000000000 %tmp86 = fsub float %tmp16, 0x7FF8000000000000 @@ -125,19 +125,19 @@ ENDIF: ; preds = %LOOP %tmp96 = call float @llvm.minnum.f32(float %tmp88, float %tmp94) %tmp97 = call float @llvm.maxnum.f32(float %tmp87, float %tmp93) %tmp98 = call float @llvm.maxnum.f32(float %tmp89, float %tmp95) - %tmp99 = call float @llvm.maxnum.f32(float poison, float %tmp96) - %tmp100 = call float @llvm.maxnum.f32(float %tmp99, float poison) - %tmp101 = call float @llvm.minnum.f32(float %tmp97, float poison) + %tmp99 = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float %tmp96) + %tmp100 = call float @llvm.maxnum.f32(float %tmp99, float 0x7FF8000000000000) + %tmp101 = call float @llvm.minnum.f32(float %tmp97, float 0x7FF8000000000000) %tmp102 = call float @llvm.minnum.f32(float %tmp101, float %tmp98) %tmp103 = fsub float %tmp30, 0x7FF8000000000000 %tmp104 = fsub float %tmp31, 0x7FF8000000000000 %tmp105 = fmul float %tmp103, 0.000000e+00 %tmp106 = fmul float %tmp104, 0.000000e+00 - %tmp107 = call float @llvm.minnum.f32(float poison, float %tmp105) - %tmp108 = call float @llvm.maxnum.f32(float poison, float %tmp106) - %tmp109 = call float @llvm.maxnum.f32(float poison, float %tmp107) - %tmp110 = call float @llvm.maxnum.f32(float %tmp109, float poison) - %tmp111 = call float @llvm.minnum.f32(float poison, float %tmp108) + %tmp107 = call float @llvm.minnum.f32(float 0x7FF8000000000000, float %tmp105) + %tmp108 = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float %tmp106) + %tmp109 = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float %tmp107) + %tmp110 = call float @llvm.maxnum.f32(float %tmp109, float 0x7FF8000000000000) + %tmp111 = call float @llvm.minnum.f32(float 0x7FF8000000000000, float %tmp108) %tmp112 = fsub float %tmp32, 0x7FF8000000000000 %tmp113 = fsub float %tmp33, 0x7FF8000000000000 %tmp114 = fsub float %tmp34, 0x7FF8000000000000 @@ -219,18 +219,20 @@ ENDIF: ; preds = %LOOP %tmp190 = fmul float %tmp188, 0x7FF8000000000000 %tmp191 = call float @llvm.maxnum.f32(float %tmp184, float %tmp189) %tmp192 = call float @llvm.maxnum.f32(float %tmp185, float %tmp190) - %tmp193 = call float @llvm.maxnum.f32(float %tmp186, float poison) + %tmp193 = call float 
@llvm.maxnum.f32(float %tmp186, float 0x7FF8000000000000) %tmp194 = call float @llvm.minnum.f32(float %tmp191, float %tmp192) %tmp195 = call float @llvm.minnum.f32(float %tmp194, float %tmp193) - %.temp292.7 = select i1 undef, float %tmp162, float poison + %undef0 = freeze i1 poison + %.temp292.7 = select i1 %undef0, float %tmp162, float 0x7FF8000000000000 %temp292.9 = select i1 false, float %tmp180, float %.temp292.7 - %.temp292.9 = select i1 undef, float poison, float %temp292.9 + %undef1 = freeze i1 poison + %.temp292.9 = select i1 %undef1, float 0x7FF8000000000000, float %temp292.9 %tmp196 = fcmp ogt float 0x7FF8000000000000, 0.000000e+00 %tmp197 = fcmp olt float 0x7FF8000000000000, %tmp195 %tmp198 = and i1 %tmp196, %tmp197 %tmp199 = fcmp olt float 0x7FF8000000000000, %.temp292.9 %tmp200 = and i1 %tmp198, %tmp199 - %temp292.11 = select i1 %tmp200, float poison, float %.temp292.9 + %temp292.11 = select i1 %tmp200, float 0x7FF8000000000000, float %.temp292.9 %tid0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %cmp0 = icmp eq i32 %tid0, 0 br i1 %cmp0, label %IF2565, label %ELSE2566 @@ -238,7 +240,17 @@ ENDIF: ; preds = %LOOP IF2565: ; preds = %ENDIF %tid1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %cmp1 = icmp eq i32 %tid1, 0 - br i1 %cmp1, label %ENDIF2582, label %ELSE2584 + %tmp212 = fadd float %tmp1, 0x7FF8000000000000 + %tmp213 = fadd float 0.000000e+00, %tmp212 + %floor = call float @llvm.floor.f32(float %tmp213) + %tmp214 = fsub float %tmp213, %floor + %tid4 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %cmp4 = icmp eq i32 %tid4, 0 + %tmp215 = fsub float 1.000000e+00, %tmp214 + %tmp216 = call float @llvm.sqrt.f32(float %tmp215) + %tmp217 = fmul float %tmp216, 0x7FF8000000000000 + %tmp218 = fadd float %tmp217, 0x7FF8000000000000 + br label %ENDIF2564 ELSE2566: ; preds = %ENDIF %tid2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) @@ -246,14 +258,14 @@ ELSE2566: ; preds = %ENDIF %tmp201 = fcmp oeq float %temp292.11, %tidf br i1 %tmp201, label %ENDLOOP, label %ELSE2593 -ENDIF2564: ; preds = %ENDIF2594, %ENDIF2588 - %temp894.1 = phi float [ poison, %ENDIF2588 ], [ %temp894.2, %ENDIF2594 ] - %temp18.1 = phi float [ %tmp218, %ENDIF2588 ], [ poison, %ENDIF2594 ] +ENDIF2564: ; preds = %ENDIF2594, %IF2565 + %temp894.1 = phi float [ poison, %IF2565 ], [ %temp894.2, %ENDIF2594 ] + %temp18.1 = phi float [ %tmp218, %IF2565 ], [ poison, %ENDIF2594 ] %tmp202 = fsub float %tmp5, 0x7FF8000000000000 %tmp203 = fmul float %tmp202, 0x7FF8000000000000 - %tmp204 = call float @llvm.maxnum.f32(float poison, float %tmp203) - %tmp205 = call float @llvm.minnum.f32(float %tmp204, float poison) - %tmp206 = call float @llvm.minnum.f32(float %tmp205, float poison) + %tmp204 = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float %tmp203) + %tmp205 = call float @llvm.minnum.f32(float %tmp204, float 0x7FF8000000000000) + %tmp206 = call float @llvm.minnum.f32(float %tmp205, float 0x7FF8000000000000) %tmp207 = fcmp ogt float 0x7FF8000000000000, 0.000000e+00 %tmp208 = fcmp olt float 0x7FF8000000000000, 1.000000e+00 %tmp209 = and i1 %tmp207, %tmp208 @@ -263,31 +275,6 @@ ENDIF2564: ; preds = %ENDIF2594, %ENDIF25 %tmp211 = and i1 %tmp209, %tmp210 br i1 %tmp211, label %ENDIF2795, label %ELSE2797 -ELSE2584: ; preds = %IF2565 - br label %ENDIF2582 - -ENDIF2582: ; preds = %ELSE2584, %IF2565 - %tmp212 = fadd float %tmp1, 0x7FF8000000000000 - %tmp213 = fadd float 0.000000e+00, %tmp212 - %floor = call float @llvm.floor.f32(float %tmp213) - %tmp214 = fsub float %tmp213, %floor - %tid4 = call i32 
@llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) - %cmp4 = icmp eq i32 %tid4, 0 - br i1 %cmp4, label %IF2589, label %ELSE2590 - -IF2589: ; preds = %ENDIF2582 - br label %ENDIF2588 - -ELSE2590: ; preds = %ENDIF2582 - br label %ENDIF2588 - -ENDIF2588: ; preds = %ELSE2590, %IF2589 - %tmp215 = fsub float 1.000000e+00, %tmp214 - %tmp216 = call float @llvm.sqrt.f32(float %tmp215) - %tmp217 = fmul float %tmp216, 0x7FF8000000000000 - %tmp218 = fadd float %tmp217, 0x7FF8000000000000 - br label %ENDIF2564 - ELSE2593: ; preds = %ELSE2566 %tmp219 = fcmp oeq float %temp292.11, %tmp81 %tmp220 = fcmp olt float %tmp81, %tmp83 @@ -298,24 +285,20 @@ ELSE2596: ; preds = %ELSE2593 %tmp222 = fcmp oeq float %temp292.11, %tmp100 %tmp223 = fcmp olt float %tmp100, %tmp102 %tmp224 = and i1 %tmp222, %tmp223 - br i1 %tmp224, label %ENDIF2594, label %ELSE2632 + %undef_ELSE2596 = freeze i1 poison + %brmerge = or i1 %tmp224, %undef_ELSE2596 + br i1 %brmerge, label %ENDIF2594, label %ELSE2650 -ENDIF2594: ; preds = %ELSE2788, %ELSE2785, %ELSE2782, %ELSE2779, %IF2775, %ELSE2761, %ELSE2758, %IF2757, %ELSE2704, %ELSE2686, %ELSE2671, %ELSE2668, %IF2667, %ELSE2632, %ELSE2596, %ELSE2593 - %temp894.2 = phi float [ 0.000000e+00, %IF2667 ], [ 0.000000e+00, %ELSE2671 ], [ 0.000000e+00, %IF2757 ], [ 0.000000e+00, %ELSE2761 ], [ %temp894.0, %ELSE2758 ], [ 0.000000e+00, %IF2775 ], [ 0.000000e+00, %ELSE2779 ], [ 0.000000e+00, %ELSE2782 ], [ %.2848, %ELSE2788 ], [ 0.000000e+00, %ELSE2785 ], [ 0.000000e+00, %ELSE2593 ], [ 0.000000e+00, %ELSE2632 ], [ 0.000000e+00, %ELSE2704 ], [ 0.000000e+00, %ELSE2686 ], [ 0.000000e+00, %ELSE2668 ], [ 0.000000e+00, %ELSE2596 ] +ENDIF2594: ; preds = %ELSE2704, %ELSE2650, %ELSE2596, %ELSE2686, %ELSE2668, %ELSE2593 + %temp894.2 = phi float [ 0.000000e+00, %ELSE2593 ], [ 0.000000e+00, %ELSE2686 ], [ 0.000000e+00, %ELSE2668 ], [ 0.000000e+00, %ELSE2596 ], [ 0.000000e+00, %ELSE2650 ], [ %spec.select6, %ELSE2704 ] %tmp225 = fmul float %temp894.2, 0x7FF8000000000000 br label %ENDIF2564 -ELSE2632: ; preds = %ELSE2596 - br i1 undef, label %ENDIF2594, label %ELSE2650 - -ELSE2650: ; preds = %ELSE2632 +ELSE2650: ; preds = %ELSE2596 %tmp226 = fcmp oeq float %temp292.11, %tmp110 %tmp227 = fcmp olt float %tmp110, %tmp111 %tmp228 = and i1 %tmp226, %tmp227 - br i1 %tmp228, label %IF2667, label %ELSE2668 - -IF2667: ; preds = %ELSE2650 - br i1 undef, label %ENDIF2594, label %ELSE2671 + br i1 %tmp228, label %ENDIF2594, label %ELSE2668 ELSE2668: ; preds = %ELSE2650 %tmp229 = fcmp oeq float %temp292.11, %tmp128 @@ -323,9 +306,6 @@ ELSE2668: ; preds = %ELSE2650 %tmp231 = and i1 %tmp229, %tmp230 br i1 %tmp231, label %ENDIF2594, label %ELSE2686 -ELSE2671: ; preds = %IF2667 - br label %ENDIF2594 - ELSE2686: ; preds = %ELSE2668 %tmp232 = fcmp oeq float %temp292.11, %tmp145 %tmp233 = fcmp olt float %tmp145, 0x7FF8000000000000 @@ -336,37 +316,9 @@ ELSE2704: ; preds = %ELSE2686 %tmp235 = fcmp oeq float %temp292.11, %tmp180 %tmp236 = fcmp olt float %tmp180, 0x7FF8000000000000 %tmp237 = and i1 %tmp235, %tmp236 - br i1 %tmp237, label %ENDIF2594, label %ELSE2740 - -ELSE2740: ; preds = %ELSE2704 - br i1 undef, label %IF2757, label %ELSE2758 - -IF2757: ; preds = %ELSE2740 - br i1 undef, label %ENDIF2594, label %ELSE2761 - -ELSE2758: ; preds = %ELSE2740 - br i1 undef, label %IF2775, label %ENDIF2594 - -ELSE2761: ; preds = %IF2757 - br label %ENDIF2594 - -IF2775: ; preds = %ELSE2758 - %tmp238 = fcmp olt float 0x7FF8000000000000, 0x7FF8000000000000 - br i1 %tmp238, label %ENDIF2594, label %ELSE2779 - -ELSE2779: ; preds = %IF2775 - br i1 undef, label 
%ENDIF2594, label %ELSE2782 - -ELSE2782: ; preds = %ELSE2779 - br i1 undef, label %ENDIF2594, label %ELSE2785 - -ELSE2785: ; preds = %ELSE2782 - %tmp239 = fcmp olt float 0x7FF8000000000000, 0.000000e+00 - br i1 %tmp239, label %ENDIF2594, label %ELSE2788 - -ELSE2788: ; preds = %ELSE2785 - %tmp240 = fcmp olt float 0.000000e+00, 0x7FF8000000000000 - %.2848 = select i1 %tmp240, float -1.000000e+00, float 1.000000e+00 + %undef.ELSE2704 = freeze i1 poison + %spec.select = select i1 %undef.ELSE2704, float 0.000000e+00, float %temp894.0 + %spec.select6 = select i1 %tmp237, float 0.000000e+00, float %spec.select br label %ENDIF2594 ELSE2797: ; preds = %ENDIF2564 @@ -386,22 +338,19 @@ ELSE2797: ; preds = %ENDIF2564 %tmp254 = call float @llvm.minnum.f32(float %tmp245, float %tmp251) %tmp255 = call float @llvm.maxnum.f32(float %tmp246, float %tmp252) %tmp256 = call float @llvm.maxnum.f32(float %tmp253, float %tmp254) - %tmp257 = call float @llvm.maxnum.f32(float %tmp256, float poison) - %tmp258 = call float @llvm.minnum.f32(float poison, float %tmp255) + %tmp257 = call float @llvm.maxnum.f32(float %tmp256, float 0x7FF8000000000000) + %tmp258 = call float @llvm.minnum.f32(float 0x7FF8000000000000, float %tmp255) %tmp259 = fcmp ogt float %tmp257, 0.000000e+00 %tmp260 = fcmp olt float %tmp257, 1.000000e+00 %tmp261 = and i1 %tmp259, %tmp260 %tmp262 = fcmp olt float %tmp257, %tmp258 %tmp263 = and i1 %tmp261, %tmp262 - br i1 %tmp263, label %ENDIF2795, label %ELSE2800 + br i1 %tmp263, label %ENDIF2795, label %ELSE2803 -ENDIF2795: ; preds = %ELSE2824, %ELSE2821, %ELSE2818, %ELSE2815, %ELSE2812, %ELSE2809, %ELSE2806, %ELSE2803, %ELSE2800, %ELSE2797, %ENDIF2564 +ENDIF2795: ; preds = %ELSE2806, %ELSE2797, %ELSE2824, %ELSE2821, %ELSE2803, %ENDIF2564 br label %LOOP -ELSE2800: ; preds = %ELSE2797 - br i1 undef, label %ENDIF2795, label %ELSE2803 - -ELSE2803: ; preds = %ELSE2800 +ELSE2803: ; preds = %ELSE2797 %tmp264 = fsub float %tmp20, 0x7FF8000000000000 %tmp265 = fsub float %tmp21, 0x7FF8000000000000 %tmp266 = fsub float %tmp22, 0x7FF8000000000000 @@ -417,9 +366,9 @@ ELSE2803: ; preds = %ELSE2800 %tmp276 = call float @llvm.minnum.f32(float %tmp267, float %tmp273) %tmp277 = call float @llvm.maxnum.f32(float %tmp268, float %tmp274) %tmp278 = call float @llvm.maxnum.f32(float %tmp269, float %tmp275) - %tmp279 = call float @llvm.maxnum.f32(float %tmp276, float poison) - %tmp280 = call float @llvm.maxnum.f32(float %tmp279, float poison) - %tmp281 = call float @llvm.minnum.f32(float poison, float %tmp277) + %tmp279 = call float @llvm.maxnum.f32(float %tmp276, float 0x7FF8000000000000) + %tmp280 = call float @llvm.maxnum.f32(float %tmp279, float 0x7FF8000000000000) + %tmp281 = call float @llvm.minnum.f32(float 0x7FF8000000000000, float %tmp277) %tmp282 = call float @llvm.minnum.f32(float %tmp281, float %tmp278) %tmp283 = fcmp ogt float %tmp280, 0.000000e+00 %tmp284 = fcmp olt float %tmp280, 1.000000e+00 @@ -438,31 +387,19 @@ ELSE2806: ; preds = %ELSE2803 %tmp294 = fsub float %tmp29, 0x7FF8000000000000 %tmp295 = fmul float %tmp294, 0x7FF8000000000000 %tmp296 = call float @llvm.minnum.f32(float %tmp291, float %tmp295) - %tmp297 = call float @llvm.minnum.f32(float %tmp292, float poison) - %tmp298 = call float @llvm.maxnum.f32(float %tmp293, float poison) + %tmp297 = call float @llvm.minnum.f32(float %tmp292, float 0x7FF8000000000000) + %tmp298 = call float @llvm.maxnum.f32(float %tmp293, float 0x7FF8000000000000) %tmp299 = call float @llvm.maxnum.f32(float %tmp296, float %tmp297) - %tmp300 = call float 
@llvm.maxnum.f32(float %tmp299, float poison) - %tmp301 = call float @llvm.minnum.f32(float poison, float %tmp298) + %tmp300 = call float @llvm.maxnum.f32(float %tmp299, float 0x7FF8000000000000) + %tmp301 = call float @llvm.minnum.f32(float 0x7FF8000000000000, float %tmp298) %tmp302 = fcmp ogt float %tmp300, 0.000000e+00 %tmp303 = fcmp olt float %tmp300, 1.000000e+00 %tmp304 = and i1 %tmp302, %tmp303 %tmp305 = fcmp olt float %tmp300, %tmp301 %tmp306 = and i1 %tmp304, %tmp305 - br i1 %tmp306, label %ENDIF2795, label %ELSE2809 - -ELSE2809: ; preds = %ELSE2806 - br i1 undef, label %ENDIF2795, label %ELSE2812 - -ELSE2812: ; preds = %ELSE2809 - br i1 undef, label %ENDIF2795, label %ELSE2815 - -ELSE2815: ; preds = %ELSE2812 - br i1 undef, label %ENDIF2795, label %ELSE2818 - -ELSE2818: ; preds = %ELSE2815 - br i1 undef, label %ENDIF2795, label %ELSE2821 + br i1 %tmp306, label %ENDIF2795, label %ELSE2821 -ELSE2821: ; preds = %ELSE2818 +ELSE2821: ; preds = %ELSE2806 %tmp307 = fsub float %tmp56, 0x7FF8000000000000 %tmp308 = fsub float %tmp57, 0x7FF8000000000000 %tmp309 = fsub float %tmp58, 0x7FF8000000000000 @@ -488,7 +425,8 @@ ELSE2821: ; preds = %ELSE2818 br i1 %tmp328, label %ENDIF2795, label %ELSE2824 ELSE2824: ; preds = %ELSE2821 - %.2849 = select i1 undef, float 0.000000e+00, float 1.000000e+00 + %undef = freeze i1 poison + %.2849 = select i1 %undef, float 0.000000e+00, float 1.000000e+00 br label %ENDIF2795 } diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index 244a90fa0c4c4..7e0341efad6f8 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1134,17 +1134,19 @@ exit: } ; bug 28550 -define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 { +define amdgpu_ps void @phi_use_def_before_kill(float inreg %x, i32 inreg %y) #0 { ; SI-LABEL: phi_use_def_before_kill: ; SI: ; %bb.0: ; %bb ; SI-NEXT: v_add_f32_e64 v1, s0, 1.0 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1 +; SI-NEXT: s_mov_b64 s[2:3], exec ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1 -; SI-NEXT: s_andn2_b64 exec, exec, vcc +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc ; SI-NEXT: s_cbranch_scc0 .LBB11_6 ; SI-NEXT: ; %bb.1: ; %bb ; SI-NEXT: s_andn2_b64 exec, exec, vcc +; SI-NEXT: s_cmp_lg_u32 s1, 0 ; SI-NEXT: s_cbranch_scc0 .LBB11_3 ; SI-NEXT: ; %bb.2: ; %bb8 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1172,13 +1174,15 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 { ; GFX10-WAVE64-LABEL: phi_use_def_before_kill: ; GFX10-WAVE64: ; %bb.0: ; %bb ; GFX10-WAVE64-NEXT: v_add_f32_e64 v1, s0, 1.0 +; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec ; GFX10-WAVE64-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1 ; GFX10-WAVE64-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc ; GFX10-WAVE64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1 -; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc +; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc ; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB11_6 ; GFX10-WAVE64-NEXT: ; %bb.1: ; %bb ; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc +; GFX10-WAVE64-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX10-WAVE64-NEXT: ; %bb.2: ; %bb8 ; GFX10-WAVE64-NEXT: v_mov_b32_e32 v1, 8 @@ -1202,13 +1206,15 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 { ; GFX10-WAVE32-LABEL: phi_use_def_before_kill: ; GFX10-WAVE32: ; %bb.0: ; %bb ; GFX10-WAVE32-NEXT: v_add_f32_e64 v1, s0, 1.0 +; GFX10-WAVE32-NEXT: s_mov_b32 s2, exec_lo ; GFX10-WAVE32-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0, v1 ; 
GFX10-WAVE32-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo ; GFX10-WAVE32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v1 -; GFX10-WAVE32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo +; GFX10-WAVE32-NEXT: s_andn2_b32 s2, s2, vcc_lo ; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB11_6 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %bb ; GFX10-WAVE32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo +; GFX10-WAVE32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX10-WAVE32-NEXT: ; %bb.2: ; %bb8 ; GFX10-WAVE32-NEXT: v_mov_b32_e32 v1, 8 @@ -1232,14 +1238,16 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 { ; GFX11-LABEL: phi_use_def_before_kill: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_add_f32_e64 v1, s0, 1.0 +; GFX11-NEXT: s_mov_b64 s[2:3], exec ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc ; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1 -; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc +; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], vcc ; GFX11-NEXT: s_cbranch_scc0 .LBB11_6 ; GFX11-NEXT: ; %bb.1: ; %bb ; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX11-NEXT: ; %bb.2: ; %bb8 ; GFX11-NEXT: v_mov_b32_e32 v1, 8 @@ -1265,7 +1273,8 @@ bb: %tmp2 = select i1 %tmp1, float -1.000000e+00, float 0.000000e+00 %cmp.tmp2 = fcmp olt float %tmp2, 0.0 call void @llvm.amdgcn.kill(i1 %cmp.tmp2) - br i1 undef, label %phibb, label %bb8 + %uniform.cond = icmp eq i32 %y, 0 + br i1 %uniform.cond, label %phibb, label %bb8 phibb: %tmp5 = phi float [ %tmp2, %bb ], [ 4.0, %bb8 ] diff --git a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll index 3176257920a7a..71e4755b58bf2 100644 --- a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll +++ b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll @@ -34,7 +34,7 @@ entry: %conv = add i32 %i6, %i7 %conv.frozen = freeze i32 %conv %div = udiv i32 %conv.frozen, 49 - %add.ptr22 = getelementptr inbounds float, ptr addrspace(4) %wei_ptr, i64 undef + %add.ptr22 = getelementptr inbounds float, ptr addrspace(4) %wei_ptr, i64 0 %in.ptr1 = getelementptr inbounds float, ptr addrspace(1) %in, i32 %i5 br label %for.cond28.preheader @@ -471,11 +471,11 @@ for.cond28.preheader: ; preds = %for.cond28.preheade br i1 %exitcond.not, label %for.cond.cleanup26, label %for.cond28.preheader for.cond.cleanup26: ; preds = %for.cond28.preheader - %mul119 = shl nuw nsw i32 undef, 1 + %mul119 = shl nuw nsw i32 0, 1 %mul120 = mul i32 %div, 200704 - %mul121 = mul i32 undef, 6272 + %mul121 = mul i32 0, 6272 %add122 = add i32 %mul120, %mul121 - %mul123 = mul nuw nsw i32 undef, 28 + %mul123 = mul nuw nsw i32 0, 28 %add124 = add i32 %add122, %mul123 %add126 = add i32 %add124, %mul119 %idx.ext127 = zext i32 %add126 to i64 diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll index fc5f6d9dab796..b7e6ebaa655b9 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -87,18 +87,18 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc ; CHECK-NEXT: 
[[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %301:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %302:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %356:sgpr_128, undef %357:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %367:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, undef %358:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %368:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.124, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %351:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %362:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %352:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %363:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc ; CHECK-NEXT: 
[[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc @@ -116,7 +116,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %383:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %384:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4) @@ -198,9 +198,9 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %469:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %470:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) ; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1 - ; CHECK-NEXT: KILL undef %469:sreg_64 + ; CHECK-NEXT: KILL undef %470:sreg_64 ; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3 ; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4) @@ -211,8 +211,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: KILL 
[[S_LOAD_DWORDX4_IMM23]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]] ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_2]] @@ -236,10 +236,10 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] - ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]] + ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec @@ -351,13 +351,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec - ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %542:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %543:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4) ; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec ; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec ; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec - ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %556:vgpr_32, undef %558:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) + ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %557:vgpr_32, undef %559:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 
0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 .expVert: %0 = extractelement <31 x i32> %userData, i64 2 @@ -406,7 +406,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x %40 = and i32 %rootDesc58.ii1.i, 65535 %41 = insertelement <4 x i32> , i32 %rootDesc58.ii0.i, i32 0 %42 = insertelement <4 x i32> %41, i32 %40, i32 1 - %43 = and i32 undef, 65535 + %43 = and i32 0, 65535 %44 = insertelement <4 x i32> poison, i32 %43, i32 1 %45 = load <4 x i32>, ptr addrspace(4) poison, align 16 %46 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %45, i32 0, i32 0, i32 0, i32 0) @@ -470,7 +470,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x %104 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %103, i32 0, i32 0, i32 0, i32 0) %105 = add i32 %104, -34 %106 = or i32 %101, %105 - %107 = call i32 @llvm.amdgcn.readfirstlane(i32 undef) + %undef = freeze i32 poison + %107 = call i32 @llvm.amdgcn.readfirstlane(i32 %undef) %108 = sext i32 %107 to i64 %109 = getelementptr i8, ptr addrspace(4) %91, i64 %108 %110 = load <4 x i32>, ptr addrspace(4) %109, align 16 @@ -490,7 +491,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x %124 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> poison, i32 0, i32 0, i32 0, i32 0) %125 = add i32 %124, -39 %126 = or i32 %123, %125 - %127 = call i32 @llvm.amdgcn.readfirstlane(i32 undef) + %undef1 = freeze i32 poison + %127 = call i32 @llvm.amdgcn.readfirstlane(i32 %undef1) %128 = sext i32 %127 to i64 %129 = getelementptr i8, ptr addrspace(4) %32, i64 %128 %130 = load <4 x i32>, ptr addrspace(4) %129, align 16 @@ -513,7 +515,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x %147 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %146, i32 0, i32 0, i32 0, i32 0) %148 = add i32 %147, -53 %149 = or i32 %144, %148 - %150 = sext i32 undef to i64 + %150 = sext i32 0 to i64 %151 = getelementptr i8, ptr addrspace(4) %134, i64 %150 %152 = load <4 x i32>, ptr addrspace(4) %151, align 16 %153 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %152, i32 0, i32 0, i32 0, i32 0) @@ -574,7 +576,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x %208 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %44, i32 %207, i32 0) %209 = add i32 %208, -130 %210 = or i32 %205, %209 - %211 = getelementptr <{ [4 x i32], [6 x %llpc.array.element] }>, ptr addrspace(6) null, i32 0, i32 1, i32 undef, i32 0 + %211 = getelementptr <{ [4 x i32], [6 x %llpc.array.element] }>, ptr addrspace(6) null, i32 0, i32 1, i32 0, i32 0 %212 = ptrtoint ptr addrspace(6) %211 to i32 %213 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %44, i32 %212, i32 0) %214 = add i32 %213, -178 @@ -617,7 +619,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x %251 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> poison, i32 %250, i32 0) %252 = add i32 %251, -249 %253 = or i32 %248, %252 - %254 = getelementptr <{ [4 x i32], [6 x %llpc.array.element.2] }>, ptr addrspace(6) null, i32 0, i32 1, i32 undef, i32 0 + %254 = getelementptr <{ [4 x i32], [6 x %llpc.array.element.2] }>, ptr addrspace(6) null, i32 0, i32 1, i32 0, i32 0 %255 = ptrtoint ptr addrspace(6) %254 to i32 %256 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> poison, i32 %255, i32 0) %257 = add i32 %256, -297 @@ -661,7 +663,7 @@ define amdgpu_gs void 
@_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x %295 = sext i32 %294 to i64 %296 = getelementptr i8, ptr addrspace(4) %293, i64 %295 %.ii0.i = load i32, ptr addrspace(4) %296, align 8 - %297 = and i32 undef, 65535 + %297 = and i32 0, 65535 %298 = insertelement <4 x i32> , i32 %.ii0.i, i32 0 %299 = insertelement <4 x i32> %298, i32 %297, i32 1 %300 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %299, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll index 4f551d4c9de1a..78103d5e40425 100644 --- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -8,6 +8,7 @@ define amdgpu_kernel void @func() #0 { ; CHECK-LABEL: func: ; CHECK: ; %bb.0: ; %B0 +; CHECK-NEXT: s_cmp_lg_u32 s8, 0 ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %B30.1 @@ -18,17 +19,19 @@ define amdgpu_kernel void @func() #0 { ; CHECK-NEXT: ds_write_b32 v0, v0 ; CHECK-NEXT: s_endpgm B0: - br i1 undef, label %B1, label %B2 + %id = call i32 @llvm.amdgcn.workgroup.id.x() + %cmp = icmp eq i32 %id, 0 + br i1 %cmp, label %B1, label %B2 B1: br label %B2 B2: %v0 = phi <4 x float> [ zeroinitializer, %B1 ], [ , %B0 ] - br i1 undef, label %B30.1, label %B30.2 + br i1 %cmp, label %B30.1, label %B30.2 B30.1: - %sub = fsub <4 x float> %v0, undef + %sub = fsub <4 x float> %v0, splat (float 0x7FF8000000000000) br label %B30.2 B30.2: @@ -73,7 +76,7 @@ bb: %tmp3 = bitcast i32 %tmp1 to float %tmp4 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %tmp3, float %tmp3, <8 x i32> poison, <4 x i32> poison, i1 0, i32 0, i32 0) %tmp5 = extractelement <4 x float> %tmp4, i32 0 - %tmp6 = fmul float %tmp5, undef + %tmp6 = fmul float %tmp5, 0x7FF8000000000000 %tmp7 = fadd float %tmp6, %tmp6 %tmp8 = insertelement <4 x i32> %tmp2, i32 %tmp, i32 1 store <4 x i32> %tmp8, ptr addrspace(1) poison, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll index 9a330a2683097..374c6701f1ba6 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -1150,6 +1150,10 @@ define void @move_to_valu_vgpr_operand_phi(ptr addrspace(3) %out) { ; SI-NEXT: v_add_i32_e64 v0, s[4:5], 8, v0 ; SI-NEXT: .LBB20_2: ; %bb1 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ; def s4 +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc1 .LBB20_1 ; SI-NEXT: ; %bb.3: ; %bb2 ; SI-NEXT: ; in Loop: Header=BB20_2 Depth=1 @@ -1173,6 +1177,10 @@ define void @move_to_valu_vgpr_operand_phi(ptr addrspace(3) %out) { ; VI-NEXT: v_add_u32_e64 v0, s[4:5], 8, v0 ; VI-NEXT: .LBB20_2: ; %bb1 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: ;;#ASMSTART +; VI-NEXT: ; def s4 +; VI-NEXT: ;;#ASMEND +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: s_cbranch_scc1 .LBB20_1 ; VI-NEXT: ; %bb.3: ; %bb2 ; VI-NEXT: ; in Loop: Header=BB20_2 Depth=1 @@ -1189,7 +1197,9 @@ bb1: ; preds = %bb3, %bb0 %tmp0 = phi i32 [ 8, %bb0 ], [ %tmp4, %bb3 ] %tmp1 = add nsw i32 %tmp0, -1 %tmp2 = getelementptr inbounds i32, ptr addrspace(3) %out, i32 %tmp1 - br i1 undef, label %bb2, label %bb3 + %cond = call i32 asm "; def $0","=s"() + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %bb2, label %bb3 bb2: ; preds = %bb1 store volatile i32 1, ptr addrspace(3) %tmp2, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll 
b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index 3e5b8b1b13db6..aea25b37e8f4e 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -538,7 +538,8 @@ if.then: ; preds = %entry ret void if.then9: ; preds = %entry - br i1 undef, label %sw.bb18, label %sw.bb + %undef = freeze i1 poison + br i1 %undef, label %sw.bb18, label %sw.bb sw.bb: ; preds = %if.then9 %i17 = load i8, ptr addrspace(1) null, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index baf9e9df91689..4212fd3b35cd8 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1511,7 +1511,7 @@ define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %a ; GFX1064-NEXT: s_endpgm bb0: %tmp = icmp sgt i32 %arg1, 4 - %undef = call i1 @llvm.amdgcn.class.f32(float poison, i32 undef) + %undef = call i1 @llvm.amdgcn.class.f32(float poison, i32 0) %tmp4 = select i1 %undef, float %arg, float 1.000000e+00 %tmp5 = fcmp ogt float %arg2, 0.000000e+00 %tmp6 = fcmp olt float %arg2, 1.000000e+00 @@ -2329,7 +2329,7 @@ for.body.lr.ph: ; preds = %entry br label %for.body for.body: ; preds = %for.body, %for.body.lr.ph - br i1 undef, label %for.end, label %for.body + br i1 poison, label %for.end, label %for.body for.end: ; preds = %for.body, %entry ret void diff --git a/llvm/test/CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll b/llvm/test/CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll index 519cc1478a434..ed57628fa721c 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll @@ -16,7 +16,7 @@ define amdgpu_cs void @shader(i32 %arg0, i32 %arg1, <8 x i32> inreg %arg2, ptr a %bload1.f = bitcast i32 %bload1 to float %bload2.f = bitcast i32 %bload2 to float %bload3.f = bitcast i32 %bload3 to float - %istore0 = insertelement <4 x float> undef, float %bload0.f, i32 0 + %istore0 = insertelement <4 x float> poison, float %bload0.f, i32 0 %istore1 = insertelement <4 x float> %istore0, float %bload0.f, i32 1 %istore2 = insertelement <4 x float> %istore1, float %bload0.f, i32 2 %istore3 = insertelement <4 x float> %istore2, float %bload0.f, i32 3 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll index 883657547519b..b2f299d531f5c 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll @@ -59,7 +59,7 @@ br i1 %0, label %bb2, label %bb4, !dbg !12, !amdgpu.uniform !7 bb2: ; preds = %Flow - store volatile i32 17, ptr addrspace(1) undef, align 4, !dbg !13 + store volatile i32 17, ptr addrspace(1) poison, align 4, !dbg !13 br label %bb4, !dbg !14, !amdgpu.uniform !7 bb3: ; preds = %bb0 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll index 278bf086d6088..93f2c343cd051 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll @@ -51,7 +51,7 @@ bb0: br i1 %tmp, label %bb2, label %bb3 bb2: - store volatile i32 17, ptr addrspace(1) undef + store volatile i32 17, ptr addrspace(1) poison br label %bb4 bb3: diff --git a/llvm/test/CodeGen/MIR/AMDGPU/mircanon-memoperands.mir 
b/llvm/test/CodeGen/MIR/AMDGPU/mircanon-memoperands.mir index af0f28f6b5d74..aa30e20dd7f06 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/mircanon-memoperands.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/mircanon-memoperands.mir @@ -33,11 +33,11 @@ body: | ; CHECK-NEXT: %bb0_{{[0-9a-f]+}}__1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0 = COPY $sgpr4_sgpr5 - %1 = S_LOAD_DWORDX2_IMM %0, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`) - %2 = S_LOAD_DWORDX2_IMM %0, 0, 0 :: ( dereferenceable invariant load (s64) from `ptr addrspace(4) undef`) - %3 = S_LOAD_DWORDX2_IMM %0, 0, 0 :: ( invariant load (s64) from `ptr addrspace(4) undef`) - %4 = S_LOAD_DWORDX2_IMM %0, 0, 0 :: ( load (s64) from `ptr addrspace(4) undef`) - %5 = S_LOAD_DWORDX2_IMM %0, 0, 0 :: ( load (s64) from `ptr addrspace(2) undef`) - %6 = S_LOAD_DWORDX2_IMM %0, 0, 0 :: ( load (s64) from `ptr addrspace(1) undef`) + %1 = S_LOAD_DWORDX2_IMM %0, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) poison`) + %2 = S_LOAD_DWORDX2_IMM %0, 0, 0 :: ( dereferenceable invariant load (s64) from `ptr addrspace(4) poison`) + %3 = S_LOAD_DWORDX2_IMM %0, 0, 0 :: ( invariant load (s64) from `ptr addrspace(4) poison`) + %4 = S_LOAD_DWORDX2_IMM %0, 0, 0 :: ( load (s64) from `ptr addrspace(4) poison`) + %5 = S_LOAD_DWORDX2_IMM %0, 0, 0 :: ( load (s64) from `ptr addrspace(2) poison`) + %6 = S_LOAD_DWORDX2_IMM %0, 0, 0 :: ( load (s64) from `ptr addrspace(1) poison`) ... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/syncscopes.mir b/llvm/test/CodeGen/MIR/AMDGPU/syncscopes.mir index c28a4405d488c..db18d5433da3b 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/syncscopes.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/syncscopes.mir @@ -74,14 +74,14 @@ body: | liveins: $sgpr4_sgpr5 S_WAITCNT 0 - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr4_sgpr5, 8, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`) - $sgpr6 = S_LOAD_DWORD_IMM $sgpr4_sgpr5, 0, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`) - $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr4_sgpr5, 24, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`) - $sgpr7 = S_LOAD_DWORD_IMM $sgpr4_sgpr5, 16, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`) - $sgpr8 = S_LOAD_DWORD_IMM $sgpr4_sgpr5, 32, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr4_sgpr5, 8, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) poison`) + $sgpr6 = S_LOAD_DWORD_IMM $sgpr4_sgpr5, 0, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) poison`) + $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr4_sgpr5, 24, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) poison`) + $sgpr7 = S_LOAD_DWORD_IMM $sgpr4_sgpr5, 16, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) poison`) + $sgpr8 = S_LOAD_DWORD_IMM $sgpr4_sgpr5, 32, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) poison`) S_WAITCNT 127 $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed $sgpr4_sgpr5, 40, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`) + $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed $sgpr4_sgpr5, 40, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) 
poison`) $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1, implicit $sgpr0_sgpr1, implicit $exec $vgpr2 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, 19, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("agent") seq_cst (s32) into %ir.agent_out)
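
The recurring idiom in these test updates: a bare `br i1 undef` (or a `select i1 undef`) lets the optimizer pick whichever operand is convenient, so the control flow or value the test exists to exercise can be folded away. Rewriting the condition as `freeze i1 poison` keeps it defined-but-unknowable, which preserves both successors. A minimal sketch of the pattern, assuming hypothetical function and label names not taken from any test above:

  define float @freeze_poison_sketch(float %x) {
  entry:
    ; freeze pins poison to some fixed i1 value, so later passes
    ; may not assume either successor is dead.
    %cond = freeze i1 poison
    br i1 %cond, label %then, label %else

  then:
    %a = fadd float %x, 1.0
    ret float %a

  else:
    ret float %x
  }

Where a frozen condition would still fold (or where the test wants a uniform branch in particular), the updates instead thread a real value through a new `inreg` argument or an inline-asm def, as in the `phi_use_def_before_kill` and `move_to_valu_vgpr_operand_phi` hunks above.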