diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index a02e0b37479a0..ebb345f561687 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -769,7 +769,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 0
   %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
   store volatile i32 15, ptr addrspace(5) %i7, align 4
@@ -956,7 +956,7 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 0
   %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %i3 = zext i32 %i2 to i64
@@ -1153,7 +1153,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 0
   %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
   store volatile i32 15, ptr addrspace(5) %i7, align 4
@@ -1358,7 +1358,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 0
   %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
   store volatile i32 15, ptr addrspace(5) %i7, align 4
@@ -1549,7 +1549,7 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 0
   %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %i3 = zext i32 %i2 to i64
@@ -1750,7 +1750,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 0
   %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
   store volatile i32 15, ptr addrspace(5) %i7, align 4
@@ -1900,7 +1900,7 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; UNALIGNED_GFX12-NEXT: s_endpgm
 bb:
   %i = alloca [4096 x i32], align 4, addrspace(5)
-  %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef
+  %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 0
   store volatile i32 13, ptr addrspace(5) %i1, align 4
   %i7 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
   store volatile i32 15, ptr addrspace(5) %i7, align 4
@@ -2055,7 +2055,7 @@ define void @store_load_large_imm_offset_foo() {
 ; UNALIGNED_GFX12-NEXT: s_setpc_b64 s[30:31]
 bb:
   %i = alloca [4096 x i32], align 4, addrspace(5)
-  %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef
+  %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 0
   store volatile i32 13, ptr addrspace(5) %i1, align 4
   %i7 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
   store volatile i32 15, ptr addrspace(5) %i7, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
index ec893feb8d9cb..ce195593627db 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
@@ -1681,7 +1681,7 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT: s_endpgm
-  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false)
+  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double poison, i1 false)
   %result0 = extractvalue { double, i1 } %result, 0
   store double %result0, ptr addrspace(1) %out, align 8
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
index c70a2e6ee6758..24fe2d1c41ffa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
@@ -215,7 +215,7 @@ define double @v_rsq_clamp_undef_f64() #0 {
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
 ; GFX12-NEXT: s_setpc_b64 s[30:31]
-  %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double undef)
+  %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double poison)
   ret double %rsq_clamp
 }
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll b/llvm/test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll
index 2ddb2fea5ddc6..67382d9cb47f5 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll
@@ -12,7 +12,7 @@
 define amdgpu_hs void @_amdgpu_hs_main(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 inreg %arg7, <6 x i32> inreg %arg8) {
 .entry:
   %__llpc_global_proxy_7.i = alloca [3 x <4 x float>], align 16, addrspace(5)
-  %tmp = icmp ult i32 undef, undef
+  %tmp = icmp ult i32 %arg, 0
   br i1 %tmp, label %.beginls, label %.endls
 
 .beginls: ; preds = %.entry
diff --git a/llvm/test/CodeGen/AMDGPU/bug-v4f64-subvector.ll b/llvm/test/CodeGen/AMDGPU/bug-v4f64-subvector.ll
index d3808abc9432f..162b88d573624 100644
--- a/llvm/test/CodeGen/AMDGPU/bug-v4f64-subvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/bug-v4f64-subvector.ll
@@ -7,12 +7,12 @@
 ; CHECK: GLOBAL_LOAD_DWORDX4
 ; CHECK: GLOBAL_LOAD_DWORDX4
 ; CHECK: GLOBAL_STORE_DWORDX4
-define protected amdgpu_kernel void @test1() local_unnamed_addr !kernel_arg_addr_space !0 !kernel_arg_access_qual !1 !kernel_arg_type !2 !kernel_arg_base_type !2 !kernel_arg_type_qual !3 !kernel_arg_name !4 {
+define protected amdgpu_kernel void @test1(ptr addrspace(4) %ptr) local_unnamed_addr !kernel_arg_addr_space !0 !kernel_arg_access_qual !1 !kernel_arg_type !2 !kernel_arg_base_type !2 !kernel_arg_type_qual !3 !kernel_arg_name !4 {
 entry:
-  %tmp = load <3 x i64>, ptr addrspace(4) poison, align 16, !invariant.load !5
+  %tmp = load <3 x i64>, ptr addrspace(4) %ptr, align 16, !invariant.load !5
   %srcA.load2 = extractelement <3 x i64> %tmp, i32 0
   %tmp1 = inttoptr i64 %srcA.load2 to ptr addrspace(1)
-  %tmp2 = getelementptr inbounds double, ptr addrspace(1) %tmp1, i64 undef
+  %tmp2 = getelementptr inbounds double, ptr addrspace(1) %tmp1, i64 0
   %tmp4 = load <3 x double>, ptr addrspace(1) %tmp2, align 8, !tbaa !6
   %tmp5 = extractelement <3 x double> %tmp4, i32 1
   %tmp6 = insertelement <3 x double> poison, double %tmp5, i32 1
@@ -34,12 +34,12 @@ entry:
 ; CHECK: GLOBAL_LOAD_DWORDX2
 ; CHECK: GLOBAL_LOAD_DWORDX2
 ; CHECK: GLOBAL_STORE_DWORDX2
-define protected amdgpu_kernel void @test2() local_unnamed_addr !kernel_arg_addr_space !0 !kernel_arg_access_qual !1 !kernel_arg_type !2 !kernel_arg_base_type !2 !kernel_arg_type_qual !3 !kernel_arg_name !4 {
+define protected amdgpu_kernel void @test2(ptr addrspace(4) %ptr) local_unnamed_addr !kernel_arg_addr_space !0 !kernel_arg_access_qual !1 !kernel_arg_type !2 !kernel_arg_base_type !2 !kernel_arg_type_qual !3 !kernel_arg_name !4 {
 entry:
-  %tmp = load <3 x i64>, ptr addrspace(4) poison, align 16, !invariant.load !5
+  %tmp = load <3 x i64>, ptr addrspace(4) %ptr, align 16, !invariant.load !5
   %srcA.load2 = extractelement <3 x i64> %tmp, i32 0
   %tmp1 = inttoptr i64 %srcA.load2 to ptr addrspace(1)
-  %tmp2 = getelementptr inbounds double, ptr addrspace(1) %tmp1, i64 undef
+  %tmp2 = getelementptr inbounds double, ptr addrspace(1) %tmp1, i64 0
   %tmp4 = load <3 x double>, ptr addrspace(1) %tmp2, align 8, !tbaa !6
   %tmp5 = extractelement <3 x double> %tmp4, i32 1
   %tmp6 = insertelement <3 x double> poison, double %tmp5, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
index fa4e82da1d18e..7ce69fe2f4989 100644
--- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -359,7 +359,7 @@ for.body:
   %add = fadd float %vecload, 1.0
   store float %add, ptr addrspace(3) %arrayidx, align 8
   %inc = add i32 %indvar, 1
-  br i1 undef, label %for.body, label %for.exit
+  br i1 poison, label %for.body, label %for.exit
 }
 
 define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
index 587f172c84edf..f712421083e6b 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
@@ -99,16 +99,17 @@ ret:
 
 ; OPT-LABEL: @sink_ubfe_i16(
 ; OPT: entry:
+; OPT-NEXT: icmp
 ; OPT-NEXT: br i1
 
 ; OPT: bb0:
-; OPT: %0 = lshr i16 %arg1, 4
-; OPT-NEXT: %val0 = and i16 %0, 255
+; OPT: [[LSHR0:%[0-9]+]] = lshr i16 %arg1, 4
+; OPT-NEXT: %val0 = and i16 [[LSHR0]], 255
 ; OPT: br label
 
 ; OPT: bb1:
-; OPT: %1 = lshr i16 %arg1, 4
-; OPT-NEXT: %val1 = and i16 %1, 127
+; OPT: [[LSHR1:%[0-9]+]] = lshr i16 %arg1, 4
+; OPT-NEXT: %val1 = and i16 [[LSHR1]], 127
 ; OPT: br label
 
 ; OPT: ret:
@@ -123,19 +124,21 @@ ret:
 ; VI: s_bfe_u32 [[BFE:s[0-9]+]], [[ARG]], 0xc0004
 
 ; GCN: s_cbranch_scc{{[0-1]}}
+; GCN: ; %bb.1:
 
 ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004
 ; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7f
 
-; GCN: .LBB2_3:
+; GCN: .LBB2_2:
 ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004
 ; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff
 
 ; GCN: buffer_store_short
 ; GCN: s_endpgm
-define amdgpu_kernel void @sink_ubfe_i16(ptr addrspace(1) %out, i16 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i16(ptr addrspace(1) %out, i16 %arg1, [8 x i32], i32 %arg2) #0 {
 entry:
   %shr = lshr i16 %arg1, 4
-  br i1 undef, label %bb0, label %bb1
+  %cond = icmp eq i32 %arg2, 0
+  br i1 %cond, label %bb0, label %bb1
 
 bb0:
   %val0 = and i16 %shr, 255
diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
index 77dfc859cd1b1..434fc764e1fa6 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
@@ -63,7 +63,7 @@ define i32 @combine_add_zext_xor() {
 .a: ; preds = %bb9, %.entry
   %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ]
-  br i1 undef, label %bb9, label %bb
+  br i1 poison, label %bb9, label %bb
 
 bb: ; preds = %.a
   %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) poison, i32 %.2, i32 64, i32 1)
@@ -411,28 +411,33 @@ bb9: ; preds = %bb, %.a
 
 ; Test that unused lanes in the s_and result are masked out with v_cndmask.
 
-define i32 @combine_sub_zext_and() {
+define i32 @combine_sub_zext_and(i32 inreg %cond) {
 ; GFX1010-LABEL: combine_sub_zext_and:
 ; GFX1010: ; %bb.0: ; %.entry
 ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: s_cmp_lg_u32 s16, 0
 ; GFX1010-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX1010-NEXT: v_cmp_ne_u32_e64 s4, 1, v0
 ; GFX1010-NEXT: s_branch .LBB5_2
 ; GFX1010-NEXT: .LBB5_1: ; %bb9
 ; GFX1010-NEXT: ; in Loop: Header=BB5_2 Depth=1
 ; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
-; GFX1010-NEXT: s_and_b32 s4, s4, vcc_lo
-; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX1010-NEXT: s_and_b32 s5, s5, vcc_lo
+; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s5
 ; GFX1010-NEXT: v_sub_nc_u32_e32 v1, v1, v0
 ; GFX1010-NEXT: s_cbranch_vccz .LBB5_4
 ; GFX1010-NEXT: .LBB5_2: ; %.a
 ; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1010-NEXT: ; implicit-def: $sgpr4
-; GFX1010-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s4
+; GFX1010-NEXT: ; implicit-def: $sgpr5
+; GFX1010-NEXT: s_cbranch_vccnz .LBB5_1
 ; GFX1010-NEXT: ; %bb.3: ; %bb
 ; GFX1010-NEXT: ; in Loop: Header=BB5_2 Depth=1
 ; GFX1010-NEXT: buffer_load_dword v0, v1, s[4:7], 64 offen glc
 ; GFX1010-NEXT: s_waitcnt vmcnt(0)
-; GFX1010-NEXT: v_cmp_eq_u32_e64 s4, 0, v0
+; GFX1010-NEXT: v_cmp_eq_u32_e64 s5, 0, v0
 ; GFX1010-NEXT: s_branch .LBB5_1
 ; GFX1010-NEXT: .LBB5_4: ; %.exit
 ; GFX1010-NEXT: s_setpc_b64 s[30:31]
@@ -440,26 +445,32 @@ define i32 @combine_sub_zext_and() {
 ; GFX1100-LABEL: combine_sub_zext_and:
 ; GFX1100: ; %bb.0: ; %.entry
 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_cmp_lg_u32 s0, 0
 ; GFX1100-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-NEXT: s_cselect_b32 s0, -1, 0
+; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0
 ; GFX1100-NEXT: s_branch .LBB5_2
 ; GFX1100-NEXT: .LBB5_1: ; %bb9
 ; GFX1100-NEXT: ; in Loop: Header=BB5_2 Depth=1
 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
-; GFX1100-NEXT: s_and_b32 s0, s0, vcc_lo
-; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX1100-NEXT: s_and_b32 s1, s1, vcc_lo
+; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-NEXT: v_sub_nc_u32_e32 v1, v1, v0
 ; GFX1100-NEXT: s_cbranch_vccz .LBB5_4
 ; GFX1100-NEXT: .LBB5_2: ; %.a
 ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1100-NEXT: ; implicit-def: $sgpr0
-; GFX1100-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0
+; GFX1100-NEXT: ; implicit-def: $sgpr1
+; GFX1100-NEXT: s_cbranch_vccnz .LBB5_1
 ; GFX1100-NEXT: ; %bb.3: ; %bb
 ; GFX1100-NEXT: ; in Loop: Header=BB5_2 Depth=1
 ; GFX1100-NEXT: buffer_load_b32 v0, v1, s[0:3], 64 offen glc
 ; GFX1100-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e64 s0, 0, v0
+; GFX1100-NEXT: v_cmp_eq_u32_e64 s1, 0, v0
 ; GFX1100-NEXT: s_branch .LBB5_1
 ; GFX1100-NEXT: .LBB5_4: ; %.exit
 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
@@ -468,7 +479,8 @@ define i32 @combine_sub_zext_and() {
 .a: ; preds = %bb9, %.entry
   %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ]
-  br i1 undef, label %bb9, label %bb
+  %cmp = icmp eq i32 %cond, 0
+  br i1 %cmp, label %bb9, label %bb
 
 bb: ; preds = %.a
   %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) poison, i32 %.2, i32 64, i32 1)
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 9a5dcfc0e39b3..2ec6f7ab7602b 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -2993,7 +2993,7 @@ entry:
 
 for.body.i: ; preds = %for.body.i, %entry
   %retval.sroa.0.0.copyload = load ptr, ptr addrspace(1) poison, align 8
-  %add.ptr = getelementptr inbounds %Vec, ptr %retval.sroa.0.0.copyload, i64 undef
+  %add.ptr = getelementptr inbounds %Vec, ptr %retval.sroa.0.0.copyload, i64 0
   %retval.sroa.0.0..sroa_cast_adr = addrspacecast ptr %add.ptr to ptr addrspace(1)
   %retval.sroa.0.0.copyload.i = load i32, ptr addrspace(1) %retval.sroa.0.0..sroa_cast_adr, align 1
   %p1.sroa.6.0.extract.shift = lshr i32 %retval.sroa.0.0.copyload.i, 24
diff --git a/llvm/test/CodeGen/AMDGPU/debug-value.ll b/llvm/test/CodeGen/AMDGPU/debug-value.ll
index f13bd665cc7f0..60ffc28cef577 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/debug-value.ll
@@ -13,8 +13,8 @@ bb:
   %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
   %tmp11 = sext i32 %tmp10 to i64
   %tmp12 = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i64 %tmp11
-  %tmp14 = getelementptr inbounds i8, ptr addrspace(1) %arg, i64 undef
-  %tmp16 = getelementptr inbounds <4 x float>, ptr addrspace(1) %tmp14, i64 undef
+  %tmp14 = getelementptr inbounds i8, ptr addrspace(1) %arg, i64 0
+  %tmp16 = getelementptr inbounds <4 x float>, ptr addrspace(1) %tmp14, i64 0
   %tmp17 = load <4 x float>, ptr addrspace(1) %tmp16, align 16
   %tmp18 = fsub <4 x float> %tmp17, %tmp17
   %ext = extractelement <4 x float> %tmp18, i32 1
@@ -35,7 +35,7 @@ bb25: ; preds = %bb
 
 bb28: ; preds = %bb25, %bb21
   %tmp29 = phi <4 x float> [ %tmp27, %bb25 ], [ %tmp24, %bb21 ]
-  store <4 x float> %tmp29, ptr addrspace(5) poison, align 16
+  store <4 x float> %tmp29, ptr addrspace(5) null, align 16
   %tmp30 = getelementptr inbounds %struct.wombat, ptr addrspace(1) %arg, i64 %tmp2, i32 2, i64 2
   %tmp31 = load i32, ptr addrspace(1) %tmp30, align 4
   %tmp32 = sext i32 %tmp31 to i64
@@ -49,16 +49,16 @@ bb28: ; preds = %bb25, %bb21
   %tmp41 = fsub <4 x float> zeroinitializer, %tmp40
   %tmp42 = fsub <4 x float> %tmp39, %tmp40
   %tmp43 = extractelement <4 x float> %tmp40, i32 1
-  %tmp44 = fsub float %tmp43, undef
-  %tmp45 = fadd float undef, undef
+  %tmp44 = fsub float %tmp43, 0.0
+  %tmp45 = fadd float 0.0, 0.0
   %tmp46 = fdiv float %tmp44, %tmp45
   %tmp47 = insertelement <4 x float> poison, float %tmp46, i32 0
   %tmp48 = shufflevector <4 x float> %tmp47, <4 x float> poison, <4 x i32> zeroinitializer
   %tmp49 = fsub <4 x float> %tmp48, %tmp40
   %tmp50 = extractelement <4 x float> %tmp41, i32 1
   %tmp51 = extractelement <4 x float> %tmp42, i32 2
-  %tmp52 = fmul float undef, undef
-  %tmp53 = fadd float %tmp52, undef
+  %tmp52 = fmul float 0.0, 0.0
+  %tmp53 = fadd float %tmp52, 0.0
   %tmp54 = fadd float %tmp51, %tmp53
   %tmp55 = extractelement <4 x float> %tmp49, i32 1
   %tmp56 = fmul float %tmp55, %tmp50
@@ -72,7 +72,7 @@ bb28: ; preds = %bb25, %bb21
   %tmp59 = bitcast i64 %tmp35 to <2 x float>
   %tmp60 = insertelement <2 x float> poison, float %tmp58, i32 0
   %tmp61 = shufflevector <2 x float> %tmp60, <2 x float> poison, <2 x i32> zeroinitializer
-  %tmp62 = fmul <2 x float> %tmp61, undef
+  %tmp62 = fmul <2 x float> %tmp61, zeroinitializer
   %tmp63 = fsub <2 x float> %tmp62, %tmp59
   %tmp64 = extractelement <2 x float> %tmp63, i64 0
   call void @eggs(float %tmp64) #2
diff --git a/llvm/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll b/llvm/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll
index e03be90a22d3c..ed92bf3df91f9 100644
--- a/llvm/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll
@@ -9,13 +9,13 @@
 ; GCN-LABEL: {{^}}_amdgpu_ps_main:
 ; GCN-NOT: v_readfirstlane
 ; PRE-GFX9: flat_load_dword
-; GFX9: global_load
+; GFX9: global_load
 define dllexport amdgpu_ps void @_amdgpu_ps_main(i32 inreg %arg) local_unnamed_addr #0 {
 .entry:
   %tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %arg) #1
   %tmp1 = bitcast float %tmp to i32
   %tmp2 = srem i32 %tmp1, 4
-  %tmp3 = select i1 false, i32 undef, i32 %tmp2
+  %tmp3 = select i1 false, i32 poison, i32 %tmp2
   %tmp4 = sext i32 %tmp3 to i64
   %tmp5 = getelementptr [4 x <4 x float>], ptr addrspace(4) @0, i64 0, i64 %tmp4
   %tmp6 = load <4 x float>, ptr addrspace(4) %tmp5, align 16
diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
index 8be5d1a3fde7c..cc7460eebd9e1 100644
--- a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
+++ b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
@@ -386,7 +386,7 @@ done:
 ; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 0, 1{{$}}
 define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, ptr addrspace(1) %out) {
 entry:
-  br i1 undef, label %else, label %if
+  br i1 poison, label %else, label %if
 
 if:
   br label %done
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index 24dc5b5bb3150..af7028173f6c7 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -3,13 +3,14 @@
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s
 
-define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
+define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i32 inreg %cond.arg) {
 ; SI-LABEL: vec_8xi16_extract_4xi16:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_cbranch_scc0 .LBB0_2
-; SI-NEXT: ; %bb.1: ; %F
+; SI-NEXT: s_cmp_lg_u32 s16, 0
 ; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_cbranch_scc0 .LBB0_4
+; SI-NEXT: ; %bb.1: ; %F
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
@@ -34,15 +35,8 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; SI-NEXT: v_or_b32_e32 v2, v6, v2
 ; SI-NEXT: v_or_b32_e32 v3, v5, v3
 ; SI-NEXT: s_mov_b64 vcc, exec
-; SI-NEXT: s_cbranch_execz .LBB0_3
-; SI-NEXT: s_branch .LBB0_4
-; SI-NEXT: .LBB0_2:
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB0_3: ; %T
-; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_cbranch_execnz .LBB0_3
+; SI-NEXT: .LBB0_2: ; %T
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
@@ -66,7 +60,7 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
 ; SI-NEXT: v_or_b32_e32 v2, v2, v0
 ; SI-NEXT: v_or_b32_e32 v3, v3, v1
-; SI-NEXT: .LBB0_4: ; %exit
+; SI-NEXT: .LBB0_3: ; %exit
 ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
 ; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
@@ -87,22 +81,26 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; SI-NEXT: v_or_b32_e32 v2, v3, v4
 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT: s_setpc_b64 s[30:31]
+; SI-NEXT: .LBB0_4:
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: s_mov_b64 vcc, 0
+; SI-NEXT: s_branch .LBB0_2
 ;
 ; GFX9-LABEL: vec_8xi16_extract_4xi16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_cbranch_scc0 .LBB0_2
+; GFX9-NEXT: s_cmp_lg_u32 s16, 0
+; GFX9-NEXT: s_cbranch_scc0 .LBB0_4
 ; GFX9-NEXT: ; %bb.1: ; %F
 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_cbranch_execz .LBB0_3
-; GFX9-NEXT: s_branch .LBB0_4
-; GFX9-NEXT: .LBB0_2:
-; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
-; GFX9-NEXT: .LBB0_3: ; %T
+; GFX9-NEXT: s_cbranch_execnz .LBB0_3
+; GFX9-NEXT: .LBB0_2: ; %T
 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: .LBB0_4: ; %exit
+; GFX9-NEXT: .LBB0_3: ; %exit
 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
 ; GFX9-NEXT: s_movk_i32 s4, 0x8000
 ; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -114,22 +112,25 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: .LBB0_4:
+; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX9-NEXT: s_branch .LBB0_2
 ;
 ; GFX11-LABEL: vec_8xi16_extract_4xi16:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cbranch_scc0 .LBB0_2
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_cbranch_scc0 .LBB0_4
 ; GFX11-NEXT: ; %bb.1: ; %F
 ; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_cbranch_execz .LBB0_3
-; GFX11-NEXT: s_branch .LBB0_4
-; GFX11-NEXT: .LBB0_2:
-; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
-; GFX11-NEXT: .LBB0_3: ; %T
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_vccnz .LBB0_3
+; GFX11-NEXT: .LBB0_2: ; %T
 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: .LBB0_4: ; %exit
+; GFX11-NEXT: .LBB0_3: ; %exit
 ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
 ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -144,7 +145,11 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
-  br i1 undef, label %T, label %F
+; GFX11-NEXT: .LBB0_4:
+; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX11-NEXT: s_branch .LBB0_2
+  %cond = icmp eq i32 %cond.arg, 0
+  br i1 %cond, label %T, label %F
 
 T:
   %t = load volatile <8 x i16>, ptr addrspace(1) %p0
@@ -162,13 +167,14 @@ exit:
   ret <4 x i16> %r2
 }
 
-define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
+define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i32 inreg %cond.arg) {
 ; SI-LABEL: vec_8xi16_extract_4xi16_2:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_cbranch_scc0 .LBB1_2
-; SI-NEXT: ; %bb.1: ; %F
+; SI-NEXT: s_cmp_lg_u32 s16, 0
 ; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_cbranch_scc0 .LBB1_4
+; SI-NEXT: ; %bb.1: ; %F
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
@@ -193,16 +199,8 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(
 ; SI-NEXT: v_or_b32_e32 v3, v6, v3
 ; SI-NEXT: v_or_b32_e32 v5, v5, v7
 ; SI-NEXT: s_mov_b64 vcc, exec
-; SI-NEXT: s_cbranch_execz .LBB1_3
-; SI-NEXT: s_branch .LBB1_4
-; SI-NEXT: .LBB1_2:
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB1_3: ; %T
-; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_cbranch_execnz .LBB1_3
+; SI-NEXT: .LBB1_2: ; %T
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
@@ -226,7 +224,7 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
 ; SI-NEXT: v_or_b32_e32 v3, v3, v0
 ; SI-NEXT: v_or_b32_e32 v5, v5, v1
-; SI-NEXT: .LBB1_4: ; %exit
+; SI-NEXT: .LBB1_3: ; %exit
 ; SI-NEXT: v_bfe_i32 v0, v5, 0, 16
 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
 ; SI-NEXT: v_bfe_i32 v3, v3, 0, 16
@@ -248,22 +246,27 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(
 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
 ; SI-NEXT: s_setpc_b64 s[30:31]
+; SI-NEXT: .LBB1_4:
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: s_mov_b64 vcc, 0
+; SI-NEXT: s_branch .LBB1_2
 ;
 ; GFX9-LABEL: vec_8xi16_extract_4xi16_2:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_cbranch_scc0 .LBB1_2
+; GFX9-NEXT: s_cmp_lg_u32 s16, 0
+; GFX9-NEXT: s_cbranch_scc0 .LBB1_4
 ; GFX9-NEXT: ; %bb.1: ; %F
 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_cbranch_execz .LBB1_3
-; GFX9-NEXT: s_branch .LBB1_4
-; GFX9-NEXT: .LBB1_2:
-; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
-; GFX9-NEXT: .LBB1_3: ; %T
+; GFX9-NEXT: s_cbranch_execnz .LBB1_3
+; GFX9-NEXT: .LBB1_2: ; %T
 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: .LBB1_4: ; %exit
+; GFX9-NEXT: .LBB1_3: ; %exit
 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
 ; GFX9-NEXT: s_movk_i32 s4, 0x8000
 ; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -275,22 +278,25 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(
 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: .LBB1_4:
+; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX9-NEXT: s_branch .LBB1_2
 ;
 ; GFX11-LABEL: vec_8xi16_extract_4xi16_2:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cbranch_scc0 .LBB1_2
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_cbranch_scc0 .LBB1_4
 ; GFX11-NEXT: ; %bb.1: ; %F
 ; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_cbranch_execz .LBB1_3
-; GFX11-NEXT: s_branch .LBB1_4
-; GFX11-NEXT: .LBB1_2:
-; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
-; GFX11-NEXT: .LBB1_3: ; %T
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_vccnz .LBB1_3
+; GFX11-NEXT: .LBB1_2: ; %T
 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: .LBB1_4: ; %exit
+; GFX11-NEXT: .LBB1_3: ; %exit
 ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
 ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -305,7 +311,11 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(
 ; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
-  br i1 undef, label %T, label %F
+; GFX11-NEXT: .LBB1_4:
+; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX11-NEXT: s_branch .LBB1_2
+  %cond = icmp eq i32 %cond.arg, 0
+  br i1 %cond, label %T, label %F
 
 T:
   %t = load volatile <8 x i16>, ptr addrspace(1) %p0
@@ -323,13 +333,14 @@ exit:
   ret <4 x i16> %r2
 }
 
-define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
+define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i32 inreg %cond.arg) {
 ; SI-LABEL: vec_8xf16_extract_4xf16:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_cbranch_scc0 .LBB2_2
-; SI-NEXT: ; %bb.1: ; %F
+; SI-NEXT: s_cmp_lg_u32 s16, 0
 ; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_cbranch_scc0 .LBB2_4
+; SI-NEXT: ; %bb.1: ; %F
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
@@ -357,15 +368,8 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1
 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT: s_mov_b64 vcc, exec
-; SI-NEXT: s_cbranch_execz .LBB2_3
-; SI-NEXT: s_branch .LBB2_4
-; SI-NEXT: .LBB2_2:
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB2_3: ; %T
-; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_cbranch_execnz .LBB2_3
+; SI-NEXT: .LBB2_2: ; %T
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
@@ -392,7 +396,7 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1
 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v1
 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT: .LBB2_4: ; %exit
+; SI-NEXT: .LBB2_3: ; %exit
 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -409,22 +413,26 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1
 ; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
 ; SI-NEXT: v_mov_b32_e32 v3, v2
 ; SI-NEXT: s_setpc_b64 s[30:31]
+; SI-NEXT: .LBB2_4:
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: s_mov_b64 vcc, 0
+; SI-NEXT: s_branch .LBB2_2
 ;
 ; GFX9-LABEL: vec_8xf16_extract_4xf16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_cbranch_scc0 .LBB2_2
+; GFX9-NEXT: s_cmp_lg_u32 s16, 0
+; GFX9-NEXT: s_cbranch_scc0 .LBB2_4
 ; GFX9-NEXT: ; %bb.1: ; %F
 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_cbranch_execz .LBB2_3
-; GFX9-NEXT: s_branch .LBB2_4
-; GFX9-NEXT: .LBB2_2:
-; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
-; GFX9-NEXT: .LBB2_3: ; %T
+; GFX9-NEXT: s_cbranch_execnz .LBB2_3
+; GFX9-NEXT: .LBB2_2: ; %T
 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: .LBB2_4: ; %exit
+; GFX9-NEXT: .LBB2_3: ; %exit
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
 ; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v2
@@ -439,22 +447,25 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1
 ; GFX9-NEXT: v_pack_b32_f16 v1, v0, v5
 ; GFX9-NEXT: v_pack_b32_f16 v0, v4, v2
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: .LBB2_4:
+; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX9-NEXT: s_branch .LBB2_2
 ;
 ; GFX11-LABEL: vec_8xf16_extract_4xf16:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cbranch_scc0 .LBB2_2
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_cbranch_scc0 .LBB2_4
 ; GFX11-NEXT: ; %bb.1: ; %F
 ; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_cbranch_execz .LBB2_3
-; GFX11-NEXT: s_branch .LBB2_4
-; GFX11-NEXT: .LBB2_2:
-; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
-; GFX11-NEXT: .LBB2_3: ; %T
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_vccnz .LBB2_3
+; GFX11-NEXT: .LBB2_2: ; %T
 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: .LBB2_4: ; %exit
+; GFX11-NEXT: .LBB2_3: ; %exit
 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00
 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
@@ -470,7 +481,11 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1
 ; GFX11-NEXT: v_pack_b32_f16 v0, v2, v1
 ; GFX11-NEXT: v_pack_b32_f16 v1, v3, v4
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
-  br i1 undef, label %T, label %F
+; GFX11-NEXT: .LBB2_4:
+; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX11-NEXT: s_branch .LBB2_2
+  %cond = icmp eq i32 %cond.arg, 0
+  br i1 %cond, label %T, label %F
 
 T:
   %t = load volatile <8 x half>, ptr addrspace(1) %p0
@@ -488,14 +503,15 @@ exit:
   ret <4 x half> %r2
 }
 
-define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
+define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i32 inreg %cond.arg) {
 ;
 ; SI-LABEL: vec_16xi16_extract_4xi16:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_cbranch_scc0 .LBB3_2
-; SI-NEXT: ; %bb.1: ; %F
+; SI-NEXT: s_cmp_lg_u32 s16, 0
 ; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_cbranch_scc0 .LBB3_4
+; SI-NEXT: ; %bb.1: ; %F
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
@@ -536,15 +552,8 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1
 ; SI-NEXT: v_or_b32_e32 v2, v6, v2
 ; SI-NEXT: v_or_b32_e32 v3, v5, v3
 ; SI-NEXT: s_mov_b64 vcc, exec
-; SI-NEXT: s_cbranch_execz .LBB3_3
-; SI-NEXT: s_branch .LBB3_4
-; SI-NEXT: .LBB3_2:
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB3_3: ; %T
-; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_cbranch_execnz .LBB3_3
+; SI-NEXT: .LBB3_2: ; %T
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
@@ -584,7 +593,7 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
 ; SI-NEXT: v_or_b32_e32 v2, v2, v0
 ; SI-NEXT: v_or_b32_e32 v3, v3, v1
-; SI-NEXT: .LBB3_4: ; %exit
+; SI-NEXT: .LBB3_3: ; %exit
 ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
 ; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
@@ -605,28 +614,32 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1
 ; SI-NEXT: v_or_b32_e32 v2, v3, v4
 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT: s_setpc_b64 s[30:31]
+; SI-NEXT: .LBB3_4:
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: s_mov_b64 vcc, 0
+; SI-NEXT: s_branch .LBB3_2
 ;
 ; GFX9-LABEL: vec_16xi16_extract_4xi16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_cbranch_scc0 .LBB3_2
+; GFX9-NEXT: s_cmp_lg_u32 s16, 0
+; GFX9-NEXT: s_cbranch_scc0 .LBB3_4
 ; GFX9-NEXT: ; %bb.1: ; %F
 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3
-; GFX9-NEXT: s_cbranch_execz .LBB3_3
-; GFX9-NEXT: s_branch .LBB3_4
-; GFX9-NEXT: .LBB3_2:
-; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
-; GFX9-NEXT: .LBB3_3: ; %T
+; GFX9-NEXT: s_cbranch_execnz .LBB3_3
+; GFX9-NEXT: .LBB3_2: ; %T
 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
-; GFX9-NEXT: .LBB3_4: ; %exit
+; GFX9-NEXT: .LBB3_3: ; %exit
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0]
 ; GFX9-NEXT: s_movk_i32 s4, 0x8000
@@ -639,26 +652,29 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1
 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: .LBB3_4:
+; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
+; GFX9-NEXT: s_branch .LBB3_2
 ;
 ; GFX11-LABEL: vec_16xi16_extract_4xi16:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cbranch_scc0 .LBB3_2
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_cbranch_scc0 .LBB3_4
 ; GFX11-NEXT: ; %bb.1: ; %F
 ; GFX11-NEXT: global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_cbranch_execz .LBB3_3
-; GFX11-NEXT: s_branch .LBB3_4
-; GFX11-NEXT: .LBB3_2:
-; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-NEXT: .LBB3_3: ; %T
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_vccnz .LBB3_3
+; GFX11-NEXT: .LBB3_2: ; %T
 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: .LBB3_4: ; %exit
+; GFX11-NEXT: .LBB3_3: ; %exit
 ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
 ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -673,7 +689,11 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1
 ; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
-  br i1 undef, label %T, label %F
+; GFX11-NEXT: .LBB3_4:
+; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-NEXT: s_branch .LBB3_2
+  %cond = icmp eq i32 %cond.arg, 0
+  br i1 %cond, label %T, label %F
 
 T:
   %t = load volatile <16 x i16>, ptr addrspace(1) %p0
@@ -691,14 +711,15 @@ exit:
   ret <4 x i16> %r2
 }
 
-define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
+define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i32 inreg %cond.arg) {
 ;
 ; SI-LABEL: vec_16xi16_extract_4xi16_2:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_cbranch_scc0 .LBB4_2
-; SI-NEXT: ; %bb.1: ; %F
+; SI-NEXT: s_cmp_lg_u32 s16, 0
 ; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_cbranch_scc0 .LBB4_4
+; SI-NEXT: ; %bb.1: ; %F
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
@@ -739,16 +760,8 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace
 ; SI-NEXT: v_or_b32_e32 v2, v7, v2
 ; SI-NEXT: v_or_b32_e32 v3, v6, v3
 ; SI-NEXT: s_mov_b64 vcc, exec
-; SI-NEXT: s_cbranch_execz .LBB4_3
-; SI-NEXT: s_branch .LBB4_4
-; SI-NEXT: .LBB4_2:
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB4_3: ; %T
-; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_cbranch_execnz .LBB4_3
+; SI-NEXT: .LBB4_2: ; %T
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
@@ -788,7 +801,7 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
 ; SI-NEXT: v_or_b32_e32 v2, v2, v0
 ; SI-NEXT: v_or_b32_e32 v3, v3, v1
-; SI-NEXT: .LBB4_4: ; %exit
+; SI-NEXT: .LBB4_3: ; %exit
 ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
 ; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
@@ -810,28 +823,33 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace
 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT: s_setpc_b64 s[30:31]
+; SI-NEXT: .LBB4_4:
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: s_mov_b64 vcc, 0
+; SI-NEXT: s_branch .LBB4_2
 ;
 ; GFX9-LABEL: vec_16xi16_extract_4xi16_2:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_cbranch_scc0 .LBB4_2
+; GFX9-NEXT: s_cmp_lg_u32 s16, 0
+; GFX9-NEXT: s_cbranch_scc0 .LBB4_4
 ; GFX9-NEXT: ; %bb.1: ; %F
 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3
-; GFX9-NEXT: s_cbranch_execz .LBB4_3
-; GFX9-NEXT: s_branch .LBB4_4
-; GFX9-NEXT: .LBB4_2:
-; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
-; GFX9-NEXT: .LBB4_3: ; %T
+; GFX9-NEXT: s_cbranch_execnz .LBB4_3
+; GFX9-NEXT: .LBB4_2: ; %T
 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
-; GFX9-NEXT: .LBB4_4: ; %exit
+; GFX9-NEXT: .LBB4_3: ; %exit
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v7 op_sel_hi:[0,1]
 ; GFX9-NEXT: s_movk_i32 s4, 0x8000
@@ -844,26 +862,29 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace
 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: .LBB4_4:
+; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
+; GFX9-NEXT: s_branch .LBB4_2
 ;
 ; GFX11-LABEL: vec_16xi16_extract_4xi16_2:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cbranch_scc0 .LBB4_2
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_cbranch_scc0 .LBB4_4
 ; GFX11-NEXT: ; %bb.1: ; %F
 ; GFX11-NEXT: global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_cbranch_execz .LBB4_3
-; GFX11-NEXT: s_branch .LBB4_4
-; GFX11-NEXT: .LBB4_2:
-; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-NEXT: .LBB4_3: ; %T
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_vccnz .LBB4_3
+; GFX11-NEXT: .LBB4_2: ; %T
 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: .LBB4_4: ; %exit
+; GFX11-NEXT: .LBB4_3: ; %exit
 ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
 ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -878,7 +899,11 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace
 ; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
-  br i1 undef, label %T, label %F
+; GFX11-NEXT: .LBB4_4:
+; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-NEXT: s_branch .LBB4_2
+  %cond = icmp eq i32 %cond.arg, 0
+  br i1 %cond, label %T, label %F
 
 T:
   %t = load volatile <16 x i16>, ptr addrspace(1) %p0
@@ -896,14 +921,15 @@ exit:
   ret <4 x i16> %r2
 }
 
-define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
+define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i32 inreg %cond.arg) {
 ;
 ; SI-LABEL: vec_16xf16_extract_4xf16:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_cbranch_scc0 .LBB5_2
-; SI-NEXT: ; %bb.1: ; %F
+; SI-NEXT: s_cmp_lg_u32 s16, 0
 ; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_cbranch_scc0 .LBB5_4
+; SI-NEXT: ; %bb.1: ; %F
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
@@ -947,15 +973,8 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(
 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT: s_mov_b64 vcc, exec
-; SI-NEXT: s_cbranch_execz .LBB5_3
-; SI-NEXT: s_branch .LBB5_4
-; SI-NEXT: .LBB5_2:
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB5_3: ; %T
-; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_cbranch_execnz .LBB5_3
+; SI-NEXT: .LBB5_2: ; %T
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
@@ -998,7 +1017,7 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(
 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v1
 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT: .LBB5_4: ; %exit
+; SI-NEXT: .LBB5_3: ; %exit
 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -1015,28 +1034,32 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace
 ; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
 ; SI-NEXT: v_mov_b32_e32 v3, v2
 ; SI-NEXT: s_setpc_b64 s[30:31]
+; SI-NEXT: .LBB5_4:
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: s_mov_b64 vcc, 0
+; SI-NEXT: s_branch .LBB5_2
 ;
 ; GFX9-LABEL: vec_16xf16_extract_4xf16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_cbranch_scc0 .LBB5_2
+; GFX9-NEXT: s_cmp_lg_u32 s16, 0
+; GFX9-NEXT: s_cbranch_scc0 .LBB5_4
 ; GFX9-NEXT: ; %bb.1: ; %F
 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3
-; GFX9-NEXT: s_cbranch_execz .LBB5_3
-; GFX9-NEXT: s_branch .LBB5_4
-; GFX9-NEXT: .LBB5_2:
-; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
-; GFX9-NEXT: .LBB5_3: ; %T
+; GFX9-NEXT: s_cbranch_execnz .LBB5_3
+; GFX9-NEXT: .LBB5_2: ; %T
 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
-; GFX9-NEXT: .LBB5_4: ; %exit
+; GFX9-NEXT: .LBB5_3: ; %exit
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -1052,26 +1075,29 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace
 ; GFX9-NEXT: v_pack_b32_f16 v1, v0, v4
 ; GFX9-NEXT: v_pack_b32_f16 v0, v2, v3
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: .LBB5_4:
+; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
+; GFX9-NEXT: s_branch .LBB5_2
 ;
 ; GFX11-LABEL: vec_16xf16_extract_4xf16:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cbranch_scc0 .LBB5_2
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_cbranch_scc0 .LBB5_4
 ; GFX11-NEXT: ; %bb.1: ; %F
 ; GFX11-NEXT: global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_cbranch_execz .LBB5_3
-; GFX11-NEXT: s_branch .LBB5_4
-; GFX11-NEXT: .LBB5_2:
-; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-NEXT: .LBB5_3: ; %T
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_vccnz .LBB5_3
+; GFX11-NEXT: .LBB5_2: ; %T
 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: .LBB5_4: ; %exit
+; GFX11-NEXT: .LBB5_3: ; %exit
 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00
 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
@@ -1087,7 +1113,11 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(
 ; GFX11-NEXT: v_pack_b32_f16 v0, v2, v1
 ; GFX11-NEXT: v_pack_b32_f16 v1, v3, v4
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
-  br i1 undef, label %T, label %F
+; GFX11-NEXT: .LBB5_4:
+; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-NEXT: s_branch .LBB5_2
+  %cond = icmp eq i32 %cond.arg, 0
+  br i1 %cond, label %T, label %F
 
 T:
   %t = load volatile <16 x half>, ptr addrspace(1) %p0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index e7c8604776ce0..dd423b5ce5a79 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -1246,7 +1246,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX12-PAL-NEXT: s_endpgm
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %alloca = alloca [32 x i16], align 2, addrspace(5)
-  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 0
   %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
   ret void
@@ -1436,7 +1436,7 @@ define void @zero_init_small_offset_foo() {
 ; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %alloca = alloca [32 x i16], align 2, addrspace(5)
-  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 0
   %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
   ret void
@@ -1657,7 +1657,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 0
   %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
   store volatile i32 15, ptr addrspace(5) %i7, align 4
@@ -1864,7 +1864,7 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 0
   %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
   store volatile i32 15, ptr addrspace(5) %i7, align 4
@@ -2051,7 +2051,7 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 0
   %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %i3 = zext i32 %i2 to i64
@@ -2219,7 +2219,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 0
   %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
   store volatile i32 15, ptr addrspace(5) %i7, align 4
@@ -2448,7 +2448,7 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX12-PAL-NEXT: s_endpgm
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %alloca = alloca [32 x i16], align 2, addrspace(5)
-  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 0
   %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
   ret void
@@ -2690,7 +2690,7 @@ define void @zero_init_large_offset_foo() {
 ; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %alloca = alloca [32 x i16], align 2, addrspace(5)
-  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 0
   %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
   ret void
@@ -2911,7 +2911,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 0
   %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
   store volatile i32 15, ptr addrspace(5) %i7, align 4
@@ -3118,7 +3118,7 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 0
   %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
   store volatile i32 15, ptr addrspace(5) %i7, align 4
@@ -3308,7 +3308,7 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 0
   %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %i3 = zext i32 %i2 to i64
@@ -3479,7 +3479,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 0
   %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
   store volatile i32 15, ptr addrspace(5) %i7, align 4
@@ -3649,7 +3649,7 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX12-PAL-NEXT: s_endpgm
 bb:
   %i = alloca [4096 x i32], align 4, addrspace(5)
-  %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef
+  %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 0
   store volatile i32 13, ptr addrspace(5) %i1, align 4
   %i7 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
   store volatile i32 15, ptr addrspace(5) %i7, align 4
@@ -3790,7 +3790,7 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
 bb:
   %i = alloca [4096 x i32], align 4, addrspace(5)
-  %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef
+  %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 0
   store volatile i32 13, ptr addrspace(5) %i1, align 4
   %i7 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
   store volatile i32 15, ptr addrspace(5) %i7, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index cb99ceba2fca5..1f21a1a91fc89 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -1852,7 +1852,7 @@ define amdgpu_kernel void @extract_undef_offset_sgpr(ptr addrspace(1) %out, ptr
 ; GFX9-IDXMODE-NEXT: s_endpgm
 entry:
   %ld = load volatile <4 x i32>, ptr addrspace(1) %in
-  %value = extractelement <4 x i32> %ld, i32 undef
+  %value = extractelement <4 x i32> %ld, i32 poison
   store i32 %value, ptr addrspace(1) %out
   ret void
 }
@@ -7451,13 +7451,13 @@ bb:
 
 bb1:
   %tmp2 = load volatile <4 x float>, ptr addrspace(1) poison
-  %tmp3 = extractelement <4 x float> %tmp2, i32 undef
+  %tmp3 = extractelement <4 x float> %tmp2, i32 poison
   call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp2) ; Prevent block optimize out
   br label %bb7
 
 bb4:
   %tmp5 = load volatile <4 x float>, ptr addrspace(1) poison
-  %tmp6 = extractelement <4 x float> %tmp5, i32 undef
+  %tmp6 = extractelement <4 x float> %tmp5, i32 poison
   call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp5) ; Prevent block optimize out
   br label %bb7
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
index b2fd4015d920a..bea532bd52955 100644
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -122,7 +122,7 @@ define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) {
 ; SI-NEXT: s_endpgm
 ; IR-LABEL: @infinite_loops(
 ; IR-NEXT: entry:
-; IR-NEXT: br i1 undef, label [[LOOP1:%.*]], label [[LOOP2:%.*]]
+; IR-NEXT: br i1 poison, label [[LOOP1:%.*]], label [[LOOP2:%.*]]
 ; IR: loop1:
 ; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4
 ; IR-NEXT: br i1 true, label [[LOOP1]], label [[DUMMYRETURNBLOCK:%.*]]
@@ -133,7 +133,7 @@ define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) {
 ; IR-NEXT: ret void
 ;
 entry:
-  br i1 undef, label %loop1, label %loop2
+  br i1 poison, label %loop1, label %loop2
 
 loop1:
   store volatile i32 999, ptr addrspace(1) %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll b/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll
index 4dced1c15d1e2..ebd1540eb997d 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll
@@ -36,8 +36,9 @@ define internal fastcc void @svm_node_closure_bsdf(ptr addrspace(1) %sd, ptr %st
 ; GCN: s_setpc_b64 s[30:31]
 entry:
   %8 = extractelement <4 x i32> %node, i64 0
-  %cmp.i.not = icmp eq i32 undef, 0
-  br i1 undef, label %common.ret.critedge, label %cond.true
+  %undef.i32 = freeze i32 poison
+  %cmp.i.not = icmp eq i32 %undef.i32, 0
+  br i1 poison, label %common.ret.critedge, label %cond.true
 
 cond.true: ; preds = %entry
   %9 = load float, ptr null, align 4
@@ -120,13 +121,15 @@ if.then534: ; preds = %bsdf_alloc.exit2188
   br label %if.end627.sink.split
 
 if.else568: ; preds = %if.then413
-  br i1 undef, label %bsdf_alloc.exit2214, label %if.then.i2198
+  %undef.0 = freeze i1 poison
+  br i1 %undef.0, label %bsdf_alloc.exit2214, label %if.then.i2198
 
 if.then.i2198: ; preds = %if.else568
-  br i1 undef, label %closure_alloc.exit.i2210, label %if.end.i.i2207
+  %undef.1 = freeze i1 poison
+  br i1 %undef.1, label %closure_alloc.exit.i2210, label %if.end.i.i2207
 
 if.end.i.i2207: ; preds = %if.then.i2198
-  %arrayidx.i.i22028 = getelementptr inbounds %struct.ShaderData, ptr addrspace(1) %sd, i64 0, i32 30, i64 undef
+  %arrayidx.i.i22028 = getelementptr inbounds %struct.ShaderData, ptr addrspace(1) %sd, i64 0, i32 30, i64 0
   br label %closure_alloc.exit.i2210
 
 closure_alloc.exit.i2210: ; preds = %if.end.i.i2207, %if.then.i2198
@@ -185,7 +188,7 @@ sw.bb10:
 ; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]],
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN: s_setpc_b64 s[30:31]
-  call fastcc void @svm_node_closure_bsdf(ptr addrspace(1) null, ptr null, <4 x i32> zeroinitializer, ptr null, i32 poison, i8 undef, float poison, float poison, float poison, i1 undef, <4 x i32> poison, float poison, i32 poison, i1 undef, i1 undef, i1 undef, float poison, ptr addrspace(1) poison, ptr addrspace(1) poison, ptr addrspace(1) poison, i1 undef, ptr addrspace(1) poison, i32 poison, i1 undef, i32 poison, i64 undef, i32 poison)
+  call fastcc void @svm_node_closure_bsdf(ptr addrspace(1) null, ptr null, <4 x i32> zeroinitializer, ptr null, i32 poison, i8 poison, float poison, float poison, float poison, i1 poison, <4 x i32> poison, float poison, i32 poison, i1 poison, i1 poison, i1 poison, float poison, ptr addrspace(1) poison, ptr addrspace(1) poison, ptr addrspace(1) poison, i1 poison, ptr addrspace(1) poison, i32 poison, i1 poison, i32 poison, i64 poison, i32 poison)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
index 3db7b996f0240..9b9d864689537 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
@@ -444,7 +444,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) %
 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x40200000
 ; SI: v_div_scale_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s[[[K_LO]]:[[K_HI]]], v[0:1], s[[[K_LO]]:[[K_HI]]]
 define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %out) #0 {
-  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false)
+  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double poison, i1 false)
   %result0 = extractvalue { double, i1 } %result, 0
   store double %result0, ptr addrspace(1) %out, align 8
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/mdt-preserving-crash.ll b/llvm/test/CodeGen/AMDGPU/mdt-preserving-crash.ll
index 50cc8065718a2..3bb840eb51690 100644
--- a/llvm/test/CodeGen/AMDGPU/mdt-preserving-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/mdt-preserving-crash.ll
@@ -26,6 +26,7 @@ define protected amdgpu_kernel void @_RSENC_PRInit______________________________
 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_12
 ; CHECK-NEXT: .LBB0_2: ; %while.cond.i
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_cmp_eq_u32 s4, 0
 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
 ; CHECK-NEXT: ; %bb.3: ; %if.end60
 ; CHECK-NEXT: s_mov_b64 vcc, exec
@@ -86,7 +87,8 @@ lor.lhs.false17: ; preds = %if.end15
   br label %while.cond.i
 
 while.cond.i: ; preds = %while.cond.i, %lor.lhs.false17
-  switch i32 undef, label %if.end60 [
+  %undef0 = freeze i32 poison
+  switch i32 %undef0, label %if.end60 [
     i32 0, label %while.cond.i
     i32 3, label %if.end60.loopexit857
   ]
@@ -115,7 +117,7 @@ if.end5.i: ; preds = %if.then3.i, %if.end
   %conv612.i = sext i8 %2 to i32
   %sub13.i = add nsw i32 %conv612.i, -48
   %cmp714.i = icmp ugt i32 %sub13.i, 9
-  switch i8 undef, label %if.end5.i314 [
+  switch i8 poison, label %if.end5.i314 [
     i8 45, label %if.then.i306
    i8 43, label %if.then3.i308
   ]
@@ -132,7 +134,7 @@ if.end5.i314: ; preds = %if.then3.i308, %if.
   %conv612.i311 = sext i8 %3 to i32
   %sub13.i312 = add nsw i32 %conv612.i311, -48
   %cmp714.i313 = icmp ugt i32 %sub13.i312, 9
-  switch i8 undef, label %if.end5.i338 [
+  switch i8 poison, label %if.end5.i338 [
     i8 45, label %if.then.i330
     i8 43, label %if.then3.i332
   ]
@@ -149,7 +151,7 @@ if.end5.i338: ; preds = %if.then3.i332, %if.
   %conv612.i335 = sext i8 %4 to i32
   %sub13.i336 = add nsw i32 %conv612.i335, -48
   %cmp714.i337 = icmp ugt i32 %sub13.i336, 9
-  switch i8 undef, label %if.end5.i362 [
+  switch i8 poison, label %if.end5.i362 [
     i8 45, label %if.then.i354
    i8 43, label %if.then3.i356
   ]
@@ -170,7 +172,7 @@ if.end5.i362: ; preds = %if.then3.i356, %if.
%6 = load i8, ptr addrspace(1) getelementptr inbounds ([4096 x i8], ptr addrspace(1) @_RSENC_gDcd_______________________________, i64 0, i64 1153), align 1 %arrayidx232250.1 = getelementptr inbounds [128 x i8], ptr addrspace(5) %pD10, i32 0, i32 1 store i8 %6, ptr addrspace(5) %arrayidx232250.1, align 1 - switch i8 undef, label %if.end5.i400 [ + switch i8 poison, label %if.end5.i400 [ i8 45, label %if.then.i392 i8 43, label %if.then3.i394 ] diff --git a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll index f731ed1e01ae3..a8fab161e1ffb 100644 --- a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll @@ -55,11 +55,11 @@ bb: ; GCN: s_load_dword s ; GCN-NOT: global_load_dword ; GCN: global_store_dword -define amdgpu_kernel void @memory_phi_no_clobber(ptr addrspace(1) %arg) { +define amdgpu_kernel void @memory_phi_no_clobber(ptr addrspace(1) %arg, i1 %cond) { ; CHECK-LABEL: @memory_phi_no_clobber( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0 -; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0 +; CHECK-NEXT: br i1 %cond, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0 ; CHECK: if.then: ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform !0 @@ -76,7 +76,7 @@ define amdgpu_kernel void @memory_phi_no_clobber(ptr addrspace(1) %arg) { ; bb: %i = load i32, ptr addrspace(1) %arg, align 4 - br i1 undef, label %if.then, label %if.else + br i1 %cond, label %if.then, label %if.else if.then: tail call void @llvm.amdgcn.s.barrier() @@ -101,11 +101,11 @@ if.end: ; GCN: global_store_dword ; GCN: global_load_dword ; GCN: global_store_dword -define amdgpu_kernel void @memory_phi_clobber1(ptr addrspace(1) %arg) { +define amdgpu_kernel void @memory_phi_clobber1(ptr addrspace(1) %arg, i1 %cond) { ; CHECK-LABEL: @memory_phi_clobber1( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0 -; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0 +; CHECK-NEXT: br i1 %cond, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0 ; CHECK: if.then: ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3 ; CHECK-NEXT: store i32 1, ptr addrspace(1) [[GEP]], align 4 @@ -123,7 +123,7 @@ define amdgpu_kernel void @memory_phi_clobber1(ptr addrspace(1) %arg) { ; bb: %i = load i32, ptr addrspace(1) %arg, align 4 - br i1 undef, label %if.then, label %if.else + br i1 %cond, label %if.then, label %if.else if.then: %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 3 @@ -149,11 +149,11 @@ if.end: ; GCN: s_barrier ; GCN: global_load_dword ; GCN: global_store_dword -define amdgpu_kernel void @memory_phi_clobber2(ptr addrspace(1) %arg) { +define amdgpu_kernel void @memory_phi_clobber2(ptr addrspace(1) %arg, i1 %cond) { ; CHECK-LABEL: @memory_phi_clobber2( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0 -; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0 +; CHECK-NEXT: br i1 %cond, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0 ; CHECK: if.then: ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform !0 @@ -171,7 +171,7 @@ define amdgpu_kernel 
void @memory_phi_clobber2(ptr addrspace(1) %arg) { ; bb: %i = load i32, ptr addrspace(1) %arg, align 4 - br i1 undef, label %if.then, label %if.else + br i1 %cond, label %if.then, label %if.else if.then: tail call void @llvm.amdgcn.s.barrier() diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll index 4a01962aa4084..5824c7b4a9490 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -723,11 +723,14 @@ define void @spill_sgpr_with_sgpr_uses() #0 { ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s4 ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: ; implicit-def: $vgpr254 : SGPR spill to VGPR lane -; GCN-NEXT: v_writelane_b32 v254, s4, 0 +; GCN-NEXT: v_writelane_b32 v254, s5, 0 ; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, s5 ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 @@ -886,7 +889,7 @@ define void @spill_sgpr_with_sgpr_uses() #0 { ,~{v250},~{v251},~{v252},~{v253}" () #0 %sgpr = call i32 asm sideeffect "; def $0", "=s" () #0 - %cmp = icmp eq i32 undef, 0 + %cmp = icmp eq i32 %sgpr, 0 br i1 %cmp, label %bb0, label %ret bb0: diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-cf.ll b/llvm/test/CodeGen/AMDGPU/si-spill-cf.ll index 1640cdac8836b..25592c8ac8072 100644 --- a/llvm/test/CodeGen/AMDGPU/si-spill-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-cf.ll @@ -87,22 +87,22 @@ LOOP: ; preds = %ENDIF2795, %main_bo ENDLOOP: ; preds = %ELSE2566, %LOOP %one.sub.a.i = fsub float 1.000000e+00, %tmp - %one.sub.ac.i = fmul float %one.sub.a.i, undef - %fmul = fmul float undef, undef + %one.sub.ac.i = fmul float %one.sub.a.i, 0x7FF8000000000000 + %fmul = fmul float 0x7FF8000000000000, 0x7FF8000000000000 %result.i = fadd float %fmul, %one.sub.ac.i - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float poison, float %result.i, float undef, float 1.000000e+00, i1 true, i1 true) #0 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float poison, float %result.i, float 0x7FF8000000000000, float 1.000000e+00, i1 true, i1 true) #0 ret void ENDIF: ; preds = %LOOP - %tmp68 = fsub float %tmp2, undef - %tmp69 = fsub float %tmp3, undef - %tmp70 = fsub float %tmp4, undef + %tmp68 = fsub float %tmp2, 0x7FF8000000000000 + %tmp69 = fsub float %tmp3, 0x7FF8000000000000 + %tmp70 = fsub float %tmp4, 0x7FF8000000000000 %tmp71 = fmul float %tmp68, 0.000000e+00 - %tmp72 = fmul float %tmp69, undef - %tmp73 = fmul float %tmp70, undef - %tmp74 = fsub float %tmp6, undef - %tmp75 = fsub float %tmp7, undef - %tmp76 = fmul float %tmp74, undef + %tmp72 = fmul float %tmp69, 0x7FF8000000000000 + %tmp73 = fmul float %tmp70, 0x7FF8000000000000 + %tmp74 = fsub float %tmp6, 0x7FF8000000000000 + %tmp75 = fsub float %tmp7, 0x7FF8000000000000 + %tmp76 = fmul float %tmp74, 0x7FF8000000000000 %tmp77 = fmul float %tmp75, 0.000000e+00 %tmp78 = call float @llvm.minnum.f32(float %tmp73, float %tmp77) %tmp79 = call float @llvm.maxnum.f32(float %tmp71, float 0.000000e+00) @@ -110,18 +110,18 @@ ENDIF: ; preds = %LOOP %tmp81 = call float @llvm.maxnum.f32(float poison, float %tmp78) %tmp82 = call float @llvm.minnum.f32(float %tmp79, float %tmp80) %tmp83 = call float @llvm.minnum.f32(float %tmp82, float poison) - %tmp84 = fsub float %tmp14, undef - %tmp85 = fsub float %tmp15, 
undef - %tmp86 = fsub float %tmp16, undef - %tmp87 = fmul float %tmp84, undef - %tmp88 = fmul float %tmp85, undef - %tmp89 = fmul float %tmp86, undef - %tmp90 = fsub float %tmp17, undef - %tmp91 = fsub float %tmp18, undef - %tmp92 = fsub float %tmp19, undef + %tmp84 = fsub float %tmp14, 0x7FF8000000000000 + %tmp85 = fsub float %tmp15, 0x7FF8000000000000 + %tmp86 = fsub float %tmp16, 0x7FF8000000000000 + %tmp87 = fmul float %tmp84, 0x7FF8000000000000 + %tmp88 = fmul float %tmp85, 0x7FF8000000000000 + %tmp89 = fmul float %tmp86, 0x7FF8000000000000 + %tmp90 = fsub float %tmp17, 0x7FF8000000000000 + %tmp91 = fsub float %tmp18, 0x7FF8000000000000 + %tmp92 = fsub float %tmp19, 0x7FF8000000000000 %tmp93 = fmul float %tmp90, 0.000000e+00 - %tmp94 = fmul float %tmp91, undef - %tmp95 = fmul float %tmp92, undef + %tmp94 = fmul float %tmp91, 0x7FF8000000000000 + %tmp95 = fmul float %tmp92, 0x7FF8000000000000 %tmp96 = call float @llvm.minnum.f32(float %tmp88, float %tmp94) %tmp97 = call float @llvm.maxnum.f32(float %tmp87, float %tmp93) %tmp98 = call float @llvm.maxnum.f32(float %tmp89, float %tmp95) @@ -129,8 +129,8 @@ ENDIF: ; preds = %LOOP %tmp100 = call float @llvm.maxnum.f32(float %tmp99, float poison) %tmp101 = call float @llvm.minnum.f32(float %tmp97, float poison) %tmp102 = call float @llvm.minnum.f32(float %tmp101, float %tmp98) - %tmp103 = fsub float %tmp30, undef - %tmp104 = fsub float %tmp31, undef + %tmp103 = fsub float %tmp30, 0x7FF8000000000000 + %tmp104 = fsub float %tmp31, 0x7FF8000000000000 %tmp105 = fmul float %tmp103, 0.000000e+00 %tmp106 = fmul float %tmp104, 0.000000e+00 %tmp107 = call float @llvm.minnum.f32(float poison, float %tmp105) @@ -138,85 +138,85 @@ ENDIF: ; preds = %LOOP %tmp109 = call float @llvm.maxnum.f32(float poison, float %tmp107) %tmp110 = call float @llvm.maxnum.f32(float %tmp109, float poison) %tmp111 = call float @llvm.minnum.f32(float poison, float %tmp108) - %tmp112 = fsub float %tmp32, undef - %tmp113 = fsub float %tmp33, undef - %tmp114 = fsub float %tmp34, undef + %tmp112 = fsub float %tmp32, 0x7FF8000000000000 + %tmp113 = fsub float %tmp33, 0x7FF8000000000000 + %tmp114 = fsub float %tmp34, 0x7FF8000000000000 %tmp115 = fmul float %tmp112, 0.000000e+00 - %tmp116 = fmul float %tmp113, undef - %tmp117 = fmul float %tmp114, undef - %tmp118 = fsub float %tmp35, undef - %tmp119 = fsub float %tmp36, undef - %tmp120 = fsub float %tmp37, undef - %tmp121 = fmul float %tmp118, undef - %tmp122 = fmul float %tmp119, undef - %tmp123 = fmul float %tmp120, undef + %tmp116 = fmul float %tmp113, 0x7FF8000000000000 + %tmp117 = fmul float %tmp114, 0x7FF8000000000000 + %tmp118 = fsub float %tmp35, 0x7FF8000000000000 + %tmp119 = fsub float %tmp36, 0x7FF8000000000000 + %tmp120 = fsub float %tmp37, 0x7FF8000000000000 + %tmp121 = fmul float %tmp118, 0x7FF8000000000000 + %tmp122 = fmul float %tmp119, 0x7FF8000000000000 + %tmp123 = fmul float %tmp120, 0x7FF8000000000000 %tmp124 = call float @llvm.minnum.f32(float %tmp115, float %tmp121) %tmp125 = call float @llvm.minnum.f32(float %tmp116, float %tmp122) %tmp126 = call float @llvm.minnum.f32(float %tmp117, float %tmp123) %tmp127 = call float @llvm.maxnum.f32(float %tmp124, float %tmp125) %tmp128 = call float @llvm.maxnum.f32(float %tmp127, float %tmp126) - %tmp129 = fsub float %tmp38, undef - %tmp130 = fsub float %tmp39, undef - %tmp131 = fsub float %tmp40, undef + %tmp129 = fsub float %tmp38, 0x7FF8000000000000 + %tmp130 = fsub float %tmp39, 0x7FF8000000000000 + %tmp131 = fsub float %tmp40, 0x7FF8000000000000 %tmp132 = fmul float 
%tmp129, 0.000000e+00 - %tmp133 = fmul float %tmp130, undef - %tmp134 = fmul float %tmp131, undef - %tmp135 = fsub float %tmp41, undef - %tmp136 = fsub float %tmp42, undef - %tmp137 = fsub float %tmp43, undef - %tmp138 = fmul float %tmp135, undef - %tmp139 = fmul float %tmp136, undef - %tmp140 = fmul float %tmp137, undef + %tmp133 = fmul float %tmp130, 0x7FF8000000000000 + %tmp134 = fmul float %tmp131, 0x7FF8000000000000 + %tmp135 = fsub float %tmp41, 0x7FF8000000000000 + %tmp136 = fsub float %tmp42, 0x7FF8000000000000 + %tmp137 = fsub float %tmp43, 0x7FF8000000000000 + %tmp138 = fmul float %tmp135, 0x7FF8000000000000 + %tmp139 = fmul float %tmp136, 0x7FF8000000000000 + %tmp140 = fmul float %tmp137, 0x7FF8000000000000 %tmp141 = call float @llvm.minnum.f32(float %tmp132, float %tmp138) %tmp142 = call float @llvm.minnum.f32(float %tmp133, float %tmp139) %tmp143 = call float @llvm.minnum.f32(float %tmp134, float %tmp140) %tmp144 = call float @llvm.maxnum.f32(float %tmp141, float %tmp142) %tmp145 = call float @llvm.maxnum.f32(float %tmp144, float %tmp143) - %tmp146 = fsub float %tmp44, undef - %tmp147 = fsub float %tmp45, undef - %tmp148 = fsub float %tmp46, undef + %tmp146 = fsub float %tmp44, 0x7FF8000000000000 + %tmp147 = fsub float %tmp45, 0x7FF8000000000000 + %tmp148 = fsub float %tmp46, 0x7FF8000000000000 %tmp149 = fmul float %tmp146, 0.000000e+00 %tmp150 = fmul float %tmp147, 0.000000e+00 - %tmp151 = fmul float %tmp148, undef - %tmp152 = fsub float %tmp47, undef - %tmp153 = fsub float %tmp48, undef - %tmp154 = fsub float %tmp49, undef - %tmp155 = fmul float %tmp152, undef + %tmp151 = fmul float %tmp148, 0x7FF8000000000000 + %tmp152 = fsub float %tmp47, 0x7FF8000000000000 + %tmp153 = fsub float %tmp48, 0x7FF8000000000000 + %tmp154 = fsub float %tmp49, 0x7FF8000000000000 + %tmp155 = fmul float %tmp152, 0x7FF8000000000000 %tmp156 = fmul float %tmp153, 0.000000e+00 - %tmp157 = fmul float %tmp154, undef + %tmp157 = fmul float %tmp154, 0x7FF8000000000000 %tmp158 = call float @llvm.minnum.f32(float %tmp149, float %tmp155) %tmp159 = call float @llvm.minnum.f32(float %tmp150, float %tmp156) %tmp160 = call float @llvm.minnum.f32(float %tmp151, float %tmp157) %tmp161 = call float @llvm.maxnum.f32(float %tmp158, float %tmp159) %tmp162 = call float @llvm.maxnum.f32(float %tmp161, float %tmp160) - %tmp163 = fsub float %tmp50, undef - %tmp164 = fsub float %tmp51, undef - %tmp165 = fsub float %tmp52, undef - %tmp166 = fmul float %tmp163, undef + %tmp163 = fsub float %tmp50, 0x7FF8000000000000 + %tmp164 = fsub float %tmp51, 0x7FF8000000000000 + %tmp165 = fsub float %tmp52, 0x7FF8000000000000 + %tmp166 = fmul float %tmp163, 0x7FF8000000000000 %tmp167 = fmul float %tmp164, 0.000000e+00 %tmp168 = fmul float %tmp165, 0.000000e+00 - %tmp169 = fsub float %tmp53, undef - %tmp170 = fsub float %tmp54, undef - %tmp171 = fsub float %tmp55, undef + %tmp169 = fsub float %tmp53, 0x7FF8000000000000 + %tmp170 = fsub float %tmp54, 0x7FF8000000000000 + %tmp171 = fsub float %tmp55, 0x7FF8000000000000 %tmp172 = fdiv float 1.000000e+00, %temp18.0 - %tmp173 = fmul float %tmp169, undef - %tmp174 = fmul float %tmp170, undef + %tmp173 = fmul float %tmp169, 0x7FF8000000000000 + %tmp174 = fmul float %tmp170, 0x7FF8000000000000 %tmp175 = fmul float %tmp171, %tmp172 %tmp176 = call float @llvm.minnum.f32(float %tmp166, float %tmp173) %tmp177 = call float @llvm.minnum.f32(float %tmp167, float %tmp174) %tmp178 = call float @llvm.minnum.f32(float %tmp168, float %tmp175) %tmp179 = call float @llvm.maxnum.f32(float %tmp176, float 
%tmp177) %tmp180 = call float @llvm.maxnum.f32(float %tmp179, float %tmp178) - %tmp181 = fsub float %tmp62, undef - %tmp182 = fsub float %tmp63, undef - %tmp183 = fsub float %tmp64, undef + %tmp181 = fsub float %tmp62, 0x7FF8000000000000 + %tmp182 = fsub float %tmp63, 0x7FF8000000000000 + %tmp183 = fsub float %tmp64, 0x7FF8000000000000 %tmp184 = fmul float %tmp181, 0.000000e+00 - %tmp185 = fmul float %tmp182, undef - %tmp186 = fmul float %tmp183, undef - %tmp187 = fsub float %tmp65, undef - %tmp188 = fsub float %tmp66, undef - %tmp189 = fmul float %tmp187, undef - %tmp190 = fmul float %tmp188, undef + %tmp185 = fmul float %tmp182, 0x7FF8000000000000 + %tmp186 = fmul float %tmp183, 0x7FF8000000000000 + %tmp187 = fsub float %tmp65, 0x7FF8000000000000 + %tmp188 = fsub float %tmp66, 0x7FF8000000000000 + %tmp189 = fmul float %tmp187, 0x7FF8000000000000 + %tmp190 = fmul float %tmp188, 0x7FF8000000000000 %tmp191 = call float @llvm.maxnum.f32(float %tmp184, float %tmp189) %tmp192 = call float @llvm.maxnum.f32(float %tmp185, float %tmp190) %tmp193 = call float @llvm.maxnum.f32(float %tmp186, float poison) @@ -225,10 +225,10 @@ ENDIF: ; preds = %LOOP %.temp292.7 = select i1 undef, float %tmp162, float poison %temp292.9 = select i1 false, float %tmp180, float %.temp292.7 %.temp292.9 = select i1 undef, float poison, float %temp292.9 - %tmp196 = fcmp ogt float undef, 0.000000e+00 - %tmp197 = fcmp olt float undef, %tmp195 + %tmp196 = fcmp ogt float 0x7FF8000000000000, 0.000000e+00 + %tmp197 = fcmp olt float 0x7FF8000000000000, %tmp195 %tmp198 = and i1 %tmp196, %tmp197 - %tmp199 = fcmp olt float undef, %.temp292.9 + %tmp199 = fcmp olt float 0x7FF8000000000000, %.temp292.9 %tmp200 = and i1 %tmp198, %tmp199 %temp292.11 = select i1 %tmp200, float poison, float %.temp292.9 %tid0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) @@ -249,13 +249,13 @@ ELSE2566: ; preds = %ENDIF ENDIF2564: ; preds = %ENDIF2594, %ENDIF2588 %temp894.1 = phi float [ poison, %ENDIF2588 ], [ %temp894.2, %ENDIF2594 ] %temp18.1 = phi float [ %tmp218, %ENDIF2588 ], [ poison, %ENDIF2594 ] - %tmp202 = fsub float %tmp5, undef - %tmp203 = fmul float %tmp202, undef + %tmp202 = fsub float %tmp5, 0x7FF8000000000000 + %tmp203 = fmul float %tmp202, 0x7FF8000000000000 %tmp204 = call float @llvm.maxnum.f32(float poison, float %tmp203) %tmp205 = call float @llvm.minnum.f32(float %tmp204, float poison) %tmp206 = call float @llvm.minnum.f32(float %tmp205, float poison) - %tmp207 = fcmp ogt float undef, 0.000000e+00 - %tmp208 = fcmp olt float undef, 1.000000e+00 + %tmp207 = fcmp ogt float 0x7FF8000000000000, 0.000000e+00 + %tmp208 = fcmp olt float 0x7FF8000000000000, 1.000000e+00 %tmp209 = and i1 %tmp207, %tmp208 %tid3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tidf3 = bitcast i32 %tid3 to float @@ -267,7 +267,7 @@ ELSE2584: ; preds = %IF2565 br label %ENDIF2582 ENDIF2582: ; preds = %ELSE2584, %IF2565 - %tmp212 = fadd float %tmp1, undef + %tmp212 = fadd float %tmp1, 0x7FF8000000000000 %tmp213 = fadd float 0.000000e+00, %tmp212 %floor = call float @llvm.floor.f32(float %tmp213) %tmp214 = fsub float %tmp213, %floor @@ -284,8 +284,8 @@ ELSE2590: ; preds = %ENDIF2582 ENDIF2588: ; preds = %ELSE2590, %IF2589 %tmp215 = fsub float 1.000000e+00, %tmp214 %tmp216 = call float @llvm.sqrt.f32(float %tmp215) - %tmp217 = fmul float %tmp216, undef - %tmp218 = fadd float %tmp217, undef + %tmp217 = fmul float %tmp216, 0x7FF8000000000000 + %tmp218 = fadd float %tmp217, 0x7FF8000000000000 br label %ENDIF2564 ELSE2593: ; preds = %ELSE2566 @@ -302,7 +302,7 @@ 
ELSE2596: ; preds = %ELSE2593 ENDIF2594: ; preds = %ELSE2788, %ELSE2785, %ELSE2782, %ELSE2779, %IF2775, %ELSE2761, %ELSE2758, %IF2757, %ELSE2704, %ELSE2686, %ELSE2671, %ELSE2668, %IF2667, %ELSE2632, %ELSE2596, %ELSE2593 %temp894.2 = phi float [ 0.000000e+00, %IF2667 ], [ 0.000000e+00, %ELSE2671 ], [ 0.000000e+00, %IF2757 ], [ 0.000000e+00, %ELSE2761 ], [ %temp894.0, %ELSE2758 ], [ 0.000000e+00, %IF2775 ], [ 0.000000e+00, %ELSE2779 ], [ 0.000000e+00, %ELSE2782 ], [ %.2848, %ELSE2788 ], [ 0.000000e+00, %ELSE2785 ], [ 0.000000e+00, %ELSE2593 ], [ 0.000000e+00, %ELSE2632 ], [ 0.000000e+00, %ELSE2704 ], [ 0.000000e+00, %ELSE2686 ], [ 0.000000e+00, %ELSE2668 ], [ 0.000000e+00, %ELSE2596 ] - %tmp225 = fmul float %temp894.2, undef + %tmp225 = fmul float %temp894.2, 0x7FF8000000000000 br label %ENDIF2564 ELSE2632: ; preds = %ELSE2596 @@ -319,7 +319,7 @@ IF2667: ; preds = %ELSE2650 ELSE2668: ; preds = %ELSE2650 %tmp229 = fcmp oeq float %temp292.11, %tmp128 - %tmp230 = fcmp olt float %tmp128, undef + %tmp230 = fcmp olt float %tmp128, 0x7FF8000000000000 %tmp231 = and i1 %tmp229, %tmp230 br i1 %tmp231, label %ENDIF2594, label %ELSE2686 @@ -328,13 +328,13 @@ ELSE2671: ; preds = %IF2667 ELSE2686: ; preds = %ELSE2668 %tmp232 = fcmp oeq float %temp292.11, %tmp145 - %tmp233 = fcmp olt float %tmp145, undef + %tmp233 = fcmp olt float %tmp145, 0x7FF8000000000000 %tmp234 = and i1 %tmp232, %tmp233 br i1 %tmp234, label %ENDIF2594, label %ELSE2704 ELSE2704: ; preds = %ELSE2686 %tmp235 = fcmp oeq float %temp292.11, %tmp180 - %tmp236 = fcmp olt float %tmp180, undef + %tmp236 = fcmp olt float %tmp180, 0x7FF8000000000000 %tmp237 = and i1 %tmp235, %tmp236 br i1 %tmp237, label %ENDIF2594, label %ELSE2740 @@ -351,7 +351,7 @@ ELSE2761: ; preds = %IF2757 br label %ENDIF2594 IF2775: ; preds = %ELSE2758 - %tmp238 = fcmp olt float undef, undef + %tmp238 = fcmp olt float 0x7FF8000000000000, 0x7FF8000000000000 br i1 %tmp238, label %ENDIF2594, label %ELSE2779 ELSE2779: ; preds = %IF2775 @@ -361,27 +361,27 @@ ELSE2782: ; preds = %ELSE2779 br i1 undef, label %ENDIF2594, label %ELSE2785 ELSE2785: ; preds = %ELSE2782 - %tmp239 = fcmp olt float undef, 0.000000e+00 + %tmp239 = fcmp olt float 0x7FF8000000000000, 0.000000e+00 br i1 %tmp239, label %ENDIF2594, label %ELSE2788 ELSE2788: ; preds = %ELSE2785 - %tmp240 = fcmp olt float 0.000000e+00, undef + %tmp240 = fcmp olt float 0.000000e+00, 0x7FF8000000000000 %.2848 = select i1 %tmp240, float -1.000000e+00, float 1.000000e+00 br label %ENDIF2594 ELSE2797: ; preds = %ENDIF2564 - %tmp241 = fsub float %tmp8, undef - %tmp242 = fsub float %tmp9, undef - %tmp243 = fsub float %tmp10, undef - %tmp244 = fmul float %tmp241, undef - %tmp245 = fmul float %tmp242, undef - %tmp246 = fmul float %tmp243, undef - %tmp247 = fsub float %tmp11, undef - %tmp248 = fsub float %tmp12, undef - %tmp249 = fsub float %tmp13, undef - %tmp250 = fmul float %tmp247, undef - %tmp251 = fmul float %tmp248, undef - %tmp252 = fmul float %tmp249, undef + %tmp241 = fsub float %tmp8, 0x7FF8000000000000 + %tmp242 = fsub float %tmp9, 0x7FF8000000000000 + %tmp243 = fsub float %tmp10, 0x7FF8000000000000 + %tmp244 = fmul float %tmp241, 0x7FF8000000000000 + %tmp245 = fmul float %tmp242, 0x7FF8000000000000 + %tmp246 = fmul float %tmp243, 0x7FF8000000000000 + %tmp247 = fsub float %tmp11, 0x7FF8000000000000 + %tmp248 = fsub float %tmp12, 0x7FF8000000000000 + %tmp249 = fsub float %tmp13, 0x7FF8000000000000 + %tmp250 = fmul float %tmp247, 0x7FF8000000000000 + %tmp251 = fmul float %tmp248, 0x7FF8000000000000 + %tmp252 = fmul float 
%tmp249, 0x7FF8000000000000 %tmp253 = call float @llvm.minnum.f32(float %tmp244, float %tmp250) %tmp254 = call float @llvm.minnum.f32(float %tmp245, float %tmp251) %tmp255 = call float @llvm.maxnum.f32(float %tmp246, float %tmp252) @@ -402,18 +402,18 @@ ELSE2800: ; preds = %ELSE2797 br i1 undef, label %ENDIF2795, label %ELSE2803 ELSE2803: ; preds = %ELSE2800 - %tmp264 = fsub float %tmp20, undef - %tmp265 = fsub float %tmp21, undef - %tmp266 = fsub float %tmp22, undef - %tmp267 = fmul float %tmp264, undef - %tmp268 = fmul float %tmp265, undef + %tmp264 = fsub float %tmp20, 0x7FF8000000000000 + %tmp265 = fsub float %tmp21, 0x7FF8000000000000 + %tmp266 = fsub float %tmp22, 0x7FF8000000000000 + %tmp267 = fmul float %tmp264, 0x7FF8000000000000 + %tmp268 = fmul float %tmp265, 0x7FF8000000000000 %tmp269 = fmul float %tmp266, 0.000000e+00 - %tmp270 = fsub float %tmp23, undef - %tmp271 = fsub float %tmp24, undef - %tmp272 = fsub float %tmp25, undef - %tmp273 = fmul float %tmp270, undef - %tmp274 = fmul float %tmp271, undef - %tmp275 = fmul float %tmp272, undef + %tmp270 = fsub float %tmp23, 0x7FF8000000000000 + %tmp271 = fsub float %tmp24, 0x7FF8000000000000 + %tmp272 = fsub float %tmp25, 0x7FF8000000000000 + %tmp273 = fmul float %tmp270, 0x7FF8000000000000 + %tmp274 = fmul float %tmp271, 0x7FF8000000000000 + %tmp275 = fmul float %tmp272, 0x7FF8000000000000 %tmp276 = call float @llvm.minnum.f32(float %tmp267, float %tmp273) %tmp277 = call float @llvm.maxnum.f32(float %tmp268, float %tmp274) %tmp278 = call float @llvm.maxnum.f32(float %tmp269, float %tmp275) @@ -429,14 +429,14 @@ ELSE2803: ; preds = %ELSE2800 br i1 %tmp287, label %ENDIF2795, label %ELSE2806 ELSE2806: ; preds = %ELSE2803 - %tmp288 = fsub float %tmp26, undef - %tmp289 = fsub float %tmp27, undef - %tmp290 = fsub float %tmp28, undef - %tmp291 = fmul float %tmp288, undef + %tmp288 = fsub float %tmp26, 0x7FF8000000000000 + %tmp289 = fsub float %tmp27, 0x7FF8000000000000 + %tmp290 = fsub float %tmp28, 0x7FF8000000000000 + %tmp291 = fmul float %tmp288, 0x7FF8000000000000 %tmp292 = fmul float %tmp289, 0.000000e+00 - %tmp293 = fmul float %tmp290, undef - %tmp294 = fsub float %tmp29, undef - %tmp295 = fmul float %tmp294, undef + %tmp293 = fmul float %tmp290, 0x7FF8000000000000 + %tmp294 = fsub float %tmp29, 0x7FF8000000000000 + %tmp295 = fmul float %tmp294, 0x7FF8000000000000 %tmp296 = call float @llvm.minnum.f32(float %tmp291, float %tmp295) %tmp297 = call float @llvm.minnum.f32(float %tmp292, float poison) %tmp298 = call float @llvm.maxnum.f32(float %tmp293, float poison) @@ -463,27 +463,27 @@ ELSE2818: ; preds = %ELSE2815 br i1 undef, label %ENDIF2795, label %ELSE2821 ELSE2821: ; preds = %ELSE2818 - %tmp307 = fsub float %tmp56, undef - %tmp308 = fsub float %tmp57, undef - %tmp309 = fsub float %tmp58, undef - %tmp310 = fmul float %tmp307, undef + %tmp307 = fsub float %tmp56, 0x7FF8000000000000 + %tmp308 = fsub float %tmp57, 0x7FF8000000000000 + %tmp309 = fsub float %tmp58, 0x7FF8000000000000 + %tmp310 = fmul float %tmp307, 0x7FF8000000000000 %tmp311 = fmul float %tmp308, 0.000000e+00 - %tmp312 = fmul float %tmp309, undef - %tmp313 = fsub float %tmp59, undef - %tmp314 = fsub float %tmp60, undef - %tmp315 = fsub float %tmp61, undef - %tmp316 = fmul float %tmp313, undef - %tmp317 = fmul float %tmp314, undef - %tmp318 = fmul float %tmp315, undef + %tmp312 = fmul float %tmp309, 0x7FF8000000000000 + %tmp313 = fsub float %tmp59, 0x7FF8000000000000 + %tmp314 = fsub float %tmp60, 0x7FF8000000000000 + %tmp315 = fsub float %tmp61, 0x7FF8000000000000 + 
%tmp316 = fmul float %tmp313, 0x7FF8000000000000 + %tmp317 = fmul float %tmp314, 0x7FF8000000000000 + %tmp318 = fmul float %tmp315, 0x7FF8000000000000 %tmp319 = call float @llvm.maxnum.f32(float %tmp310, float %tmp316) %tmp320 = call float @llvm.maxnum.f32(float %tmp311, float %tmp317) %tmp321 = call float @llvm.maxnum.f32(float %tmp312, float %tmp318) %tmp322 = call float @llvm.minnum.f32(float %tmp319, float %tmp320) %tmp323 = call float @llvm.minnum.f32(float %tmp322, float %tmp321) - %tmp324 = fcmp ogt float undef, 0.000000e+00 - %tmp325 = fcmp olt float undef, 1.000000e+00 + %tmp324 = fcmp ogt float 0x7FF8000000000000, 0.000000e+00 + %tmp325 = fcmp olt float 0x7FF8000000000000, 1.000000e+00 %tmp326 = and i1 %tmp324, %tmp325 - %tmp327 = fcmp olt float undef, %tmp323 + %tmp327 = fcmp olt float 0x7FF8000000000000, %tmp323 %tmp328 = and i1 %tmp326, %tmp327 br i1 %tmp328, label %ENDIF2795, label %ELSE2824 diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll index 31c982cdbaafe..ceb1ce4440de5 100644 --- a/llvm/test/CodeGen/AMDGPU/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -687,9 +687,10 @@ exit: ; GCN: buffer_load_dword v0, v0, ; GCN-NEXT: s_waitcnt ; GCN-NEXT: ; return to shader part epilog -define amdgpu_cs float @arg_divergence(i32 inreg %unused, <3 x i32> %arg4) #0 { +define amdgpu_cs float @arg_divergence(i32 inreg %cmp, <3 x i32> %arg4) #0 { main_body: - br i1 undef, label %if1, label %endif1 + %uniform.cond = icmp eq i32 %cmp, 0 + br i1 %uniform.cond, label %endif1, label %if1 if1: ; preds = %main_body store i32 0, ptr addrspace(3) poison, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/split-smrd.ll b/llvm/test/CodeGen/AMDGPU/split-smrd.ll index a39d50815cec8..dbb621d7d61e4 100644 --- a/llvm/test/CodeGen/AMDGPU/split-smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/split-smrd.ll @@ -6,11 +6,12 @@ ; GCN-LABEL: {{^}}split_smrd_add_worklist: ; GCN: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 -define amdgpu_ps void @split_smrd_add_worklist(ptr addrspace(4) inreg %arg) #0 { +define amdgpu_ps void @split_smrd_add_worklist(ptr addrspace(4) inreg %arg, i32 inreg %cond.arg) #0 { bb: %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> poison, i32 96, i32 0) %tmp1 = bitcast float %tmp to i32 - br i1 undef, label %bb2, label %bb3 + %scc = icmp eq i32 %cond.arg, 0 + br i1 %scc, label %bb2, label %bb3 bb2: ; preds = %bb unreachable diff --git a/llvm/test/CodeGen/AMDGPU/swdev373493.ll b/llvm/test/CodeGen/AMDGPU/swdev373493.ll index caf58823aa6d5..d2d6fdc14bb5f 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev373493.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev373493.ll @@ -54,7 +54,7 @@ define hidden fastcc void @bar(i32 %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %a ; CHECK-NEXT: .LBB0_5: ; %bb9 ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: - switch i32 undef, label %bb9 [ + switch i32 poison, label %bb9 [ i32 3, label %bb8 i32 1, label %bb7 ] diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll index ace7ff726e1d2..dd3499ed4dd68 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -119,9 +119,9 @@ define amdgpu_kernel void @truncate_high_elt_extract_vector(ptr addrspace(1) noc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm bb: - %tmp = getelementptr inbounds <2 x i16>, ptr addrspace(1) %arg, i64 undef + %tmp = getelementptr inbounds <2 x i16>, ptr addrspace(1) %arg, i64 0 %tmp3 = load <2 x i16>, ptr addrspace(1) %tmp, 
align 4 - %tmp4 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %arg1, i64 undef + %tmp4 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %arg1, i64 0 %tmp5 = load <2 x i16>, ptr addrspace(1) %tmp4, align 4 %tmp6 = sext <2 x i16> %tmp3 to <2 x i32> %tmp7 = sext <2 x i16> %tmp5 to <2 x i32> @@ -132,7 +132,7 @@ bb: %tmp12 = insertelement <2 x i32> %tmp11, i32 poison, i32 1 %tmp13 = lshr <2 x i32> %tmp12, %tmp14 = trunc <2 x i32> %tmp13 to <2 x i16> - %tmp15 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %arg2, i64 undef + %tmp15 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %arg2, i64 0 store <2 x i16> %tmp14, ptr addrspace(1) %tmp15, align 4 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 24c312e701e03..6045d423c6bad 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -2319,18 +2319,22 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readonly %arg) { ; SI-LABEL: fdiv_test_denormals: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 -; SI-NEXT: buffer_load_sbyte v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_sbyte v1, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_i32_e32 v2, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_i32_e32 v3, v1 ; SI-NEXT: v_xor_b32_e32 v0, v1, v0 -; SI-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; SI-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; SI-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; SI-NEXT: v_or_b32_e32 v0, 1, v0 ; SI-NEXT: v_mul_f32_e32 v1, v3, v4 ; SI-NEXT: v_trunc_f32_e32 v1, v1 @@ -2339,23 +2343,27 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon ; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fdiv_test_denormals: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_sbyte v1, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_sbyte v1, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_i32_e32 v2, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_i32_e32 v3, v1 ; VI-NEXT: v_xor_b32_e32 v0, v1, v0 -; VI-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; VI-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; VI-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; VI-NEXT: v_or_b32_e32 v0, 1, v0 ; VI-NEXT: v_mul_f32_e32 v1, v3, v4 ; VI-NEXT: v_trunc_f32_e32 v1, v1 @@ -2364,11 +2372,15 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon ; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GCN-LABEL: fdiv_test_denormals: ; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 
s[0:1], s[8:9], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_load_sbyte v2, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -2393,7 +2405,10 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon ; ; GFX1030-LABEL: fdiv_test_denormals: ; GFX1030: ; %bb.0: ; %bb -; GFX1030-NEXT: global_load_sbyte v2, v[0:1], off +; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX1030-NEXT: v_mov_b32_e32 v0, 0 +; GFX1030-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-NEXT: global_load_sbyte v2, v0, s[0:1] ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: v_mov_b32_e32 v1, 0 ; GFX1030-NEXT: global_load_sbyte v3, v[0:1], off @@ -2417,19 +2432,23 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon ; ; EG-LABEL: fdiv_test_denormals: ; EG: ; %bb.0: ; %bb -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 0, @10, KC0[], KC1[] +; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @8 -; EG-NEXT: ALU 25, @11, KC0[], KC1[] +; EG-NEXT: ALU 0, @13, KC0[], KC1[] +; EG-NEXT: TEX 0 @10 +; EG-NEXT: ALU 25, @14, KC0[], KC1[] ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X ; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 +; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 8: +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 +; EG-NEXT: Fetch clause starting at 10: ; EG-NEXT: VTX_READ_8 T1.X, T1.X, 0, #1 -; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: MOV * T0.X, KC0[2].Y, +; EG-NEXT: ALU clause starting at 13: ; EG-NEXT: MOV * T1.X, 0.0, -; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: ALU clause starting at 14: ; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: INT_TO_FLT * T0.X, PV.W, @@ -2459,7 +2478,7 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon bb: %tmp = load i8, ptr addrspace(1) null, align 1 %tmp1 = sext i8 %tmp to i32 - %tmp2 = getelementptr inbounds i8, ptr addrspace(1) %arg, i64 undef + %tmp2 = getelementptr inbounds i8, ptr addrspace(1) %arg, i64 0 %tmp3 = load i8, ptr addrspace(1) %tmp2, align 1 %tmp4 = sext i8 %tmp3 to i32 %tmp5 = sdiv i32 %tmp1, %tmp4 diff --git a/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll b/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll index 8a85e1e78ce7d..c88499d271814 100644 --- a/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll +++ b/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll @@ -26,8 +26,8 @@ for.body: ; preds = %for.body, %for.body %3 = load i32, ptr addrspace(1) %add.ptr2, align 4 %add.ptr3 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 %add.ptr4.sum %4 = load i32, ptr addrspace(1) %add.ptr3, align 4 - %add.ptr6 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 undef - br i1 undef, label %for.end, label %for.body + %add.ptr6 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 0 + br i1 poison, label %for.end, label %for.body for.end: ; preds = %for.body, %entry ret void @@ -57,20 +57,20 @@ for.body: ; preds = %for.body, %for.body %3 = load i32, ptr addrspace(1) %add.ptr2, align 4 %add.ptr3 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 %add.ptr4.sum %4 = load i32, ptr addrspace(1) %add.ptr3, align 4 - %add.ptr6 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 undef - br i1 undef, label %for.end, label %for.body + 
%add.ptr6 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 0 + br i1 poison, label %for.end, label %for.body for.end: ; preds = %for.body, %entry ret void } -; COMMON-LABEL: {{^}}branch_undef: +; COMMON-LABEL: {{^}}branch_poison: ; SI: s_cbranch_scc1 ; SI: s_cbranch_scc1 ; SI: s_endpgm -define amdgpu_kernel void @branch_undef(ptr addrspace(1) nocapture %main, i32 %main_stride) #0 { +define amdgpu_kernel void @branch_poison(ptr addrspace(1) nocapture %main, i32 %main_stride) #0 { entry: - br i1 undef, label %for.end, label %for.body.lr.ph + br i1 poison, label %for.end, label %for.body.lr.ph for.body.lr.ph: ; preds = %entry %add.ptr.sum = shl i32 %main_stride, 1 @@ -89,8 +89,8 @@ for.body: ; preds = %for.body, %for.body %3 = load i32, ptr addrspace(1) %add.ptr2, align 4 %add.ptr3 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 %add.ptr4.sum %4 = load i32, ptr addrspace(1) %add.ptr3, align 4 - %add.ptr6 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 undef - br i1 undef, label %for.end, label %for.body + %add.ptr6 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 0 + br i1 poison, label %for.end, label %for.body for.end: ; preds = %for.body, %entry ret void diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 43fea1d5a2ba3..45973840309ce 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -1338,7 +1338,7 @@ define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) { main_body: %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx1, i32 0, i32 0, i32 0) %src1.0 = bitcast float %src1 to i32 - %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef) + %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 poison) %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx0, i32 0, i32 0, i32 0) %src0.0 = bitcast float %src0 to i32 %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0)