diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index bd95bcd89e183..e22498589d8c6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3225,6 +3225,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               const SDLoc &DL, SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
 
   if (AMDGPU::isKernel(CallConv)) {
     return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
@@ -3251,6 +3252,8 @@
   SmallVector<SDValue, 48> RetOps;
   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+  SDValue ReadFirstLane =
+      DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
 
   // Copy the result values into the output registers.
   for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
        ++I, ++RealRVLocIdx) {
@@ -3278,7 +3281,9 @@
     default:
      llvm_unreachable("Unknown loc info!");
     }
-
+    if (TRI->isSGPRPhysReg(VA.getLocReg()))
+      Arg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Arg.getValueType(),
+                        ReadFirstLane, Arg);
     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
     Glue = Chain.getValue(1);
     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
diff --git a/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll b/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll
index 52259c4c2e6e1..1d51b8a077566 100644
--- a/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll
+++ b/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll
@@ -148,8 +148,8 @@ define <2 x i64> @v_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) {
 define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
 ; GFX9-LABEL: s_add_v2i64_splat_const_low_bits_known0_0:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_i32 s1, s1, 1
 ; GFX9-NEXT: s_add_i32 s3, s3, 1
+; GFX9-NEXT: s_add_i32 s1, s1, 1
 ; GFX9-NEXT: ; return to shader part epilog
   %add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
   ret <2 x i64> %add
@@ -158,8 +158,8 @@ define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64>
 define amdgpu_ps <2 x i64> @s_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
 ; GFX9-LABEL: s_add_v2i64_nonsplat_const_low_bits_known0_0:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_i32 s1, s1, 1
 ; GFX9-NEXT: s_add_i32 s3, s3, 2
+; GFX9-NEXT: s_add_i32 s1, s1, 1
 ; GFX9-NEXT: ; return to shader part epilog
   %add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
   ret <2 x i64> %add
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll
index 6885657bbfa36..37928a78622a6 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll
@@ -110,9 +110,11 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offset_rtn(double %val, <4 x i32
   ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
   ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
   ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY8]], implicit $exec
   ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
-  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY8]]
-  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY9]]
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
+  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
   ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
   ret double %ret
@@ -136,9 +138,11 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offen_rtn(double %val, <4 x i32>
   ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
   ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
   ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
   ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
-  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY9]]
-  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY10]]
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
+  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
   ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret double %ret
@@ -162,9 +166,11 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_idxen_rtn(double %val, <4 x i32>
   ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
   ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
   ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
   ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
-  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY9]]
-  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY10]]
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
+  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
   ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
   ret double %ret
@@ -190,9 +196,11 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, <4 x i32
   ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
   ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
   ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
   ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
-  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY10]]
-  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY11]]
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec
+  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
   ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret double %ret
@@ -334,9 +342,11 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offset_rtn(double %val, ptr
   ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
   ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
   ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
   ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
-  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY12]]
-  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY13]]
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
+  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
   ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
   ret double %ret
@@ -366,9 +376,11 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offen_rtn(double %val, ptr a
   ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
   ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
   ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
   ; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
-  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY13]]
-  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY14]]
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
+  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
   ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret double %ret
@@ -398,9 +410,11 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_idxen_rtn(double %val, ptr a
   ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
   ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
   ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
   ; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
-  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY13]]
-  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY14]]
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
+  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
   ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
   ret double %ret
@@ -432,9 +446,11 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_bothen_rtn(double %val, ptr
   ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
   ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
   ; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
   ; GFX90A_GFX942-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
-  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY14]]
-  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY15]]
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec
+  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
   ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret double %ret
diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
index 661af021e8a84..af4ca2ad7120a 100644
--- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
+++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
@@ -168,26 +168,26 @@ define <4 x i32> @csh_v4i32(<4 x i32> %a, <4 x i32> %b) {
 define amdgpu_ps <4 x i32> @s_csh_v4i32(<4 x i32> inreg %a, <4 x i32> inreg %b) {
 ; CHECK-LABEL: s_csh_v4i32:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: s_lshl_b32 s8, s0, s4
-; CHECK-NEXT: s_lshl_b32 s9, s1, s5
-; CHECK-NEXT: s_lshl_b32 s10, s2, s6
-; CHECK-NEXT: s_lshl_b32 s11, s3, s7
-; CHECK-NEXT: s_lshr_b32 s12, s0, s4
-; CHECK-NEXT: s_lshr_b32 s13, s1, s5
-; CHECK-NEXT: s_lshr_b32 s14, s2, s6
-; CHECK-NEXT: s_lshr_b32 s15, s3, s7
-; CHECK-NEXT: s_ashr_i32 s3, s3, s7
-; CHECK-NEXT: s_ashr_i32 s2, s2, s6
-; CHECK-NEXT: s_ashr_i32 s1, s1, s5
+; CHECK-NEXT: s_lshl_b32 s8, s3, s7
+; CHECK-NEXT: s_lshl_b32 s9, s2, s6
+; CHECK-NEXT: s_lshl_b32 s10, s1, s5
+; CHECK-NEXT: s_lshl_b32 s11, s0, s4
+; CHECK-NEXT: s_lshr_b32 s12, s3, s7
+; CHECK-NEXT: s_lshr_b32 s13, s2, s6
+; CHECK-NEXT: s_lshr_b32 s14, s1, s5
+; CHECK-NEXT: s_lshr_b32 s15, s0, s4
 ; CHECK-NEXT: s_ashr_i32 s0, s0, s4
+; CHECK-NEXT: s_ashr_i32 s1, s1, s5
+; CHECK-NEXT: s_ashr_i32 s2, s2, s6
+; CHECK-NEXT: s_ashr_i32 s3, s3, s7
 ; CHECK-NEXT: s_add_i32 s4, s11, s15
 ; CHECK-NEXT: s_add_i32 s5, s10, s14
 ; CHECK-NEXT: s_add_i32 s6, s9, s13
 ; CHECK-NEXT: s_add_i32 s7, s8, s12
-; CHECK-NEXT: s_add_i32 s0, s7, s0
-; CHECK-NEXT: s_add_i32 s1, s6, s1
-; CHECK-NEXT: s_add_i32 s2, s5, s2
-; CHECK-NEXT: s_add_i32 s3, s4, s3
+; CHECK-NEXT: s_add_i32 s3, s7, s3
+; CHECK-NEXT: s_add_i32 s2, s6, s2
+; CHECK-NEXT: s_add_i32 s1, s5, s1
+; CHECK-NEXT: s_add_i32 s0, s4, s0
 ; CHECK-NEXT: ; return to shader part epilog
 ;
 ; GISEL-LABEL: s_csh_v4i32:
diff --git a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
index 4b4718a2acb80..d63a36c4b2958 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
@@ -10,8 +10,10 @@ define amdgpu_ps i32 @s_or_i32_disjoint(i32 inreg %a, i32 inreg %b) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; CHECK-NEXT: %3:sreg_32 = disjoint S_OR_B32 [[COPY1]], [[COPY]], implicit-def dead $scc
-  ; CHECK-NEXT: $sgpr0 = COPY %3
+  ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY1]], [[COPY]], implicit-def dead $scc
+  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
+  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY2]], implicit $exec
+  ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0
   %result = or disjoint i32 %a, %b
   ret i32 %result
@@ -26,10 +28,14 @@ define amdgpu_ps <2 x i32> @s_or_v2i32_disjoint(<2 x i32> inreg %a, <2 x i32> in
   ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; CHECK-NEXT: %5:sreg_32 = disjoint S_OR_B32 [[COPY3]], [[COPY1]], implicit-def dead $scc
-  ; CHECK-NEXT: %6:sreg_32 = disjoint S_OR_B32 [[COPY2]], [[COPY]], implicit-def dead $scc
-  ; CHECK-NEXT: $sgpr0 = COPY %5
-  ; CHECK-NEXT: $sgpr1 = COPY %6
+  ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY2]], [[COPY]], implicit-def dead $scc
+  ; CHECK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY3]], [[COPY1]], implicit-def dead $scc
+  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_1]]
+  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec
+  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
+  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec
+  ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
   ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %result = or disjoint <2 x i32> %a, %b
   ret <2 x i32> %result
@@ -42,8 +48,8 @@ define i32 @v_or_i32_disjoint(i32 %a, i32 %b) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK-NEXT: %10:vgpr_32 = disjoint V_OR_B32_e64 [[COPY1]], [[COPY]], implicit $exec
-  ; CHECK-NEXT: $vgpr0 = COPY %10
+  ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY1]], [[COPY]], implicit $exec
+  ; CHECK-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; CHECK-NEXT: SI_RETURN implicit $vgpr0
   %result = or disjoint i32 %a, %b
   ret i32 %result
@@ -58,10 +64,10 @@ define <2 x i32> @v_or_v2i32_disjoint(<2 x i32> %a, <2 x i32> %b) {
   ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK-NEXT: %12:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec
-  ; CHECK-NEXT: %13:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec
-  ; CHECK-NEXT: $vgpr0 = COPY %12
-  ; CHECK-NEXT: $vgpr1 = COPY %13
+  ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec
+  ; CHECK-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec
+  ; CHECK-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
+  ; CHECK-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
   %result = or disjoint <2 x i32> %a, %b
   ret <2 x i32> %result
@@ -78,11 +84,15 @@ define amdgpu_ps i64 @s_or_i64_disjoint(i64 inreg %a, i64 inreg %b) {
   ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
   ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
   ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; CHECK-NEXT: %7:sreg_64 = disjoint S_OR_B64 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], implicit-def dead $scc
-  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY %7.sub1
-  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY %7.sub0
-  ; CHECK-NEXT: $sgpr0 = COPY [[COPY5]]
-  ; CHECK-NEXT: $sgpr1 = COPY [[COPY4]]
+  ; CHECK-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = disjoint S_OR_B64 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], implicit-def dead $scc
+  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1
+  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub0
+  ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY6]], implicit $exec
+  ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+  ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY7]], implicit $exec
+  ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
   ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %result = or disjoint i64 %a, %b
   ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
index 36714b386e7e5..f2f8c0a5cfa8f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
@@ -44,9 +44,11 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_intrinsic(ptr %ptr, double %da
   ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
   ; GFX90A_GFX942-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s64) on %ir.ptr)
   ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY6]], implicit $exec
   ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1
-  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY6]]
-  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY7]]
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY7]], implicit $exec
+  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
   ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr %ptr, double %data)
   ret double %ret
@@ -117,9 +119,11 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da
   ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
   ; GFX90A_GFX942-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
   ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY6]], implicit $exec
   ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1
-  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY6]]
-  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY7]]
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY7]], implicit $exec
+  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
   ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret double %ret
@@ -144,9 +148,11 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw__noprivate(ptr %ptr,
   ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
   ; GFX90A_GFX942-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
   ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY6]], implicit $exec
   ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1
-  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY6]]
-  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY7]]
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY7]], implicit $exec
+  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
   ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret double %ret
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
index fa4e7f87853dd..682c1cd8060aa 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
@@ -62,9 +62,11 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) %
   ; GFX90A-NEXT: [[PHI3:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1
   ; GFX90A-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0
+  ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
   ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1
-  ; GFX90A-NEXT: $sgpr0 = COPY [[COPY12]]
-  ; GFX90A-NEXT: $sgpr1 = COPY [[COPY13]]
+  ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
+  ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
   ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   ;
   ; GFX942-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw
@@ -81,9 +83,11 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) %
   ; GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
   ; GFX942-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
   ; GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+  ; GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY6]], implicit $exec
   ; GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
-  ; GFX942-NEXT: $sgpr0 = COPY [[COPY6]]
-  ; GFX942-NEXT: $sgpr1 = COPY [[COPY7]]
+  ; GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY7]], implicit $exec
+  ; GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
   ; GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
   ret double %ret
@@ -123,9 +127,11 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspac
   ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
   ; GFX90A_GFX942-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec
   ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
-  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY5]]
-  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY6]]
+  ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY6]], implicit $exec
+  ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
   ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
   ret double %ret
diff --git a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
index c88113d62a887..e82801eadc936 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
@@ -358,8 +358,8 @@ define <2 x bfloat> @v_uitofp_v2i1_to_v2bf16(<2 x i1> %num) {
 define amdgpu_ps <2 x i32> @s_uitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
 ; GFX7-LABEL: s_uitofp_v2i1_to_v2bf16:
 ; GFX7: ; %bb.0:
-; GFX7-NEXT: s_and_b32 s2, 1, s0
-; GFX7-NEXT: s_bitcmp1_b32 s1, 0
+; GFX7-NEXT: s_and_b32 s2, 1, s1
+; GFX7-NEXT: s_bitcmp1_b32 s0, 0
 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
 ; GFX7-NEXT: s_cmp_eq_u32 s2, 1
 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
@@ -367,14 +367,14 @@ define amdgpu_ps <2 x i32> @s_uitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_readfirstlane_b32 s0, v1
-; GFX7-NEXT: v_readfirstlane_b32 s1, v0
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s1, v1
 ; GFX7-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uitofp_v2i1_to_v2bf16:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s2, 1, s0
-; GFX9-NEXT: s_bitcmp1_b32 s1, 0
+; GFX9-NEXT: s_and_b32 s2, 1, s1
+; GFX9-NEXT: s_bitcmp1_b32 s0, 0
 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1
 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
@@ -396,77 +396,75 @@ define amdgpu_ps <2 x i32> @s_uitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_uitofp_v2i1_to_v2bf16:
 ; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s0, 1, s0
-; GFX11-NEXT: s_bitcmp1_b32 s1, 0
-; GFX11-NEXT: s_cselect_b32 s1, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s0, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s1
+; GFX11-NEXT: s_and_b32 s1, 1, s1
+; GFX11-NEXT: s_bitcmp1_b32 s0, 0
+; GFX11-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s1, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
 ; GFX11-NEXT: ; return to shader part epilog
 ;
 ; GFX12-LABEL: s_uitofp_v2i1_to_v2bf16:
 ; GFX12: ; %bb.0:
-; GFX12-NEXT: s_and_b32 s0, 1, s0
-; GFX12-NEXT: s_bitcmp1_b32 s1, 0
-; GFX12-NEXT: s_cselect_b32 s1, -1, 0
-; GFX12-NEXT: s_cmp_eq_u32 s0, 1
-; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s1
+; GFX12-NEXT: s_and_b32 s1, 1, s1
+; GFX12-NEXT: s_bitcmp1_b32 s0, 0
+; GFX12-NEXT: s_cselect_b32 s0, -1, 0
+; GFX12-NEXT: s_cmp_eq_u32 s1, 1
+; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
 ; GFX12-NEXT: s_cselect_b32 s0, -1, 0
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX12-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v1
 ; GFX12-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX12-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
 ; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX12-NEXT: v_readfirstlane_b32 s0, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_readfirstlane_b32 s1, v0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v1
 ; GFX12-NEXT: s_wait_alu 0xf1ff
 ; GFX12-NEXT: ; return to shader part epilog
   %op = uitofp <2 x i1> %num to <2 x bfloat>
@@ -706,11 +704,11 @@ define <3 x bfloat> @v_uitofp_v3i1_to_v3bf16(<3 x i1> %num) {
 define amdgpu_ps <3 x i32> @s_uitofp_v3i1_to_v3bf16(<3 x i1> inreg %num) {
 ; GFX7-LABEL: s_uitofp_v3i1_to_v3bf16:
 ; GFX7: ; %bb.0:
-; GFX7-NEXT: s_and_b32 s4, 1, s0
-; GFX7-NEXT: s_and_b32 s3, 1, s1
-; GFX7-NEXT: s_bitcmp1_b32 s2, 0
+; GFX7-NEXT: s_and_b32 s4, 1, s2
+; GFX7-NEXT: s_and_b32 s2, 1, s1
+; GFX7-NEXT: s_bitcmp1_b32 s0, 0
 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX7-NEXT: s_cmp_eq_u32 s3, 1
+; GFX7-NEXT: s_cmp_eq_u32 s2, 1
 ; GFX7-NEXT: s_cselect_b64 s[2:3], -1, 0
 ; GFX7-NEXT: s_cmp_eq_u32 s4, 1
 ; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
@@ -720,18 +718,18 @@ define amdgpu_ps <3 x i32> @s_uitofp_v3i1_to_v3bf16(<3 x i1> inreg %num) {
 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s0, v2
 ; GFX7-NEXT: v_readfirstlane_b32 s1, v1
-; GFX7-NEXT: v_readfirstlane_b32 s2, v2
+; GFX7-NEXT: v_readfirstlane_b32 s2, v0
 ; GFX7-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uitofp_v3i1_to_v3bf16:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s4, 1, s0
-; GFX9-NEXT: s_and_b32 s3, 1, s1
-; GFX9-NEXT: s_bitcmp1_b32 s2, 0
+; GFX9-NEXT: s_and_b32 s4, 1, s2
+; GFX9-NEXT: s_and_b32 s2, 1, s1
+; GFX9-NEXT: s_bitcmp1_b32 s0, 0
 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s3, 1
+; GFX9-NEXT: s_cmp_eq_u32 s2, 1
 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
 ; GFX9-NEXT: s_cmp_eq_u32 s4, 1
 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
@@ -759,99 +757,96 @@ define amdgpu_ps <3 x i32> @s_uitofp_v3i1_to_v3bf16(<3 x i1> inreg %num) {
 ; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s0, v2
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_uitofp_v3i1_to_v3bf16:
 ; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s0, 1, s0
+; GFX11-NEXT: s_and_b32 s2, 1, s2
 ; GFX11-NEXT: s_and_b32 s1, 1, s1
-; GFX11-NEXT: s_bitcmp1_b32 s2, 0
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-NEXT: s_bitcmp1_b32 s0, 0
+; GFX11-NEXT: s_cselect_b32 s0, -1, 0
 ; GFX11-NEXT: s_cmp_eq_u32 s1, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
 ; GFX11-NEXT: s_cselect_b32 s1, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s0, 1
+; GFX11-NEXT: s_cmp_eq_u32 s2, 1
 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s1
 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v2
 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readfirstlane_b32 s2, v2
 ; GFX11-NEXT: ; return to shader part epilog
 ;
 ; GFX12-LABEL: s_uitofp_v3i1_to_v3bf16:
 ; GFX12: ; %bb.0:
-; GFX12-NEXT: s_and_b32 s0, 1, s0
+; GFX12-NEXT: s_and_b32 s2, 1, s2
 ; GFX12-NEXT: s_and_b32 s1, 1, s1
-; GFX12-NEXT: s_bitcmp1_b32 s2, 0
-; GFX12-NEXT: s_cselect_b32 s2, -1, 0
+; GFX12-NEXT: s_bitcmp1_b32 s0, 0
+; GFX12-NEXT: s_cselect_b32 s0, -1, 0
 ; GFX12-NEXT: s_cmp_eq_u32 s1, 1
-; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2
+; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
 ; GFX12-NEXT: s_cselect_b32 s1, -1, 0
-; GFX12-NEXT: s_cmp_eq_u32 s0, 1
+; GFX12-NEXT: s_cmp_eq_u32 s2, 1
+; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s1
 ; GFX12-NEXT: s_cselect_b32 s0, -1, 0
-; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v4, v1, 16, 1
 ; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v1
 ; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1
 ; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_add_nc_u32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v0
 ; GFX12-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX12-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v2
 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
 ; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-NEXT: v_readfirstlane_b32 s1, v0
 ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT: v_readfirstlane_b32 s0, v1
 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -1154,10 +1149,10 @@ define <4 x bfloat> @v_uitofp_v4i1_to_v4bf16(<4 x i1> %num) {
 define amdgpu_ps <4 x i32> @s_uitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) {
 ; GFX7-LABEL: s_uitofp_v4i1_to_v4bf16:
 ; GFX7: ; %bb.0:
-; GFX7-NEXT: s_and_b32 s6, 1, s0
-; GFX7-NEXT: s_and_b32 s4, 1, s1
-; GFX7-NEXT: s_and_b32 s2, 1, s2
-; GFX7-NEXT: s_bitcmp1_b32 s3, 0
+; GFX7-NEXT: s_and_b32 s6, 1, s3
+; GFX7-NEXT: s_and_b32 s4, 1, s2
+; GFX7-NEXT: s_and_b32 s2, 1, s1
+; GFX7-NEXT: s_bitcmp1_b32 s0, 0
 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
 ; GFX7-NEXT: s_cmp_eq_u32 s2, 1
 ; GFX7-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -1173,18 +1168,18 @@ define amdgpu_ps <4 x i32> @s_uitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) {
 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_readfirstlane_b32 s0, v0
-; GFX7-NEXT: v_readfirstlane_b32 s1, v1
-; GFX7-NEXT: v_readfirstlane_b32 s2, v2
-; GFX7-NEXT: v_readfirstlane_b32 s3, v3
+; GFX7-NEXT: v_readfirstlane_b32 s0, v3
+; GFX7-NEXT: v_readfirstlane_b32 s1, v2
+; GFX7-NEXT: v_readfirstlane_b32 s2, v1
+; GFX7-NEXT: v_readfirstlane_b32 s3, v0
 ; GFX7-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uitofp_v4i1_to_v4bf16:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s6, 1, s0
-; GFX9-NEXT: s_and_b32 s4, 1, s1
-; GFX9-NEXT: s_and_b32 s2, 1, s2
-; GFX9-NEXT: s_bitcmp1_b32 s3, 0
+; GFX9-NEXT: s_and_b32 s6, 1, s3
+; GFX9-NEXT: s_and_b32 s4, 1, s2
+; GFX9-NEXT: s_and_b32 s2, 1, s1
+; GFX9-NEXT: s_bitcmp1_b32 s0, 0
 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1
 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -1216,7 +1211,7 @@ define amdgpu_ps <4 x i32> @s_uitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) {
 ; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_readfirstlane_b32 s3, v0
 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[0:1]
 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
@@ -1225,133 +1220,126 @@ define amdgpu_ps <4 x i32> @s_uitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) {
 ; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5
 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s2, v1
 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s2, v2
-; GFX9-NEXT: v_readfirstlane_b32 s3, v3
+; GFX9-NEXT: v_readfirstlane_b32 s1, v2
+; GFX9-NEXT: v_readfirstlane_b32 s0, v3
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_uitofp_v4i1_to_v4bf16:
 ; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s0, 1, s0
-; GFX11-NEXT: s_and_b32 s1, 1, s1
+; GFX11-NEXT: s_and_b32 s3, 1, s3
 ; GFX11-NEXT: s_and_b32 s2, 1, s2
-; GFX11-NEXT: s_bitcmp1_b32 s3, 0
-; GFX11-NEXT: s_cselect_b32 s3, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s2, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s3
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-NEXT: s_and_b32 s1, 1, s1
+; GFX11-NEXT: s_bitcmp1_b32 s0, 0
+; GFX11-NEXT: s_cselect_b32 s0, -1, 0
 ; GFX11-NEXT: s_cmp_eq_u32 s1, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s0
 ; GFX11-NEXT: s_cselect_b32 s1, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s0, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s1
-; GFX11-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s2, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s1
+; GFX11-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s3, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s2
+; GFX11-NEXT: s_cselect_b32 s3, -1, 0
 ; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
-; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s3
 ; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4
 ; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v1
+; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
 ; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v6
+; GFX11-NEXT: v_dual_cndmask_b32 v1, v5, v9 :: v_dual_add_nc_u32 v2, v2, v0
 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v11, vcc_lo
 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v10, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_readfirstlane_b32 s1, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s2, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-NEXT: v_readfirstlane_b32 s1, v3
+; GFX11-NEXT: v_readfirstlane_b32 s3, v0
 ; GFX11-NEXT: ; return to shader part epilog
 ;
 ; GFX12-LABEL: s_uitofp_v4i1_to_v4bf16:
 ; GFX12: ; %bb.0:
-; GFX12-NEXT: s_and_b32 s0, 1, s0
-; GFX12-NEXT: s_and_b32 s1, 1, s1
+; GFX12-NEXT: s_and_b32 s3, 1, s3
 ; GFX12-NEXT: s_and_b32 s2, 1, s2
-; GFX12-NEXT: s_bitcmp1_b32 s3, 0
-; GFX12-NEXT: s_cselect_b32 s3, -1, 0
-; GFX12-NEXT: s_cmp_eq_u32 s2, 1
-; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s3
-; GFX12-NEXT: s_cselect_b32 s2, -1, 0
+; GFX12-NEXT: s_and_b32 s1, 1, s1
+; GFX12-NEXT: s_bitcmp1_b32 s0, 0
+; GFX12-NEXT: s_cselect_b32 s0, -1, 0
 ; GFX12-NEXT: s_cmp_eq_u32 s1, 1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s2
+; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s0
 ; GFX12-NEXT: s_cselect_b32 s1, -1, 0
-; GFX12-NEXT: s_cmp_eq_u32 s0, 1
-; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s1
-; GFX12-NEXT: s_cselect_b32 s0, -1, 0
+; GFX12-NEXT: s_cmp_eq_u32 s2, 1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s1
+; GFX12-NEXT: s_cselect_b32 s2, -1, 0
+; GFX12-NEXT: s_cmp_eq_u32 s3, 1
+; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s2
+; GFX12-NEXT: s_cselect_b32 s3, -1, 0
 ; GFX12-NEXT: v_bfe_u32 v7, v4, 16, 1
 ; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
-; GFX12-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s3
 ; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v4
 ; GFX12-NEXT: v_add_nc_u32_e32 v7, v7, v4
-; GFX12-NEXT: v_add_nc_u32_e32 v5, v5, v1
+; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1
 ; GFX12-NEXT: v_add_nc_u32_e32 v6, v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX12-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX12-NEXT: v_add_nc_u32_e32 v5, v5, v1
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v3
+; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v9, vcc_lo
-; GFX12-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v6
+; GFX12-NEXT: v_dual_cndmask_b32 v1, v5, v9 :: v_dual_add_nc_u32 v2, v2, v0
 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v3, v6, v11, vcc_lo
 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_readfirstlane_b32 s0, v0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_readfirstlane_b32 s2, v1
 ; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_readfirstlane_b32 s1, v2
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-NEXT: v_cndmask_b32_e32 v4, v7, v10, vcc_lo
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_readfirstlane_b32 s2, v3
-; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_readfirstlane_b32 s3, v1
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_readfirstlane_b32 s1, v3
+; GFX12-NEXT: v_readfirstlane_b32 s3, v0
 ; GFX12-NEXT: s_wait_alu 0xf1ff
 ; GFX12-NEXT: ; return to shader part epilog
   %op = uitofp <4 x i1> %num to <4 x bfloat>
@@ -1712,8 +1700,8 @@ define <2 x bfloat> @v_sitofp_v2i1_to_v2bf16(<2 x i1> %num) {
 define amdgpu_ps <2 x i32> @s_sitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
 ; GFX7-LABEL: s_sitofp_v2i1_to_v2bf16:
 ; GFX7: ; %bb.0:
-; GFX7-NEXT: s_and_b32 s2, 1, s1
-; GFX7-NEXT: s_bitcmp1_b32 s0, 0
+; GFX7-NEXT: s_and_b32 s2, 1, s0
+; GFX7-NEXT: s_bitcmp1_b32 s1, 0
 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
 ; GFX7-NEXT: s_cmp_eq_u32 s2, 1
 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1]
@@ -1721,14 +1709,14 @@ define amdgpu_ps <2 x i32> @s_sitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s[0:1]
 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
 ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX7-NEXT: v_readfirstlane_b32 s0, v0
-; GFX7-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7-NEXT: v_readfirstlane_b32 s0, v1
+; GFX7-NEXT: v_readfirstlane_b32 s1, v0
 ; GFX7-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_sitofp_v2i1_to_v2bf16:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s2, 1, s1
-; GFX9-NEXT: s_bitcmp1_b32 s0, 0
+; GFX9-NEXT: s_and_b32 s2, 1, s0
+; GFX9-NEXT: s_bitcmp1_b32 s1, 0
 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1
 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1]
@@ -1750,75 +1738,77 @@ define amdgpu_ps <2 x i32> @s_sitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_sitofp_v2i1_to_v2bf16:
 ; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s1, 1, s1
-; GFX11-NEXT: s_bitcmp1_b32 s0, 0
-; GFX11-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s1, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
+; GFX11-NEXT: s_and_b32 s0, 1, s0
+; GFX11-NEXT: s_bitcmp1_b32 s1, 0
+; GFX11-NEXT: s_cselect_b32 s1, -1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s0, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s1
 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v1
 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v0
 ; GFX11-NEXT: ; return to shader part epilog
 ;
 ; GFX12-LABEL: s_sitofp_v2i1_to_v2bf16:
 ; GFX12: ; %bb.0:
-; GFX12-NEXT: s_and_b32 s1, 1, s1
-; GFX12-NEXT: s_bitcmp1_b32 s0, 0
-; GFX12-NEXT: s_cselect_b32 s0, -1, 0
-; GFX12-NEXT: s_cmp_eq_u32 s1, 1
-; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
+; GFX12-NEXT: s_and_b32 s0, 1, s0
+; GFX12-NEXT: s_bitcmp1_b32 s1, 0
+; GFX12-NEXT: s_cselect_b32 s1, -1, 0
+; GFX12-NEXT: s_cmp_eq_u32 s0, 1
+; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s1
 ; GFX12-NEXT: s_cselect_b32 s0, -1, 0
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX12-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX12-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX12-NEXT: v_readfirstlane_b32 s0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
 ; GFX12-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX12-NEXT: v_readfirstlane_b32 s1, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v1
+;
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s1, v0 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: ; return to shader part epilog %op = sitofp <2 x i1> %num to <2 x bfloat> @@ -2058,11 +2048,11 @@ define <3 x bfloat> @v_sitofp_v3i1_to_v3bf16(<3 x i1> %num) { define amdgpu_ps <3 x i32> @s_sitofp_v3i1_to_v3bf16(<3 x i1> inreg %num) { ; GFX7-LABEL: s_sitofp_v3i1_to_v3bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_and_b32 s4, 1, s2 -; GFX7-NEXT: s_and_b32 s2, 1, s1 -; GFX7-NEXT: s_bitcmp1_b32 s0, 0 +; GFX7-NEXT: s_and_b32 s4, 1, s0 +; GFX7-NEXT: s_and_b32 s3, 1, s1 +; GFX7-NEXT: s_bitcmp1_b32 s2, 0 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX7-NEXT: s_cmp_eq_u32 s2, 1 +; GFX7-NEXT: s_cmp_eq_u32 s3, 1 ; GFX7-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX7-NEXT: s_cmp_eq_u32 s4, 1 ; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 @@ -2072,18 +2062,18 @@ define amdgpu_ps <3 x i32> @s_sitofp_v3i1_to_v3bf16(<3 x i1> inreg %num) { ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v2 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s1, v1 -; GFX7-NEXT: v_readfirstlane_b32 s2, v0 +; GFX7-NEXT: v_readfirstlane_b32 s2, v2 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_sitofp_v3i1_to_v3bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s4, 1, s2 -; GFX9-NEXT: s_and_b32 s2, 1, s1 -; GFX9-NEXT: s_bitcmp1_b32 s0, 0 +; GFX9-NEXT: s_and_b32 s4, 1, s0 +; GFX9-NEXT: s_and_b32 s3, 1, s1 +; GFX9-NEXT: s_bitcmp1_b32 s2, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s2, 1 +; GFX9-NEXT: s_cmp_eq_u32 s3, 1 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s4, 1 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 @@ -2111,96 +2101,99 @@ define amdgpu_ps <3 x i32> @s_sitofp_v3i1_to_v3bf16(<3 x i1> inreg %num) { ; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_sitofp_v3i1_to_v3bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s2, 1, s2 +; GFX11-NEXT: s_and_b32 s0, 1, s0 ; GFX11-NEXT: s_and_b32 s1, 1, s1 -; GFX11-NEXT: s_bitcmp1_b32 s0, 0 -; GFX11-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-NEXT: s_bitcmp1_b32 s2, 0 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, s2 ; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s2, 1 +; GFX11-NEXT: s_cmp_eq_u32 s0, 1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s1 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, s0 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 
vcc_lo, v0, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 ; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: s_sitofp_v3i1_to_v3bf16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_and_b32 s2, 1, s2 +; GFX12-NEXT: s_and_b32 s0, 1, s0 ; GFX12-NEXT: s_and_b32 s1, 1, s1 -; GFX12-NEXT: s_bitcmp1_b32 s0, 0 -; GFX12-NEXT: s_cselect_b32 s0, -1, 0 +; GFX12-NEXT: s_bitcmp1_b32 s2, 0 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 ; GFX12-NEXT: s_cmp_eq_u32 s1, 1 -; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, s2 ; GFX12-NEXT: s_cselect_b32 s1, -1, 0 -; GFX12-NEXT: s_cmp_eq_u32 s2, 1 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_eq_u32 s0, 1 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s1 ; GFX12-NEXT: s_cselect_b32 s0, -1, 0 -; GFX12-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, s0 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0 +; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX12-NEXT: v_add_nc_u32_e32 v4, v4, v1 -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_nc_u32_e32 v5, v5, v2 +; GFX12-NEXT: 
v_bfe_u32 v4, v1, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX12-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX12-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v2 +; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 ; GFX12-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc_lo ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_readfirstlane_b32 s1, v0 ; GFX12-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12-NEXT: v_ashrrev_i32_e32 v2, 16, v2 @@ -2503,10 +2496,10 @@ define <4 x bfloat> @v_sitofp_v4i1_to_v4bf16(<4 x i1> %num) { define amdgpu_ps <4 x i32> @s_sitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) { ; GFX7-LABEL: s_sitofp_v4i1_to_v4bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_and_b32 s6, 1, s3 -; GFX7-NEXT: s_and_b32 s4, 1, s2 -; GFX7-NEXT: s_and_b32 s2, 1, s1 -; GFX7-NEXT: s_bitcmp1_b32 s0, 0 +; GFX7-NEXT: s_and_b32 s6, 1, s0 +; GFX7-NEXT: s_and_b32 s4, 1, s1 +; GFX7-NEXT: s_and_b32 s2, 1, s2 +; GFX7-NEXT: s_bitcmp1_b32 s3, 0 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX7-NEXT: s_cmp_eq_u32 s2, 1 ; GFX7-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -2522,18 +2515,18 @@ define amdgpu_ps <4 x i32> @s_sitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) { ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v3 -; GFX7-NEXT: v_readfirstlane_b32 s1, v2 -; GFX7-NEXT: v_readfirstlane_b32 s2, v1 -; GFX7-NEXT: v_readfirstlane_b32 s3, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7-NEXT: v_readfirstlane_b32 s3, v3 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_sitofp_v4i1_to_v4bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s6, 1, s3 -; GFX9-NEXT: s_and_b32 s4, 1, s2 -; GFX9-NEXT: s_and_b32 s2, 1, s1 -; GFX9-NEXT: s_bitcmp1_b32 s0, 0 +; GFX9-NEXT: s_and_b32 s6, 1, s0 +; GFX9-NEXT: s_and_b32 s4, 1, s1 +; GFX9-NEXT: s_and_b32 s2, 1, s2 +; GFX9-NEXT: s_bitcmp1_b32 s3, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -2565,7 +2558,7 @@ define amdgpu_ps <4 x i32> @s_sitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) { ; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc ; GFX9-NEXT: 
v_cndmask_b32_e64 v3, 0, -1.0, s[0:1] ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 @@ -2574,126 +2567,133 @@ define amdgpu_ps <4 x i32> @s_sitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) { ; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 16, v2 -; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 16, v3 -; GFX9-NEXT: v_readfirstlane_b32 s1, v2 -; GFX9-NEXT: v_readfirstlane_b32 s0, v3 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_sitofp_v4i1_to_v4bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s3, 1, s3 -; GFX11-NEXT: s_and_b32 s2, 1, s2 +; GFX11-NEXT: s_and_b32 s0, 1, s0 ; GFX11-NEXT: s_and_b32 s1, 1, s1 -; GFX11-NEXT: s_bitcmp1_b32 s0, 0 -; GFX11-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, s0 -; GFX11-NEXT: s_cselect_b32 s1, -1, 0 +; GFX11-NEXT: s_and_b32 s2, 1, s2 +; GFX11-NEXT: s_bitcmp1_b32 s3, 0 +; GFX11-NEXT: s_cselect_b32 s3, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s2, 1 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, -1.0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s3 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s3, 1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s2 -; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 1 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, -1.0, s2 +; GFX11-NEXT: s_cselect_b32 s1, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 1 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, s1 +; GFX11-NEXT: s_cselect_b32 s0, -1, 0 ; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s3 -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 ; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 ; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v5, v9 :: v_dual_add_nc_u32 v2, v2, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v11, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 
v2, 0x7fff, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v10, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo -; GFX11-NEXT: v_ashrrev_i32_e32 v2, 16, v3 -; GFX11-NEXT: v_ashrrev_i32_e32 v3, 16, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_readfirstlane_b32 s1, v3 -; GFX11-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s2, v3 +; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: s_sitofp_v4i1_to_v4bf16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_and_b32 s3, 1, s3 -; GFX12-NEXT: s_and_b32 s2, 1, s2 +; GFX12-NEXT: s_and_b32 s0, 1, s0 ; GFX12-NEXT: s_and_b32 s1, 1, s1 -; GFX12-NEXT: s_bitcmp1_b32 s0, 0 -; GFX12-NEXT: s_cselect_b32 s0, -1, 0 -; GFX12-NEXT: s_cmp_eq_u32 s1, 1 -; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, s0 -; GFX12-NEXT: s_cselect_b32 s1, -1, 0 +; GFX12-NEXT: s_and_b32 s2, 1, s2 +; GFX12-NEXT: s_bitcmp1_b32 s3, 0 +; GFX12-NEXT: s_cselect_b32 s3, -1, 0 ; GFX12-NEXT: s_cmp_eq_u32 s2, 1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, -1.0, s1 +; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s3 ; GFX12-NEXT: s_cselect_b32 s2, -1, 0 -; GFX12-NEXT: s_cmp_eq_u32 s3, 1 -; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s2 -; GFX12-NEXT: s_cselect_b32 s3, -1, 0 +; GFX12-NEXT: s_cmp_eq_u32 s1, 1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, -1.0, s2 +; GFX12-NEXT: s_cselect_b32 s1, -1, 0 +; GFX12-NEXT: s_cmp_eq_u32 s0, 1 +; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, s1 +; GFX12-NEXT: s_cselect_b32 s0, -1, 0 ; GFX12-NEXT: v_bfe_u32 v7, v4, 16, 1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s3 -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 ; GFX12-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v4 -; GFX12-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v5, v5, v1 ; GFX12-NEXT: v_add_nc_u32_e32 v6, v6, v3 +; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX12-NEXT: v_add_nc_u32_e32 v5, v5, v1 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v3 -; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_nc_u32_e32 v2, v2, v0 ; GFX12-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 -; GFX12-NEXT: v_dual_cndmask_b32 v1, v5, v9 :: v_dual_add_nc_u32 v2, v2, v0 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX12-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v9, vcc_lo +; GFX12-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX12-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v3, v6, v11, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v4, v7, v10, vcc_lo -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo -; GFX12-NEXT: v_ashrrev_i32_e32 v2, 16, v3 -; GFX12-NEXT: v_ashrrev_i32_e32 v3, 16, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-NEXT: v_ashrrev_i32_e32 v2, 16, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_readfirstlane_b32 s1, v3 -; GFX12-NEXT: v_readfirstlane_b32 s3, v0 +; GFX12-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-NEXT: v_ashrrev_i32_e32 v3, 16, v3 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s2, v3 +; GFX12-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: ; return to shader part epilog %op = sitofp <4 x i1> %num to <4 x bfloat> diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll index 3aa5ea995559f..dfde10329fe80 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll @@ -11,12 +11,18 @@ define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr inreg %p) { ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]] + ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY2]], implicit $exec + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1234 - ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 5678 - ; CHECK-NEXT: $sgpr0 = COPY [[COPY1]] - ; CHECK-NEXT: $sgpr1 = COPY [[S_AND_B32_]] - ; CHECK-NEXT: $sgpr2 = COPY [[S_MOV_B32_1]] - ; CHECK-NEXT: $sgpr3 = COPY [[S_MOV_B32_2]] + ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_1]] + ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 5678 + ; CHECK-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_3]] + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK-NEXT: $sgpr2 = COPY [[S_MOV_B32_2]] + ; CHECK-NEXT: $sgpr3 = COPY [[S_MOV_B32_4]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 0, i32 1234, i32 5678) ret ptr addrspace(8) %rsrc @@ -52,12 +58,18 @@ define amdgpu_ps ptr addrspace(8) @basic_struct_buffer(ptr inreg %p) { ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], killed [[S_MOV_B32_]], implicit-def dead $scc ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 262144 ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]], implicit-def dead $scc + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY2]], implicit $exec + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 1234 - ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 5678 - ; CHECK-NEXT: $sgpr0 = COPY [[COPY1]] - ; CHECK-NEXT: $sgpr1 = COPY [[S_OR_B32_]] - ; CHECK-NEXT: $sgpr2 = COPY [[S_MOV_B32_2]] - ; CHECK-NEXT: $sgpr3 = COPY [[S_MOV_B32_3]] + ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_2]] + ; CHECK-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 5678 + ; CHECK-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_4]] + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK-NEXT: $sgpr2 = COPY [[S_MOV_B32_3]] + ; CHECK-NEXT: $sgpr3 = COPY [[S_MOV_B32_5]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 4, i32 1234, i32 5678) ret ptr addrspace(8) %rsrc @@ -76,10 +88,18 @@ define amdgpu_ps ptr addrspace(8) @variable_top_half(ptr inreg %p, i32 inreg %nu ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY2]], killed [[S_MOV_B32_]], implicit-def dead $scc ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 262144 ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]], implicit-def dead $scc - ; CHECK-NEXT: $sgpr0 = COPY [[COPY3]] - ; CHECK-NEXT: $sgpr1 = COPY [[S_OR_B32_]] - ; CHECK-NEXT: $sgpr2 = COPY [[COPY1]] - ; CHECK-NEXT: $sgpr3 = COPY [[COPY]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] + ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 4, i32 %numVals, i32 %flags) ret ptr addrspace(8) %rsrc @@ -99,10 +119,18 @@ define amdgpu_ps ptr addrspace(8) @general_case(ptr inreg %p, i16 inreg %stride, ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY3]], killed [[S_MOV_B32_]], implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_AND_B32_]], killed [[S_LSHL_B32_]], implicit-def dead $scc - ; CHECK-NEXT: $sgpr0 = COPY [[COPY4]] - ; CHECK-NEXT: $sgpr1 = COPY [[S_OR_B32_]] - ; CHECK-NEXT: $sgpr2 = COPY [[COPY1]] - ; CHECK-NEXT: $sgpr3 = COPY [[COPY]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) ret ptr addrspace(8) %rsrc diff --git a/llvm/test/CodeGen/AMDGPU/ptrmask.ll b/llvm/test/CodeGen/AMDGPU/ptrmask.ll index 8594549318dda..9ad9c80d82ff3 100644 --- a/llvm/test/CodeGen/AMDGPU/ptrmask.ll +++ b/llvm/test/CodeGen/AMDGPU/ptrmask.ll @@ -186,8 +186,10 @@ define ptr addrspace(8) @v_ptrmask_buffer_resource_variable_i128_neg8(ptr addrsp define amdgpu_ps ptr addrspace(8) @s_ptrmask_buffer_resource_variable_i128(ptr addrspace(8) inreg %ptr, i128 inreg %mask) { ; GCN-LABEL: s_ptrmask_buffer_resource_variable_i128: ; GCN: ; %bb.0: +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] ; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[6:7] -; GCN-NEXT: s_and_b64 s[2:3], s[4:5], s[8:9] +; GCN-NEXT: s_mov_b32 s2, s4 +; GCN-NEXT: s_mov_b32 s3, s5 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_ptrmask_buffer_resource_variable_i128: 
diff --git a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
index 695d5225421de..482f78889ff4e 100644
--- a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O0 -verify-machineinstrs < %s -debug-only=isel 2>&1 | FileCheck --check-prefixes=GCN,GCN-DEFAULT %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O0 -verify-machineinstrs < %s -debug-only=isel -dag-dump-verbose 2>&1 | FileCheck --check-prefixes=GCN,GCN-VERBOSE %s
@@ -5,23 +6,25 @@
 ; GCN-LABEL: === test_sdag_dump
 ; GCN: Initial selection DAG: %bb.0 'test_sdag_dump:entry'
-; GCN: SelectionDAG has 10 nodes:
+; GCN: SelectionDAG has 11 nodes:
 ; GCN-DEFAULT: t0: ch,glue = EntryToken
 ; GCN-DEFAULT: t2: f32,ch = CopyFromReg t0, Register:f32 %0
+; GCN-DEFAULT: t7: i32 = TargetConstant<3222>
 ; GCN-DEFAULT: t5: f32 = fadd t2, t2
 ; GCN-DEFAULT: t4: f32,ch = CopyFromReg # D:1 t0, Register:f32 %1
-; GCN-DEFAULT: t6: f32 = fadd # D:1 t5, t4
-; GCN-DEFAULT: t8: ch,glue = CopyToReg # D:1 t0, Register:f32 $vgpr0, t6
-; GCN-DEFAULT: t9: ch = RETURN_TO_EPILOG t8, Register:f32 $vgpr0, t8:1
+; GCN-DEFAULT: t6: f32 = fadd # D:1 t5, t4
+; GCN-DEFAULT: t9: ch,glue = CopyToReg # D:1 t0, Register:f32 $vgpr0, t6
+; GCN-DEFAULT: t10: ch = RETURN_TO_EPILOG t9, Register:f32 $vgpr0, t9:1
 ; GCN-VERBOSE: t0: ch,glue = EntryToken # D:0
 ; GCN-VERBOSE: t2: f32,ch = CopyFromReg [ORD=1] # D:0 t0, Register:f32 %0 # D:0
+; GCN-VERBOSE: t7: i32 = TargetConstant<3222>
 ; GCN-VERBOSE: t5: f32 = fadd [ORD=2] # D:0 t2, t2
 ; GCN-VERBOSE: t4: f32,ch = CopyFromReg [ORD=1] # D:1 t0, Register:f32 %1 # D:0
 ; GCN-VERBOSE: t6: f32 = fadd [ORD=3] # D:1 t5, t4
-; GCN-VERBOSE: t8: ch,glue = CopyToReg [ORD=4] # D:1 t0, Register:f32 $vgpr0 # D:0, t6
-; GCN-VERBOSE: t9: ch = RETURN_TO_EPILOG [ORD=4] # D:0 t8, Register:f32 $vgpr0 # D:0, t8:1
+; GCN-VERBOSE: t9: ch,glue = CopyToReg [ORD=4] # D:1 t0, Register:f32 $vgpr0 # D:0, t6
+; GCN-VERBOSE: t10: ch = RETURN_TO_EPILOG [ORD=4] # D:0 t9, Register:f32 $vgpr0 # D:0, t9:1
 define amdgpu_ps float @test_sdag_dump(float inreg %scalar, float %vector) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll b/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll
index f52f1164f2ba2..6e4391a5ecaab 100644
--- a/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll
@@ -148,8 +148,8 @@ define <2 x i64> @v_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) {
 define amdgpu_ps <2 x i64> @s_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
 ; GFX9-LABEL: s_sub_v2i64_splat_const_low_bits_known0_0:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_i32 s1, s1, -1
 ; GFX9-NEXT: s_add_i32 s3, s3, -1
+; GFX9-NEXT: s_add_i32 s1, s1, -1
 ; GFX9-NEXT: ; return to shader part epilog
 %sub = sub <2 x i64> %reg, ; (1 << 32)
 ret <2 x i64> %sub
@@ -158,8 +158,8 @@ define amdgpu_ps <2 x i64> @s_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64>
 define amdgpu_ps <2 x i64> @s_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
 ; GFX9-LABEL: s_sub_v2i64_nonsplat_const_low_bits_known0_0:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_i32 s1, s1, -1
 ; GFX9-NEXT: s_add_i32 s3, s3, -2
+; GFX9-NEXT: s_add_i32 s1, s1, -1
 ; GFX9-NEXT: ; return to shader part epilog
 %sub = sub <2 x i64> %reg, ; (1 << 32), (1 << 33)
 ret <2 x i64> %sub
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll b/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll
new file mode 100644
index 0000000000000..5476c26e39ba9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefixes=GFX11
+
+define amdgpu_ps i32 @uniform_v_to_s_i32(float inreg %a, float inreg %b) {
+; GFX11-LABEL: uniform_v_to_s_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_max_f32_e64 v0, s0, s1
+; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %cast = bitcast float %max0 to i32
+ ret i32 %cast
+}
+
+define amdgpu_ps i64 @uniform_v_to_s_i64(double inreg %a, double inreg %b) {
+; GFX11-LABEL: uniform_v_to_s_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_max_f64 v[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %max0 = call double @llvm.maximum.f64(double %a, double %b)
+ %cast = bitcast double %max0 to i64
+ ret i64 %cast
+}
+
+define amdgpu_ps <2 x i32> @uniform_v_to_s_2_i32(<2 x float> inreg %a, <2 x float> inreg %b) {
+; GFX11-LABEL: uniform_v_to_s_2_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_max_f32_e64 v0, s0, s2
+; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s0, s2
+; GFX11-NEXT: v_max_f32_e64 v1, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s1, s3
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v1, vcc_lo
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: ; return to shader part epilog
+ %max0 = call <2 x float> @llvm.maximum.f32(<2 x float> %a, <2 x float> %b)
+ %cast = bitcast <2 x float> %max0 to <2 x i32>
+ ret <2 x i32> %cast
+}
+
+define amdgpu_ps ptr @uniform_v_to_s_ptr(ptr inreg %x) {
+; GFX11-LABEL: uniform_v_to_s_ptr:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: flat_load_b32 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_f32_e32 v1, 1.0, v0
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v1, vcc_lo
+; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %val = load float, ptr %x, align 4
+ %max = call float @llvm.maximum.f32(float %val, float 1.0)
+ %int = fptoui float %max to i32
+ %ptr = inttoptr i32 %int to ptr
+ ret ptr %ptr
+}
+
+define amdgpu_ps double @uniform_v_to_s_double(double inreg %a, double inreg %b) {
+; GFX11-LABEL: uniform_v_to_s_double:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_max_f64 v[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %max0 = call double @llvm.maximum.f64(double %a, double %b)
+ ret double %max0
+}
+
+define amdgpu_ps <2 x i16> @uniform_v_to_s_2_i16(float inreg %a, float inreg %b) {
+; GFX11-LABEL: uniform_v_to_s_2_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_max_f32_e64 v0, s0, s1
+; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %cast = bitcast float %max0 to <2 x i16>
+ ret <2 x i16> %cast
+}
+
+define amdgpu_ps i16 @uniform_v_to_s_i16(half inreg %a, half inreg %b) {
+; GFX11-LABEL: uniform_v_to_s_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_max_f16_e64 v0, s0, s1
+; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %max = call half @llvm.maximum.f16(half %a, half %b)
+ %cast = bitcast half %max to i16
+ ret i16 %cast
+}
+
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
index 2202b6446fd15..f590324f1120d 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
@@ -7,16 +7,16 @@ define i64 @i64_test(i64 %i) nounwind readnone {
 ; CHECK-NEXT: t0: ch,glue = EntryToken
 ; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8
 ; CHECK-NEXT: t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %9
-; CHECK-NEXT: t49: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<60>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11>
-; CHECK-NEXT: t26: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
-; CHECK-NEXT: t29: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<4>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
-; CHECK-NEXT: t32: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<60>, t26, TargetConstant:i32<3>, t29, TargetConstant:i32<11>
-; CHECK-NEXT: t10: i64 = V_ADD_U64_PSEUDO # D:1 t49, t32
-; CHECK-NEXT: t23: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<3>
-; CHECK-NEXT: t16: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t23
-; CHECK-NEXT: t38: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<11>
-; CHECK-NEXT: t18: ch,glue = CopyToReg # D:1 t16, Register:i32 $vgpr1, t38, t16:1
-; CHECK-NEXT: t19: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t18, t18:1
+; CHECK-NEXT: t50: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<60>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11>
+; CHECK-NEXT: t27: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
+; CHECK-NEXT: t30: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<4>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
+; CHECK-NEXT: t33: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<60>, t27, TargetConstant:i32<3>, t30, TargetConstant:i32<11>
+; CHECK-NEXT: t10: i64 = V_ADD_U64_PSEUDO # D:1 t50, t33
+; CHECK-NEXT: t24: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<3>
+; CHECK-NEXT: t17: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t24
+; CHECK-NEXT: t39: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<11>
+; CHECK-NEXT: t19: ch,glue = CopyToReg # D:1 t17, Register:i32 $vgpr1, t39, t17:1
+; CHECK-NEXT: t20: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t19, t19:1
 ; CHECK-EMPTY:
 %loc = alloca i64, addrspace(5)
 %j = load i64, ptr addrspace(5) %loc
@@ -31,10 +31,10 @@ define i64 @i32_test(i32 %i) nounwind readnone {
 ; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8
 ; CHECK-NEXT: t6: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
 ; CHECK-NEXT: t7: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t6, TargetConstant:i1<0>
-; CHECK-NEXT: t14: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t7
-; CHECK-NEXT: t22: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
-; CHECK-NEXT: t16: ch,glue = CopyToReg t14, Register:i32 $vgpr1, t22, t14:1
-; CHECK-NEXT: t17: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t16, t16:1
+; CHECK-NEXT: t15: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t7
+; CHECK-NEXT: t23: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
+; CHECK-NEXT: t17: ch,glue = CopyToReg t15, Register:i32 $vgpr1, t23, t15:1
+; CHECK-NEXT: t18: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
 ; CHECK-EMPTY:
 %loc = alloca i32, addrspace(5)
 %j = load i32, ptr addrspace(5) %loc
@@ -48,14 +48,14 @@ define i64 @i16_test(i16 %i) nounwind readnone {
 ; CHECK: SelectionDAG has 18 nodes:
 ; CHECK-NEXT: t0: ch,glue = EntryToken
 ; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8
-; CHECK-NEXT: t19: i32,ch = BUFFER_LOAD_USHORT_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
-; CHECK-NEXT: t20: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t19, TargetConstant:i1<0>
-; CHECK-NEXT: t24: i32 = S_MOV_B32 TargetConstant:i32<65535>
-; CHECK-NEXT: t25: i32 = V_AND_B32_e64 # D:1 t20, t24
-; CHECK-NEXT: t15: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t25
-; CHECK-NEXT: t31: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
-; CHECK-NEXT: t17: ch,glue = CopyToReg t15, Register:i32 $vgpr1, t31, t15:1
-; CHECK-NEXT: t18: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
+; CHECK-NEXT: t20: i32,ch = BUFFER_LOAD_USHORT_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
+; CHECK-NEXT: t21: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t20, TargetConstant:i1<0>
+; CHECK-NEXT: t25: i32 = S_MOV_B32 TargetConstant:i32<65535>
+; CHECK-NEXT: t26: i32 = V_AND_B32_e64 # D:1 t21, t25
+; CHECK-NEXT: t16: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t26
+; CHECK-NEXT: t32: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
+; CHECK-NEXT: t18: ch,glue = CopyToReg t16, Register:i32 $vgpr1, t32, t16:1
+; CHECK-NEXT: t19: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t18, t18:1
 ; CHECK-EMPTY:
 %loc = alloca i16, addrspace(5)
 %j = load i16, ptr addrspace(5) %loc
@@ -69,14 +69,14 @@ define i64 @i8_test(i8 %i) nounwind readnone {
 ; CHECK: SelectionDAG has 18 nodes:
 ; CHECK-NEXT: t0: ch,glue = EntryToken
 ; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8
-; CHECK-NEXT: t19: i32,ch = BUFFER_LOAD_UBYTE_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
-; CHECK-NEXT: t20: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t19, TargetConstant:i1<0>
-; CHECK-NEXT: t24: i32 = S_MOV_B32 TargetConstant:i32<255>
-; CHECK-NEXT: t25: i32 = V_AND_B32_e64 # D:1 t20, t24
-; CHECK-NEXT: t15: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t25
-; CHECK-NEXT: t31: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
-; CHECK-NEXT: t17: ch,glue = CopyToReg t15, Register:i32 $vgpr1, t31, t15:1
-; CHECK-NEXT: t18: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
+; CHECK-NEXT: t20: i32,ch = BUFFER_LOAD_UBYTE_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
+; CHECK-NEXT: t21: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t20, TargetConstant:i1<0>
+; CHECK-NEXT: t25: i32 = S_MOV_B32 TargetConstant:i32<255>
+; CHECK-NEXT: t26: i32 = V_AND_B32_e64 # D:1 t21, t25
+; CHECK-NEXT: t16: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t26
+; CHECK-NEXT: t32: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
+; CHECK-NEXT: t18: ch,glue = CopyToReg t16, Register:i32 $vgpr1, t32, t16:1
+; CHECK-NEXT: t19: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t18, t18:1
 ; CHECK-EMPTY:
 %loc = alloca i8, addrspace(5)
 %j = load i8, ptr addrspace(5) %loc