diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index bb8e9a092e07c..6f89a3a207f93 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -1086,10 +1086,25 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
           TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1));
       size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
       if (SrcSize == 16) {
-        // HACK to handle possible 16bit VGPR source
-        auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
-                           TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
-        MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
+        assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
+               "We do not expect to see 16-bit copies from VGPR to SGPR unless "
+               "we have 16-bit VGPRs");
+        assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass ||
+               MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass ||
+               MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass);
+        // There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits
+        MRI->setRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
+        Register VReg32 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        const DebugLoc &DL = MI->getDebugLoc();
+        Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), VReg32)
+            .addReg(SrcReg, 0, SubReg)
+            .addImm(AMDGPU::lo16)
+            .addReg(Undef)
+            .addImm(AMDGPU::hi16);
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+            .addReg(VReg32);
       } else if (SrcSize == 32) {
         auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
                            TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 071f55ce16403..352a3f9c2d27f 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1472,16 +1472,9 @@ def : GCNPat <
 } // End OtherPredicates = [isGFX8Plus, p]
 
-let True16Predicate = UseFakeTrue16Insts in {
-def : GCNPat<
-  (i32 (DivergentUnaryFrag<anyext> i16:$src)),
-  (COPY $src)
->;
-} // End True16Predicate = UseFakeTrue16Insts
-
 let True16Predicate = UseRealTrue16Insts in {
 def : GCNPat<
-  (i32 (UniformUnaryFrag<anyext> (i16 SReg_32:$src))),
+  (i32 (UniformUnaryFrag<anyext> i16:$src)),
   (COPY $src)
 >;
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index 137a9aaea6a77..6e24d9afa2bbc 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -53,3 +53,67 @@ body: |
     %3:sreg_32 = S_OR_B32 %2:sreg_32, %2:sreg_32, implicit-def $scc
     %4:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %3:sreg_32, 0, 0, 0, implicit $mode, implicit $exec
 ...
+
+---
+name: vgpr16_to_spgr32
+body: |
+  ; GCN-LABEL: name: vgpr16_to_spgr32
+  ; GCN: bb.0.entry:
+  ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+  ; GCN-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 killed [[COPY]], 0, 1, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) poison` + 8, align 4, addrspace 3)
+  ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DS_READ2_B32_gfx9_]].sub0
+  ; GCN-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, killed [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+  ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+  ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+  ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]], implicit $exec
+  ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_MOV_B32_]], killed [[V_READFIRSTLANE_B32_]], implicit-def dead $scc
+  ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5
+  ; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]]
+  ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+  ; GCN-NEXT: S_CMP_LG_U32 killed [[S_MUL_I32_]], killed [[S_MOV_B32_2]], implicit-def $scc
+  ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+  ; GCN-NEXT: S_BRANCH %bb.1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_3]]
+  ; GCN-NEXT: $sgpr0 = COPY [[S_MOV_B32_4]]
+  ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+  ; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_5]]
+  ; GCN-NEXT: $sgpr0 = COPY [[S_MOV_B32_6]]
+  ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0
+  bb.0.entry:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%)
+
+    %5:sreg_32 = IMPLICIT_DEF
+    %6:vgpr_32 = COPY %5:sreg_32
+    %4:vreg_64 = DS_READ2_B32_gfx9 killed %6:vgpr_32, 0, 1, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) poison` + 8, align 4, addrspace 3)
+    %7:sgpr_32 = COPY %4.sub0:vreg_64
+    %8:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, killed %7:sgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+    %9:sreg_32 = S_MOV_B32 65535
+    %11:sreg_32 = COPY %8:vgpr_16
+    %10:sreg_32 = S_AND_B32 killed %9:sreg_32, killed %11:sreg_32, implicit-def dead $scc
+    %12:sreg_32 = S_MOV_B32 5
+    %13:sreg_32 = S_MUL_I32 killed %10:sreg_32, killed %12:sreg_32
+    %14:sreg_32 = S_MOV_B32 2
+    S_CMP_LG_U32 killed %13:sreg_32, killed %14:sreg_32, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.2, implicit $scc
+    S_BRANCH %bb.1
+  bb.1:
+    %17:sreg_32 = S_MOV_B32 1
+    %18:sreg_32 = S_MOV_B32 killed %17:sreg_32
+    $sgpr0 = COPY %18:sreg_32
+    SI_RETURN_TO_EPILOG $sgpr0
+  bb.2:
+    %15:sreg_32 = S_MOV_B32 2
+    %16:sreg_32 = S_MOV_B32 killed %15:sreg_32
+    $sgpr0 = COPY %16:sreg_32
+    SI_RETURN_TO_EPILOG $sgpr0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll
new file mode 100644
index 0000000000000..5df61f19033a3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s
+
+@lds = external local_unnamed_addr addrspace(3) global [4 x float], align 4
+
+; expect readfirstlane to pick the 32bit register
+define amdgpu_gs i32 @vgpr16_copyto_sgpr() {
+; CHECK-LABEL: vgpr16_copyto_sgpr:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    v_mov_b32_e32 v0, lds@abs32@lo
+; CHECK-NEXT:    ds_load_2addr_b32 v[0:1], v0 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    s_and_b32 s0, 0xffff, s0
+; CHECK-NEXT:    s_mul_i32 s0, s0, 5
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 2
+; CHECK-NEXT:    s_cbranch_scc1 .LBB0_2
+; CHECK-NEXT:  ; %bb.1: ; %a1
+; CHECK-NEXT:    s_mov_b32 s0, 1
+; CHECK-NEXT:    s_branch .LBB0_3
+; CHECK-NEXT:  .LBB0_2: ; %a2
+; CHECK-NEXT:    s_mov_b32 s0, 2
+; CHECK-NEXT:    s_branch .LBB0_3
+; CHECK-NEXT:  .LBB0_3:
+entry:
+  %ptr = load <4 x float>, ptr addrspace(3) @lds, align 4
+  %f = extractelement <4 x float> %ptr, i32 0
+  %half = fptrunc float %f to half
+  %i16 = bitcast half %half to i16
+  %i32 = zext i16 %i16 to i32
+  %add = add i32 %i32, 1
+  %mul = mul i32 %add, 5
+  %icmp = icmp eq i32 %mul, 7
+  br i1 %icmp, label %a1, label %a2
+
+a1:
+  ret i32 1
+
+a2:
+  ret i32 2
+}