From 2973ed7061c41a33ba75b68a047e3b51a7b953c9 Mon Sep 17 00:00:00 2001 From: guochen2 Date: Tue, 29 Apr 2025 13:28:11 -0400 Subject: [PATCH 1/3] fix vgpr16 copy to sgpr32 --- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 20 ++++-- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 9 +-- .../AMDGPU/fix-sgpr-copies-f16-true16.mir | 63 +++++++++++++++++++ .../fix-sgpr-copies-vgpr16-to-spgr32.ll | 41 ++++++++++++ 4 files changed, 121 insertions(+), 12 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index bb8e9a092e07c..caa5c7a599b8b 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -1086,10 +1086,22 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1)); size_t SrcSize = TRI->getRegSizeInBits(*SrcRC); if (SrcSize == 16) { - // HACK to handle possible 16bit VGPR source - auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), - TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg); - MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister); + assert(MF.getSubtarget().useRealTrue16Insts() && + "We do not expect to see 16-bit copies from VGPR to SGPR unless " + "we have 16-bit VGPRs"); + assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass || + MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass || + MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass); + // There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits + MRI->setRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass); + Register VReg32 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + const DebugLoc &DL = MI->getDebugLoc(); + BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::SUBREG_TO_REG), VReg32) + .addImm(0) + .addReg(SrcReg, 0) + .addImm(AMDGPU::lo16); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg) + .addReg(VReg32); } else 
if (SrcSize == 32) { auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg); diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 071f55ce16403..352a3f9c2d27f 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1472,16 +1472,9 @@ def : GCNPat < } // End OtherPredicates = [isGFX8Plus, p] -let True16Predicate = UseFakeTrue16Insts in { -def : GCNPat< - (i32 (DivergentUnaryFrag i16:$src)), - (COPY $src) ->; -} // End True16Predicate = UseFakeTrue16Insts - let True16Predicate = UseRealTrue16Insts in { def : GCNPat< - (i32 (UniformUnaryFrag (i16 SReg_32:$src))), + (i32 (UniformUnaryFrag i16:$src)), (COPY $src) >; diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir index 137a9aaea6a77..28f498e1e0518 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir @@ -53,3 +53,66 @@ body: | %3:sreg_32 = S_OR_B32 %2:sreg_32, %2:sreg_32, implicit-def $scc %4:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %3:sreg_32, 0, 0, 0, implicit $mode, implicit $exec ... 
+ +--- +name: vgpr16_to_spgr32 +body: | + ; GCN-LABEL: name: vgpr16_to_spgr32 + ; GCN: bb.0.entry: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; GCN-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 killed [[COPY]], 0, 1, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) poison` + 8, align 4, addrspace 3) + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DS_READ2_B32_gfx9_]].sub0 + ; GCN-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, killed [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16 + ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[SUBREG_TO_REG]], implicit $exec + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_MOV_B32_]], killed [[V_READFIRSTLANE_B32_]], implicit-def dead $scc + ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5 + ; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]] + ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2 + ; GCN-NEXT: S_CMP_LG_U32 killed [[S_MUL_I32_]], killed [[S_MOV_B32_2]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_3]] + ; GCN-NEXT: $sgpr0 = COPY [[S_MOV_B32_4]] + ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 2 + ; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_5]] + ; GCN-NEXT: $sgpr0 = COPY 
[[S_MOV_B32_6]] + ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0 + bb.0.entry: + successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%) + + %5:sreg_32 = IMPLICIT_DEF + %6:vgpr_32 = COPY %5:sreg_32 + %4:vreg_64 = DS_READ2_B32_gfx9 killed %6:vgpr_32, 0, 1, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) poison` + 8, align 4, addrspace 3) + %7:sgpr_32 = COPY %4.sub0:vreg_64 + %8:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, killed %7:sgpr_32, 0, 0, 0, implicit $mode, implicit $exec + %9:sreg_32 = S_MOV_B32 65535 + %11:sreg_32 = COPY %8:vgpr_16 + %10:sreg_32 = S_AND_B32 killed %9:sreg_32, killed %11:sreg_32, implicit-def dead $scc + %12:sreg_32 = S_MOV_B32 5 + %13:sreg_32 = S_MUL_I32 killed %10:sreg_32, killed %12:sreg_32 + %14:sreg_32 = S_MOV_B32 2 + S_CMP_LG_U32 killed %13:sreg_32, killed %14:sreg_32, implicit-def $scc + S_CBRANCH_SCC1 %bb.2, implicit $scc + S_BRANCH %bb.1 + bb.1: + %17:sreg_32 = S_MOV_B32 1 + %18:sreg_32 = S_MOV_B32 killed %17:sreg_32 + $sgpr0 = COPY %18:sreg_32 + SI_RETURN_TO_EPILOG $sgpr0 + bb.2: + %15:sreg_32 = S_MOV_B32 2 + %16:sreg_32 = S_MOV_B32 killed %15:sreg_32 + $sgpr0 = COPY %16:sreg_32 + SI_RETURN_TO_EPILOG $sgpr0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll new file mode 100644 index 0000000000000..0b42274f9553d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s + +; expect readfirstlane to pick the 32bit register +define amdgpu_gs i32 @vgpr16_copyto_sgpr(ptr addrspace(3) %a, i32 %b, ptr addrspace(1) %out) { +; CHECK-LABEL: vgpr16_copyto_sgpr: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: s_and_b32 s0, 0xffff, s0 +; CHECK-NEXT: s_mul_i32 s0, s0, 5 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_cmp_lg_u32 s0, 2 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; %a1 +; CHECK-NEXT: s_mov_b32 s0, 1 +; CHECK-NEXT: s_branch .LBB0_3 +; CHECK-NEXT: .LBB0_2: ; %a2 +; CHECK-NEXT: s_mov_b32 s0, 2 +; CHECK-NEXT: s_branch .LBB0_3 +; CHECK-NEXT: .LBB0_3: +entry: + %1 = load <4 x float>, ptr addrspace(3) poison, align 4 + %2 = extractelement <4 x float> %1, i32 0 + %3 = fptrunc float %2 to half + %4 = bitcast half %3 to i16 + %5 = zext i16 %4 to i32 + %6 = add i32 %5, 1 + %7 = mul i32 %6, 5 + %8 = icmp eq i32 %7, 7 + br i1 %8, label %a1, label %a2 + +a1: + ret i32 1 + +a2: + ret i32 2 +} From 43a7ff511207f344dae10ab72d41e804949ef619 Mon Sep 17 00:00:00 2001 From: guochen2 Date: Mon, 5 May 2025 13:00:49 -0400 Subject: [PATCH 2/3] replace subreg_to_reg with reg_sequence --- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 11 +++++++---- .../CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir | 5 
+++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index caa5c7a599b8b..6f89a3a207f93 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -1096,10 +1096,13 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { MRI->setRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass); Register VReg32 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); const DebugLoc &DL = MI->getDebugLoc(); - BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::SUBREG_TO_REG), VReg32) - .addImm(0) - .addReg(SrcReg, 0) - .addImm(AMDGPU::lo16); + Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_16RegClass); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), VReg32) + .addReg(SrcReg, 0, SubReg) + .addImm(AMDGPU::lo16) + .addReg(Undef) + .addImm(AMDGPU::hi16); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg) .addReg(VReg32); } else if (SrcSize == 32) { diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir index 28f498e1e0518..6e24d9afa2bbc 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir @@ -67,8 +67,9 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DS_READ2_B32_gfx9_]].sub0 ; GCN-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, killed [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16 - ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[SUBREG_TO_REG]], implicit $exec + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GCN-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16 + ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]], implicit $exec ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_MOV_B32_]], killed [[V_READFIRSTLANE_B32_]], implicit-def dead $scc ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5 ; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]] From 7706bea702081bb5fc83aef32765490a80990620 Mon Sep 17 00:00:00 2001 From: guochen2 Date: Mon, 5 May 2025 14:49:08 -0400 Subject: [PATCH 3/3] update test --- .../fix-sgpr-copies-vgpr16-to-spgr32.ll | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll index 0b42274f9553d..5df61f19033a3 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll @@ -1,10 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s +@lds = external local_unnamed_addr addrspace(3) global [4 x float], align 4 + ; expect readfirstlane to pick the 32bit register -define amdgpu_gs i32 @vgpr16_copyto_sgpr(ptr addrspace(3) %a, i32 %b, ptr addrspace(1) %out) { +define amdgpu_gs i32 @vgpr16_copyto_sgpr() { ; CHECK-LABEL: vgpr16_copyto_sgpr: ; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: v_mov_b32_e32 v0, lds@abs32@lo ; CHECK-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cvt_f16_f32_e32 v0.l, v0 @@ -23,15 +26,15 @@ define amdgpu_gs i32 @vgpr16_copyto_sgpr(ptr addrspace(3) %a, i32 %b, ptr addrsp ; CHECK-NEXT: s_branch .LBB0_3 ; CHECK-NEXT: .LBB0_3: entry: - %1 = load <4 x 
float>, ptr addrspace(3) poison, align 4 - %2 = extractelement <4 x float> %1, i32 0 - %3 = fptrunc float %2 to half - %4 = bitcast half %3 to i16 - %5 = zext i16 %4 to i32 - %6 = add i32 %5, 1 - %7 = mul i32 %6, 5 - %8 = icmp eq i32 %7, 7 - br i1 %8, label %a1, label %a2 + %ptr = load <4 x float>, ptr addrspace(3) @lds, align 4 + %f = extractelement <4 x float> %ptr, i32 0 + %half = fptrunc float %f to half + %i16 = bitcast half %half to i16 + %i32 = zext i16 %i16 to i32 + %add = add i32 %i32, 1 + %mul = mul i32 %add, 5 + %icmp = icmp eq i32 %mul, 7 + br i1 %icmp, label %a1, label %a2 a1: ret i32 1