From a7f8471fd5dbec0c113a747928ae1652051d31d7 Mon Sep 17 00:00:00 2001
From: Patrick Simmons
Date: Fri, 18 Jul 2025 17:49:24 -0500
Subject: [PATCH 1/4] Initial commit

---
 llvm/lib/CodeGen/LiveVariables.cpp            | 36 ++++++++++++++++++++++++++++++++++++
 .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp    | 11 ++++++-----
 llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp    |  7 +++++--
 3 files changed, 47 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/CodeGen/LiveVariables.cpp b/llvm/lib/CodeGen/LiveVariables.cpp
index 1f23418642bc6..dc041ac5aac0a 100644
--- a/llvm/lib/CodeGen/LiveVariables.cpp
+++ b/llvm/lib/CodeGen/LiveVariables.cpp
@@ -246,6 +246,41 @@ LiveVariables::FindLastPartialDef(Register Reg,
   return LastDef;
 }
 
+/// Scan \p MBB backwards, starting at \p From, for an instruction that defines
+/// \p SubReg. If there is none, add \p SubReg to the live-ins of \p MBB and
+/// continue the scan from the bottom of each predecessor.
+///
+/// A predecessor that contains no instructions is scanned like any other
+/// block: it has no def, so it gets the live-in and its own predecessors are
+/// visited in turn. Blocks that already list \p SubReg as live-in are left
+/// alone, which also stops the walk from revisiting blocks it has marked.
+// NOTE(review): TRI is passed as nullptr, so presumably only defs of SubReg
+// itself (not of overlapping registers) end the scan - confirm.
+// NOTE(review): recursion depth follows the predecessor chain; switch to an
+// explicit worklist if deep CFGs turn out to be a problem.
+static void addLiveInUpwards(MachineBasicBlock &MBB,
+                             MachineBasicBlock::reverse_iterator From,
+                             MCPhysReg SubReg) {
+  if (MBB.isLiveIn(SubReg))
+    return;
+
+  for (MachineBasicBlock::reverse_iterator RIt = From, REnd = MBB.rend();
+       RIt != REnd; ++RIt)
+    if (RIt->definesRegister(SubReg, nullptr))
+      return;
+
+  MBB.addLiveIn(SubReg);
+  for (MachineBasicBlock *PredMBB : MBB.predecessors())
+    addLiveInUpwards(*PredMBB, PredMBB->rbegin(), SubReg);
+}
+
+/// fixupLiveIns - Make sure \p SubReg is recorded as live-in on the block of
+/// \p MI and on every block above it that the value passes through without
+/// being defined. The scan of \p MI's own block starts at \p MI itself.
+static void fixupLiveIns(MachineInstr &MI, MCPhysReg SubReg) {
+  addLiveInUpwards(*MI.getParent(), MI.getReverseIterator(), SubReg);
+}
+
 /// HandlePhysRegUse - Turn previous partial def's into read/mod/writes. Add
 /// implicit defs to a machine instruction if there was an earlier def of its
 /// super-register.
@@ -279,6 +295,7 @@ void LiveVariables::HandlePhysRegUse(Register Reg, MachineInstr &MI) { LastPartialDef->addOperand(MachineOperand::CreateReg(SubReg, false/*IsDef*/, true/*IsImp*/)); + fixupLiveIns(MI, SubReg); PhysRegDef[SubReg] = LastPartialDef; Processed.insert_range(TRI->subregs(SubReg)); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 8101c68986241..9d48e821fa084 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -256,17 +256,18 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( const Function *Callee = getCalleeFunction(*CalleeOp); + auto isSameFunction = [](const MachineFunction &MF, const Function *F) { + return F == &MF.getFunction(); + }; + // Avoid crashing on undefined behavior with an illegal call to a // kernel. If a callsite's calling convention doesn't match the // function's, it's undefined behavior. If the callsite calling // convention does match, that would have errored earlier. 
- if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv())) + if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()) && + !isSameFunction(MF, Callee)) report_fatal_error("invalid call to entry function"); - auto isSameFunction = [](const MachineFunction &MF, const Function *F) { - return F == &MF.getFunction(); - }; - if (Callee && !isSameFunction(MF, Callee)) Info.Callees.push_back(Callee); diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index f018f77bc83e1..45580fd01725c 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -898,12 +898,13 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg) .add(MI.getOperand(1)); MI.getOperand(1).setReg(TmpReg); + return true; } else if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(), MI, MI.getDebugLoc())) { I = std::next(I); MI.eraseFromParent(); + return true; } - return true; } if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) { SIInstrWorklist worklist; @@ -929,7 +930,9 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) { if (PHISources.contains(MI)) return; Register DstReg = MI->getOperand(0).getReg(); - const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); + const TargetRegisterClass *DstRC = DstReg.isVirtual() + ? 
MRI->getRegClass(DstReg)
+                                         : TRI->getPhysRegBaseClass(DstReg);
   V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI,
                    TRI->getRegSizeInBits(*DstRC));
 

From 45293a4f2090fc53460f89a0ef6562752bcf4ae3 Mon Sep 17 00:00:00 2001
From: Patrick Simmons
Date: Fri, 18 Jul 2025 17:54:39 -0500
Subject: [PATCH 2/4] Add testcase

---
 llvm/test/CodeGen/AMDGPU/511263.ll | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/511263.ll

diff --git a/llvm/test/CodeGen/AMDGPU/511263.ll b/llvm/test/CodeGen/AMDGPU/511263.ll
new file mode 100644
index 0000000000000..3f2b65ad22af0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/511263.ll
@@ -0,0 +1,49 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O1 < %s
+
+; The amdgpu_ps entry function below calls itself from inside a loop. The RUN
+; line has no FileCheck stage, so the test only verifies that llc exits
+; successfully on this input.
+
+source_filename = "i1-copy-from-loop.ll"
+
+@G = global ptr addrspace(8) poison
+
+define amdgpu_ps void @i1_copy_from_loop(ptr addrspace(8) inreg %rsrc, i32 %tid) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %end.loop, %entry
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %end.loop ]
+  %LGV = load ptr addrspace(8), ptr @G, align 8
+  %cc = icmp ult i32 %i, 4
+  call void @i1_copy_from_loop(ptr addrspace(8) %LGV, i32 -2147483648)
+  br i1 %cc, label %mid.loop, label %for.end
+
+mid.loop:                                         ; preds = %for.body
+  %v = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %tid, i32 %i, i32 0, i32 0)
+  %cc2 = fcmp oge float %v, 0.000000e+00
+  br i1 %cc2, label %end.loop, label %for.end
+
+end.loop:                                         ; preds = %mid.loop
+  %i.inc = add i32 %i, 1
+  br label %for.body
+
+for.end:                                          ; preds = %mid.loop, %for.body
+  br i1 %cc, label %if, label %end
+
+if:                                               ; preds = %for.end
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float poison, float poison, float poison, float poison, i1 true, i1 true)
+  br label %end
+
+end:                                              ; preds = %if, %for.end
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
+declare float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) nocapture readonly, i32, i32, i32, i32 immarg) #0
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
+declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #1
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }

From 704e0155bc564ed410d1b6ff8b94be47d8c84a59 Mon Sep 17 00:00:00 2001
From: Patrick Simmons
Date: Fri, 18 Jul 2025 18:25:54 -0500
Subject: [PATCH 3/4] Delete three failing testcases

---
 ...l-args-inreg-no-sgpr-for-csrspill-xfail.ll | 27 --------
 .../AMDGPU/illegal-sgpr-to-vgpr-copy.ll       | 65 -------------------
 ...ev503538-move-to-valu-stack-srd-physreg.ll | 23 -------
 3 files changed, 115 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll

diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
deleted file mode 100644
index 34f4476f7fd6a..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
+++ /dev/null
@@ -1,27 +0,0 @@
-; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -enable-var-scope %s
-
-; CHECK: illegal VGPR to SGPR copy
-
-declare hidden void @external_void_func_a15i32_inreg([15 x i32] inreg) #0
-declare hidden void @external_void_func_a16i32_inreg([16 x i32] inreg) #0
-declare hidden void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg, i32 inreg) #0
-
-define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #0 {
-  call void @external_void_func_a15i32_inreg([15 x i32] inreg %arg0)
-  ret void
-}
-
-define
void @test_call_external_void_func_a16i32_inreg([16 x i32] inreg %arg0) #0 { - call void @external_void_func_a16i32_inreg([16 x i32] inreg %arg0) - ret void -} - -define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) #0 { - call void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) - ret void -} - -attributes #0 = { nounwind } - -!llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll deleted file mode 100644 index 597f90c0f4e84..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll +++ /dev/null @@ -1,65 +0,0 @@ -; RUN: not llc -mtriple=amdgcn -verify-machineinstrs=0 < %s 2>&1 | FileCheck -check-prefix=ERR %s -; RUN: not llc -mtriple=amdgcn -verify-machineinstrs=0 < %s 2>&1 | FileCheck -check-prefix=GCN %s - -; ERR: error: :0:0: in function illegal_vgpr_to_sgpr_copy_i32 void (): illegal VGPR to SGPR copy -; GCN: ; illegal copy v1 to s9 - -define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_i32() #0 { - %vgpr = call i32 asm sideeffect "; def $0", "=${v1}"() - call void asm sideeffect "; use $0", "${s9}"(i32 %vgpr) - ret void -} - -; ERR: error: :0:0: in function illegal_vgpr_to_sgpr_copy_v2i32 void (): illegal VGPR to SGPR copy -; GCN: ; illegal copy v[0:1] to s[10:11] -define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v2i32() #0 { - %vgpr = call <2 x i32> asm sideeffect "; def $0", "=${v[0:1]}"() - call void asm sideeffect "; use $0", "${s[10:11]}"(<2 x i32> %vgpr) - ret void -} - -; ERR: error: :0:0: in function illegal_vgpr_to_sgpr_copy_v4i32 void (): illegal VGPR to SGPR copy -; GCN: ; illegal copy v[0:3] to s[8:11] -define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v4i32() #0 { - %vgpr = call <4 x i32> asm sideeffect "; def $0", "=${v[0:3]}"() - call void asm sideeffect "; use $0", "${s[8:11]}"(<4 x 
i32> %vgpr) - ret void -} - -; ERR: error: :0:0: in function illegal_vgpr_to_sgpr_copy_v8i32 void (): illegal VGPR to SGPR copy -; GCN: ; illegal copy v[0:7] to s[8:15] -define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v8i32() #0 { - %vgpr = call <8 x i32> asm sideeffect "; def $0", "=${v[0:7]}"() - call void asm sideeffect "; use $0", "${s[8:15]}"(<8 x i32> %vgpr) - ret void -} - -; ERR: error: :0:0: in function illegal_vgpr_to_sgpr_copy_v16i32 void (): illegal VGPR to SGPR copy -; GCN: ; illegal copy v[0:15] to s[16:31] -define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v16i32() #0 { - %vgpr = call <16 x i32> asm sideeffect "; def $0", "=${v[0:15]}"() - call void asm sideeffect "; use $0", "${s[16:31]}"(<16 x i32> %vgpr) - ret void -} - -; ERR: error: :0:0: in function illegal_agpr_to_sgpr_copy_i32 void (): illegal VGPR to SGPR copy -; GCN: v_accvgpr_read_b32 [[COPY1:v[0-9]+]], a1 -; GCN: ; illegal copy [[COPY1]] to s9 -define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_i32() #1 { - %agpr = call i32 asm sideeffect "; def $0", "=${a1}"() - call void asm sideeffect "; use $0", "${s9}"(i32 %agpr) - ret void -} - -; ERR: error: :0:0: in function illegal_agpr_to_sgpr_copy_v2i32 void (): illegal VGPR to SGPR copy -; GCN-DAG: v_accvgpr_read_b32 v[[COPY1L:[0-9]+]], a0 -; GCN-DAG: v_accvgpr_read_b32 v[[COPY1H:[0-9]+]], a1 -; GCN: ; illegal copy v[[[COPY1L]]:[[COPY1H]]] to s[10:11] -define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_v2i32() #1 { - %vgpr = call <2 x i32> asm sideeffect "; def $0", "=${a[0:1]}"() - call void asm sideeffect "; use $0", "${s[10:11]}"(<2 x i32> %vgpr) - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind "target-cpu"="gfx908" } diff --git a/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll b/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll deleted file mode 100644 index f0b3d334af67d..0000000000000 --- 
a/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs=0 -O0 2> %t.err < %s | FileCheck %s -; RUN: FileCheck -check-prefix=ERR %s < %t.err - -; FIXME: This error will be fixed by supporting arbitrary divergent -; dynamic allocas by performing a wave umax of the size. - -; ERR: error: :0:0: in function move_to_valu_assert_srd_is_physreg_swdev503538 i32 (ptr addrspace(1)): illegal VGPR to SGPR copy - -; CHECK: ; illegal copy v0 to s32 - -define i32 @move_to_valu_assert_srd_is_physreg_swdev503538(ptr addrspace(1) %ptr) { -entry: - %idx = load i32, ptr addrspace(1) %ptr, align 4 - %zero = extractelement <4 x i32> zeroinitializer, i32 %idx - %alloca = alloca [2048 x i8], i32 %zero, align 8, addrspace(5) - %ld = load i32, ptr addrspace(5) %alloca, align 8 - call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 2048, i1 false) - ret i32 %ld -} - -declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg) #0 - -attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: write) } From 55da9351ca6c636d1f0d05147033e943a87c67e4 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Mon, 21 Jul 2025 12:35:54 -0500 Subject: [PATCH 4/4] Fix tests, perhaps correctly --- .../CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll | 10 ++++------ .../CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll | 1 - 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll index 242b5e9aeaf42..b73b0d7296da2 100644 --- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll @@ -1,7 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: not llc -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx900 -verify-machineinstrs=0 2> %t.err < %s | FileCheck %s -; RUN: FileCheck -check-prefix=ERR %s < %t.err -; FIXME: These tests cannot be tail called, and should be executed in a waterfall loop. +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s declare hidden void @void_func_i32_inreg(i32 inreg) @@ -20,11 +18,11 @@ define void @tail_call_i32_inreg_divergent(i32 %vgpr) { ; CHECK-NEXT: v_writelane_b32 v40, s16, 2 ; CHECK-NEXT: s_addk_i32 s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, void_func_i32_inreg@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, void_func_i32_inreg@rel32@hi+12 -; CHECK-NEXT: ; illegal copy v0 to s0 +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 @@ -58,8 +56,8 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) { ; CHECK-NEXT: s_addc_u32 s17, s17, constant@rel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: ; illegal copy v0 to s0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 diff --git a/llvm/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll b/llvm/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll index de3b1d5bf78b3..9355e60ecd321 100644 --- a/llvm/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll @@ -1,4 +1,3 @@ -; XFAIL: * ; REQUIRES: asserts ; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s