AMDGPU: program the instruction prefetch size field (INST_PREF_SIZE) of
COMPUTE_PGM_RSRC3 on GFX11+ from a lower-bound estimate of the function's
code size, counted in 128-byte lines and clamped to the field width.

The lower-bound mode of SIProgramInfo::getFunctionCodeSize() skips basic-block
alignment padding and inline asm, since neither can be sized reliably.

NOTE(review): the tail of getFunctionCodeSize() is not shown in these hunks.
Confirm that a lower-bound result is not stored into the CodeSizeInBytes cache,
otherwise a later call with IsLowerBound == false would return it.
NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch; verify against the tree before applying.

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index b5e9d3759d608..31e0bd8d652bc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1230,18 +1230,18 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
   ProgInfo.EXCPEnable = 0;
 
-  if (STM.hasGFX90AInsts()) {
-    // return ((Dst & ~Mask) | (Value << Shift))
-    auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
-                          uint32_t Shift) {
-      const auto *Shft = MCConstantExpr::create(Shift, Ctx);
-      const auto *Msk = MCConstantExpr::create(Mask, Ctx);
-      Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
-      Dst = MCBinaryExpr::createOr(
-          Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
-      return Dst;
-    };
+  // return ((Dst & ~Mask) | (Value << Shift))
+  auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
+                        uint32_t Shift) {
+    const auto *Shft = MCConstantExpr::create(Shift, Ctx);
+    const auto *Msk = MCConstantExpr::create(Mask, Ctx);
+    Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
+    Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx),
+                                 Ctx);
+    return Dst;
+  };
 
+  if (STM.hasGFX90AInsts()) {
     ProgInfo.ComputePGMRSrc3 =
         SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
                 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
@@ -1268,6 +1268,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                       ", final occupancy is " + Twine(Occupancy));
     F.getContext().diagnose(Diag);
   }
+
+  if (isGFX11Plus(STM)) {
+    uint32_t CodeSizeInBytes = (uint32_t)std::min(
+        ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */),
+        (uint64_t)std::numeric_limits<uint32_t>::max());
+    uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
+    uint32_t Field, Shift, Width;
+    if (isGFX11(STM)) {
+      Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
+      Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
+      Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
+    } else {
+      Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
+      Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
+      Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
+    }
+    uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
+    ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
+                                       CreateExpr(InstPrefSize), Field, Shift);
+  }
 }
 
 static unsigned getRsrcReg(CallingConv::ID CallConv) {
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 82e84b5fc1640..ef72690b91662 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -202,8 +202,9 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC,
   return MCConstantExpr::create(0, Ctx);
 }
 
-uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
-  if (CodeSizeInBytes.has_value())
+uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF,
+                                            bool IsLowerBound) {
+  if (!IsLowerBound && CodeSizeInBytes.has_value())
     return *CodeSizeInBytes;
 
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
@@ -216,7 +217,8 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
     // overestimated. In case of inline asm used getInstSizeInBytes() will
     // return a maximum size of a single instruction, where the real size may
     // differ. At this point CodeSize may be already off.
-    CodeSize = alignTo(CodeSize, MBB.getAlignment());
+    if (!IsLowerBound)
+      CodeSize = alignTo(CodeSize, MBB.getAlignment());
 
     for (const MachineInstr &MI : MBB) {
       // TODO: CodeSize should account for multiple functions.
@@ -224,6 +226,11 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
       if (MI.isMetaInstruction())
         continue;
 
+      // We cannot properly estimate inline asm size. It can be as small as zero
+      // if that is just a comment.
+      if (IsLowerBound && MI.isInlineAsm())
+        continue;
+
       CodeSize += TII->getInstSizeInBytes(MI);
     }
   }
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index 3f68b0255a375..35c8d58f3c476 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -101,7 +101,10 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
   void reset(const MachineFunction &MF);
 
   // Get function code size and cache the value.
-  uint64_t getFunctionCodeSize(const MachineFunction &MF);
+  // If \p IsLowerBound is set it returns a minimal code size which is safe
+  // to address.
+  uint64_t getFunctionCodeSize(const MachineFunction &MF,
+                               bool IsLowerBound = false);
 
   /// Compute the value of the ComputePGMRsrc1 register.
   const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST,
diff --git a/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
new file mode 100644
index 0000000000000..580167076e1f0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
@@ -0,0 +1,32 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX12 %s
+
+; GCN-LABEL: .amdhsa_kernel large
+; GFX11: .amdhsa_inst_pref_size 3
+; GFX11: codeLenInByte = 3{{[0-9][0-9]$}}
+; GFX12: .amdhsa_inst_pref_size 4
+; GFX12: codeLenInByte = 4{{[0-9][0-9]$}}
+define amdgpu_kernel void @large(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+bb:
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 256, i1 false)
+  ret void
+}
+
+; GCN-LABEL: .amdhsa_kernel small
+; GCN: .amdhsa_inst_pref_size 1
+; GCN: codeLenInByte = {{[0-9]$}}
+define amdgpu_kernel void @small() {
+bb:
+  ret void
+}
+
+; Ignore inline asm in size calculation
+
+; GCN-LABEL: .amdhsa_kernel inline_asm
+; GCN: .amdhsa_inst_pref_size 1
+; GCN: codeLenInByte = {{[0-9]$}}
+define amdgpu_kernel void @inline_asm() {
+bb:
+  call void asm sideeffect ".fill 256, 4, 0", ""()
+  ret void
+}