Skip to content

Commit 0455596

Browse files
committed
[AMDGPU] Add DAG ISel support for preloaded kernel arguments
This patch adds the DAG isel changes for kernel argument preloading. These changes are not usable with older firmware but subsequent patches in the series will make the codegen backwards compatible. This patch should only be submitted alongside that subsequent patch. Preloading starts at the first kernel argument and covers as many arguments as indicated by the CL flag amdgpu-kernarg-preload-count. Aggregates and arguments passed by-ref are not supported. Special care is needed for the alignment of the kernarg segment, and for the alignment of addressable SGPR tuples when the large, misaligned tuples that the arguments are loaded into cannot be used directly. Reviewed By: bcahoon Differential Revision: https://reviews.llvm.org/D158579
1 parent c77da6f commit 0455596

16 files changed

+5807
-103
lines changed

llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ bool AMDGPUArgumentUsageInfo::doFinalization(Module &M) {
6060
return false;
6161
}
6262

63+
// TODO: Print preload kernargs?
6364
void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const {
6465
for (const auto &FI : ArgInfoMap) {
6566
OS << "Arguments for " << FI.first->getName() << '\n'
@@ -148,7 +149,7 @@ AMDGPUFunctionArgInfo::getPreloadedValue(
148149
llvm_unreachable("unexpected preloaded value type");
149150
}
150151

151-
constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() {
152+
AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() {
152153
AMDGPUFunctionArgInfo AI;
153154
AI.PrivateSegmentBuffer
154155
= ArgDescriptor::createRegister(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3);

llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
1010
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
1111

12+
#include "llvm/ADT/DenseMap.h"
1213
#include "llvm/CodeGen/Register.h"
1314
#include "llvm/Pass.h"
1415

@@ -37,22 +38,19 @@ struct ArgDescriptor {
3738
bool IsSet : 1;
3839

3940
public:
40-
constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
41-
bool IsStack = false, bool IsSet = false)
42-
: Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
41+
// Builds a descriptor from a raw value (stored in Reg), a mask and the two
// state flags. With every argument defaulted the result holds value 0, an
// all-ones mask, is not on the stack and is not marked as set.
ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, bool IsStack = false,
42+
bool IsSet = false)
43+
: Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
4344

44-
static constexpr ArgDescriptor createRegister(Register Reg,
45-
unsigned Mask = ~0u) {
45+
// Returns a descriptor marked as set and not on the stack, holding Reg as its
// value and Mask as its mask (all ones when Mask is omitted).
static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) {
4646
return ArgDescriptor(Reg, Mask, false, true);
4747
}
4848

49-
static constexpr ArgDescriptor createStack(unsigned Offset,
50-
unsigned Mask = ~0u) {
49+
// Returns a descriptor marked as set and on the stack, holding Offset as its
// value and Mask as its mask (all ones when Mask is omitted).
static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) {
5150
return ArgDescriptor(Offset, Mask, true, true);
5251
}
5352

54-
static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg,
55-
unsigned Mask) {
53+
// Returns a copy of Arg (same value, IsStack and IsSet flags) whose mask is
// replaced by Mask.
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
5654
return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
5755
}
5856

@@ -94,6 +92,11 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
9492
return OS;
9593
}
9694

95+
// An ArgDescriptor that additionally carries a list of MCRegisters.
// NOTE(review): presumably describes one preloaded kernel argument, with Regs
// being the SGPRs the argument was loaded into — confirm against the users.
struct KernArgPreloadDescriptor : public ArgDescriptor {
96+
// Default-constructs the ArgDescriptor base (unset) with an empty Regs list.
KernArgPreloadDescriptor() {}
97+
// Registers associated with this descriptor.
SmallVector<MCRegister> Regs;
98+
};
99+
97100
struct AMDGPUFunctionArgInfo {
98101
enum PreloadedValue {
99102
// SGPRS:
@@ -151,10 +154,13 @@ struct AMDGPUFunctionArgInfo {
151154
ArgDescriptor WorkItemIDY;
152155
ArgDescriptor WorkItemIDZ;
153156

157+
// Map the index of each preloaded kernel argument to its descriptor.
158+
SmallDenseMap<int, KernArgPreloadDescriptor> PreloadKernArgs{};
159+
154160
std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
155161
getPreloadedValue(PreloadedValue Value) const;
156162

157-
static constexpr AMDGPUFunctionArgInfo fixedABILayout();
163+
static AMDGPUFunctionArgInfo fixedABILayout();
158164
};
159165

160166
class AMDGPUArgumentUsageInfo : public ImmutablePass {

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,11 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() {
205205

206206
if (STM.isAmdHsaOS())
207207
HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
208+
209+
if (MFI.getNumKernargPreloadedSGPRs() > 0) {
210+
assert(AMDGPU::hasKernargPreload(STM));
211+
getTargetStreamer()->EmitKernargPreloadHeader(*getGlobalSTI());
212+
}
208213
}
209214

210215
void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
@@ -417,6 +422,7 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
417422
const SIProgramInfo &PI) const {
418423
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
419424
const Function &F = MF.getFunction();
425+
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
420426

421427
amdhsa::kernel_descriptor_t KernelDescriptor;
422428
memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
@@ -440,6 +446,10 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
440446
KernelDescriptor.compute_pgm_rsrc3 =
441447
CurrentProgramInfo.ComputePGMRSrc3GFX90A;
442448

449+
if (AMDGPU::hasKernargPreload(STM))
450+
KernelDescriptor.kernarg_preload =
451+
static_cast<uint16_t>(Info->getNumKernargPreloadedSGPRs());
452+
443453
return KernelDescriptor;
444454
}
445455

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1027,7 +1027,8 @@ const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Funct
10271027
}
10281028

10291029
GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
1030-
const GCNSubtarget &ST) {
1030+
const GCNSubtarget &ST)
1031+
: ST(ST) {
10311032
const CallingConv::ID CC = F.getCallingConv();
10321033
const bool IsKernel =
10331034
CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
@@ -1068,30 +1069,35 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
10681069
!ST.flatScratchIsArchitected()) {
10691070
FlatScratchInit = true;
10701071
}
1071-
}
10721072

1073-
unsigned GCNUserSGPRUsageInfo::getNumUsedUserSGPRs() const {
1074-
unsigned NumUserSGPRs = 0;
10751073
if (hasImplicitBufferPtr())
1076-
NumUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);
1074+
NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);
10771075

10781076
if (hasPrivateSegmentBuffer())
1079-
NumUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);
1077+
NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);
10801078

10811079
if (hasDispatchPtr())
1082-
NumUserSGPRs += getNumUserSGPRForField(DispatchPtrID);
1080+
NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);
10831081

10841082
if (hasQueuePtr())
1085-
NumUserSGPRs += getNumUserSGPRForField(QueuePtrID);
1083+
NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);
10861084

10871085
if (hasKernargSegmentPtr())
1088-
NumUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);
1086+
NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);
10891087

10901088
if (hasDispatchID())
1091-
NumUserSGPRs += getNumUserSGPRForField(DispatchIdID);
1089+
NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);
10921090

10931091
if (hasFlatScratchInit())
1094-
NumUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
1092+
NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
1093+
}
1094+
1095+
// Records that NumSGPRs more user SGPRs are taken by preloaded kernel
// arguments: both the preload counter and the total used-SGPR counter grow by
// NumSGPRs.
void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
1096+
// NOTE(review): this only bounds the preload counter by the subtarget
// maximum; it does not account for SGPRs already counted in
// NumUsedUserSGPRs. Confirm callers check the free count first.
assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
1097+
NumKernargPreloadSGPRs += NumSGPRs;
1098+
NumUsedUserSGPRs += NumSGPRs;
1099+
}
10951100

1096-
return NumUserSGPRs;
1101+
// Returns how many user SGPRs remain: the subtarget maximum minus the number
// already counted as used.
// NOTE(review): the subtraction is unsigned, so it wraps if NumUsedUserSGPRs
// ever exceeds the maximum. The function also does not modify state and could
// be declared const (requires changing the declaration as well).
unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
1102+
return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
10971103
}

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1394,8 +1394,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
13941394

13951395
class GCNUserSGPRUsageInfo {
13961396
public:
1397-
unsigned getNumUsedUserSGPRs() const;
1398-
13991397
bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
14001398

14011399
bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
@@ -1410,6 +1408,14 @@ class GCNUserSGPRUsageInfo {
14101408

14111409
bool hasFlatScratchInit() const { return FlatScratchInit; }
14121410

1411+
// Number of user SGPRs allocated so far for kernel-argument preloading
// (accumulated by allocKernargPreloadSGPRs; starts at 0).
unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1412+
1413+
// Total number of user SGPRs counted as used: the enabled user-SGPR fields
// summed in the constructor plus any SGPRs added by allocKernargPreloadSGPRs.
unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1414+
1415+
unsigned getNumFreeUserSGPRs();
1416+
1417+
void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1418+
14131419
enum UserSGPRID : unsigned {
14141420
ImplicitBufferPtrID = 0,
14151421
PrivateSegmentBufferID = 1,
@@ -1447,6 +1453,8 @@ class GCNUserSGPRUsageInfo {
14471453
GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
14481454

14491455
private:
1456+
const GCNSubtarget &ST;
1457+
14501458
// Private memory buffer
14511459
// Compute directly in sgpr[0:1]
14521460
// Other shaders indirect 64-bits at sgpr[0:1]
@@ -1463,6 +1471,10 @@ class GCNUserSGPRUsageInfo {
14631471
bool DispatchID = false;
14641472

14651473
bool FlatScratchInit = false;
1474+
1475+
unsigned NumKernargPreloadSGPRs = 0;
1476+
1477+
unsigned NumUsedUserSGPRs = 0;
14661478
};
14671479

14681480
} // end namespace llvm

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -830,6 +830,24 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
830830
return true;
831831
}
832832

833+
// Textual-assembly version of the kernarg preload header: writes 64 lines of
// "s_nop 0" to OS. STI is not used here. Always returns true.
// NOTE(review): the count 64 is duplicated in the ELF streamer; keep the two
// in sync.
bool AMDGPUTargetAsmStreamer::EmitKernargPreloadHeader(
834+
const MCSubtargetInfo &STI) {
835+
for (int i = 0; i < 64; ++i) {
836+
OS << "\ts_nop 0\n";
837+
}
838+
return true;
839+
}
840+
841+
// Object-file version of the kernarg preload header: emits the 32-bit word
// 0xbf800000 (named Encoded_s_nop) 64 times through the underlying
// MCStreamer, i.e. 64 * 4 = 256 bytes. STI is not used here. Always returns
// true.
// NOTE(review): the count 64 is duplicated in the asm streamer; keep the two
// in sync.
bool AMDGPUTargetELFStreamer::EmitKernargPreloadHeader(
842+
const MCSubtargetInfo &STI) {
843+
const uint32_t Encoded_s_nop = 0xbf800000;
844+
MCStreamer &OS = getStreamer();
845+
for (int i = 0; i < 64; ++i) {
846+
OS.emitInt32(Encoded_s_nop);
847+
}
848+
return true;
849+
}
850+
833851
bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
834852
const uint32_t Encoded_s_code_end = 0xbf9f0000;
835853
const uint32_t Encoded_s_nop = 0xbf800000;

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,11 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
9090
/// \returns True on success, false on failure.
9191
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI) { return true; }
9292

93+
/// Hook for emitting the kernarg preload header. AMDGPUAsmPrinter calls it
/// from emitFunctionBodyStart when the function has preloaded kernarg SGPRs.
/// This base implementation emits nothing and reports success.
/// \returns True on success, false on failure.
94+
virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) {
95+
return true;
96+
}
97+
9398
virtual void EmitAmdhsaKernelDescriptor(
9499
const MCSubtargetInfo &STI, StringRef KernelName,
95100
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
@@ -154,6 +159,9 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
154159
/// \returns True on success, false on failure.
155160
bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
156161

162+
/// \returns True on success, false on failure.
163+
bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override;
164+
157165
void EmitAmdhsaKernelDescriptor(
158166
const MCSubtargetInfo &STI, StringRef KernelName,
159167
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
@@ -215,6 +223,9 @@ class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
215223
/// \returns True on success, false on failure.
216224
bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
217225

226+
/// \returns True on success, false on failure.
227+
bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override;
228+
218229
void EmitAmdhsaKernelDescriptor(
219230
const MCSubtargetInfo &STI, StringRef KernelName,
220231
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,

0 commit comments

Comments
 (0)