Skip to content

Commit cb4da45

Browse files
committed
[AMDGPU] Split struct kernel arguments
The AMDGPU backend has a pass that transforms kernels so that firmware can preload kernel arguments into SGPRs, avoiding loads from the kernel argument segment. This pass can improve kernel latency, but it cannot preload struct-type kernel arguments. This patch adds a pass to the AMDGPU backend that splits and flattens struct-type kernel arguments so that later passes can preload them into SGPRs. Basically, the pass collects load or GEP/load instructions that have struct-type kernel arguments as operands and turns the loaded values into new arguments of the kernel. If all uses of a struct-type kernel argument can be replaced, the pass performs the replacements, creates a new kernel with the new signature, and translates all instructions of the old kernel to use the new arguments in the new kernel. It adds a function attribute (`amdgpu-argument-mapping`) that encodes the mapping from each new kernel argument index to the original kernel argument index and offset. The metadata streamer generates kernel argument metadata based on that attribute, and the runtime processes the kernel arguments based on the metadata. The pass is disabled by default and can be enabled with the LLVM option `-amdgpu-enable-split-kernel-args`.
1 parent 091051f commit cb4da45

File tree

9 files changed

+562
-5
lines changed

9 files changed

+562
-5
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,15 @@ struct AMDGPUPromoteKernelArgumentsPass
125125
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
126126
};
127127

128+
ModulePass *createAMDGPUSplitKernelArgumentsPass();
129+
void initializeAMDGPUSplitKernelArgumentsPass(PassRegistry &);
130+
extern char &AMDGPUSplitKernelArgumentsID;
131+
132+
struct AMDGPUSplitKernelArgumentsPass
133+
: PassInfoMixin<AMDGPUSplitKernelArgumentsPass> {
134+
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
135+
};
136+
128137
ModulePass *createAMDGPULowerKernelAttributesPass();
129138
void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
130139
extern char &AMDGPULowerKernelAttributesID;

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -357,17 +357,51 @@ void MetadataStreamerMsgPackV4::emitKernelArg(const Argument &Arg,
357357
Align ArgAlign;
358358
std::tie(ArgTy, ArgAlign) = getArgumentTypeAlign(Arg, DL);
359359

360+
// Assuming the argument is not split from struct-type argument by default,
361+
// unless we find it in function attribute amdgpu-argument-mapping.
362+
unsigned OriginalArgIndex = ~0U;
363+
uint64_t OriginalArgOffset = 0;
364+
if (Func->hasFnAttribute("amdgpu-argument-mapping")) {
365+
StringRef MappingStr =
366+
Func->getFnAttribute("amdgpu-argument-mapping").getValueAsString();
367+
SmallVector<StringRef, 8> Mappings;
368+
MappingStr.split(Mappings, ',');
369+
for (const StringRef &Mapping : Mappings) {
370+
SmallVector<StringRef, 3> Elements;
371+
Mapping.split(Elements, ':');
372+
if (Elements.size() != 3)
373+
continue;
374+
375+
unsigned NewArgIndex = 0;
376+
unsigned OrigArgIndex = 0;
377+
uint64_t OffsetValue = 0;
378+
if (Elements[0].getAsInteger(10, NewArgIndex))
379+
continue;
380+
if (Elements[1].getAsInteger(10, OrigArgIndex))
381+
continue;
382+
if (Elements[2].getAsInteger(10, OffsetValue))
383+
continue;
384+
385+
if (NewArgIndex == ArgNo) {
386+
OriginalArgIndex = OrigArgIndex;
387+
OriginalArgOffset = OffsetValue;
388+
break;
389+
}
390+
}
391+
}
392+
360393
emitKernelArg(DL, ArgTy, ArgAlign,
361394
getValueKind(ArgTy, TypeQual, BaseTypeName), Offset, Args,
362-
PointeeAlign, Name, TypeName, BaseTypeName, ActAccQual,
363-
AccQual, TypeQual);
395+
PointeeAlign, Name, TypeName, BaseTypeName, ActAccQual, AccQual,
396+
TypeQual, OriginalArgIndex, OriginalArgOffset);
364397
}
365398

366399
void MetadataStreamerMsgPackV4::emitKernelArg(
367400
const DataLayout &DL, Type *Ty, Align Alignment, StringRef ValueKind,
368401
unsigned &Offset, msgpack::ArrayDocNode Args, MaybeAlign PointeeAlign,
369402
StringRef Name, StringRef TypeName, StringRef BaseTypeName,
370-
StringRef ActAccQual, StringRef AccQual, StringRef TypeQual) {
403+
StringRef ActAccQual, StringRef AccQual, StringRef TypeQual,
404+
unsigned OriginalArgIndex, uint64_t OriginalArgOffset) {
371405
auto Arg = Args.getDocument()->getMapNode();
372406

373407
if (!Name.empty())
@@ -409,6 +443,12 @@ void MetadataStreamerMsgPackV4::emitKernelArg(
409443
Arg[".is_pipe"] = Arg.getDocument()->getNode(true);
410444
}
411445

446+
// Add original argument index and offset to the metadata
447+
if (OriginalArgIndex != ~0U) {
448+
Arg[".original_arg_index"] = Arg.getDocument()->getNode(OriginalArgIndex);
449+
Arg[".original_arg_offset"] = Arg.getDocument()->getNode(OriginalArgOffset);
450+
}
451+
412452
Args.push_back(Arg);
413453
}
414454

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,9 @@ class LLVM_EXTERNAL_VISIBILITY MetadataStreamerMsgPackV4
116116
MaybeAlign PointeeAlign = std::nullopt,
117117
StringRef Name = "", StringRef TypeName = "",
118118
StringRef BaseTypeName = "", StringRef ActAccQual = "",
119-
StringRef AccQual = "", StringRef TypeQual = "");
119+
StringRef AccQual = "", StringRef TypeQual = "",
120+
unsigned OriginalArgIndex = ~0U,
121+
uint64_t OriginalArgOffset = 0);
120122

121123
void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,
122124
msgpack::ArrayDocNode Args) override;

llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
2929
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
3030
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
3131
MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
32+
MODULE_PASS("amdgpu-split-kernel-arguments", AMDGPUSplitKernelArgumentsPass())
3233
#undef MODULE_PASS
3334

3435
#ifndef MODULE_PASS_WITH_PARAMS

0 commit comments

Comments
 (0)