Skip to content

Commit 4bcbeae

Browse files
authored
[AMDGPU] Enable kernel arg preloading with gfx90a (llvm#81180)
Add a trap instruction to the beginning of the kernel prologue to handle cases where kernel argument preloading is attempted on hardware loaded with incompatible firmware.
1 parent bb77047 commit 4bcbeae

File tree

9 files changed

+10685
-5345
lines changed

9 files changed

+10685
-5345
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5515,7 +5515,10 @@ additional 256 bytes to the kernel_code_entry_byte_offset. This addition
55155515
facilitates the incorporation of a prologue to the kernel entry to handle cases
55165516
where code designed for kernarg preloading is executed on hardware equipped with
55175517
incompatible firmware. If hardware has compatible firmware the 256 bytes at the
5518-
start of the kernel entry will be skipped.
5518+
start of the kernel entry will be skipped. Additionally, the compiler backend
5519+
may insert a trap instruction at the start of the kernel prologue to catch
5520+
situations where kernarg preloading is attempted on hardware with incompatible
5521+
firmware.
55195522

55205523
.. _amdgpu-amdhsa-kernel-prolog:
55215524

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,8 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() {
204204

205205
if (MFI.getNumKernargPreloadedSGPRs() > 0) {
206206
assert(AMDGPU::hasKernargPreload(STM));
207-
getTargetStreamer()->EmitKernargPreloadHeader(*getGlobalSTI());
207+
getTargetStreamer()->EmitKernargPreloadHeader(*getGlobalSTI(),
208+
STM.isAmdHsaOS());
208209
}
209210
}
210211

llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
145145

146146
// Try to preload this argument into user SGPRs.
147147
if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
148-
!ST.needsKernargPreloadBackwardsCompatibility() &&
149148
!Arg.getType()->isAggregateType())
150149
if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
151150
LastExplicitArgOffset))

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1258,12 +1258,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
12581258
// \returns true if preloading kernel arguments is supported.
12591259
bool hasKernargPreload() const { return KernargPreload; }
12601260

1261-
// \returns true if we need to generate backwards compatible code when
1262-
// preloading kernel arguments.
1263-
bool needsKernargPreloadBackwardsCompatibility() const {
1264-
return hasKernargPreload() && !hasGFX940Insts();
1265-
}
1266-
12671261
// \returns true if the target has split barriers feature
12681262
bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
12691263

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -782,18 +782,26 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(msgpack::Document &HSAMetadataDoc,
782782
}
783783

784784
bool AMDGPUTargetAsmStreamer::EmitKernargPreloadHeader(
785-
const MCSubtargetInfo &STI) {
786-
for (int i = 0; i < 64; ++i) {
785+
const MCSubtargetInfo &STI, bool TrapEnabled) {
786+
const char *TrapInstr = TrapEnabled ? "\ts_trap 2" : "\ts_endpgm";
787+
OS << TrapInstr
788+
<< " ; Trap with incompatible firmware that doesn't "
789+
"support preloading kernel arguments.\n";
790+
for (int i = 0; i < 63; ++i) {
787791
OS << "\ts_nop 0\n";
788792
}
789793
return true;
790794
}
791795

792796
bool AMDGPUTargetELFStreamer::EmitKernargPreloadHeader(
793-
const MCSubtargetInfo &STI) {
797+
const MCSubtargetInfo &STI, bool TrapEnabled) {
794798
const uint32_t Encoded_s_nop = 0xbf800000;
799+
const uint32_t Encoded_s_trap = 0xbf920002;
800+
const uint32_t Encoded_s_endpgm = 0xbf810000;
801+
const uint32_t TrapInstr = TrapEnabled ? Encoded_s_trap : Encoded_s_endpgm;
795802
MCStreamer &OS = getStreamer();
796-
for (int i = 0; i < 64; ++i) {
803+
OS.emitInt32(TrapInstr);
804+
for (int i = 0; i < 63; ++i) {
797805
OS.emitInt32(Encoded_s_nop);
798806
}
799807
return true;

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,8 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
8989
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI) { return true; }
9090

9191
/// \returns True on success, false on failure.
92-
virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) {
92+
virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI,
93+
bool TrapEnabled) {
9394
return true;
9495
}
9596

@@ -146,7 +147,8 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
146147
bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
147148

148149
/// \returns True on success, false on failure.
149-
bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override;
150+
bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI,
151+
bool TrapEnabled) override;
150152

151153
void EmitAmdhsaKernelDescriptor(
152154
const MCSubtargetInfo &STI, StringRef KernelName,
@@ -200,7 +202,8 @@ class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
200202
bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
201203

202204
/// \returns True on success, false on failure.
203-
bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override;
205+
bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI,
206+
bool TrapEnabled) override;
204207

205208
void EmitAmdhsaKernelDescriptor(
206209
const MCSubtargetInfo &STI, StringRef KernelName,

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2826,8 +2826,7 @@ SDValue SITargetLowering::LowerFormalArguments(
28262826
if (IsEntryFunc) {
28272827
allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
28282828
allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2829-
if (IsKernel && Subtarget->hasKernargPreload() &&
2830-
!Subtarget->needsKernargPreloadBackwardsCompatibility())
2829+
if (IsKernel && Subtarget->hasKernargPreload())
28312830
allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
28322831

28332832
allocateLDSKernelId(CCInfo, MF, *TRI, *Info);

llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
1-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN %s
2-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN %s
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,HSA %s
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,HSA %s
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,NON-HSA %s
34

45
; GCN: preload_kernarg_header
5-
; GCN-COUNT-64: s_nop 0
6+
; HSA: s_trap 2
7+
; NON-HSA: s_endpgm
8+
; GCN-COUNT-63: s_nop 0
69
define amdgpu_kernel void @preload_kernarg_header(ptr %arg) {
710
store ptr %arg, ptr %arg
811
ret void

0 commit comments

Comments
 (0)