Skip to content

Commit 014ca4e

Browse files
committed
[AMDGPU] Enable serializing of allocated preload kernarg SGPRs info
- Support serialization of the number of allocated preload kernarg SGPRs. - Support serialization of the first allocated preload kernarg SGPR. Together, these make it possible to correctly reconstruct MIR that uses preload kernarg SGPRs.
1 parent e70e9ec commit 014ca4e

10 files changed

+173
-2
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2027,6 +2027,35 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
20272027
MFI->ArgInfo.WorkItemIDZ, 0, 0)))
20282028
return true;
20292029

2030+
// Parse FirstKernArgPreloadReg separately, since it's a Register,
2031+
// not an ArgDescriptor.
2032+
if (YamlMFI.ArgInfo && YamlMFI.ArgInfo->FirstKernArgPreloadReg) {
2033+
const auto &A = *YamlMFI.ArgInfo->FirstKernArgPreloadReg;
2034+
2035+
if (!A.IsRegister) {
2036+
const MemoryBuffer &Buffer =
2037+
*PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
2038+
Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, 0,
2039+
SourceMgr::DK_Error,
2040+
"firstKernArgPreloadReg must be a register", "", {},
2041+
{});
2042+
return true;
2043+
}
2044+
2045+
Register Reg;
2046+
if (parseNamedRegisterReference(PFS, Reg, A.RegisterName.Value, Error)) {
2047+
SourceRange = A.RegisterName.SourceRange;
2048+
return true;
2049+
}
2050+
2051+
if (!AMDGPU::SGPR_32RegClass.contains(Reg))
2052+
return diagnoseRegisterClass(A.RegisterName);
2053+
2054+
MFI->ArgInfo.FirstKernArgPreloadReg = Reg;
2055+
2056+
MFI->NumUserSGPRs += YamlMFI.NumKernargPreloadSGPRs;
2057+
}
2058+
20302059
if (ST.hasIEEEMode())
20312060
MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
20322061
if (ST.hasDX10ClampMode())

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -696,7 +696,6 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
696696
return true;
697697
};
698698

699-
// TODO: Need to serialize kernarg preloads.
700699
bool Any = false;
701700
Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
702701
Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
@@ -718,6 +717,20 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
718717
Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY);
719718
Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ);
720719

720+
// Write FirstKernArgPreloadReg separately, since it's a Register,
721+
// not an ArgDescriptor.
722+
if (ArgInfo.FirstKernArgPreloadReg) {
723+
Register Reg = ArgInfo.FirstKernArgPreloadReg;
724+
if (Reg.isPhysical()) {
725+
yaml::SIArgument SA = yaml::SIArgument::createArgument(true);
726+
raw_string_ostream OS(SA.RegisterName.Value);
727+
OS << printReg(Reg, &TRI);
728+
729+
AI.FirstKernArgPreloadReg = SA;
730+
Any = true;
731+
}
732+
}
733+
721734
if (Any)
722735
return AI;
723736

@@ -750,7 +763,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
750763
Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
751764
IsWholeWaveFunction(MFI.isWholeWaveFunction()),
752765
DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
753-
ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
766+
ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()),
767+
NumKernargPreloadSGPRs(MFI.getNumKernargPreloadedSGPRs()) {
754768
for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
755769
SpillPhysVGPRS.push_back(regToString(Reg, TRI));
756770

@@ -799,6 +813,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
799813
ReturnsVoid = YamlMFI.ReturnsVoid;
800814
IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;
801815

816+
UserSGPRInfo.allocKernargPreloadSGPRs(YamlMFI.NumKernargPreloadSGPRs);
817+
802818
if (YamlMFI.ScavengeFI) {
803819
auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
804820
if (!FIOrErr) {

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ struct SIArgumentInfo {
170170
std::optional<SIArgument> DispatchID;
171171
std::optional<SIArgument> FlatScratchInit;
172172
std::optional<SIArgument> PrivateSegmentSize;
173+
std::optional<SIArgument> FirstKernArgPreloadReg;
173174

174175
std::optional<SIArgument> WorkGroupIDX;
175176
std::optional<SIArgument> WorkGroupIDY;
@@ -195,6 +196,7 @@ template <> struct MappingTraits<SIArgumentInfo> {
195196
YamlIO.mapOptional("dispatchID", AI.DispatchID);
196197
YamlIO.mapOptional("flatScratchInit", AI.FlatScratchInit);
197198
YamlIO.mapOptional("privateSegmentSize", AI.PrivateSegmentSize);
199+
YamlIO.mapOptional("firstKernArgPreloadReg", AI.FirstKernArgPreloadReg);
198200

199201
YamlIO.mapOptional("workGroupIDX", AI.WorkGroupIDX);
200202
YamlIO.mapOptional("workGroupIDY", AI.WorkGroupIDY);
@@ -305,6 +307,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
305307
unsigned DynamicVGPRBlockSize = 0;
306308
unsigned ScratchReservedForDynamicVGPRs = 0;
307309

310+
unsigned NumKernargPreloadSGPRs = 0;
311+
308312
SIMachineFunctionInfo() = default;
309313
SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
310314
const TargetRegisterInfo &TRI,
@@ -361,6 +365,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
361365
YamlIO.mapOptional("dynamicVGPRBlockSize", MFI.DynamicVGPRBlockSize, false);
362366
YamlIO.mapOptional("scratchReservedForDynamicVGPRs",
363367
MFI.ScratchReservedForDynamicVGPRs, 0);
368+
YamlIO.mapOptional("numKernargPreloadSGPRs", MFI.NumKernargPreloadSGPRs, 0);
364369
YamlIO.mapOptional("isWholeWaveFunction", MFI.IsWholeWaveFunction, false);
365370
}
366371
};

llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
; CHECK-NEXT: hasInitWholeWave: false
4949
; CHECK-NEXT: dynamicVGPRBlockSize: 0
5050
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
51+
; CHECK-NEXT: numKernargPreloadSGPRs: 0
5152
; CHECK-NEXT: isWholeWaveFunction: false
5253
; CHECK-NEXT: body:
5354
define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 {
@@ -320,6 +321,7 @@
320321
; CHECK-NEXT: hasInitWholeWave: false
321322
; CHECK-NEXT: dynamicVGPRBlockSize: 0
322323
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
324+
; CHECK-NEXT: numKernargPreloadSGPRs: 0
323325
; CHECK-NEXT: isWholeWaveFunction: false
324326
; CHECK-NEXT: body:
325327
define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 {

llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
; AFTER-PEI-NEXT: hasInitWholeWave: false
4949
; AFTER-PEI-NEXT: dynamicVGPRBlockSize: 0
5050
; AFTER-PEI-NEXT: scratchReservedForDynamicVGPRs: 0
51+
; AFTER-PEI-NEXT: numKernargPreloadSGPRs: 0
5152
; AFTER-PEI-NEXT: isWholeWaveFunction: false
5253
; AFTER-PEI-NEXT: body:
5354
define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 {

llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
; CHECK-NEXT: hasInitWholeWave: false
4949
; CHECK-NEXT: dynamicVGPRBlockSize: 0
5050
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
51+
; CHECK-NEXT: numKernargPreloadSGPRs: 0
5152
; CHECK-NEXT: isWholeWaveFunction: false
5253
; CHECK-NEXT: body:
5354
define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 {

llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
; CHECK-NEXT: hasInitWholeWave: false
4949
; CHECK-NEXT: dynamicVGPRBlockSize: 0
5050
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
51+
; CHECK-NEXT: numKernargPreloadSGPRs: 0
5152
; CHECK-NEXT: isWholeWaveFunction: false
5253
; CHECK-NEXT: body:
5354
define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 {

llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
# FULL-NEXT: hasInitWholeWave: false
5858
# FULL-NEXT: dynamicVGPRBlockSize: 0
5959
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
60+
# FULL-NEXT: numKernargPreloadSGPRs: 0
6061
# FULL-NEXT: isWholeWaveFunction: false
6162
# FULL-NEXT: body:
6263

@@ -167,6 +168,7 @@ body: |
167168
# FULL-NEXT: hasInitWholeWave: false
168169
# FULL-NEXT: dynamicVGPRBlockSize: 0
169170
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
171+
# FULL-NEXT: numKernargPreloadSGPRs: 0
170172
# FULL-NEXT: isWholeWaveFunction: false
171173
# FULL-NEXT: body:
172174

@@ -248,6 +250,7 @@ body: |
248250
# FULL-NEXT: hasInitWholeWave: false
249251
# FULL-NEXT: dynamicVGPRBlockSize: 0
250252
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
253+
# FULL-NEXT: numKernargPreloadSGPRs: 0
251254
# FULL-NEXT: isWholeWaveFunction: false
252255
# FULL-NEXT: body:
253256

@@ -330,6 +333,7 @@ body: |
330333
# FULL-NEXT: hasInitWholeWave: false
331334
# FULL-NEXT: dynamicVGPRBlockSize: 0
332335
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
336+
# FULL-NEXT: numKernargPreloadSGPRs: 0
333337
# FULL-NEXT: isWholeWaveFunction: false
334338
# FULL-NEXT: body:
335339

llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
; CHECK-NEXT: hasInitWholeWave: false
5959
; CHECK-NEXT: dynamicVGPRBlockSize: 0
6060
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
61+
; CHECK-NEXT: numKernargPreloadSGPRs: 0
6162
; CHECK-NEXT: isWholeWaveFunction: false
6263
; CHECK-NEXT: body:
6364
define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
@@ -110,6 +111,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
110111
; CHECK-NEXT: hasInitWholeWave: false
111112
; CHECK-NEXT: dynamicVGPRBlockSize: 0
112113
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
114+
; CHECK-NEXT: numKernargPreloadSGPRs: 0
113115
; CHECK-NEXT: isWholeWaveFunction: false
114116
; CHECK-NEXT: body:
115117
define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
@@ -186,6 +188,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
186188
; CHECK-NEXT: hasInitWholeWave: false
187189
; CHECK-NEXT: dynamicVGPRBlockSize: 0
188190
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
191+
; CHECK-NEXT: numKernargPreloadSGPRs: 0
189192
; CHECK-NEXT: isWholeWaveFunction: false
190193
; CHECK-NEXT: body:
191194
define void @function() {
@@ -244,6 +247,7 @@ define void @function() {
244247
; CHECK-NEXT: hasInitWholeWave: false
245248
; CHECK-NEXT: dynamicVGPRBlockSize: 0
246249
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
250+
; CHECK-NEXT: numKernargPreloadSGPRs: 0
247251
; CHECK-NEXT: isWholeWaveFunction: false
248252
; CHECK-NEXT: body:
249253
define void @function_nsz() #0 {
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -stop-after=amdgpu-isel %s -o - | FileCheck --check-prefix=MIR %s
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -stop-after=amdgpu-isel -o %t.mir %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -start-after=amdgpu-isel -verify-machineinstrs %t.mir -o - | FileCheck --check-prefix=ASM %s
4+
5+
; Test that kernarg preloading information is correctly serialized to MIR and
6+
; can be round-tripped through MIR serialization/deserialization.
7+
8+
; MIR-LABEL: name: kernarg_preload_single_arg
9+
; MIR: machineFunctionInfo:
10+
; MIR: argumentInfo:
11+
; MIR: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
12+
; MIR: firstKernArgPreloadReg: { reg: '$sgpr8' }
13+
; MIR: numKernargPreloadSGPRs: 1
14+
15+
; ASM-LABEL: kernarg_preload_single_arg:
16+
; ASM: .amdhsa_user_sgpr_kernarg_preload_length 1
17+
; ASM: .amdhsa_user_sgpr_kernarg_preload_offset 0
18+
define amdgpu_kernel void @kernarg_preload_single_arg(i32 inreg %arg0) {
19+
entry:
20+
%val = add i32 %arg0, 1
21+
store i32 %val, ptr addrspace(1) null
22+
ret void
23+
}
24+
25+
; MIR-LABEL: name: kernarg_preload_multiple_args_unaligned
26+
; MIR: machineFunctionInfo:
27+
; MIR: argumentInfo:
28+
; MIR: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
29+
; MIR: firstKernArgPreloadReg: { reg: '$sgpr8' }
30+
; MIR: numKernargPreloadSGPRs: 5
31+
32+
; ASM-LABEL: kernarg_preload_multiple_args_unaligned:
33+
; ASM: .amdhsa_user_sgpr_kernarg_preload_length 5
34+
; ASM: .amdhsa_user_sgpr_kernarg_preload_offset 0
35+
define amdgpu_kernel void @kernarg_preload_multiple_args_unaligned(i32 inreg %arg0, i64 inreg %arg1, i32 inreg %arg2) {
36+
entry:
37+
%val = add i32 %arg0, %arg2
38+
store i32 %val, ptr addrspace(1) null
39+
ret void
40+
}
41+
42+
; MIR-LABEL: name: kernarg_preload_multiple_args_aligned
43+
; MIR: machineFunctionInfo:
44+
; MIR: argumentInfo:
45+
; MIR: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
46+
; MIR: firstKernArgPreloadReg: { reg: '$sgpr8' }
47+
; MIR: numKernargPreloadSGPRs: 4
48+
49+
; ASM-LABEL: kernarg_preload_multiple_args_aligned:
50+
; ASM: .amdhsa_user_sgpr_kernarg_preload_length 4
51+
; ASM: .amdhsa_user_sgpr_kernarg_preload_offset 0
52+
define amdgpu_kernel void @kernarg_preload_multiple_args_aligned(i64 inreg %arg0, i32 inreg %arg1, i32 inreg %arg2) {
53+
entry:
54+
%val = add i32 %arg1, %arg2
55+
store i32 %val, ptr addrspace(1) null
56+
ret void
57+
}
58+
59+
; MIR-LABEL: name: kernarg_preload_with_ptr
60+
; MIR: machineFunctionInfo:
61+
; MIR: argumentInfo:
62+
; MIR: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
63+
; MIR: firstKernArgPreloadReg: { reg: '$sgpr8' }
64+
; MIR: numKernargPreloadSGPRs: 2
65+
66+
; ASM-LABEL: kernarg_preload_with_ptr:
67+
; ASM: .amdhsa_user_sgpr_kernarg_preload_length 2
68+
; ASM: .amdhsa_user_sgpr_kernarg_preload_offset 0
69+
define amdgpu_kernel void @kernarg_preload_with_ptr(ptr inreg %ptr) {
70+
entry:
71+
%val = load i32, ptr %ptr
72+
%add = add i32 %val, 1
73+
store i32 %add, ptr %ptr
74+
ret void
75+
}
76+
77+
; MIR-LABEL: name: kernarg_no_preload
78+
; MIR: machineFunctionInfo:
79+
; MIR: argumentInfo:
80+
; MIR: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
81+
; MIR-NOT: firstKernArgPreloadReg
82+
; MIR: numKernargPreloadSGPRs: 0
83+
84+
; ASM-LABEL: kernarg_no_preload:
85+
; ASM: .amdhsa_user_sgpr_kernarg_preload_length 0
86+
define amdgpu_kernel void @kernarg_no_preload(i32 %arg0) {
87+
entry:
88+
%val = add i32 %arg0, 1
89+
store i32 %val, ptr addrspace(1) null
90+
ret void
91+
}
92+
93+
; MIR-LABEL: name: kernarg_preload_mixed
94+
; MIR: machineFunctionInfo:
95+
; MIR: argumentInfo:
96+
; MIR: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
97+
; MIR: firstKernArgPreloadReg: { reg: '$sgpr8' }
98+
; MIR: numKernargPreloadSGPRs: 2
99+
100+
; ASM-LABEL: kernarg_preload_mixed:
101+
; ASM: .amdhsa_user_sgpr_kernarg_preload_length 2
102+
define amdgpu_kernel void @kernarg_preload_mixed(i32 inreg %arg0, i32 inreg %arg1, i32 %arg2) {
103+
entry:
104+
%val = add i32 %arg0, %arg1
105+
%val2 = add i32 %val, %arg2
106+
store i32 %val2, ptr addrspace(1) null
107+
ret void
108+
}

0 commit comments

Comments
 (0)