Skip to content

Commit 59b8dd0

Browse files
kerbowayxsamliu
authored andcommitted
[AMDGPU] Fix hidden kernarg preload count inconsistency (llvm#116759)
It is possible that the number of hidden arguments that are selected to be preloaded in AMDGPULowerKernel arguments and isel can differ. This isn't an issue with explicit arguments since isel can lower the argument correctly either way, but with hidden arguments we may have alignment issues if we try to load these hidden arguments that were added to the kernel signature. The reason for the mismatch is that isel reserves an extra synthetic user SGPR for module LDS. Instead of teaching lowerFormalArguments how to handle these properly it makes more sense and is less expensive to fix the mismatch and assert if we ever run into this issue again. We should never be trying to lower these in the normal way. In a future change we probably want to revise how we track "synthetic" user SGPRs and unify the handling in GCNUserSGPRUsageInfo. Sometimes synthetic SGPRSs are considered user SGPRs and sometimes they are not. Until then this patch resolves the inconsistency, fixes the bug, and is otherwise a NFC. Change-Id: Ib474daf86ee84913ed1f864f52c399f0a1480710
1 parent 30a7332 commit 59b8dd0

File tree

5 files changed

+121
-12
lines changed

5 files changed

+121
-12
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,12 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
530530

531531
// TODO: Align down to dword alignment and extract bits for extending loads.
532532
for (auto &Arg : F.args()) {
533+
// TODO: Add support for kernarg preload.
534+
if (Arg.hasAttribute("amdgpu-hidden-argument")) {
535+
LLVM_DEBUG(dbgs() << "Preloading hidden arguments is not supported\n");
536+
return false;
537+
}
538+
533539
const bool IsByRef = Arg.hasByRefAttr();
534540
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
535541
unsigned AllocSize = DL.getTypeAllocSize(ArgTy);

llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -144,10 +144,8 @@ class PreloadKernelArgInfo {
144144
// Returns the maximum number of user SGPRs that we have available to preload
145145
// arguments.
146146
void setInitialFreeUserSGPRsCount() {
147-
const unsigned MaxUserSGPRs = ST.getMaxNumUserSGPRs();
148147
GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
149-
150-
NumFreeUserSGPRs = MaxUserSGPRs - UserSGPRInfo.getNumUsedUserSGPRs();
148+
NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
151149
}
152150

153151
bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset,
@@ -162,13 +160,15 @@ class PreloadKernelArgInfo {
162160
}
163161
// Check if this argument may be loaded into the same register as the
164162
// previous argument.
165-
if (!isAligned(Align(4), ArgOffset) && AllocSize < 4) {
163+
if (ArgOffset - LastExplicitArgOffset < 4 &&
164+
!isAligned(Align(4), ArgOffset)) {
166165
if (DBG)
167166
llvm::errs() << " pre-loaded into the same register as the previous argument\n";
168167
return true;
169168
}
170169

171170
// Pad SGPRs for kernarg alignment.
171+
ArgOffset = alignDown(ArgOffset, 4);
172172
unsigned Padding = ArgOffset - LastExplicitArgOffset;
173173
unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
174174
unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4;
@@ -192,6 +192,7 @@ class PreloadKernelArgInfo {
192192

193193
// Try to allocate SGPRs to preload implicit kernel arguments.
194194
void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
195+
uint64_t LastExplicitArgOffset,
195196
IRBuilder<> &Builder) {
196197
StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
197198
Function *ImplicitArgPtr = F.getParent()->getFunction(Name);
@@ -237,7 +238,6 @@ class PreloadKernelArgInfo {
237238
// argument can actually be preloaded.
238239
std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());
239240

240-
uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset;
241241
// If we fail to preload any implicit argument we know we don't have SGPRs
242242
// to preload any subsequent ones with larger offsets. Find the first
243243
// argument that we cannot preload.
@@ -251,7 +251,8 @@ class PreloadKernelArgInfo {
251251
LastExplicitArgOffset))
252252
return true;
253253

254-
LastExplicitArgOffset = LoadOffset + LoadSize;
254+
LastExplicitArgOffset =
255+
ImplicitArgsBaseOffset + LoadOffset + LoadSize;
255256
return false;
256257
});
257258

@@ -510,7 +511,7 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
510511
alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
511512
BaseOffset;
512513
PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
513-
Builder);
514+
ExplicitArgOffset, Builder);
514515
}
515516

516517
return true;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2477,8 +2477,7 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
24772477
unsigned Padding = ArgOffset - LastExplicitArgOffset;
24782478
unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
24792479
// Check for free user SGPRs for preloading.
2480-
if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2481-
SGPRInfo.getNumFreeUserSGPRs()) {
2480+
if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
24822481
InPreloadSequence = false;
24832482
break;
24842483
}
@@ -2958,6 +2957,20 @@ SDValue SITargetLowering::LowerFormalArguments(
29582957
NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
29592958
}
29602959
} else {
2960+
// Hidden arguments that are in the kernel signature must be preloaded
2961+
// to user SGPRs. Print a diagnostic error if a hidden argument is in
2962+
// the argument list and is not preloaded.
2963+
if (Arg.isOrigArg()) {
2964+
Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
2965+
if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
2966+
DiagnosticInfoUnsupported NonPreloadHiddenArg(
2967+
*OrigArg->getParent(),
2968+
"hidden argument in kernel signature was not preloaded",
2969+
DL.getDebugLoc());
2970+
DAG.getContext()->diagnose(NonPreloadHiddenArg);
2971+
}
2972+
}
2973+
29612974
NewArg =
29622975
lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
29632976
Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
; RUN: not llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefixes=ERROR,GISEL %s
2+
; RUN: not llc -global-isel=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=ERROR %s
3+
; RUN: not llc -global-isel=1 -global-isel-abort=2 -amdgpu-ir-lower-kernel-arguments=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefixes=ERROR,GISEL %s
4+
; RUN: not llc -global-isel=0 -amdgpu-ir-lower-kernel-arguments=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=ERROR %s
5+
6+
define amdgpu_kernel void @no_free_sgprs_block_count_x_no_preload_diag(ptr addrspace(1) inreg %out, i512 inreg, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) #0 {
7+
; GISEL: warning: Instruction selection used fallback path for no_free_sgprs_block_count_x_no_preload_diag
8+
; ERROR: error: <unknown>:0:0: in function no_free_sgprs_block_count_x_no_preload_diag void (ptr addrspace(1), i512, i32): hidden argument in kernel signature was not preloaded
9+
store i32 %_hidden_block_count_x, ptr addrspace(1) %out
10+
ret void
11+
}
12+
13+
define amdgpu_kernel void @preloadremainder_z_no_preload_diag(ptr addrspace(1) inreg %out, i256 inreg, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_z) #0 {
14+
; GISEL: warning: Instruction selection used fallback path for preloadremainder_z_no_preload_diag
15+
; ERROR: error: <unknown>:0:0: in function preloadremainder_z_no_preload_diag void (ptr addrspace(1), i256, i32, i32, i32, i16, i16, i16, i16, i16, i16): hidden argument in kernel signature was not preloaded
16+
%conv = zext i16 %_hidden_remainder_z to i32
17+
store i32 %conv, ptr addrspace(1) %out
18+
ret void
19+
}
20+
21+
attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }

llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -599,10 +599,8 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
599599
; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
600600
; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
601601
; GFX940-NEXT: ; %bb.0:
602-
; GFX940-NEXT: s_load_dword s0, s[2:3], 0x1c
602+
; GFX940-NEXT: s_lshr_b32 s0, s15, 16
603603
; GFX940-NEXT: v_mov_b32_e32 v0, 0
604-
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
605-
; GFX940-NEXT: s_lshr_b32 s0, s0, 16
606604
; GFX940-NEXT: v_mov_b32_e32 v1, s0
607605
; GFX940-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1
608606
; GFX940-NEXT: s_endpgm
@@ -626,4 +624,74 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
626624
ret void
627625
}
628626

627+
; Check for consistency between isel and earlier passes preload SGPR accounting with max preload SGPRs.
628+
629+
define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %out, i192 inreg %t0, i32 inreg %t1) #0 {
630+
; GFX940-LABEL: preload_block_max_user_sgprs:
631+
; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
632+
; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
633+
; GFX940-NEXT: ; %bb.0:
634+
; GFX940-NEXT: v_mov_b32_e32 v0, 0
635+
; GFX940-NEXT: v_mov_b32_e32 v1, s12
636+
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
637+
; GFX940-NEXT: s_endpgm
638+
;
639+
; GFX90a-LABEL: preload_block_max_user_sgprs:
640+
; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
641+
; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
642+
; GFX90a-NEXT: ; %bb.0:
643+
; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x28
644+
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
645+
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
646+
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
647+
; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
648+
; GFX90a-NEXT: s_endpgm
649+
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
650+
%load = load i32, ptr addrspace(4) %imp_arg_ptr
651+
store i32 %load, ptr addrspace(1) %out
652+
ret void
653+
}
654+
655+
define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) inreg %out) #0 {
656+
; GFX940-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
657+
; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
658+
; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
659+
; GFX940-NEXT: ; %bb.0:
660+
; GFX940-NEXT: s_lshr_b32 s0, s9, 16
661+
; GFX940-NEXT: s_and_b32 s1, s8, 0xffff
662+
; GFX940-NEXT: v_mov_b32_e32 v3, 0
663+
; GFX940-NEXT: v_mov_b32_e32 v0, s6
664+
; GFX940-NEXT: v_mov_b32_e32 v1, s1
665+
; GFX940-NEXT: v_mov_b32_e32 v2, s0
666+
; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
667+
; GFX940-NEXT: s_endpgm
668+
;
669+
; GFX90a-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
670+
; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
671+
; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
672+
; GFX90a-NEXT: ; %bb.0:
673+
; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
674+
; GFX90a-NEXT: s_and_b32 s1, s12, 0xffff
675+
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
676+
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
677+
; GFX90a-NEXT: v_mov_b32_e32 v1, s1
678+
; GFX90a-NEXT: v_mov_b32_e32 v2, s0
679+
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
680+
; GFX90a-NEXT: s_endpgm
681+
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
682+
%gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
683+
%gep1 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
684+
%gep2 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
685+
%load0 = load i32, ptr addrspace(4) %gep0
686+
%load1 = load i16, ptr addrspace(4) %gep1
687+
%load2 = load i16, ptr addrspace(4) %gep2
688+
%conv1 = zext i16 %load1 to i32
689+
%conv2 = zext i16 %load2 to i32
690+
%ins.0 = insertelement <3 x i32> poison, i32 %load0, i32 0
691+
%ins.1 = insertelement <3 x i32> %ins.0, i32 %conv1, i32 1
692+
%ins.2 = insertelement <3 x i32> %ins.1, i32 %conv2, i32 2
693+
store <3 x i32> %ins.2, ptr addrspace(1) %out
694+
ret void
695+
}
696+
629697
attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }

0 commit comments

Comments
 (0)