Skip to content

Commit 1a49025

Browse files
authored
Merge branch 'main' into refactorlen
2 parents b0e77d2 + a216358 commit 1a49025

File tree

45 files changed

+775
-168
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+775
-168
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1698,15 +1698,28 @@ The AMDGPU backend supports the following LLVM IR attributes.
16981698
``amdgpu_max_num_work_groups`` CLANG attribute [CLANG-ATTR]_. Clang only
16991699
emits this attribute when all three numbers are >= 1.
17001700

1701-
"amdgpu-no-agpr" Indicates the function will not require allocating AGPRs. This is only
1702-
relevant on subtargets with AGPRs. The behavior is undefined if a
1703-
function which requires AGPRs is reached through any function marked
1704-
with this attribute.
1705-
17061701
"amdgpu-hidden-argument" This attribute is used internally by the backend to mark function arguments
17071702
as hidden. Hidden arguments are managed by the compiler and are not part of
17081703
the explicit arguments supplied by the user.
17091704

1705+
"amdgpu-agpr-alloc"="min(,max)" Indicates a minimum and maximum range for the number of AGPRs to make
1706+
available to allocate. The values will be rounded up to the next multiple
1707+
of the allocation granularity (4). The minimum value is interpreted as the
1708+
minimum required number of AGPRs for the function to allocate (that is, the
1709+
function requires no more than min registers). If only one value is specified,
1710+
it is interpreted as the minimum register budget. The maximum will restrict
1711+
allocation to use no more than max AGPRs.
1712+
1713+
The values may be ignored if satisfying them would violate other allocation
1714+
constraints.
1715+
1716+
The behavior is undefined if a function which requires more AGPRs than the
1717+
lower bound is reached through any function marked with a higher value of this
1718+
attribute. A minimum value of 0 indicates the function does not require
1719+
any AGPRs.
1720+
1721+
This is only relevant on targets with AGPRs which support accum_offset (gfx90a+).
1722+
17101723
"amdgpu-sgpr-hazard-wait" Disables SGPR hazard wait insertion if set to 0.
17111724
Exists for testing performance impact of SGPR hazard waits only.
17121725

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1235,6 +1235,8 @@ static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
12351235
return false;
12361236
}
12371237

1238+
// TODO: Migrate to range merge of amdgpu-agpr-alloc.
1239+
// FIXME: Why is this using Attribute::NoUnwind?
12381240
struct AAAMDGPUNoAGPR
12391241
: public IRAttribute<Attribute::NoUnwind,
12401242
StateWrapper<BooleanState, AbstractAttribute>,
@@ -1250,7 +1252,10 @@ struct AAAMDGPUNoAGPR
12501252

12511253
void initialize(Attributor &A) override {
12521254
Function *F = getAssociatedFunction();
1253-
if (F->hasFnAttribute("amdgpu-no-agpr"))
1255+
auto [MinNumAGPR, MaxNumAGPR] =
1256+
AMDGPU::getIntegerPairAttribute(*F, "amdgpu-agpr-alloc", {~0u, ~0u},
1257+
/*OnlyFirstRequired=*/true);
1258+
if (MinNumAGPR == 0)
12541259
indicateOptimisticFixpoint();
12551260
}
12561261

@@ -1297,7 +1302,7 @@ struct AAAMDGPUNoAGPR
12971302
return ChangeStatus::UNCHANGED;
12981303
LLVMContext &Ctx = getAssociatedFunction()->getContext();
12991304
return A.manifestAttrs(getIRPosition(),
1300-
{Attribute::get(Ctx, "amdgpu-no-agpr")});
1305+
{Attribute::get(Ctx, "amdgpu-agpr-alloc", "0")});
13011306
}
13021307

13031308
const std::string getName() const override { return "AAAMDGPUNoAGPR"; }

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -780,5 +780,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
780780
}
781781

782782
bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
783-
return !F.hasFnAttribute("amdgpu-no-agpr");
783+
auto [MinNumAGPR, MaxNumAGPR] =
784+
AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", {~0u, ~0u},
785+
/*OnlyFirstRequired=*/true);
786+
return MinNumAGPR != 0u;
784787
}

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 40 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -571,10 +571,10 @@ MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
571571

572572
std::pair<unsigned, unsigned>
573573
SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const {
574-
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
575-
unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
576-
unsigned MaxNumAGPRs = MaxNumVGPRs;
577-
unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
574+
const unsigned MaxVectorRegs = ST.getMaxNumVGPRs(MF);
575+
576+
unsigned MaxNumVGPRs = MaxVectorRegs;
577+
unsigned MaxNumAGPRs = 0;
578578

579579
// On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
580580
// a wave may have up to 512 total vector registers combining together both
@@ -585,16 +585,44 @@ SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const {
585585
// TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
586586
// register file accordingly.
587587
if (ST.hasGFX90AInsts()) {
588-
if (MFI->mayNeedAGPRs()) {
589-
MaxNumVGPRs /= 2;
590-
MaxNumAGPRs = MaxNumVGPRs;
588+
unsigned MinNumAGPRs = 0;
589+
const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
590+
const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
591+
592+
const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};
593+
594+
// TODO: Move this logic into subtarget on IR function
595+
//
596+
// TODO: The lower bound should probably force the number of required
597+
// registers up, overriding amdgpu-waves-per-eu.
598+
std::tie(MinNumAGPRs, MaxNumAGPRs) = AMDGPU::getIntegerPairAttribute(
599+
MF.getFunction(), "amdgpu-agpr-alloc", DefaultNumAGPR,
600+
/*OnlyFirstRequired=*/true);
601+
602+
if (MinNumAGPRs == DefaultNumAGPR.first) {
603+
// Default to splitting half the registers if AGPRs are required.
604+
MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
591605
} else {
592-
if (MaxNumVGPRs > TotalNumVGPRs) {
593-
MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
594-
MaxNumVGPRs = TotalNumVGPRs;
595-
} else
596-
MaxNumAGPRs = 0;
606+
// Align to accum_offset's allocation granularity.
607+
MinNumAGPRs = alignTo(MinNumAGPRs, 4);
608+
609+
MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
597610
}
611+
612+
// Clamp values to be inbounds of our limits, and ensure min <= max.
613+
614+
MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
615+
MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);
616+
617+
MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs);
618+
MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
619+
620+
assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
621+
MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs &&
622+
"invalid register counts");
623+
} else if (ST.hasMAIInsts()) {
624+
// On gfx908 the number of AGPRs always equals the number of VGPRs.
625+
MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
598626
}
599627

600628
return std::pair(MaxNumVGPRs, MaxNumAGPRs);

llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -233,8 +233,8 @@ attributes #1 = { nounwind }
233233
; AKF_HSA: attributes #[[ATTR1]] = { nounwind }
234234
;.
235235
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
236-
; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
237-
; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
236+
; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
237+
; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
238238
;.
239239
; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
240240
;.

llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers-assertion-after-ra-failure.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,6 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
1717
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #1
1818
declare noundef i32 @llvm.amdgcn.workitem.id.x() #2
1919

20-
attributes #0 = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="6,6" }
20+
attributes #0 = { "amdgpu-agpr-alloc"="0" "amdgpu-waves-per-eu"="6,6" }
2121
attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
2222
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1144,6 +1144,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() #2
11441144
attributes #0 = { "amdgpu-waves-per-eu"="6,6" }
11451145
attributes #1 = { convergent nounwind readnone willreturn }
11461146
attributes #2 = { nounwind readnone willreturn }
1147-
attributes #3 = { "amdgpu-waves-per-eu"="7,7" "amdgpu-no-agpr" }
1147+
attributes #3 = { "amdgpu-waves-per-eu"="7,7" "amdgpu-agpr-alloc"="0" }
11481148
attributes #4 = { "amdgpu-waves-per-eu"="6,6" "amdgpu-flat-work-group-size"="1024,1024" }
1149-
attributes #5 = { "amdgpu-waves-per-eu"="6,6" "amdgpu-no-agpr" }
1149+
attributes #5 = { "amdgpu-waves-per-eu"="6,6" "amdgpu-agpr-alloc"="0" }

llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -252,13 +252,13 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
252252
}
253253

254254

255-
attributes #0 = { "amdgpu-no-agpr" }
255+
attributes #0 = { "amdgpu-agpr-alloc"="0" }
256256
;.
257257
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
258-
; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
258+
; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
259259
; CHECK: attributes #[[ATTR2]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
260260
; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
261261
; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
262262
; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
263-
; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-agpr" }
263+
; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" }
264264
;.
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=CHECK,GFX908 %s
2+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s 2> %t.err | FileCheck -check-prefixes=CHECK,GFX90A %s
3+
; RUN: FileCheck --implicit-check-not=error -check-prefix=ERR < %t.err %s
4+
5+
; Test undefined behavior where a function ends up needing AGPRs that
6+
; was marked with "amdgpu-agpr-alloc"="0". There should be no asserts.
7+
8+
; TODO: Should this be an error, or let UB happen?
9+
10+
; ERR: error: <unknown>:0:0: no registers from class available to allocate in function 'kernel_illegal_agpr_use_asm'
11+
; ERR: error: <unknown>:0:0: no registers from class available to allocate in function 'func_illegal_agpr_use_asm'
12+
13+
; CHECK: {{^}}kernel_illegal_agpr_use_asm:
14+
; CHECK: ; use a0
15+
16+
; CHECK: NumVgprs: 0
17+
; CHECK: NumAgprs: 1
18+
define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
19+
call void asm sideeffect "; use $0", "a"(i32 poison)
20+
ret void
21+
}
22+
23+
; CHECK: {{^}}func_illegal_agpr_use_asm:
24+
; CHECK: ; use a0
25+
26+
; CHECK: NumVgprs: 0
27+
; CHECK: NumAgprs: 1
28+
define void @func_illegal_agpr_use_asm() #0 {
29+
call void asm sideeffect "; use $0", "a"(i32 poison)
30+
ret void
31+
}
32+
33+
; CHECK-LABEL: {{^}}kernel_calls_mfma.f32.32x32x1f32:
34+
; GFX908: v_accvgpr_write_b32
35+
; GFX90A-NOT: v_accvgpr_write_b32
36+
37+
; GFX908: NumVgprs: 5
38+
; GFX908: NumAgprs: 32
39+
; GFX90A: NumVgprs: 35
40+
; GFX90A: NumAgprs: 0
41+
42+
; GFX908: TotalNumVgprs: 32
43+
; GFX90A: TotalNumVgprs: 35
44+
define amdgpu_kernel void @kernel_calls_mfma.f32.32x32x1f32(ptr addrspace(1) %out, float %a, float %b, <32 x float> %c) #0 {
45+
%result = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
46+
store <32 x float> %result, ptr addrspace(1) %out
47+
ret void
48+
}
49+
50+
attributes #0 = { "amdgpu-agpr-alloc"="0" }

0 commit comments

Comments
 (0)