Skip to content

Commit 27b242f

Browse files
authored
[AMDGPU][Attributor] Add AAAMDGPUClusterDims (#158076)
1 parent a75f428 commit 27b242f

29 files changed

+306
-132
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1672,6 +1672,15 @@ The AMDGPU backend supports the following LLVM IR attributes.
16721672
"amdgpu-no-workgroup-id-z" The same as amdgpu-no-workitem-id-x, except for the
16731673
llvm.amdgcn.workgroup.id.z intrinsic.
16741674

1675+
"amdgpu-no-cluster-id-x" The same as amdgpu-no-workitem-id-x, except for the
1676+
llvm.amdgcn.cluster.id.x intrinsic.
1677+
1678+
"amdgpu-no-cluster-id-y" The same as amdgpu-no-workitem-id-x, except for the
1679+
llvm.amdgcn.cluster.id.y intrinsic.
1680+
1681+
"amdgpu-no-cluster-id-z" The same as amdgpu-no-workitem-id-x, except for the
1682+
llvm.amdgcn.cluster.id.z intrinsic.
1683+
16751684
"amdgpu-no-dispatch-ptr" The same as amdgpu-no-workitem-id-x, except for the
16761685
llvm.amdgcn.dispatch.ptr intrinsic.
16771686

llvm/lib/Target/AMDGPU/AMDGPUAttributes.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,5 +31,8 @@ AMDGPU_ATTRIBUTE(LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id")
3131
AMDGPU_ATTRIBUTE(DEFAULT_QUEUE, "amdgpu-no-default-queue")
3232
AMDGPU_ATTRIBUTE(COMPLETION_ACTION, "amdgpu-no-completion-action")
3333
AMDGPU_ATTRIBUTE(FLAT_SCRATCH_INIT, "amdgpu-no-flat-scratch-init")
34+
AMDGPU_ATTRIBUTE(CLUSTER_ID_X, "amdgpu-no-cluster-id-x")
35+
AMDGPU_ATTRIBUTE(CLUSTER_ID_Y, "amdgpu-no-cluster-id-y")
36+
AMDGPU_ATTRIBUTE(CLUSTER_ID_Z, "amdgpu-no-cluster-id-z")
3437

3538
#undef AMDGPU_ATTRIBUTE

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 163 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,13 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
7777
case Intrinsic::amdgcn_workgroup_id_z:
7878
case Intrinsic::r600_read_tgid_z:
7979
return WORKGROUP_ID_Z;
80+
case Intrinsic::amdgcn_cluster_id_x:
81+
NonKernelOnly = true;
82+
return CLUSTER_ID_X;
83+
case Intrinsic::amdgcn_cluster_id_y:
84+
return CLUSTER_ID_Y;
85+
case Intrinsic::amdgcn_cluster_id_z:
86+
return CLUSTER_ID_Z;
8087
case Intrinsic::amdgcn_lds_kernel_id:
8188
return LDS_KERNEL_ID;
8289
case Intrinsic::amdgcn_dispatch_ptr:
@@ -1296,6 +1303,157 @@ struct AAAMDGPUNoAGPR
12961303

12971304
const char AAAMDGPUNoAGPR::ID = 0;
12981305

1306+
/// An abstract attribute to propagate the function attribute
1307+
/// "amdgpu-cluster-dims" from kernel entry functions to device functions.
///
/// This is the position-independent interface: it wraps a BooleanState and
/// exposes the deduced value through getClusterDims(). The concrete deduction
/// logic lives in the subclass returned by createForPosition().
1308+
struct AAAMDGPUClusterDims
1309+
: public StateWrapper<BooleanState, AbstractAttribute> {
1310+
using Base = StateWrapper<BooleanState, AbstractAttribute>;
/// Forwards \p IRP to the wrapped state/attribute base; \p A is not used
/// by this constructor.
1311+
AAAMDGPUClusterDims(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1312+
1313+
/// Create an abstract attribute view for the position \p IRP.
1314+
static AAAMDGPUClusterDims &createForPosition(const IRPosition &IRP,
1315+
Attributor &A);
1316+
1317+
/// See AbstractAttribute::getName().
1318+
StringRef getName() const override { return "AAAMDGPUClusterDims"; }
1319+
1320+
/// See AbstractAttribute::getIdAddr().
1321+
const char *getIdAddr() const override { return &ID; }
1322+
1323+
/// This function should return true if the type of the \p AA is
1324+
/// AAAMDGPUClusterDims.
1325+
static bool classof(const AbstractAttribute *AA) {
1326+
return AA->getIdAddr() == &ID;
1327+
}
1328+
/// Return the cluster dims currently held for this position. Pure virtual:
/// every concrete subclass must provide the storage and the accessor.
1329+
virtual const AMDGPU::ClusterDimsAttr &getClusterDims() const = 0;
1330+
1331+
/// Unique ID (due to the unique address)
1332+
static const char ID;
1333+
};
1334+
1335+
// Out-of-line definition of the static ID member. getIdAddr() and classof()
// compare its address, so the stored value (0) itself is not consulted there.
const char AAAMDGPUClusterDims::ID = 0;
1336+
1337+
/// Function-position implementation of AAAMDGPUClusterDims.
///
/// Keeps the cluster dims of the associated function in \c Attr. The value is
/// seeded from the function itself in initialize() and then combined with the
/// values of the functions containing its call sites in updateImpl().
struct AAAMDGPUClusterDimsFunction : public AAAMDGPUClusterDims {
1338+
AAAMDGPUClusterDimsFunction(const IRPosition &IRP, Attributor &A)
1339+
: AAAMDGPUClusterDims(IRP, A) {}
1340+
/// Seed \c Attr from the associated function. For functions with an entry
/// function calling convention the state is fixed right away: pessimistic
/// when the seeded value is unknown, optimistic otherwise.
1341+
void initialize(Attributor &A) override {
1342+
Function *F = getAssociatedFunction();
1343+
assert(F && "empty associated function");
1344+
1345+
Attr = AMDGPU::ClusterDimsAttr::get(*F);
1346+
1347+
// No matter what a kernel function has, it is final.
1348+
if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
1349+
if (Attr.isUnknown())
1350+
indicatePessimisticFixpoint();
1351+
else
1352+
indicateOptimisticFixpoint();
1353+
}
1354+
}
1355+
/// Textual form of the state: "unknown" when the boolean state is no longer
/// assumed or \c Attr is unknown, "no" for no-cluster, "variable" for
/// variable dims, and Attr.to_string() otherwise. \p A is unused.
1356+
const std::string getAsStr(Attributor *A) const override {
1357+
if (!getAssumed() || Attr.isUnknown())
1358+
return "unknown";
1359+
if (Attr.isNoCluster())
1360+
return "no";
1361+
if (Attr.isVariableDims())
1362+
return "variable";
1363+
return Attr.to_string();
1364+
}
1365+
/// No statistics are tracked for this attribute.
1366+
void trackStatistics() const override {}
1367+
/// Combine the cluster dims of every calling function into \c Attr.
///
/// \returns CHANGED if \c Attr differs from its value on entry, UNCHANGED if
/// it does not, or the result of indicatePessimisticFixpoint() when the call
/// site walk fails.
1368+
ChangeStatus updateImpl(Attributor &A) override {
// Snapshot used at the end to detect whether any merge modified Attr.
1369+
auto OldState = Attr;
1370+
// Per-call-site callback: look up the same attribute on the function that
// contains the call instruction and merge its dims into ours. A missing or
// invalid caller attribute, or a failed merge, yields false.
1371+
auto CheckCallSite = [&](AbstractCallSite CS) {
1372+
const auto *CallerAA = A.getAAFor<AAAMDGPUClusterDims>(
1373+
*this, IRPosition::function(*CS.getInstruction()->getFunction()),
1374+
DepClassTy::REQUIRED);
1375+
if (!CallerAA || !CallerAA->isValidState())
1376+
return false;
1377+
1378+
return merge(CallerAA->getClusterDims());
1379+
};
1380+
1381+
bool UsedAssumedInformation = false;
// NOTE(review): a false result is taken to mean that a callback rejected a
// call site or that not all call sites could be visited -- confirm against
// Attributor::checkForAllCallSites.
1382+
if (!A.checkForAllCallSites(CheckCallSite, *this,
1383+
/*RequireAllCallSites=*/true,
1384+
UsedAssumedInformation))
1385+
return indicatePessimisticFixpoint();
1386+
1387+
return OldState == Attr ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED;
1388+
}
1389+
/// Write the deduced value to the IR as the string attribute named by
/// \c AttrName with value Attr.to_string(), replacing any existing one
/// (ForceReplace). Nothing is written when \c Attr is unknown.
1390+
ChangeStatus manifest(Attributor &A) override {
1391+
if (Attr.isUnknown())
1392+
return ChangeStatus::UNCHANGED;
1393+
return A.manifestAttrs(
1394+
getIRPosition(),
1395+
{Attribute::get(getAssociatedFunction()->getContext(), AttrName,
1396+
Attr.to_string())},
1397+
/*ForceReplace=*/true);
1398+
}
1399+
/// Return the cluster dims currently stored for the associated function.
1400+
const AMDGPU::ClusterDimsAttr &getClusterDims() const override {
1401+
return Attr;
1402+
}
1403+
1404+
private:
/// Merge \p Other (a caller's cluster dims) into \c Attr.
///
/// \returns false only when one side is no-cluster and the two values
/// differ (case 5 below), in which case \c Attr is reset to unknown; true in
/// every other case.
1405+
bool merge(const AMDGPU::ClusterDimsAttr &Other) {
1406+
// Case 1: Both of them are still unknown; we do nothing and keep waiting
1407+
// for propagation.
1408+
if (Attr.isUnknown() && Other.isUnknown())
1409+
return true;
1410+
1411+
// Case 2: The other is determined, but we are unknown yet, we simply take
1412+
// the other's value.
1413+
if (Attr.isUnknown()) {
1414+
Attr = Other;
1415+
return true;
1416+
}
1417+
1418+
// Case 3: We are determined but the other is unknown yet, we simply keep
1419+
// everything unchanged.
1420+
if (Other.isUnknown())
1421+
return true;
1422+
1423+
// After this point, both are determined.
1424+
1425+
// Case 4: If they are same, we do nothing.
1426+
if (Attr == Other)
1427+
return true;
1428+
1429+
// Now they are not same.
1430+
1431+
// Case 5: One of us is no-cluster while the other is not (if both were,
1432+
// case 4 is expected to have matched), so it is unknown whether cluster
1433+
// will be used, and the state is final, unlike case 1.
1434+
if (Attr.isNoCluster() || Other.isNoCluster()) {
1435+
Attr.setUnknown();
1436+
return false;
1437+
}
1438+
1439+
// Case 6: Both of us use cluster, but the dims are different, so the result
1440+
// is, cluster is used, but we just don't have a fixed dims.
1441+
Attr.setVariableDims();
1442+
return true;
1443+
}
1444+
// Cluster dims deduced so far for the associated function.
1445+
AMDGPU::ClusterDimsAttr Attr;
1446+
// Name of the IR string attribute written by manifest().
1447+
static constexpr const char AttrName[] = "amdgpu-cluster-dims";
1448+
};
1449+
1450+
/// Construct the concrete attribute for \p IRP using placement new on
/// \c A.Allocator. Only function positions are supported; any other position
/// kind reaches llvm_unreachable.
AAAMDGPUClusterDims &
1451+
AAAMDGPUClusterDims::createForPosition(const IRPosition &IRP, Attributor &A) {
1452+
if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1453+
return *new (A.Allocator) AAAMDGPUClusterDimsFunction(IRP, A);
1454+
llvm_unreachable("AAAMDGPUClusterDims is only valid for function position");
1455+
}
1456+
12991457
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13001458
AMDGPUAttributorOptions Options,
13011459
ThinOrFullLTOPhase LTOPhase) {
@@ -1314,7 +1472,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13141472
&AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
13151473
&AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
13161474
&AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID,
1317-
&AAIndirectCallInfo::ID});
1475+
&AAIndirectCallInfo::ID, &AAAMDGPUClusterDims::ID});
13181476

13191477
AttributorConfig AC(CGUpdater);
13201478
AC.IsClosedWorldModule = Options.IsClosedWorld;
@@ -1352,6 +1510,10 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13521510
A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
13531511
}
13541512

1513+
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
1514+
if (!F->isDeclaration() && ST.hasClusters())
1515+
A.getOrCreateAAFor<AAAMDGPUClusterDims>(IRPosition::function(*F));
1516+
13551517
for (auto &I : instructions(F)) {
13561518
Value *Ptr = nullptr;
13571519
if (auto *LI = dyn_cast<LoadInst>(&I))

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1827,7 +1827,7 @@ class ClusterDimsAttr {
18271827

18281828
bool isFixedDims() const { return getKind() == Kind::FixedDims; }
18291829

1830-
bool isVariableedDims() const { return getKind() == Kind::VariableDims; }
1830+
bool isVariableDims() const { return getKind() == Kind::VariableDims; }
18311831

18321832
void setUnknown() { *this = ClusterDimsAttr(Kind::Unknown); }
18331833

llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,6 @@ attributes #1 = { nounwind }
169169

170170
;.
171171
; HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
172-
; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
173-
; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
172+
; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
173+
; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
174174
;.

llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -254,9 +254,9 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
254254

255255
attributes #0 = { "amdgpu-agpr-alloc"="0" }
256256
;.
257-
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
258-
; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
259-
; CHECK: attributes #[[ATTR2]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
257+
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
258+
; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
259+
; CHECK: attributes #[[ATTR2:[0-9]+]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
260260
; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
261261
; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
262262
; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }

0 commit comments

Comments
 (0)