Skip to content

Commit bde0a8c

Browse files
shiltian, yxsamliu, jayfoad
authored
[Clang][HIP][CUDA] Add __cluster_dims__ and __no_cluster__ attribute (#156686)
This PR adds basic frontend support for `__cluster_dims__` and `__no_cluster__` attribute. In CUDA/HIP programming, the ``__cluster_dims__`` attribute can be applied to a kernel function to set the dimensions of a thread block cluster. The ``__no_cluster__`` attribute can be applied to a kernel function to indicate that the thread block cluster feature will not be enabled at both compile time and kernel launch time. Note that `__no_cluster__` is a LLVM/Clang only attribute. Co-authored-by: Yaxun (Sam) Liu <[email protected]> Co-authored-by: Jay Foad <[email protected]>
1 parent 9975600 commit bde0a8c

File tree

13 files changed

+338
-0
lines changed

13 files changed

+338
-0
lines changed

clang/include/clang/Basic/Attr.td

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1572,6 +1572,23 @@ def HIPManaged : InheritableAttr {
15721572
let Documentation = [HIPManagedAttrDocs];
15731573
}
15741574

1575+
def CUDAClusterDims : InheritableAttr {
1576+
let Spellings = [GNU<"cluster_dims">];
1577+
let Args = [ExprArgument<"X">, ExprArgument<"Y", /*opt=*/1>, ExprArgument<"Z", /*opt=*/1>];
1578+
let Subjects = SubjectList<[ObjCMethod, FunctionLike]>;
1579+
let LangOpts = [CUDA];
1580+
let Documentation = [CUDAClusterDimsAttrDoc];
1581+
}
1582+
1583+
def CUDANoCluster : InheritableAttr {
1584+
let Spellings = [GNU<"no_cluster">];
1585+
let Subjects = SubjectList<[ObjCMethod, FunctionLike]>;
1586+
let LangOpts = [CUDA];
1587+
let Documentation = [CUDANoClusterAttrDoc];
1588+
}
1589+
1590+
def : MutualExclusions<[CUDAClusterDims, CUDANoCluster]>;
1591+
15751592
def CUDAInvalidTarget : InheritableAttr {
15761593
let Spellings = [];
15771594
let Subjects = SubjectList<[Function]>;

clang/include/clang/Basic/AttrDocs.td

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7545,6 +7545,45 @@ A managed variable can be accessed in both device and host code.
75457545
}];
75467546
}
75477547

7548+
def CUDAClusterDimsAttrDoc : Documentation {
7549+
let Category = DocCatDecl;
7550+
let Content = [{
7551+
In CUDA/HIP programming, the ``cluster_dims`` attribute, conventionally exposed as the
7552+
``__cluster_dims__`` macro, can be applied to a kernel function to set the dimensions of a
7553+
thread block cluster, which is an optional level of hierarchy and made up of thread blocks.
7554+
``__cluster_dims__`` defines the cluster size as ``(X, Y, Z)``, where each value is the number
7555+
of thread blocks in that dimension. The ``cluster_dims`` and ``no_cluster`` attributes are
7556+
mutually exclusive.
7557+
7558+
.. code::
7559+
7560+
__global__ __cluster_dims__(2, 1, 1) void kernel(...) {
7561+
...
7562+
}
7563+
7564+
}];
7565+
}
7566+
7567+
def CUDANoClusterAttrDoc : Documentation {
7568+
let Category = DocCatDecl;
7569+
let Content = [{
7570+
In CUDA/HIP programming, a kernel function can still be launched with the cluster feature enabled
7571+
at runtime, even without being annotated with ``__cluster_dims__``. The LLVM/Clang-exclusive
7572+
``no_cluster`` attribute, conventionally exposed as the ``__no_cluster__`` macro, can be applied to
7573+
a kernel function to explicitly indicate that the cluster feature will not be enabled either at
7574+
compile time or at kernel launch time. This allows the compiler to apply certain optimizations
7575+
without assuming that clustering could be enabled at runtime. It is undefined behavior to launch a
7576+
kernel annotated with ``__no_cluster__`` if the cluster feature is enabled at runtime.
7577+
The ``cluster_dims`` and ``no_cluster`` attributes are mutually exclusive.
7578+
7579+
.. code::
7580+
7581+
__global__ __no_cluster__ void kernel(...) {
7582+
...
7583+
}
7584+
}];
7585+
}
7586+
75487587
def LifetimeOwnerDocs : Documentation {
75497588
let Category = DocCatDecl;
75507589
let Content = [{

clang/include/clang/Basic/DiagnosticSemaKinds.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13070,6 +13070,12 @@ def warn_cuda_maxclusterrank_sm_90 : Warning<
1307013070
"maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring "
1307113071
"%1 attribute">, InGroup<IgnoredAttributes>;
1307213072

13073+
def err_cluster_attr_not_supported : Error<
13074+
"%0 is not supported for this GPU architecture">;
13075+
13076+
def err_cluster_dims_too_large : Error<
13077+
"cluster does not support more than %0 thread blocks; %1 provided">;
13078+
1307313079
// VTable pointer authentication errors
1307413080
def err_non_polymorphic_vtable_pointer_auth : Error<
1307513081
"cannot set vtable pointer authentication on monomorphic type %0">;

clang/include/clang/Sema/Sema.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5010,6 +5010,14 @@ class Sema final : public SemaBase {
50105010
void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI,
50115011
Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks);
50125012

5013+
/// Add a cluster_dims attribute to a particular declaration.
5014+
CUDAClusterDimsAttr *createClusterDimsAttr(const AttributeCommonInfo &CI,
5015+
Expr *X, Expr *Y, Expr *Z);
5016+
void addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
5017+
Expr *Y, Expr *Z);
5018+
/// Add a no_cluster attribute to a particular declaration.
5019+
void addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI);
5020+
50135021
enum class RetainOwnershipKind { NS, CF, OS };
50145022

50155023
UuidAttr *mergeUuidAttr(Decl *D, const AttributeCommonInfo &CI,

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -402,6 +402,26 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
402402

403403
F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
404404
}
405+
406+
if (auto *Attr = FD->getAttr<CUDAClusterDimsAttr>()) {
407+
auto GetExprVal = [&](const auto &E) {
408+
return E ? E->EvaluateKnownConstInt(M.getContext()).getExtValue() : 1;
409+
};
410+
unsigned X = GetExprVal(Attr->getX());
411+
unsigned Y = GetExprVal(Attr->getY());
412+
unsigned Z = GetExprVal(Attr->getZ());
413+
llvm::SmallString<32> AttrVal;
414+
llvm::raw_svector_ostream OS(AttrVal);
415+
OS << X << ',' << Y << ',' << Z;
416+
F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
417+
}
418+
419+
// OpenCL doesn't support cluster feature.
420+
const TargetInfo &TTI = M.getContext().getTargetInfo();
421+
if ((IsOpenCLKernel &&
422+
TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters")) ||
423+
FD->hasAttr<CUDANoClusterAttr>())
424+
F->addFnAttr("amdgpu-cluster-dims", "0,0,0");
405425
}
406426

407427
void AMDGPUTargetCodeGenInfo::setTargetAttributes(

clang/lib/Headers/__clang_hip_runtime_wrapper.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
#define __constant__ __attribute__((constant))
2626
#define __managed__ __attribute__((managed))
2727

28+
#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
29+
2830
#if !defined(__cplusplus) || __cplusplus < 201103L
2931
#define nullptr NULL;
3032
#endif

clang/lib/Sema/SemaDeclAttr.cpp

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5676,6 +5676,114 @@ static void handleLaunchBoundsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
56765676
AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
56775677
}
56785678

5679+
static std::pair<Expr *, int>
5680+
makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
5681+
const unsigned Idx) {
5682+
if (!E || S.DiagnoseUnexpandedParameterPack(E))
5683+
return {};
5684+
5685+
// Accept template arguments for now as they depend on something else.
5686+
// We'll get to check them when they eventually get instantiated.
5687+
if (E->isInstantiationDependent())
5688+
return {E, 1};
5689+
5690+
std::optional<llvm::APSInt> I = E->getIntegerConstantExpr(S.Context);
5691+
if (!I) {
5692+
S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type)
5693+
<< &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange();
5694+
return {};
5695+
}
5696+
// Make sure we can fit it in 4 bits.
5697+
if (!I->isIntN(4)) {
5698+
S.Diag(E->getExprLoc(), diag::err_ice_too_large)
5699+
<< toString(*I, 10, false) << 4 << /*Unsigned=*/1;
5700+
return {};
5701+
}
5702+
if (*I < 0) {
5703+
S.Diag(E->getExprLoc(), diag::warn_attribute_argument_n_negative)
5704+
<< &AL << Idx << E->getSourceRange();
5705+
}
5706+
5707+
return {ConstantExpr::Create(S.getASTContext(), E, APValue(*I)),
5708+
I->getZExtValue()};
5709+
}
5710+
5711+
CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
5712+
Expr *X, Expr *Y, Expr *Z) {
5713+
CUDAClusterDimsAttr TmpAttr(Context, CI, X, Y, Z);
5714+
5715+
auto [NewX, ValX] = makeClusterDimsArgExpr(*this, X, TmpAttr, /*Idx=*/0);
5716+
auto [NewY, ValY] = makeClusterDimsArgExpr(*this, Y, TmpAttr, /*Idx=*/1);
5717+
auto [NewZ, ValZ] = makeClusterDimsArgExpr(*this, Z, TmpAttr, /*Idx=*/2);
5718+
5719+
if (!NewX || (Y && !NewY) || (Z && !NewZ))
5720+
return nullptr;
5721+
5722+
int FlatDim = ValX * ValY * ValZ;
5723+
const llvm::Triple TT =
5724+
(!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
5725+
? Context.getAuxTargetInfo()->getTriple()
5726+
: Context.getTargetInfo().getTriple();
5727+
int MaxDim = 1;
5728+
if (TT.isNVPTX())
5729+
MaxDim = 8;
5730+
else if (TT.isAMDGPU())
5731+
MaxDim = 16;
5732+
else
5733+
return nullptr;
5734+
5735+
// A maximum of 8 thread blocks in a cluster is supported as a portable
5736+
// cluster size in CUDA. The number is 16 for AMDGPU.
5737+
if (FlatDim > MaxDim) {
5738+
Diag(CI.getLoc(), diag::err_cluster_dims_too_large) << MaxDim << FlatDim;
5739+
return nullptr;
5740+
}
5741+
5742+
return CUDAClusterDimsAttr::Create(Context, NewX, NewY, NewZ, CI);
5743+
}
5744+
5745+
void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
5746+
Expr *Y, Expr *Z) {
5747+
if (auto *Attr = createClusterDimsAttr(CI, X, Y, Z))
5748+
D->addAttr(Attr);
5749+
}
5750+
5751+
void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) {
5752+
D->addAttr(CUDANoClusterAttr::Create(Context, CI));
5753+
}
5754+
5755+
static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
5756+
const TargetInfo &TTI = S.Context.getTargetInfo();
5757+
OffloadArch Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
5758+
if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
5759+
(TTI.getTriple().isAMDGPU() &&
5760+
!TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) {
5761+
S.Diag(AL.getLoc(), diag::err_cluster_attr_not_supported) << AL;
5762+
return;
5763+
}
5764+
5765+
if (!AL.checkAtLeastNumArgs(S, /*Num=*/1) ||
5766+
!AL.checkAtMostNumArgs(S, /*Num=*/3))
5767+
return;
5768+
5769+
S.addClusterDimsAttr(D, AL, AL.getArgAsExpr(0),
5770+
AL.getNumArgs() > 1 ? AL.getArgAsExpr(1) : nullptr,
5771+
AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
5772+
}
5773+
5774+
static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
5775+
const TargetInfo &TTI = S.Context.getTargetInfo();
5776+
OffloadArch Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
5777+
if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
5778+
(TTI.getTriple().isAMDGPU() &&
5779+
!TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) {
5780+
S.Diag(AL.getLoc(), diag::err_cluster_attr_not_supported) << AL;
5781+
return;
5782+
}
5783+
5784+
S.addNoClusterAttr(D, AL);
5785+
}
5786+
56795787
static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D,
56805788
const ParsedAttr &AL) {
56815789
if (!AL.isArgIdent(0)) {
@@ -7141,6 +7249,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
71417249
case ParsedAttr::AT_CUDALaunchBounds:
71427250
handleLaunchBoundsAttr(S, D, AL);
71437251
break;
7252+
case ParsedAttr::AT_CUDAClusterDims:
7253+
handleClusterDimsAttr(S, D, AL);
7254+
break;
7255+
case ParsedAttr::AT_CUDANoCluster:
7256+
handleNoClusterAttr(S, D, AL);
7257+
break;
71447258
case ParsedAttr::AT_Restrict:
71457259
handleRestrictAttr(S, D, AL);
71467260
break;

clang/lib/Sema/SemaTemplateInstantiateDecl.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -707,6 +707,23 @@ static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
707707
S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
708708
}
709709

710+
static void instantiateDependentCUDAClusterDimsAttr(
711+
Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
712+
const CUDAClusterDimsAttr &Attr, Decl *New) {
713+
EnterExpressionEvaluationContext Unevaluated(
714+
S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
715+
716+
auto SubstElt = [&S, &TemplateArgs](Expr *E) {
717+
return E ? S.SubstExpr(E, TemplateArgs).get() : nullptr;
718+
};
719+
720+
Expr *XExpr = SubstElt(Attr.getX());
721+
Expr *YExpr = SubstElt(Attr.getY());
722+
Expr *ZExpr = SubstElt(Attr.getZ());
723+
724+
S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
725+
}
726+
710727
// This doesn't take any template parameters, but we have a custom action that
711728
// needs to happen when the kernel itself is instantiated. We need to run the
712729
// ItaniumMangler to mark the names required to name this kernel.
@@ -929,6 +946,11 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
929946
*this, TemplateArgs, *AMDGPUMaxNumWorkGroups, New);
930947
}
931948

949+
if (const auto *CUDAClusterDims = dyn_cast<CUDAClusterDimsAttr>(TmplAttr)) {
950+
instantiateDependentCUDAClusterDimsAttr(*this, TemplateArgs,
951+
*CUDAClusterDims, New);
952+
}
953+
932954
if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) {
933955
instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr,
934956
Tmpl, New);

clang/test/CodeGenCUDA/Inputs/cuda.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
#endif
1414
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
1515
#define __grid_constant__ __attribute__((grid_constant))
16+
#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
17+
#define __no_cluster__ __attribute__((no_cluster))
1618
#else
1719
#define __constant__
1820
#define __device__
@@ -22,6 +24,8 @@
2224
#define __managed__
2325
#define __launch_bounds__(...)
2426
#define __grid_constant__
27+
#define __cluster_dims__(...)
28+
#define __no_cluster__
2529
#endif
2630

2731
struct dim3 {
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -emit-llvm -x hip -o - %s | FileCheck %s
2+
// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -emit-llvm -x hip -o - %s | FileCheck --check-prefix=HOST %s
3+
4+
#include "Inputs/cuda.h"
5+
6+
const int constint = 4;
7+
8+
// HOST-NOT: "amdgpu-cluster-dims"
9+
10+
// CHECK: "amdgpu-cluster-dims"="2,2,2"
11+
__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {}
12+
13+
// CHECK: "amdgpu-cluster-dims"="2,2,1"
14+
__global__ void __cluster_dims__(2, 2) test_literal_2d() {}
15+
16+
// CHECK: "amdgpu-cluster-dims"="4,1,1"
17+
__global__ void __cluster_dims__(4) test_literal_1d() {}
18+
19+
// CHECK: "amdgpu-cluster-dims"="4,2,1"
20+
__global__ void __cluster_dims__(constint, constint / 2, 1) test_constant() {}
21+
22+
// CHECK: "amdgpu-cluster-dims"="0,0,0"
23+
__global__ void __no_cluster__ test_no_cluster() {}
24+
25+
// CHECK: "amdgpu-cluster-dims"="7,1,1"
26+
template<unsigned a>
27+
__global__ void __cluster_dims__(a) test_template_1d() {}
28+
template __global__ void test_template_1d<7>();
29+
30+
// CHECK: "amdgpu-cluster-dims"="2,6,1"
31+
template<unsigned a, unsigned b>
32+
__global__ void __cluster_dims__(a, b) test_template_2d() {}
33+
template __global__ void test_template_2d<2, 6>();
34+
35+
// CHECK: "amdgpu-cluster-dims"="1,2,3"
36+
template<unsigned a, unsigned b, unsigned c>
37+
__global__ void __cluster_dims__(a, b, c) test_template_3d() {}
38+
template __global__ void test_template_3d<1, 2, 3>();

0 commit comments

Comments
 (0)