Skip to content

Commit 714faa2

Browse files
shiltianyxsamliujayfoad
committed
[Clang][HIP][CUDA] Add __cluster_dims__ and __no_cluster__ attribute
This PR adds basic frontend support for `__cluster_dims__` and `__no_cluster__` attribute. Co-authored-by: Yaxun (Sam) Liu <[email protected]> Co-authored-by: Jay Foad <[email protected]>
1 parent f756224 commit 714faa2

File tree

12 files changed

+338
-0
lines changed

12 files changed

+338
-0
lines changed

clang/include/clang/Basic/Attr.td

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1562,6 +1562,23 @@ def HIPManaged : InheritableAttr {
15621562
let Documentation = [HIPManagedAttrDocs];
15631563
}
15641564

1565+
def CUDAClusterDims : InheritableAttr {
1566+
let Spellings = [GNU<"cluster_dims">, Declspec<"__cluster_dims__">];
1567+
let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>];
1568+
let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
1569+
let LangOpts = [CUDA];
1570+
let Documentation = [Undocumented];
1571+
}
1572+
1573+
def CUDANoCluster : InheritableAttr {
1574+
let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">];
1575+
let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
1576+
let LangOpts = [CUDA];
1577+
let Documentation = [Undocumented];
1578+
}
1579+
1580+
def : MutualExclusions<[CUDAClusterDims, CUDANoCluster]>;
1581+
15651582
def CUDAInvalidTarget : InheritableAttr {
15661583
let Spellings = [];
15671584
let Subjects = SubjectList<[Function]>;

clang/include/clang/Basic/DiagnosticSemaKinds.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13027,6 +13027,14 @@ def warn_cuda_maxclusterrank_sm_90 : Warning<
1302713027
"maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring "
1302813028
"%1 attribute">, InGroup<IgnoredAttributes>;
1302913029

13030+
def err_cuda_cluster_attr_not_supported : Error<
13031+
"%select{__cluster_dims__|__no_cluster__}0 is not supported for this GPU architecture"
13032+
>;
13033+
13034+
def err_cuda_cluster_dims_too_large : Error<
13035+
"only a maximum of %0 thread blocks in a cluster is supported"
13036+
>;
13037+
1303013038
// VTable pointer authentication errors
1303113039
def err_non_polymorphic_vtable_pointer_auth : Error<
1303213040
"cannot set vtable pointer authentication on monomorphic type %0">;

clang/include/clang/Sema/Sema.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5002,6 +5002,14 @@ class Sema final : public SemaBase {
50025002
void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI,
50035003
Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks);
50045004

5005+
/// Add a cluster_dims attribute to a particular declaration.
5006+
CUDAClusterDimsAttr *createClusterDimsAttr(const AttributeCommonInfo &CI,
5007+
Expr *X, Expr *Y, Expr *Z);
5008+
void addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
5009+
Expr *Y, Expr *Z);
5010+
/// Add a no_cluster attribute to a particular declaration.
5011+
void addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI);
5012+
50055013
enum class RetainOwnershipKind { NS, CF, OS };
50065014

50075015
UuidAttr *mergeUuidAttr(Decl *D, const AttributeCommonInfo &CI,

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,9 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D,
342342

343343
void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
344344
const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
345+
llvm::StringMap<bool> TargetFetureMap;
346+
M.getContext().getFunctionFeatureMap(TargetFetureMap, FD);
347+
345348
const auto *ReqdWGS =
346349
M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
347350
const bool IsOpenCLKernel =
@@ -402,6 +405,29 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
402405

403406
F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
404407
}
408+
409+
if (auto *Attr = FD->getAttr<CUDAClusterDimsAttr>()) {
410+
uint32_t X =
411+
Attr->getX()->EvaluateKnownConstInt(M.getContext()).getExtValue();
412+
uint32_t Y =
413+
Attr->getY()
414+
? Attr->getY()->EvaluateKnownConstInt(M.getContext()).getExtValue()
415+
: 1;
416+
uint32_t Z =
417+
Attr->getZ()
418+
? Attr->getZ()->EvaluateKnownConstInt(M.getContext()).getExtValue()
419+
: 1;
420+
421+
llvm::SmallString<32> AttrVal;
422+
llvm::raw_svector_ostream OS(AttrVal);
423+
OS << X << ',' << Y << ',' << Z;
424+
F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
425+
}
426+
427+
// OpenCL doesn't support cluster feature.
428+
if ((IsOpenCLKernel && TargetFetureMap.lookup("gfx1250-insts")) ||
429+
FD->getAttr<CUDANoClusterAttr>())
430+
F->addFnAttr("amdgpu-cluster-dims", "0,0,0");
405431
}
406432

407433
void AMDGPUTargetCodeGenInfo::setTargetAttributes(

clang/lib/Headers/__clang_hip_runtime_wrapper.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
#define __constant__ __attribute__((constant))
2626
#define __managed__ __attribute__((managed))
2727

28+
#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
29+
2830
#if !defined(__cplusplus) || __cplusplus < 201103L
2931
#define nullptr NULL;
3032
#endif

clang/lib/Sema/SemaDeclAttr.cpp

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5678,6 +5678,130 @@ static void handleLaunchBoundsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
56785678
AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
56795679
}
56805680

5681+
static std::pair<Expr *, int>
5682+
makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
5683+
const unsigned Idx) {
5684+
if (S.DiagnoseUnexpandedParameterPack(E))
5685+
return {nullptr, 0};
5686+
5687+
// Accept template arguments for now as they depend on something else.
5688+
// We'll get to check them when they eventually get instantiated.
5689+
if (E->isValueDependent())
5690+
return {E, 1};
5691+
5692+
std::optional<llvm::APSInt> I = llvm::APSInt(64);
5693+
if (!(I = E->getIntegerConstantExpr(S.Context))) {
5694+
S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type)
5695+
<< &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange();
5696+
return {nullptr, 0};
5697+
}
5698+
// Make sure we can fit it in 4 bits.
5699+
if (!I->isIntN(4)) {
5700+
S.Diag(E->getExprLoc(), diag::err_ice_too_large)
5701+
<< toString(*I, 10, false) << 4 << /* Unsigned */ 1;
5702+
return {nullptr, 0};
5703+
}
5704+
if (*I < 0)
5705+
S.Diag(E->getExprLoc(), diag::warn_attribute_argument_n_negative)
5706+
<< &AL << Idx << E->getSourceRange();
5707+
5708+
// We may need to perform implicit conversion of the argument.
5709+
InitializedEntity Entity = InitializedEntity::InitializeParameter(
5710+
S.Context, S.Context.getConstType(S.Context.IntTy), /*consume*/ false);
5711+
ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E);
5712+
assert(!ValArg.isInvalid() &&
5713+
"Unexpected PerformCopyInitialization() failure.");
5714+
5715+
return {ValArg.getAs<Expr>(), I->getZExtValue()};
5716+
}
5717+
5718+
CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
5719+
Expr *X, Expr *Y, Expr *Z) {
5720+
CUDAClusterDimsAttr TmpAttr(Context, CI, X, Y, Z);
5721+
5722+
int ValX = 1;
5723+
int ValY = 1;
5724+
int ValZ = 1;
5725+
5726+
std::tie(X, ValX) = makeClusterDimsArgExpr(*this, X, TmpAttr, /*Idx=*/0);
5727+
if (!X)
5728+
return nullptr;
5729+
5730+
if (Y) {
5731+
std::tie(Y, ValY) = makeClusterDimsArgExpr(*this, Y, TmpAttr, /*Idx=*/1);
5732+
if (!Y)
5733+
return nullptr;
5734+
}
5735+
5736+
if (Z) {
5737+
std::tie(Z, ValZ) = makeClusterDimsArgExpr(*this, Z, TmpAttr, /*Idx=*/2);
5738+
if (!Z)
5739+
return nullptr;
5740+
}
5741+
5742+
int FlatDim = ValX * ValY * ValZ;
5743+
auto TT = (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
5744+
? Context.getAuxTargetInfo()->getTriple()
5745+
: Context.getTargetInfo().getTriple();
5746+
int MaxDim = 1;
5747+
if (TT.isNVPTX())
5748+
MaxDim = 8;
5749+
else if (TT.isAMDGPU())
5750+
MaxDim = 16;
5751+
else
5752+
return nullptr;
5753+
5754+
// A maximum of 8 thread blocks in a cluster is supported as a portable
5755+
// cluster size in CUDA. The number is 16 for AMDGPU.
5756+
if (FlatDim > MaxDim) {
5757+
Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large) << MaxDim;
5758+
return nullptr;
5759+
}
5760+
5761+
return ::new (Context) CUDAClusterDimsAttr(Context, CI, X, Y, Z);
5762+
}
5763+
5764+
void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
5765+
Expr *Y, Expr *Z) {
5766+
if (auto *Attr = createClusterDimsAttr(CI, X, Y, Z))
5767+
D->addAttr(Attr);
5768+
}
5769+
5770+
void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) {
5771+
if (CUDANoClusterAttr *Attr = ::new (Context) CUDANoClusterAttr(Context, CI))
5772+
D->addAttr(Attr);
5773+
}
5774+
5775+
static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
5776+
auto &TTI = S.Context.getTargetInfo();
5777+
auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
5778+
if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
5779+
(TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
5780+
S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 0;
5781+
return;
5782+
}
5783+
5784+
if (!AL.checkAtLeastNumArgs(S, /*Num=*/1) ||
5785+
!AL.checkAtMostNumArgs(S, /*Num=*/3))
5786+
return;
5787+
5788+
S.addClusterDimsAttr(D, AL, AL.getArgAsExpr(0),
5789+
AL.getNumArgs() > 1 ? AL.getArgAsExpr(1) : nullptr,
5790+
AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
5791+
}
5792+
5793+
static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
5794+
auto &TTI = S.Context.getTargetInfo();
5795+
auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
5796+
if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
5797+
(TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
5798+
S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 1;
5799+
return;
5800+
}
5801+
5802+
S.addNoClusterAttr(D, AL);
5803+
}
5804+
56815805
static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D,
56825806
const ParsedAttr &AL) {
56835807
if (!AL.isArgIdent(0)) {
@@ -7129,6 +7253,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
71297253
case ParsedAttr::AT_CUDALaunchBounds:
71307254
handleLaunchBoundsAttr(S, D, AL);
71317255
break;
7256+
case ParsedAttr::AT_CUDAClusterDims:
7257+
handleClusterDimsAttr(S, D, AL);
7258+
break;
7259+
case ParsedAttr::AT_CUDANoCluster:
7260+
handleNoClusterAttr(S, D, AL);
7261+
break;
71327262
case ParsedAttr::AT_Restrict:
71337263
handleRestrictAttr(S, D, AL);
71347264
break;

clang/lib/Sema/SemaTemplateInstantiateDecl.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -707,6 +707,38 @@ static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
707707
S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
708708
}
709709

710+
static void instantiateDependentCUDAClusterDimsAttr(
711+
Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
712+
const CUDAClusterDimsAttr &Attr, Decl *New) {
713+
EnterExpressionEvaluationContext Unevaluated(
714+
S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
715+
716+
Expr *XExpr = nullptr;
717+
Expr *YExpr = nullptr;
718+
Expr *ZExpr = nullptr;
719+
720+
if (Attr.getX()) {
721+
ExprResult ResultX = S.SubstExpr(Attr.getX(), TemplateArgs);
722+
if (ResultX.isUsable())
723+
XExpr = ResultX.getAs<Expr>();
724+
}
725+
726+
if (Attr.getY()) {
727+
ExprResult ResultY = S.SubstExpr(Attr.getY(), TemplateArgs);
728+
if (ResultY.isUsable())
729+
YExpr = ResultY.getAs<Expr>();
730+
}
731+
732+
if (Attr.getZ()) {
733+
ExprResult ResultZ = S.SubstExpr(Attr.getZ(), TemplateArgs);
734+
if (ResultZ.isUsable())
735+
ZExpr = ResultZ.getAs<Expr>();
736+
}
737+
738+
if (XExpr)
739+
S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
740+
}
741+
710742
// This doesn't take any template parameters, but we have a custom action that
711743
// needs to happen when the kernel itself is instantiated. We need to run the
712744
// ItaniumMangler to mark the names required to name this kernel.
@@ -921,6 +953,11 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
921953
*this, TemplateArgs, *AMDGPUMaxNumWorkGroups, New);
922954
}
923955

956+
if (const auto *CUDAClusterDims = dyn_cast<CUDAClusterDimsAttr>(TmplAttr)) {
957+
instantiateDependentCUDAClusterDimsAttr(*this, TemplateArgs,
958+
*CUDAClusterDims, New);
959+
}
960+
924961
if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) {
925962
instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr,
926963
New);

clang/test/CodeGenCUDA/Inputs/cuda.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
#endif
1414
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
1515
#define __grid_constant__ __attribute__((grid_constant))
16+
#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
17+
#define __no_cluster__ __attribute__((no_cluster))
1618
#else
1719
#define __constant__
1820
#define __device__
@@ -22,6 +24,8 @@
2224
#define __managed__
2325
#define __launch_bounds__(...)
2426
#define __grid_constant__
27+
#define __cluster_dims__(...)
28+
#define __no_cluster__
2529
#endif
2630

2731
struct dim3 {
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -emit-llvm -x hip -o - %s | FileCheck %s
2+
// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -emit-llvm -x hip -o - %s | FileCheck --check-prefix=HOST %s
3+
4+
#include "Inputs/cuda.h"
5+
6+
const int constint = 4;
7+
8+
// HOST-NOT: "amdgpu-cluster-dims"
9+
10+
// CHECK: "amdgpu-cluster-dims"="2,2,2"
11+
__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {}
12+
13+
// CHECK: "amdgpu-cluster-dims"="2,2,1"
14+
__global__ void __cluster_dims__(2, 2) test_literal_2d() {}
15+
16+
// CHECK: "amdgpu-cluster-dims"="4,1,1"
17+
__global__ void __cluster_dims__(4) test_literal_1d() {}
18+
19+
// CHECK: "amdgpu-cluster-dims"="4,2,1"
20+
__global__ void __cluster_dims__(constint, constint / 2, 1) test_constant() {}
21+
22+
// CHECK: "amdgpu-cluster-dims"="0,0,0"
23+
__global__ void __no_cluster__ test_no_cluster() {}
24+
25+
// CHECK: "amdgpu-cluster-dims"="7,1,1"
26+
template<unsigned a>
27+
__global__ void __cluster_dims__(a) test_template_1d() {}
28+
template __global__ void test_template_1d<7>();
29+
30+
// CHECK: "amdgpu-cluster-dims"="2,6,1"
31+
template<unsigned a, unsigned b>
32+
__global__ void __cluster_dims__(a, b) test_template_2d() {}
33+
template __global__ void test_template_2d<2, 6>();
34+
35+
// CHECK: "amdgpu-cluster-dims"="1,2,3"
36+
template<unsigned a, unsigned b, unsigned c>
37+
__global__ void __cluster_dims__(a, b, c) test_template_3d() {}
38+
template __global__ void test_template_3d<1, 2, 3>();

clang/test/Misc/pragma-attribute-supported-attributes-list.test

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
// CHECK-NEXT: CFUnknownTransfer (SubjectMatchRule_function)
3636
// CHECK-NEXT: CPUDispatch (SubjectMatchRule_function)
3737
// CHECK-NEXT: CPUSpecific (SubjectMatchRule_function)
38+
// CHECK-NEXT: CUDAClusterDims (SubjectMatchRule_function)
3839
// CHECK-NEXT: CUDAConstant (SubjectMatchRule_variable)
3940
// CHECK-NEXT: CUDADevice (SubjectMatchRule_function, SubjectMatchRule_variable)
4041
// CHECK-NEXT: CUDADeviceBuiltinSurfaceType (SubjectMatchRule_record)
@@ -43,6 +44,7 @@
4344
// CHECK-NEXT: CUDAGridConstant (SubjectMatchRule_variable_is_parameter)
4445
// CHECK-NEXT: CUDAHost (SubjectMatchRule_function)
4546
// CHECK-NEXT: CUDALaunchBounds (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
47+
// CHECK-NEXT: CUDANoCluster (SubjectMatchRule_function)
4648
// CHECK-NEXT: CUDAShared (SubjectMatchRule_variable)
4749
// CHECK-NEXT: CXX11NoReturn (SubjectMatchRule_function)
4850
// CHECK-NEXT: CallableWhen (SubjectMatchRule_function_is_member)

0 commit comments

Comments
 (0)