Skip to content

Commit 190dd7a

Browse files
shiltianyxsamliujayfoad
committed
[Clang][HIP][CUDA] Add __cluster_dims__ and __no_cluster__ attribute
This PR adds basic frontend support for `__cluster_dims__` and `__no_cluster__` attribute. Co-authored-by: Yaxun (Sam) Liu <[email protected]> Co-authored-by: Jay Foad <[email protected]>
1 parent fd6a2b8 commit 190dd7a

File tree

12 files changed

+338
-0
lines changed

12 files changed

+338
-0
lines changed

clang/include/clang/Basic/Attr.td

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1557,6 +1557,23 @@ def HIPManaged : InheritableAttr {
15571557
let Documentation = [HIPManagedAttrDocs];
15581558
}
15591559

1560+
def CUDAClusterDims : InheritableAttr {
1561+
let Spellings = [GNU<"cluster_dims">, Declspec<"__cluster_dims__">];
1562+
let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>];
1563+
let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
1564+
let LangOpts = [CUDA];
1565+
let Documentation = [Undocumented];
1566+
}
1567+
1568+
def CUDANoCluster : InheritableAttr {
1569+
let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">];
1570+
let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
1571+
let LangOpts = [CUDA];
1572+
let Documentation = [Undocumented];
1573+
}
1574+
1575+
def : MutualExclusions<[CUDAClusterDims, CUDANoCluster]>;
1576+
15601577
def CUDAInvalidTarget : InheritableAttr {
15611578
let Spellings = [];
15621579
let Subjects = SubjectList<[Function]>;

clang/include/clang/Basic/DiagnosticSemaKinds.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13027,6 +13027,14 @@ def warn_cuda_maxclusterrank_sm_90 : Warning<
1302713027
"maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring "
1302813028
"%1 attribute">, InGroup<IgnoredAttributes>;
1302913029

13030+
def err_cuda_cluster_attr_not_supported : Error<
13031+
"%select{__cluster_dims__|__no_cluster__}0 is not supported for this GPU architecture"
13032+
>;
13033+
13034+
def err_cuda_cluster_dims_too_large : Error<
13035+
"only a maximum of %0 thread blocks in a cluster is supported"
13036+
>;
13037+
1303013038
// VTable pointer authentication errors
1303113039
def err_non_polymorphic_vtable_pointer_auth : Error<
1303213040
"cannot set vtable pointer authentication on monomorphic type %0">;

clang/include/clang/Sema/Sema.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5002,6 +5002,14 @@ class Sema final : public SemaBase {
50025002
void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI,
50035003
Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks);
50045004

5005+
/// Add a cluster_dims attribute to a particular declaration.
5006+
CUDAClusterDimsAttr *createClusterDimsAttr(const AttributeCommonInfo &CI,
5007+
Expr *X, Expr *Y, Expr *Z);
5008+
void addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
5009+
Expr *Y, Expr *Z);
5010+
/// Add a no_cluster attribute to a particular declaration.
5011+
void addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI);
5012+
50055013
enum class RetainOwnershipKind { NS, CF, OS };
50065014

50075015
UuidAttr *mergeUuidAttr(Decl *D, const AttributeCommonInfo &CI,

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,9 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D,
342342

343343
void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
344344
const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
345+
llvm::StringMap<bool> TargetFetureMap;
346+
M.getContext().getFunctionFeatureMap(TargetFetureMap, FD);
347+
345348
const auto *ReqdWGS =
346349
M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
347350
const bool IsOpenCLKernel =
@@ -402,6 +405,29 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
402405

403406
F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
404407
}
408+
409+
if (auto *Attr = FD->getAttr<CUDAClusterDimsAttr>()) {
410+
uint32_t X =
411+
Attr->getX()->EvaluateKnownConstInt(M.getContext()).getExtValue();
412+
uint32_t Y =
413+
Attr->getY()
414+
? Attr->getY()->EvaluateKnownConstInt(M.getContext()).getExtValue()
415+
: 1;
416+
uint32_t Z =
417+
Attr->getZ()
418+
? Attr->getZ()->EvaluateKnownConstInt(M.getContext()).getExtValue()
419+
: 1;
420+
421+
llvm::SmallString<32> AttrVal;
422+
llvm::raw_svector_ostream OS(AttrVal);
423+
OS << X << ',' << Y << ',' << Z;
424+
F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
425+
}
426+
427+
// OpenCL doesn't support cluster feature.
428+
if ((IsOpenCLKernel && TargetFetureMap.lookup("gfx1250-insts")) ||
429+
FD->getAttr<CUDANoClusterAttr>())
430+
F->addFnAttr("amdgpu-cluster-dims", "0,0,0");
405431
}
406432

407433
void AMDGPUTargetCodeGenInfo::setTargetAttributes(

clang/lib/Headers/__clang_hip_runtime_wrapper.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
#define __constant__ __attribute__((constant))
2626
#define __managed__ __attribute__((managed))
2727

28+
#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
29+
2830
#if !defined(__cplusplus) || __cplusplus < 201103L
2931
#define nullptr NULL;
3032
#endif

clang/lib/Sema/SemaDeclAttr.cpp

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5654,6 +5654,130 @@ static void handleLaunchBoundsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
56545654
AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
56555655
}
56565656

5657+
static std::pair<Expr *, int>
5658+
makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
5659+
const unsigned Idx) {
5660+
if (S.DiagnoseUnexpandedParameterPack(E))
5661+
return {nullptr, 0};
5662+
5663+
// Accept template arguments for now as they depend on something else.
5664+
// We'll get to check them when they eventually get instantiated.
5665+
if (E->isValueDependent())
5666+
return {E, 1};
5667+
5668+
std::optional<llvm::APSInt> I = llvm::APSInt(64);
5669+
if (!(I = E->getIntegerConstantExpr(S.Context))) {
5670+
S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type)
5671+
<< &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange();
5672+
return {nullptr, 0};
5673+
}
5674+
// Make sure we can fit it in 4 bits.
5675+
if (!I->isIntN(4)) {
5676+
S.Diag(E->getExprLoc(), diag::err_ice_too_large)
5677+
<< toString(*I, 10, false) << 4 << /* Unsigned */ 1;
5678+
return {nullptr, 0};
5679+
}
5680+
if (*I < 0)
5681+
S.Diag(E->getExprLoc(), diag::warn_attribute_argument_n_negative)
5682+
<< &AL << Idx << E->getSourceRange();
5683+
5684+
// We may need to perform implicit conversion of the argument.
5685+
InitializedEntity Entity = InitializedEntity::InitializeParameter(
5686+
S.Context, S.Context.getConstType(S.Context.IntTy), /*consume*/ false);
5687+
ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E);
5688+
assert(!ValArg.isInvalid() &&
5689+
"Unexpected PerformCopyInitialization() failure.");
5690+
5691+
return {ValArg.getAs<Expr>(), I->getZExtValue()};
5692+
}
5693+
5694+
CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
5695+
Expr *X, Expr *Y, Expr *Z) {
5696+
CUDAClusterDimsAttr TmpAttr(Context, CI, X, Y, Z);
5697+
5698+
int ValX = 1;
5699+
int ValY = 1;
5700+
int ValZ = 1;
5701+
5702+
std::tie(X, ValX) = makeClusterDimsArgExpr(*this, X, TmpAttr, /*Idx=*/0);
5703+
if (!X)
5704+
return nullptr;
5705+
5706+
if (Y) {
5707+
std::tie(Y, ValY) = makeClusterDimsArgExpr(*this, Y, TmpAttr, /*Idx=*/1);
5708+
if (!Y)
5709+
return nullptr;
5710+
}
5711+
5712+
if (Z) {
5713+
std::tie(Z, ValZ) = makeClusterDimsArgExpr(*this, Z, TmpAttr, /*Idx=*/2);
5714+
if (!Z)
5715+
return nullptr;
5716+
}
5717+
5718+
int FlatDim = ValX * ValY * ValZ;
5719+
auto TT = (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
5720+
? Context.getAuxTargetInfo()->getTriple()
5721+
: Context.getTargetInfo().getTriple();
5722+
int MaxDim = 1;
5723+
if (TT.isNVPTX())
5724+
MaxDim = 8;
5725+
else if (TT.isAMDGPU())
5726+
MaxDim = 16;
5727+
else
5728+
return nullptr;
5729+
5730+
// A maximum of 8 thread blocks in a cluster is supported as a portable
5731+
// cluster size in CUDA. The number is 16 for AMDGPU.
5732+
if (FlatDim > MaxDim) {
5733+
Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large) << MaxDim;
5734+
return nullptr;
5735+
}
5736+
5737+
return ::new (Context) CUDAClusterDimsAttr(Context, CI, X, Y, Z);
5738+
}
5739+
5740+
void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
5741+
Expr *Y, Expr *Z) {
5742+
if (auto *Attr = createClusterDimsAttr(CI, X, Y, Z))
5743+
D->addAttr(Attr);
5744+
}
5745+
5746+
void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) {
5747+
if (CUDANoClusterAttr *Attr = ::new (Context) CUDANoClusterAttr(Context, CI))
5748+
D->addAttr(Attr);
5749+
}
5750+
5751+
static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
5752+
auto &TTI = S.Context.getTargetInfo();
5753+
auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
5754+
if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
5755+
(TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
5756+
S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 0;
5757+
return;
5758+
}
5759+
5760+
if (!AL.checkAtLeastNumArgs(S, /*Num=*/1) ||
5761+
!AL.checkAtMostNumArgs(S, /*Num=*/3))
5762+
return;
5763+
5764+
S.addClusterDimsAttr(D, AL, AL.getArgAsExpr(0),
5765+
AL.getNumArgs() > 1 ? AL.getArgAsExpr(1) : nullptr,
5766+
AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
5767+
}
5768+
5769+
static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
5770+
auto &TTI = S.Context.getTargetInfo();
5771+
auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
5772+
if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
5773+
(TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
5774+
S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 1;
5775+
return;
5776+
}
5777+
5778+
S.addNoClusterAttr(D, AL);
5779+
}
5780+
56575781
static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D,
56585782
const ParsedAttr &AL) {
56595783
if (!AL.isArgIdent(0)) {
@@ -7105,6 +7229,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
71057229
case ParsedAttr::AT_CUDALaunchBounds:
71067230
handleLaunchBoundsAttr(S, D, AL);
71077231
break;
7232+
case ParsedAttr::AT_CUDAClusterDims:
7233+
handleClusterDimsAttr(S, D, AL);
7234+
break;
7235+
case ParsedAttr::AT_CUDANoCluster:
7236+
handleNoClusterAttr(S, D, AL);
7237+
break;
71087238
case ParsedAttr::AT_Restrict:
71097239
handleRestrictAttr(S, D, AL);
71107240
break;

clang/lib/Sema/SemaTemplateInstantiateDecl.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -681,6 +681,38 @@ static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
681681
S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
682682
}
683683

684+
static void instantiateDependentCUDAClusterDimsAttr(
685+
Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
686+
const CUDAClusterDimsAttr &Attr, Decl *New) {
687+
EnterExpressionEvaluationContext Unevaluated(
688+
S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
689+
690+
Expr *XExpr = nullptr;
691+
Expr *YExpr = nullptr;
692+
Expr *ZExpr = nullptr;
693+
694+
if (Attr.getX()) {
695+
ExprResult ResultX = S.SubstExpr(Attr.getX(), TemplateArgs);
696+
if (ResultX.isUsable())
697+
XExpr = ResultX.getAs<Expr>();
698+
}
699+
700+
if (Attr.getY()) {
701+
ExprResult ResultY = S.SubstExpr(Attr.getY(), TemplateArgs);
702+
if (ResultY.isUsable())
703+
YExpr = ResultY.getAs<Expr>();
704+
}
705+
706+
if (Attr.getZ()) {
707+
ExprResult ResultZ = S.SubstExpr(Attr.getZ(), TemplateArgs);
708+
if (ResultZ.isUsable())
709+
ZExpr = ResultZ.getAs<Expr>();
710+
}
711+
712+
if (XExpr)
713+
S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
714+
}
715+
684716
// This doesn't take any template parameters, but we have a custom action that
685717
// needs to happen when the kernel itself is instantiated. We need to run the
686718
// ItaniumMangler to mark the names required to name this kernel.
@@ -883,6 +915,11 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
883915
*this, TemplateArgs, *AMDGPUMaxNumWorkGroups, New);
884916
}
885917

918+
if (const auto *CUDAClusterDims = dyn_cast<CUDAClusterDimsAttr>(TmplAttr)) {
919+
instantiateDependentCUDAClusterDimsAttr(*this, TemplateArgs,
920+
*CUDAClusterDims, New);
921+
}
922+
886923
if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) {
887924
instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr,
888925
New);

clang/test/CodeGenCUDA/Inputs/cuda.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
#endif
1414
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
1515
#define __grid_constant__ __attribute__((grid_constant))
16+
#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
17+
#define __no_cluster__ __attribute__((no_cluster))
1618
#else
1719
#define __constant__
1820
#define __device__
@@ -22,6 +24,8 @@
2224
#define __managed__
2325
#define __launch_bounds__(...)
2426
#define __grid_constant__
27+
#define __cluster_dims__(...)
28+
#define __no_cluster__
2529
#endif
2630

2731
struct dim3 {
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -emit-llvm -x hip -o - %s | FileCheck %s
2+
// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -emit-llvm -x hip -o - %s | FileCheck --check-prefix=HOST %s
3+
4+
#include "Inputs/cuda.h"
5+
6+
const int constint = 4;
7+
8+
// HOST-NOT: "amdgpu-cluster-dims"
9+
10+
// CHECK: "amdgpu-cluster-dims"="2,2,2"
11+
__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {}
12+
13+
// CHECK: "amdgpu-cluster-dims"="2,2,1"
14+
__global__ void __cluster_dims__(2, 2) test_literal_2d() {}
15+
16+
// CHECK: "amdgpu-cluster-dims"="4,1,1"
17+
__global__ void __cluster_dims__(4) test_literal_1d() {}
18+
19+
// CHECK: "amdgpu-cluster-dims"="4,2,1"
20+
__global__ void __cluster_dims__(constint, constint / 2, 1) test_constant() {}
21+
22+
// CHECK: "amdgpu-cluster-dims"="0,0,0"
23+
__global__ void __no_cluster__ test_no_cluster() {}
24+
25+
// CHECK: "amdgpu-cluster-dims"="7,1,1"
26+
template<unsigned a>
27+
__global__ void __cluster_dims__(a) test_template_1d() {}
28+
template __global__ void test_template_1d<7>();
29+
30+
// CHECK: "amdgpu-cluster-dims"="2,6,1"
31+
template<unsigned a, unsigned b>
32+
__global__ void __cluster_dims__(a, b) test_template_2d() {}
33+
template __global__ void test_template_2d<2, 6>();
34+
35+
// CHECK: "amdgpu-cluster-dims"="1,2,3"
36+
template<unsigned a, unsigned b, unsigned c>
37+
__global__ void __cluster_dims__(a, b, c) test_template_3d() {}
38+
template __global__ void test_template_3d<1, 2, 3>();

clang/test/Misc/pragma-attribute-supported-attributes-list.test

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
// CHECK-NEXT: CFUnknownTransfer (SubjectMatchRule_function)
3636
// CHECK-NEXT: CPUDispatch (SubjectMatchRule_function)
3737
// CHECK-NEXT: CPUSpecific (SubjectMatchRule_function)
38+
// CHECK-NEXT: CUDAClusterDims (SubjectMatchRule_function)
3839
// CHECK-NEXT: CUDAConstant (SubjectMatchRule_variable)
3940
// CHECK-NEXT: CUDADevice (SubjectMatchRule_function, SubjectMatchRule_variable)
4041
// CHECK-NEXT: CUDADeviceBuiltinSurfaceType (SubjectMatchRule_record)
@@ -43,6 +44,7 @@
4344
// CHECK-NEXT: CUDAGridConstant (SubjectMatchRule_variable_is_parameter)
4445
// CHECK-NEXT: CUDAHost (SubjectMatchRule_function)
4546
// CHECK-NEXT: CUDALaunchBounds (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
47+
// CHECK-NEXT: CUDANoCluster (SubjectMatchRule_function)
4648
// CHECK-NEXT: CUDAShared (SubjectMatchRule_variable)
4749
// CHECK-NEXT: CXX11NoReturn (SubjectMatchRule_function)
4850
// CHECK-NEXT: CallableWhen (SubjectMatchRule_function_is_member)

0 commit comments

Comments
 (0)