Commit 0a6d767

refine target feature lookup; fix comments
1 parent 120ca45

7 files changed: 31 additions & 25 deletions

clang/include/clang/Basic/Attr.td

Lines changed: 2 additions & 2 deletions

@@ -1573,15 +1573,15 @@ def HIPManaged : InheritableAttr {
 }

 def CUDAClusterDims : InheritableAttr {
-  let Spellings = [GNU<"cluster_dims">, Declspec<"cluster_dims">];
+  let Spellings = [GNU<"cluster_dims">];
   let Args = [ExprArgument<"X">, ExprArgument<"Y", /*opt=*/1>, ExprArgument<"Z", /*opt=*/1>];
   let Subjects = SubjectList<[ObjCMethod, FunctionLike]>;
   let LangOpts = [CUDA];
   let Documentation = [CUDAClusterDimsAttrDoc];
 }

 def CUDANoCluster : InheritableAttr {
-  let Spellings = [GNU<"no_cluster">, Declspec<"no_cluster">];
+  let Spellings = [GNU<"no_cluster">];
   let Subjects = SubjectList<[ObjCMethod, FunctionLike]>;
   let LangOpts = [CUDA];
   let Documentation = [CUDANoClusterAttrDoc];

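With the Declspec spellings removed, both attributes are accepted only in GNU form. A minimal sketch of what this means at the source level (kernel names hypothetical):

  // Still accepted: GNU spelling.
  __attribute__((cluster_dims(2, 2, 1))) __global__ void k1();

  // No longer recognized after this change: Declspec spelling.
  __declspec(cluster_dims(2, 2, 1)) __global__ void k2();
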
clang/include/clang/Basic/AttrDocs.td

Lines changed: 4 additions & 3 deletions

@@ -7549,9 +7549,10 @@ def CUDAClusterDimsAttrDoc : Documentation {
   let Category = DocCatDecl;
   let Content = [{
 In CUDA/HIP programming, the ``__cluster_dims__`` attribute can be applied to a kernel function
-to set the dimensions of a thread block cluster. ``__cluster_dims__`` defines the cluster size
-as ``(X, Y, Z)``, where each value is the number of thread blocks in that dimension.
-The ``__cluster_dims__`` and `__no_cluster__`` attributes are mutually exclusive.
+to set the dimensions of a thread block cluster, an optional level of the thread hierarchy made
+up of thread blocks. ``__cluster_dims__`` defines the cluster size as ``(X, Y, Z)``, where each
+value is the number of thread blocks in that dimension. The ``__cluster_dims__`` and
+``__no_cluster__`` attributes are mutually exclusive.

 .. code::

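The documentation block above leads into a code example; a minimal sketch of the documented usage, based on the attribute spellings exercised in the test file later in this commit (kernel names hypothetical):

  // A 2 x 2 x 1 cluster: four thread blocks per cluster.
  __global__ void __cluster_dims__(2, 2, 1) kernel_with_clusters() {}

  // Opt this kernel out of clustering entirely.
  __global__ void __no_cluster__ kernel_without_clusters() {}
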
clang/include/clang/Basic/DiagnosticSemaKinds.td

Lines changed: 1 addition & 1 deletion

@@ -13075,7 +13075,7 @@ def err_cuda_cluster_attr_not_supported : Error<
 >;

 def err_cuda_cluster_dims_too_large : Error<
-  "only a maximum of %0 thread blocks in a cluster is supported"
+  "cluster does not support more than %0 thread blocks; %1 provided"
 >;

 // VTable pointer authentication errors
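
With the reworded diagnostic, the message now carries both the target's limit (%0) and the flat cluster size that was requested (%1). For example, per the test updates later in this commit:

  // FlatDim = 4 * 4 * 4 = 64 exceeds the limit of 8 on NVPTX (16 on AMDGPU):
  // error: cluster does not support more than 8 thread blocks; 64 provided
  __global__ void __cluster_dims__(4, 4, 4) too_large() {}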

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 5 additions & 3 deletions

@@ -415,13 +415,15 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
     unsigned Z = GetExprVal(Attr->getZ());
     llvm::SmallString<32> AttrVal;
     llvm::raw_svector_ostream OS(AttrVal);
-    OS << X << ", " << Y << ", " << Z;
+    OS << X << ',' << Y << ',' << Z;
     F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
   }

   // OpenCL doesn't support cluster feature.
-  if ((IsOpenCLKernel && TargetFetureMap.lookup("gfx1250-insts")) ||
-      FD->getAttr<CUDANoClusterAttr>())
+  const TargetInfo &TTI = M.getContext().getTargetInfo();
+  if ((IsOpenCLKernel &&
+       TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters")) ||
+      FD->hasAttr<CUDANoClusterAttr>())
     F->addFnAttr("amdgpu-cluster-dims", "0,0,0");
 }

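Two details worth noting here: the separator change from ", " to ',' keeps the emitted value (e.g. "2,2,1") consistent with the "0,0,0" string used below, and the cluster check now queries the "clusters" target feature rather than "gfx1250-insts". A minimal sketch of the lookup pattern used above, assuming an ASTContext named Ctx is in scope:

  // True when the "clusters" target feature is enabled for this compilation.
  const clang::TargetInfo &TI = Ctx.getTargetInfo();
  bool HasClusters =
      TI.hasFeatureEnabled(TI.getTargetOpts().FeatureMap, "clusters");
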
clang/lib/Sema/SemaDeclAttr.cpp

Lines changed: 14 additions & 10 deletions

@@ -5733,9 +5733,10 @@ CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
   }

   int FlatDim = ValX * ValY * ValZ;
-  auto TT = (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
-                ? Context.getAuxTargetInfo()->getTriple()
-                : Context.getTargetInfo().getTriple();
+  const llvm::Triple TT =
+      (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
+          ? Context.getAuxTargetInfo()->getTriple()
+          : Context.getTargetInfo().getTriple();
   int MaxDim = 1;
   if (TT.isNVPTX())
     MaxDim = 8;
@@ -5747,7 +5748,8 @@ CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
   // A maximum of 8 thread blocks in a cluster is supported as a portable
   // cluster size in CUDA. The number is 16 for AMDGPU.
   if (FlatDim > MaxDim) {
-    Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large) << MaxDim;
+    Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large)
+        << MaxDim << FlatDim;
     return nullptr;
   }

@@ -5765,10 +5767,11 @@ void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) {
 }

 static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
-  auto &TTI = S.Context.getTargetInfo();
-  auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+  const TargetInfo &TTI = S.Context.getTargetInfo();
+  OffloadArch Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
   if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
-      (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+      (TTI.getTriple().isAMDGPU() &&
+       !TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) {
     S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported)
         << "__cluster_dims__";
     return;
@@ -5784,10 +5787,11 @@ static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
 }

 static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
-  auto &TTI = S.Context.getTargetInfo();
-  auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+  const TargetInfo &TTI = S.Context.getTargetInfo();
+  OffloadArch Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
   if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
-      (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+      (TTI.getTriple().isAMDGPU() &&
+       !TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) {
     S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported)
         << "__no_cluster__";
     return;
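
Net effect in Sema: NVPTX still gates the attributes on SM_90 or newer, while AMDGPU now consults the "clusters" target feature rather than comparing the offload arch against GFX1250. A sketch of source that trips the gate on an unsupported target (kernel name hypothetical):

  // On a GPU without cluster support:
  // error: __cluster_dims__ is not supported for this GPU architecture
  __global__ void __cluster_dims__(2, 1, 1) gated_kernel() {}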

clang/lib/Sema/SemaTemplateInstantiateDecl.cpp

Lines changed: 1 addition & 2 deletions

@@ -735,8 +735,7 @@ static void instantiateDependentCUDAClusterDimsAttr(
     ZExpr = ResultZ.get();
   }

-  if (XExpr)
-    S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
+  S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
 }

 // This doesn't take any template parameters, but we have a custom action that

clang/test/SemaCUDA/cluster_dims.cu

Lines changed: 4 additions & 4 deletions

@@ -29,12 +29,12 @@ template <int x, int y, int z> void test_template_expr(void) __cluster_dims__(x
 //NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}}
 __global__ void __cluster_dims__(32, 2, 4) test_too_large_dim_0() {} // common-error {{integer constant expression evaluates to value 32 that cannot be represented in a 4-bit unsigned integer type}}

-// cuda-error@+2 {{only a maximum of 8 thread blocks in a cluster is supported}}
-// amd-error@+1 {{only a maximum of 16 thread blocks in a cluster is supported}}
+// cuda-error@+2 {{cluster does not support more than 8 thread blocks; 64 provided}}
+// amd-error@+1 {{cluster does not support more than 16 thread blocks; 64 provided}}
 __global__ void __cluster_dims__(4, 4, 4) test_too_large_dim_1() {} // NS-error {{__cluster_dims__ is not supported for this GPU architecture}}

-// cuda-error@+3 {{only a maximum of 8 thread blocks in a cluster is supported}}
-// amd-error@+2 {{only a maximum of 16 thread blocks in a cluster is supported}}
+// cuda-error@+3 {{cluster does not support more than 8 thread blocks; 64 provided}}
+// amd-error@+2 {{cluster does not support more than 16 thread blocks; 64 provided}}
 template<unsigned a, unsigned b, unsigned c>
 __global__ void __cluster_dims__(a, b, c) test_too_large_dim_template() {} // NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
 template __global__ void test_too_large_dim_template<4, 4, 4>(); // common-note {{in instantiation of function template specialization 'test_too_large_dim_template<4U, 4U, 4U>' requested here}}
