Commit 0a6d767

refine target feature lookup; fix comments
1 parent 120ca45

7 files changed: 31 additions & 25 deletions

clang/include/clang/Basic/Attr.td

Lines changed: 2 additions & 2 deletions

@@ -1573,15 +1573,15 @@ def HIPManaged : InheritableAttr {
 }

 def CUDAClusterDims : InheritableAttr {
-  let Spellings = [GNU<"cluster_dims">, Declspec<"cluster_dims">];
+  let Spellings = [GNU<"cluster_dims">];
   let Args = [ExprArgument<"X">, ExprArgument<"Y", /*opt=*/1>, ExprArgument<"Z", /*opt=*/1>];
   let Subjects = SubjectList<[ObjCMethod, FunctionLike]>;
   let LangOpts = [CUDA];
   let Documentation = [CUDAClusterDimsAttrDoc];
 }

 def CUDANoCluster : InheritableAttr {
-  let Spellings = [GNU<"no_cluster">, Declspec<"no_cluster">];
+  let Spellings = [GNU<"no_cluster">];
   let Subjects = SubjectList<[ObjCMethod, FunctionLike]>;
   let LangOpts = [CUDA];
   let Documentation = [CUDANoClusterAttrDoc];

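With the Declspec spellings removed, both attributes are accepted only in GNU form. A minimal sketch of what this means at the source level (kernel names hypothetical):

  // Still accepted: GNU spelling.
  __attribute__((cluster_dims(2, 2, 1))) __global__ void k1();

  // No longer recognized after this change: Declspec spelling.
  __declspec(cluster_dims(2, 2, 1)) __global__ void k2();
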
clang/include/clang/Basic/AttrDocs.td

Lines changed: 4 additions & 3 deletions

@@ -7549,9 +7549,10 @@ def CUDAClusterDimsAttrDoc : Documentation {
   let Category = DocCatDecl;
   let Content = [{
 In CUDA/HIP programming, the ``__cluster_dims__`` attribute can be applied to a kernel function
-to set the dimensions of a thread block cluster. ``__cluster_dims__`` defines the cluster size
-as ``(X, Y, Z)``, where each value is the number of thread blocks in that dimension.
-The ``__cluster_dims__`` and `__no_cluster__`` attributes are mutually exclusive.
+to set the dimensions of a thread block cluster, an optional level of the thread hierarchy made
+up of thread blocks. ``__cluster_dims__`` defines the cluster size as ``(X, Y, Z)``, where each
+value is the number of thread blocks in that dimension. The ``__cluster_dims__`` and
+``__no_cluster__`` attributes are mutually exclusive.

 .. code::

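The documentation block above leads into a code example; a minimal sketch of the documented usage, based on the attribute spellings exercised in the test file later in this commit (kernel names hypothetical):

  // A 2 x 2 x 1 cluster: four thread blocks per cluster.
  __global__ void __cluster_dims__(2, 2, 1) kernel_with_clusters() {}

  // Opt this kernel out of clustering entirely.
  __global__ void __no_cluster__ kernel_without_clusters() {}
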
clang/include/clang/Basic/DiagnosticSemaKinds.td

Lines changed: 1 addition & 1 deletion

@@ -13075,7 +13075,7 @@ def err_cuda_cluster_attr_not_supported : Error<
 >;

 def err_cuda_cluster_dims_too_large : Error<
-  "only a maximum of %0 thread blocks in a cluster is supported"
+  "cluster does not support more than %0 thread blocks; %1 provided"
 >;

 // VTable pointer authentication errors
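
With the reworded diagnostic, the message now carries both the target's limit (%0) and the flat cluster size that was requested (%1). For example, per the test updates later in this commit:

  // FlatDim = 4 * 4 * 4 = 64 exceeds the limit of 8 on NVPTX (16 on AMDGPU):
  // error: cluster does not support more than 8 thread blocks; 64 provided
  __global__ void __cluster_dims__(4, 4, 4) too_large() {}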

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 5 additions & 3 deletions

@@ -415,13 +415,15 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
     unsigned Z = GetExprVal(Attr->getZ());
     llvm::SmallString<32> AttrVal;
     llvm::raw_svector_ostream OS(AttrVal);
-    OS << X << ", " << Y << ", " << Z;
+    OS << X << ',' << Y << ',' << Z;
     F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
   }

   // OpenCL doesn't support cluster feature.
-  if ((IsOpenCLKernel && TargetFetureMap.lookup("gfx1250-insts")) ||
-      FD->getAttr<CUDANoClusterAttr>())
+  const TargetInfo &TTI = M.getContext().getTargetInfo();
+  if ((IsOpenCLKernel &&
+       TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters")) ||
+      FD->hasAttr<CUDANoClusterAttr>())
     F->addFnAttr("amdgpu-cluster-dims", "0,0,0");
 }

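Two details worth noting here: the separator change from ", " to ',' keeps the emitted value (e.g. "2,2,1") consistent with the "0,0,0" string used below, and the cluster check now queries the "clusters" target feature rather than "gfx1250-insts". A minimal sketch of the lookup pattern used above, assuming an ASTContext named Ctx is in scope:

  // True when the "clusters" target feature is enabled for this compilation.
  const clang::TargetInfo &TI = Ctx.getTargetInfo();
  bool HasClusters =
      TI.hasFeatureEnabled(TI.getTargetOpts().FeatureMap, "clusters");
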
clang/lib/Sema/SemaDeclAttr.cpp

Lines changed: 14 additions & 10 deletions

@@ -5733,9 +5733,10 @@ CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
   }

   int FlatDim = ValX * ValY * ValZ;
-  auto TT = (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
-                ? Context.getAuxTargetInfo()->getTriple()
-                : Context.getTargetInfo().getTriple();
+  const llvm::Triple TT =
+      (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
+          ? Context.getAuxTargetInfo()->getTriple()
+          : Context.getTargetInfo().getTriple();
   int MaxDim = 1;
   if (TT.isNVPTX())
     MaxDim = 8;
@@ -5747,7 +5748,8 @@ CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
   // A maximum of 8 thread blocks in a cluster is supported as a portable
   // cluster size in CUDA. The number is 16 for AMDGPU.
   if (FlatDim > MaxDim) {
-    Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large) << MaxDim;
+    Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large)
+        << MaxDim << FlatDim;
     return nullptr;
   }

@@ -5765,10 +5767,11 @@ void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) {
 }

 static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
-  auto &TTI = S.Context.getTargetInfo();
-  auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+  const TargetInfo &TTI = S.Context.getTargetInfo();
+  OffloadArch Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
   if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
-      (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+      (TTI.getTriple().isAMDGPU() &&
+       !TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) {
     S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported)
         << "__cluster_dims__";
     return;
@@ -5784,10 +5787,11 @@ static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
 }

 static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
-  auto &TTI = S.Context.getTargetInfo();
-  auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+  const TargetInfo &TTI = S.Context.getTargetInfo();
+  OffloadArch Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
   if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
-      (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+      (TTI.getTriple().isAMDGPU() &&
+       !TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) {
     S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported)
         << "__no_cluster__";
     return;
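
Net effect in Sema: NVPTX still gates the attributes on SM_90 or newer, while AMDGPU now consults the "clusters" target feature rather than comparing the offload arch against GFX1250. A sketch of source that trips the gate on an unsupported target (kernel name hypothetical):

  // On a GPU without cluster support:
  // error: __cluster_dims__ is not supported for this GPU architecture
  __global__ void __cluster_dims__(2, 1, 1) gated_kernel() {}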

clang/lib/Sema/SemaTemplateInstantiateDecl.cpp

Lines changed: 1 addition & 2 deletions

@@ -735,8 +735,7 @@ static void instantiateDependentCUDAClusterDimsAttr(
     ZExpr = ResultZ.get();
   }

-  if (XExpr)
-    S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
+  S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
 }

 // This doesn't take any template parameters, but we have a custom action that

clang/test/SemaCUDA/cluster_dims.cu

Lines changed: 4 additions & 4 deletions

@@ -29,12 +29,12 @@ template <int x, int y, int z> void test_template_expr(void) __cluster_dims__(x
 //NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}}
 __global__ void __cluster_dims__(32, 2, 4) test_too_large_dim_0() {} // common-error {{integer constant expression evaluates to value 32 that cannot be represented in a 4-bit unsigned integer type}}

-// cuda-error@+2 {{only a maximum of 8 thread blocks in a cluster is supported}}
-// amd-error@+1 {{only a maximum of 16 thread blocks in a cluster is supported}}
+// cuda-error@+2 {{cluster does not support more than 8 thread blocks; 64 provided}}
+// amd-error@+1 {{cluster does not support more than 16 thread blocks; 64 provided}}
 __global__ void __cluster_dims__(4, 4, 4) test_too_large_dim_1() {} // NS-error {{__cluster_dims__ is not supported for this GPU architecture}}

-// cuda-error@+3 {{only a maximum of 8 thread blocks in a cluster is supported}}
-// amd-error@+2 {{only a maximum of 16 thread blocks in a cluster is supported}}
+// cuda-error@+3 {{cluster does not support more than 8 thread blocks; 64 provided}}
+// amd-error@+2 {{cluster does not support more than 16 thread blocks; 64 provided}}
 template<unsigned a, unsigned b, unsigned c>
 __global__ void __cluster_dims__(a, b, c) test_too_large_dim_template() {} // NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
 template __global__ void test_too_large_dim_template<4, 4, 4>(); // common-note {{in instantiation of function template specialization 'test_too_large_dim_template<4U, 4U, 4U>' requested here}}
