[Clang][HIP][CUDA] Add __cluster_dims__ and __no_cluster__ attribute
#156686
Conversation
@llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-x86

Author: Shilei Tian (shiltian)

Changes

This PR adds basic frontend support for the __cluster_dims__ and __no_cluster__ attributes.

Co-authored-by: Yaxun (Sam) Liu <[email protected]>

Patch is 22.58 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156686.diff

12 Files Affected:
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 29364c5903d31..efb019d43cbe4 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -979,7 +979,7 @@ def AnalyzerNoReturn : InheritableAttr {
}
def InferredNoReturn : InheritableAttr {
- let Spellings = [];
+ let Spellings = [];
let SemaHandler = 0;
let Subjects = SubjectList<[Function], ErrorDiag>;
let Documentation = [InternalOnly];
@@ -1557,6 +1557,23 @@ def HIPManaged : InheritableAttr {
let Documentation = [HIPManagedAttrDocs];
}
+def CUDAClusterDims : InheritableAttr {
+ let Spellings = [GNU<"cluster_dims">, Declspec<"__cluster_dims__">];
+ let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>];
+ let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+ let LangOpts = [CUDA];
+ let Documentation = [Undocumented];
+}
+
+def CUDANoCluster : InheritableAttr {
+ let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">];
+ let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+ let LangOpts = [CUDA];
+ let Documentation = [Undocumented];
+}
+
+def : MutualExclusions<[CUDAClusterDims, CUDANoCluster]>;
+
def CUDAInvalidTarget : InheritableAttr {
let Spellings = [];
let Subjects = SubjectList<[Function]>;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 3146f20da1424..32b6944f2e038 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10698,7 +10698,7 @@ def warn_dangling_reference_captured_by_unknown : Warning<
// Diagnostics based on the Lifetime safety analysis.
def warn_lifetime_safety_loan_expires_permissive : Warning<
- "object whose reference is captured does not live long enough">,
+ "object whose reference is captured does not live long enough">,
InGroup<LifetimeSafetyPermissive>, DefaultIgnore;
def warn_lifetime_safety_loan_expires_strict : Warning<
"object whose reference is captured may not live long enough">,
@@ -13027,6 +13027,14 @@ def warn_cuda_maxclusterrank_sm_90 : Warning<
"maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring "
"%1 attribute">, InGroup<IgnoredAttributes>;
+def err_cuda_cluster_attr_not_supported : Error<
+ "%select{__cluster_dims__|__no_cluster__}0 is not supported for this GPU architecture"
+>;
+
+def err_cuda_cluster_dims_too_large : Error<
+ "only a maximum of %0 thread blocks in a cluster is supported"
+>;
+
// VTable pointer authentication errors
def err_non_polymorphic_vtable_pointer_auth : Error<
"cannot set vtable pointer authentication on monomorphic type %0">;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index aa035a1555950..c9c77bd565260 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -5002,6 +5002,14 @@ class Sema final : public SemaBase {
void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI,
Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks);
+ /// Add a cluster_dims attribute to a particular declaration.
+ CUDAClusterDimsAttr *createClusterDimsAttr(const AttributeCommonInfo &CI,
+ Expr *X, Expr *Y, Expr *Z);
+ void addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
+ Expr *Y, Expr *Z);
+ /// Add a no_cluster attribute to a particular declaration.
+ void addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI);
+
enum class RetainOwnershipKind { NS, CF, OS };
UuidAttr *mergeUuidAttr(Decl *D, const AttributeCommonInfo &CI,
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 0fcbf7e458a34..48855ce485f91 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -342,6 +342,9 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D,
void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
+ llvm::StringMap<bool> TargetFetureMap;
+ M.getContext().getFunctionFeatureMap(TargetFetureMap, FD);
+
const auto *ReqdWGS =
M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
const bool IsOpenCLKernel =
@@ -402,6 +405,29 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
}
+
+ if (auto *Attr = FD->getAttr<CUDAClusterDimsAttr>()) {
+ uint32_t X =
+ Attr->getX()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+ uint32_t Y =
+ Attr->getY()
+ ? Attr->getY()->EvaluateKnownConstInt(M.getContext()).getExtValue()
+ : 1;
+ uint32_t Z =
+ Attr->getZ()
+ ? Attr->getZ()->EvaluateKnownConstInt(M.getContext()).getExtValue()
+ : 1;
+
+ llvm::SmallString<32> AttrVal;
+ llvm::raw_svector_ostream OS(AttrVal);
+ OS << X << ',' << Y << ',' << Z;
+ F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
+ }
+
+ // OpenCL doesn't support cluster feature.
+ if ((IsOpenCLKernel && TargetFetureMap.lookup("gfx1250-insts")) ||
+ FD->getAttr<CUDANoClusterAttr>())
+ F->addFnAttr("amdgpu-cluster-dims", "0,0,0");
}
void AMDGPUTargetCodeGenInfo::setTargetAttributes(
diff --git a/clang/lib/Headers/__clang_hip_runtime_wrapper.h b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
index da1e39ac7270e..fb0ece96e1418 100644
--- a/clang/lib/Headers/__clang_hip_runtime_wrapper.h
+++ b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
@@ -25,6 +25,8 @@
#define __constant__ __attribute__((constant))
#define __managed__ __attribute__((managed))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+
#if !defined(__cplusplus) || __cplusplus < 201103L
#define nullptr NULL;
#endif
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 3ded60cd8b073..ad2c28843f970 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5654,6 +5654,130 @@ static void handleLaunchBoundsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
}
+static std::pair<Expr *, int>
+makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
+ const unsigned Idx) {
+ if (S.DiagnoseUnexpandedParameterPack(E))
+ return {nullptr, 0};
+
+ // Accept template arguments for now as they depend on something else.
+ // We'll get to check them when they eventually get instantiated.
+ if (E->isValueDependent())
+ return {E, 1};
+
+ std::optional<llvm::APSInt> I = llvm::APSInt(64);
+ if (!(I = E->getIntegerConstantExpr(S.Context))) {
+ S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type)
+ << &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange();
+ return {nullptr, 0};
+ }
+ // Make sure we can fit it in 4 bits.
+ if (!I->isIntN(4)) {
+ S.Diag(E->getExprLoc(), diag::err_ice_too_large)
+ << toString(*I, 10, false) << 4 << /* Unsigned */ 1;
+ return {nullptr, 0};
+ }
+ if (*I < 0)
+ S.Diag(E->getExprLoc(), diag::warn_attribute_argument_n_negative)
+ << &AL << Idx << E->getSourceRange();
+
+ // We may need to perform implicit conversion of the argument.
+ InitializedEntity Entity = InitializedEntity::InitializeParameter(
+ S.Context, S.Context.getConstType(S.Context.IntTy), /*consume*/ false);
+ ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E);
+ assert(!ValArg.isInvalid() &&
+ "Unexpected PerformCopyInitialization() failure.");
+
+ return {ValArg.getAs<Expr>(), I->getZExtValue()};
+}
+
+CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
+ Expr *X, Expr *Y, Expr *Z) {
+ CUDAClusterDimsAttr TmpAttr(Context, CI, X, Y, Z);
+
+ int ValX = 1;
+ int ValY = 1;
+ int ValZ = 1;
+
+ std::tie(X, ValX) = makeClusterDimsArgExpr(*this, X, TmpAttr, /*Idx=*/0);
+ if (!X)
+ return nullptr;
+
+ if (Y) {
+ std::tie(Y, ValY) = makeClusterDimsArgExpr(*this, Y, TmpAttr, /*Idx=*/1);
+ if (!Y)
+ return nullptr;
+ }
+
+ if (Z) {
+ std::tie(Z, ValZ) = makeClusterDimsArgExpr(*this, Z, TmpAttr, /*Idx=*/2);
+ if (!Z)
+ return nullptr;
+ }
+
+ int FlatDim = ValX * ValY * ValZ;
+ auto TT = (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
+ ? Context.getAuxTargetInfo()->getTriple()
+ : Context.getTargetInfo().getTriple();
+ int MaxDim = 1;
+ if (TT.isNVPTX())
+ MaxDim = 8;
+ else if (TT.isAMDGPU())
+ MaxDim = 16;
+ else
+ return nullptr;
+
+ // A maximum of 8 thread blocks in a cluster is supported as a portable
+ // cluster size in CUDA. The number is 16 for AMDGPU.
+ if (FlatDim > MaxDim) {
+ Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large) << MaxDim;
+ return nullptr;
+ }
+
+ return ::new (Context) CUDAClusterDimsAttr(Context, CI, X, Y, Z);
+}
+
+void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
+ Expr *Y, Expr *Z) {
+ if (auto *Attr = createClusterDimsAttr(CI, X, Y, Z))
+ D->addAttr(Attr);
+}
+
+void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) {
+ if (CUDANoClusterAttr *Attr = ::new (Context) CUDANoClusterAttr(Context, CI))
+ D->addAttr(Attr);
+}
+
+static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+ auto &TTI = S.Context.getTargetInfo();
+ auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+ if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
+ (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 0;
+ return;
+ }
+
+ if (!AL.checkAtLeastNumArgs(S, /*Num=*/1) ||
+ !AL.checkAtMostNumArgs(S, /*Num=*/3))
+ return;
+
+ S.addClusterDimsAttr(D, AL, AL.getArgAsExpr(0),
+ AL.getNumArgs() > 1 ? AL.getArgAsExpr(1) : nullptr,
+ AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
+}
+
+static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+ auto &TTI = S.Context.getTargetInfo();
+ auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+ if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
+ (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 1;
+ return;
+ }
+
+ S.addNoClusterAttr(D, AL);
+}
+
static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D,
const ParsedAttr &AL) {
if (!AL.isArgIdent(0)) {
@@ -7105,6 +7229,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
case ParsedAttr::AT_CUDALaunchBounds:
handleLaunchBoundsAttr(S, D, AL);
break;
+ case ParsedAttr::AT_CUDAClusterDims:
+ handleClusterDimsAttr(S, D, AL);
+ break;
+ case ParsedAttr::AT_CUDANoCluster:
+ handleNoClusterAttr(S, D, AL);
+ break;
case ParsedAttr::AT_Restrict:
handleRestrictAttr(S, D, AL);
break;
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index ee1b520fa46e9..aab93a93ba95b 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -681,6 +681,38 @@ static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
}
+static void instantiateDependentCUDAClusterDimsAttr(
+ Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
+ const CUDAClusterDimsAttr &Attr, Decl *New) {
+ EnterExpressionEvaluationContext Unevaluated(
+ S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+
+ Expr *XExpr = nullptr;
+ Expr *YExpr = nullptr;
+ Expr *ZExpr = nullptr;
+
+ if (Attr.getX()) {
+ ExprResult ResultX = S.SubstExpr(Attr.getX(), TemplateArgs);
+ if (ResultX.isUsable())
+ XExpr = ResultX.getAs<Expr>();
+ }
+
+ if (Attr.getY()) {
+ ExprResult ResultY = S.SubstExpr(Attr.getY(), TemplateArgs);
+ if (ResultY.isUsable())
+ YExpr = ResultY.getAs<Expr>();
+ }
+
+ if (Attr.getZ()) {
+ ExprResult ResultZ = S.SubstExpr(Attr.getZ(), TemplateArgs);
+ if (ResultZ.isUsable())
+ ZExpr = ResultZ.getAs<Expr>();
+ }
+
+ if (XExpr)
+ S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
+}
+
// This doesn't take any template parameters, but we have a custom action that
// needs to happen when the kernel itself is instantiated. We need to run the
// ItaniumMangler to mark the names required to name this kernel.
@@ -883,6 +915,11 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
*this, TemplateArgs, *AMDGPUMaxNumWorkGroups, New);
}
+ if (const auto *CUDAClusterDims = dyn_cast<CUDAClusterDimsAttr>(TmplAttr)) {
+ instantiateDependentCUDAClusterDimsAttr(*this, TemplateArgs,
+ *CUDAClusterDims, New);
+ }
+
if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) {
instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr,
New);
diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h
index dc85eae0c5178..e7ad784335027 100644
--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -13,6 +13,8 @@
#endif
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
#define __grid_constant__ __attribute__((grid_constant))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
#else
#define __constant__
#define __device__
@@ -22,6 +24,8 @@
#define __managed__
#define __launch_bounds__(...)
#define __grid_constant__
+#define __cluster_dims__(...)
+#define __no_cluster__
#endif
struct dim3 {
diff --git a/clang/test/CodeGenCUDA/cluster_dims.cu b/clang/test/CodeGenCUDA/cluster_dims.cu
new file mode 100644
index 0000000000000..00635e3572a7f
--- /dev/null
+++ b/clang/test/CodeGenCUDA/cluster_dims.cu
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -emit-llvm -x hip -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -emit-llvm -x hip -o - %s | FileCheck --check-prefix=HOST %s
+
+#include "Inputs/cuda.h"
+
+const int constint = 4;
+
+// HOST-NOT: "amdgpu-cluster-dims"
+
+// CHECK: "amdgpu-cluster-dims"="2,2,2"
+__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {}
+
+// CHECK: "amdgpu-cluster-dims"="2,2,1"
+__global__ void __cluster_dims__(2, 2) test_literal_2d() {}
+
+// CHECK: "amdgpu-cluster-dims"="4,1,1"
+__global__ void __cluster_dims__(4) test_literal_1d() {}
+
+// CHECK: "amdgpu-cluster-dims"="4,2,1"
+__global__ void __cluster_dims__(constint, constint / 2, 1) test_constant() {}
+
+// CHECK: "amdgpu-cluster-dims"="0,0,0"
+__global__ void __no_cluster__ test_no_cluster() {}
+
+// CHECK: "amdgpu-cluster-dims"="7,1,1"
+template<unsigned a>
+__global__ void __cluster_dims__(a) test_template_1d() {}
+template __global__ void test_template_1d<7>();
+
+// CHECK: "amdgpu-cluster-dims"="2,6,1"
+template<unsigned a, unsigned b>
+__global__ void __cluster_dims__(a, b) test_template_2d() {}
+template __global__ void test_template_2d<2, 6>();
+
+// CHECK: "amdgpu-cluster-dims"="1,2,3"
+template<unsigned a, unsigned b, unsigned c>
+__global__ void __cluster_dims__(a, b, c) test_template_3d() {}
+template __global__ void test_template_3d<1, 2, 3>();
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 37ff33e5a1523..c8c913448d968 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -35,6 +35,7 @@
// CHECK-NEXT: CFUnknownTransfer (SubjectMatchRule_function)
// CHECK-NEXT: CPUDispatch (SubjectMatchRule_function)
// CHECK-NEXT: CPUSpecific (SubjectMatchRule_function)
+// CHECK-NEXT: CUDAClusterDims (SubjectMatchRule_function)
// CHECK-NEXT: CUDAConstant (SubjectMatchRule_variable)
// CHECK-NEXT: CUDADevice (SubjectMatchRule_function, SubjectMatchRule_variable)
// CHECK-NEXT: CUDADeviceBuiltinSurfaceType (SubjectMatchRule_record)
@@ -43,6 +44,7 @@
// CHECK-NEXT: CUDAGridConstant (SubjectMatchRule_variable_is_parameter)
// CHECK-NEXT: CUDAHost (SubjectMatchRule_function)
// CHECK-NEXT: CUDALaunchBounds (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
+// CHECK-NEXT: CUDANoCluster (SubjectMatchRule_function)
// CHECK-NEXT: CUDAShared (SubjectMatchRule_variable)
// CHECK-NEXT: CXX11NoReturn (SubjectMatchRule_function)
// CHECK-NEXT: CallableWhen (SubjectMatchRule_function_is_member)
diff --git a/clang/test/SemaCUDA/Inputs/cuda.h b/clang/test/SemaCUDA/Inputs/cuda.h
index 10db947d8246c..2bf45e03d91c7 100644
--- a/clang/test/SemaCUDA/Inputs/cuda.h
+++ b/clang/test/SemaCUDA/Inputs/cuda.h
@@ -13,6 +13,8 @@
#define __managed__ __attribute__((managed))
#define __grid_constant__ __attribute__((grid_constant))
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
struct dim3 {
unsigned x, y, z;
diff --git a/clang/test/SemaCUDA/cluster_dims.cu b/clang/test/SemaCUDA/cluster_dims.cu
new file mode 100644
index 0000000000000..3cd0e0197c29b
--- /dev/null
+++ b/clang/test/SemaCUDA/cluster_dims.cu
@@ -0,0 +1,64 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -ast-print -x hip -verify=NS,all %s
+// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -fcuda-is-device -ast-print -x hip -verify=NS,all %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -target-cpu sm_90 -fcuda-is-device -ast-print -x hip -verify=cuda,common,all %s | FileCheck -check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s
+
+#include "Inputs/cuda.h"
+
+const int constint = 4;
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2, 2))) void test_literal_3d()
+__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2))) void test_literal_2d()
+__global__ void __cluster_dims__(2, 2) test_literal_2d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(4))) void test_literal_1d()
+__global__ void __cluster_dims__(4) test_literal_1d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(constint, constint / 4, 1))) void test_constant()
+__global__ void __cluster_dims__(constint, constint / 4, 1) test_constant() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: template <int x, int y, int z> void test_template() __attribute__((cluster_dims(x, y, z)))
+template <int x, int y, int z> void test_template(void) __cluster_dims__(x, y, z){} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: template <int x, int y, int z> void test_template_expr() __attribute__((cluster_dims(x + constint, y, z)))
+template <int x, int y, int z> void test_template_expr(void) __cluster_dims__(x + constint, y, z) {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(32, 2, 4) test...
[truncated]
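For readers skimming the diff, here is a short usage sketch assembled from the test inputs above; it is illustrative only and not part of the patch. Kernel names are invented, __cluster_dims__ and __no_cluster__ expand to __attribute__((cluster_dims(...))) and __attribute__((no_cluster)) as in the test cuda.h headers, and the expected "amdgpu-cluster-dims" strings come from the CodeGen test.

// Illustrative usage sketch (hypothetical kernels, not part of the patch).
// Compile as HIP device code for gfx1250, or CUDA for sm_90 or newer.

#include "Inputs/cuda.h" // test header defining __global__, __cluster_dims__, __no_cluster__

// A 2x2x1 thread block cluster; unspecified dimensions default to 1.
// Expected function attribute: "amdgpu-cluster-dims"="2,2,1"
__global__ void __cluster_dims__(2, 2) example_2d() {}

// Opt the kernel out of clustering entirely.
// Expected function attribute: "amdgpu-cluster-dims"="0,0,0"
__global__ void __no_cluster__ example_no_cluster() {}

// Value-dependent arguments are accepted as-is and only checked once the
// template is instantiated.
// Expected function attribute: "amdgpu-cluster-dims"="2,6,1"
template <unsigned X, unsigned Y>
__global__ void __cluster_dims__(X, Y) example_templated() {}
template __global__ void example_templated<2, 6>();

// Rejected in Sema: the flat product 32*2*4 exceeds the limit enforced by
// createClusterDimsAttr (8 thread blocks per cluster on NVPTX, 16 on
// AMDGPU), triggering err_cuda_cluster_dims_too_large.
// __global__ void __cluster_dims__(32, 2, 4) example_too_large() {}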
|
|
@llvm/pr-subscribers-clang Author: Shilei Tian (shiltian) ChangesThis PR adds basic frontend support for Co-authored-by: Yaxun (Sam) Liu <[email protected]> Patch is 22.58 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156686.diff 12 Files Affected:
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 29364c5903d31..efb019d43cbe4 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -979,7 +979,7 @@ def AnalyzerNoReturn : InheritableAttr {
}
def InferredNoReturn : InheritableAttr {
- let Spellings = [];
+ let Spellings = [];
let SemaHandler = 0;
let Subjects = SubjectList<[Function], ErrorDiag>;
let Documentation = [InternalOnly];
@@ -1557,6 +1557,23 @@ def HIPManaged : InheritableAttr {
let Documentation = [HIPManagedAttrDocs];
}
+def CUDAClusterDims : InheritableAttr {
+ let Spellings = [GNU<"cluster_dims">, Declspec<"__cluster_dims__">];
+ let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>];
+ let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+ let LangOpts = [CUDA];
+ let Documentation = [Undocumented];
+}
+
+def CUDANoCluster : InheritableAttr {
+ let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">];
+ let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+ let LangOpts = [CUDA];
+ let Documentation = [Undocumented];
+}
+
+def : MutualExclusions<[CUDAClusterDims, CUDANoCluster]>;
+
def CUDAInvalidTarget : InheritableAttr {
let Spellings = [];
let Subjects = SubjectList<[Function]>;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 3146f20da1424..32b6944f2e038 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10698,7 +10698,7 @@ def warn_dangling_reference_captured_by_unknown : Warning<
// Diagnostics based on the Lifetime safety analysis.
def warn_lifetime_safety_loan_expires_permissive : Warning<
- "object whose reference is captured does not live long enough">,
+ "object whose reference is captured does not live long enough">,
InGroup<LifetimeSafetyPermissive>, DefaultIgnore;
def warn_lifetime_safety_loan_expires_strict : Warning<
"object whose reference is captured may not live long enough">,
@@ -13027,6 +13027,14 @@ def warn_cuda_maxclusterrank_sm_90 : Warning<
"maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring "
"%1 attribute">, InGroup<IgnoredAttributes>;
+def err_cuda_cluster_attr_not_supported : Error<
+ "%select{__cluster_dims__|__no_cluster__}0 is not supported for this GPU architecture"
+>;
+
+def err_cuda_cluster_dims_too_large : Error<
+ "only a maximum of %0 thread blocks in a cluster is supported"
+>;
+
// VTable pointer authentication errors
def err_non_polymorphic_vtable_pointer_auth : Error<
"cannot set vtable pointer authentication on monomorphic type %0">;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index aa035a1555950..c9c77bd565260 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -5002,6 +5002,14 @@ class Sema final : public SemaBase {
void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI,
Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks);
+ /// Add a cluster_dims attribute to a particular declaration.
+ CUDAClusterDimsAttr *createClusterDimsAttr(const AttributeCommonInfo &CI,
+ Expr *X, Expr *Y, Expr *Z);
+ void addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
+ Expr *Y, Expr *Z);
+ /// Add a no_cluster attribute to a particular declaration.
+ void addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI);
+
enum class RetainOwnershipKind { NS, CF, OS };
UuidAttr *mergeUuidAttr(Decl *D, const AttributeCommonInfo &CI,
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 0fcbf7e458a34..48855ce485f91 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -342,6 +342,9 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D,
void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
+ llvm::StringMap<bool> TargetFetureMap;
+ M.getContext().getFunctionFeatureMap(TargetFetureMap, FD);
+
const auto *ReqdWGS =
M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
const bool IsOpenCLKernel =
@@ -402,6 +405,29 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
}
+
+ if (auto *Attr = FD->getAttr<CUDAClusterDimsAttr>()) {
+ uint32_t X =
+ Attr->getX()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+ uint32_t Y =
+ Attr->getY()
+ ? Attr->getY()->EvaluateKnownConstInt(M.getContext()).getExtValue()
+ : 1;
+ uint32_t Z =
+ Attr->getZ()
+ ? Attr->getZ()->EvaluateKnownConstInt(M.getContext()).getExtValue()
+ : 1;
+
+ llvm::SmallString<32> AttrVal;
+ llvm::raw_svector_ostream OS(AttrVal);
+ OS << X << ',' << Y << ',' << Z;
+ F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
+ }
+
+ // OpenCL doesn't support cluster feature.
+ if ((IsOpenCLKernel && TargetFetureMap.lookup("gfx1250-insts")) ||
+ FD->getAttr<CUDANoClusterAttr>())
+ F->addFnAttr("amdgpu-cluster-dims", "0,0,0");
}
void AMDGPUTargetCodeGenInfo::setTargetAttributes(
diff --git a/clang/lib/Headers/__clang_hip_runtime_wrapper.h b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
index da1e39ac7270e..fb0ece96e1418 100644
--- a/clang/lib/Headers/__clang_hip_runtime_wrapper.h
+++ b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
@@ -25,6 +25,8 @@
#define __constant__ __attribute__((constant))
#define __managed__ __attribute__((managed))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+
#if !defined(__cplusplus) || __cplusplus < 201103L
#define nullptr NULL;
#endif
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 3ded60cd8b073..ad2c28843f970 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5654,6 +5654,130 @@ static void handleLaunchBoundsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
}
+static std::pair<Expr *, int>
+makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
+ const unsigned Idx) {
+ if (S.DiagnoseUnexpandedParameterPack(E))
+ return {nullptr, 0};
+
+ // Accept template arguments for now as they depend on something else.
+ // We'll get to check them when they eventually get instantiated.
+ if (E->isValueDependent())
+ return {E, 1};
+
+ std::optional<llvm::APSInt> I = llvm::APSInt(64);
+ if (!(I = E->getIntegerConstantExpr(S.Context))) {
+ S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type)
+ << &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange();
+ return {nullptr, 0};
+ }
+ // Make sure we can fit it in 4 bits.
+ if (!I->isIntN(4)) {
+ S.Diag(E->getExprLoc(), diag::err_ice_too_large)
+ << toString(*I, 10, false) << 4 << /* Unsigned */ 1;
+ return {nullptr, 0};
+ }
+ if (*I < 0)
+ S.Diag(E->getExprLoc(), diag::warn_attribute_argument_n_negative)
+ << &AL << Idx << E->getSourceRange();
+
+ // We may need to perform implicit conversion of the argument.
+ InitializedEntity Entity = InitializedEntity::InitializeParameter(
+ S.Context, S.Context.getConstType(S.Context.IntTy), /*consume*/ false);
+ ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E);
+ assert(!ValArg.isInvalid() &&
+ "Unexpected PerformCopyInitialization() failure.");
+
+ return {ValArg.getAs<Expr>(), I->getZExtValue()};
+}
+
+CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
+ Expr *X, Expr *Y, Expr *Z) {
+ CUDAClusterDimsAttr TmpAttr(Context, CI, X, Y, Z);
+
+ int ValX = 1;
+ int ValY = 1;
+ int ValZ = 1;
+
+ std::tie(X, ValX) = makeClusterDimsArgExpr(*this, X, TmpAttr, /*Idx=*/0);
+ if (!X)
+ return nullptr;
+
+ if (Y) {
+ std::tie(Y, ValY) = makeClusterDimsArgExpr(*this, Y, TmpAttr, /*Idx=*/1);
+ if (!Y)
+ return nullptr;
+ }
+
+ if (Z) {
+ std::tie(Z, ValZ) = makeClusterDimsArgExpr(*this, Z, TmpAttr, /*Idx=*/2);
+ if (!Z)
+ return nullptr;
+ }
+
+ int FlatDim = ValX * ValY * ValZ;
+ auto TT = (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
+ ? Context.getAuxTargetInfo()->getTriple()
+ : Context.getTargetInfo().getTriple();
+ int MaxDim = 1;
+ if (TT.isNVPTX())
+ MaxDim = 8;
+ else if (TT.isAMDGPU())
+ MaxDim = 16;
+ else
+ return nullptr;
+
+ // A maximum of 8 thread blocks in a cluster is supported as a portable
+ // cluster size in CUDA. The number is 16 for AMDGPU.
+ if (FlatDim > MaxDim) {
+ Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large) << MaxDim;
+ return nullptr;
+ }
+
+ return ::new (Context) CUDAClusterDimsAttr(Context, CI, X, Y, Z);
+}
+
+void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
+ Expr *Y, Expr *Z) {
+ if (auto *Attr = createClusterDimsAttr(CI, X, Y, Z))
+ D->addAttr(Attr);
+}
+
+void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) {
+ if (CUDANoClusterAttr *Attr = ::new (Context) CUDANoClusterAttr(Context, CI))
+ D->addAttr(Attr);
+}
+
+static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+ auto &TTI = S.Context.getTargetInfo();
+ auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+ if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
+ (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 0;
+ return;
+ }
+
+ if (!AL.checkAtLeastNumArgs(S, /*Num=*/1) ||
+ !AL.checkAtMostNumArgs(S, /*Num=*/3))
+ return;
+
+ S.addClusterDimsAttr(D, AL, AL.getArgAsExpr(0),
+ AL.getNumArgs() > 1 ? AL.getArgAsExpr(1) : nullptr,
+ AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
+}
+
+static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+ auto &TTI = S.Context.getTargetInfo();
+ auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+ if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
+ (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 1;
+ return;
+ }
+
+ S.addNoClusterAttr(D, AL);
+}
+
static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D,
const ParsedAttr &AL) {
if (!AL.isArgIdent(0)) {
@@ -7105,6 +7229,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
case ParsedAttr::AT_CUDALaunchBounds:
handleLaunchBoundsAttr(S, D, AL);
break;
+ case ParsedAttr::AT_CUDAClusterDims:
+ handleClusterDimsAttr(S, D, AL);
+ break;
+ case ParsedAttr::AT_CUDANoCluster:
+ handleNoClusterAttr(S, D, AL);
+ break;
case ParsedAttr::AT_Restrict:
handleRestrictAttr(S, D, AL);
break;
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index ee1b520fa46e9..aab93a93ba95b 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -681,6 +681,38 @@ static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
}
+static void instantiateDependentCUDAClusterDimsAttr(
+ Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
+ const CUDAClusterDimsAttr &Attr, Decl *New) {
+ EnterExpressionEvaluationContext Unevaluated(
+ S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+
+ Expr *XExpr = nullptr;
+ Expr *YExpr = nullptr;
+ Expr *ZExpr = nullptr;
+
+ if (Attr.getX()) {
+ ExprResult ResultX = S.SubstExpr(Attr.getX(), TemplateArgs);
+ if (ResultX.isUsable())
+ XExpr = ResultX.getAs<Expr>();
+ }
+
+ if (Attr.getY()) {
+ ExprResult ResultY = S.SubstExpr(Attr.getY(), TemplateArgs);
+ if (ResultY.isUsable())
+ YExpr = ResultY.getAs<Expr>();
+ }
+
+ if (Attr.getZ()) {
+ ExprResult ResultZ = S.SubstExpr(Attr.getZ(), TemplateArgs);
+ if (ResultZ.isUsable())
+ ZExpr = ResultZ.getAs<Expr>();
+ }
+
+ if (XExpr)
+ S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
+}
+
// This doesn't take any template parameters, but we have a custom action that
// needs to happen when the kernel itself is instantiated. We need to run the
// ItaniumMangler to mark the names required to name this kernel.
@@ -883,6 +915,11 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
*this, TemplateArgs, *AMDGPUMaxNumWorkGroups, New);
}
+ if (const auto *CUDAClusterDims = dyn_cast<CUDAClusterDimsAttr>(TmplAttr)) {
+ instantiateDependentCUDAClusterDimsAttr(*this, TemplateArgs,
+ *CUDAClusterDims, New);
+ }
+
if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) {
instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr,
New);
diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h
index dc85eae0c5178..e7ad784335027 100644
--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -13,6 +13,8 @@
#endif
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
#define __grid_constant__ __attribute__((grid_constant))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
#else
#define __constant__
#define __device__
@@ -22,6 +24,8 @@
#define __managed__
#define __launch_bounds__(...)
#define __grid_constant__
+#define __cluster_dims__(...)
+#define __no_cluster__
#endif
struct dim3 {
diff --git a/clang/test/CodeGenCUDA/cluster_dims.cu b/clang/test/CodeGenCUDA/cluster_dims.cu
new file mode 100644
index 0000000000000..00635e3572a7f
--- /dev/null
+++ b/clang/test/CodeGenCUDA/cluster_dims.cu
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -emit-llvm -x hip -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -emit-llvm -x hip -o - %s | FileCheck --check-prefix=HOST %s
+
+#include "Inputs/cuda.h"
+
+const int constint = 4;
+
+// HOST-NOT: "amdgpu-cluster-dims"
+
+// CHECK: "amdgpu-cluster-dims"="2,2,2"
+__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {}
+
+// CHECK: "amdgpu-cluster-dims"="2,2,1"
+__global__ void __cluster_dims__(2, 2) test_literal_2d() {}
+
+// CHECK: "amdgpu-cluster-dims"="4,1,1"
+__global__ void __cluster_dims__(4) test_literal_1d() {}
+
+// CHECK: "amdgpu-cluster-dims"="4,2,1"
+__global__ void __cluster_dims__(constint, constint / 2, 1) test_constant() {}
+
+// CHECK: "amdgpu-cluster-dims"="0,0,0"
+__global__ void __no_cluster__ test_no_cluster() {}
+
+// CHECK: "amdgpu-cluster-dims"="7,1,1"
+template<unsigned a>
+__global__ void __cluster_dims__(a) test_template_1d() {}
+template __global__ void test_template_1d<7>();
+
+// CHECK: "amdgpu-cluster-dims"="2,6,1"
+template<unsigned a, unsigned b>
+__global__ void __cluster_dims__(a, b) test_template_2d() {}
+template __global__ void test_template_2d<2, 6>();
+
+// CHECK: "amdgpu-cluster-dims"="1,2,3"
+template<unsigned a, unsigned b, unsigned c>
+__global__ void __cluster_dims__(a, b, c) test_template_3d() {}
+template __global__ void test_template_3d<1, 2, 3>();
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 37ff33e5a1523..c8c913448d968 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -35,6 +35,7 @@
// CHECK-NEXT: CFUnknownTransfer (SubjectMatchRule_function)
// CHECK-NEXT: CPUDispatch (SubjectMatchRule_function)
// CHECK-NEXT: CPUSpecific (SubjectMatchRule_function)
+// CHECK-NEXT: CUDAClusterDims (SubjectMatchRule_function)
// CHECK-NEXT: CUDAConstant (SubjectMatchRule_variable)
// CHECK-NEXT: CUDADevice (SubjectMatchRule_function, SubjectMatchRule_variable)
// CHECK-NEXT: CUDADeviceBuiltinSurfaceType (SubjectMatchRule_record)
@@ -43,6 +44,7 @@
// CHECK-NEXT: CUDAGridConstant (SubjectMatchRule_variable_is_parameter)
// CHECK-NEXT: CUDAHost (SubjectMatchRule_function)
// CHECK-NEXT: CUDALaunchBounds (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
+// CHECK-NEXT: CUDANoCluster (SubjectMatchRule_function)
// CHECK-NEXT: CUDAShared (SubjectMatchRule_variable)
// CHECK-NEXT: CXX11NoReturn (SubjectMatchRule_function)
// CHECK-NEXT: CallableWhen (SubjectMatchRule_function_is_member)
diff --git a/clang/test/SemaCUDA/Inputs/cuda.h b/clang/test/SemaCUDA/Inputs/cuda.h
index 10db947d8246c..2bf45e03d91c7 100644
--- a/clang/test/SemaCUDA/Inputs/cuda.h
+++ b/clang/test/SemaCUDA/Inputs/cuda.h
@@ -13,6 +13,8 @@
#define __managed__ __attribute__((managed))
#define __grid_constant__ __attribute__((grid_constant))
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
struct dim3 {
unsigned x, y, z;
diff --git a/clang/test/SemaCUDA/cluster_dims.cu b/clang/test/SemaCUDA/cluster_dims.cu
new file mode 100644
index 0000000000000..3cd0e0197c29b
--- /dev/null
+++ b/clang/test/SemaCUDA/cluster_dims.cu
@@ -0,0 +1,64 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -ast-print -x hip -verify=NS,all %s
+// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -fcuda-is-device -ast-print -x hip -verify=NS,all %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -target-cpu sm_90 -fcuda-is-device -ast-print -x hip -verify=cuda,common,all %s | FileCheck -check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s
+
+#include "Inputs/cuda.h"
+
+const int constint = 4;
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2, 2))) void test_literal_3d()
+__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2))) void test_literal_2d()
+__global__ void __cluster_dims__(2, 2) test_literal_2d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(4))) void test_literal_1d()
+__global__ void __cluster_dims__(4) test_literal_1d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(constint, constint / 4, 1))) void test_constant()
+__global__ void __cluster_dims__(constint, constint / 4, 1) test_constant() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: template <int x, int y, int z> void test_template() __attribute__((cluster_dims(x, y, z)))
+template <int x, int y, int z> void test_template(void) __cluster_dims__(x, y, z){} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: template <int x, int y, int z> void test_template_expr() __attribute__((cluster_dims(x + constint, y, z)))
+template <int x, int y, int z> void test_template_expr(void) __cluster_dims__(x + constint, y, z) {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(32, 2, 4) test...
[truncated]
|
|
@llvm/pr-subscribers-backend-x86 Author: Shilei Tian (shiltian) ChangesThis PR adds basic frontend support for Co-authored-by: Yaxun (Sam) Liu <[email protected]> Patch is 22.58 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156686.diff 12 Files Affected:
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 29364c5903d31..efb019d43cbe4 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -979,7 +979,7 @@ def AnalyzerNoReturn : InheritableAttr {
}
def InferredNoReturn : InheritableAttr {
- let Spellings = [];
+ let Spellings = [];
let SemaHandler = 0;
let Subjects = SubjectList<[Function], ErrorDiag>;
let Documentation = [InternalOnly];
@@ -1557,6 +1557,23 @@ def HIPManaged : InheritableAttr {
let Documentation = [HIPManagedAttrDocs];
}
+def CUDAClusterDims : InheritableAttr {
+ let Spellings = [GNU<"cluster_dims">, Declspec<"__cluster_dims__">];
+ let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>];
+ let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+ let LangOpts = [CUDA];
+ let Documentation = [Undocumented];
+}
+
+def CUDANoCluster : InheritableAttr {
+ let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">];
+ let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+ let LangOpts = [CUDA];
+ let Documentation = [Undocumented];
+}
+
+def : MutualExclusions<[CUDAClusterDims, CUDANoCluster]>;
+
def CUDAInvalidTarget : InheritableAttr {
let Spellings = [];
let Subjects = SubjectList<[Function]>;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 3146f20da1424..32b6944f2e038 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10698,7 +10698,7 @@ def warn_dangling_reference_captured_by_unknown : Warning<
// Diagnostics based on the Lifetime safety analysis.
def warn_lifetime_safety_loan_expires_permissive : Warning<
- "object whose reference is captured does not live long enough">,
+ "object whose reference is captured does not live long enough">,
InGroup<LifetimeSafetyPermissive>, DefaultIgnore;
def warn_lifetime_safety_loan_expires_strict : Warning<
"object whose reference is captured may not live long enough">,
@@ -13027,6 +13027,14 @@ def warn_cuda_maxclusterrank_sm_90 : Warning<
"maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring "
"%1 attribute">, InGroup<IgnoredAttributes>;
+def err_cuda_cluster_attr_not_supported : Error<
+ "%select{__cluster_dims__|__no_cluster__}0 is not supported for this GPU architecture"
+>;
+
+def err_cuda_cluster_dims_too_large : Error<
+ "only a maximum of %0 thread blocks in a cluster is supported"
+>;
+
// VTable pointer authentication errors
def err_non_polymorphic_vtable_pointer_auth : Error<
"cannot set vtable pointer authentication on monomorphic type %0">;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index aa035a1555950..c9c77bd565260 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -5002,6 +5002,14 @@ class Sema final : public SemaBase {
void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI,
Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks);
+ /// Add a cluster_dims attribute to a particular declaration.
+ CUDAClusterDimsAttr *createClusterDimsAttr(const AttributeCommonInfo &CI,
+ Expr *X, Expr *Y, Expr *Z);
+ void addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
+ Expr *Y, Expr *Z);
+ /// Add a no_cluster attribute to a particular declaration.
+ void addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI);
+
enum class RetainOwnershipKind { NS, CF, OS };
UuidAttr *mergeUuidAttr(Decl *D, const AttributeCommonInfo &CI,
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 0fcbf7e458a34..48855ce485f91 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -342,6 +342,9 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D,
void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
+ llvm::StringMap<bool> TargetFetureMap;
+ M.getContext().getFunctionFeatureMap(TargetFetureMap, FD);
+
const auto *ReqdWGS =
M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
const bool IsOpenCLKernel =
@@ -402,6 +405,29 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
}
+
+ if (auto *Attr = FD->getAttr<CUDAClusterDimsAttr>()) {
+ uint32_t X =
+ Attr->getX()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+ uint32_t Y =
+ Attr->getY()
+ ? Attr->getY()->EvaluateKnownConstInt(M.getContext()).getExtValue()
+ : 1;
+ uint32_t Z =
+ Attr->getZ()
+ ? Attr->getZ()->EvaluateKnownConstInt(M.getContext()).getExtValue()
+ : 1;
+
+ llvm::SmallString<32> AttrVal;
+ llvm::raw_svector_ostream OS(AttrVal);
+ OS << X << ',' << Y << ',' << Z;
+ F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
+ }
+
+ // OpenCL doesn't support cluster feature.
+ if ((IsOpenCLKernel && TargetFetureMap.lookup("gfx1250-insts")) ||
+ FD->getAttr<CUDANoClusterAttr>())
+ F->addFnAttr("amdgpu-cluster-dims", "0,0,0");
}
void AMDGPUTargetCodeGenInfo::setTargetAttributes(
diff --git a/clang/lib/Headers/__clang_hip_runtime_wrapper.h b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
index da1e39ac7270e..fb0ece96e1418 100644
--- a/clang/lib/Headers/__clang_hip_runtime_wrapper.h
+++ b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
@@ -25,6 +25,8 @@
#define __constant__ __attribute__((constant))
#define __managed__ __attribute__((managed))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+
#if !defined(__cplusplus) || __cplusplus < 201103L
#define nullptr NULL;
#endif
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 3ded60cd8b073..ad2c28843f970 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5654,6 +5654,130 @@ static void handleLaunchBoundsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
}
+static std::pair<Expr *, int>
+makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
+ const unsigned Idx) {
+ if (S.DiagnoseUnexpandedParameterPack(E))
+ return {nullptr, 0};
+
+ // Accept template arguments for now as they depend on something else.
+ // We'll get to check them when they eventually get instantiated.
+ if (E->isValueDependent())
+ return {E, 1};
+
+ std::optional<llvm::APSInt> I = llvm::APSInt(64);
+ if (!(I = E->getIntegerConstantExpr(S.Context))) {
+ S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type)
+ << &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange();
+ return {nullptr, 0};
+ }
+ // Make sure we can fit it in 4 bits.
+ if (!I->isIntN(4)) {
+ S.Diag(E->getExprLoc(), diag::err_ice_too_large)
+ << toString(*I, 10, false) << 4 << /* Unsigned */ 1;
+ return {nullptr, 0};
+ }
+ if (*I < 0)
+ S.Diag(E->getExprLoc(), diag::warn_attribute_argument_n_negative)
+ << &AL << Idx << E->getSourceRange();
+
+ // We may need to perform implicit conversion of the argument.
+ InitializedEntity Entity = InitializedEntity::InitializeParameter(
+ S.Context, S.Context.getConstType(S.Context.IntTy), /*consume*/ false);
+ ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E);
+ assert(!ValArg.isInvalid() &&
+ "Unexpected PerformCopyInitialization() failure.");
+
+ return {ValArg.getAs<Expr>(), I->getZExtValue()};
+}
+
+CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
+ Expr *X, Expr *Y, Expr *Z) {
+ CUDAClusterDimsAttr TmpAttr(Context, CI, X, Y, Z);
+
+ int ValX = 1;
+ int ValY = 1;
+ int ValZ = 1;
+
+ std::tie(X, ValX) = makeClusterDimsArgExpr(*this, X, TmpAttr, /*Idx=*/0);
+ if (!X)
+ return nullptr;
+
+ if (Y) {
+ std::tie(Y, ValY) = makeClusterDimsArgExpr(*this, Y, TmpAttr, /*Idx=*/1);
+ if (!Y)
+ return nullptr;
+ }
+
+ if (Z) {
+ std::tie(Z, ValZ) = makeClusterDimsArgExpr(*this, Z, TmpAttr, /*Idx=*/2);
+ if (!Z)
+ return nullptr;
+ }
+
+ int FlatDim = ValX * ValY * ValZ;
+ auto TT = (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
+ ? Context.getAuxTargetInfo()->getTriple()
+ : Context.getTargetInfo().getTriple();
+ int MaxDim = 1;
+ if (TT.isNVPTX())
+ MaxDim = 8;
+ else if (TT.isAMDGPU())
+ MaxDim = 16;
+ else
+ return nullptr;
+
+ // A maximum of 8 thread blocks in a cluster is supported as a portable
+ // cluster size in CUDA. The number is 16 for AMDGPU.
+ if (FlatDim > MaxDim) {
+ Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large) << MaxDim;
+ return nullptr;
+ }
+
+ return ::new (Context) CUDAClusterDimsAttr(Context, CI, X, Y, Z);
+}
+
+void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
+ Expr *Y, Expr *Z) {
+ if (auto *Attr = createClusterDimsAttr(CI, X, Y, Z))
+ D->addAttr(Attr);
+}
+
+void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) {
+ if (CUDANoClusterAttr *Attr = ::new (Context) CUDANoClusterAttr(Context, CI))
+ D->addAttr(Attr);
+}
+
+static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+ auto &TTI = S.Context.getTargetInfo();
+ auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+ if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
+ (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 0;
+ return;
+ }
+
+ if (!AL.checkAtLeastNumArgs(S, /*Num=*/1) ||
+ !AL.checkAtMostNumArgs(S, /*Num=*/3))
+ return;
+
+ S.addClusterDimsAttr(D, AL, AL.getArgAsExpr(0),
+ AL.getNumArgs() > 1 ? AL.getArgAsExpr(1) : nullptr,
+ AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
+}
+
+static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+ auto &TTI = S.Context.getTargetInfo();
+ auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+ if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
+ (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 1;
+ return;
+ }
+
+ S.addNoClusterAttr(D, AL);
+}
+
static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D,
const ParsedAttr &AL) {
if (!AL.isArgIdent(0)) {
@@ -7105,6 +7229,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
case ParsedAttr::AT_CUDALaunchBounds:
handleLaunchBoundsAttr(S, D, AL);
break;
+ case ParsedAttr::AT_CUDAClusterDims:
+ handleClusterDimsAttr(S, D, AL);
+ break;
+ case ParsedAttr::AT_CUDANoCluster:
+ handleNoClusterAttr(S, D, AL);
+ break;
case ParsedAttr::AT_Restrict:
handleRestrictAttr(S, D, AL);
break;
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index ee1b520fa46e9..aab93a93ba95b 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -681,6 +681,38 @@ static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
}
+static void instantiateDependentCUDAClusterDimsAttr(
+ Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
+ const CUDAClusterDimsAttr &Attr, Decl *New) {
+ EnterExpressionEvaluationContext Unevaluated(
+ S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+
+ Expr *XExpr = nullptr;
+ Expr *YExpr = nullptr;
+ Expr *ZExpr = nullptr;
+
+ if (Attr.getX()) {
+ ExprResult ResultX = S.SubstExpr(Attr.getX(), TemplateArgs);
+ if (ResultX.isUsable())
+ XExpr = ResultX.getAs<Expr>();
+ }
+
+ if (Attr.getY()) {
+ ExprResult ResultY = S.SubstExpr(Attr.getY(), TemplateArgs);
+ if (ResultY.isUsable())
+ YExpr = ResultY.getAs<Expr>();
+ }
+
+ if (Attr.getZ()) {
+ ExprResult ResultZ = S.SubstExpr(Attr.getZ(), TemplateArgs);
+ if (ResultZ.isUsable())
+ ZExpr = ResultZ.getAs<Expr>();
+ }
+
+ if (XExpr)
+ S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
+}
+
// This doesn't take any template parameters, but we have a custom action that
// needs to happen when the kernel itself is instantiated. We need to run the
// ItaniumMangler to mark the names required to name this kernel.
@@ -883,6 +915,11 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
*this, TemplateArgs, *AMDGPUMaxNumWorkGroups, New);
}
+ if (const auto *CUDAClusterDims = dyn_cast<CUDAClusterDimsAttr>(TmplAttr)) {
+ instantiateDependentCUDAClusterDimsAttr(*this, TemplateArgs,
+ *CUDAClusterDims, New);
+ }
+
if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) {
instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr,
New);
diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h
index dc85eae0c5178..e7ad784335027 100644
--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -13,6 +13,8 @@
#endif
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
#define __grid_constant__ __attribute__((grid_constant))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
#else
#define __constant__
#define __device__
@@ -22,6 +24,8 @@
#define __managed__
#define __launch_bounds__(...)
#define __grid_constant__
+#define __cluster_dims__(...)
+#define __no_cluster__
#endif
struct dim3 {
diff --git a/clang/test/CodeGenCUDA/cluster_dims.cu b/clang/test/CodeGenCUDA/cluster_dims.cu
new file mode 100644
index 0000000000000..00635e3572a7f
--- /dev/null
+++ b/clang/test/CodeGenCUDA/cluster_dims.cu
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -emit-llvm -x hip -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -emit-llvm -x hip -o - %s | FileCheck --check-prefix=HOST %s
+
+#include "Inputs/cuda.h"
+
+const int constint = 4;
+
+// HOST-NOT: "amdgpu-cluster-dims"
+
+// CHECK: "amdgpu-cluster-dims"="2,2,2"
+__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {}
+
+// CHECK: "amdgpu-cluster-dims"="2,2,1"
+__global__ void __cluster_dims__(2, 2) test_literal_2d() {}
+
+// CHECK: "amdgpu-cluster-dims"="4,1,1"
+__global__ void __cluster_dims__(4) test_literal_1d() {}
+
+// CHECK: "amdgpu-cluster-dims"="4,2,1"
+__global__ void __cluster_dims__(constint, constint / 2, 1) test_constant() {}
+
+// CHECK: "amdgpu-cluster-dims"="0,0,0"
+__global__ void __no_cluster__ test_no_cluster() {}
+
+// CHECK: "amdgpu-cluster-dims"="7,1,1"
+template<unsigned a>
+__global__ void __cluster_dims__(a) test_template_1d() {}
+template __global__ void test_template_1d<7>();
+
+// CHECK: "amdgpu-cluster-dims"="2,6,1"
+template<unsigned a, unsigned b>
+__global__ void __cluster_dims__(a, b) test_template_2d() {}
+template __global__ void test_template_2d<2, 6>();
+
+// CHECK: "amdgpu-cluster-dims"="1,2,3"
+template<unsigned a, unsigned b, unsigned c>
+__global__ void __cluster_dims__(a, b, c) test_template_3d() {}
+template __global__ void test_template_3d<1, 2, 3>();
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 37ff33e5a1523..c8c913448d968 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -35,6 +35,7 @@
// CHECK-NEXT: CFUnknownTransfer (SubjectMatchRule_function)
// CHECK-NEXT: CPUDispatch (SubjectMatchRule_function)
// CHECK-NEXT: CPUSpecific (SubjectMatchRule_function)
+// CHECK-NEXT: CUDAClusterDims (SubjectMatchRule_function)
// CHECK-NEXT: CUDAConstant (SubjectMatchRule_variable)
// CHECK-NEXT: CUDADevice (SubjectMatchRule_function, SubjectMatchRule_variable)
// CHECK-NEXT: CUDADeviceBuiltinSurfaceType (SubjectMatchRule_record)
@@ -43,6 +44,7 @@
// CHECK-NEXT: CUDAGridConstant (SubjectMatchRule_variable_is_parameter)
// CHECK-NEXT: CUDAHost (SubjectMatchRule_function)
// CHECK-NEXT: CUDALaunchBounds (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
+// CHECK-NEXT: CUDANoCluster (SubjectMatchRule_function)
// CHECK-NEXT: CUDAShared (SubjectMatchRule_variable)
// CHECK-NEXT: CXX11NoReturn (SubjectMatchRule_function)
// CHECK-NEXT: CallableWhen (SubjectMatchRule_function_is_member)
diff --git a/clang/test/SemaCUDA/Inputs/cuda.h b/clang/test/SemaCUDA/Inputs/cuda.h
index 10db947d8246c..2bf45e03d91c7 100644
--- a/clang/test/SemaCUDA/Inputs/cuda.h
+++ b/clang/test/SemaCUDA/Inputs/cuda.h
@@ -13,6 +13,8 @@
#define __managed__ __attribute__((managed))
#define __grid_constant__ __attribute__((grid_constant))
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
struct dim3 {
unsigned x, y, z;
diff --git a/clang/test/SemaCUDA/cluster_dims.cu b/clang/test/SemaCUDA/cluster_dims.cu
new file mode 100644
index 0000000000000..3cd0e0197c29b
--- /dev/null
+++ b/clang/test/SemaCUDA/cluster_dims.cu
@@ -0,0 +1,64 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -ast-print -x hip -verify=NS,all %s
+// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -fcuda-is-device -ast-print -x hip -verify=NS,all %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -target-cpu sm_90 -fcuda-is-device -ast-print -x hip -verify=cuda,common,all %s | FileCheck -check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s
+
+#include "Inputs/cuda.h"
+
+const int constint = 4;
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2, 2))) void test_literal_3d()
+__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2))) void test_literal_2d()
+__global__ void __cluster_dims__(2, 2) test_literal_2d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(4))) void test_literal_1d()
+__global__ void __cluster_dims__(4) test_literal_1d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(constint, constint / 4, 1))) void test_constant()
+__global__ void __cluster_dims__(constint, constint / 4, 1) test_constant() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: template <int x, int y, int z> void test_template() __attribute__((cluster_dims(x, y, z)))
+template <int x, int y, int z> void test_template(void) __cluster_dims__(x, y, z){} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: template <int x, int y, int z> void test_template_expr() __attribute__((cluster_dims(x + constint, y, z)))
+template <int x, int y, int z> void test_template_expr(void) __cluster_dims__(x + constint, y, z) {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(32, 2, 4) test...
[truncated]
Force-pushed 4d6e309 to 190dd7a
It isn't clear what these are, why/whether we want them, what their signature is, etc. Additionally, we do not allow adding undocumented attributes anymore, so please make sure these get documented. And why can't these just use the other dimension attributes that we have (of which we have plenty)?
It is the support for the thread block cluster feature.
Noted. Will add some documentation.
I don't follow this one. Can you explain more? These are for different purposes.
ALL of this (including a summary that doesn't require a link) should be in the commit message and documentation.
Are they? There is nothing in this patch that indicates that (being a bit coy: the documentation should probably make it clear what the differences are and how it is used differently than all the other 'dim'-based attributes).
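For reference, a minimal sketch of the intended source-level usage, using the macro spellings this patch adds to the test headers (the kernel names are made up for illustration):

```cpp
// Fixed 2x2x1 thread block cluster shape, known at compile time.
__global__ void __cluster_dims__(2, 2, 1) kern_clustered() {}

// Promise that clusters will never be enabled for this kernel, so the
// backend may drop cluster-related bookkeeping.
__global__ void __no_cluster__ kern_flat() {}
```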
Force-pushed 7d41777 to e57c037
LGTM, but I will defer to people more familiar with clang.
```
  let Category = DocCatDecl;
  let Content = [{
In CUDA/HIP programming, the ``__no_cluster__`` attribute can be applied to a kernel function to
indicate that the thread block cluster feature will not be enabled at both compile time and kernel
```
What is a thread block cluster feature? Can that be explained in these docs?
I don't think that needs to be explained in LLVM. It's like we don't explain what a grid or a thread block is in LLVM either. LLVM docs are not a CUDA/HIP programming guide.
Perhaps we should :) The point of the docs here is to help newbies have SOME level of idea what the thing does so they can research it more. Right now it is expert-only documentation.
That is beyond the scope of this PR. We can always come back later if we decide to make this document easier for people outside the compiler world.
I'm not expecting to get ALL of the documentation right (though a patch would be welcome). I'm asking you to do THIS one right since it is new.
Again, like I said, I don’t think we should do this. This isn't a CUDA/HIP programming guide, and LLVM docs are definitely not the right place for it. Turning LLVM docs into a CUDA/HIP programming guide isn't something we can just decide here. I'd expect it to go through an RFC first, and if the community accepts it, then we can put together a proper document.
As for "some ideas of what the thing does so they can research it more," I think the current wording already works well. We mention the "thread block cluster feature" for CUDA and HIP, and that's basically all the keywords someone needs to search and find the CUDA docs for details.
> The point of the docs here is to help newbies have SOME level of idea what the thing does so they can research it more.
@erichkeane Would a short description + a link to the appropriate section of the CUDA programming manual do? Brief description using established nomenclature is probably as far as we should go on the LLVM side, IMO, and we should leave the details to the CUDA programming guide, and PTX spec.
Including more (but not enough) details will not be of much help for the newbies.
The attribute name itself is distinct enough to make it easily googleable, or directly searchable for in the CUDA docs.
Force-pushed e57c037 to dae2768
clang/include/clang/Basic/Attr.td (outdated)

```
}

def CUDANoCluster : InheritableAttr {
  let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">];
```
How is `__no_cluster__` different from not specifying `__cluster_dims__`?
What exactly does it mean to indicate that the thread block cluster feature will not be enabled at both compile time and kernel launch time? What exactly is it supposed to do to the kernel, and how can we tell that a given kernel is not supposed to be launched on clusters at runtime -- we have no way to enforce what the user may end up passing to the CUDA driver or runtime APIs.
NVIDIA docs are fascinatingly confusing. We apparently also have `__block_size__` now, which may optionally specify the cluster size. To make things even more confusing, if `__block_size__` specifies the cluster size, that leads to "...the compiler would recognize the first argument inside <<<>>> as the number of clusters instead of thread blocks."
https://docs.nvidia.com/cuda/cuda-c-programming-guide/#blocks-as-clusters
If we're adding `__cluster_dims__`, we should probably add support for `__block_size__` now, too, so we deal with that mess once.
If a kernel doesn't have `__cluster_dims__`, the user can still enable the cluster feature at runtime during kernel launch. That means the compiler has to be conservative about cluster-related handling in the backend and assume the feature could be used.
On the other hand, `__no_cluster__` tells the compiler the cluster feature will not be enabled at runtime. This lets the backend optimize if needed. For AMDGPU, it helps the compiler avoid querying certain registers when lowering some workgroup-related intrinsics.
`__no_cluster__` is not a CUDA concept. I can make it a HIP-only one if CUDA's lowering cannot benefit from it at all.
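For context, the runtime path referred to above looks roughly like this -- a hedged sketch using the public CUDA launch-attribute API (CUDA 11.8+); the kernel and sizes are made up, and none of this code is part of the patch:

```cpp
#include <cuda_runtime.h>

__global__ void kern() {}

// Request a 2x1x1 cluster shape at launch time for a kernel that was
// compiled without __cluster_dims__ or __no_cluster__.
void launch_with_cluster() {
  cudaLaunchAttribute attr;
  attr.id = cudaLaunchAttributeClusterDimension;
  attr.val.clusterDim.x = 2;  // 2 thread blocks per cluster in x
  attr.val.clusterDim.y = 1;
  attr.val.clusterDim.z = 1;

  cudaLaunchConfig_t config = {};
  config.gridDim = dim3(8, 1, 1);    // grid must be divisible by the cluster shape
  config.blockDim = dim3(128, 1, 1);
  config.attrs = &attr;
  config.numAttrs = 1;

  cudaLaunchKernelEx(&config, kern);
}
```

This is why the compiler has to stay conservative when neither attribute is present: nothing in the kernel's definition rules such a launch out.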
> `__no_cluster__` is not a CUDA concept. I can make it a HIP-only one if CUDA's lowering cannot benefit from it at all.
At the moment I do not see any practical use of the attribute for CUDA and would prefer to keep it HIP-only.
ping
Force-pushed b009efb to 2ff7b16
I'd like to close out on a unified way of talking about this in the docs, but I'm OK with the rest of it, so I approve pending a touch of work on the docs.
Force-pushed 2ff7b16 to fb807b7
This PR adds basic frontend support for the `__cluster_dims__` and `__no_cluster__` attributes. Co-authored-by: Yaxun (Sam) Liu <[email protected]> Co-authored-by: Jay Foad <[email protected]>
Force-pushed fb807b7 to 98d7e29
Thanks all for the detailed review!
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/27/builds/17661

Here is the relevant piece of the build log for reference.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/88/builds/17238

Here is the relevant piece of the build log for reference.

This PR adds basic frontend support for the `__cluster_dims__` and `__no_cluster__` attributes.

In CUDA/HIP programming, the `__cluster_dims__` attribute can be applied to a kernel function to set the dimensions of a thread block cluster. The `__no_cluster__` attribute can be applied to a kernel function to indicate that the thread block cluster feature will not be enabled at either compile time or kernel launch time. Note that `__no_cluster__` is an LLVM/Clang-only attribute.

Co-authored-by: Yaxun (Sam) Liu [email protected]
Co-authored-by: Jay Foad [email protected]
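To tie the pieces together, a short sketch grounded in the CodeGen test above (kernel names are illustrative; the quoted function attributes are the ones the test checks for on amdgcn):

```cpp
__global__ void __cluster_dims__(2, 2) k0() {} // emitted with "amdgpu-cluster-dims"="2,2,1" (z defaults to 1)
__global__ void __no_cluster__ k1() {}         // emitted with "amdgpu-cluster-dims"="0,0,0" (clusters disabled)
__global__ void k2() {}                        // no attribute: cluster use remains a launch-time decision
```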