Merged
17 changes: 17 additions & 0 deletions clang/include/clang/Basic/Attr.td
@@ -1572,6 +1572,23 @@ def HIPManaged : InheritableAttr {
let Documentation = [HIPManagedAttrDocs];
}

def CUDAClusterDims : InheritableAttr {
let Spellings = [GNU<"cluster_dims">];
let Args = [ExprArgument<"X">, ExprArgument<"Y", /*opt=*/1>, ExprArgument<"Z", /*opt=*/1>];
let Subjects = SubjectList<[ObjCMethod, FunctionLike]>;
let LangOpts = [CUDA];
let Documentation = [CUDAClusterDimsAttrDoc];
}

def CUDANoCluster : InheritableAttr {
let Spellings = [GNU<"no_cluster">];
let Subjects = SubjectList<[ObjCMethod, FunctionLike]>;
let LangOpts = [CUDA];
let Documentation = [CUDANoClusterAttrDoc];
}

def : MutualExclusions<[CUDAClusterDims, CUDANoCluster]>;

def CUDAInvalidTarget : InheritableAttr {
let Spellings = [];
let Subjects = SubjectList<[Function]>;
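With the GNU spellings declared above, the attributes can also be written directly in HIP/CUDA source; the ``__cluster_dims__``/``__no_cluster__`` macros added to the headers below are thin wrappers over this form. A minimal sketch (kernel names are illustrative):

    // Direct GNU-spelling usage, equivalent to the convenience macros.
    __global__ __attribute__((cluster_dims(2, 1, 1))) void with_cluster() {}
    __global__ __attribute__((no_cluster))            void without_cluster() {}
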
39 changes: 39 additions & 0 deletions clang/include/clang/Basic/AttrDocs.td
@@ -7545,6 +7545,45 @@ A managed variable can be accessed in both device and host code.
}];
}

def CUDAClusterDimsAttrDoc : Documentation {
let Category = DocCatDecl;
let Content = [{
In CUDA/HIP programming, the ``cluster_dims`` attribute, conventionally exposed as the
``__cluster_dims__`` macro, can be applied to a kernel function to set the dimensions of a
thread block cluster, an optional level of the hierarchy that is made up of thread blocks.
``__cluster_dims__`` defines the cluster size as ``(X, Y, Z)``, where each value is the number
of thread blocks in that dimension. The ``cluster_dims`` and ``no_cluster`` attributes are
mutually exclusive.

.. code::

__global__ __cluster_dims__(2, 1, 1) void kernel(...) {
...
}

}];
}

def CUDANoClusterAttrDoc : Documentation {
let Category = DocCatDecl;
let Content = [{
In CUDA/HIP programming, a kernel function can still be launched with the cluster feature enabled
at runtime, even without being annotated with ``__cluster_dims__``. The Clang-specific
``no_cluster`` attribute, conventionally exposed as the ``__no_cluster__`` macro, can be applied to
a kernel function to explicitly indicate that the cluster feature will not be enabled either at
compile time or at kernel launch time. This allows the compiler to apply certain optimizations
without assuming that clustering could be enabled at runtime. It is undefined behavior to launch a
kernel annotated with ``__no_cluster__`` if the cluster feature is enabled at runtime.
The ``cluster_dims`` and ``no_cluster`` attributes are mutually exclusive.

.. code::

__global__ __no_cluster__ void kernel(...) {
...
}
}];
}
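Taken together, the two documentation blocks above describe a compile-time contract: a kernel annotated with ``__cluster_dims__`` has its cluster shape fixed by the attribute rather than supplied at launch. A hedged, self-contained sketch (grid and block sizes are illustrative):

    __global__ __cluster_dims__(2, 1, 1) void kernel(float *out) {}

    int main() {
      float *out = nullptr;
      // An 8-block grid is grouped into four clusters of two blocks each.
      kernel<<<8, 256>>>(out);
      return 0;
    }
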

def LifetimeOwnerDocs : Documentation {
let Category = DocCatDecl;
let Content = [{
6 changes: 6 additions & 0 deletions clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13070,6 +13070,12 @@ def warn_cuda_maxclusterrank_sm_90 : Warning<
"maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring "
"%1 attribute">, InGroup<IgnoredAttributes>;

def err_cluster_attr_not_supported : Error<
"%0 is not supported for this GPU architecture">;

def err_cluster_dims_too_large : Error<
"cluster does not support more than %0 thread blocks; %1 provided">;

// VTable pointer authentication errors
def err_non_polymorphic_vtable_pointer_auth : Error<
"cannot set vtable pointer authentication on monomorphic type %0">;
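A sketch of source that would exercise each new diagnostic, assuming the limits enforced in SemaDeclAttr.cpp below (a flat cluster size of at most 8 on NVPTX and 16 on AMDGPU):

    // On a target without cluster support (e.g. NVPTX below sm_90):
    //   error: 'cluster_dims' is not supported for this GPU architecture
    __global__ __cluster_dims__(2) void unsupported() {}

    // On sm_90: flat size 4*4*1 = 16 exceeds the NVPTX limit of 8:
    //   error: cluster does not support more than 8 thread blocks; 16 provided
    __global__ __cluster_dims__(4, 4) void too_large() {}
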
8 changes: 8 additions & 0 deletions clang/include/clang/Sema/Sema.h
@@ -5010,6 +5010,14 @@ class Sema final : public SemaBase {
void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI,
Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks);

/// Add a cluster_dims attribute to a particular declaration.
CUDAClusterDimsAttr *createClusterDimsAttr(const AttributeCommonInfo &CI,
Expr *X, Expr *Y, Expr *Z);
void addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
Expr *Y, Expr *Z);
/// Add a no_cluster attribute to a particular declaration.
void addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI);

enum class RetainOwnershipKind { NS, CF, OS };

UuidAttr *mergeUuidAttr(Decl *D, const AttributeCommonInfo &CI,
20 changes: 20 additions & 0 deletions clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -402,6 +402,26 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(

F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
}

if (auto *Attr = FD->getAttr<CUDAClusterDimsAttr>()) {
auto GetExprVal = [&](const auto &E) {
return E ? E->EvaluateKnownConstInt(M.getContext()).getExtValue() : 1;
};
unsigned X = GetExprVal(Attr->getX());
unsigned Y = GetExprVal(Attr->getY());
unsigned Z = GetExprVal(Attr->getZ());
llvm::SmallString<32> AttrVal;
llvm::raw_svector_ostream OS(AttrVal);
OS << X << ',' << Y << ',' << Z;
F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
}

// OpenCL doesn't support the cluster feature; emit "0,0,0" to explicitly
// disable it, as for functions carrying no_cluster.
const TargetInfo &TTI = M.getContext().getTargetInfo();
if ((IsOpenCLKernel &&
TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters")) ||
FD->hasAttr<CUDANoClusterAttr>())
F->addFnAttr("amdgpu-cluster-dims", "0,0,0");
}

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
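The net effect, as a sketch: every annotated kernel gains an ``amdgpu-cluster-dims`` function attribute in the emitted IR, with omitted trailing dimensions defaulting to 1 and ``0,0,0`` meaning clusters are known to be disabled. Illustrative expectations (mirroring the CodeGen test below):

    __global__ __cluster_dims__(4, 2) void k1() {} // "amdgpu-cluster-dims"="4,2,1"
    __global__ __no_cluster__ void k2() {}         // "amdgpu-cluster-dims"="0,0,0"
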
2 changes: 2 additions & 0 deletions clang/lib/Headers/__clang_hip_runtime_wrapper.h
@@ -25,6 +25,8 @@
#define __constant__ __attribute__((constant))
#define __managed__ __attribute__((managed))

#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))

#if !defined(__cplusplus) || __cplusplus < 201103L
#define nullptr NULL;
#endif
114 changes: 114 additions & 0 deletions clang/lib/Sema/SemaDeclAttr.cpp
@@ -5676,6 +5676,114 @@ static void handleLaunchBoundsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
}

static std::pair<Expr *, int>
makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
const unsigned Idx) {
if (!E || S.DiagnoseUnexpandedParameterPack(E))
return {};

// Accept dependent expressions for now, since their values are not known
// yet; they are checked when the template is eventually instantiated.
if (E->isInstantiationDependent())
return {E, 1};

std::optional<llvm::APSInt> I = E->getIntegerConstantExpr(S.Context);
if (!I) {
S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type)
<< &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange();
return {};
}
// Make sure we can fit it in 4 bits.
if (!I->isIntN(4)) {
S.Diag(E->getExprLoc(), diag::err_ice_too_large)
<< toString(*I, 10, false) << 4 << /*Unsigned=*/1;
return {};
}
if (*I < 0) {
S.Diag(E->getExprLoc(), diag::warn_attribute_argument_n_negative)
<< &AL << Idx << E->getSourceRange();
}

return {ConstantExpr::Create(S.getASTContext(), E, APValue(*I)),
I->getZExtValue()};
}

CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
Expr *X, Expr *Y, Expr *Z) {
CUDAClusterDimsAttr TmpAttr(Context, CI, X, Y, Z);

auto [NewX, ValX] = makeClusterDimsArgExpr(*this, X, TmpAttr, /*Idx=*/0);
auto [NewY, ValY] = makeClusterDimsArgExpr(*this, Y, TmpAttr, /*Idx=*/1);
auto [NewZ, ValZ] = makeClusterDimsArgExpr(*this, Z, TmpAttr, /*Idx=*/2);

if (!NewX || (Y && !NewY) || (Z && !NewZ))
return nullptr;

int FlatDim = ValX * ValY * ValZ;
const llvm::Triple TT =
(!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
? Context.getAuxTargetInfo()->getTriple()
: Context.getTargetInfo().getTriple();
int MaxDim = 1;
if (TT.isNVPTX())
MaxDim = 8;
else if (TT.isAMDGPU())
MaxDim = 16;
else
return nullptr;

// A maximum of 8 thread blocks in a cluster is supported as a portable
// cluster size in CUDA. The number is 16 for AMDGPU.
if (FlatDim > MaxDim) {
Diag(CI.getLoc(), diag::err_cluster_dims_too_large) << MaxDim << FlatDim;
return nullptr;
}

return CUDAClusterDimsAttr::Create(Context, NewX, NewY, NewZ, CI);
}

void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
Expr *Y, Expr *Z) {
if (auto *Attr = createClusterDimsAttr(CI, X, Y, Z))
D->addAttr(Attr);
}

void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) {
D->addAttr(CUDANoClusterAttr::Create(Context, CI));
}

static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
const TargetInfo &TTI = S.Context.getTargetInfo();
OffloadArch Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
(TTI.getTriple().isAMDGPU() &&
!TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) {
S.Diag(AL.getLoc(), diag::err_cluster_attr_not_supported) << AL;
return;
}

if (!AL.checkAtLeastNumArgs(S, /*Num=*/1) ||
!AL.checkAtMostNumArgs(S, /*Num=*/3))
return;

S.addClusterDimsAttr(D, AL, AL.getArgAsExpr(0),
AL.getNumArgs() > 1 ? AL.getArgAsExpr(1) : nullptr,
AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
}

static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
const TargetInfo &TTI = S.Context.getTargetInfo();
OffloadArch Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
(TTI.getTriple().isAMDGPU() &&
!TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) {
S.Diag(AL.getLoc(), diag::err_cluster_attr_not_supported) << AL;
return;
}

S.addNoClusterAttr(D, AL);
}

static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D,
const ParsedAttr &AL) {
if (!AL.isArgIdent(0)) {
@@ -7141,6 +7249,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
case ParsedAttr::AT_CUDALaunchBounds:
handleLaunchBoundsAttr(S, D, AL);
break;
case ParsedAttr::AT_CUDAClusterDims:
handleClusterDimsAttr(S, D, AL);
break;
case ParsedAttr::AT_CUDANoCluster:
handleNoClusterAttr(S, D, AL);
break;
case ParsedAttr::AT_Restrict:
handleRestrictAttr(S, D, AL);
break;
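A sketch of how the argument checks above interact, with values chosen around the 4-bit and flat-size limits:

    const int N = 2;

    // Integer constant expressions are folded; flat size 2*2*1 = 4 is accepted.
    __global__ __cluster_dims__(N, N) void ok() {}

    // 16 does not fit in 4 bits, so err_ice_too_large fires before the
    // flat-size check is reached.
    __global__ __cluster_dims__(16) void too_wide() {}

    // Rejected by the MutualExclusions rule in Attr.td above.
    __global__ __cluster_dims__(2) __no_cluster__ void conflicting() {}
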
22 changes: 22 additions & 0 deletions clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -707,6 +707,23 @@ static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
}

static void instantiateDependentCUDAClusterDimsAttr(
Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
const CUDAClusterDimsAttr &Attr, Decl *New) {
EnterExpressionEvaluationContext Unevaluated(
S, Sema::ExpressionEvaluationContext::ConstantEvaluated);

auto SubstElt = [&S, &TemplateArgs](Expr *E) {
return E ? S.SubstExpr(E, TemplateArgs).get() : nullptr;
};

Expr *XExpr = SubstElt(Attr.getX());
Expr *YExpr = SubstElt(Attr.getY());
Expr *ZExpr = SubstElt(Attr.getZ());

S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
}

// This doesn't take any template parameters, but we have a custom action that
// needs to happen when the kernel itself is instantiated. We need to run the
// ItaniumMangler to mark the names required to name this kernel.
@@ -921,6 +938,11 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
*this, TemplateArgs, *AMDGPUMaxNumWorkGroups, New);
}

if (const auto *CUDAClusterDims = dyn_cast<CUDAClusterDimsAttr>(TmplAttr)) {
instantiateDependentCUDAClusterDimsAttr(*this, TemplateArgs,
*CUDAClusterDims, New);
}

if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) {
instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr,
New);
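A sketch of the dependent case this path exists for: attribute arguments that name template parameters are carried through unchecked and validated only at instantiation, mirroring the templates in the CodeGen test below.

    template <unsigned X, unsigned Y>
    __global__ __cluster_dims__(X, Y) void tmpl() {}

    // Checked only at this point: tmpl<2, 2> is accepted, while on NVPTX
    // (sm_90) tmpl<4, 4> would be rejected with a flat size of 16 > 8.
    template __global__ void tmpl<2, 2>();
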
4 changes: 4 additions & 0 deletions clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -13,6 +13,8 @@
#endif
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
#define __grid_constant__ __attribute__((grid_constant))
#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
#define __no_cluster__ __attribute__((no_cluster))
#else
#define __constant__
#define __device__
Expand All @@ -22,6 +24,8 @@
#define __managed__
#define __launch_bounds__(...)
#define __grid_constant__
#define __cluster_dims__(...)
#define __no_cluster__
#endif

struct dim3 {
38 changes: 38 additions & 0 deletions clang/test/CodeGenCUDA/cluster_dims.cu
@@ -0,0 +1,38 @@
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -emit-llvm -x hip -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -emit-llvm -x hip -o - %s | FileCheck --check-prefix=HOST %s

#include "Inputs/cuda.h"

const int constint = 4;

// HOST-NOT: "amdgpu-cluster-dims"

// CHECK: "amdgpu-cluster-dims"="2,2,2"
__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {}

// CHECK: "amdgpu-cluster-dims"="2,2,1"
__global__ void __cluster_dims__(2, 2) test_literal_2d() {}

// CHECK: "amdgpu-cluster-dims"="4,1,1"
__global__ void __cluster_dims__(4) test_literal_1d() {}

// CHECK: "amdgpu-cluster-dims"="4,2,1"
__global__ void __cluster_dims__(constint, constint / 2, 1) test_constant() {}

// CHECK: "amdgpu-cluster-dims"="0,0,0"
__global__ void __no_cluster__ test_no_cluster() {}

// CHECK: "amdgpu-cluster-dims"="7,1,1"
template<unsigned a>
__global__ void __cluster_dims__(a) test_template_1d() {}
template __global__ void test_template_1d<7>();

// CHECK: "amdgpu-cluster-dims"="2,6,1"
template<unsigned a, unsigned b>
__global__ void __cluster_dims__(a, b) test_template_2d() {}
template __global__ void test_template_2d<2, 6>();

// CHECK: "amdgpu-cluster-dims"="1,2,3"
template<unsigned a, unsigned b, unsigned c>
__global__ void __cluster_dims__(a, b, c) test_template_3d() {}
template __global__ void test_template_3d<1, 2, 3>();
@@ -35,6 +35,7 @@
// CHECK-NEXT: CFUnknownTransfer (SubjectMatchRule_function)
// CHECK-NEXT: CPUDispatch (SubjectMatchRule_function)
// CHECK-NEXT: CPUSpecific (SubjectMatchRule_function)
// CHECK-NEXT: CUDAClusterDims (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
// CHECK-NEXT: CUDAConstant (SubjectMatchRule_variable)
// CHECK-NEXT: CUDADevice (SubjectMatchRule_function, SubjectMatchRule_variable)
// CHECK-NEXT: CUDADeviceBuiltinSurfaceType (SubjectMatchRule_record)
Expand All @@ -43,6 +44,7 @@
// CHECK-NEXT: CUDAGridConstant (SubjectMatchRule_variable_is_parameter)
// CHECK-NEXT: CUDAHost (SubjectMatchRule_function)
// CHECK-NEXT: CUDALaunchBounds (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
// CHECK-NEXT: CUDANoCluster (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
// CHECK-NEXT: CUDAShared (SubjectMatchRule_variable)
// CHECK-NEXT: CXX11NoReturn (SubjectMatchRule_function)
// CHECK-NEXT: CallableWhen (SubjectMatchRule_function_is_member)
2 changes: 2 additions & 0 deletions clang/test/SemaCUDA/Inputs/cuda.h
@@ -13,6 +13,8 @@
#define __managed__ __attribute__((managed))
#define __grid_constant__ __attribute__((grid_constant))
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
#define __no_cluster__ __attribute__((no_cluster))

struct dim3 {
unsigned x, y, z;