Skip to content

Commit 1f3fa59

Browse files
committed
[CUDA] Add device-side kernel launch support
- CUDA's dynamic parallelism extension allows device-side kernel launches, which share the same syntax as host-side launches, e.g., kernel<<<Dg, Db, Ns, S>>>(arguments); but differ in code generation. A device-side kernel launch is eventually translated into the following sequence: config = cudaGetParameterBuffer(alignment, size); // set up arguments by copying them into `config`. cudaLaunchDevice(func, config, Dg, Db, Ns, S); - To support device-side kernel launches, 'CUDAKernelCallExpr' is reused, but its config expr is set to a call to 'cudaLaunchDevice'. During code generation, 'CUDAKernelCallExpr' is expanded into the sequence described above. - As device-side kernel launches require the source to be compiled as relocatable device code and linked with '-lcudadevrt', linkers are changed to pass the relevant link options to 'nvlink'.
1 parent 47d71b6 commit 1f3fa59

File tree

21 files changed

+385
-56
lines changed

21 files changed

+385
-56
lines changed

clang/include/clang/AST/ASTContext.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -488,6 +488,10 @@ class ASTContext : public RefCountedBase<ASTContext> {
488488

489489
/// Declaration for the CUDA cudaConfigureCall function.
490490
FunctionDecl *cudaConfigureCallDecl = nullptr;
491+
/// Declaration for the CUDA cudaGetParameterBuffer function.
492+
FunctionDecl *cudaGetParameterBufferDecl = nullptr;
493+
/// Declaration for the CUDA cudaLaunchDevice function.
494+
FunctionDecl *cudaLaunchDeviceDecl = nullptr;
491495

492496
/// Keeps track of all declaration attributes.
493497
///
@@ -1641,6 +1645,18 @@ class ASTContext : public RefCountedBase<ASTContext> {
16411645
return cudaConfigureCallDecl;
16421646
}
16431647

1648+
void setcudaGetParameterBufferDecl(FunctionDecl *FD) {
1649+
cudaGetParameterBufferDecl = FD;
1650+
}
1651+
1652+
FunctionDecl *getcudaGetParameterBufferDecl() {
1653+
return cudaGetParameterBufferDecl;
1654+
}
1655+
1656+
void setcudaLaunchDeviceDecl(FunctionDecl *FD) { cudaLaunchDeviceDecl = FD; }
1657+
1658+
FunctionDecl *getcudaLaunchDeviceDecl() { return cudaLaunchDeviceDecl; }
1659+
16441660
/// Returns true iff we need copy/dispose helpers for the given type.
16451661
bool BlockRequiresCopying(QualType Ty, const VarDecl *D);
16461662

clang/include/clang/Basic/DiagnosticSemaKinds.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9501,6 +9501,8 @@ def err_kern_is_nonstatic_method : Error<
95019501
"kernel function %0 must be a free function or static member function">;
95029502
def err_config_scalar_return : Error<
95039503
"CUDA special function '%0' must have scalar return type">;
9504+
def err_config_pointer_return
9505+
: Error<"CUDA special function '%0' must have pointer return type">;
95049506
def err_kern_call_not_global_function : Error<
95059507
"kernel call to non-global function %0">;
95069508
def err_global_call_not_config : Error<
@@ -13692,4 +13694,10 @@ def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">;
1369213694
def note_amdgcn_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">;
1369313695

1369413696
def err_amdgcn_coop_atomic_invalid_as : Error<"cooperative atomic requires a global or generic pointer">;
13697+
13698+
def err_cuda_device_kernel_launch_not_supported
13699+
: Error<"device-side kernel call/launch is not supported">;
13700+
def err_cuda_device_kernel_launch_require_rdc
13701+
: Error<"kernel launch from __device__ or __global__ function requires "
13702+
"relocatable device code, also known as separate compilation mode">;
1369513703
} // end of sema component.

clang/include/clang/Sema/SemaCUDA.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,11 @@ class SemaCUDA : public SemaBase {
273273
/// of the function that will be called to configure kernel call, with the
274274
/// parameters specified via <<<>>>.
275275
std::string getConfigureFuncName() const;
276+
/// Return the name of the parameter buffer allocation function for the
277+
/// device kernel launch.
278+
std::string getGetParameterBufferFuncName() const;
279+
/// Return the name of the device kernel launch function.
280+
std::string getLaunchDeviceFuncName() const;
276281

277282
/// Record variables that are potentially ODR-used in CUDA/HIP.
278283
void recordPotentialODRUsedVariable(MultiExprArg Args,

clang/include/clang/Serialization/ASTReader.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1013,7 +1013,7 @@ class ASTReader
10131013
///
10141014
/// The AST context tracks a few important decls, currently cudaConfigureCall,
10151015
/// directly.
1016-
SmallVector<GlobalDeclID, 2> CUDASpecialDeclRefs;
1016+
SmallVector<GlobalDeclID, 4> CUDASpecialDeclRefs;
10171017

10181018
/// The floating point pragma option settings.
10191019
SmallVector<uint64_t, 1> FPPragmaOptions;

clang/lib/CodeGen/CGCUDARuntime.cpp

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,116 @@ using namespace CodeGen;
2222

2323
CGCUDARuntime::~CGCUDARuntime() {}
2424

25+
static llvm::Value *emitGetParamBuf(CodeGenFunction &CGF,
26+
const CUDAKernelCallExpr *E) {
27+
auto *GetParamBuf = CGF.getContext().getcudaGetParameterBufferDecl();
28+
const FunctionProtoType *GetParamBufProto =
29+
GetParamBuf->getType()->getAs<FunctionProtoType>();
30+
31+
DeclRefExpr *DRE = DeclRefExpr::Create(
32+
CGF.getContext(), {}, {}, GetParamBuf,
33+
/*RefersToEnclosingVariableOrCapture=*/false, GetParamBuf->getNameInfo(),
34+
GetParamBuf->getType(), VK_PRValue);
35+
auto *ImpCast = ImplicitCastExpr::Create(
36+
CGF.getContext(), CGF.getContext().getPointerType(GetParamBuf->getType()),
37+
CK_FunctionToPointerDecay, DRE, nullptr, VK_PRValue, FPOptionsOverride());
38+
39+
CGCallee Callee = CGF.EmitCallee(ImpCast);
40+
CallArgList Args;
41+
// Use 64B alignment.
42+
Args.add(RValue::get(CGF.CGM.getSize(CharUnits::fromQuantity(64))),
43+
CGF.getContext().getSizeType());
44+
// Calculate parameter sizes.
45+
const PointerType *PT = E->getCallee()->getType()->getAs<PointerType>();
46+
const FunctionProtoType *FTP =
47+
PT->getPointeeType()->getAs<FunctionProtoType>();
48+
CharUnits Offset = CharUnits::Zero();
49+
for (auto ArgTy : FTP->getParamTypes()) {
50+
auto TInfo = CGF.CGM.getContext().getTypeInfoInChars(ArgTy);
51+
Offset = Offset.alignTo(TInfo.Align);
52+
Offset += TInfo.Width;
53+
}
54+
Args.add(RValue::get(CGF.CGM.getSize(Offset)),
55+
CGF.getContext().getSizeType());
56+
const CGFunctionInfo &CallInfo = CGF.CGM.getTypes().arrangeFreeFunctionCall(
57+
Args, GetParamBufProto, /*ChainCall=*/false);
58+
auto Ret = CGF.EmitCall(CallInfo, Callee, /*ReturnValue=*/{}, Args);
59+
60+
return Ret.getScalarVal();
61+
}
62+
63+
RValue CGCUDARuntime::EmitCUDADeviceKernelCallExpr(
64+
CodeGenFunction &CGF, const CUDAKernelCallExpr *E,
65+
ReturnValueSlot ReturnValue, llvm::CallBase **CallOrInvoke) {
66+
ASTContext &Ctx = CGM.getContext();
67+
assert(Ctx.getcudaLaunchDeviceDecl() == E->getConfig()->getDirectCallee());
68+
69+
llvm::BasicBlock *ConfigOKBlock = CGF.createBasicBlock("dkcall.configok");
70+
llvm::BasicBlock *ContBlock = CGF.createBasicBlock("dkcall.end");
71+
72+
llvm::Value *Config = emitGetParamBuf(CGF, E);
73+
CGF.Builder.CreateCondBr(
74+
CGF.Builder.CreateICmpNE(Config,
75+
llvm::Constant::getNullValue(Config->getType())),
76+
ConfigOKBlock, ContBlock);
77+
78+
CodeGenFunction::ConditionalEvaluation eval(CGF);
79+
80+
eval.begin(CGF);
81+
CGF.EmitBlock(ConfigOKBlock);
82+
83+
QualType KernelCalleeFuncTy =
84+
E->getCallee()->getType()->getAs<PointerType>()->getPointeeType();
85+
CGCallee KernelCallee = CGF.EmitCallee(E->getCallee());
86+
// Emit kernel arguments.
87+
CallArgList KernelCallArgs;
88+
CGF.EmitCallArgs(
89+
KernelCallArgs,
90+
dyn_cast<FunctionProtoType>(KernelCalleeFuncTy->castAs<FunctionType>()),
91+
E->arguments(), E->getDirectCallee());
92+
// Copy emitted kernel arguments into that parameter buffer.
93+
RawAddress CfgBase(Config, CGM.Int8Ty,
94+
/*Alignment=*/CharUnits::fromQuantity(64));
95+
CharUnits Offset = CharUnits::Zero();
96+
for (auto &Arg : KernelCallArgs) {
97+
auto TInfo = CGM.getContext().getTypeInfoInChars(Arg.getType());
98+
Offset = Offset.alignTo(TInfo.Align);
99+
Address Addr =
100+
CGF.Builder.CreateConstInBoundsGEP(CfgBase, Offset.getQuantity());
101+
Arg.copyInto(CGF, Addr);
102+
Offset += TInfo.Width;
103+
}
104+
// Make `cudaLaunchDevice` call, i.e. E->getConfig().
105+
const CallExpr *LaunchCall = E->getConfig();
106+
QualType LaunchCalleeFuncTy = LaunchCall->getCallee()
107+
->getType()
108+
->getAs<PointerType>()
109+
->getPointeeType();
110+
CGCallee LaunchCallee = CGF.EmitCallee(LaunchCall->getCallee());
111+
CallArgList LaunchCallArgs;
112+
CGF.EmitCallArgs(
113+
LaunchCallArgs,
114+
dyn_cast<FunctionProtoType>(LaunchCalleeFuncTy->castAs<FunctionType>()),
115+
LaunchCall->arguments(), LaunchCall->getDirectCallee());
116+
// Replace the func and parameter-buffer arguments.
117+
LaunchCallArgs[0] = CallArg(RValue::get(KernelCallee.getFunctionPointer()),
118+
CGM.getContext().VoidPtrTy);
119+
LaunchCallArgs[1] = CallArg(RValue::get(Config), CGM.getContext().VoidPtrTy);
120+
const CGFunctionInfo &LaunchCallInfo = CGM.getTypes().arrangeFreeFunctionCall(
121+
LaunchCallArgs,
122+
dyn_cast<FunctionProtoType>(LaunchCalleeFuncTy->castAs<FunctionType>()),
123+
/*ChainCall=*/false);
124+
CGF.EmitCall(LaunchCallInfo, LaunchCallee, ReturnValue, LaunchCallArgs,
125+
CallOrInvoke,
126+
/*IsMustTail=*/false, E->getExprLoc());
127+
CGF.EmitBranch(ContBlock);
128+
129+
CGF.EmitBlock(ContBlock);
130+
eval.end(CGF);
131+
132+
return RValue::get(nullptr);
133+
}
134+
25135
RValue CGCUDARuntime::EmitCUDAKernelCallExpr(CodeGenFunction &CGF,
26136
const CUDAKernelCallExpr *E,
27137
ReturnValueSlot ReturnValue,

clang/lib/CodeGen/CGCUDARuntime.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,10 @@ class CGCUDARuntime {
8888
ReturnValueSlot ReturnValue,
8989
llvm::CallBase **CallOrInvoke = nullptr);
9090

91+
virtual RValue EmitCUDADeviceKernelCallExpr(
92+
CodeGenFunction &CGF, const CUDAKernelCallExpr *E,
93+
ReturnValueSlot ReturnValue, llvm::CallBase **CallOrInvoke = nullptr);
94+
9195
/// Emits a kernel launch stub.
9296
virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0;
9397

clang/lib/CodeGen/CGExprCXX.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,6 +503,12 @@ RValue CodeGenFunction::EmitCXXOperatorMemberCallExpr(
503503
RValue CodeGenFunction::EmitCUDAKernelCallExpr(const CUDAKernelCallExpr *E,
504504
ReturnValueSlot ReturnValue,
505505
llvm::CallBase **CallOrInvoke) {
506+
auto *FD = E->getConfig()->getDirectCallee();
507+
// Emit as a device kernel call if the config expr is a call to
508+
// 'cudaLaunchDevice' (set up by Sema for device-side launches).
509+
if (FD && CGM.getContext().getcudaLaunchDeviceDecl() == FD)
510+
return CGM.getCUDARuntime().EmitCUDADeviceKernelCallExpr(
511+
*this, E, ReturnValue, CallOrInvoke);
506512
return CGM.getCUDARuntime().EmitCUDAKernelCallExpr(*this, E, ReturnValue,
507513
CallOrInvoke);
508514
}

clang/lib/Sema/SemaCUDA.cpp

Lines changed: 97 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,16 +52,94 @@ bool SemaCUDA::PopForceHostDevice() {
5252
ExprResult SemaCUDA::ActOnExecConfigExpr(Scope *S, SourceLocation LLLLoc,
5353
MultiExprArg ExecConfig,
5454
SourceLocation GGGLoc) {
55-
FunctionDecl *ConfigDecl = getASTContext().getcudaConfigureCallDecl();
55+
bool IsDeviceKernelCall = false;
56+
switch (CurrentTarget()) {
57+
case CUDAFunctionTarget::Global:
58+
case CUDAFunctionTarget::Device:
59+
IsDeviceKernelCall = true;
60+
break;
61+
case CUDAFunctionTarget::HostDevice:
62+
if (getLangOpts().CUDAIsDevice) {
63+
IsDeviceKernelCall = true;
64+
if (FunctionDecl *Caller =
65+
SemaRef.getCurFunctionDecl(/*AllowLambda=*/true);
66+
Caller && isImplicitHostDeviceFunction(Caller)) {
67+
// During device compilation, a config call within an HD function should
68+
// be treated as a device kernel call. But for implicit HD functions (such
69+
// as lambdas), we need to check whether RDC is enabled or not.
70+
if (!getLangOpts().GPURelocatableDeviceCode)
71+
IsDeviceKernelCall = false;
72+
// HIP doesn't support device-side kernel call yet. Still treat it as
73+
// the host-side kernel call.
74+
if (getLangOpts().HIP)
75+
IsDeviceKernelCall = false;
76+
}
77+
}
78+
break;
79+
default:
80+
break;
81+
}
82+
83+
if (IsDeviceKernelCall && getLangOpts().HIP)
84+
return ExprError(
85+
Diag(LLLLoc, diag::err_cuda_device_kernel_launch_not_supported));
86+
87+
if (IsDeviceKernelCall && !getLangOpts().GPURelocatableDeviceCode)
88+
return ExprError(
89+
Diag(LLLLoc, diag::err_cuda_device_kernel_launch_require_rdc));
90+
91+
FunctionDecl *ConfigDecl = IsDeviceKernelCall
92+
? getASTContext().getcudaLaunchDeviceDecl()
93+
: getASTContext().getcudaConfigureCallDecl();
5694
if (!ConfigDecl)
5795
return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
58-
<< getConfigureFuncName());
96+
<< (IsDeviceKernelCall ? getLaunchDeviceFuncName()
97+
: getConfigureFuncName()));
98+
// Additional check on the launch function if it's a device kernel call.
99+
if (IsDeviceKernelCall) {
100+
auto *GetParamBuf = getASTContext().getcudaGetParameterBufferDecl();
101+
if (!GetParamBuf)
102+
return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
103+
<< getGetParameterBufferFuncName());
104+
}
105+
59106
QualType ConfigQTy = ConfigDecl->getType();
60107

61108
DeclRefExpr *ConfigDR = new (getASTContext()) DeclRefExpr(
62109
getASTContext(), ConfigDecl, false, ConfigQTy, VK_LValue, LLLLoc);
63110
SemaRef.MarkFunctionReferenced(LLLLoc, ConfigDecl);
64111

112+
if (IsDeviceKernelCall) {
113+
SmallVector<Expr *> Args;
114+
// Use a null pointer as the kernel function, which may not be resolvable
115+
// here. For example, resolving that kernel function may need additional
116+
// kernel arguments.
117+
llvm::APInt Zero(SemaRef.Context.getTypeSize(SemaRef.Context.IntTy), 0);
118+
Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
119+
SemaRef.Context.IntTy, LLLLoc));
120+
// Use a null pointer as the parameter buffer, which should be allocated in
121+
// the codegen.
122+
Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
123+
SemaRef.Context.IntTy, LLLLoc));
124+
// Add the original config arguments.
125+
llvm::append_range(Args, ExecConfig);
126+
// Add the default blockDim if it's missing.
127+
if (Args.size() < 4) {
128+
llvm::APInt One(SemaRef.Context.getTypeSize(SemaRef.Context.IntTy), 1);
129+
Args.push_back(IntegerLiteral::Create(SemaRef.Context, One,
130+
SemaRef.Context.IntTy, LLLLoc));
131+
}
132+
// Add the default sharedMemSize if it's missing.
133+
if (Args.size() < 5)
134+
Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
135+
SemaRef.Context.IntTy, LLLLoc));
136+
// Add the default stream if it's missing.
137+
if (Args.size() < 6)
138+
Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
139+
SemaRef.Context.IntTy, LLLLoc));
140+
return SemaRef.BuildCallExpr(S, ConfigDR, LLLLoc, Args, GGGLoc, nullptr,
141+
/*IsExecConfig=*/true);
142+
}
65143
return SemaRef.BuildCallExpr(S, ConfigDR, LLLLoc, ExecConfig, GGGLoc, nullptr,
66144
/*IsExecConfig=*/true);
67145
}
@@ -246,12 +324,12 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
246324
CalleeTarget == CUDAFunctionTarget::InvalidTarget)
247325
return CFP_Never;
248326

249-
// (a) Can't call global from some contexts until we support CUDA's
250-
// dynamic parallelism.
327+
// (a) Call global from either global or device contexts is allowed as part
328+
// of CUDA's dynamic parallelism support.
251329
if (CalleeTarget == CUDAFunctionTarget::Global &&
252330
(CallerTarget == CUDAFunctionTarget::Global ||
253331
CallerTarget == CUDAFunctionTarget::Device))
254-
return CFP_Never;
332+
return CFP_Native;
255333

256334
// (b) Calling HostDevice is OK for everyone.
257335
if (CalleeTarget == CUDAFunctionTarget::HostDevice)
@@ -279,7 +357,8 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
279357
if (CallerTarget == CUDAFunctionTarget::HostDevice) {
280358
// It's OK to call a compilation-mode matching function from an HD one.
281359
if ((getLangOpts().CUDAIsDevice &&
282-
CalleeTarget == CUDAFunctionTarget::Device) ||
360+
(CalleeTarget == CUDAFunctionTarget::Device ||
361+
CalleeTarget == CUDAFunctionTarget::Global)) ||
283362
(!getLangOpts().CUDAIsDevice &&
284363
(CalleeTarget == CUDAFunctionTarget::Host ||
285364
CalleeTarget == CUDAFunctionTarget::Global)))
@@ -1103,6 +1182,18 @@ std::string SemaCUDA::getConfigureFuncName() const {
11031182
return "cudaConfigureCall";
11041183
}
11051184

1185+
std::string SemaCUDA::getGetParameterBufferFuncName() const {
1186+
// FIXME: Use the API from CUDA programming guide. Add V2 support when
1187+
// necessary.
1188+
return "cudaGetParameterBuffer";
1189+
}
1190+
1191+
std::string SemaCUDA::getLaunchDeviceFuncName() const {
1192+
// FIXME: Use the API from CUDA programming guide. Add V2 support when
1193+
// necessary.
1194+
return "cudaLaunchDevice";
1195+
}
1196+
11061197
// Record any local constexpr variables that are passed one way on the host
11071198
// and another on the device.
11081199
void SemaCUDA::recordPotentialODRUsedVariable(

clang/lib/Sema/SemaDecl.cpp

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11050,14 +11050,30 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
1105011050
}
1105111051

1105211052
if (getLangOpts().CUDA) {
11053-
IdentifierInfo *II = NewFD->getIdentifier();
11054-
if (II && II->isStr(CUDA().getConfigureFuncName()) &&
11055-
!NewFD->isInvalidDecl() &&
11056-
NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
11057-
if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
11058-
Diag(NewFD->getLocation(), diag::err_config_scalar_return)
11059-
<< CUDA().getConfigureFuncName();
11060-
Context.setcudaConfigureCallDecl(NewFD);
11053+
if (IdentifierInfo *II = NewFD->getIdentifier()) {
11054+
if (II->isStr(CUDA().getConfigureFuncName()) && !NewFD->isInvalidDecl() &&
11055+
NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
11056+
if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
11057+
Diag(NewFD->getLocation(), diag::err_config_scalar_return)
11058+
<< CUDA().getConfigureFuncName();
11059+
Context.setcudaConfigureCallDecl(NewFD);
11060+
}
11061+
if (II->isStr(CUDA().getGetParameterBufferFuncName()) &&
11062+
!NewFD->isInvalidDecl() &&
11063+
NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
11064+
if (!R->castAs<FunctionType>()->getReturnType()->isPointerType())
11065+
Diag(NewFD->getLocation(), diag::err_config_pointer_return)
11066+
<< CUDA().getConfigureFuncName();
11067+
Context.setcudaGetParameterBufferDecl(NewFD);
11068+
}
11069+
if (II->isStr(CUDA().getLaunchDeviceFuncName()) &&
11070+
!NewFD->isInvalidDecl() &&
11071+
NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
11072+
if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
11073+
Diag(NewFD->getLocation(), diag::err_config_scalar_return)
11074+
<< CUDA().getConfigureFuncName();
11075+
Context.setcudaLaunchDeviceDecl(NewFD);
11076+
}
1106111077
}
1106211078
}
1106311079

0 commit comments

Comments
 (0)