@@ -52,16 +52,94 @@ bool SemaCUDA::PopForceHostDevice() {
5252ExprResult SemaCUDA::ActOnExecConfigExpr (Scope *S, SourceLocation LLLLoc,
5353 MultiExprArg ExecConfig,
5454 SourceLocation GGGLoc) {
55- FunctionDecl *ConfigDecl = getASTContext ().getcudaConfigureCallDecl ();
55+ bool IsDeviceKernelCall = false ;
56+ switch (CurrentTarget ()) {
57+ case CUDAFunctionTarget::Global:
58+ case CUDAFunctionTarget::Device:
59+ IsDeviceKernelCall = true ;
60+ break ;
61+ case CUDAFunctionTarget::HostDevice:
62+ if (getLangOpts ().CUDAIsDevice ) {
63+ IsDeviceKernelCall = true ;
64+ if (FunctionDecl *Caller =
65+ SemaRef.getCurFunctionDecl (/* AllowLambda=*/ true );
66+ Caller && isImplicitHostDeviceFunction (Caller)) {
67+ // Under the device compilation, config call under an HD function should
68+ // be treated as a device kernel call. But, for implicit HD ones (such
69+ // as lambdas), need to check whether RDC is enabled or not.
70+ if (!getLangOpts ().GPURelocatableDeviceCode )
71+ IsDeviceKernelCall = false ;
72+ // HIP doesn't support device-side kernel call yet. Still treat it as
73+ // the host-side kernel call.
74+ if (getLangOpts ().HIP )
75+ IsDeviceKernelCall = false ;
76+ }
77+ }
78+ break ;
79+ default :
80+ break ;
81+ }
82+
83+ if (IsDeviceKernelCall && getLangOpts ().HIP )
84+ return ExprError (
85+ Diag (LLLLoc, diag::err_cuda_device_kernel_launch_not_supported));
86+
87+ if (IsDeviceKernelCall && !getLangOpts ().GPURelocatableDeviceCode )
88+ return ExprError (
89+ Diag (LLLLoc, diag::err_cuda_device_kernel_launch_require_rdc));
90+
91+ FunctionDecl *ConfigDecl = IsDeviceKernelCall
92+ ? getASTContext ().getcudaLaunchDeviceDecl ()
93+ : getASTContext ().getcudaConfigureCallDecl ();
5694 if (!ConfigDecl)
5795 return ExprError (Diag (LLLLoc, diag::err_undeclared_var_use)
58- << getConfigureFuncName ());
96+ << (IsDeviceKernelCall ? getLaunchDeviceFuncName ()
97+ : getConfigureFuncName ()));
98+ // Additional check on the launch function if it's a device kernel call.
99+ if (IsDeviceKernelCall) {
100+ auto *GetParamBuf = getASTContext ().getcudaGetParameterBufferDecl ();
101+ if (!GetParamBuf)
102+ return ExprError (Diag (LLLLoc, diag::err_undeclared_var_use)
103+ << getGetParameterBufferFuncName ());
104+ }
105+
59106 QualType ConfigQTy = ConfigDecl->getType ();
60107
61108 DeclRefExpr *ConfigDR = new (getASTContext ()) DeclRefExpr (
62109 getASTContext (), ConfigDecl, false , ConfigQTy, VK_LValue, LLLLoc);
63110 SemaRef.MarkFunctionReferenced (LLLLoc, ConfigDecl);
64111
112+ if (IsDeviceKernelCall) {
113+ SmallVector<Expr *> Args;
114+ // Use a null pointer as the kernel function, which may not be resolvable
115+ // here. For example, resolving that kernel function may need additional
116+ // kernel arguments.
117+ llvm::APInt Zero (SemaRef.Context .getTypeSize (SemaRef.Context .IntTy ), 0 );
118+ Args.push_back (IntegerLiteral::Create (SemaRef.Context , Zero,
119+ SemaRef.Context .IntTy , LLLLoc));
120+ // Use a null pointer as the parameter buffer, which should be allocated in
121+ // the codegen.
122+ Args.push_back (IntegerLiteral::Create (SemaRef.Context , Zero,
123+ SemaRef.Context .IntTy , LLLLoc));
124+ // Add the original config arguments.
125+ llvm::append_range (Args, ExecConfig);
126+ // Add the default blockDim if it's missing.
127+ if (Args.size () < 4 ) {
128+ llvm::APInt One (SemaRef.Context .getTypeSize (SemaRef.Context .IntTy ), 1 );
129+ Args.push_back (IntegerLiteral::Create (SemaRef.Context , One,
130+ SemaRef.Context .IntTy , LLLLoc));
131+ }
132+ // Add the default sharedMemSize if it's missing.
133+ if (Args.size () < 5 )
134+ Args.push_back (IntegerLiteral::Create (SemaRef.Context , Zero,
135+ SemaRef.Context .IntTy , LLLLoc));
136+ // Add the default stream if it's missing.
137+ if (Args.size () < 6 )
138+ Args.push_back (IntegerLiteral::Create (SemaRef.Context , Zero,
139+ SemaRef.Context .IntTy , LLLLoc));
140+ return SemaRef.BuildCallExpr (S, ConfigDR, LLLLoc, Args, GGGLoc, nullptr ,
141+ /* IsExecConfig=*/ true );
142+ }
65143 return SemaRef.BuildCallExpr (S, ConfigDR, LLLLoc, ExecConfig, GGGLoc, nullptr ,
66144 /* IsExecConfig=*/ true );
67145}
@@ -246,12 +324,12 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
246324 CalleeTarget == CUDAFunctionTarget::InvalidTarget)
247325 return CFP_Never;
248326
249- // (a) Can't call global from some contexts until we support CUDA's
250- // dynamic parallelism.
327+ // (a) Call global from either global or device contexts is allowed as part
328+ // of CUDA's dynamic parallelism support .
251329 if (CalleeTarget == CUDAFunctionTarget::Global &&
252330 (CallerTarget == CUDAFunctionTarget::Global ||
253331 CallerTarget == CUDAFunctionTarget::Device))
254- return CFP_Never ;
332+ return CFP_Native ;
255333
256334 // (b) Calling HostDevice is OK for everyone.
257335 if (CalleeTarget == CUDAFunctionTarget::HostDevice)
@@ -279,7 +357,8 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
279357 if (CallerTarget == CUDAFunctionTarget::HostDevice) {
280358 // It's OK to call a compilation-mode matching function from an HD one.
281359 if ((getLangOpts ().CUDAIsDevice &&
282- CalleeTarget == CUDAFunctionTarget::Device) ||
360+ (CalleeTarget == CUDAFunctionTarget::Device ||
361+ CalleeTarget == CUDAFunctionTarget::Global)) ||
283362 (!getLangOpts ().CUDAIsDevice &&
284363 (CalleeTarget == CUDAFunctionTarget::Host ||
285364 CalleeTarget == CUDAFunctionTarget::Global)))
@@ -1103,6 +1182,18 @@ std::string SemaCUDA::getConfigureFuncName() const {
11031182 return " cudaConfigureCall" ;
11041183}
11051184
1185+ std::string SemaCUDA::getGetParameterBufferFuncName () const {
1186+ // FIXME: Use the API from CUDA programming guide. Add V2 support when
1187+ // necessary.
1188+ return " cudaGetParameterBuffer" ;
1189+ }
1190+
1191+ std::string SemaCUDA::getLaunchDeviceFuncName () const {
1192+ // FIXME: Use the API from CUDA programming guide. Add V2 support when
1193+ // necessary.
1194+ return " cudaLaunchDevice" ;
1195+ }
1196+
11061197// Record any local constexpr variables that are passed one way on the host
11071198// and another on the device.
11081199void SemaCUDA::recordPotentialODRUsedVariable (
0 commit comments