16 changes: 16 additions & 0 deletions clang/include/clang/AST/ASTContext.h
@@ -500,6 +500,10 @@ class ASTContext : public RefCountedBase<ASTContext> {

/// Declaration for the CUDA cudaConfigureCall function.
FunctionDecl *cudaConfigureCallDecl = nullptr;
/// Declaration for the CUDA cudaGetParameterBuffer function.
FunctionDecl *cudaGetParameterBufferDecl = nullptr;
/// Declaration for the CUDA cudaLaunchDevice function.
FunctionDecl *cudaLaunchDeviceDecl = nullptr;

/// Keeps track of all declaration attributes.
///
@@ -1653,6 +1657,18 @@ class ASTContext : public RefCountedBase<ASTContext> {
return cudaConfigureCallDecl;
}

void setcudaGetParameterBufferDecl(FunctionDecl *FD) {
cudaGetParameterBufferDecl = FD;
}

FunctionDecl *getcudaGetParameterBufferDecl() {
return cudaGetParameterBufferDecl;
}

void setcudaLaunchDeviceDecl(FunctionDecl *FD) { cudaLaunchDeviceDecl = FD; }

FunctionDecl *getcudaLaunchDeviceDecl() { return cudaLaunchDeviceDecl; }

/// Returns true iff we need copy/dispose helpers for the given type.
bool BlockRequiresCopying(QualType Ty, const VarDecl *D);

8 changes: 8 additions & 0 deletions clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9512,6 +9512,8 @@ def err_kern_is_nonstatic_method : Error<
"kernel function %0 must be a free function or static member function">;
def err_config_scalar_return : Error<
"CUDA special function '%0' must have scalar return type">;
def err_config_pointer_return
: Error<"CUDA special function '%0' must have pointer return type">;
def err_kern_call_not_global_function : Error<
"kernel call to non-global function %0">;
def err_global_call_not_config : Error<
@@ -13707,4 +13709,10 @@ def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">;
def note_amdgcn_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">;

def err_amdgcn_coop_atomic_invalid_as : Error<"cooperative atomic requires a global or generic pointer">;

def err_cuda_device_kernel_launch_not_supported
: Error<"device-side kernel call/launch is not supported">;
def err_cuda_device_kernel_launch_require_rdc
: Error<"kernel launch from __device__ or __global__ function requires "
"relocatable device code (i.e. requires -fgpu-rdc)">;
} // end of sema component.
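
As a rough illustration of when the two new diagnostics fire — a sketch, assuming a CUDA translation unit with a `child` kernel:

    __global__ void child();

    __global__ void parent() {
      // Device-side launch: rejected with
      // err_cuda_device_kernel_launch_require_rdc when compiled without
      // -fgpu-rdc, and with err_cuda_device_kernel_launch_not_supported
      // under HIP.
      child<<<1, 1>>>();
    }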
5 changes: 5 additions & 0 deletions clang/include/clang/Sema/SemaCUDA.h
@@ -273,6 +273,11 @@ class SemaCUDA : public SemaBase {
/// of the function that will be called to configure kernel call, with the
/// parameters specified via <<<>>>.
std::string getConfigureFuncName() const;
/// Return the name of the parameter buffer allocation function for the
/// device kernel launch.
std::string getGetParameterBufferFuncName() const;
/// Return the name of the device kernel launch function.
std::string getLaunchDeviceFuncName() const;

/// Record variables that are potentially ODR-used in CUDA/HIP.
void recordPotentialODRUsedVariable(MultiExprArg Args,
2 changes: 1 addition & 1 deletion clang/include/clang/Serialization/ASTReader.h
@@ -1005,7 +1005,7 @@ class ASTReader
///
/// The AST context tracks a few important decls, currently cudaConfigureCall,
/// directly.
SmallVector<GlobalDeclID, 2> CUDASpecialDeclRefs;
SmallVector<GlobalDeclID, 4> CUDASpecialDeclRefs;

/// The floating point pragma option settings.
SmallVector<uint64_t, 1> FPPragmaOptions;
106 changes: 106 additions & 0 deletions clang/lib/CodeGen/CGCUDARuntime.cpp
@@ -22,6 +22,112 @@ using namespace CodeGen;

CGCUDARuntime::~CGCUDARuntime() {}

static llvm::Value *emitGetParamBuf(CodeGenFunction &CGF,
const CUDAKernelCallExpr *E) {
auto *GetParamBuf = CGF.getContext().getcudaGetParameterBufferDecl();
const FunctionProtoType *GetParamBufProto =
GetParamBuf->getType()->getAs<FunctionProtoType>();

DeclRefExpr *DRE = DeclRefExpr::Create(
CGF.getContext(), {}, {}, GetParamBuf,
/*RefersToEnclosingVariableOrCapture=*/false, GetParamBuf->getNameInfo(),
GetParamBuf->getType(), VK_PRValue);
auto *ImpCast = ImplicitCastExpr::Create(
CGF.getContext(), CGF.getContext().getPointerType(GetParamBuf->getType()),
CK_FunctionToPointerDecay, DRE, nullptr, VK_PRValue, FPOptionsOverride());

CGCallee Callee = CGF.EmitCallee(ImpCast);
CallArgList Args;
// Use 64B alignment.
Args.add(RValue::get(CGF.CGM.getSize(CharUnits::fromQuantity(64))),
CGF.getContext().getSizeType());
// Calculate parameter sizes.
const PointerType *PT = E->getCallee()->getType()->getAs<PointerType>();
const FunctionProtoType *FTP =
PT->getPointeeType()->getAs<FunctionProtoType>();
CharUnits Offset = CharUnits::Zero();
for (auto ArgTy : FTP->getParamTypes()) {
auto TInfo = CGF.CGM.getContext().getTypeInfoInChars(ArgTy);
Offset = Offset.alignTo(TInfo.Align) + TInfo.Width;
}
Args.add(RValue::get(CGF.CGM.getSize(Offset)),
CGF.getContext().getSizeType());
const CGFunctionInfo &CallInfo = CGF.CGM.getTypes().arrangeFreeFunctionCall(
Args, GetParamBufProto, /*ChainCall=*/false);
auto Ret = CGF.EmitCall(CallInfo, Callee, /*ReturnValue=*/{}, Args);

return Ret.getScalarVal();
}
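
As a minimal standalone sketch of the size computation in the loop above (a hypothetical helper, not Clang code): the running offset is rounded up to each parameter's alignment (assumed a power of two) before its width is added, and the final value is the byte size requested from cudaGetParameterBuffer.

    #include <cstddef>

    struct ArgInfo { std::size_t Width, Align; }; // hypothetical helper type

    std::size_t paramBufferSize(const ArgInfo *Args, std::size_t N) {
      std::size_t Offset = 0;
      for (std::size_t I = 0; I != N; ++I) {
        // Round Offset up to the argument's alignment, then add its size.
        Offset = (Offset + Args[I].Align - 1) & ~(Args[I].Align - 1);
        Offset += Args[I].Width;
      }
      return Offset; // byte size requested from cudaGetParameterBuffer
    }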

RValue CGCUDARuntime::EmitCUDADeviceKernelCallExpr(
CodeGenFunction &CGF, const CUDAKernelCallExpr *E,
ReturnValueSlot ReturnValue, llvm::CallBase **CallOrInvoke) {
ASTContext &Ctx = CGM.getContext();
assert(Ctx.getcudaLaunchDeviceDecl() == E->getConfig()->getDirectCallee());

llvm::BasicBlock *ConfigOKBlock = CGF.createBasicBlock("dkcall.configok");
llvm::BasicBlock *ContBlock = CGF.createBasicBlock("dkcall.end");

llvm::Value *Config = emitGetParamBuf(CGF, E);
CGF.Builder.CreateCondBr(
CGF.Builder.CreateICmpNE(Config,
llvm::Constant::getNullValue(Config->getType())),
ConfigOKBlock, ContBlock);

CodeGenFunction::ConditionalEvaluation eval(CGF);

eval.begin(CGF);
CGF.EmitBlock(ConfigOKBlock);

QualType KernelCalleeFuncTy =
E->getCallee()->getType()->getAs<PointerType>()->getPointeeType();
CGCallee KernelCallee = CGF.EmitCallee(E->getCallee());
// Emit kernel arguments.
CallArgList KernelCallArgs;
CGF.EmitCallArgs(KernelCallArgs,
KernelCalleeFuncTy->getAs<FunctionProtoType>(),
E->arguments(), E->getDirectCallee());
// Copy emitted kernel arguments into that parameter buffer.
RawAddress CfgBase(Config, CGM.Int8Ty,
/*Alignment=*/CharUnits::fromQuantity(64));
CharUnits Offset = CharUnits::Zero();
for (auto &Arg : KernelCallArgs) {
auto TInfo = CGM.getContext().getTypeInfoInChars(Arg.getType());
Offset = Offset.alignTo(TInfo.Align);
Address Addr =
CGF.Builder.CreateConstInBoundsGEP(CfgBase, Offset.getQuantity());
Arg.copyInto(CGF, Addr);
Offset += TInfo.Width;
}
// Emit the `cudaLaunchDevice` call, i.e. E->getConfig().
const CallExpr *LaunchCall = E->getConfig();
QualType LaunchCalleeFuncTy = LaunchCall->getCallee()
->getType()
->getAs<PointerType>()
->getPointeeType();
CGCallee LaunchCallee = CGF.EmitCallee(LaunchCall->getCallee());
CallArgList LaunchCallArgs;
CGF.EmitCallArgs(LaunchCallArgs,
LaunchCalleeFuncTy->getAs<FunctionProtoType>(),
LaunchCall->arguments(), LaunchCall->getDirectCallee());
// Replace the function and parameter-buffer arguments.
LaunchCallArgs[0] = CallArg(RValue::get(KernelCallee.getFunctionPointer()),
CGM.getContext().VoidPtrTy);
LaunchCallArgs[1] = CallArg(RValue::get(Config), CGM.getContext().VoidPtrTy);
const CGFunctionInfo &LaunchCallInfo = CGM.getTypes().arrangeFreeFunctionCall(
LaunchCallArgs, LaunchCalleeFuncTy->getAs<FunctionProtoType>(),
/*ChainCall=*/false);
CGF.EmitCall(LaunchCallInfo, LaunchCallee, ReturnValue, LaunchCallArgs,
CallOrInvoke,
/*IsMustTail=*/false, E->getExprLoc());
CGF.EmitBranch(ContBlock);

CGF.EmitBlock(ContBlock);
eval.end(CGF);

return RValue::get(nullptr);
}
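
Conceptually, the code above lowers a device-side launch into something like the following CUDA C++ — a hedged sketch of the emitted control flow, not the actual IR; the kernel, its `int` argument, and the config operands are assumed for illustration:

    __global__ void kernel(int x);

    __device__ void launchSketch(int x, dim3 gridDim, dim3 blockDim,
                                 unsigned sharedMem, cudaStream_t stream) {
      void *buf = cudaGetParameterBuffer(/*alignment=*/64, sizeof(int));
      if (buf) {                           // dkcall.configok
        *static_cast<int *>(buf) = x;      // copy each arg at its aligned offset
        // Launch with the real kernel pointer and the filled buffer; the
        // grid/block/shmem/stream operands come from E->getConfig().
        cudaLaunchDevice((void *)kernel, buf, gridDim, blockDim,
                         sharedMem, stream);
      }                                    // dkcall.end
    }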

RValue CGCUDARuntime::EmitCUDAKernelCallExpr(CodeGenFunction &CGF,
const CUDAKernelCallExpr *E,
ReturnValueSlot ReturnValue,
4 changes: 4 additions & 0 deletions clang/lib/CodeGen/CGCUDARuntime.h
@@ -88,6 +88,10 @@ class CGCUDARuntime {
ReturnValueSlot ReturnValue,
llvm::CallBase **CallOrInvoke = nullptr);

virtual RValue EmitCUDADeviceKernelCallExpr(
CodeGenFunction &CGF, const CUDAKernelCallExpr *E,
ReturnValueSlot ReturnValue, llvm::CallBase **CallOrInvoke = nullptr);

/// Emits a kernel launch stub.
virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0;

6 changes: 6 additions & 0 deletions clang/lib/CodeGen/CGExprCXX.cpp
@@ -503,6 +503,12 @@ RValue CodeGenFunction::EmitCXXOperatorMemberCallExpr(
RValue CodeGenFunction::EmitCUDAKernelCallExpr(const CUDAKernelCallExpr *E,
ReturnValueSlot ReturnValue,
llvm::CallBase **CallOrInvoke) {
auto *FD = E->getConfig()->getDirectCallee();
// Emit as a device kernel call if the config is prepared using
// 'cudaGetParameterBuffer'.
if (FD && CGM.getContext().getcudaLaunchDeviceDecl() == FD)
Review comment on lines +507 to +509: It's not clear how the comment's "prepared using 'cudaGetParameterBuffer'" maps to the new code. We appear to be relying on second-order implementation details. We should be able to check directly whether we're generating a kernel launch from a GPU-side function.

return CGM.getCUDARuntime().EmitCUDADeviceKernelCallExpr(
*this, E, ReturnValue, CallOrInvoke);
return CGM.getCUDARuntime().EmitCUDAKernelCallExpr(*this, E, ReturnValue,
CallOrInvoke);
}
99 changes: 93 additions & 6 deletions clang/lib/Sema/SemaCUDA.cpp
@@ -52,16 +52,94 @@ bool SemaCUDA::PopForceHostDevice() {
ExprResult SemaCUDA::ActOnExecConfigExpr(Scope *S, SourceLocation LLLLoc,
MultiExprArg ExecConfig,
SourceLocation GGGLoc) {
FunctionDecl *ConfigDecl = getASTContext().getcudaConfigureCallDecl();
bool IsDeviceKernelCall = false;
switch (CurrentTarget()) {
case CUDAFunctionTarget::Global:
case CUDAFunctionTarget::Device:
IsDeviceKernelCall = true;
break;
case CUDAFunctionTarget::HostDevice:
if (getLangOpts().CUDAIsDevice) {
IsDeviceKernelCall = true;
if (FunctionDecl *Caller =
SemaRef.getCurFunctionDecl(/*AllowLambda=*/true);
Caller && isImplicitHostDeviceFunction(Caller)) {
// In device compilation, a config call inside an HD function should be
// treated as a device kernel call; but for implicit HD functions (such
// as lambdas), we need to check whether RDC is enabled.
if (!getLangOpts().GPURelocatableDeviceCode)
IsDeviceKernelCall = false;
// HIP doesn't support device-side kernel calls yet, so still treat this
// as a host-side kernel call.
if (getLangOpts().HIP)
IsDeviceKernelCall = false;
}
}
break;
default:
break;
}

if (IsDeviceKernelCall && getLangOpts().HIP)
return ExprError(
Diag(LLLLoc, diag::err_cuda_device_kernel_launch_not_supported));

if (IsDeviceKernelCall && !getLangOpts().GPURelocatableDeviceCode)
return ExprError(
Diag(LLLLoc, diag::err_cuda_device_kernel_launch_require_rdc));

FunctionDecl *ConfigDecl = IsDeviceKernelCall
? getASTContext().getcudaLaunchDeviceDecl()
: getASTContext().getcudaConfigureCallDecl();
if (!ConfigDecl)
return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
<< getConfigureFuncName());
<< (IsDeviceKernelCall ? getLaunchDeviceFuncName()
: getConfigureFuncName()));
// For a device kernel call, additionally require that the parameter
// buffer allocation function is declared as well.
if (IsDeviceKernelCall) {
auto *GetParamBuf = getASTContext().getcudaGetParameterBufferDecl();
if (!GetParamBuf)
return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
<< getGetParameterBufferFuncName());
}

QualType ConfigQTy = ConfigDecl->getType();

DeclRefExpr *ConfigDR = new (getASTContext()) DeclRefExpr(
getASTContext(), ConfigDecl, false, ConfigQTy, VK_LValue, LLLLoc);
SemaRef.MarkFunctionReferenced(LLLLoc, ConfigDecl);

if (IsDeviceKernelCall) {
SmallVector<Expr *> Args;
// Use a zero placeholder for the kernel function, which may not be
// resolvable here; e.g., overload resolution of the kernel may require
// the actual kernel arguments.
llvm::APInt Zero(SemaRef.Context.getTypeSize(SemaRef.Context.IntTy), 0);
Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
SemaRef.Context.IntTy, LLLLoc));
// Use a zero placeholder for the parameter buffer; it is replaced with
// the actual allocation later, in codegen.
Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
SemaRef.Context.IntTy, LLLLoc));
// Add the original config arguments.
llvm::append_range(Args, ExecConfig);
// Add the default blockDim if it's missing.
if (Args.size() < 4) {
llvm::APInt One(SemaRef.Context.getTypeSize(SemaRef.Context.IntTy), 1);
Args.push_back(IntegerLiteral::Create(SemaRef.Context, One,
SemaRef.Context.IntTy, LLLLoc));
}
// Add the default sharedMemSize if it's missing.
if (Args.size() < 5)
Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
SemaRef.Context.IntTy, LLLLoc));
// Add the default stream if it's missing.
if (Args.size() < 6)
Args.push_back(new (SemaRef.Context) CXXNullPtrLiteralExpr(
SemaRef.Context.NullPtrTy, LLLLoc));
return SemaRef.BuildCallExpr(S, ConfigDR, LLLLoc, Args, GGGLoc, nullptr,
/*IsExecConfig=*/true);
}
return SemaRef.BuildCallExpr(S, ConfigDR, LLLLoc, ExecConfig, GGGLoc, nullptr,
/*IsExecConfig=*/true);
}
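
For example — a sketch of what Sema builds here: a device-side `child<<<Grid>>>()` that supplies only a grid dimension is completed with a blockDim of 1, a sharedMemSize of 0, and a nullptr stream, plus two leading zero placeholders that codegen later replaces with the kernel pointer and the parameter buffer.

    // Written in device code (assumed call site):
    //   child<<<Grid>>>();
    // Config call built by Sema, conceptually:
    //   cudaLaunchDevice(/*func=*/0, /*parameterBuffer=*/0,
    //                    Grid, /*blockDim=*/1,
    //                    /*sharedMemSize=*/0, /*stream=*/nullptr);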
@@ -246,12 +324,12 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
CalleeTarget == CUDAFunctionTarget::InvalidTarget)
return CFP_Never;

// (a) Can't call global from some contexts until we support CUDA's
// dynamic parallelism.
// (a) Calling a global function from a global or device context is
// allowed as part of CUDA's dynamic parallelism support.
if (CalleeTarget == CUDAFunctionTarget::Global &&
(CallerTarget == CUDAFunctionTarget::Global ||
CallerTarget == CUDAFunctionTarget::Device))
return CFP_Never;
return CFP_Native;

// (b) Calling HostDevice is OK for everyone.
if (CalleeTarget == CUDAFunctionTarget::HostDevice)
@@ -279,7 +357,8 @@
if (CallerTarget == CUDAFunctionTarget::HostDevice) {
// It's OK to call a compilation-mode matching function from an HD one.
if ((getLangOpts().CUDAIsDevice &&
CalleeTarget == CUDAFunctionTarget::Device) ||
(CalleeTarget == CUDAFunctionTarget::Device ||
CalleeTarget == CUDAFunctionTarget::Global)) ||
(!getLangOpts().CUDAIsDevice &&
(CalleeTarget == CUDAFunctionTarget::Host ||
CalleeTarget == CUDAFunctionTarget::Global)))
@@ -1103,6 +1182,14 @@ std::string SemaCUDA::getConfigureFuncName() const {
return "cudaConfigureCall";
}

std::string SemaCUDA::getGetParameterBufferFuncName() const {
return "cudaGetParameterBuffer";
}

std::string SemaCUDA::getLaunchDeviceFuncName() const {
return "cudaLaunchDevice";
}
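
In the CUDA device runtime, these two names correspond to declarations of roughly the following shape (as documented for dynamic parallelism; exact qualifiers vary by CUDA version):

    extern "C" __device__ void *cudaGetParameterBuffer(size_t alignment,
                                                       size_t size);
    extern "C" __device__ cudaError_t
    cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDim,
                     dim3 blockDim, unsigned int sharedMemSize,
                     cudaStream_t stream);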

// Record any local constexpr variables that are passed one way on the host
// and another on the device.
void SemaCUDA::recordPotentialODRUsedVariable(
32 changes: 24 additions & 8 deletions clang/lib/Sema/SemaDecl.cpp
@@ -11050,14 +11050,30 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
}

if (getLangOpts().CUDA) {
IdentifierInfo *II = NewFD->getIdentifier();
if (II && II->isStr(CUDA().getConfigureFuncName()) &&
!NewFD->isInvalidDecl() &&
NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
Diag(NewFD->getLocation(), diag::err_config_scalar_return)
<< CUDA().getConfigureFuncName();
Context.setcudaConfigureCallDecl(NewFD);
if (IdentifierInfo *II = NewFD->getIdentifier()) {
if (II->isStr(CUDA().getConfigureFuncName()) && !NewFD->isInvalidDecl() &&
NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
Diag(NewFD->getLocation(), diag::err_config_scalar_return)
<< CUDA().getConfigureFuncName();
Context.setcudaConfigureCallDecl(NewFD);
}
if (II->isStr(CUDA().getGetParameterBufferFuncName()) &&
!NewFD->isInvalidDecl() &&
NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
if (!R->castAs<FunctionType>()->getReturnType()->isPointerType())
Diag(NewFD->getLocation(), diag::err_config_pointer_return)
<< CUDA().getGetParameterBufferFuncName();
Context.setcudaGetParameterBufferDecl(NewFD);
}
if (II->isStr(CUDA().getLaunchDeviceFuncName()) &&
!NewFD->isInvalidDecl() &&
NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
Diag(NewFD->getLocation(), diag::err_config_scalar_return)
<< CUDA().getLaunchDeviceFuncName();
Context.setcudaLaunchDeviceDecl(NewFD);
}
}
}
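
For instance — a sketch — a translation-unit-scope declaration of cudaGetParameterBuffer with a non-pointer return type would now be diagnosed:

    __device__ int cudaGetParameterBuffer(size_t, size_t);
    // error: CUDA special function 'cudaGetParameterBuffer' must have
    // pointer return type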
