[CUDA] Add device-side kernel launch support #165519
Conversation
@llvm/pr-subscribers-clang-driver @llvm/pr-subscribers-clang
Author: None (darkbuck)
Changes
Patch is 33.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165519.diff 20 Files Affected:
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 33aa2d343aa7a..f64e29be3205f 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -488,6 +488,10 @@ class ASTContext : public RefCountedBase<ASTContext> {
/// Declaration for the CUDA cudaConfigureCall function.
FunctionDecl *cudaConfigureCallDecl = nullptr;
+ /// Declaration for the CUDA cudaGetParameterBuffer function.
+ FunctionDecl *cudaGetParameterBufferDecl = nullptr;
+ /// Declaration for the CUDA cudaLaunchDevice function.
+ FunctionDecl *cudaLaunchDeviceDecl = nullptr;
/// Keeps track of all declaration attributes.
///
@@ -1641,6 +1645,18 @@ class ASTContext : public RefCountedBase<ASTContext> {
return cudaConfigureCallDecl;
}
+ void setcudaGetParameterBufferDecl(FunctionDecl *FD) {
+ cudaGetParameterBufferDecl = FD;
+ }
+
+ FunctionDecl *getcudaGetParameterBufferDecl() {
+ return cudaGetParameterBufferDecl;
+ }
+
+ void setcudaLaunchDeviceDecl(FunctionDecl *FD) { cudaLaunchDeviceDecl = FD; }
+
+ FunctionDecl *getcudaLaunchDeviceDecl() { return cudaLaunchDeviceDecl; }
+
/// Returns true iff we need copy/dispose helpers for the given type.
bool BlockRequiresCopying(QualType Ty, const VarDecl *D);
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 4e369be0bbb92..5e010cb52954d 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9499,6 +9499,8 @@ def err_kern_is_nonstatic_method : Error<
"kernel function %0 must be a free function or static member function">;
def err_config_scalar_return : Error<
"CUDA special function '%0' must have scalar return type">;
+def err_config_pointer_return
+ : Error<"CUDA special function '%0' must have pointer return type">;
def err_kern_call_not_global_function : Error<
"kernel call to non-global function %0">;
def err_global_call_not_config : Error<
@@ -13690,4 +13692,8 @@ def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">;
def note_amdgcn_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">;
def err_amdgcn_coop_atomic_invalid_as : Error<"cooperative atomic requires a global or generic pointer">;
+
+def err_cuda_device_kernel_launch_require_rdc
+ : Error<"kernel launch from __device__ or __global__ function requires "
+ "relocatable device code, also known as separate compilation mode">;
} // end of sema component.
diff --git a/clang/include/clang/Sema/SemaCUDA.h b/clang/include/clang/Sema/SemaCUDA.h
index dbc1432860d89..dbb4290f5d149 100644
--- a/clang/include/clang/Sema/SemaCUDA.h
+++ b/clang/include/clang/Sema/SemaCUDA.h
@@ -273,6 +273,11 @@ class SemaCUDA : public SemaBase {
/// of the function that will be called to configure kernel call, with the
/// parameters specified via <<<>>>.
std::string getConfigureFuncName() const;
+ /// Return the name of the parameter buffer allocation function for the
+ /// device kernel launch.
+ std::string getGetParameterBufferFuncName() const;
+ /// Return the name of the device kernel launch function.
+ std::string getLaunchDeviceFuncName() const;
/// Record variables that are potentially ODR-used in CUDA/HIP.
void recordPotentialODRUsedVariable(MultiExprArg Args,
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index af856a8097ab1..a65f7fd2d1d43 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -1013,7 +1013,7 @@ class ASTReader
///
/// The AST context tracks a few important decls, currently cudaConfigureCall,
/// directly.
- SmallVector<GlobalDeclID, 2> CUDASpecialDeclRefs;
+ SmallVector<GlobalDeclID, 4> CUDASpecialDeclRefs;
/// The floating point pragma option settings.
SmallVector<uint64_t, 1> FPPragmaOptions;
diff --git a/clang/lib/CodeGen/CGCUDARuntime.cpp b/clang/lib/CodeGen/CGCUDARuntime.cpp
index 121a481213396..cd1476ebd6754 100644
--- a/clang/lib/CodeGen/CGCUDARuntime.cpp
+++ b/clang/lib/CodeGen/CGCUDARuntime.cpp
@@ -22,6 +22,116 @@ using namespace CodeGen;
CGCUDARuntime::~CGCUDARuntime() {}
+static llvm::Value *emitGetParamBuf(CodeGenFunction &CGF,
+ const CUDAKernelCallExpr *E) {
+ auto *GetParamBuf = CGF.getContext().getcudaGetParameterBufferDecl();
+ const FunctionProtoType *GetParamBufProto =
+ GetParamBuf->getType()->getAs<FunctionProtoType>();
+
+ DeclRefExpr *DRE = DeclRefExpr::Create(
+ CGF.getContext(), {}, {}, GetParamBuf,
+ /*RefersToEnclosingVariableOrCapture=*/false, GetParamBuf->getNameInfo(),
+ GetParamBuf->getType(), VK_PRValue);
+ auto *ImpCast = ImplicitCastExpr::Create(
+ CGF.getContext(), CGF.getContext().getPointerType(GetParamBuf->getType()),
+ CK_FunctionToPointerDecay, DRE, nullptr, VK_PRValue, FPOptionsOverride());
+
+ CGCallee Callee = CGF.EmitCallee(ImpCast);
+ CallArgList Args;
+ // Use 64B alignment.
+ Args.add(RValue::get(CGF.CGM.getSize(CharUnits::fromQuantity(64))),
+ CGF.getContext().getSizeType());
+ // Calculate parameter sizes.
+ const PointerType *PT = E->getCallee()->getType()->getAs<PointerType>();
+ const FunctionProtoType *FTP =
+ PT->getPointeeType()->getAs<FunctionProtoType>();
+ CharUnits Offset = CharUnits::Zero();
+ for (auto ArgTy : FTP->getParamTypes()) {
+ auto TInfo = CGF.CGM.getContext().getTypeInfoInChars(ArgTy);
+ Offset = Offset.alignTo(TInfo.Align);
+ Offset += TInfo.Width;
+ }
+ Args.add(RValue::get(CGF.CGM.getSize(Offset)),
+ CGF.getContext().getSizeType());
+ const CGFunctionInfo &CallInfo = CGF.CGM.getTypes().arrangeFreeFunctionCall(
+ Args, GetParamBufProto, /*ChainCall=*/false);
+ auto Ret = CGF.EmitCall(CallInfo, Callee, /*ReturnValue=*/{}, Args);
+
+ return Ret.getScalarVal();
+}
+
+RValue CGCUDARuntime::EmitCUDADeviceKernelCallExpr(
+ CodeGenFunction &CGF, const CUDAKernelCallExpr *E,
+ ReturnValueSlot ReturnValue, llvm::CallBase **CallOrInvoke) {
+ ASTContext &Ctx = CGM.getContext();
+ assert(Ctx.getcudaLaunchDeviceDecl() == E->getConfig()->getDirectCallee());
+
+ llvm::BasicBlock *ConfigOKBlock = CGF.createBasicBlock("dkcall.configok");
+ llvm::BasicBlock *ContBlock = CGF.createBasicBlock("dkcall.end");
+
+ llvm::Value *Config = emitGetParamBuf(CGF, E);
+ CGF.Builder.CreateCondBr(
+ CGF.Builder.CreateICmpNE(Config,
+ llvm::Constant::getNullValue(Config->getType())),
+ ConfigOKBlock, ContBlock);
+
+ CodeGenFunction::ConditionalEvaluation eval(CGF);
+
+ eval.begin(CGF);
+ CGF.EmitBlock(ConfigOKBlock);
+
+ QualType KernelCalleeFuncTy =
+ E->getCallee()->getType()->getAs<PointerType>()->getPointeeType();
+ CGCallee KernelCallee = CGF.EmitCallee(E->getCallee());
+ // Emit kernel arguments.
+ CallArgList KernelCallArgs;
+ CGF.EmitCallArgs(
+ KernelCallArgs,
+ dyn_cast<FunctionProtoType>(KernelCalleeFuncTy->castAs<FunctionType>()),
+ E->arguments(), E->getDirectCallee());
+ // Copy emitted kernel arguments into that parameter buffer.
+ RawAddress CfgBase(Config, CGM.Int8Ty,
+ /*Alignment=*/CharUnits::fromQuantity(64));
+ CharUnits Offset = CharUnits::Zero();
+ for (auto &Arg : KernelCallArgs) {
+ auto TInfo = CGM.getContext().getTypeInfoInChars(Arg.getType());
+ Offset = Offset.alignTo(TInfo.Align);
+ Address Addr =
+ CGF.Builder.CreateConstInBoundsGEP(CfgBase, Offset.getQuantity());
+ Arg.copyInto(CGF, Addr);
+ Offset += TInfo.Width;
+ }
+ // Make `cudaLaunchDevice` call, i.e. E->getConfig().
+ const CallExpr *LaunchCall = E->getConfig();
+ QualType LaunchCalleeFuncTy = LaunchCall->getCallee()
+ ->getType()
+ ->getAs<PointerType>()
+ ->getPointeeType();
+ CGCallee LaunchCallee = CGF.EmitCallee(LaunchCall->getCallee());
+ CallArgList LaunchCallArgs;
+ CGF.EmitCallArgs(
+ LaunchCallArgs,
+ dyn_cast<FunctionProtoType>(LaunchCalleeFuncTy->castAs<FunctionType>()),
+ LaunchCall->arguments(), LaunchCall->getDirectCallee());
+ // Replace func and paramterbuffer arguments.
+ LaunchCallArgs[0] = CallArg(RValue::get(KernelCallee.getFunctionPointer()),
+ CGM.getContext().VoidPtrTy);
+ LaunchCallArgs[1] = CallArg(RValue::get(Config), CGM.getContext().VoidPtrTy);
+ const CGFunctionInfo &LaunchCallInfo = CGM.getTypes().arrangeFreeFunctionCall(
+ LaunchCallArgs,
+ dyn_cast<FunctionProtoType>(LaunchCalleeFuncTy->castAs<FunctionType>()),
+ /*ChainCall=*/false);
+ CGF.EmitCall(LaunchCallInfo, LaunchCallee, ReturnValue, LaunchCallArgs,
+ CallOrInvoke,
+ /*IsMustTail=*/false, E->getExprLoc());
+ CGF.EmitBranch(ContBlock);
+
+ CGF.EmitBlock(ContBlock);
+ eval.end(CGF);
+
+ return RValue::get(nullptr);
+}
+
RValue CGCUDARuntime::EmitCUDAKernelCallExpr(CodeGenFunction &CGF,
const CUDAKernelCallExpr *E,
ReturnValueSlot ReturnValue,
diff --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h
index 86f776004ee7c..64fb9a31422e0 100644
--- a/clang/lib/CodeGen/CGCUDARuntime.h
+++ b/clang/lib/CodeGen/CGCUDARuntime.h
@@ -88,6 +88,10 @@ class CGCUDARuntime {
ReturnValueSlot ReturnValue,
llvm::CallBase **CallOrInvoke = nullptr);
+ virtual RValue EmitCUDADeviceKernelCallExpr(
+ CodeGenFunction &CGF, const CUDAKernelCallExpr *E,
+ ReturnValueSlot ReturnValue, llvm::CallBase **CallOrInvoke = nullptr);
+
/// Emits a kernel launch stub.
virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0;
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index 14d8db32bafc6..0c01933790100 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -503,6 +503,12 @@ RValue CodeGenFunction::EmitCXXOperatorMemberCallExpr(
RValue CodeGenFunction::EmitCUDAKernelCallExpr(const CUDAKernelCallExpr *E,
ReturnValueSlot ReturnValue,
llvm::CallBase **CallOrInvoke) {
+ auto *FD = E->getConfig()->getDirectCallee();
+ // Emit as a device kernel call if the config is prepared using
+ // 'cudaGetParameterBuffer'.
+ if (FD && CGM.getContext().getcudaLaunchDeviceDecl() == FD)
+ return CGM.getCUDARuntime().EmitCUDADeviceKernelCallExpr(
+ *this, E, ReturnValue, CallOrInvoke);
return CGM.getCUDARuntime().EmitCUDAKernelCallExpr(*this, E, ReturnValue,
CallOrInvoke);
}
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index 31735a0f5feb3..a60a32dcb9e4c 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -52,16 +52,85 @@ bool SemaCUDA::PopForceHostDevice() {
ExprResult SemaCUDA::ActOnExecConfigExpr(Scope *S, SourceLocation LLLLoc,
MultiExprArg ExecConfig,
SourceLocation GGGLoc) {
- FunctionDecl *ConfigDecl = getASTContext().getcudaConfigureCallDecl();
+ bool IsDeviceKernelCall = false;
+ switch (CurrentTarget()) {
+ case CUDAFunctionTarget::Global:
+ case CUDAFunctionTarget::Device:
+ IsDeviceKernelCall = true;
+ break;
+ case CUDAFunctionTarget::HostDevice:
+ if (getLangOpts().CUDAIsDevice) {
+ // Under the device compilation, config call under an HD function should
+ // be treated as a device kernel call. But, for implicit HD ones (such as
+ // lambdas), need to check whether RDC is enabled or not.
+ IsDeviceKernelCall = true;
+ if (!getLangOpts().GPURelocatableDeviceCode) {
+ FunctionDecl *Caller = SemaRef.getCurFunctionDecl(/*AllowLambda=*/true);
+ if (Caller && isImplicitHostDeviceFunction(Caller))
+ IsDeviceKernelCall = false;
+ }
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (IsDeviceKernelCall && !getLangOpts().GPURelocatableDeviceCode)
+ return ExprError(
+ Diag(LLLLoc, diag::err_cuda_device_kernel_launch_require_rdc));
+
+ FunctionDecl *ConfigDecl = IsDeviceKernelCall
+ ? getASTContext().getcudaLaunchDeviceDecl()
+ : getASTContext().getcudaConfigureCallDecl();
if (!ConfigDecl)
return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
- << getConfigureFuncName());
+ << (IsDeviceKernelCall ? getLaunchDeviceFuncName()
+ : getConfigureFuncName()));
+ // Additional check on the launch function if it's a device kernel call.
+ if (IsDeviceKernelCall) {
+ auto *GetParamBuf = getASTContext().getcudaGetParameterBufferDecl();
+ if (!GetParamBuf)
+ return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
+ << getGetParameterBufferFuncName());
+ }
+
QualType ConfigQTy = ConfigDecl->getType();
DeclRefExpr *ConfigDR = new (getASTContext()) DeclRefExpr(
getASTContext(), ConfigDecl, false, ConfigQTy, VK_LValue, LLLLoc);
SemaRef.MarkFunctionReferenced(LLLLoc, ConfigDecl);
+ if (IsDeviceKernelCall) {
+ SmallVector<Expr *> Args;
+ // Use a null pointer as the kernel function, which may not be resolvable
+ // here. For example, resolving that kernel function may need additional
+ // kernel arguments.
+ llvm::APInt Zero(SemaRef.Context.getTypeSize(SemaRef.Context.IntTy), 0);
+ Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+ SemaRef.Context.IntTy, LLLLoc));
+ // Use a null pointer as the parameter buffer, which should be allocated in
+ // the codegen.
+ Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+ SemaRef.Context.IntTy, LLLLoc));
+ // Add the original config arguments.
+ llvm::append_range(Args, ExecConfig);
+ // Add the default blockDim if it's missing.
+ if (Args.size() < 4) {
+ llvm::APInt One(SemaRef.Context.getTypeSize(SemaRef.Context.IntTy), 1);
+ Args.push_back(IntegerLiteral::Create(SemaRef.Context, One,
+ SemaRef.Context.IntTy, LLLLoc));
+ }
+ // Add the default sharedMemSize if it's missing.
+ if (Args.size() < 5)
+ Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+ SemaRef.Context.IntTy, LLLLoc));
+ // Add the default stream if it's missing.
+ if (Args.size() < 6)
+ Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+ SemaRef.Context.IntTy, LLLLoc));
+ return SemaRef.BuildCallExpr(S, ConfigDR, LLLLoc, Args, GGGLoc, nullptr,
+ /*IsExecConfig=*/true);
+ }
return SemaRef.BuildCallExpr(S, ConfigDR, LLLLoc, ExecConfig, GGGLoc, nullptr,
/*IsExecConfig=*/true);
}
@@ -251,7 +320,7 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
if (CalleeTarget == CUDAFunctionTarget::Global &&
(CallerTarget == CUDAFunctionTarget::Global ||
CallerTarget == CUDAFunctionTarget::Device))
- return CFP_Never;
+ return CFP_Native;
// (b) Calling HostDevice is OK for everyone.
if (CalleeTarget == CUDAFunctionTarget::HostDevice)
@@ -279,7 +348,8 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
if (CallerTarget == CUDAFunctionTarget::HostDevice) {
// It's OK to call a compilation-mode matching function from an HD one.
if ((getLangOpts().CUDAIsDevice &&
- CalleeTarget == CUDAFunctionTarget::Device) ||
+ (CalleeTarget == CUDAFunctionTarget::Device ||
+ CalleeTarget == CUDAFunctionTarget::Global)) ||
(!getLangOpts().CUDAIsDevice &&
(CalleeTarget == CUDAFunctionTarget::Host ||
CalleeTarget == CUDAFunctionTarget::Global)))
@@ -1103,6 +1173,18 @@ std::string SemaCUDA::getConfigureFuncName() const {
return "cudaConfigureCall";
}
+std::string SemaCUDA::getGetParameterBufferFuncName() const {
+ // FIXME: Use the API from CUDA programming guide. Add V2 support when
+ // necessary.
+ return "cudaGetParameterBuffer";
+}
+
+std::string SemaCUDA::getLaunchDeviceFuncName() const {
+ // FIXME: Use the API from CUDA programming guide. Add V2 support when
+ // necessary.
+ return "cudaLaunchDevice";
+}
+
// Record any local constexpr variables that are passed one way on the host
// and another on the device.
void SemaCUDA::recordPotentialODRUsedVariable(
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index fc3aabf5741ca..1e39bfb5e42cd 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -11050,14 +11050,30 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
}
if (getLangOpts().CUDA) {
- IdentifierInfo *II = NewFD->getIdentifier();
- if (II && II->isStr(CUDA().getConfigureFuncName()) &&
- !NewFD->isInvalidDecl() &&
- NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
- if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
- Diag(NewFD->getLocation(), diag::err_config_scalar_return)
- << CUDA().getConfigureFuncName();
- Context.setcudaConfigureCallDecl(NewFD);
+ if (IdentifierInfo *II = NewFD->getIdentifier()) {
+ if (II->isStr(CUDA().getConfigureFuncName()) && !NewFD->isInvalidDecl() &&
+ NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
+ if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
+ Diag(NewFD->getLocation(), diag::err_config_scalar_return)
+ << CUDA().getConfigureFuncName();
+ Context.setcudaConfigureCallDecl(NewFD);
+ }
+ if (II->isStr(CUDA().getGetParameterBufferFuncName()) &&
+ !NewFD->isInvalidDecl() &&
+ NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
+ if (!R->castAs<FunctionType>()->getReturnType()->isPointerType())
+ Diag(NewFD->getLocation(), diag::err_config_pointer_return)
+ << CUDA().getConfigureFuncName();
+ Context.setcudaGetParameterBufferDecl(NewFD);
+ }
+ if (II->isStr(CUDA().getLaunchDeviceFuncName()) &&
+ !NewFD->isInvalidDecl() &&
+ NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
+ if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
+ Diag(NewFD->getLocation(), diag::err_config_scalar_return)
+ << CUDA().getConfigureFuncName();
+ Context.setcudaLaunchDeviceDecl(NewFD);
+ }
}
}
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index c1b5cb730e4a4..e415d5816ab01 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -5588,9 +5588,13 @@ void ASTReader::InitializeContext() {
// If there were any CUDA special declarations, deserialize them.
if (!CUDASpecialDeclRefs.empty()) {
- assert(CUDASpecialDeclRefs.size() == 1 && "More decl refs than expected!");
+ assert(CUDASpecialDeclRefs.size() == 3 && "More decl refs than expected!");
Context.setcudaConfigureCallDecl(
- cast<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[0])));
+ cast_or_null<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[0])));
+ Context.setcudaGetParameterBufferDecl(
+ cast_or_null<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[1])));
+ Context.setcudaLaunchDeviceDecl(
+ cast_or_null<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[2])));
}
// Re-export any modules that were imported by a non-module AST file.
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 377e3966874f3..8e527db972fb0 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -5714,8 +5714,13 @@ void ASTWriter::PrepareWritingSpecialDecls(Sema &Sem...
[truncated]
@llvm/pr-subscribers-clang-modules
I should mention that the generated code sequence differs from the one CUDA 12.9 generates. The latter uses
rebase
ping for review
LGTM on HIP side. |
@Artem-B ping for review |
// Skip '-lcudadevrt'.
if (Arg->getOption().matches(OPT_library) &&
    StringRef(Arg->getValue()) == "cudadevrt")
Can you explain this in more detail? Normally the way I handle device libraries is to just pass them to the linker via -Xoffload-linker -lfoo or similar. Why are we breaking the convention here while also not doing -mlink-builtin-bitcode like we do with the libdevice?
cudadevrt is different from libdevice. The latter is an NVVM/LLVM bitcode library, but the former is a CUDA fatbin archive (/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a for the default installation), which contains only stubs of those two device functions (in PTX and SASS) for device-side kernel launch. We need to pass -lcudadevrt to nvlink. AFAIK, there's no option to forward that option, as well as -L <path>, from clang-nvlink-wrapper to nvlink. Do we need an additional option to enable that forwarding from clang-nvlink-wrapper to nvlink?
I agree that -Xoffload-linker makes the change to clang-linker-wrapper unnecessary.
I don't know how CUDA handles these fat binaries exactly, I'm assuming nvlink does some magic there. Any option passed to the linker will be forwarded to nvlink. I suppose one problem might be the fact that we do LTO and stuff so it might get confused if binaries for another target are in there? Is it really not working if you just pass -lcudadevrt to the wrapper?
Yes, you may try [email protected]:darkbuck/mandelbrot-dyn.git, where I modified the makefile to compile that CDP example using clang-cuda. It works and performs the same as the one compiled with nvcc. (That repo was just updated against the latest change in this PR.)
For clang-nvlink-wrapper, we skip all inputs and libraries when passing options to nvlink. See

if (Arg->getOption().matches(OPT_INPUT) ||
Right, that's because we had to modify those to do the .cubin renaming and LTO handling. I think the proper solution is to add a fork for assuming non-NVPTX objects are fat binaries and to just forward them without the extra handling.
Oh right I think I also pull apart static libraries because nvlink does this fatbinary handling for those. You'll probably just need to detect this with some binary magic and add it to the input list we build.
Just add the fatbin check in clang-nvlink-wrapper to directly forward archives with fat binaries.
if (hasFatBinary(**LibFile)) {
  ForwardArgs.push_back(Arg);
  break;
The return value from this function is a list of files that will be passed to nvlink. Why do we need an extra container?
The return value is a list of extracted files. I don't want to mix '-lcudadevrt' with them and confuse 'nvlink'. In addition, I also want to pass the same form, '-lcudadevrt', that 'nvcc' does, to avoid any potential issue with 'nvlink'.
I'd be very concerned if nvlink had different behavior there. We should do the sensible thing first and then fix it if there's actually an issue, or if you notice that nvlink can't handle .a files passed directly. The handling here is supposed to use the normal linker search behavior for identifying libraries with -l.
Yes, we locate that archive file (from the specified -lxxx option) using the normal linker search behavior and find fat binaries. After that, we pass the same -lxxx option as well as the other -L options to ensure the same linker search behavior from 'nvlink'. Assuming that, we could prepare that archive in the same way as nvcc to minimize the potential risk.
Verified that passing *.a files directly works as well. The latest revision simplifies the code and passes that archive file in the returned file list.
// For host object files, search for the fat binary section.
bool FoundFatBinSection = false;
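For context, a check like the one quoted above could scan a host object's sections for an NVIDIA fat binary payload. The following is only a minimal sketch using the llvm::object API; the helper name and the section names (.nv_fatbin, __nv_relfatbin) are assumptions for illustration, not the PR's actual code.

#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Error.h"

// Sketch only: report whether a host object file carries a CUDA fat binary
// by looking for a known fatbin section name.
static bool looksLikeFatBinary(const llvm::object::ObjectFile &Obj) {
  for (const llvm::object::SectionRef &Sec : Obj.sections()) {
    llvm::Expected<llvm::StringRef> Name = Sec.getName();
    if (!Name) {
      llvm::consumeError(Name.takeError());
      continue;
    }
    if (*Name == ".nv_fatbin" || *Name == "__nv_relfatbin")
      return true;
  }
  return false;
}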
Honestly not sure we need to bother here, just assume it's a fat-binary if it's not strictly NVPTX. If there's an issue nvlink will either ignore or error.
Sure would be nice if I didn't need to do my own static library resolution and could instead just pass all of these to nvlink.
Mixing objects from nvcc and clang is another headache, even though it is very, very rare.
What situation would lead that to happen here? The only possible way for this to make it to the nvlink-wrapper stage is if someone passed it via -Xoffload-linker or invoked it directly.
If we remove the fat binary check and instead check whether it's a device object, at least one regression test (test/Driver/nvlink-wrapper.c) fails.
return LibFile.takeError();
// Skip extracting archives with fat binaries. Forward them to nvlink.
if (hasFatBinary(**LibFile)) {
  ForwardArchives.emplace_back(Args.MakeArgString(*Filename));
Can't we just emplace into the Files list? That'd be better because order actually does matter with archives so it'd be best to preserve it.
In the latest revision, the input order is preserved. However, as Files may later be populated from object files extracted from archives, emplacing them into Files cannot preserve the input order.
Hm, it's probably not worth mixing handling for these CUDA fat binary archives. The tough part is that we kind of just want to forward these without modification while the rest of the handling is supposed to do the normal linker stuff that nvlink doesn't do. I'd assume all we'd need is something like this. Do we need additional handling?
// Just let nvlink handle these directly.
if (hasFatBinary(Archive))
  Files.emplace_back(Archive);
That will put those archives before all other inputs. I noticed one behavior of nvlink is that -lcudadevrt <input> works, but <path>/libcudadevrt.a <input> fails.
If we don't want the complex processing to preserve archive order, it's more natural to put all the forwarded archives at the end. What are your thoughts?
That's GNU bfd handling; static archives only extract undefined symbols, and there aren't any undefined symbols since the files are processed in order. I'm surprised that this doesn't preserve the order, since I thought I wrote the handling here to intentionally expand out the static archives in the proper order.
To preserve the order of the forwarded archives relative to other inputs, we need to place the forward placeholders and then assemble them.
if (Obj.getArch() == Triple::nvptx || Obj.getArch() == Triple::nvptx64)
  return false;
Still think it might just be easier to return true in this case, nvlink will likely error if anything is really wrong so we can just defer.
If the code is changed to

if (Obj.getArch() != Triple::nvptx && Obj.getArch() != Triple::nvptx64)
  return true;

and the following check for fatbin sections is removed, one regression test (test/Driver/nvlink-wrapper.c) fails.
Any idea what's wrong with that failure?
Oh right, that's listed in the comment.
Creating a CUDA binary requires access to the ptxas executable, so we just use x64. So I was just spoofing the input there since it doesn't actually get run. You can probably work around it. Worst case, you just ignore this in the dry-run mode.
Added a test-only option to assume device archives for that test, which is also enhanced to test the changes in this PR.
- CUDA's dynamic parallelism extension allows device-side kernel
launches, which share the same syntax as host-side launches,
e.g.,
kernel<<<Dg, Db, Ns, S>>>(arguments);
but differ in code generation. A device-side kernel launch
is eventually translated into the following sequence
config = cudaGetParameterBuffer(alignment, size);
// setup arguments by copying them into `config`.
cudaLaunchDevice(func, config, Dg, Db, Ns, S);
- To support the device-side kernel launch, 'CUDAKernelCallExpr' is
reused but its config expr is set to a call to 'cudaLaunchDevice'.
During code generation, 'CUDAKernelCallExpr' is expanded into the
aforementioned sequence.
- As the device-side kernel launch requires the code to be compiled as
relocatable device code and linked with '-lcudadevrt',
'clang-nvlink-wrapper' is modified to forward archives with fat
binaries directly.
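As a rough illustration of the argument copy mentioned above: each kernel argument is placed at the next offset aligned to that argument's alignment, and the buffer itself is requested with 64-byte alignment. The helper below is only a sketch of that size/offset rule under those assumptions, not the patch's actual code.

#include <cstddef>
#include <utility>
#include <vector>

// Sketch only: compute the parameter buffer size for a list of
// (size, alignment) pairs, mirroring the layout rule described above.
static std::size_t paramBufferSize(
    const std::vector<std::pair<std::size_t, std::size_t>> &Args) {
  std::size_t Offset = 0;
  for (const auto &A : Args) {
    std::size_t Size = A.first, Align = A.second;
    Offset = (Offset + Align - 1) & ~(Align - 1); // align up to the argument
    Offset += Size;                               // then append its bytes
  }
  return Offset; // passed as 'size' to cudaGetParameterBuffer(64, size)
}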
CUDA's dynamic parallelism extension allows device-side kernel launches, which share the same syntax as host-side launches, e.g.,
kernel<<<Dg, Db, Ns, S>>>(arguments);
but differ in code generation. A device-side kernel launch is eventually translated into the following sequence
config = cudaGetParameterBuffer(alignment, size);
// setup arguments by copying them into `config`.
cudaLaunchDevice(func, config, Dg, Db, Ns, S);
To support the device-side kernel launch, 'CUDAKernelCallExpr' is reused but its config expr is set to a call to 'cudaLaunchDevice'. During code generation, 'CUDAKernelCallExpr' is expanded into the aforementioned sequence.
As the device-side kernel launch requires the source to be compiled as relocatable device code and linked with '-lcudadevrt', linkers are changed to pass relevant link options to 'nvlink'.
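For reference, a minimal CUDA dynamic-parallelism example of the kind this patch enables is sketched below; the kernel names and launch geometry are made up for illustration, and building it requires relocatable device code (e.g. -fgpu-rdc) and linking against cudadevrt.

// Sketch only: a parent kernel launching a child kernel from device code.
__global__ void child(int *data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] *= 2;
}

__global__ void parent(int *data, int n) {
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    // Same <<<>>> syntax as a host-side launch; with this patch, clang
    // lowers it to cudaGetParameterBuffer + cudaLaunchDevice.
    child<<<(n + 255) / 256, 256>>>(data, n);
  }
}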