Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
d065850
[Offload][CUDA] Allow CUDA kernels to use LLVM/Offload
jdoerfert Jun 4, 2024
7104455
[Offload][CUDA] Add initial cuda_runtime.h overlay
jdoerfert Jun 8, 2024
4340516
[Offload] Introduce the concept of "default streams"
jdoerfert Jun 12, 2024
930970a
[WIP] Playing with ideas
jdoerfert Jun 15, 2024
628bb35
Trying
jdoerfert Jun 18, 2024
3a147be
Rewrite
jdoerfert Jun 19, 2024
a006510
Restructuring
jdoerfert Jun 22, 2024
192d35d
Add liftetime handling
jdoerfert Jun 26, 2024
6af5502
Trying to add offload sanitizer to clang
jdoerfert Jun 27, 2024
e4563fa
Driver
jdoerfert Jun 27, 2024
0e806bc
AI
jdoerfert Jun 28, 2024
6ac086a
WIP
jdoerfert Jun 28, 2024
514bbb9
Fix cuda and fake ptr handling
jdoerfert Jun 28, 2024
6c053f3
Fixes and more tests
jdoerfert Jun 29, 2024
5c49fb6
Fixes
jdoerfert Jun 29, 2024
0ce2440
Initial support for source locations
jdoerfert Jul 2, 2024
d003f7e
Fixes
jdoerfert Jul 2, 2024
cbdd509
Backtrace support
jdoerfert Jul 2, 2024
612afcc
More tests
jdoerfert Jul 2, 2024
4bf46ce
Ambiguous call trace support
jdoerfert Jul 3, 2024
f216b05
Multi call
jdoerfert Jul 3, 2024
b55babe
Improve test
jdoerfert Jul 3, 2024
6f4233b
Inline calls
jdoerfert Jul 9, 2024
6a82721
Fix build, add shuffle
jdoerfert Jul 13, 2024
2088c55
Add support for globals
EthanLuisMcDonough Jul 19, 2024
06b4d0c
Tune inlining, improve perf for -g
jdoerfert Jul 17, 2024
c004067
[OpenMP][NFC] Precommit test auto-update
jdoerfert Jul 19, 2024
7609f76
[OpenMP] Ensure the actual kernel is annotated with launch bounds
jdoerfert Jul 19, 2024
c8286fb
Fix ompx_new_local
jdoerfert Jul 22, 2024
77ec431
Merge branch 'gpu_san' into gpusan_globals
EthanLuisMcDonough Jul 22, 2024
30154c2
Add basic support for host stack traces
jdoerfert Jul 23, 2024
9ea1f74
Add PC again
jdoerfert Jul 23, 2024
1d36a54
Merge branch 'gpu_san' into gpusan_globals
EthanLuisMcDonough Jul 24, 2024
8bd2223
Fix global tests and -O3 calls
EthanLuisMcDonough Jul 25, 2024
5e5e799
Refactor GEP operand change
EthanLuisMcDonough Jul 25, 2024
81af937
saving work
Jul 26, 2024
86abd53
fix if you can't hoist address computation
Jul 28, 2024
2c3efed
save work
Jul 31, 2024
15e6567
edits to hoisting checks
Jul 31, 2024
9b7ad4b
edits
Aug 6, 2024
6a4da77
more tests
Aug 7, 2024
ef5d63e
fix runtime error; more test
Aug 8, 2024
e44487e
fix PO handling
Aug 9, 2024
5c5bf4a
hoist checks out of loops
Aug 12, 2024
b57e244
edits
Aug 13, 2024
8fd6772
edit
Aug 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions clang/include/clang/Basic/LangOptions.def
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,7 @@ LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kern
LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP")
LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP")
LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.")
LANGOPT(OffloadViaLLVM, 1, 0, "target LLVM/Offload as portable offloading runtime.")

LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device")
LANGOPT(SYCLIsHost , 1, 0, "SYCL host compilation")
Expand Down
3 changes: 3 additions & 0 deletions clang/include/clang/Basic/Sanitizers.def
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,9 @@ SANITIZER_GROUP("bounds", Bounds, ArrayBounds | LocalBounds)
// Scudo hardened allocator
SANITIZER("scudo", Scudo)

// LLVM/Offload sanitizer
SANITIZER("offload", Offload)

// Magic group, containing all sanitizers. For example, "-fno-sanitize=all"
// can be used to disable all the sanitizers.
SANITIZER_GROUP("all", All, ~SanitizerMask())
Expand Down
6 changes: 6 additions & 0 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -1293,6 +1293,12 @@ def no_offload_compress : Flag<["--"], "no-offload-compress">;
def offload_compression_level_EQ : Joined<["--"], "offload-compression-level=">,
Flags<[HelpHidden]>,
HelpText<"Compression level for offload device binaries (HIP only)">;

defm offload_via_llvm : BoolFOption<"offload-via-llvm",
LangOpts<"OffloadViaLLVM">, DefaultFalse,
PosFlag<SetTrue, [], [ClangOption, CC1Option], "Use">,
NegFlag<SetFalse, [], [ClangOption], "Don't use">,
BothFlags<[], [ClangOption], " LLVM/Offload as portable offloading runtime.">>;
}

// CUDA options
Expand Down
3 changes: 3 additions & 0 deletions clang/include/clang/Driver/SanitizerArgs.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ class SanitizerArgs {

bool needsMemProfRt() const { return NeedsMemProfRt; }
bool needsAsanRt() const { return Sanitizers.has(SanitizerKind::Address); }
bool needsOffloadKernels() const {
return Sanitizers.has(SanitizerKind::Offload);
}
bool needsHwasanRt() const {
return Sanitizers.has(SanitizerKind::HWAddress);
}
Expand Down
1 change: 1 addition & 0 deletions clang/lib/CodeGen/BackendUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
#include "llvm/Transforms/Instrumentation/BoundsChecking.h"
#include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h"
#include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
#include "llvm/Transforms/Instrumentation/GPUSan.h"
#include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
#include "llvm/Transforms/Instrumentation/InstrProfiling.h"
#include "llvm/Transforms/Instrumentation/KCFI.h"
Expand Down
97 changes: 82 additions & 15 deletions clang/lib/CodeGen/CGCUDANV.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,12 @@
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/CharUnits.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
Expand All @@ -36,6 +38,11 @@ constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

/// The prefix used for function calls and section names (CUDA, HIP, LLVM)
StringRef Prefix;
/// TODO: We should transition the OpenMP section to LLVM/Offload
StringRef SectionPrefix;

private:
llvm::IntegerType *IntTy, *SizeTy;
llvm::Type *VoidTy;
Expand Down Expand Up @@ -132,6 +139,9 @@ class CGNVCUDARuntime : public CGCUDARuntime {
return DummyFunc;
}

Address prepareKernelArgs(CodeGenFunction &CGF, FunctionArgList &Args);
Address prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
FunctionArgList &Args);
void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
std::string getDeviceSideName(const NamedDecl *ND) override;
Expand Down Expand Up @@ -191,15 +201,11 @@ class CGNVCUDARuntime : public CGCUDARuntime {
} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
if (CGM.getLangOpts().HIP)
return ((Twine("hip") + Twine(FuncName)).str());
return ((Twine("cuda") + Twine(FuncName)).str());
return (Prefix + FuncName).str();
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
if (CGM.getLangOpts().HIP)
return ((Twine("__hip") + Twine(FuncName)).str());
return ((Twine("__cuda") + Twine(FuncName)).str());
return ("__" + Prefix + FuncName).str();
}

static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
Expand Down Expand Up @@ -227,6 +233,14 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
SizeTy = CGM.SizeTy;
VoidTy = CGM.VoidTy;
PtrTy = CGM.UnqualPtrTy;

if (CGM.getLangOpts().OffloadViaLLVM) {
Prefix = "llvm";
SectionPrefix = "omp";
} else if (CGM.getLangOpts().HIP)
SectionPrefix = Prefix = "hip";
else
SectionPrefix = Prefix = "cuda";
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
Expand Down Expand Up @@ -305,18 +319,58 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
}
if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
CudaFeature::CUDA_USES_NEW_LAUNCH) ||
(CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
(CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI) ||
(CGF.getLangOpts().OffloadViaLLVM))
emitDeviceStubBodyNew(CGF, Args);
else
emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
FunctionArgList &Args) {
// Build the shadow stack entry at the very start of the function.
/// CUDA passes the arguments with a level of indirection. For example, a
/// (void*, short, void*) is passed as {void **, short *, void **} to the launch
/// function. For the LLVM/offload launch we flatten the arguments into the
/// struct directly. In addition, we include the size of the arguments, thus
/// pass {sizeof({void *, short, void *}), ptr to {void *, short, void *},
/// nullptr}. The last nullptr needs to be initialized to an array of pointers
/// pointing to the arguments if we want to offload to the host.
Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
FunctionArgList &Args) {
SmallVector<llvm::Type *> ArgTypes, KernelLaunchParamsTypes;
for (auto &Arg : Args)
ArgTypes.push_back(CGF.ConvertTypeForMem(Arg->getType()));
llvm::StructType *KernelArgsTy = llvm::StructType::create(ArgTypes);

auto *Int64Ty = CGF.Builder.getInt64Ty();
KernelLaunchParamsTypes.push_back(Int64Ty);
KernelLaunchParamsTypes.push_back(PtrTy);
KernelLaunchParamsTypes.push_back(PtrTy);

llvm::StructType *KernelLaunchParamsTy =
llvm::StructType::create(KernelLaunchParamsTypes);
Address KernelArgs = CGF.CreateTempAllocaWithoutCast(
KernelArgsTy, CharUnits::fromQuantity(16), "kernel_args");
Address KernelLaunchParams = CGF.CreateTempAllocaWithoutCast(
KernelLaunchParamsTy, CharUnits::fromQuantity(16),
"kernel_launch_params");

auto KernelArgsSize = CGM.getDataLayout().getTypeAllocSize(KernelArgsTy);
CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSize),
CGF.Builder.CreateStructGEP(KernelLaunchParams, 0));
CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF),
CGF.Builder.CreateStructGEP(KernelLaunchParams, 1));
CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy),
CGF.Builder.CreateStructGEP(KernelLaunchParams, 2));

for (unsigned i = 0; i < Args.size(); ++i) {
auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i]));
CGF.Builder.CreateStore(ArgVal, CGF.Builder.CreateStructGEP(KernelArgs, i));
}

return KernelLaunchParams;
}

Address CGNVCUDARuntime::prepareKernelArgs(CodeGenFunction &CGF,
FunctionArgList &Args) {
// Calculate amount of space we will need for all arguments. If we have no
// args, allocate a single pointer so we still have a valid pointer to the
// argument array that we can pass to runtime, even if it will be unused.
Expand All @@ -331,6 +385,17 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
PtrTy, KernelArgs.emitRawPointer(CGF), i));
}
return KernelArgs;
}

// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
FunctionArgList &Args) {
// Build the shadow stack entry at the very start of the function.
Address KernelArgs = CGF.getLangOpts().OffloadViaLLVM
? prepareKernelArgsLLVMOffload(CGF, Args)
: prepareKernelArgs(CGF, Args);

llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

Expand Down Expand Up @@ -1129,8 +1194,9 @@ void CGNVCUDARuntime::transformManagedVars() {
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
void CGNVCUDARuntime::createOffloadingEntries() {
StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
: "cuda_offloading_entries";
SmallVector<char, 32> Out;
StringRef Section = (SectionPrefix + "_offloading_entries").toStringRef(Out);

llvm::Module &M = CGM.getModule();
for (KernelInfo &I : EmittedKernels)
llvm::offloading::emitOffloadingEntry(
Expand Down Expand Up @@ -1199,7 +1265,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
}
return nullptr;
}
if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
if (CGM.getLangOpts().OffloadViaLLVM ||
(CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
createOffloadingEntries();
else
return makeModuleCtorFunction();
Expand Down
4 changes: 4 additions & 0 deletions clang/lib/CodeGen/CGDeclCXX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,10 @@ llvm::Function *CodeGenModule::CreateGlobalInitOrCleanUpFunction(
!isInNoSanitizeList(SanitizerKind::Address, Fn, Loc))
Fn->addFnAttr(llvm::Attribute::SanitizeAddress);

if (getLangOpts().Sanitize.has(SanitizerKind::Offload) &&
!isInNoSanitizeList(SanitizerKind::Offload, Fn, Loc))
Fn->addFnAttr(llvm::Attribute::SanitizeAddress);

if (getLangOpts().Sanitize.has(SanitizerKind::KernelAddress) &&
!isInNoSanitizeList(SanitizerKind::KernelAddress, Fn, Loc))
Fn->addFnAttr(llvm::Attribute::SanitizeAddress);
Expand Down
46 changes: 27 additions & 19 deletions clang/lib/CodeGen/CGStmtOpenMP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -639,27 +639,42 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
// Build the argument list.
bool NeedWrapperFunction =
getDebugInfo() && CGM.getCodeGenOpts().hasReducedDebugInfo();
FunctionArgList Args;
llvm::MapVector<const Decl *, std::pair<const VarDecl *, Address>> LocalAddrs;
llvm::DenseMap<const Decl *, std::pair<const Expr *, llvm::Value *>> VLASizes;
FunctionArgList Args, WrapperArgs;
llvm::MapVector<const Decl *, std::pair<const VarDecl *, Address>> LocalAddrs,
WrapperLocalAddrs;
llvm::DenseMap<const Decl *, std::pair<const Expr *, llvm::Value *>> VLASizes,
WrapperVLASizes;
SmallString<256> Buffer;
llvm::raw_svector_ostream Out(Buffer);
Out << CapturedStmtInfo->getHelperName();
if (NeedWrapperFunction)

CodeGenFunction WrapperCGF(CGM, /*suppressNewContext=*/true);
llvm::Function *WrapperF = nullptr;
if (NeedWrapperFunction) {
// Emit the final kernel early to allow attributes to be added by the
// OpenMPI-IR-Builder.
FunctionOptions WrapperFO(&S, /*UIntPtrCastRequired=*/true,
/*RegisterCastedArgsOnly=*/true,
CapturedStmtInfo->getHelperName(), Loc);
WrapperCGF.CapturedStmtInfo = CapturedStmtInfo;
WrapperF =
emitOutlinedFunctionPrologue(WrapperCGF, Args, LocalAddrs, VLASizes,
WrapperCGF.CXXThisValue, WrapperFO);
Out << "_debug__";
}
FunctionOptions FO(&S, !NeedWrapperFunction, /*RegisterCastedArgsOnly=*/false,
Out.str(), Loc);
llvm::Function *F = emitOutlinedFunctionPrologue(*this, Args, LocalAddrs,
VLASizes, CXXThisValue, FO);
llvm::Function *F = emitOutlinedFunctionPrologue(
*this, WrapperArgs, WrapperLocalAddrs, WrapperVLASizes, CXXThisValue, FO);
CodeGenFunction::OMPPrivateScope LocalScope(*this);
for (const auto &LocalAddrPair : LocalAddrs) {
for (const auto &LocalAddrPair : WrapperLocalAddrs) {
if (LocalAddrPair.second.first) {
LocalScope.addPrivate(LocalAddrPair.second.first,
LocalAddrPair.second.second);
}
}
(void)LocalScope.Privatize();
for (const auto &VLASizePair : VLASizes)
for (const auto &VLASizePair : WrapperVLASizes)
VLASizeMap[VLASizePair.second.first] = VLASizePair.second.second;
PGO.assignRegionCounters(GlobalDecl(CD), F);
CapturedStmtInfo->EmitBody(*this, CD->getBody());
Expand All @@ -668,17 +683,10 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
if (!NeedWrapperFunction)
return F;

FunctionOptions WrapperFO(&S, /*UIntPtrCastRequired=*/true,
/*RegisterCastedArgsOnly=*/true,
CapturedStmtInfo->getHelperName(), Loc);
CodeGenFunction WrapperCGF(CGM, /*suppressNewContext=*/true);
WrapperCGF.CapturedStmtInfo = CapturedStmtInfo;
Args.clear();
LocalAddrs.clear();
VLASizes.clear();
llvm::Function *WrapperF =
emitOutlinedFunctionPrologue(WrapperCGF, Args, LocalAddrs, VLASizes,
WrapperCGF.CXXThisValue, WrapperFO);
// Reverse the order.
WrapperF->removeFromParent();
F->getParent()->getFunctionList().insertAfter(F->getIterator(), WrapperF);

llvm::SmallVector<llvm::Value *, 4> CallArgs;
auto *PI = F->arg_begin();
for (const auto *Arg : Args) {
Expand Down
9 changes: 7 additions & 2 deletions clang/lib/CodeGen/CodeGenFunction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "clang/AST/StmtObjC.h"
#include "clang/Basic/Builtins.h"
#include "clang/Basic/CodeGenOptions.h"
#include "clang/Basic/Sanitizers.h"
#include "clang/Basic/TargetBuiltins.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/CodeGen/CGFunctionInfo.h"
Expand Down Expand Up @@ -67,7 +68,8 @@ static bool shouldEmitLifetimeMarkers(const CodeGenOptions &CGOpts,
// Sanitizers may use markers.
if (CGOpts.SanitizeAddressUseAfterScope ||
LangOpts.Sanitize.has(SanitizerKind::HWAddress) ||
LangOpts.Sanitize.has(SanitizerKind::Memory))
LangOpts.Sanitize.has(SanitizerKind::Memory) ||
LangOpts.Sanitize.has(SanitizerKind::Offload))
return true;

// For now, only in optimized builds.
Expand Down Expand Up @@ -791,6 +793,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
SanOpts.set(SanitizerKind::KernelHWAddress, false);
if (no_sanitize_mask & SanitizerKind::KernelHWAddress)
SanOpts.set(SanitizerKind::HWAddress, false);
if (no_sanitize_mask & SanitizerKind::Offload)
SanOpts.set(SanitizerKind::Offload, false);

if (SanitizeBounds && !SanOpts.hasOneOf(SanitizerKind::Bounds))
Fn->addFnAttr(llvm::Attribute::NoSanitizeBounds);
Expand All @@ -809,7 +813,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
CurFn->addFnAttr(llvm::Attribute::DisableSanitizerInstrumentation);
} else {
// Apply sanitizer attributes to the function.
if (SanOpts.hasOneOf(SanitizerKind::Address | SanitizerKind::KernelAddress))
if (SanOpts.hasOneOf(SanitizerKind::Address | SanitizerKind::KernelAddress |
SanitizerKind::Offload))
Fn->addFnAttr(llvm::Attribute::SanitizeAddress);
if (SanOpts.hasOneOf(SanitizerKind::HWAddress |
SanitizerKind::KernelHWAddress))
Expand Down
Loading