diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 2dea3cd4d795b..e8d3be7e89dbb 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -288,6 +288,7 @@ LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kern LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP") LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP") LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.") +LANGOPT(OffloadViaLLVM, 1, 0, "target LLVM/Offload as portable offloading runtime.") LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device") LANGOPT(SYCLIsHost , 1, 0, "SYCL host compilation") diff --git a/clang/include/clang/Basic/Sanitizers.def b/clang/include/clang/Basic/Sanitizers.def index bee35e9dca7c3..0b41187f6db52 100644 --- a/clang/include/clang/Basic/Sanitizers.def +++ b/clang/include/clang/Basic/Sanitizers.def @@ -190,6 +190,9 @@ SANITIZER_GROUP("bounds", Bounds, ArrayBounds | LocalBounds) // Scudo hardened allocator SANITIZER("scudo", Scudo) +// LLVM/Offload sanitizer +SANITIZER("offload", Offload) + // Magic group, containing all sanitizers. For example, "-fno-sanitize=all" // can be used to disable all the sanitizers. 
SANITIZER_GROUP("all", All, ~SanitizerMask()) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index d44faa55c456f..b77fd063f5519 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1293,6 +1293,12 @@ def no_offload_compress : Flag<["--"], "no-offload-compress">; def offload_compression_level_EQ : Joined<["--"], "offload-compression-level=">, Flags<[HelpHidden]>, HelpText<"Compression level for offload device binaries (HIP only)">; + +defm offload_via_llvm : BoolFOption<"offload-via-llvm", + LangOpts<"OffloadViaLLVM">, DefaultFalse, + PosFlag, + NegFlag, + BothFlags<[], [ClangOption], " LLVM/Offload as portable offloading runtime.">>; } // CUDA options diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h index 47ef175302679..004d5fbf4af73 100644 --- a/clang/include/clang/Driver/SanitizerArgs.h +++ b/clang/include/clang/Driver/SanitizerArgs.h @@ -80,6 +80,9 @@ class SanitizerArgs { bool needsMemProfRt() const { return NeedsMemProfRt; } bool needsAsanRt() const { return Sanitizers.has(SanitizerKind::Address); } + bool needsOffloadKernels() const { + return Sanitizers.has(SanitizerKind::Offload); + } bool needsHwasanRt() const { return Sanitizers.has(SanitizerKind::HWAddress); } diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index b09680086248d..b1d11b4a6497a 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -70,6 +70,7 @@ #include "llvm/Transforms/Instrumentation/BoundsChecking.h" #include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h" #include "llvm/Transforms/Instrumentation/GCOVProfiler.h" +#include "llvm/Transforms/Instrumentation/GPUSan.h" #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" #include "llvm/Transforms/Instrumentation/InstrProfiling.h" #include "llvm/Transforms/Instrumentation/KCFI.h" diff --git 
a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 43dfbbb90dd52..2ebe0bf802dfa 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -15,10 +15,12 @@ #include "CGCXXABI.h" #include "CodeGenFunction.h" #include "CodeGenModule.h" +#include "clang/AST/CharUnits.h" #include "clang/AST/Decl.h" #include "clang/Basic/Cuda.h" #include "clang/CodeGen/CodeGenABITypes.h" #include "clang/CodeGen/ConstantInitBuilder.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Frontend/Offloading/Utility.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -36,6 +38,11 @@ constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF" class CGNVCUDARuntime : public CGCUDARuntime { + /// The prefix used for function calls and section names (CUDA, HIP, LLVM) + StringRef Prefix; + /// TODO: We should transition the OpenMP section to LLVM/Offload + StringRef SectionPrefix; + private: llvm::IntegerType *IntTy, *SizeTy; llvm::Type *VoidTy; @@ -132,6 +139,9 @@ class CGNVCUDARuntime : public CGCUDARuntime { return DummyFunc; } + Address prepareKernelArgs(CodeGenFunction &CGF, FunctionArgList &Args); + Address prepareKernelArgsLLVMOffload(CodeGenFunction &CGF, + FunctionArgList &Args); void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); std::string getDeviceSideName(const NamedDecl *ND) override; @@ -191,15 +201,11 @@ class CGNVCUDARuntime : public CGCUDARuntime { } // end anonymous namespace std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { - if (CGM.getLangOpts().HIP) - return ((Twine("hip") + Twine(FuncName)).str()); - return ((Twine("cuda") + Twine(FuncName)).str()); + return (Prefix + FuncName).str(); } std::string CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { - if (CGM.getLangOpts().HIP) - return ((Twine("__hip") + Twine(FuncName)).str()); - return ((Twine("__cuda") + 
Twine(FuncName)).str()); + return ("__" + Prefix + FuncName).str(); } static std::unique_ptr InitDeviceMC(CodeGenModule &CGM) { @@ -227,6 +233,14 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) SizeTy = CGM.SizeTy; VoidTy = CGM.VoidTy; PtrTy = CGM.UnqualPtrTy; + + if (CGM.getLangOpts().OffloadViaLLVM) { + Prefix = "llvm"; + SectionPrefix = "omp"; + } else if (CGM.getLangOpts().HIP) + SectionPrefix = Prefix = "hip"; + else + SectionPrefix = Prefix = "cuda"; } llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const { @@ -305,18 +319,58 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, } if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), CudaFeature::CUDA_USES_NEW_LAUNCH) || - (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI)) + (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI) || + (CGF.getLangOpts().OffloadViaLLVM)) emitDeviceStubBodyNew(CGF, Args); else emitDeviceStubBodyLegacy(CGF, Args); } -// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local -// array and kernels are launched using cudaLaunchKernel(). -void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, - FunctionArgList &Args) { - // Build the shadow stack entry at the very start of the function. +/// CUDA passes the arguments with a level of indirection. For example, a +/// (void*, short, void*) is passed as {void **, short *, void **} to the launch +/// function. For the LLVM/offload launch we flatten the arguments into the +/// struct directly. In addition, we include the size of the arguments, thus +/// pass {sizeof({void *, short, void *}), ptr to {void *, short, void *}, +/// nullptr}. The last nullptr needs to be initialized to an array of pointers +/// pointing to the arguments if we want to offload to the host. 
+Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF, + FunctionArgList &Args) { + SmallVector ArgTypes, KernelLaunchParamsTypes; + for (auto &Arg : Args) + ArgTypes.push_back(CGF.ConvertTypeForMem(Arg->getType())); + llvm::StructType *KernelArgsTy = llvm::StructType::create(ArgTypes); + + auto *Int64Ty = CGF.Builder.getInt64Ty(); + KernelLaunchParamsTypes.push_back(Int64Ty); + KernelLaunchParamsTypes.push_back(PtrTy); + KernelLaunchParamsTypes.push_back(PtrTy); + + llvm::StructType *KernelLaunchParamsTy = + llvm::StructType::create(KernelLaunchParamsTypes); + Address KernelArgs = CGF.CreateTempAllocaWithoutCast( + KernelArgsTy, CharUnits::fromQuantity(16), "kernel_args"); + Address KernelLaunchParams = CGF.CreateTempAllocaWithoutCast( + KernelLaunchParamsTy, CharUnits::fromQuantity(16), + "kernel_launch_params"); + + auto KernelArgsSize = CGM.getDataLayout().getTypeAllocSize(KernelArgsTy); + CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSize), + CGF.Builder.CreateStructGEP(KernelLaunchParams, 0)); + CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF), + CGF.Builder.CreateStructGEP(KernelLaunchParams, 1)); + CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy), + CGF.Builder.CreateStructGEP(KernelLaunchParams, 2)); + + for (unsigned i = 0; i < Args.size(); ++i) { + auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i])); + CGF.Builder.CreateStore(ArgVal, CGF.Builder.CreateStructGEP(KernelArgs, i)); + } + return KernelLaunchParams; +} + +Address CGNVCUDARuntime::prepareKernelArgs(CodeGenFunction &CGF, + FunctionArgList &Args) { // Calculate amount of space we will need for all arguments. If we have no // args, allocate a single pointer so we still have a valid pointer to the // argument array that we can pass to runtime, even if it will be unused. 
@@ -331,6 +385,17 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, VoidVarPtr, CGF.Builder.CreateConstGEP1_32( PtrTy, KernelArgs.emitRawPointer(CGF), i)); } + return KernelArgs; +} + +// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local +// array and kernels are launched using cudaLaunchKernel(). +void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, + FunctionArgList &Args) { + // Build the shadow stack entry at the very start of the function. + Address KernelArgs = CGF.getLangOpts().OffloadViaLLVM + ? prepareKernelArgsLLVMOffload(CGF, Args) + : prepareKernelArgs(CGF, Args); llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); @@ -1129,8 +1194,9 @@ void CGNVCUDARuntime::transformManagedVars() { // registered. The linker will provide a pointer to this section so we can // register the symbols with the linked device image. void CGNVCUDARuntime::createOffloadingEntries() { - StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries" - : "cuda_offloading_entries"; + SmallVector Out; + StringRef Section = (SectionPrefix + "_offloading_entries").toStringRef(Out); + llvm::Module &M = CGM.getModule(); for (KernelInfo &I : EmittedKernels) llvm::offloading::emitOffloadingEntry( @@ -1199,7 +1265,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() { } return nullptr; } - if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode) + if (CGM.getLangOpts().OffloadViaLLVM || + (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)) createOffloadingEntries(); else return makeModuleCtorFunction(); diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp index a88bb2af59fee..90ebc638bd45e 100644 --- a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -456,6 +456,10 @@ llvm::Function *CodeGenModule::CreateGlobalInitOrCleanUpFunction( !isInNoSanitizeList(SanitizerKind::Address, Fn, Loc)) Fn->addFnAttr(llvm::Attribute::SanitizeAddress); + if 
(getLangOpts().Sanitize.has(SanitizerKind::Offload) && + !isInNoSanitizeList(SanitizerKind::Offload, Fn, Loc)) + Fn->addFnAttr(llvm::Attribute::SanitizeAddress); + if (getLangOpts().Sanitize.has(SanitizerKind::KernelAddress) && !isInNoSanitizeList(SanitizerKind::KernelAddress, Fn, Loc)) Fn->addFnAttr(llvm::Attribute::SanitizeAddress); diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index cea0d84c64bc4..d11c4ed65a597 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -31,6 +31,7 @@ #include "clang/AST/StmtObjC.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/CodeGenOptions.h" +#include "clang/Basic/Sanitizers.h" #include "clang/Basic/TargetBuiltins.h" #include "clang/Basic/TargetInfo.h" #include "clang/CodeGen/CGFunctionInfo.h" @@ -67,7 +68,8 @@ static bool shouldEmitLifetimeMarkers(const CodeGenOptions &CGOpts, // Sanitizers may use markers. if (CGOpts.SanitizeAddressUseAfterScope || LangOpts.Sanitize.has(SanitizerKind::HWAddress) || - LangOpts.Sanitize.has(SanitizerKind::Memory)) + LangOpts.Sanitize.has(SanitizerKind::Memory) || + LangOpts.Sanitize.has(SanitizerKind::Offload)) return true; // For now, only in optimized builds. @@ -791,6 +793,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, SanOpts.set(SanitizerKind::KernelHWAddress, false); if (no_sanitize_mask & SanitizerKind::KernelHWAddress) SanOpts.set(SanitizerKind::HWAddress, false); + if (no_sanitize_mask & SanitizerKind::Offload) + SanOpts.set(SanitizerKind::Offload, false); if (SanitizeBounds && !SanOpts.hasOneOf(SanitizerKind::Bounds)) Fn->addFnAttr(llvm::Attribute::NoSanitizeBounds); @@ -809,7 +813,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, CurFn->addFnAttr(llvm::Attribute::DisableSanitizerInstrumentation); } else { // Apply sanitizer attributes to the function. 
- if (SanOpts.hasOneOf(SanitizerKind::Address | SanitizerKind::KernelAddress)) + if (SanOpts.hasOneOf(SanitizerKind::Address | SanitizerKind::KernelAddress | + SanitizerKind::Offload)) Fn->addFnAttr(llvm::Attribute::SanitizeAddress); if (SanOpts.hasOneOf(SanitizerKind::HWAddress | SanitizerKind::KernelHWAddress)) diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 67bf0604acd6e..0f55b429512c2 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -792,11 +792,13 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, }) || C.getInputArgs().hasArg(options::OPT_hip_link) || C.getInputArgs().hasArg(options::OPT_hipstdpar); + bool UseLLVMOffload = C.getInputArgs().hasArg( + options::OPT_foffload_via_llvm, options::OPT_fno_offload_via_llvm, false); if (IsCuda && IsHIP) { Diag(clang::diag::err_drv_mix_cuda_hip); return; } - if (IsCuda) { + if (IsCuda && !UseLLVMOffload) { const ToolChain *HostTC = C.getSingleOffloadToolChain(); const llvm::Triple &HostTriple = HostTC->getTriple(); auto OFK = Action::OFK_Cuda; @@ -818,7 +820,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, CudaInstallation.WarnIfUnsupportedVersion(); } C.addOffloadDeviceToolChain(CudaTC.get(), OFK); - } else if (IsHIP) { + } else if (IsHIP && !UseLLVMOffload) { if (auto *OMPTargetArg = C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) { Diag(clang::diag::err_drv_unsupported_opt_for_language_mode) @@ -842,10 +844,11 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, // We need to generate an OpenMP toolchain if the user specified targets with // the -fopenmp-targets option or used --offload-arch with OpenMP enabled. 
bool IsOpenMPOffloading = - C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, - options::OPT_fno_openmp, false) && - (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) || - C.getInputArgs().hasArg(options::OPT_offload_arch_EQ)); + ((IsCuda || IsHIP) && UseLLVMOffload) || + (C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, + options::OPT_fno_openmp, false) && + (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) || + C.getInputArgs().hasArg(options::OPT_offload_arch_EQ))); if (IsOpenMPOffloading) { // We expect that -fopenmp-targets is always used in conjunction with the // option -fopenmp specifying a valid runtime with offloading support, i.e. @@ -873,7 +876,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, for (StringRef T : OpenMPTargets->getValues()) OpenMPTriples.insert(T); } else if (C.getInputArgs().hasArg(options::OPT_offload_arch_EQ) && - !IsHIP && !IsCuda) { + ((!IsHIP && !IsCuda) || UseLLVMOffload)) { const ToolChain *HostTC = C.getSingleOffloadToolChain(); auto AMDTriple = getHIPOffloadTargetTriple(*this, C.getInputArgs()); auto NVPTXTriple = getNVIDIAOffloadTargetTriple(*this, C.getInputArgs(), @@ -4146,6 +4149,8 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args, bool UseNewOffloadingDriver = C.isOffloadingHostKind(Action::OFK_OpenMP) || + Args.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false) || Args.hasFlag(options::OPT_offload_new_driver, options::OPT_no_offload_new_driver, false); diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 40ab2e91125d1..05b53c5573a1c 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -1367,7 +1367,8 @@ SanitizerMask ToolChain::getSupportedSanitizers() const { SanitizerKind::CFICastStrict | SanitizerKind::FloatDivideByZero | SanitizerKind::KCFI | SanitizerKind::UnsignedIntegerOverflow | SanitizerKind::UnsignedShiftBase | 
SanitizerKind::ImplicitConversion | - SanitizerKind::Nullability | SanitizerKind::LocalBounds; + SanitizerKind::Nullability | SanitizerKind::LocalBounds | + SanitizerKind::Offload; if (getTriple().getArch() == llvm::Triple::x86 || getTriple().getArch() == llvm::Triple::x86_64 || getTriple().getArch() == llvm::Triple::arm || getTriple().isWasm() || diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index b8d8ff3db5d1f..77d365bad229f 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1125,6 +1125,18 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA, CmdArgs.push_back("__clang_openmp_device_functions.h"); } + if (Args.hasArg(options::OPT_foffload_via_llvm)) { + // Add llvm_wrappers/* to our system include path. This lets us wrap + // standard library headers and other headers. + SmallString<128> P(D.ResourceDir); + llvm::sys::path::append(P, "include", "llvm_offload_wrappers"); + CmdArgs.append({"-internal-isystem", Args.MakeArgString(P), "-include"}); + if (JA.isDeviceOffloading(Action::OFK_OpenMP)) + CmdArgs.push_back("__llvm_offload_device.h"); + else + CmdArgs.push_back("__llvm_offload_host.h"); + } + // Add -i* options, and automatically translate to // -include-pch/-include-pth for transparent PCH support. It's // wonky, but we include looking for .gch so we can support seamless @@ -6598,6 +6610,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, // device offloading action other than OpenMP. 
if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, options::OPT_fno_openmp, false) && + !Args.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false) && (JA.isDeviceOffloading(Action::OFK_None) || JA.isDeviceOffloading(Action::OFK_OpenMP))) { switch (D.getOpenMPRuntime(Args)) { @@ -6675,11 +6689,16 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.addOptOutFlag(CmdArgs, options::OPT_fopenmp_extensions, options::OPT_fno_openmp_extensions); } - - // Forward the new driver to change offloading code generation. - if (Args.hasFlag(options::OPT_offload_new_driver, - options::OPT_no_offload_new_driver, false)) + // Forward the offload runtime change to code generation, liboffload implies + // new driver. Otherwise, check if we should forward the new driver to change + // offloading code generation. + if (Args.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false)) { + CmdArgs.append({"--offload-new-driver", "-foffload-via-llvm"}); + } else if (Args.hasFlag(options::OPT_offload_new_driver, + options::OPT_no_offload_new_driver, false)) { CmdArgs.push_back("--offload-new-driver"); + } SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 2a4c1369f5a73..ecbee87ed6486 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1204,9 +1204,20 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs, const ToolChain &TC, const ArgList &Args, bool ForceStaticHostRuntime, bool IsOffloadingHost, bool GompNeedsRT) { + const SanitizerArgs &SanArgs = TC.getSanitizerArgs(Args); + if (SanArgs.needsOffloadKernels()) { + CmdArgs.push_back("-loffload.kernels"); + CmdArgs.append({"-mllvm", "-enable-offload-sanitizer"}); + } + if (!Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, - options::OPT_fno_openmp, false)) + 
options::OPT_fno_openmp, false)) { + // We need libomptarget (liboffload) if it's the chosen offloading runtime. + if (Args.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false)) + CmdArgs.push_back("-lomptarget"); return false; + } Driver::OpenMPRuntimeKind RTKind = TC.getDriver().getOpenMPRuntime(Args); diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 2dfc7457b0ac7..7d154865ce3f2 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -861,17 +861,15 @@ void CudaToolChain::addClangTargetOptions( DeviceOffloadingKind == Action::OFK_Cuda) && "Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs."); - if (DeviceOffloadingKind == Action::OFK_Cuda) { - CC1Args.append( - {"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"}); - - // Unsized function arguments used for variadics were introduced in CUDA-9.0 - // We still do not support generating code that actually uses variadic - // arguments yet, but we do need to allow parsing them as recent CUDA - // headers rely on that. https://github.com/llvm/llvm-project/issues/58410 - if (CudaInstallation.version() >= CudaVersion::CUDA_90) - CC1Args.push_back("-fcuda-allow-variadic-functions"); - } + CC1Args.append( + {"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"}); + + // Unsized function arguments used for variadics were introduced in CUDA-9.0 + // We still do not support generating code that actually uses variadic + // arguments yet, but we do need to allow parsing them as recent CUDA + // headers rely on that. 
https://github.com/llvm/llvm-project/issues/58410 + if (CudaInstallation.version() >= CudaVersion::CUDA_90) + CC1Args.push_back("-fcuda-allow-variadic-functions"); if (DriverArgs.hasArg(options::OPT_nogpulib)) return; @@ -889,6 +887,13 @@ void CudaToolChain::addClangTargetOptions( CC1Args.push_back("-mlink-builtin-bitcode"); CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile)); + // For now, we don't use any Offload/OpenMP device runtime when we offload + // CUDA via LLVM/Offload. We should split the Offload/OpenMP device runtime + // and include the "generic" (or CUDA-specific) parts. + if (DriverArgs.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false)) + return; + clang::CudaVersion CudaInstallationVersion = CudaInstallation.version(); if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr, diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index d3090e488306f..251e5b0ba2381 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -325,6 +325,13 @@ set(openmp_wrapper_files openmp_wrappers/new ) +set(llvm_offload_wrapper_files + llvm_offload_wrappers/__llvm_offload.h + llvm_offload_wrappers/__llvm_offload_host.h + llvm_offload_wrappers/__llvm_offload_device.h + llvm_offload_wrappers/cuda_runtime.h +) + set(llvm_libc_wrapper_files llvm_libc_wrappers/assert.h llvm_libc_wrappers/stdio.h @@ -375,7 +382,7 @@ endfunction(clang_generate_header) # Copy header files from the source directory to the build directory foreach( f ${files} ${cuda_wrapper_files} ${cuda_wrapper_bits_files} ${ppc_wrapper_files} ${openmp_wrapper_files} ${zos_wrapper_files} ${hlsl_files} - ${llvm_libc_wrapper_files}) + ${llvm_libc_wrapper_files} ${llvm_offload_wrapper_files}) copy_header_to_output_dir(${CMAKE_CURRENT_SOURCE_DIR} ${f}) endforeach( f ) @@ -501,6 +508,7 @@ add_header_target("hlsl-resource-headers" ${hlsl_files}) add_header_target("opencl-resource-headers" ${opencl_files}) 
add_header_target("llvm-libc-resource-headers" ${llvm_libc_wrapper_files}) add_header_target("openmp-resource-headers" ${openmp_wrapper_files}) +add_header_target("llvm-offload-resource-headers" ${llvm_offload_wrapper_files}) add_header_target("windows-resource-headers" ${windows_only_files}) add_header_target("utility-resource-headers" ${utility_files}) @@ -542,6 +550,11 @@ install( DESTINATION ${header_install_dir}/openmp_wrappers COMPONENT clang-resource-headers) +install( + FILES ${llvm_offload_wrapper_files} + DESTINATION ${header_install_dir}/llvm_offload_wrappers + COMPONENT clang-resource-headers) + install( FILES ${zos_wrapper_files} DESTINATION ${header_install_dir}/zos_wrappers @@ -704,8 +717,8 @@ install( COMPONENT openmp-resource-headers) install( - FILES ${openmp_wrapper_files} - DESTINATION ${header_install_dir}/openmp_wrappers + FILES ${llvm_offload_wrapper_files} + DESTINATION ${header_install_dir}/llvm_offload_wrappers EXCLUDE_FROM_ALL COMPONENT openmp-resource-headers) diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h new file mode 100644 index 0000000000000..2898898904e29 --- /dev/null +++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h @@ -0,0 +1,31 @@ +/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#include + +#define __host__ __attribute__((host)) +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) +#define __shared__ __attribute__((shared)) +#define __constant__ __attribute__((constant)) +#define __managed__ __attribute__((managed)) + +extern "C" { + +typedef struct dim3 { + dim3() {} + dim3(unsigned x) : x(x) {} + unsigned x = 0, y = 0, z = 0; +} dim3; + +// TODO: For some reason the CUDA device compilation requires this declaration +// to be present on the device while it is only used on the host. +unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim, + size_t sharedMem = 0, void *stream = 0); +} diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h new file mode 100644 index 0000000000000..1a813b331515b --- /dev/null +++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h @@ -0,0 +1,10 @@ +/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#include "__llvm_offload.h" diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h new file mode 100644 index 0000000000000..160289d169b55 --- /dev/null +++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h @@ -0,0 +1,15 @@ +/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#include "__llvm_offload.h" + +extern "C" { +unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, + void **args, size_t sharedMem = 0, void *stream = 0); +} diff --git a/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h b/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h new file mode 100644 index 0000000000000..2d698e1c14e49 --- /dev/null +++ b/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h @@ -0,0 +1,137 @@ +/*===- __cuda_runtime.h - LLVM/Offload wrappers for CUDA runtime API -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CUDA_RUNTIME_API__ +#define __CUDA_RUNTIME_API__ + +#include +#include +#include + +extern "C" { +int omp_get_initial_device(void); +void omp_target_free(void *Ptr, int Device); +void *omp_target_alloc(size_t Size, int Device); +int omp_target_memcpy(void *Dst, const void *Src, size_t Length, + size_t DstOffset, size_t SrcOffset, int DstDevice, + int SrcDevice); +void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum); +int __tgt_target_synchronize_async_info_queue(void *Loc, int64_t DeviceNum, + void *AsyncInfoQueue); +} + +// TODO: There are many fields missing in this enumeration. 
+typedef enum cudaError { + cudaSuccess = 0, + cudaErrorInvalidValue = 1, + cudaErrorMemoryAllocation = 2, + cudaErrorNoDevice = 100, + cudaErrorInvalidDevice = 101, + cudaErrorOTHER = -1, +} cudaError_t; + +enum cudaMemcpyKind { + cudaMemcpyHostToHost = 0, + cudaMemcpyHostToDevice = 1, + cudaMemcpyDeviceToHost = 2, + cudaMemcpyDeviceToDevice = 3, + cudaMemcpyDefault = 4 +}; + +typedef void *cudaStream_t; + +static thread_local cudaError_t __cudaomp_last_error = cudaSuccess; + +// Returns the last error that has been produced and resets it to cudaSuccess. +inline cudaError_t cudaGetLastError() { + cudaError_t TempError = __cudaomp_last_error; + __cudaomp_last_error = cudaSuccess; + return TempError; +} + +// Returns the last error that has been produced without resetting it. +inline cudaError_t cudaPeekAtLastError() { return __cudaomp_last_error; } + +inline cudaError_t cudaDeviceSynchronize() { + int DeviceNum = 0; + return __cudaomp_last_error = + (cudaError_t)__tgt_target_synchronize_async_info_queue( + /*Loc=*/nullptr, DeviceNum, /*AsyncInfoQueue=*/nullptr); +} + +inline cudaError_t __cudaMalloc(void **devPtr, size_t size) { + int DeviceNum = 0; + *devPtr = omp_target_alloc(size, DeviceNum); + if (*devPtr == NULL) + return __cudaomp_last_error = cudaErrorMemoryAllocation; + + return __cudaomp_last_error = cudaSuccess; +} + +template <class T> cudaError_t cudaMalloc(T **devPtr, size_t size) { + return __cudaMalloc((void **)devPtr, size); +} + +inline cudaError_t __cudaFree(void *devPtr) { + int DeviceNum = 0; + omp_target_free(devPtr, DeviceNum); + return __cudaomp_last_error = cudaSuccess; +} + +template <class T> inline cudaError_t cudaFree(T *ptr) { + return __cudaFree((void *)ptr); +} + +inline cudaError_t __cudaMemcpy(void *dst, const void *src, size_t count, + cudaMemcpyKind kind) { + // get the host device number (which is the initial device) + int HostDeviceNum = omp_get_initial_device(); + + // use the default device for gpu + int GPUDeviceNum = 0; + + // default to copy 
 from host to device + int DstDeviceNum = GPUDeviceNum; + int SrcDeviceNum = HostDeviceNum; + + if (kind == cudaMemcpyDeviceToHost) + std::swap(DstDeviceNum, SrcDeviceNum); + + // omp_target_memcpy returns 0 on success and non-zero on failure + if (omp_target_memcpy(dst, src, count, 0, 0, DstDeviceNum, SrcDeviceNum)) + return __cudaomp_last_error = cudaErrorInvalidValue; + return __cudaomp_last_error = cudaSuccess; +} + +template <class T> +inline cudaError_t cudaMemcpy(T *dst, const T *src, size_t count, + cudaMemcpyKind kind) { + return __cudaMemcpy((void *)dst, (const void *)src, count, kind); +} + +inline cudaError_t __cudaMemset(void *devPtr, int value, size_t count, + cudaStream_t stream = 0) { + int DeviceNum = 0; + if (!omp_target_memset(devPtr, value, count, DeviceNum)) + return __cudaomp_last_error = cudaErrorInvalidValue; + return __cudaomp_last_error = cudaSuccess; +} + +template <class T> +inline cudaError_t cudaMemset(T *devPtr, int value, size_t count) { + return __cudaMemset((void *)devPtr, value, count); +} + +inline cudaError_t cudaDeviceReset(void) { + cudaDeviceSynchronize(); + // TODO: not implemented. + return __cudaomp_last_error = cudaSuccess; +} + +#endif diff --git a/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h index d5b6846b03488..3e354c63efc66 100644 --- a/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h +++ b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h @@ -10,17 +10,15 @@ #ifndef __CLANG_OPENMP_DEVICE_FUNCTIONS_H__ #define __CLANG_OPENMP_DEVICE_FUNCTIONS_H__ -#ifndef _OPENMP -#error "This file is for OpenMP compilation only."
-#endif - #ifdef __cplusplus extern "C" { #endif +#ifdef __NVPTX__ #pragma omp begin declare variant match( \ device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) +#pragma push_macro("__CUDA__") #define __CUDA__ #define __OPENMP_NVPTX__ @@ -31,9 +29,10 @@ extern "C" { #include <__clang_cuda_device_functions.h> #undef __OPENMP_NVPTX__ -#undef __CUDA__ +#pragma pop_macro("__CUDA__") #pragma omp end declare variant +#endif #ifdef __AMDGCN__ #pragma omp begin declare variant match(device = {arch(amdgcn)}) diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp index 580b9872c6a1d..ec37c0df56c67 100644 --- a/clang/lib/Sema/SemaCUDA.cpp +++ b/clang/lib/Sema/SemaCUDA.cpp @@ -1068,6 +1068,9 @@ void SemaCUDA::inheritTargetAttrs(FunctionDecl *FD, } std::string SemaCUDA::getConfigureFuncName() const { + if (getLangOpts().OffloadViaLLVM) + return "__llvmPushCallConfiguration"; + if (getLangOpts().HIP) return getLangOpts().HIPUseNewLaunchAPI ? "__hipPushCallConfiguration" : "hipConfigureCall"; diff --git a/clang/test/CodeGenCUDA/offload_via_llvm.cu b/clang/test/CodeGenCUDA/offload_via_llvm.cu new file mode 100644 index 0000000000000..3eb580850fc48 --- /dev/null +++ b/clang/test/CodeGenCUDA/offload_via_llvm.cu @@ -0,0 +1,97 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang -Xclang -triple -Xclang "x86_64-unknown-linux-gnu" -S -c -foffload-via-llvm -emit-llvm -o - %s | FileCheck %s + +// Check that we generate LLVM/Offload calls, including the KERNEL_LAUNCH_PARAMS argument. 
+ +// CHECK-LABEL: define dso_local void @_Z18__device_stub__fooisPvS_( +// CHECK-SAME: i32 noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[TMP0]], align 16 +// CHECK-NEXT: [[KERNEL_LAUNCH_PARAMS:%.*]] = alloca [[TMP1]], align 16 +// CHECK-NEXT: [[GRID_DIM:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 8 +// CHECK-NEXT: [[BLOCK_DIM:%.*]] = alloca [[STRUCT_DIM3]], align 8 +// CHECK-NEXT: [[SHMEM_SIZE:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[STREAM:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[GRID_DIM_COERCE:%.*]] = alloca { i64, i32 }, align 8 +// CHECK-NEXT: [[BLOCK_DIM_COERCE:%.*]] = alloca { i64, i32 }, align 8 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4 +// CHECK-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR3]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 0 +// CHECK-NEXT: store i64 24, ptr [[TMP4]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 1 +// CHECK-NEXT: store ptr [[KERNEL_ARGS]], ptr [[TMP5]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 2 +// CHECK-NEXT: store ptr null, ptr [[TMP6]], align 16 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP8]], align 16 +// CHECK-NEXT: [[TMP9:%.*]] = load 
i16, ptr [[DOTADDR1]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK-NEXT: store i16 [[TMP9]], ptr [[TMP10]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTADDR2]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK-NEXT: store ptr [[TMP11]], ptr [[TMP12]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTADDR3]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK-NEXT: store ptr [[TMP13]], ptr [[TMP14]], align 16 +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__llvmPopCallConfiguration(ptr [[GRID_DIM]], ptr [[BLOCK_DIM]], ptr [[SHMEM_SIZE]], ptr [[STREAM]]) +// CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[SHMEM_SIZE]], align 8 +// CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[STREAM]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[GRID_DIM_COERCE]], ptr align 8 [[GRID_DIM]], i64 12, i1 false) +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[GRID_DIM_COERCE]], i32 0, i32 0 +// CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP18]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[GRID_DIM_COERCE]], i32 0, i32 1 +// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[BLOCK_DIM_COERCE]], ptr align 8 [[BLOCK_DIM]], i64 12, i1 false) +// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[BLOCK_DIM_COERCE]], i32 0, i32 0 +// CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP22]], align 8 +// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[BLOCK_DIM_COERCE]], i32 0, i32 1 +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 8 +// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @llvmLaunchKernel(ptr noundef @_Z18__device_stub__fooisPvS_, i64 [[TMP19]], i32 
[[TMP21]], i64 [[TMP23]], i32 [[TMP25]], ptr noundef [[KERNEL_LAUNCH_PARAMS]], i64 noundef [[TMP16]], ptr noundef [[TMP17]]) +// CHECK-NEXT: br label %[[SETUP_END:.*]] +// CHECK: [[SETUP_END]]: +// CHECK-NEXT: ret void +// +__global__ void foo(int, short, void *, void *) {} + +// CHECK-LABEL: define dso_local void @_Z5test1Pv( +// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 4 +// CHECK-NEXT: [[AGG_TMP1:%.*]] = alloca [[STRUCT_DIM3]], align 4 +// CHECK-NEXT: [[AGG_TMP_COERCE:%.*]] = alloca { i64, i32 }, align 4 +// CHECK-NEXT: [[AGG_TMP1_COERCE:%.*]] = alloca { i64, i32 }, align 4 +// CHECK-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN4dim3C2Ej(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP]], i32 noundef 3) +// CHECK-NEXT: call void @_ZN4dim3C2Ej(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP1]], i32 noundef 7) +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP_COERCE]], ptr align 4 [[AGG_TMP]], i64 12, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP_COERCE]], i32 0, i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[TMP0]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP_COERCE]], i32 0, i32 1 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP1_COERCE]], ptr align 4 [[AGG_TMP1]], i64 12, i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP1_COERCE]], i32 0, i32 0 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP4]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP1_COERCE]], i32 0, i32 1 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +// CHECK-NEXT: [[CALL:%.*]] 
= call i32 @__llvmPushCallConfiguration(i64 [[TMP1]], i32 [[TMP3]], i64 [[TMP5]], i32 [[TMP7]], i64 noundef 0, ptr noundef null) +// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[CALL]], 0 +// CHECK-NEXT: br i1 [[TOBOOL]], label %[[KCALL_END:.*]], label %[[KCALL_CONFIGOK:.*]] +// CHECK: [[KCALL_CONFIGOK]]: +// CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: call void @_Z18__device_stub__fooisPvS_(i32 noundef 13, i16 noundef signext 1, ptr noundef [[TMP8]], ptr noundef [[TMP9]]) #[[ATTR5:[0-9]+]] +// CHECK-NEXT: br label %[[KCALL_END]] +// CHECK: [[KCALL_END]]: +// CHECK-NEXT: ret void +// +void test1(void *Ptr) { + foo<<<3, 7>>>(13, 1, Ptr, Ptr); +} diff --git a/clang/test/Driver/cuda-via-liboffload.cu b/clang/test/Driver/cuda-via-liboffload.cu new file mode 100644 index 0000000000000..68dc963e906b2 --- /dev/null +++ b/clang/test/Driver/cuda-via-liboffload.cu @@ -0,0 +1,23 @@ +// RUN: %clang -### -target x86_64-linux-gnu -foffload-via-llvm -ccc-print-bindings \ +// RUN: --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \ +// RUN: | FileCheck -check-prefix BINDINGS %s + +// BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_BC:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[PTX_SM_35:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_35]]"], output: "[[CUBIN_SM_35:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[PTX_SM_70:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_70:.+]]"], output: "[[CUBIN_SM_70:.+]]" +// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[CUBIN_SM_35]]", "[[CUBIN_SM_70]]"], output: "[[BINARY:.+]]" +// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[BINARY]]"], 
output: "[[HOST_OBJ:.+]]" +// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out" + +// RUN: %clang -### -target x86_64-linux-gnu -foffload-via-llvm -ccc-print-bindings \ +// RUN: --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \ +// RUN: | FileCheck -check-prefix BINDINGS-DEVICE %s + +// BINDINGS-DEVICE: # "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[PTX:.+]]" +// BINDINGS-DEVICE: # "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX]]"], output: "[[CUBIN:.+]]" + +// RUN: %clang -### -target x86_64-linux-gnu -ccc-print-bindings --offload-link -foffload-via-llvm %s 2>&1 | FileCheck -check-prefix DEVICE-LINK %s + +// DEVICE-LINK: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[INPUT:.+]]"], output: "a.out" diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h index 338b56226f204..a7be3f51fac7d 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h @@ -72,7 +72,7 @@ enum class IdentFlag { #include "llvm/Frontend/OpenMP/OMPKinds.def" // Version of the kernel argument format used by the omp runtime. -#define OMP_KERNEL_ARG_VERSION 3 +#define OMP_KERNEL_ARG_VERSION 4 // Minimum version of the compiler that generates a kernel dynamic pointer. 
#define OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR 3 diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index fe09bb8177c28..0be3827185e2e 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -90,7 +90,7 @@ __OMP_ARRAY_TYPE(Int32Arr3, Int32, 3) __OMP_STRUCT_TYPE(Ident, ident_t, false, Int32, Int32, Int32, Int32, Int8Ptr) __OMP_STRUCT_TYPE(KernelArgs, __tgt_kernel_arguments, false, Int32, Int32, VoidPtrPtr, VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr, VoidPtrPtr, - Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32) + Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32, VoidPtr) __OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, false, Int8Ptr) __OMP_STRUCT_TYPE(DependInfo, kmp_dep_info, false, SizeTy, SizeTy, Int8) __OMP_STRUCT_TYPE(Task, kmp_task_ompbuilder_t, false, VoidPtr, VoidPtr, Int32, VoidPtr, VoidPtr) diff --git a/llvm/include/llvm/Transforms/Instrumentation/GPUSan.h b/llvm/include/llvm/Transforms/Instrumentation/GPUSan.h new file mode 100644 index 0000000000000..1201b3c47ac06 --- /dev/null +++ b/llvm/include/llvm/Transforms/Instrumentation/GPUSan.h @@ -0,0 +1,26 @@ +//===- Transforms/Instrumentation/GPUSan.h ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file provides the interface for LLVM's PGO Instrumentation lowering +/// pass. 
+//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_GPUSAN_H +#define LLVM_TRANSFORMS_INSTRUMENTATION_GPUSAN_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class GPUSanPass : public PassInfoMixin<GPUSanPass> { +public: + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_INSTRUMENTATION_GPUSAN_H diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 92213e19c9d9d..ba73d48bc9a8b 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -496,6 +496,7 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, auto Int32Ty = Type::getInt32Ty(Builder.getContext()); Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, 3)); Value *Flags = Builder.getInt64(KernelArgs.HasNoWait); + Value *AsyncInfoQueue = Constant::getNullValue(Builder.getPtrTy()); Value *NumTeams3D = Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams, {0}); @@ -514,7 +515,8 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, Flags, NumTeams3D, NumThreads3D, - KernelArgs.DynCGGroupMem}; + KernelArgs.DynCGGroupMem, + AsyncInfoQueue}; } void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) { diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 19e8a8ab68a73..42fdb2415c2fe 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -174,6 +174,7 @@ #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h" #include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h" #include "llvm/Transforms/Instrumentation/GCOVProfiler.h" +#include "llvm/Transforms/Instrumentation/GPUSan.h" #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" #include "llvm/Transforms/Instrumentation/InstrOrderFile.h" #include
"llvm/Transforms/Instrumentation/InstrProfiling.h" diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 926515c9508a9..410aedd57dc9a 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -71,6 +71,7 @@ #include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/Instrumentation/CGProfile.h" #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h" +#include "llvm/Transforms/Instrumentation/GPUSan.h" #include "llvm/Transforms/Instrumentation/InstrOrderFile.h" #include "llvm/Transforms/Instrumentation/InstrProfiling.h" #include "llvm/Transforms/Instrumentation/MemProfiler.h" @@ -163,6 +164,10 @@ static cl::opt cl::Hidden, cl::desc("Enable inline deferral during PGO")); +static cl::opt + EnableOffloadSanitizer("enable-offload-sanitizer", cl::init(false), + cl::Hidden, cl::desc("Enable offload sanitizer")); + static cl::opt EnableModuleInliner("enable-module-inliner", cl::init(false), cl::Hidden, cl::desc("Enable module inliner")); @@ -1750,6 +1755,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // in ICP. MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); + if (EnableOffloadSanitizer) + MPM.addPass(GPUSanPass()); + invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); // Emit annotation remarks. @@ -1828,6 +1836,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // pipeline). MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); + if (EnableOffloadSanitizer) + MPM.addPass(GPUSanPass()); + invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); // Emit annotation remarks. @@ -2040,6 +2051,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, if (PTO.CallGraphProfile) MPM.addPass(CGProfilePass(/*InLTOPostLink=*/true)); + if (EnableOffloadSanitizer) + MPM.addPass(GPUSanPass()); + invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); // Emit annotation remarks. 
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 60c517790bcab..741c1d3e1842a 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -78,6 +78,7 @@ MODULE_PASS("inliner-wrapper-no-mandatory-first", MODULE_PASS("insert-gcov-profiling", GCOVProfilerPass()) MODULE_PASS("instrorderfile", InstrOrderFilePass()) MODULE_PASS("instrprof", InstrProfilingLoweringPass()) +MODULE_PASS("gpusan", GPUSanPass()) MODULE_PASS("ctx-instr-lower", PGOCtxProfLoweringPass()) MODULE_PASS("invalidate", InvalidateAllAnalysesPass()) MODULE_PASS("iroutliner", IROutlinerPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 6e7d34f5adaa3..a310038543532 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -2043,7 +2043,9 @@ static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL, // // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some // address spaces have non-zero null values. - auto SrcPtrKB = computeKnownBits(V, DL).trunc(DL.getPointerSizeInBits(AS)); + auto SrcPtrKB = computeKnownBits(V, DL); + if (SrcPtrKB.getBitWidth() > DL.getPointerSizeInBits(AS)) + SrcPtrKB = SrcPtrKB.trunc(DL.getPointerSizeInBits(AS)); const auto NullVal = TM.getNullPointerValue(AS); assert((NullVal == 0 || NullVal == -1) && "don't know how to check for this null value!"); diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 05a38cdd7d7b6..ebdc2c3682aab 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -7536,7 +7536,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { // Collect all tail calls in the function as we cannot allow new allocas to // escape into tail recursion. 
// TODO: Be smarter about new allocas escaping into tail calls. - SmallVector TailCalls; + SmallVector TailCalls; bool UsedAssumedInformation = false; if (!A.checkForAllInstructions( [&](Instruction &I) { @@ -7574,8 +7574,9 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { AI, Arg->getType(), "", IP); Arg->replaceAllUsesWith(AI); - for (CallInst *CI : TailCalls) - CI->setTailCall(false); + for (auto &CI : TailCalls) + if (CI) + cast(CI)->setTailCall(false); }; // Callback to repair a call site of the associated function. The elements diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index e3a4821b8226b..89ce01907deb7 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -620,11 +620,13 @@ struct OMPInformationCache : public InformationCache { // functions, except if `optnone` is present. if (isOpenMPDevice(M)) { for (Function &F : M) { - for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"}) - if (F.hasFnAttribute(Attribute::NoInline) && - F.getName().starts_with(Prefix) && - !F.hasFnAttribute(Attribute::OptimizeNone)) - F.removeFnAttr(Attribute::NoInline); + for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"}) { + if (!F.getName().starts_with(Prefix) || + F.hasFnAttribute(Attribute::OptimizeNone)) + continue; + F.removeFnAttr(Attribute::NoInline); + F.addFnAttr(Attribute::AlwaysInline); + } } } diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt index 8d345d394b51a..9aa530229ab7e 100644 --- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt @@ -6,6 +6,7 @@ add_llvm_component_library(LLVMInstrumentation DataFlowSanitizer.cpp GCOVProfiling.cpp BlockCoverageInference.cpp + GPUSan.cpp MemProfiler.cpp MemorySanitizer.cpp IndirectCallPromotion.cpp diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp 
b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp new file mode 100644 index 0000000000000..611ec9c888782 --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp @@ -0,0 +1,904 @@ +//===-- GPUSan.cpp - GPU sanitizer ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation/GPUSan.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/StringSaver.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "gpusan" + +cl::opt UseTags( + "gpusan-use-tags", + cl::desc( + "Use tags to detect use after if the 
number of allocations is large"), + cl::init(false)); + +namespace llvm { + +struct LocationInfoTy { + uint64_t LineNo = 0; + uint64_t ColumnNo = 0; + uint64_t ParentIdx = -1; + StringRef FileName; + StringRef FunctionName; + bool operator==(const LocationInfoTy &RHS) const { + return LineNo == RHS.LineNo && ColumnNo == RHS.ColumnNo && + FileName == RHS.FileName && FunctionName == RHS.FunctionName; + } +}; +template <> struct DenseMapInfo { + static LocationInfoTy EmptyKey; + static LocationInfoTy TombstoneKey; + static inline LocationInfoTy *getEmptyKey() { return &EmptyKey; } + + static inline LocationInfoTy *getTombstoneKey() { return &TombstoneKey; } + + static unsigned getHashValue(const LocationInfoTy *LI) { + unsigned Hash = DenseMapInfo::getHashValue(LI->LineNo); + Hash = detail::combineHashValue( + Hash, DenseMapInfo::getHashValue(LI->ColumnNo)); + Hash = detail::combineHashValue( + Hash, DenseMapInfo::getHashValue(LI->FileName)); + Hash = detail::combineHashValue( + Hash, DenseMapInfo::getHashValue(LI->FunctionName)); + return Hash; + } + + static bool isEqual(const LocationInfoTy *LHS, const LocationInfoTy *RHS) { + return *LHS == *RHS; + } +}; +LocationInfoTy DenseMapInfo::EmptyKey = + LocationInfoTy{(uint64_t)-1}; +LocationInfoTy DenseMapInfo::TombstoneKey = + LocationInfoTy{(uint64_t)-2}; +} // namespace llvm + +namespace { + +enum PtrOrigin { + UNKNOWN, + LOCAL, + GLOBAL, + SYSTEM, + NONE, +}; + +static std::string getSuffix(PtrOrigin PO) { + switch (PO) { + case UNKNOWN: + return ""; + case LOCAL: + return "_local"; + case GLOBAL: + return "_global"; + default: + break; + } + llvm_unreachable("Bad pointer origin!"); +} + +static StringRef prettifyFunctionName(StringSaver &SS, StringRef Name) { + if (Name.ends_with(".internalized")) + return SS.save(Name.drop_back(sizeof("internalized")) + " (internalized)"); + if (!Name.starts_with("__omp_offloading_")) + return Name; + Name = Name.drop_front(sizeof("__omp_offloading_")); + auto It = 
Name.find_first_of("_"); + if (It != StringRef::npos && It + 1 < Name.size()) + Name = Name.drop_front(It + 1); + It = Name.find_first_of("_"); + if (It != StringRef::npos && It + 1 < Name.size()) + Name = Name.drop_front(It + 1); + if (Name.ends_with("_debug__")) + Name = Name.drop_back(sizeof("debug__")); + if (Name.ends_with("_debug___omp_outlined_debug__")) + Name = Name.drop_back(sizeof("debug___omp_outlined_debug__")); + It = Name.find_last_of("_"); + if (It == StringRef::npos || It + 1 >= Name.size()) + return Name; + if (Name[It + 1] != 'l') + return Name; + int64_t KernelLineNo = 0; + Name.take_back(Name.size() - It - + /* '_' and 'l' */ 2) + .getAsInteger(10, KernelLineNo); + if (KernelLineNo) + Name = SS.save("omp target (" + Name.take_front(It).str() + ":" + + std::to_string(KernelLineNo) + ")"); + return Name; +} + +class GPUSanImpl final { +public: + GPUSanImpl(Module &M, FunctionAnalysisManager &FAM) + : M(M), FAM(FAM), Ctx(M.getContext()) {} + + bool instrument(); + +private: + bool instrumentGlobals(); + bool instrumentFunction(Function &Fn); + Value *instrumentAllocation(Instruction &I, Value &Size, FunctionCallee Fn, + PtrOrigin PO); + Value *instrumentAllocaInst(LoopInfo &LI, AllocaInst &AI); + void instrumentAccess(LoopInfo &LI, Instruction &I, int PtrIdx, + Type &AccessTy, bool IsRead); + void instrumentLoadInst(LoopInfo &LI, LoadInst &LoadI); + void instrumentStoreInst(LoopInfo &LI, StoreInst &StoreI); + void instrumentGEPInst(LoopInfo &LI, GetElementPtrInst &GEP); + bool instrumentCallInst(LoopInfo &LI, CallInst &CI); + void + instrumentReturns(SmallVectorImpl> &Allocas, + SmallVectorImpl &Returns); + + Value *getPC(IRBuilder<> &IRB); + Value *getFunctionName(IRBuilder<> &IRB); + Value *getFileName(IRBuilder<> &IRB); + Value *getLineNo(IRBuilder<> &IRB); + + void getAllocationInfo(Function &Fn, PtrOrigin PO, Value &Object, + Value *&Start, Value *&Length, Value *&Tag); + PtrOrigin getPtrOrigin(LoopInfo &LI, Value *Ptr, + const Value **Object 
= nullptr); + + FunctionCallee getOrCreateFn(FunctionCallee &FC, StringRef Name, Type *RetTy, + ArrayRef ArgTys) { + if (!FC) { + auto *NewAllocationFnTy = FunctionType::get(RetTy, ArgTys, false); + FC = M.getOrInsertFunction(Name, NewAllocationFnTy); + } + return FC; + } + + PointerType *getPtrTy(PtrOrigin PO) { + if (PO == PtrOrigin::LOCAL) + return PointerType::get(Ctx, 5); + return PtrTy; + } + + FunctionCallee getNewFn(PtrOrigin PO) { + assert(PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn(NewFn[PO], "ompx_new" + getSuffix(PO), getPtrTy(PO), + {getPtrTy(PO), Int64Ty, Int64Ty, Int64Ty}); + } + FunctionCallee getFreeFn(PtrOrigin PO) { + assert(PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn(FreeFn[PO], "ompx_free" + getSuffix(PO), VoidTy, + {getPtrTy(PO), Int64Ty}); + } + FunctionCallee getFreeNLocalFn() { + return getOrCreateFn(FreeNLocalFn, "ompx_free_local_n", VoidTy, {Int32Ty}); + } + FunctionCallee getCheckFn(PtrOrigin PO) { + assert(PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn(CheckFn[PO], "ompx_check" + getSuffix(PO), + getPtrTy(PO), + {getPtrTy(PO), Int64Ty, Int64Ty, Int64Ty}); + } + FunctionCallee getCheckWithBaseFn(PtrOrigin PO) { + assert(PO >= LOCAL && PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn(CheckWithBaseFn[PO], + "ompx_check_with_base" + getSuffix(PO), getPtrTy(PO), + {getPtrTy(PO), getPtrTy(PO), Int64Ty, Int32Ty, Int64Ty, + Int64Ty, Int64Ty}); + } + FunctionCallee getAllocationInfoFn(PtrOrigin PO) { + assert(PO >= LOCAL && PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn( + AllocationInfoFn[PO], "ompx_get_allocation_info" + getSuffix(PO), + StructType::create({getPtrTy(PO), Int64Ty, Int32Ty}), {getPtrTy(PO)}); + } + FunctionCallee getGEPFn(PtrOrigin PO) { + assert(PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn(GEPFn[PO], "ompx_gep" + getSuffix(PO), getPtrTy(PO), + {getPtrTy(PO), 
Int64Ty, Int64Ty}); + } + FunctionCallee getUnpackFn(PtrOrigin PO) { + assert(PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn(UnpackFn[PO], "ompx_unpack" + getSuffix(PO), + getPtrTy(PO), {getPtrTy(PO), Int64Ty}); + } + FunctionCallee getLifetimeStart() { + return getOrCreateFn(LifetimeStartFn, "ompx_lifetime_start", VoidTy, + {getPtrTy(LOCAL), Int64Ty}); + } + FunctionCallee getLifetimeEnd() { + return getOrCreateFn(LifetimeEndFn, "ompx_lifetime_end", VoidTy, + {getPtrTy(LOCAL), Int64Ty}); + } + FunctionCallee getLeakCheckFn() { + FunctionCallee LeakCheckFn; + return getOrCreateFn(LeakCheckFn, "ompx_leak_check", VoidTy, {}); + } + FunctionCallee getThreadIdFn() { + return getOrCreateFn(ThreadIDFn, "ompx_global_thread_id", Int32Ty, {}); + } + + Module &M; + FunctionAnalysisManager &FAM; + LLVMContext &Ctx; + bool HasAllocas; + GlobalVariable *LocationsArray; + SmallSetVector AmbiguousCalls; + + Type *VoidTy = Type::getVoidTy(Ctx); + Type *IntptrTy = M.getDataLayout().getIntPtrType(Ctx); + PointerType *PtrTy = PointerType::getUnqual(Ctx); + IntegerType *Int8Ty = Type::getInt8Ty(Ctx); + IntegerType *Int32Ty = Type::getInt32Ty(Ctx); + IntegerType *Int64Ty = Type::getInt64Ty(Ctx); + + const DataLayout &DL = M.getDataLayout(); + + FunctionCallee NewFn[3]; + FunctionCallee GEPFn[3]; + FunctionCallee FreeFn[3]; + FunctionCallee CheckFn[3]; + FunctionCallee CheckWithBaseFn[3]; + FunctionCallee AllocationInfoFn[3]; + FunctionCallee UnpackFn[3]; + FunctionCallee LifetimeEndFn; + FunctionCallee LifetimeStartFn; + FunctionCallee FreeNLocalFn; + FunctionCallee ThreadIDFn; + + StringMap GlobalStringMap; + struct AllocationInfoTy { + Value *Start; + Value *Length; + Value *Tag; + }; + DenseMap, AllocationInfoTy> AllocationInfoMap; + + DenseMap> + LocationMap; + + const std::pair + addLocationInfo(LocationInfoTy *LI, bool &IsNew) { + auto It = LocationMap.insert({LI, LocationMap.size()}); + IsNew = It.second; + if (!IsNew) + delete LI; + return 
{It.first->first, It.first->second}; + } + + void buildCallTreeInfo(Function &Fn, LocationInfoTy &LI); + ConstantInt *getSourceIndex(Instruction &I, LocationInfoTy *LastLI = nullptr); + + uint64_t addString(StringRef S) { + const auto &It = UniqueStrings.insert({S, ConcatenatedString.size()}); + if (It.second) { + ConcatenatedString += S; + ConcatenatedString.push_back('\0'); + } + return It.first->second; + }; + + void encodeLocationInfo(LocationInfoTy &LI, uint64_t Idx) { + StringRef FunctionName = LI.FunctionName; + if (LI.ParentIdx == (decltype(LI.ParentIdx))-1) + FunctionName = prettifyFunctionName(SS, FunctionName); + + auto FuncIdx = addString(FunctionName); + auto FileIdx = addString(LI.FileName); + if (LocationEncoding.size() < (Idx + 1) * 5) + LocationEncoding.resize((Idx + 1) * 5); + LocationEncoding[Idx * 5 + 0] = ConstantInt::get(Int64Ty, FuncIdx); + LocationEncoding[Idx * 5 + 1] = ConstantInt::get(Int64Ty, FileIdx); + LocationEncoding[Idx * 5 + 2] = ConstantInt::get(Int64Ty, LI.LineNo); + LocationEncoding[Idx * 5 + 3] = ConstantInt::get(Int64Ty, LI.ColumnNo); + LocationEncoding[Idx * 5 + 4] = ConstantInt::get(Int64Ty, LI.ParentIdx); + } + + SmallVector LocationEncoding; + std::string ConcatenatedString; + DenseMap StringIndexMap; + DenseMap UniqueStrings; + + BumpPtrAllocator BPA; + StringSaver SS = StringSaver(BPA); +}; + +} // end anonymous namespace + +ConstantInt *GPUSanImpl::getSourceIndex(Instruction &I, + LocationInfoTy *LastLI) { + LocationInfoTy *LI = new LocationInfoTy(); + auto *DILoc = I.getDebugLoc().get(); + + auto FillLI = [&](LocationInfoTy &LI, DILocation &DIL) { + LI.FileName = DIL.getFilename(); + if (LI.FileName.empty()) + LI.FileName = I.getFunction()->getSubprogram()->getFilename(); + LI.FunctionName = DIL.getSubprogramLinkageName(); + if (LI.FunctionName.empty()) + LI.FunctionName = I.getFunction()->getName(); + LI.LineNo = DIL.getLine(); + LI.ColumnNo = DIL.getColumn(); + }; + + DILocation *ParentDILoc = nullptr; + if (DILoc) { 
+ FillLI(*LI, *DILoc); + ParentDILoc = DILoc->getInlinedAt(); + } else { + LI->FunctionName = I.getFunction()->getName(); + } + + bool IsNew; + uint64_t Idx; + std::tie(LI, Idx) = addLocationInfo(LI, IsNew); + if (LastLI) + LastLI->ParentIdx = Idx; + if (!IsNew) + return ConstantInt::get(Int64Ty, Idx); + + uint64_t CurIdx = Idx; + LocationInfoTy *CurLI = LI; + while (ParentDILoc) { + auto *ParentLI = new LocationInfoTy(); + FillLI(*ParentLI, *ParentDILoc); + uint64_t ParentIdx; + std::tie(ParentLI, ParentIdx) = addLocationInfo(ParentLI, IsNew); + CurLI->ParentIdx = ParentIdx; + if (!IsNew) + break; + encodeLocationInfo(*CurLI, CurIdx); + CurLI = ParentLI; + CurIdx = ParentIdx; + ParentDILoc = ParentDILoc->getInlinedAt(); + } + + Function &Fn = *I.getFunction(); + buildCallTreeInfo(Fn, *CurLI); + + encodeLocationInfo(*CurLI, CurIdx); + + return ConstantInt::get(Int64Ty, Idx); +} + +void GPUSanImpl::buildCallTreeInfo(Function &Fn, LocationInfoTy &LI) { + if (Fn.hasFnAttribute("kernel")) + return; + SmallVector Calls; + for (auto &U : Fn.uses()) { + auto *CB = dyn_cast(U.getUser()); + if (!CB) + continue; + if (!CB->isCallee(&U)) + continue; + Calls.push_back(CB); + } + if (Calls.size() == 1) { + getSourceIndex(*Calls.back(), &LI); + return; + } + LI.ParentIdx = -2; + AmbiguousCalls.insert(Calls.begin(), Calls.end()); +} + +Value *GPUSanImpl::getPC(IRBuilder<> &IRB) { + return IRB.CreateIntrinsic(Int64Ty, Intrinsic::amdgcn_s_getpc, {}, nullptr, + "PC"); +} +Value *GPUSanImpl::getFunctionName(IRBuilder<> &IRB) { + const auto &DLoc = IRB.getCurrentDebugLocation(); + StringRef FnName = IRB.GetInsertPoint()->getFunction()->getName(); + if (DLoc && DLoc.get()) { + StringRef SubprogramName = DLoc.get()->getSubprogramLinkageName(); + if (!SubprogramName.empty()) + FnName = SubprogramName; + } + StringRef Name = FnName.take_back(255); + Value *&NameVal = GlobalStringMap[Name]; + if (!NameVal) + NameVal = IRB.CreateAddrSpaceCast( + IRB.CreateGlobalStringPtr(Name, "", 
DL.getDefaultGlobalsAddressSpace(), + &M), + PtrTy); + return NameVal; +} +Value *GPUSanImpl::getFileName(IRBuilder<> &IRB) { + const auto &DLoc = IRB.getCurrentDebugLocation(); + if (!DLoc || DLoc->getFilename().empty()) + return ConstantPointerNull::get(PtrTy); + StringRef Name = DLoc->getFilename().take_back(255); + Value *&NameVal = GlobalStringMap[Name]; + if (!NameVal) + NameVal = IRB.CreateAddrSpaceCast( + IRB.CreateGlobalStringPtr(Name, "", DL.getDefaultGlobalsAddressSpace(), + &M), + PtrTy); + return NameVal; +} +Value *GPUSanImpl::getLineNo(IRBuilder<> &IRB) { + const auto &DLoc = IRB.getCurrentDebugLocation(); + if (!DLoc) + return Constant::getNullValue(Int64Ty); + return ConstantInt::get(Int64Ty, DLoc.getLine()); +} + +void GPUSanImpl::getAllocationInfo(Function &Fn, PtrOrigin PO, Value &Object, + Value *&Start, Value *&Length, Value *&Tag) { + auto &It = AllocationInfoMap[{&Fn, &Object}]; + if (!It.Start) { + auto *IP = dyn_cast(&Object); + if (IP) + IP = IP->getNextNode(); + else + IP = &*Fn.getEntryBlock().getFirstNonPHIOrDbgOrAlloca(); + IRBuilder<> IRB(IP); + auto *CB = IRB.CreateCall(getAllocationInfoFn(PO), + {IRB.CreateAddrSpaceCast(&Object, getPtrTy(PO))}); + It.Start = IRB.CreateExtractValue(CB, {0}); + It.Length = IRB.CreateExtractValue(CB, {1}); + It.Tag = IRB.CreateExtractValue(CB, {2}); + } + Start = It.Start; + Length = It.Length; + Tag = It.Tag; +} + +PtrOrigin GPUSanImpl::getPtrOrigin(LoopInfo &LI, Value *Ptr, + const Value **Object) { + SmallVector Objects; + getUnderlyingObjects(Ptr, Objects, &LI); + if (Object && Objects.size() == 1) + *Object = Objects.front(); + PtrOrigin PO = NONE; + for (auto *Obj : Objects) { + PtrOrigin ObjPO = HasAllocas ? 
UNKNOWN : GLOBAL; + if (isa(Obj)) { + ObjPO = LOCAL; + } else if (isa(Obj)) { + ObjPO = GLOBAL; + } else if (auto *II = dyn_cast(Obj)) { + if (II->getIntrinsicID() == Intrinsic::amdgcn_implicitarg_ptr || + II->getIntrinsicID() == Intrinsic::amdgcn_dispatch_ptr) + return SYSTEM; + } else if (auto *CI = dyn_cast(Obj)) { + if (auto *Callee = CI->getCalledFunction()) + if (Callee->getName().starts_with("ompx_")) { + if (Callee->getName().ends_with("_global")) + ObjPO = GLOBAL; + else if (Callee->getName().ends_with("_local")) + ObjPO = LOCAL; + } + } else if (auto *Arg = dyn_cast(Obj)) { + if (Arg->getParent()->hasFnAttribute("kernel")) + ObjPO = GLOBAL; + } + if (PO == NONE || PO == ObjPO) { + PO = ObjPO; + } else { + return UNKNOWN; + } + } + return PO; +} + +bool GPUSanImpl::instrumentGlobals() { + Function *DtorFn = + Function::Create(FunctionType::get(VoidTy, false), + GlobalValue::PrivateLinkage, "__san.dtor", &M); + BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", DtorFn); + IRBuilder<> IRB(Entry); + IRB.CreateCall(getLeakCheckFn()); + IRB.CreateRetVoid(); + appendToGlobalDtors(M, DtorFn, 0, nullptr); + + return true; + + Function *DTorFn; + std::tie(DTorFn, std::ignore) = getOrCreateSanitizerCtorAndInitFunctions( + M, "ompx.ctor", "ompx.init", + /*InitArgTypes=*/{}, + /*InitArgs=*/{}, + // This callback is invoked when the functions are created the first + // time. 
Hook them into the global ctors list in that case: + [&](Function *Ctor, FunctionCallee) { + appendToGlobalCtors(M, Ctor, 0, Ctor); + }); + return true; +} + +Value *GPUSanImpl::instrumentAllocation(Instruction &I, Value &Size, + FunctionCallee Fn, PtrOrigin PO) { + IRBuilder<> IRB(I.getNextNode()); + Value *PlainI = IRB.CreatePointerBitCastOrAddrSpaceCast(&I, getPtrTy(PO)); + static int AllocationId = 1; + auto *CB = + IRB.CreateCall(Fn, + {PlainI, &Size, ConstantInt::get(Int64Ty, AllocationId++), + getSourceIndex(I)}, + I.getName() + ".san"); + SmallVector Lifetimes; + I.replaceUsesWithIf( + IRB.CreatePointerBitCastOrAddrSpaceCast(CB, I.getType()), [&](Use &U) { + if (auto *LT = dyn_cast(U.getUser())) { + Lifetimes.push_back(LT); + return false; + } + return U.getUser() != PlainI && U.getUser() != CB; + }); + if (Lifetimes.empty()) + return CB; + + CB->setArgOperand(1, ConstantInt::get(Int64Ty, 0)); + for (auto *LT : Lifetimes) { + if (LT->getIntrinsicID() == Intrinsic::lifetime_start) { + IRB.SetInsertPoint(LT); + IRB.CreateCall(getLifetimeStart(), {CB, LT->getArgOperand(0)}); + } else { + IRB.SetInsertPoint(LT); + IRB.CreateCall(getLifetimeEnd(), {CB, LT->getArgOperand(0)}); + } + } + return CB; +} + +Value *GPUSanImpl::instrumentAllocaInst(LoopInfo &LI, AllocaInst &AI) { + auto SizeOrNone = AI.getAllocationSize(DL); + if (!SizeOrNone) + llvm_unreachable("TODO"); + Value *Size = ConstantInt::get(Int64Ty, *SizeOrNone); + return instrumentAllocation(AI, *Size, getNewFn(LOCAL), LOCAL); +} + +void GPUSanImpl::instrumentAccess(LoopInfo &LI, Instruction &I, int PtrIdx, + Type &AccessTy, bool IsRead) { + Value *PtrOp = I.getOperand(PtrIdx); + const Value *Object = nullptr; + PtrOrigin PO = getPtrOrigin(LI, PtrOp, &Object); + if (PO > GLOBAL) + return; + + Value *Start = nullptr; + Value *Length = nullptr; + Value *Tag = nullptr; + if (PO != UNKNOWN && Object) + getAllocationInfo(*I.getFunction(), PO, *const_cast(Object), Start, + Length, Tag); + + static int32_t 
ReadAccessId = -1; + static int32_t WriteAccessId = 1; + const int32_t &AccessId = IsRead ? ReadAccessId-- : WriteAccessId++; + + auto TySize = DL.getTypeStoreSize(&AccessTy); + assert(!TySize.isScalable()); + Value *Size = ConstantInt::get(Int64Ty, TySize.getFixedValue()); + IRBuilder<> IRB(&I); + Value *PlainPtrOp = + IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, getPtrTy(PO)); + CallInst *CB; + if (Start) { + CB = + IRB.CreateCall(getCheckWithBaseFn(PO), + {PlainPtrOp, Start, Length, Tag, Size, + ConstantInt::get(Int64Ty, AccessId), getSourceIndex(I)}, + I.getName() + ".san"); + } else { + CB = IRB.CreateCall(getCheckFn(PO), + {PlainPtrOp, Size, ConstantInt::get(Int64Ty, AccessId), + getSourceIndex(I)}, + I.getName() + ".san"); + } + I.setOperand(PtrIdx, + IRB.CreatePointerBitCastOrAddrSpaceCast(CB, PtrOp->getType())); +} + +void GPUSanImpl::instrumentLoadInst(LoopInfo &LI, LoadInst &LoadI) { + instrumentAccess(LI, LoadI, LoadInst::getPointerOperandIndex(), + *LoadI.getType(), + /*IsRead=*/true); +} + +void GPUSanImpl::instrumentStoreInst(LoopInfo &LI, StoreInst &StoreI) { + instrumentAccess(LI, StoreI, StoreInst::getPointerOperandIndex(), + *StoreI.getValueOperand()->getType(), /*IsRead=*/false); +} + +void GPUSanImpl::instrumentGEPInst(LoopInfo &LI, GetElementPtrInst &GEP) { + Value *PtrOp = GEP.getPointerOperand(); + PtrOrigin PO = getPtrOrigin(LI, PtrOp); + if (PO > GLOBAL) + return; + + GEP.setOperand(GetElementPtrInst::getPointerOperandIndex(), + Constant::getNullValue(PtrOp->getType())); + IRBuilder<> IRB(GEP.getNextNode()); + Value *PlainPtrOp = + IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, getPtrTy(PO)); + auto *CB = IRB.CreateCall(getGEPFn(PO), + {PlainPtrOp, UndefValue::get(Int64Ty), getPC(IRB)}, + GEP.getName() + ".san"); + GEP.replaceAllUsesWith( + IRB.CreatePointerBitCastOrAddrSpaceCast(CB, GEP.getType())); + Value *Offset = + new PtrToIntInst(&GEP, Int64Ty, GEP.getName() + ".san.offset", CB); + CB->setArgOperand(1, Offset); +} + +bool 
GPUSanImpl::instrumentCallInst(LoopInfo &LI, CallInst &CI) { + bool Changed = false; + if (isa(CI)) + return Changed; + if (auto *Fn = CI.getCalledFunction()) { + if (Fn->getName().starts_with("__kmpc_target_init")) + return Changed; + if ((Fn->isDeclaration() || Fn->getName().starts_with("__kmpc") || + Fn->getName().starts_with("rpc_")) && + !Fn->getName().starts_with("ompx")) { + IRBuilder<> IRB(&CI); + for (int I = 0, E = CI.arg_size(); I != E; ++I) { + Value *Op = CI.getArgOperand(I); + if (!Op->getType()->isPointerTy()) + continue; + PtrOrigin PO = getPtrOrigin(LI, Op); + if (PO > GLOBAL) + continue; + Value *PlainOp = + IRB.CreatePointerBitCastOrAddrSpaceCast(Op, getPtrTy(PO)); + auto *CB = IRB.CreateCall(getUnpackFn(PO), {PlainOp, getPC(IRB)}, + Op->getName() + ".unpack"); + CI.setArgOperand( + I, IRB.CreatePointerBitCastOrAddrSpaceCast(CB, Op->getType())); + Changed = true; + } + } + } + return Changed; +} + +bool GPUSanImpl::instrumentFunction(Function &Fn) { + if (Fn.isDeclaration()) + return false; + + bool Changed = false; + LoopInfo &LI = FAM.getResult(Fn); + SmallVector> Allocas; + SmallVector Returns; + SmallVector Loads; + SmallVector Stores; + SmallVector Calls; + SmallVector GEPs; + + for (auto &I : instructions(Fn)) { + switch (I.getOpcode()) { + case Instruction::Alloca: { + AllocaInst &AI = cast(I); + Allocas.push_back({&AI, nullptr}); + Changed = true; + break; + } + case Instruction::Load: + Loads.push_back(&cast(I)); + Changed = true; + break; + case Instruction::Store: + Stores.push_back(&cast(I)); + Changed = true; + break; + case Instruction::GetElementPtr: + GEPs.push_back(&cast(I)); + Changed = true; + break; + case Instruction::Call: { + auto &CI = cast(I); + Calls.push_back(&CI); + if (CI.isIndirectCall()) + AmbiguousCalls.insert(&CI); + break; + } + case Instruction::Ret: + Returns.push_back(&cast(I)); + break; + default: + break; + } + } + + for (auto *Load : Loads) + instrumentLoadInst(LI, *Load); + for (auto *Store : Stores) + 
instrumentStoreInst(LI, *Store); + for (auto *GEP : GEPs) + instrumentGEPInst(LI, *GEP); + for (auto *Call : Calls) + Changed |= instrumentCallInst(LI, *Call); + for (auto &It : Allocas) + It.second = instrumentAllocaInst(LI, *It.first); + + instrumentReturns(Allocas, Returns); + + return Changed; +} + +void GPUSanImpl::instrumentReturns( + SmallVectorImpl> &Allocas, + SmallVectorImpl &Returns) { + if (Allocas.empty()) + return; + for (auto *RI : Returns) { + IRBuilder<> IRB(RI); + IRB.CreateCall(getFreeNLocalFn(), + {ConstantInt::get(Int32Ty, Allocas.size())}); + } +} + +bool GPUSanImpl::instrument() { + bool Changed = instrumentGlobals(); + HasAllocas = [&]() { + for (Function &Fn : M) + for (auto &I : instructions(Fn)) + if (isa(I)) + return true; + return false; + }(); + + SmallVector Kernels; + for (Function &Fn : M) { + if (Fn.hasFnAttribute("kernel")) + Kernels.push_back(&Fn); + if (!Fn.getName().contains("ompx") && !Fn.getName().contains("__kmpc") && + !Fn.getName().starts_with("rpc_")) + if (!Fn.hasFnAttribute(Attribute::DisableSanitizerInstrumentation)) + Changed |= instrumentFunction(Fn); + } + + SmallVector AmbiguousCallsOrdered; + SmallVector AmbiguousCallsMapping; + if (LocationMap.empty()) + AmbiguousCalls.clear(); + for (size_t I = 0; I < AmbiguousCalls.size(); ++I) { + CallBase &CB = *AmbiguousCalls[I]; + AmbiguousCallsOrdered.push_back(&CB); + AmbiguousCallsMapping.push_back(getSourceIndex(CB)); + } + + uint64_t AmbiguousCallsBitWidth = + llvm::Log2_64_Ceil(AmbiguousCalls.size() + 1); + + new GlobalVariable(M, Int64Ty, /*isConstant=*/true, + GlobalValue::ExternalLinkage, + ConstantInt::get(Int64Ty, AmbiguousCallsBitWidth), + "__san.num_ambiguous_calls", nullptr, + GlobalValue::ThreadLocalMode::NotThreadLocal, 1); + + if (size_t NumAmbiguousCalls = AmbiguousCalls.size()) { + { + auto *ArrayTy = ArrayType::get(Int64Ty, NumAmbiguousCalls); + auto *GV = new GlobalVariable( + M, ArrayTy, /*isConstant=*/true, GlobalValue::ExternalLinkage, + 
ConstantArray::get(ArrayTy, AmbiguousCallsMapping), + "__san.ambiguous_calls_mapping", nullptr, + GlobalValue::ThreadLocalMode::NotThreadLocal, 4); + GV->setVisibility(GlobalValue::ProtectedVisibility); + } + + auto *ArrayTy = ArrayType::get(Int64Ty, 1024); + LocationsArray = new GlobalVariable( + M, ArrayTy, /*isConstant=*/false, GlobalValue::PrivateLinkage, + UndefValue::get(ArrayTy), "__san.calls", nullptr, + GlobalValue::ThreadLocalMode::NotThreadLocal, 3); + + auto *OldFn = M.getFunction("__san_get_location_value"); + if (OldFn) + OldFn->setName(""); + Function *LocationGetter = Function::Create( + FunctionType::get(Int64Ty, false), GlobalValue::ExternalLinkage, + "__san_get_location_value", M); + if (OldFn) { + OldFn->replaceAllUsesWith(LocationGetter); + OldFn->eraseFromParent(); + } + auto *EntryBB = BasicBlock::Create(Ctx, "entry", LocationGetter); + IRBuilder<> IRB(EntryBB); + Value *Idx = IRB.CreateCall(getThreadIdFn(), {}, "san.gtid"); + Value *Ptr = IRB.CreateGEP(Int64Ty, LocationsArray, {Idx}); + auto *LocationValue = IRB.CreateLoad(Int64Ty, Ptr); + IRB.CreateRet(LocationValue); + } + + Function *InitSharedFn = + Function::Create(FunctionType::get(VoidTy, false), + GlobalValue::PrivateLinkage, "__san.init_shared", &M); + auto *EntryBB = BasicBlock::Create(Ctx, "entry", InitSharedFn); + IRBuilder<> IRB(EntryBB); + if (!AmbiguousCalls.empty()) { + Value *Idx = IRB.CreateCall(getThreadIdFn(), {}, "san.gtid"); + Value *Ptr = IRB.CreateGEP(Int64Ty, LocationsArray, {Idx}); + IRB.CreateStore(ConstantInt::get(Int64Ty, 0), Ptr); + } + IRB.CreateRetVoid(); + + for (auto *KernelFn : Kernels) { + IRBuilder<> IRB(&*KernelFn->getEntryBlock().getFirstNonPHIOrDbgOrAlloca()); + IRB.CreateCall(InitSharedFn, {}); + } + + for (const auto &It : llvm::enumerate(AmbiguousCallsOrdered)) { + IRBuilder<> IRB(It.value()); + Value *Idx = IRB.CreateCall(getThreadIdFn(), {}, "san.gtid"); + Value *Ptr = IRB.CreateGEP(Int64Ty, LocationsArray, {Idx}); + Value *OldVal = 
IRB.CreateLoad(Int64Ty, Ptr); + Value *OldValShifted = IRB.CreateShl( + OldVal, ConstantInt::get(Int64Ty, AmbiguousCallsBitWidth)); + Value *NewVal = IRB.CreateBinOp(Instruction::Or, OldValShifted, + ConstantInt::get(Int64Ty, It.index() + 1)); + IRB.CreateStore(NewVal, Ptr); + IRB.SetInsertPoint(It.value()->getNextNode()); + IRB.CreateStore(OldVal, Ptr); + } + + auto *NamesTy = ArrayType::get(Int8Ty, ConcatenatedString.size() + 1); + auto *Names = new GlobalVariable( + M, NamesTy, /*isConstant=*/true, GlobalValue::ExternalLinkage, + ConstantDataArray::getString(Ctx, ConcatenatedString), + "__san.location_names", nullptr, + GlobalValue::ThreadLocalMode::NotThreadLocal, 4); + Names->setVisibility(GlobalValue::ProtectedVisibility); + + auto *ArrayTy = ArrayType::get(Int64Ty, LocationEncoding.size()); + auto *GV = new GlobalVariable( + M, ArrayTy, /*isConstant=*/true, GlobalValue::ExternalLinkage, + ConstantArray::get(ArrayTy, LocationEncoding), "__san.locations", nullptr, + GlobalValue::ThreadLocalMode::NotThreadLocal, 4); + GV->setVisibility(GlobalValue::ProtectedVisibility); + + M.dump(); + return Changed; +} + +PreservedAnalyses GPUSanPass::run(Module &M, ModuleAnalysisManager &AM) { + FunctionAnalysisManager &FAM = + AM.getResult(M).getManager(); + GPUSanImpl Lowerer(M, FAM); + if (!Lowerer.instrument()) + return PreservedAnalyses::all(); + LLVM_DEBUG(M.dump()); + + return PreservedAnalyses::none(); +} diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt index d88430a52b8b7..644befc532ab2 100644 --- a/offload/DeviceRTL/CMakeLists.txt +++ b/offload/DeviceRTL/CMakeLists.txt @@ -69,6 +69,7 @@ elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "auto" OR "${LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST};${LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST}") endif() list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES) +set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${LIBOMPTARGET_DEVICE_ARCHITECTURES} PARENT_SCOPE) set(include_files ${include_directory}/Allocator.h 
@@ -85,6 +86,7 @@ set(include_files set(src_files ${source_directory}/Allocator.cpp + ${source_directory}/Sanitizer.cpp ${source_directory}/Configuration.cpp ${source_directory}/Debug.cpp ${source_directory}/Kernel.cpp diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h index a28eb0fb2977e..6bb1cafac720f 100644 --- a/offload/DeviceRTL/include/Allocator.h +++ b/offload/DeviceRTL/include/Allocator.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_ALLOCATOR_H #define OMPTARGET_ALLOCATOR_H -#include "Types.h" +#include "DeviceTypes.h" // Forward declaration. struct KernelEnvironmentTy; diff --git a/offload/DeviceRTL/include/Configuration.h b/offload/DeviceRTL/include/Configuration.h index 8e6f5c89cbf24..f8b7a6c3c6c9d 100644 --- a/offload/DeviceRTL/include/Configuration.h +++ b/offload/DeviceRTL/include/Configuration.h @@ -15,7 +15,7 @@ #include "Shared/Environment.h" -#include "Types.h" +#include "DeviceTypes.h" namespace ompx { namespace config { diff --git a/offload/DeviceRTL/include/Types.h b/offload/DeviceRTL/include/DeviceTypes.h similarity index 99% rename from offload/DeviceRTL/include/Types.h rename to offload/DeviceRTL/include/DeviceTypes.h index 2e12d9da0353b..bf30ba31260f5 100644 --- a/offload/DeviceRTL/include/Types.h +++ b/offload/DeviceRTL/include/DeviceTypes.h @@ -1,4 +1,4 @@ -//===---------- Types.h - OpenMP types ---------------------------- C++ -*-===// +//===---- DeviceTypes.h - OpenMP types ---------------------------- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h new file mode 100644 index 0000000000000..7b8871a766161 --- /dev/null +++ b/offload/DeviceRTL/include/DeviceUtils.h @@ -0,0 +1,55 @@ +//===--------- DeviceUtils.h - OpenMP device runtime utility functions -- C++ +//-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_DEVICE_UTILS_H +#define OMPTARGET_DEVICERTL_DEVICE_UTILS_H + +#include "Shared/Utils.h" +#include "DeviceTypes.h" + +#pragma omp begin declare target device_type(nohost) + +namespace utils { + +/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread +/// is identified by \p Mask. +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane); + +int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width); + +int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width); + +uint64_t ballotSync(uint64_t Mask, int32_t Pred); + +/// Return \p LowBits and \p HighBits packed into a single 64 bit value. +uint64_t pack(uint32_t LowBits, uint32_t HighBits); + +/// Unpack \p Val into \p LowBits and \p HighBits. +void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits); + +/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)). +bool isSharedMemPtr(void *Ptr); + +/// Return true iff \p Ptr is pointing into (thread) local memory (AS(5)). +bool isThreadLocalMemPtr(void *Ptr); + +/// A pointer variable that has by design an `undef` value. Use with care. 
+[[clang::loader_uninitialized]] static void *const UndefPtr; + +#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true) +#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false) + +} // namespace utils + +#pragma omp end declare target + +#endif diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h index f4854ed3d1678..12244a0971f49 100644 --- a/offload/DeviceRTL/include/Interface.h +++ b/offload/DeviceRTL/include/Interface.h @@ -14,7 +14,7 @@ #include "Shared/Environment.h" -#include "Types.h" +#include "DeviceTypes.h" /// External API /// diff --git a/offload/DeviceRTL/include/LibC.h b/offload/DeviceRTL/include/LibC.h index dde86af783af9..6e02b4aca462a 100644 --- a/offload/DeviceRTL/include/LibC.h +++ b/offload/DeviceRTL/include/LibC.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_LIBC_H #define OMPTARGET_LIBC_H -#include "Types.h" +#include "DeviceTypes.h" extern "C" { diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h index 165904644dbb9..2fb87abe5418c 100644 --- a/offload/DeviceRTL/include/Mapping.h +++ b/offload/DeviceRTL/include/Mapping.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_MAPPING_H #define OMPTARGET_MAPPING_H -#include "Types.h" +#include "DeviceTypes.h" namespace ompx { diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h index 1a3490394458f..37699529e726f 100644 --- a/offload/DeviceRTL/include/State.h +++ b/offload/DeviceRTL/include/State.h @@ -16,8 +16,8 @@ #include "Debug.h" #include "Mapping.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" // Forward declaration. 
struct KernelEnvironmentTy; diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h index af9e1a673e6a2..874974cc861df 100644 --- a/offload/DeviceRTL/include/Synchronization.h +++ b/offload/DeviceRTL/include/Synchronization.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H #define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H -#include "Types.h" +#include "DeviceTypes.h" namespace ompx { diff --git a/offload/DeviceRTL/include/Utils.h b/offload/DeviceRTL/include/Utils.h deleted file mode 100644 index 82e2397b5958b..0000000000000 --- a/offload/DeviceRTL/include/Utils.h +++ /dev/null @@ -1,100 +0,0 @@ -//===--------- Utils.h - OpenMP device runtime utility functions -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_UTILS_H -#define OMPTARGET_DEVICERTL_UTILS_H - -#include "Types.h" - -#pragma omp begin declare target device_type(nohost) - -namespace ompx { -namespace utils { - -/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread -/// is identified by \p Mask. -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane); - -int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width); - -int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width); - -uint64_t ballotSync(uint64_t Mask, int32_t Pred); - -/// Return \p LowBits and \p HighBits packed into a single 64 bit value. -uint64_t pack(uint32_t LowBits, uint32_t HighBits); - -/// Unpack \p Val into \p LowBits and \p HighBits. 
-void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits); - -/// Round up \p V to a \p Boundary. -template inline Ty roundUp(Ty V, Ty Boundary) { - return (V + Boundary - 1) / Boundary * Boundary; -} - -/// Advance \p Ptr by \p Bytes bytes. -template inline Ty1 *advance(Ty1 Ptr, Ty2 Bytes) { - return reinterpret_cast(reinterpret_cast(Ptr) + Bytes); -} - -/// Return the first bit set in \p V. -inline uint32_t ffs(uint32_t V) { - static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); - return __builtin_ffs(V); -} - -/// Return the first bit set in \p V. -inline uint32_t ffs(uint64_t V) { - static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); - return __builtin_ffsl(V); -} - -/// Return the number of bits set in \p V. -inline uint32_t popc(uint32_t V) { - static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); - return __builtin_popcount(V); -} - -/// Return the number of bits set in \p V. -inline uint32_t popc(uint64_t V) { - static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); - return __builtin_popcountl(V); -} - -/// Return \p V aligned "upwards" according to \p Align. -template inline Ty1 align_up(Ty1 V, Ty2 Align) { - return ((V + Ty1(Align) - 1) / Ty1(Align)) * Ty1(Align); -} -/// Return \p V aligned "downwards" according to \p Align. -template inline Ty1 align_down(Ty1 V, Ty2 Align) { - return V - V % Align; -} - -/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)). -bool isSharedMemPtr(void *Ptr); - -/// Return \p V typed punned as \p DstTy. -template inline DstTy convertViaPun(SrcTy V) { - return *((DstTy *)(&V)); -} - -/// A pointer variable that has by design an `undef` value. Use with care. 
-[[clang::loader_uninitialized]] static void *const UndefPtr; - -#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true) -#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false) - -} // namespace utils -} // namespace ompx - -#pragma omp end declare target - -#endif diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp index c9c940de62c1a..2a85a34d32f6e 100644 --- a/offload/DeviceRTL/src/Allocator.cpp +++ b/offload/DeviceRTL/src/Allocator.cpp @@ -14,8 +14,8 @@ #include "Configuration.h" #include "Mapping.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp index ef0c3663536f5..4d97ad67313aa 100644 --- a/offload/DeviceRTL/src/Configuration.cpp +++ b/offload/DeviceRTL/src/Configuration.cpp @@ -13,7 +13,7 @@ #include "Configuration.h" #include "State.h" -#include "Types.h" +#include "DeviceTypes.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp index 31cd54e3de35c..bf228ec1b32cc 100644 --- a/offload/DeviceRTL/src/Debug.cpp +++ b/offload/DeviceRTL/src/Debug.cpp @@ -17,7 +17,7 @@ #include "Interface.h" #include "Mapping.h" #include "State.h" -#include "Types.h" +#include "DeviceTypes.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp index 95d4c728016d2..afee129c2589c 100644 --- a/offload/DeviceRTL/src/Kernel.cpp +++ b/offload/DeviceRTL/src/Kernel.cpp @@ -18,7 +18,7 @@ #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" +#include "DeviceTypes.h" #include "llvm/Frontend/OpenMP/OMPDeviceConstants.h" diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp index c1ce878746a69..4d89c11ee7e0f 100644 --- a/offload/DeviceRTL/src/Mapping.cpp +++ 
b/offload/DeviceRTL/src/Mapping.cpp @@ -12,8 +12,8 @@ #include "Mapping.h" #include "Interface.h" #include "State.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #pragma omp begin declare target device_type(nohost) @@ -364,6 +364,13 @@ _TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel) _TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock) _TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel) +extern "C" [[clang::disable_sanitizer_instrumentation, gnu::flatten, + gnu::always_inline, gnu::used, gnu::retain]] int +ompx_global_thread_id() { + return ompx_thread_id(0) + ompx_thread_id(1) * ompx_block_dim(0) + + ompx_thread_id(2) * ompx_block_dim(0) * ompx_block_dim(1); +} + extern "C" { uint64_t ompx_ballot_sync(uint64_t mask, int pred) { return utils::ballotSync(mask, pred); diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp index c24af9442d16e..ca8b549b28dbf 100644 --- a/offload/DeviceRTL/src/Misc.cpp +++ b/offload/DeviceRTL/src/Misc.cpp @@ -10,7 +10,7 @@ //===----------------------------------------------------------------------===// #include "Configuration.h" -#include "Types.h" +#include "DeviceTypes.h" #include "Debug.h" diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp index 031a5ced25518..2a510e9531657 100644 --- a/offload/DeviceRTL/src/Parallelism.cpp +++ b/offload/DeviceRTL/src/Parallelism.cpp @@ -37,8 +37,8 @@ #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp index 744d1a3a231c8..f4e2e0d25bde9 100644 --- a/offload/DeviceRTL/src/Reduction.cpp +++ b/offload/DeviceRTL/src/Reduction.cpp @@ -15,8 +15,8 @@ #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" +#include 
"DeviceTypes.h" +#include "DeviceUtils.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Sanitizer.cpp b/offload/DeviceRTL/src/Sanitizer.cpp new file mode 100644 index 0000000000000..2acf15c22d992 --- /dev/null +++ b/offload/DeviceRTL/src/Sanitizer.cpp @@ -0,0 +1,426 @@ +//===------ Sanitizer.cpp - Track allocation for sanitizer checks ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "DeviceTypes.h" +#include "DeviceUtils.h" +#include "Interface.h" +#include "LibC.h" +#include "Shared/Environment.h" +#include "Synchronization.h" + +using namespace ompx; +using namespace utils; + +#pragma omp begin declare target device_type(nohost) + +#include "Shared/Sanitizer.h" + +struct AllocationInfoLocalTy { + _AS_PTR(void, AllocationKind::LOCAL) Start; + uint64_t Length; + uint32_t Tag; +}; +struct AllocationInfoGlobalTy { + _AS_PTR(void, AllocationKind::GLOBAL) Start; + uint64_t Length; + uint32_t Tag; +}; + +template struct AllocationInfoTy {}; +template <> struct AllocationInfoTy { + using ASVoidPtrTy = AllocationInfoGlobalTy; +}; +template <> struct AllocationInfoTy { + using ASVoidPtrTy = AllocationInfoLocalTy; +}; + +template <> +AllocationPtrTy +AllocationPtrTy::get(_AS_PTR(void, AllocationKind::LOCAL) + P) { + TypePunUnion TPU; + TPU.P = (void *)P; + return TPU.AP; +} + +template <> +AllocationPtrTy::operator _AS_PTR( + void, AllocationKind::LOCAL)() const { + TypePunUnion TPU; + TPU.AP = *this; + return TPU.AddrP; +} + +template struct AllocationTracker { + static_assert(sizeof(AllocationTy) == sizeof(_AS_PTR(void, AK)) * 2, + "AllocationTy should not exceed two pointers"); + // 
static_assert(sizeof(AllocationPtrTy) * 8 == + // SanitizerConfig::ADDR_SPACE_PTR_SIZE, + // "AllocationTy pointers should be pointer sized"); + + [[clang::disable_sanitizer_instrumentation]] static + typename AllocationInfoTy::ASVoidPtrTy + getAllocationInfo(_AS_PTR(void, AK) P) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + uint32_t AllocationId = AP.AllocationId; + if (OMP_UNLIKELY(AllocationId >= SanitizerConfig::SLOTS)) + return {P, 0, (uint32_t)-1}; + auto &A = getAllocation(AP); + return {A.Start, A.Length, (uint32_t)A.Tag}; + } + + [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK) + create(_AS_PTR(void, AK) Start, uint64_t Length, int64_t AllocationId, + uint64_t Slot, int64_t SourceId) { + if constexpr (SanitizerConfig::OFFSET_BITS < 64) + if (OMP_UNLIKELY(Length >= (1UL << (SanitizerConfig::OFFSET_BITS)))) + __sanitizer_trap_info_ptr->exceedsAllocationLength( + Start, Length, AllocationId, Slot, SourceId); + + // Reserve the 0 element for the null pointer in global space. 
+ auto &AllocArr = getAllocationArray(); + auto &Cnt = AllocArr.Cnt; + if constexpr (AK == AllocationKind::LOCAL) + Slot = ++Cnt; + + uint64_t NumSlots = SanitizerConfig::SLOTS; + if (OMP_UNLIKELY(Slot >= NumSlots)) + __sanitizer_trap_info_ptr->exceedsAllocationSlots( + Start, Length, AllocationId, Slot, SourceId); + + auto &A = AllocArr.Arr[Slot]; + + A.Start = Start; + A.Length = Length; + A.Id = AllocationId; + + AllocationPtrTy AP; + AP.Offset = 0; + if constexpr (SanitizerConfig::useTags()) { + AP.AllocationTag = ++A.Tag; + } + AP.AllocationId = Slot; + AP.Magic = SanitizerConfig::MAGIC; + AP.Kind = (uint64_t)AK; + return AP; + } + + [[clang::disable_sanitizer_instrumentation]] static void + remove(_AS_PTR(void, AK) P, int64_t SourceId) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + uint64_t AllocationId = AP.AllocationId; + auto &AllocArr = getAllocationArray(); + auto &A = AllocArr.Arr[AllocationId]; + A.Length = 0; + + auto &Cnt = AllocArr.Cnt; + if constexpr (AK == AllocationKind::LOCAL) { + if (Cnt == AllocationId) + --Cnt; + } + } + + [[clang::disable_sanitizer_instrumentation]] static void remove_n(int32_t N) { + static_assert(AK == AllocationKind::LOCAL, ""); + auto &AllocArr = getAllocationArray(); + auto &Cnt = AllocArr.Cnt; + for (int32_t I = 0; I < N; ++I) { + auto &A = AllocArr.Arr[Cnt--]; + A.Length = 0; + } + } + + [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK) + advance(_AS_PTR(void, AK) P, uint64_t Offset, int64_t SourceId) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + AP.Offset += Offset; + return AP; + } + + [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK) + checkWithBase(_AS_PTR(void, AK) P, _AS_PTR(void, AK) Start, + int64_t Length, uint32_t Tag, int64_t Size, + int64_t AccessId, int64_t SourceId) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + if constexpr (AK == AllocationKind::LOCAL) + if (Length == 0) + Length = getAllocation(AP, AccessId).Length; + if constexpr (AK == 
AllocationKind::GLOBAL) + if (AP.Magic != SanitizerConfig::MAGIC) + __sanitizer_trap_info_ptr->garbagePointer(AP, (void *)P, SourceId); + int64_t Offset = AP.Offset; + if (OMP_UNLIKELY( + Offset > Length - Size || + (SanitizerConfig::useTags() && Tag != AP.AllocationTag))) { + __sanitizer_trap_info_ptr->accessError(AP, Size, AccessId, SourceId); + } + return utils::advancePtr(Start, Offset); + } + + [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK) + check(_AS_PTR(void, AK) P, int64_t Size, int64_t AccessId, + int64_t SourceId) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + auto &Alloc = getAllocation(AP, AccessId); + return checkWithBase(P, Alloc.Start, Alloc.Length, Alloc.Tag, Size, + AccessId, SourceId); + } + + [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK) + unpack(_AS_PTR(void, AK) P, int64_t SourceId = 0) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + auto &A = getAllocation(AP); + uint64_t Offset = AP.Offset; + _AS_PTR(void, AK) Ptr = utils::advancePtr(A.Start, Offset); + return Ptr; + } + + [[clang::disable_sanitizer_instrumentation]] static void + lifetimeStart(_AS_PTR(void, AK) P, uint64_t Length) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + auto &A = getAllocation(AP); + // TODO: Check length + A.Length = Length; + } + + [[clang::disable_sanitizer_instrumentation]] static void + lifetimeEnd(_AS_PTR(void, AK) P, uint64_t Length) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + auto &A = getAllocation(AP); + // TODO: Check length + A.Length = 0; + } + + [[clang::disable_sanitizer_instrumentation]] static void leakCheck() { + static_assert(AK == AllocationKind::GLOBAL, ""); + auto &AllocArr = getAllocationArray(); + for (uint64_t Slot = 0; Slot < SanitizerConfig::SLOTS; ++Slot) { + auto &A = AllocArr.Arr[Slot]; + if (OMP_UNLIKELY(A.Length)) + __sanitizer_trap_info_ptr->memoryLeak(A, Slot); + } + } +}; + +template +AllocationArrayTy + Allocations::Arr[SanitizerConfig::NUM_ALLOCATION_ARRAYS]; + 
+static void checkForMagic(bool IsGlobal, void *P, int64_t SourceId) { + if (IsGlobal) { + auto AP = AllocationPtrTy::get(P); + if (AP.Magic != SanitizerConfig::MAGIC) + __sanitizer_trap_info_ptr->garbagePointer( + AP, P, SourceId); + } else { + auto AP = AllocationPtrTy::get(P); + if (AP.Magic != SanitizerConfig::MAGIC) + __sanitizer_trap_info_ptr->garbagePointer( + AP, P, SourceId); + } +} + +extern "C" { + +#define REAL_PTR_IS_LOCAL(PTR) (isThreadLocalMemPtr(PTR)) +#define IS_GLOBAL(PTR) ((uintptr_t)PTR & (1UL << 63)) + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL) + ompx_new_local(_AS_PTR(void, AllocationKind::LOCAL) Start, uint64_t Length, + int64_t AllocationId, uint32_t Slot, int64_t SourceId) { + return AllocationTracker::create( + Start, Length, AllocationId, Slot, SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL) + ompx_new_global(_AS_PTR(void, AllocationKind::GLOBAL) Start, + uint64_t Length, int64_t AllocationId, uint32_t Slot, + int64_t SourceId) { + return AllocationTracker::create( + Start, Length, AllocationId, Slot, SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +__sanitizer_register_host(_AS_PTR(void, AllocationKind::GLOBAL) Start, + uint64_t Length, uint64_t Slot, int64_t SourceId) { + AllocationTracker::create(Start, Length, Slot, Slot, + SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void * +ompx_new(void *Start, uint64_t Length, int64_t AllocationId, uint32_t Slot, + int64_t SourceId) { + if (REAL_PTR_IS_LOCAL(Start)) + return (void *)ompx_new_local((_AS_PTR(void, AllocationKind::LOCAL))Start, + Length, AllocationId, Slot, SourceId); + return (void *)ompx_new_global((_AS_PTR(void, 
AllocationKind::GLOBAL))Start, + Length, AllocationId, Slot, SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_free_local_n(int32_t N) { + return AllocationTracker::remove_n(N); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +__sanitizer_unregister_host(_AS_PTR(void, AllocationKind::GLOBAL) P) { + AllocationTracker::remove(P, /*SourceId=*/0); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_free_local(_AS_PTR(void, AllocationKind::LOCAL) P, int64_t SourceId) { + return AllocationTracker::remove(P, SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_free_global(_AS_PTR(void, AllocationKind::GLOBAL) P, int64_t SourceId) { + return AllocationTracker::remove(P, SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_free(void *P, int64_t SourceId) { + bool IsGlobal = IS_GLOBAL(P); + checkForMagic(IsGlobal, P, SourceId); + if (IsGlobal) + return ompx_free_global((_AS_PTR(void, AllocationKind::GLOBAL))P, SourceId); + return ompx_free_local((_AS_PTR(void, AllocationKind::LOCAL))P, SourceId); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL) + ompx_gep_local(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t Offset, + int64_t SourceId) { + return AllocationTracker::advance(P, Offset, SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL) + ompx_gep_global(_AS_PTR(void, AllocationKind::GLOBAL) P, uint64_t Offset, + int64_t SourceId) { + return AllocationTracker::advance(P, Offset, + SourceId); +} 
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void * +ompx_gep(void *P, uint64_t Offset, int64_t SourceId) { + bool IsGlobal = IS_GLOBAL(P); + checkForMagic(IsGlobal, P, SourceId); + if (IsGlobal) + return (void *)ompx_gep_global((_AS_PTR(void, AllocationKind::GLOBAL))P, + Offset, SourceId); + return (void *)ompx_gep_local((_AS_PTR(void, AllocationKind::LOCAL))P, Offset, + SourceId); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL) + ompx_check_local(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t Size, + uint64_t AccessId, int64_t SourceId) { + return AllocationTracker::check(P, Size, AccessId, + SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL) + ompx_check_global(_AS_PTR(void, AllocationKind::GLOBAL) P, uint64_t Size, + uint64_t AccessId, int64_t SourceId) { + return AllocationTracker::check(P, Size, AccessId, + SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void * +ompx_check(void *P, uint64_t Size, uint64_t AccessId, int64_t SourceId) { + bool IsGlobal = IS_GLOBAL(P); + checkForMagic(IsGlobal, P, SourceId); + if (IsGlobal) + return (void *)ompx_check_global((_AS_PTR(void, AllocationKind::GLOBAL))P, + Size, AccessId, SourceId); + return (void *)ompx_check_local((_AS_PTR(void, AllocationKind::LOCAL))P, Size, + AccessId, SourceId); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL) + ompx_check_with_base_local(_AS_PTR(void, AllocationKind::LOCAL) P, + _AS_PTR(void, AllocationKind::LOCAL) Start, + uint64_t Length, uint32_t Tag, uint64_t Size, + uint64_t AccessId, int64_t SourceId) { + return AllocationTracker::checkWithBase( + P, 
Start, Length, Tag, Size, AccessId, SourceId); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL) + ompx_check_with_base_global(_AS_PTR(void, AllocationKind::GLOBAL) P, + _AS_PTR(void, AllocationKind::GLOBAL) Start, + uint64_t Length, uint32_t Tag, uint64_t Size, + uint64_t AccessId, int64_t SourceId) { + return AllocationTracker::checkWithBase( + P, Start, Length, Tag, Size, AccessId, SourceId); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL) + ompx_unpack_local(_AS_PTR(void, AllocationKind::LOCAL) P, + int64_t SourceId) { + return AllocationTracker::unpack(P, SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL) + ompx_unpack_global(_AS_PTR(void, AllocationKind::GLOBAL) P, + int64_t SourceId) { + return AllocationTracker::unpack(P, SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void * +ompx_unpack(void *P, int64_t SourceId) { + bool IsGlobal = IS_GLOBAL(P); + checkForMagic(IsGlobal, P, SourceId); + if (IsGlobal) + return (void *)ompx_unpack_global((_AS_PTR(void, AllocationKind::GLOBAL))P, + SourceId); + return (void *)ompx_unpack_local((_AS_PTR(void, AllocationKind::LOCAL))P, + SourceId); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_lifetime_start(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t Length) { + AllocationTracker::lifetimeStart(P, Length); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_lifetime_end(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t Length) { + AllocationTracker::lifetimeEnd(P, Length); +} + 
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] struct AllocationInfoLocalTy +ompx_get_allocation_info_local(_AS_PTR(void, AllocationKind::LOCAL) P) { + return AllocationTracker::getAllocationInfo(P); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] struct AllocationInfoGlobalTy +ompx_get_allocation_info_global(_AS_PTR(void, AllocationKind::GLOBAL) P) { + return AllocationTracker::getAllocationInfo(P); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_leak_check() { + AllocationTracker::leakCheck(); +} + +[[gnu::weak, gnu::noinline, gnu::used, gnu::retain]] int64_t +__san_get_location_value() { + return -1; +} +} + +#pragma omp end declare target diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp index a1e4fa2449d9a..cb83b7839b5b2 100644 --- a/offload/DeviceRTL/src/State.cpp +++ b/offload/DeviceRTL/src/State.cpp @@ -13,13 +13,13 @@ #include "Allocator.h" #include "Configuration.h" #include "Debug.h" +#include "DeviceUtils.h" #include "Interface.h" #include "LibC.h" #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" using namespace ompx; @@ -84,14 +84,14 @@ struct SharedMemorySmartStackTy { /// Deallocate the last allocation made by the encountering thread and pointed /// to by \p Ptr from the stack. Each thread can call this function. - void pop(void *Ptr, uint32_t Bytes); + void pop(void *Ptr, uint64_t Bytes); private: /// Compute the size of the storage space reserved for a thread. 
uint32_t computeThreadStorageTotal() { uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock(); - return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock), - allocator::ALIGNMENT); + return utils::alignDown((state::SharedScratchpadSize / NumLanesInBlock), + allocator::ALIGNMENT); } /// Return the top address of the warp data stack, that is the first address @@ -121,7 +121,7 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) { // First align the number of requested bytes. /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to /// be passed in as an argument and the stack rewritten to support it. - uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT); + uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT); uint32_t StorageTotal = computeThreadStorageTotal(); @@ -148,8 +148,8 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) { return GlobalMemory; } -void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) { - uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT); +void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) { + uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT); if (utils::isSharedMemPtr(Ptr)) { int TId = mapping::getThreadIdInBlock(); Usage[TId] -= AlignedBytes; diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp index 80ba87b300bcd..97a6b080169ad 100644 --- a/offload/DeviceRTL/src/Synchronization.cpp +++ b/offload/DeviceRTL/src/Synchronization.cpp @@ -16,8 +16,8 @@ #include "Interface.h" #include "Mapping.h" #include "State.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #pragma omp begin declare target device_type(nohost) diff --git a/offload/DeviceRTL/src/Tasking.cpp b/offload/DeviceRTL/src/Tasking.cpp index 2dc33562e6d79..34cb67cb1a351 100644 --- a/offload/DeviceRTL/src/Tasking.cpp +++ b/offload/DeviceRTL/src/Tasking.cpp @@ 
-13,10 +13,10 @@ // //===----------------------------------------------------------------------===// +#include "DeviceUtils.h" #include "Interface.h" #include "State.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" using namespace ompx; @@ -34,7 +34,7 @@ TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t, TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal( TaskSizeTotal, "explicit task descriptor"); TaskDescriptor->Payload = - utils::advance(TaskDescriptor, TaskSizeInclPrivateValuesPadded); + utils::advancePtr(TaskDescriptor, TaskSizeInclPrivateValuesPadded); TaskDescriptor->TaskFn = TaskFn; return TaskDescriptor; diff --git a/offload/DeviceRTL/src/Utils.cpp b/offload/DeviceRTL/src/Utils.cpp index 53cc803234867..956e6200ffd5c 100644 --- a/offload/DeviceRTL/src/Utils.cpp +++ b/offload/DeviceRTL/src/Utils.cpp @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -#include "Utils.h" +#include "DeviceUtils.h" #include "Debug.h" #include "Interface.h" @@ -22,6 +22,7 @@ using namespace ompx; namespace impl { bool isSharedMemPtr(const void *Ptr) { return false; } +bool isThreadLocalMemPtr(const void *Ptr) { return false; } void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) { static_assert(sizeof(unsigned long) == 8, ""); @@ -67,6 +68,10 @@ bool isSharedMemPtr(const void *Ptr) { return __builtin_amdgcn_is_shared( (const __attribute__((address_space(0))) void *)Ptr); } +bool isThreadLocalMemPtr(const void *Ptr) { + return __builtin_amdgcn_is_private( + (const __attribute__((address_space(0))) void *)Ptr); +} #pragma omp end declare variant ///} @@ -92,6 +97,8 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred) { bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); } +bool isThreadLocalMemPtr(const void *Ptr) { return __nvvm_isspacep_local(Ptr); } + #pragma omp end declare variant ///} } // namespace impl @@ -127,6 +134,9 @@ uint64_t 
utils::ballotSync(uint64_t Mask, int32_t Pred) { } bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); } +bool utils::isThreadLocalMemPtr(void *Ptr) { + return impl::isThreadLocalMemPtr(Ptr); +} extern "C" { int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) { diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp index bcb7c5ad50a18..fd835f2af677e 100644 --- a/offload/DeviceRTL/src/Workshare.cpp +++ b/offload/DeviceRTL/src/Workshare.cpp @@ -17,8 +17,8 @@ #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/exports b/offload/DeviceRTL/src/exports index 288ddf90b4a9f..b725cca7e1c1d 100644 --- a/offload/DeviceRTL/src/exports +++ b/offload/DeviceRTL/src/exports @@ -16,3 +16,5 @@ free memcmp printf __assert_fail +LocalAllocs +LocalCnt diff --git a/offload/include/OpenMP/Mapping.h b/offload/include/OpenMP/Mapping.h index b9f5c16582931..d310c6c707f8f 100644 --- a/offload/include/OpenMP/Mapping.h +++ b/offload/include/OpenMP/Mapping.h @@ -71,6 +71,7 @@ struct HostDataToTargetTy { const uintptr_t TgtAllocBegin; // allocated target memory const uintptr_t TgtPtrBegin; // mapped target memory = TgtAllocBegin + padding + void *FakeTgtPtrBegin = 0; // mapped target memory = TgtAllocBegin + padding private: static const uint64_t INFRefCount = ~(uint64_t)0; @@ -125,9 +126,10 @@ struct HostDataToTargetTy { HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TgtAllocBegin, uintptr_t TgtPtrBegin, bool UseHoldRefCount, map_var_info_t Name = nullptr, - bool IsINF = false) + bool IsINF = false, void *FakeTgtPtrBegin = nullptr) : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), HstPtrName(Name), TgtAllocBegin(TgtAllocBegin), TgtPtrBegin(TgtPtrBegin), + FakeTgtPtrBegin(FakeTgtPtrBegin), States(std::make_unique(UseHoldRefCount ? 
0 : IsINF ? INFRefCount : 1, diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h index 5b22bbaac144f..f96b2f9ca259d 100644 --- a/offload/include/Shared/APITypes.h +++ b/offload/include/Shared/APITypes.h @@ -85,6 +85,9 @@ struct __tgt_async_info { /// ensure it is a valid location while the transfer to the device is /// happening. KernelLaunchEnvironmentTy KernelLaunchEnvironment; + + /// Flag to indicate the Queue should be persistent. + bool PersistentQueue = false; }; /// This struct contains all of the arguments to a target kernel region launch. @@ -102,19 +105,24 @@ struct KernelArgsTy { 0; // Tripcount for the teams / distribute loop, 0 otherwise. struct { uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause. - uint64_t Unused : 63; - } Flags = {0, 0}; + uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA. + uint64_t Unused : 62; + } Flags = {0, 0, 0}; // The number of teams (for x,y,z dimension). uint32_t NumTeams[3] = {0, 0, 0}; // The number of threads (for x,y,z dimension). uint32_t ThreadLimit[3] = {0, 0, 0}; uint32_t DynCGroupMem = 0; // Amount of dynamic cgroup memory requested. + // A __tgt_async_info queue pointer to be used for the kernel and all + // associated device interactions. The operations are implicitly made + // non-blocking. + void *AsyncInfoQueue = nullptr; }; static_assert(sizeof(KernelArgsTy().Flags) == sizeof(uint64_t), "Invalid struct size"); static_assert(sizeof(KernelArgsTy) == (8 * sizeof(int32_t) + 3 * sizeof(int64_t) + - 4 * sizeof(void **) + 2 * sizeof(int64_t *)), + 5 * sizeof(void **) + 2 * sizeof(int64_t *)), "Invalid struct size"); /// Flat array of kernel launch parameters and their total size. 
diff --git a/offload/include/Shared/RefCnt.h b/offload/include/Shared/RefCnt.h new file mode 100644 index 0000000000000..7c615ba167a3d --- /dev/null +++ b/offload/include/Shared/RefCnt.h @@ -0,0 +1,56 @@ +//===-- Shared/RefCnt.h - Helper to keep track of references --- C++ ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_SHARED_REF_CNT_H +#define OMPTARGET_SHARED_REF_CNT_H + +#include +#include +#include +#include + +namespace llvm { +namespace omp { +namespace target { + +/// Utility class for thread-safe reference counting. Any class that needs +/// objects' reference counting can inherit from this entity or have it as a +/// class data member. +template +struct RefCountTy { + /// Create a refcount object initialized to zero. + RefCountTy() : Refs(0) {} + + ~RefCountTy() { assert(Refs == 0 && "Destroying with non-zero refcount"); } + + /// Increase the reference count atomically. + void increase() { Refs.fetch_add(1, MemoryOrder); } + + /// Decrease the reference count and return whether it became zero. Decreasing + /// the counter in more units than it was previously increased results in + /// undefined behavior. + bool decrease() { + Ty Prev = Refs.fetch_sub(1, MemoryOrder); + assert(Prev > 0 && "Invalid refcount"); + return (Prev == 1); + } + + Ty get() const { return Refs.load(MemoryOrder); } + +private: + /// The atomic reference counter. 
+ std::atomic Refs; +}; +} // namespace target +} // namespace omp +} // namespace llvm + +#endif diff --git a/offload/include/Shared/Sanitizer.h b/offload/include/Shared/Sanitizer.h new file mode 100644 index 0000000000000..79e3af2d33bbd --- /dev/null +++ b/offload/include/Shared/Sanitizer.h @@ -0,0 +1,358 @@ +//===-- Shared/SanitizerHost.h - OFfload sanitizer host logic ----- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_SHARED_SANITIZER_HOST_H +#define OMPTARGET_SHARED_SANITIZER_HOST_H + +#include "Types.h" +#include "Utils.h" + +extern "C" { +int ompx_block_id(int Dim); +int ompx_block_dim(int Dim); +int ompx_thread_id(int Dim); +int64_t __san_get_location_value(); +} + +enum class AllocationKind { LOCAL, GLOBAL, LAST = GLOBAL }; + +template struct ASTypes { + using INT_TY = uint64_t; +}; +#pragma omp begin declare variant match(device = {arch(amdgcn)}) +template <> struct ASTypes { + using INT_TY = uint32_t; +}; +#pragma omp end declare variant + +template struct SanitizerConfig { + static constexpr uint32_t ADDR_SPACE = AK == AllocationKind::GLOBAL ? 0 : 5; + static constexpr uint32_t ADDR_SPACE_PTR_SIZE = + sizeof(typename ASTypes::INT_TY) * 8; + + static constexpr uint32_t NUM_ALLOCATION_ARRAYS = + AK == AllocationKind::GLOBAL ? 1 : (1024 * 1024 * 2); + static constexpr uint32_t TAG_BITS = AK == AllocationKind::GLOBAL ? 1 : 8; + static constexpr uint32_t MAGIC_BITS = 3; + static constexpr uint32_t MAGIC = 0b101; + + static constexpr uint32_t OBJECT_BITS = AK == AllocationKind::GLOBAL ? 
10 : 7; + static constexpr uint32_t SLOTS = (1 << (OBJECT_BITS)); + static constexpr uint32_t KIND_BITS = 1; + static constexpr uint32_t Id_BITS = 9 - KIND_BITS; + + static constexpr uint32_t LENGTH_BITS = + ADDR_SPACE_PTR_SIZE - TAG_BITS - Id_BITS; + static constexpr uint32_t OFFSET_BITS = + ADDR_SPACE_PTR_SIZE - TAG_BITS - OBJECT_BITS - KIND_BITS - MAGIC_BITS; + + static constexpr bool useTags() { return TAG_BITS > 1; } + + static_assert(LENGTH_BITS + TAG_BITS + Id_BITS == ADDR_SPACE_PTR_SIZE, + "Length, tag, and Id bits should cover one pointer"); + static_assert(OFFSET_BITS + TAG_BITS + OBJECT_BITS + MAGIC_BITS + KIND_BITS == + ADDR_SPACE_PTR_SIZE, + "Offset, tag, object, and kind bits should cover one pointer"); + static_assert((1 << KIND_BITS) >= ((uint64_t)AllocationKind::LAST + 1), + "Kind bits should match allocation kinds"); +}; + +#define _AS_PTR(TY, AK) \ + TY [[clang::address_space(SanitizerConfig::ADDR_SPACE)]] * + +template struct AllocationTy { + _AS_PTR(void, AK) Start; + typename ASTypes::INT_TY Length : SanitizerConfig::LENGTH_BITS; + typename ASTypes::INT_TY Tag : SanitizerConfig::TAG_BITS; + typename ASTypes::INT_TY Id : SanitizerConfig::Id_BITS; +}; + +template struct AllocationArrayTy { + AllocationTy Arr[SanitizerConfig::SLOTS]; + uint64_t Cnt; +}; + +template struct AllocationPtrTy { + static AllocationPtrTy get(_AS_PTR(void, AK) P) { + return utils::convertViaPun>(P); + } + static AllocationPtrTy get(void *P) { + return get((_AS_PTR(void, AK))(P)); + } + operator _AS_PTR(void, AK)() const { + return utils::convertViaPun<_AS_PTR(void, AK)>(*this); + } + operator typename ASTypes::INT_TY() const { + return utils::convertViaPun::INT_TY>(*this); + } + typename ASTypes::INT_TY Offset : SanitizerConfig::OFFSET_BITS; + typename ASTypes::INT_TY AllocationTag : SanitizerConfig::TAG_BITS; + typename ASTypes::INT_TY AllocationId : SanitizerConfig::OBJECT_BITS; + typename ASTypes::INT_TY Magic : SanitizerConfig::MAGIC_BITS; + // Must be last, TODO: 
merge into TAG + typename ASTypes::INT_TY Kind : SanitizerConfig::KIND_BITS; +}; +#pragma omp begin declare variant match(device = {arch(amdgcn)}) +static_assert(sizeof(AllocationPtrTy) * 8 == 32); +#pragma omp end declare variant + +union TypePunUnion { + uint64_t I; + void *P; + _AS_PTR(void, AllocationKind::LOCAL) AddrP; + struct { + AllocationPtrTy AP; + uint32_t U; + }; +}; +#pragma omp begin declare variant match(device = {arch(amdgcn)}) +static_assert(sizeof(TypePunUnion) * 8 == 64); +#pragma omp end declare variant + +static inline void *__offload_get_new_sanitizer_ptr(int32_t Slot) { + AllocationPtrTy AP; + AP.Offset = 0; + AP.AllocationId = Slot; + AP.Magic = SanitizerConfig::MAGIC; + AP.Kind = (uint32_t)AllocationKind::GLOBAL; + return (void *)(_AS_PTR(void, AllocationKind::GLOBAL))AP; +} + +template struct Allocations { + static AllocationArrayTy Arr[SanitizerConfig::NUM_ALLOCATION_ARRAYS]; +}; + +struct LocationEncodingTy { + uint64_t FunctionNameIdx; + uint64_t FileNameIdx; + uint64_t LineNo; + uint64_t ColumnNo; + uint64_t ParentIdx; +}; + +struct SanitizerTrapInfoTy { + /// AllocationTy + /// { + void *AllocationStart; + uint64_t AllocationLength; + int32_t AllocationId; + uint32_t AllocationTag; + uint8_t AllocationKind; + ///} + + enum ErrorCodeTy : uint8_t { + None = 0, + ExceedsLength, + ExceedsSlots, + PointerOutsideAllocation, + OutOfBounds, + UseAfterScope, + UseAfterFree, + MemoryLeak, + GarbagePointer, + } ErrorCode; + + /// AllocationTy + /// { + uint64_t PtrOffset; + uint64_t PtrSlot; + uint16_t PtrTag; + uint16_t PtrKind; + ///} + + /// Access + /// { + uint32_t AccessSize; + int64_t AccessId; + /// } + + /// Thread + /// { + uint64_t BlockId[3]; + uint32_t ThreadId[3]; + uint64_t PC; + uint64_t LocationId; + int64_t CallId; + /// } + + [[clang::disable_sanitizer_instrumentation]] void + setCoordinates(int64_t SourceId) { + for (int32_t Dim = 0; Dim < 3; ++Dim) { + BlockId[Dim] = ompx_block_id(Dim); + ThreadId[Dim] = ompx_thread_id(Dim); 
+ } + LocationId = SourceId; + CallId = __san_get_location_value(); + } + + template + [[clang::disable_sanitizer_instrumentation, gnu::always_inline]] void + allocationError(ErrorCodeTy EC, _AS_PTR(void, AK) Start, uint64_t Length, + int64_t Id, int64_t Tag, uint64_t Slot, int64_t SourceId) { + AllocationStart = (void *)Start; + AllocationLength = Length; + AllocationId = Id; + AllocationTag = Tag; + AllocationKind = (decltype(AllocationKind))AK; + PtrSlot = Slot; + + ErrorCode = EC; + setCoordinates(SourceId); + } + + template + [[clang::disable_sanitizer_instrumentation, gnu::always_inline]] void + propagateAccessError(ErrorCodeTy EC, const AllocationTy &A, + const AllocationPtrTy &AP, uint64_t Size, int64_t Id, + int64_t SourceId) { + AllocationStart = (void *)A.Start; + AllocationLength = A.Length; + AllocationId = A.Id; + AllocationTag = A.Tag; + AllocationKind = (decltype(AllocationKind))AK; + + ErrorCode = EC; + + PtrOffset = AP.Offset; + PtrSlot = AP.AllocationId; + PtrTag = AP.AllocationTag; + PtrKind = AP.Kind; + + AccessSize = Size; + AccessId = Id; + + setCoordinates(SourceId); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void + exceedsAllocationLength(_AS_PTR(void, AK) Start, uint64_t Length, + int64_t AllocationId, uint64_t Slot, + int64_t SourceId) { + allocationError(ExceedsLength, Start, Length, AllocationId, /*Tag=*/0, + Slot, SourceId); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void + exceedsAllocationSlots(_AS_PTR(void, AK) Start, uint64_t Length, + int64_t AllocationId, uint64_t Slot, + int64_t SourceId) { + allocationError(ExceedsSlots, Start, Length, AllocationId, /*Tag=*/0, + Slot, SourceId); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void + pointerOutsideAllocation(_AS_PTR(void, AK) Start, uint64_t Length, + int64_t AllocationId, uint64_t Slot, uint64_t PC) { + 
allocationError(PointerOutsideAllocation, Start, Length, AllocationId, + /*Tag=*/0, Slot, PC); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void + outOfBoundAccess(const AllocationTy A, const AllocationPtrTy AP, + uint64_t Size, int64_t AccessId, int64_t SourceId) { + propagateAccessError(OutOfBounds, A, AP, Size, AccessId, SourceId); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void + useAfterScope(const AllocationTy A, const AllocationPtrTy AP, + uint64_t Size, int64_t AccessId, int64_t SourceId) { + propagateAccessError(UseAfterScope, A, AP, Size, AccessId, SourceId); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void + useAfterFree(const AllocationTy A, const AllocationPtrTy AP, + uint64_t Size, int64_t AccessId, int64_t SourceId) { + propagateAccessError(UseAfterFree, A, AP, Size, AccessId, SourceId); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void + accessError(const AllocationPtrTy AP, int64_t Size, int64_t AccessId, + int64_t SourceId); + + template + [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void + garbagePointer(const AllocationPtrTy AP, void *P, int64_t SourceId) { + ErrorCode = GarbagePointer; + AllocationStart = P; + AllocationKind = (decltype(AllocationKind))AK; + PtrOffset = AP.Offset; + PtrSlot = AP.AllocationId; + PtrTag = AP.AllocationTag; + PtrKind = AP.Kind; + setCoordinates(SourceId); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void + memoryLeak(const AllocationTy A, uint64_t Slot) { + allocationError(MemoryLeak, A.Start, A.Length, A.Id, A.Tag, Slot, + /*SourceId=*/-1); + __builtin_trap(); + } +}; + +[[gnu::used, gnu::retain, gnu::weak, + gnu::visibility("protected")]] SanitizerTrapInfoTy 
*__sanitizer_trap_info_ptr; + +template +[[clang::disable_sanitizer_instrumentation, + gnu::always_inline]] AllocationArrayTy & +getAllocationArray() { + uint32_t ThreadId = 0, BlockId = 0; + if constexpr (AK == AllocationKind::LOCAL) { + ThreadId = ompx_thread_id(0); + BlockId = ompx_block_id(0); + } + return Allocations::Arr[ThreadId + BlockId * ompx_block_dim(0)]; +} + +template +[[clang::disable_sanitizer_instrumentation, + gnu::always_inline]] AllocationTy & +getAllocation(const AllocationPtrTy AP, int64_t AccessId = 0) { + auto &AllocArr = getAllocationArray(); + uint64_t NumSlots = SanitizerConfig::SLOTS; + uint64_t Slot = AP.AllocationId; + if (Slot >= NumSlots) + __sanitizer_trap_info_ptr->pointerOutsideAllocation(AP, AP.Offset, + AccessId, Slot, 0); + return AllocArr.Arr[Slot]; +} + +template +[[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void +SanitizerTrapInfoTy::accessError(const AllocationPtrTy AP, int64_t Size, + int64_t AccessId, int64_t SourceId) { + auto &A = getAllocationArray().Arr[AP.AllocationId]; + int64_t Offset = AP.Offset; + int64_t Length = A.Length; + if (AK == AllocationKind::LOCAL && Length == 0) + useAfterScope(A, AP, Size, AccessId, SourceId); + else if (Offset > Length - Size) + outOfBoundAccess(A, AP, Size, AccessId, SourceId); + else + useAfterFree(A, AP, Size, AccessId, SourceId); +} + +#endif diff --git a/offload/include/Shared/Types.h b/offload/include/Shared/Types.h new file mode 100644 index 0000000000000..1503a4b2a1437 --- /dev/null +++ b/offload/include/Shared/Types.h @@ -0,0 +1,22 @@ +//===-- Shared/Types.h - Type defs shared between host and device - C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Environments shared between host and device.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_SHARED_TYPES_H
+#define OMPTARGET_SHARED_TYPES_H
+
+#ifdef OMPTARGET_DEVICE_RUNTIME
+#include "DeviceTypes.h"
+#else
+#include <cstdint>
+#endif
+
+#endif // OMPTARGET_SHARED_TYPES_H
diff --git a/offload/include/Shared/Utils.h b/offload/include/Shared/Utils.h
index fce14b54edb98..22ea2ba262d2a 100644
--- a/offload/include/Shared/Utils.h
+++ b/offload/include/Shared/Utils.h
@@ -14,75 +14,65 @@
 #ifndef OMPTARGET_SHARED_UTILS_H
 #define OMPTARGET_SHARED_UTILS_H
 
-#include "llvm/ADT/StringRef.h"
+#include "Types.h"
 
-#include "Debug.h"
-
-#include <atomic>
-#include <cassert>
-#include <limits>
-#include <memory>
-
-namespace llvm {
-namespace omp {
-namespace target {
-
-/// Utility class for thread-safe reference counting. Any class that needs
-/// objects' reference counting can inherit from this entity or have it as a
-/// class data member.
-template <typename Ty,
-          std::memory_order MemoryOrder = std::memory_order_relaxed>
-struct RefCountTy {
-  /// Create a refcount object initialized to zero.
-  RefCountTy() : Refs(0) {}
-
-  ~RefCountTy() { assert(Refs == 0 && "Destroying with non-zero refcount"); }
-
-  /// Increase the reference count atomically.
-  void increase() { Refs.fetch_add(1, MemoryOrder); }
-
-  /// Decrease the reference count and return whether it became zero. Decreasing
-  /// the counter in more units than it was previously increased results in
-  /// undefined behavior.
-  bool decrease() {
-    Ty Prev = Refs.fetch_sub(1, MemoryOrder);
-    assert(Prev > 0 && "Invalid refcount");
-    return (Prev == 1);
-  }
-
-  Ty get() const { return Refs.load(MemoryOrder); }
-
-private:
-  /// The atomic reference counter.
-  std::atomic<Ty> Refs;
-};
+namespace utils {
 
 /// Return the difference (in bytes) between \p Begin and \p End.
 template <typename Ty = char>
-ptrdiff_t getPtrDiff(const void *End, const void *Begin) {
+auto getPtrDiff(const void *End, const void *Begin) {
   return reinterpret_cast<const Ty *>(End) -
          reinterpret_cast<const Ty *>(Begin);
 }
 
 /// Return \p Ptr advanced by \p Offset bytes.
-template <typename Ty> Ty *advanceVoidPtr(Ty *Ptr, int64_t Offset) {
-  static_assert(std::is_void<Ty>::value);
-  return const_cast<char *>(reinterpret_cast<const char *>(Ptr) + Offset);
+template <typename Ty1, typename Ty2> Ty1 *advancePtr(Ty1 *Ptr, Ty2 Offset) {
+  return (Ty1 *)(const_cast<char *>((const char *)(Ptr)) + Offset);
 }
 
-/// Return \p Ptr aligned to \p Alignment bytes.
-template <typename Ty> Ty *alignPtr(Ty *Ptr, int64_t Alignment) {
-  size_t Space = std::numeric_limits<size_t>::max();
-  return std::align(Alignment, sizeof(char), Ptr, Space);
+/// Return \p V aligned "upwards" according to \p Align.
+template <typename Ty1, typename Ty2> inline Ty1 alignPtr(Ty1 V, Ty2 Align) {
+  return reinterpret_cast<Ty1>(((uintptr_t(V) + Align - 1) / Align) * Align);
+}
+/// Return \p V aligned "downwards" according to \p Align.
+template <typename Ty1, typename Ty2> inline Ty1 alignDown(Ty1 V, Ty2 Align) {
+  return V - V % Align;
 }
 
 /// Round up \p V to a \p Boundary.
 template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
-  return (V + Boundary - 1) / Boundary * Boundary;
+  return alignPtr(V, Boundary);
+}
+
+/// Return the first bit set in \p V.
+inline uint32_t ffs(uint32_t V) {
+  static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch");
+  return __builtin_ffs(V);
+}
+
+/// Return the first bit set in \p V.
+inline uint32_t ffs(uint64_t V) {
+  static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch");
+  return __builtin_ffsl(V);
+}
+
+/// Return the number of bits set in \p V.
+inline uint32_t popc(uint32_t V) {
+  static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch");
+  return __builtin_popcount(V);
+}
+
+/// Return the number of bits set in \p V.
+inline uint32_t popc(uint64_t V) { + static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); + return __builtin_popcountl(V); +} + +template inline DstTy convertViaPun(SrcTy V) { + static_assert(sizeof(DstTy) == sizeof(SrcTy), "Bad conversion"); + return *((DstTy *)(&V)); } -} // namespace target -} // namespace omp -} // namespace llvm +} // namespace utils #endif // OMPTARGET_SHARED_UTILS_H diff --git a/offload/include/device.h b/offload/include/device.h index fd6e5fba5fc53..69954254666b8 100644 --- a/offload/include/device.h +++ b/offload/include/device.h @@ -100,11 +100,15 @@ struct DeviceTy { /// Notify the plugin about a new mapping starting at the host address /// \p HstPtr and \p Size bytes. - int32_t notifyDataMapped(void *HstPtr, int64_t Size); + /// If GPUSan is enabled, \p DevicePtr is registered in each image and + /// \p FakeHstPtr is updated. + int32_t notifyDataMapped(void *HstPtr, void *DevicePtr, int64_t Size, + void *&FakeHstPtr); /// Notify the plugin about an existing mapping being unmapped starting at /// the host address \p HstPtr. - int32_t notifyDataUnmapped(void *HstPtr); + /// If GPUSan is enabled, \p FakeHstPtr is unregistered. + int32_t notifyDataUnmapped(void *HstPtr, void *FakeHstPtr); // Launch the kernel identified by \p TgtEntryPtr with the given arguments. int32_t launchKernel(void *TgtEntryPtr, void **TgtVarsPtr, diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h index 323dee41630f2..8730879905984 100644 --- a/offload/include/omptarget.h +++ b/offload/include/omptarget.h @@ -107,7 +107,7 @@ enum TargetAllocTy : int32_t { inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, - 0, {0,0}, {1, 0, 0}, {1, 0, 0}, 0}; + 0, {0,0,0}, {1, 0, 0}, {1, 0, 0}, 0}; struct DeviceTy; @@ -136,8 +136,19 @@ class AsyncInfoTy { /// Synchronization method to be used. 
SyncTy SyncType; - AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING) + AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING) : Device(Device), SyncType(SyncType) {} + AsyncInfoTy(DeviceTy &Device, void *AsyncInfoQueue) + : Device(Device), SyncType(AsyncInfoQueue ? SyncTy::NON_BLOCKING : SyncTy::BLOCKING) { + AsyncInfo.Queue = AsyncInfoQueue; + AsyncInfo.PersistentQueue = !!AsyncInfoQueue; + } + AsyncInfoTy(DeviceTy &Device, void *AsyncInfoQueue, SyncTy SyncType) + : Device(Device), SyncType(SyncType) { + AsyncInfo.Queue = AsyncInfoQueue; + AsyncInfo.PersistentQueue = !!AsyncInfoQueue; + } + ~AsyncInfoTy() { synchronize(); } /// Implicit conversion to the __tgt_async_info which is used in the @@ -207,8 +218,9 @@ class TaskAsyncInfoWrapperTy { void **TaskAsyncInfoPtr = nullptr; public: - TaskAsyncInfoWrapperTy(DeviceTy &Device) + TaskAsyncInfoWrapperTy(DeviceTy &Device, void *AsyncInfoQueue= nullptr) : ExecThreadID(__kmpc_global_thread_num(NULL)), LocalAsyncInfo(Device) { + assert(!AsyncInfoQueue && "Async tasks do not support predefined async queue pointers!"); // If we failed to acquired the current global thread id, we cannot // re-enqueue the current task. Thus we should use the local blocking async // info. 
@@ -425,6 +437,8 @@ int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, void *VAddr, bool IsRecord, bool SaveOutput, uint64_t &ReqPtrArgOffset); +void *__tgt_target_get_default_queue(void *Loc, int64_t DeviceId); + #ifdef __cplusplus } #endif diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index e678213df18ce..d9edfb0e8eab6 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -23,6 +25,7 @@ #include "Shared/APITypes.h" #include "Shared/Debug.h" #include "Shared/Environment.h" +#include "Shared/RefCnt.h" #include "Shared/Utils.h" #include "Utils/ELF.h" @@ -87,7 +90,7 @@ struct AMDGPUDeviceImageTy; struct AMDGPUMemoryManagerTy; struct AMDGPUMemoryPoolTy; -namespace utils { +namespace hsa_utils { /// Iterate elements using an HSA iterate function. Do not use this function /// directly but the specialized ones below instead. @@ -187,7 +190,7 @@ Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent, Expected getTargetTripleAndFeatures(hsa_agent_t Agent) { std::string Target; - auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) { + auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) { uint32_t Length; hsa_status_t Status; Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length); @@ -208,7 +211,7 @@ Expected getTargetTripleAndFeatures(hsa_agent_t Agent) { return Err; return Target; } -} // namespace utils +} // namespace hsa_utils /// Utility class representing generic resource references to AMDGPU resources. 
template @@ -481,7 +484,7 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy { findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const; /// Get additional info for kernel, e.g., register spill counts - std::optional + std::optional getKernelInfo(StringRef Identifier) const { auto It = KernelInfoMap.find(Identifier); @@ -495,7 +498,7 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy { /// The exectuable loaded on the agent. hsa_executable_t Executable; hsa_code_object_t CodeObject; - StringMap KernelInfoMap; + StringMap KernelInfoMap; uint16_t ELFABIVersion; }; @@ -545,7 +548,8 @@ struct AMDGPUKernelTy : public GenericKernelTy { // TODO: Read the kernel descriptor for the max threads per block. May be // read from the image. - ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion()); + ImplicitArgsSize = + hsa_utils::getImplicitArgsSize(AMDImage.getELFABIVersion()); DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion()); // Get additional kernel info read from image @@ -596,7 +600,7 @@ struct AMDGPUKernelTy : public GenericKernelTy { uint32_t ImplicitArgsSize; /// Additional Info for the AMD GPU Kernel - std::optional KernelInfo; + std::optional KernelInfo; }; /// Class representing an HSA signal. Signals are used to define dependencies @@ -685,12 +689,12 @@ struct AMDGPUQueueTy { AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {} /// Lazily initialize a new queue belonging to a specific agent. - Error init(hsa_agent_t Agent, int32_t QueueSize) { + Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) { if (Queue) return Plugin::success(); hsa_status_t Status = hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError, - nullptr, UINT32_MAX, UINT32_MAX, &Queue); + &Device, UINT32_MAX, UINT32_MAX, &Queue); return Plugin::check(Status, "Error in hsa_queue_create: %s"); } @@ -875,10 +879,8 @@ struct AMDGPUQueueTy { } /// Callack that will be called when an error is detected on the HSA queue. 
- static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) { - auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source); - FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data()); - } + static void callbackError(hsa_status_t Status, hsa_queue_t *Source, + void *Data); /// The HSA queue. hsa_queue_t *Queue; @@ -1268,13 +1270,14 @@ struct AMDGPUStreamTy { // Issue the async memory copy. if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent, - CopySize, 1, &InputSignalRaw, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, + Agent, CopySize, 1, &InputSignalRaw, + OutputSignal->get()); } - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent, - CopySize, 0, nullptr, OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, + Agent, CopySize, 0, nullptr, + OutputSignal->get()); } /// Push an asynchronous memory copy device-to-host involving an unpinned @@ -1308,14 +1311,14 @@ struct AMDGPUStreamTy { // dependency if already satisfied. if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - if (auto Err = utils::asyncMemCopy( + if (auto Err = hsa_utils::asyncMemCopy( UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1, &InputSignalRaw, OutputSignals[0]->get())) return Err; } else { - if (auto Err = utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, Agent, - Src, Agent, CopySize, 0, nullptr, - OutputSignals[0]->get())) + if (auto Err = hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, + Agent, Src, Agent, CopySize, 0, + nullptr, OutputSignals[0]->get())) return Err; } @@ -1406,12 +1409,13 @@ struct AMDGPUStreamTy { // dependency if already satisfied. 
if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, - Agent, CopySize, 1, &InputSignalRaw, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, + Agent, CopySize, 1, &InputSignalRaw, + OutputSignal->get()); } - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, Agent, - CopySize, 0, nullptr, OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, + Agent, CopySize, 0, nullptr, + OutputSignal->get()); } // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead @@ -1435,13 +1439,13 @@ struct AMDGPUStreamTy { if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, - SrcAgent, CopySize, 1, &InputSignalRaw, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, + SrcAgent, CopySize, 1, &InputSignalRaw, + OutputSignal->get()); } - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, - SrcAgent, CopySize, 0, nullptr, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, + SrcAgent, CopySize, 0, nullptr, + OutputSignal->get()); } /// Synchronize with the stream. 
The current thread waits until all operations @@ -1593,8 +1597,9 @@ struct AMDGPUStreamManagerTy final using ResourceRef = AMDGPUResourceRef; using ResourcePoolTy = GenericDeviceResourceManagerTy; + GenericDeviceTy &Device; AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent) - : GenericDeviceResourceManagerTy(Device), + : GenericDeviceResourceManagerTy(Device), Device(Device), OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true), NextQueue(0), Agent(HSAAgent) {} @@ -1603,7 +1608,7 @@ struct AMDGPUStreamManagerTy final QueueSize = HSAQueueSize; MaxNumQueues = NumHSAQueues; // Initialize one queue eagerly - if (auto Err = Queues.front().init(Agent, QueueSize)) + if (auto Err = Queues.front().init(Device, Agent, QueueSize)) return Err; return GenericDeviceResourceManagerTy::init(InitialSize); @@ -1660,7 +1665,7 @@ struct AMDGPUStreamManagerTy final } // Make sure the queue is initialized, then add user & assign. - if (auto Err = Queues[Index].init(Agent, QueueSize)) + if (auto Err = Queues[Index].init(Device, Agent, QueueSize)) return Err; Queues[Index].addUser(); Stream->Queue = &Queues[Index]; @@ -1799,7 +1804,7 @@ struct AMDHostDeviceTy : public AMDGenericDeviceTy { Error retrieveAllMemoryPools() override { // Iterate through the available pools across the host agents. 
for (hsa_agent_t Agent : Agents) { - Error Err = utils::iterateAgentMemoryPools( + Error Err = hsa_utils::iterateAgentMemoryPools( Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { AMDGPUMemoryPoolTy *MemoryPool = new AMDGPUMemoryPoolTy(HSAMemoryPool); @@ -1964,7 +1969,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { // Detect if XNACK is enabled auto TargeTripleAndFeaturesOrError = - utils::getTargetTripleAndFeatures(Agent); + hsa_utils::getTargetTripleAndFeatures(Agent); if (!TargeTripleAndFeaturesOrError) return TargeTripleAndFeaturesOrError.takeError(); if (static_cast(*TargeTripleAndFeaturesOrError) @@ -2208,8 +2213,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Err; // Once the stream is synchronized, return it to stream pool and reset - // AsyncInfo. This is to make sure the synchronization only works for its - // own tasks. + // AsyncInfo if the queue is not persistent. This is to make sure the + // synchronization only works for its own tasks. + if (AsyncInfo.PersistentQueue) + return Plugin::success(); + AsyncInfo.Queue = nullptr; return AMDGPUStreamManager.returnResource(Stream); } @@ -2228,9 +2236,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (!(*CompletedOrErr)) return Plugin::success(); - // Once the stream is completed, return it to stream pool and reset - // AsyncInfo. This is to make sure the synchronization only works for its - // own tasks. + // Once the stream is synchronized, return it to stream pool and reset + // AsyncInfo if the queue is not persistent. This is to make sure the + // synchronization only works for its own tasks. 
+ if (AsyncInfo.PersistentQueue) + return Plugin::success(); + AsyncInfo.Queue = nullptr; return AMDGPUStreamManager.returnResource(Stream); } @@ -2316,9 +2327,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = Signal.init()) return Err; - if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr, - Agent, PinnedPtr, Agent, Size, 0, - nullptr, Signal.get())) + if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr, + Agent, PinnedPtr, Agent, Size, 0, + nullptr, Signal.get())) return Err; if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) @@ -2376,9 +2387,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = Signal.init()) return Err; - if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), PinnedPtr, - Agent, TgtPtr, Agent, Size, 0, nullptr, - Signal.get())) + if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), + PinnedPtr, Agent, TgtPtr, Agent, + Size, 0, nullptr, Signal.get())) return Err; if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) @@ -2420,7 +2431,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = Signal.init()) return Err; - if (auto Err = utils::asyncMemCopy( + if (auto Err = hsa_utils::asyncMemCopy( useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr, getAgent(), (uint64_t)Size, 0, nullptr, Signal.get())) return Err; @@ -2443,7 +2454,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// Initialize the async info for interoperability purposes. Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { - // TODO: Implement this function. 
+ AMDGPUStreamTy *Stream; + if (auto Err = getStream(AsyncInfoWrapper, Stream)) + return Err; + return Plugin::success(); } @@ -2686,7 +2700,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { } Info.add("ISAs"); - auto Err = utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) { + auto Err = hsa_utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) { Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar); if (Status == HSA_STATUS_SUCCESS) Info.add("Name", TmpChar); @@ -2768,7 +2782,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// Retrieve and construct all memory pools of the device agent. Error retrieveAllMemoryPools() override { // Iterate through the available pools of the device agent. - return utils::iterateAgentMemoryPools( + return hsa_utils::iterateAgentMemoryPools( Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { AMDGPUMemoryPoolTy *MemoryPool = Plugin.allocate(); @@ -2954,7 +2968,7 @@ Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) { if (Result) return Plugin::error("Loaded HSA executable does not validate"); - if (auto Err = utils::readAMDGPUMetaDataFromImage( + if (auto Err = hsa_utils::readAMDGPUMetaDataFromImage( getMemoryBuffer(), KernelInfoMap, ELFABIVersion)) return Err; @@ -3083,7 +3097,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy { llvm::SmallVector HostAgents; // Count the number of available agents. - auto Err = utils::iterateAgents([&](hsa_agent_t Agent) { + auto Err = hsa_utils::iterateAgents([&](hsa_agent_t Agent) { // Get the device type of the agent. 
hsa_device_type_t DeviceType; hsa_status_t Status = @@ -3178,12 +3192,12 @@ struct AMDGPUPluginTy final : public GenericPluginTy { return false; auto TargeTripleAndFeaturesOrError = - utils::getTargetTripleAndFeatures(getKernelAgent(DeviceId)); + hsa_utils::getTargetTripleAndFeatures(getKernelAgent(DeviceId)); if (!TargeTripleAndFeaturesOrError) return TargeTripleAndFeaturesOrError.takeError(); - return utils::isImageCompatibleWithEnv(Processor ? *Processor : "", - ElfOrErr->getPlatformFlags(), - *TargeTripleAndFeaturesOrError); + return hsa_utils::isImageCompatibleWithEnv(Processor ? *Processor : "", + ElfOrErr->getPlatformFlags(), + *TargeTripleAndFeaturesOrError); } bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override { @@ -3295,11 +3309,11 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, if (auto Err = GenericDevice.getDeviceStackSize(StackSize)) return Err; - utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr; + hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr; if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) { // Initialize implicit arguments. - ImplArgs = reinterpret_cast( - advanceVoidPtr(AllArgs, LaunchParams.Size)); + ImplArgs = reinterpret_cast( + utils::advancePtr(AllArgs, LaunchParams.Size)); // Initialize the implicit arguments to zero. std::memset(ImplArgs, 0, getImplicitArgsSize()); @@ -3323,7 +3337,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, // Only COV5 implicitargs needs to be set. COV4 implicitargs are not used. 
if (ImplArgs && - getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) { + getImplicitArgsSize() == sizeof(hsa_utils::AMDGPUImplicitArgsTy)) { ImplArgs->BlockCountX = NumBlocks; ImplArgs->BlockCountY = 1; ImplArgs->BlockCountZ = 1; @@ -3480,6 +3494,16 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { return Alloc; } +void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source, + void *Data) { + + auto *Device = reinterpret_cast(Data); + Device->reportSanitizerError(); + + auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source); + FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data()); +} + } // namespace plugin } // namespace target } // namespace omp diff --git a/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h index 58a3b5df00fac..1e99d0a30bdf2 100644 --- a/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h +++ b/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h @@ -33,7 +33,7 @@ namespace llvm { namespace omp { namespace target { namespace plugin { -namespace utils { +namespace hsa_utils { // The implicit arguments of COV5 AMDGPU kernels. 
struct AMDGPUImplicitArgsTy { @@ -310,7 +310,7 @@ readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer, return Error::success(); } -} // namespace utils +} // namespace hsa_utils } // namespace plugin } // namespace target } // namespace omp diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h index 829b4b7291193..b88d3e52bbf43 100644 --- a/offload/plugins-nextgen/common/include/GlobalHandler.h +++ b/offload/plugins-nextgen/common/include/GlobalHandler.h @@ -50,6 +50,9 @@ class GlobalTy { const std::string &getName() const { return Name; } uint32_t getSize() const { return Size; } void *getPtr() const { return Ptr; } + template T *getPtrAs() const { + return reinterpret_cast(Ptr); + } void setSize(int32_t S) { Size = S; } void setPtr(void *P) { Ptr = P; } diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 0d2a36a42d5fa..e1e5b5e05acdc 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -24,6 +24,7 @@ #include "Shared/Environment.h" #include "Shared/EnvironmentVar.h" #include "Shared/Requirements.h" +#include "Shared/Sanitizer.h" #include "Shared/Utils.h" #include "GlobalHandler.h" @@ -231,7 +232,7 @@ class DeviceImageTy { /// Get the image size. size_t getSize() const { - return getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart); + return utils::getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart); } /// Get a memory buffer reference to the whole image. @@ -471,7 +472,7 @@ class PinnedAllocationMapTy { --It; // The buffer is not contained in the pinned allocation. - if (advanceVoidPtr(It->HstPtr, It->Size) > HstPtr) + if (utils::advancePtr(It->HstPtr, It->Size) > HstPtr) return &(*It); // None found. @@ -498,15 +499,15 @@ class PinnedAllocationMapTy { /// Indicate whether the first range A fully contains the second range B. 
static bool contains(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) { - void *EndA = advanceVoidPtr(PtrA, SizeA); - void *EndB = advanceVoidPtr(PtrB, SizeB); + void *EndA = utils::advancePtr(PtrA, SizeA); + void *EndB = utils::advancePtr(PtrB, SizeB); return (PtrB >= PtrA && EndB <= EndA); } /// Indicate whether the first range A intersects with the second range B. static bool intersects(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) { - void *EndA = advanceVoidPtr(PtrA, SizeA); - void *EndB = advanceVoidPtr(PtrB, SizeB); + void *EndA = utils::advancePtr(PtrA, SizeA); + void *EndB = utils::advancePtr(PtrB, SizeB); return (PtrA < EndB && PtrB < EndA); } @@ -588,8 +589,8 @@ class PinnedAllocationMapTy { if (!Entry) return nullptr; - return advanceVoidPtr(Entry->DevAccessiblePtr, - getPtrDiff(HstPtr, Entry->HstPtr)); + return utils::advancePtr(Entry->DevAccessiblePtr, + utils::getPtrDiff(HstPtr, Entry->HstPtr)); } /// Check whether a buffer belongs to a registered host pinned allocation. @@ -601,6 +602,22 @@ class PinnedAllocationMapTy { } }; +struct GPUSanTy { + GPUSanTy(GenericDeviceTy &Device) : Device(Device) {} + Error notifyDataMapped(void *DevicePtr, uint64_t Size, void *&FakeHstPtr); + Error notifyDataUnmapped(void *FakeHstPtr); + + void addGPUSanNewFn(GenericKernelTy &GK) { NewFns.push_back(&GK); } + void addGPUSanFreeFn(GenericKernelTy &GK) { FreeFns.push_back(&GK); } + void checkAndReportError(); + +private: + uint32_t SlotCnt = SanitizerConfig::SLOTS - 1; + GenericDeviceTy &Device; + SmallVector NewFns; + SmallVector FreeFns; +}; + /// Class implementing common functionalities of offload devices. Each plugin /// should define the specific device class, derive from this generic one, and /// implement the necessary virtual function members. @@ -718,14 +735,23 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// buffer (e.g., because a user OpenMP target map) and the buffer may be used /// as source/destination of memory transfers. 
We can use this information to /// lock the host buffer and optimize its memory transfers. - Error notifyDataMapped(void *HstPtr, int64_t Size) { + Error notifyDataMapped(void *HstPtr, void *DevicePtr, int64_t Size, + void *&FakeHstPtr) { + if (auto Err = GPUSan.notifyDataMapped(DevicePtr, Size, FakeHstPtr)) + return Err; + if (!HstPtr) + return Error::success(); return PinnedAllocs.lockMappedHostBuffer(HstPtr, Size); } /// Mark the host buffer with address \p HstPtr as unmapped. This means that /// libomptarget removed an existing mapping. If the plugin locked the buffer /// in notifyDataMapped, this function should unlock it. - Error notifyDataUnmapped(void *HstPtr) { + Error notifyDataUnmapped(void *HstPtr, void *FakeHstPtr) { + if (auto Err = GPUSan.notifyDataUnmapped(FakeHstPtr)) + return Err; + if (!HstPtr) + return Error::success(); return PinnedAllocs.unlockUnmappedHostBuffer(HstPtr); } @@ -736,6 +762,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy { void *&BaseDevAccessiblePtr, size_t &BaseSize) const = 0; + void addGPUSanNewFn(GenericKernelTy &GK) { GPUSan.addGPUSanNewFn(GK); } + void addGPUSanFreeFn(GenericKernelTy &GK) { GPUSan.addGPUSanFreeFn(GK); } + void reportSanitizerError() { GPUSan.checkAndReportError(); } + /// Submit data to the device (host to device transfer). Error dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size, __tgt_async_info *AsyncInfo); @@ -857,6 +887,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// Allocate and construct a kernel object. virtual Expected constructKernel(const char *Name) = 0; + DenseMap SanitizerTrapInfos; + /// Reference to the underlying plugin that created this device. 
GenericPluginTy &Plugin; @@ -950,6 +982,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy { #endif private: + GPUSanTy GPUSan; + DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0}; DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0}; }; @@ -1118,10 +1152,12 @@ struct GenericPluginTy { int32_t data_unlock(int32_t DeviceId, void *Ptr); /// Notify the runtime about a new mapping that has been created outside. - int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr, int64_t Size); + int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr, void *DevicePtr, + int64_t Size, void *&FakeHstPtr); /// Notify t he runtime about a mapping that has been deleted. - int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr); + int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr, + void *FakeHstPtr); /// Copy data to the given device. int32_t data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr, diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp index ba0aa47f8e51c..e18f358af116c 100644 --- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp @@ -152,8 +152,8 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device, HostGlobal.getPtr()); assert(Image.getStart() <= ImageGlobal.getPtr() && - advanceVoidPtr(ImageGlobal.getPtr(), ImageGlobal.getSize()) < - advanceVoidPtr(Image.getStart(), Image.getSize()) && + utils::advancePtr(ImageGlobal.getPtr(), ImageGlobal.getSize()) < + utils::advancePtr(Image.getStart(), Image.getSize()) && "Attempting to read outside the image!"); // Perform the copy from the image to the host memory. 
diff --git a/offload/plugins-nextgen/common/src/JIT.cpp b/offload/plugins-nextgen/common/src/JIT.cpp index 9dbba1459839d..9adb62b677b92 100644 --- a/offload/plugins-nextgen/common/src/JIT.cpp +++ b/offload/plugins-nextgen/common/src/JIT.cpp @@ -51,7 +51,7 @@ namespace { bool isImageBitcode(const __tgt_device_image &Image) { StringRef Binary(reinterpret_cast(Image.ImageStart), - target::getPtrDiff(Image.ImageEnd, Image.ImageStart)); + utils::getPtrDiff(Image.ImageEnd, Image.ImageStart)); return identify_magic(Binary) == file_magic::bitcode; } @@ -69,7 +69,7 @@ createModuleFromMemoryBuffer(std::unique_ptr &MB, Expected> createModuleFromImage(const __tgt_device_image &Image, LLVMContext &Context) { StringRef Data((const char *)Image.ImageStart, - target::getPtrDiff(Image.ImageEnd, Image.ImageStart)); + utils::getPtrDiff(Image.ImageEnd, Image.ImageStart)); std::unique_ptr MB = MemoryBuffer::getMemBuffer( Data, /*BufferName=*/"", /*RequiresNullTerminator=*/false); return createModuleFromMemoryBuffer(MB, Context); diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 94f9d4670b672..fe194a42acaab 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -16,14 +16,20 @@ #include "GlobalHandler.h" #include "JIT.h" +#include "Shared/Sanitizer.h" +#include "Shared/Utils.h" #include "Utils/ELF.h" #include "omptarget.h" +#include "llvm/Support/ErrorHandling.h" +#include +#include #ifdef OMPT_SUPPORT #include "OpenMP/OMPT/Callback.h" #include "omp-tools.h" #endif +#include "llvm/ADT/StringRef.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Support/Error.h" @@ -74,7 +80,7 @@ struct RecordReplayTy { Device->allocate(1024, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT); Device->free(Addr); // Align Address to MaxMemoryAllocation - Addr = (void *)alignPtr((Addr), 
MaxMemoryAllocation); + Addr = (void *)utils::alignPtr((Addr), MaxMemoryAllocation); return Addr; } @@ -207,8 +213,8 @@ struct RecordReplayTy { if (EC) report_fatal_error("Error saving image : " + StringRef(EC.message())); if (const auto *TgtImageBitcode = Image.getTgtImageBitcode()) { - size_t Size = - getPtrDiff(TgtImageBitcode->ImageEnd, TgtImageBitcode->ImageStart); + size_t Size = utils::getPtrDiff(TgtImageBitcode->ImageEnd, + TgtImageBitcode->ImageStart); MemoryBufferRef MBR = MemoryBufferRef( StringRef((const char *)TgtImageBitcode->ImageStart, Size), ""); OS << MBR.getBuffer(); @@ -241,10 +247,10 @@ struct RecordReplayTy { int32_t NameLength = std::strlen(OffloadEntry.Name) + 1; memcpy(BufferPtr, OffloadEntry.Name, NameLength); - BufferPtr = advanceVoidPtr(BufferPtr, NameLength); + BufferPtr = utils::advancePtr(BufferPtr, NameLength); *((uint32_t *)(BufferPtr)) = OffloadEntry.Size; - BufferPtr = advanceVoidPtr(BufferPtr, sizeof(uint32_t)); + BufferPtr = utils::advancePtr(BufferPtr, sizeof(uint32_t)); auto Err = Plugin::success(); { @@ -254,11 +260,12 @@ struct RecordReplayTy { } if (Err) report_fatal_error("Error retrieving data for global"); - BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.Size); + BufferPtr = utils::advancePtr(BufferPtr, OffloadEntry.Size); } assert(BufferPtr == GlobalsMB->get()->getBufferEnd() && "Buffer over/under-filled."); - assert(Size == getPtrDiff(BufferPtr, GlobalsMB->get()->getBufferStart()) && + assert(Size == utils::getPtrDiff(BufferPtr, + GlobalsMB->get()->getBufferStart()) && "Buffer size mismatch"); StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), Size); @@ -549,9 +556,16 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, if (!KernelLaunchEnvOrErr) return KernelLaunchEnvOrErr.takeError(); - KernelLaunchParamsTy LaunchParams = - prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, Args, - Ptrs, *KernelLaunchEnvOrErr); + KernelLaunchParamsTy LaunchParams; + + // Kernel 
languages don't use indirection. + if (KernelArgs.Flags.IsCUDA) { + LaunchParams = *reinterpret_cast(KernelArgs.ArgPtrs); + } else { + LaunchParams = + prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, + Args, Ptrs, *KernelLaunchEnvOrErr); + } uint32_t NumThreads = getNumThreads(GenericDevice, KernelArgs.ThreadLimit); uint64_t NumBlocks = @@ -723,7 +737,7 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1), DeviceId(DeviceId), GridValues(OMPGridValues), PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(), - PinnedAllocs(*this), RPCServer(nullptr) { + PinnedAllocs(*this), RPCServer(nullptr), GPUSan(*this) { #ifdef OMPT_SUPPORT OmptInitialized.store(false); // Bind the callbacks to this device's member functions @@ -903,7 +917,7 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, #ifdef OMPT_SUPPORT if (ompt::Initialized) { size_t Bytes = - getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart); + utils::getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart); performOmptCallback( device_load, Plugin.getUserId(DeviceId), /*FileName=*/nullptr, /*FileOffset=*/0, /*VmaInFile=*/nullptr, @@ -916,6 +930,25 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, if (auto Err = callGlobalConstructors(Plugin, *Image)) return std::move(Err); + auto GetKernel = [&](StringRef Name) -> GenericKernelTy * { + auto KernelOrErr = constructKernel(Name.data()); + if (Error Err = KernelOrErr.takeError()) { + REPORT("Failure to look up kernel: %s\n", + toString(std::move(Err)).data()); + return nullptr; + } + GenericKernelTy &Kernel = *KernelOrErr; + if (auto Err = Kernel.init(*this, *Image)) { + REPORT("Failure to init kernel: %s\n", toString(std::move(Err)).data()); + return nullptr; + } + return &Kernel; + }; + if (GenericKernelTy *Kernel = GetKernel("__sanitizer_register")) + addGPUSanNewFn(*Kernel); + if (GenericKernelTy *Kernel = 
GetKernel("__sanitizer_unregister")) + addGPUSanFreeFn(*Kernel); + // Return the pointer to the table of entries. return Image; } @@ -994,6 +1027,16 @@ Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin, if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal)) return Err; + auto *&SanitizerTrapInfo = SanitizerTrapInfos[&Image]; + SanitizerTrapInfo = reinterpret_cast(allocate( + sizeof(*SanitizerTrapInfo), &SanitizerTrapInfo, TARGET_ALLOC_HOST)); + memset(SanitizerTrapInfo, '\0', sizeof(SanitizerTrapInfoTy)); + + GlobalTy TrapId("__sanitizer_trap_info_ptr", sizeof(SanitizerTrapInfo), + &SanitizerTrapInfo); + if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrapId)) + return Err; + // Create the metainfo of the device environment global. GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool", sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool); @@ -1131,8 +1174,8 @@ Expected PinnedAllocationMapTy::lockHostBuffer(void *HstPtr, return std::move(Err); // Return the device accessible pointer with the correct offset. - return advanceVoidPtr(Entry->DevAccessiblePtr, - getPtrDiff(HstPtr, Entry->HstPtr)); + return utils::advancePtr(Entry->DevAccessiblePtr, + utils::getPtrDiff(HstPtr, Entry->HstPtr)); } // No intersecting registered allocation found in the map. 
First, lock the @@ -1428,8 +1471,10 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs, Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) { assert(AsyncInfoPtr && "Invalid async info"); + assert(!(*AsyncInfoPtr) && "Already initialized async info"); *AsyncInfoPtr = new __tgt_async_info(); + (*AsyncInfoPtr)->PersistentQueue = true; AsyncInfoWrapperTy AsyncInfoWrapper(*this, *AsyncInfoPtr); @@ -1607,7 +1652,7 @@ int32_t GenericPluginTy::is_initialized() const { return Initialized; } int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) { StringRef Buffer(reinterpret_cast(Image->ImageStart), - target::getPtrDiff(Image->ImageEnd, Image->ImageStart)); + utils::getPtrDiff(Image->ImageEnd, Image->ImageStart)); auto HandleError = [&](Error Err) -> bool { [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); @@ -1639,7 +1684,7 @@ int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) { int32_t GenericPluginTy::is_device_compatible(int32_t DeviceId, __tgt_device_image *Image) { StringRef Buffer(reinterpret_cast(Image->ImageStart), - target::getPtrDiff(Image->ImageEnd, Image->ImageStart)); + utils::getPtrDiff(Image->ImageEnd, Image->ImageStart)); auto HandleError = [&](Error Err) -> bool { [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); @@ -1801,8 +1846,10 @@ int32_t GenericPluginTy::data_unlock(int32_t DeviceId, void *Ptr) { } int32_t GenericPluginTy::data_notify_mapped(int32_t DeviceId, void *HstPtr, - int64_t Size) { - auto Err = getDevice(DeviceId).notifyDataMapped(HstPtr, Size); + void *DevicePtr, int64_t Size, + void *&FakeHstPtr) { + auto Err = + getDevice(DeviceId).notifyDataMapped(HstPtr, DevicePtr, Size, FakeHstPtr); if (Err) { REPORT("Failure to notify data mapped %p: %s\n", HstPtr, toString(std::move(Err)).data()); @@ -1812,8 +1859,9 @@ int32_t GenericPluginTy::data_notify_mapped(int32_t DeviceId, void *HstPtr, return OFFLOAD_SUCCESS; } -int32_t 
GenericPluginTy::data_notify_unmapped(int32_t DeviceId, void *HstPtr) { - auto Err = getDevice(DeviceId).notifyDataUnmapped(HstPtr); +int32_t GenericPluginTy::data_notify_unmapped(int32_t DeviceId, void *HstPtr, + void *FakeHstPtr) { + auto Err = getDevice(DeviceId).notifyDataUnmapped(HstPtr, FakeHstPtr); if (Err) { REPORT("Failure to notify data unmapped %p: %s\n", HstPtr, toString(std::move(Err)).data()); @@ -2088,6 +2136,201 @@ int32_t GenericPluginTy::get_function(__tgt_device_binary Binary, return OFFLOAD_SUCCESS; } +Error GPUSanTy::notifyDataMapped(void *DevicePtr, uint64_t Size, + void *&FakeHstPtr) { + FakeHstPtr = nullptr; + if (NewFns.empty()) + return Plugin::success(); + uint64_t Slot = SlotCnt--; + FakeHstPtr = __offload_get_new_sanitizer_ptr(Slot); + KernelArgsTy Args = {}; + Args.NumTeams[0] = 1; + Args.ThreadLimit[0] = 1; + AsyncInfoWrapperTy AsyncInfoWrapper(Device, nullptr); + for (GenericKernelTy *NewFP : NewFns) { + struct { + void *Ptr; + uint64_t Length; + uint64_t Slot; + } KernelArgs{DevicePtr, Size, Slot}; + KernelLaunchParamsTy ArgPtrs{sizeof(KernelArgs), &KernelArgs, nullptr}; + Args.ArgPtrs = reinterpret_cast(&ArgPtrs); + Args.Flags.IsCUDA = true; + if (auto Err = NewFP->launch(Device, Args.ArgPtrs, nullptr, Args, + AsyncInfoWrapper)) { + AsyncInfoWrapper.finalize(Err); + return Err; + } + } + + Error Err = Plugin::success(); + AsyncInfoWrapper.finalize(Err); + return Err; +} + +Error GPUSanTy::notifyDataUnmapped(void *FakeHstPtr) { + if (!FakeHstPtr) + return Plugin::success(); + KernelArgsTy Args = {}; + Args.NumTeams[0] = 1; + Args.ThreadLimit[0] = 1; + AsyncInfoWrapperTy AsyncInfoWrapper(Device, nullptr); + for (GenericKernelTy *FreeFn : FreeFns) { + KernelLaunchParamsTy ArgPtrs{sizeof(void *), &FakeHstPtr, nullptr}; + Args.ArgPtrs = reinterpret_cast(&ArgPtrs); + Args.Flags.IsCUDA = true; + if (auto Err = FreeFn->launch(Device, Args.ArgPtrs, nullptr, Args, + AsyncInfoWrapper)) { + AsyncInfoWrapper.finalize(Err); + return Err; + } + } 
+ Error Err = Plugin::success(); + AsyncInfoWrapper.finalize(Err); + return Err; +} + +void GPUSanTy::checkAndReportError() { + SanitizerTrapInfoTy *STI; + DeviceImageTy *Image = nullptr; + for (auto &It : Device.SanitizerTrapInfos) { + STI = It.second; + if (!STI || STI->ErrorCode == SanitizerTrapInfoTy::None) + continue; + Image = It.first; + break; + } + if (!Image) + return; + + auto Green = []() { return "\033[1m\033[32m"; }; + auto Blue = []() { return "\033[1m\033[34m"; }; + auto Red = []() { return "\033[1m\033[31m"; }; + auto Default = []() { return "\033[1m\033[0m"; }; + + GenericGlobalHandlerTy &GHandler = Device.Plugin.getGlobalHandler(); + auto GetImagePtr = [&](GlobalTy &GV, bool Quiet = false) { + if (auto Err = GHandler.getGlobalMetadataFromImage(Device, *Image, GV)) { + if (Quiet) + consumeError(std::move(Err)); + else + REPORT("WARNING: Failed to read backtrace " + "(%s)\n", + toString(std::move(Err)).data()); + return false; + } + return true; + }; + GlobalTy LocationsGV("__san.locations", -1); + GlobalTy LocationNamesGV("__san.location_names", -1); + GlobalTy AmbiguousCallsBitWidthGV("__san.num_ambiguous_calls", -1); + GlobalTy AmbiguousCallsLocationsGV("__san.ambiguous_calls_mapping", -1); + if (GetImagePtr(LocationsGV)) + GetImagePtr(LocationNamesGV); + GetImagePtr(AmbiguousCallsBitWidthGV, /*Quiet=*/true); + GetImagePtr(AmbiguousCallsLocationsGV, /*Quiet=*/true); + + fprintf(stderr, "============================================================" + "====================\n"); + + auto PrintStackTrace = [&](int64_t LocationId) { + if (!LocationsGV.getPtr() || !LocationNamesGV.getPtr()) { + fprintf(stderr, " no backtrace available\n"); + return; + } + char *LocationNames = LocationNamesGV.getPtrAs(); + LocationEncodingTy *Locations = LocationsGV.getPtrAs(); + uint64_t *AmbiguousCallsBitWidth = + AmbiguousCallsBitWidthGV.getPtrAs(); + uint64_t *AmbiguousCallsLocations = + AmbiguousCallsLocationsGV.getPtrAs(); + int32_t FrameIdx = 0; + do { + 
LocationEncodingTy &LE = Locations[LocationId]; + fprintf(stderr, " #%i %s in %s:%lu:%lu\n", FrameIdx, + &LocationNames[LE.FunctionNameIdx], + &LocationNames[LE.FileNameIdx], LE.LineNo, LE.ColumnNo); + LocationId = LE.ParentIdx; + FrameIdx++; + if (LocationId < 0 && STI->CallId != 0 && AmbiguousCallsBitWidth && + AmbiguousCallsLocations) { + uint64_t LastCallId = + STI->CallId & ((1 << *AmbiguousCallsBitWidth) - 1); + LocationId = AmbiguousCallsLocations[LastCallId - 1]; + STI->CallId >>= (*AmbiguousCallsBitWidth); + } + } while (LocationId >= 0); + fputc('\n', stderr); + }; + + auto DiagnoseAccess = [&](StringRef Name) { + void *PC = reinterpret_cast(STI->PC); + void *Addr = utils::advancePtr(STI->AllocationStart, STI->PtrOffset); + fprintf(stderr, + "%sERROR: OffloadSanitizer %s access on address " DPxMOD + " at pc " DPxMOD "\n%s", + Red(), Name.data(), DPxPTR(Addr), DPxPTR(PC), Default()); + fprintf(stderr, + "%s%s of size %u at " DPxMOD + " thread <%u, %u, %u> block <%lu, %lu, %lu> (acc %li, %s)\n%s", + Blue(), STI->AccessId > 0 ? "WRITE" : "READ", STI->AccessSize, + DPxPTR(Addr), STI->ThreadId[0], STI->ThreadId[1], STI->ThreadId[2], + STI->BlockId[0], STI->BlockId[1], STI->BlockId[2], STI->AccessId, + (STI->AllocationKind ? 
"heap" : "stack"), Default()); + PrintStackTrace(STI->LocationId); + fprintf( + stderr, + "%s" DPxMOD " is located %lu bytes inside of a %lu-byte region [" DPxMOD + "," DPxMOD ")\n%s", + Green(), DPxPTR(Addr), STI->PtrOffset, STI->AllocationLength, + DPxPTR(STI->AllocationStart), + DPxPTR(utils::advancePtr(STI->AllocationStart, STI->AllocationLength)), + Default()); + fprintf(stderr, + "%s Pointer[slot:%lu,tag:%u,kind:%i] " + "Allocation[slot:%d,tag:%u,kind:%i]\n%s", + Green(), STI->PtrSlot, STI->PtrTag, STI->PtrKind, STI->AllocationId, + STI->AllocationTag, STI->AllocationKind, Default()); + }; + + switch (STI->ErrorCode) { + case SanitizerTrapInfoTy::None: + llvm_unreachable("Unexpected exception"); + case SanitizerTrapInfoTy::ExceedsLength: + fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(), "exceeds length", + Default()); + break; + case SanitizerTrapInfoTy::ExceedsSlots: + fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(), "exceeds slots", + Default()); + break; + case SanitizerTrapInfoTy::PointerOutsideAllocation: + fprintf(stderr, "%sERROR: OffloadSanitizer %s : %p : %i %lu (%s)\n%s", + Red(), "outside allocation", STI->AllocationStart, + STI->AllocationId, STI->PtrSlot, + (STI->AllocationKind ? 
"heap" : "stack"), Default()); + break; + case SanitizerTrapInfoTy::OutOfBounds: { + DiagnoseAccess("out-of-bounds"); + break; + } + case SanitizerTrapInfoTy::UseAfterScope: + DiagnoseAccess("use-after-scope"); + break; + case SanitizerTrapInfoTy::UseAfterFree: + DiagnoseAccess("use-after-free"); + break; + case SanitizerTrapInfoTy::MemoryLeak: + fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(), "memory leak", + Default()); + break; + case SanitizerTrapInfoTy::GarbagePointer: + DiagnoseAccess("garbage-pointer"); + break; + } + fflush(stderr); +} + bool llvm::omp::target::plugin::libomptargetSupportsRPC() { #ifdef LIBOMPTARGET_RPC_SUPPORT return true; diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index b6465d61bd033..6ba51ae0db565 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -643,8 +643,11 @@ struct CUDADeviceTy : public GenericDeviceTy { } // Once the stream is synchronized, return it to stream pool and reset - // AsyncInfo. This is to make sure the synchronization only works for its - // own tasks. + // AsyncInfo if the queue is not persistent. This is to make sure the + // synchronization only works for its own tasks. + if (AsyncInfo.PersistentQueue) + return Plugin::success(); + AsyncInfo.Queue = nullptr; if (auto Err = CUDAStreamManager.returnResource(Stream)) return Err; @@ -705,7 +708,7 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::error("Wrong device Page size"); // Ceil to page size. - Size = roundUp(Size, Granularity); + Size = utils::roundUp(Size, Granularity); // Create a handler of our allocation CUmemGenericAllocationHandle AHandle; @@ -777,9 +780,12 @@ struct CUDADeviceTy : public GenericDeviceTy { if (Res == CUDA_ERROR_NOT_READY) return Plugin::success(); - // Once the stream is synchronized and the operations completed (or an error - // occurs), return it to stream pool and reset AsyncInfo. 
This is to make - // sure the synchronization only works for its own tasks. + // Once the stream is synchronized, return it to stream pool and reset + // AsyncInfo if the queue is not persistent. This is to make sure the + // synchronization only works for its own tasks. + if (AsyncInfo.PersistentQueue) + return Plugin::success(); + AsyncInfo.Queue = nullptr; if (auto Err = CUDAStreamManager.returnResource(Stream)) return Err; diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt index efa5cdab33ec9..0f30f6028f103 100644 --- a/offload/src/CMakeLists.txt +++ b/offload/src/CMakeLists.txt @@ -22,6 +22,7 @@ add_llvm_library(omptarget OpenMP/InteropAPI.cpp OpenMP/OMPT/Callback.cpp + KernelLanguage/API.cpp ADDITIONAL_HEADER_DIRS ${LIBOMPTARGET_INCLUDE_DIR} @@ -60,6 +61,23 @@ endforeach() target_compile_options(omptarget PRIVATE ${offload_compile_flags}) target_link_options(omptarget PRIVATE ${offload_link_flags}) +add_llvm_library(offload.kernels + STATIC + + Kernels/Sanitizer.cpp + + LINK_LIBS + PUBLIC + omptarget.devicertl + + NO_INSTALL_RPATH + BUILDTREE_ONLY +) + +list(JOIN LIBOMPTARGET_DEVICE_ARCHITECTURES "," KERNEL_OFFLOAD_ARCHS) +target_compile_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -fopenmp-target-jit -foffload-via-llvm ) +target_link_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -fopenmp-target-jit -foffload-via-llvm ) + # libomptarget.so needs to be aware of where the plugins live as they # are now separated in the build directory. 
set_target_properties(omptarget PROPERTIES @@ -67,3 +85,4 @@ set_target_properties(omptarget PROPERTIES INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..") install(TARGETS omptarget LIBRARY COMPONENT omptarget DESTINATION "${OFFLOAD_INSTALL_LIBDIR}") +install(TARGETS offload.kernels LIBRARY COMPONENT offload.kernels DESTINATION "${OFFLOAD_INSTALL_LIBDIR}") diff --git a/offload/src/DeviceImage.cpp b/offload/src/DeviceImage.cpp index e42460b5cca4f..e5b4bf5526437 100644 --- a/offload/src/DeviceImage.cpp +++ b/offload/src/DeviceImage.cpp @@ -27,9 +27,8 @@ DeviceImageTy::DeviceImageTy(__tgt_bin_desc &BinaryDesc, __tgt_device_image &TgtDeviceImage) : BinaryDesc(&BinaryDesc), Image(TgtDeviceImage) { - llvm::StringRef ImageStr( - static_cast(Image.ImageStart), - llvm::omp::target::getPtrDiff(Image.ImageEnd, Image.ImageStart)); + llvm::StringRef ImageStr(static_cast(Image.ImageStart), + utils::getPtrDiff(Image.ImageEnd, Image.ImageStart)); auto BinaryOrErr = llvm::object::OffloadBinary::create(llvm::MemoryBufferRef(ImageStr, "")); diff --git a/offload/src/KernelLanguage/API.cpp b/offload/src/KernelLanguage/API.cpp new file mode 100644 index 0000000000000..779751deed661 --- /dev/null +++ b/offload/src/KernelLanguage/API.cpp @@ -0,0 +1,86 @@ +//===------ API.cpp - Kernel Language (CUDA/HIP) entry points ----- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "llvm/Frontend/OpenMP/OMPConstants.h" + +#include "Shared/APITypes.h" + +#include +#include + +struct dim3 { + unsigned x = 0, y = 0, z = 0; +}; + +struct __omp_kernel_t { + dim3 __grid_size; + dim3 __block_size; + size_t __shared_memory; + + void *__stream; +}; + +static __omp_kernel_t __current_kernel = {}; +#pragma omp threadprivate(__current_kernel); + +extern "C" { + +// TODO: There is little reason we need to keep these names or the way calls are +// issued. For now we do to avoid modifying Clang's CUDA codegen. Unclear when +// we actually need to push/pop configurations. +unsigned __llvmPushCallConfiguration(dim3 __grid_size, dim3 __block_size, + size_t __shared_memory, void *__stream) { + __omp_kernel_t &__kernel = __current_kernel; + __kernel.__grid_size = __grid_size; + __kernel.__block_size = __block_size; + __kernel.__shared_memory = __shared_memory; + __kernel.__stream = __stream; + return 0; +} + +unsigned __llvmPopCallConfiguration(dim3 *__grid_size, dim3 *__block_size, + size_t *__shared_memory, void *__stream) { + __omp_kernel_t &__kernel = __current_kernel; + *__grid_size = __kernel.__grid_size; + *__block_size = __kernel.__block_size; + *__shared_memory = __kernel.__shared_memory; + *((void **)__stream) = __kernel.__stream; + return 0; +} + +int __tgt_target_kernel(void *Loc, int64_t DeviceId, int32_t NumTeams, + int32_t ThreadLimit, const void *HostPtr, + KernelArgsTy *Args); +void *__tgt_target_get_default_async_info_queue(void *Loc, int64_t DeviceId); + +unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, + void *args, size_t sharedMem, void *stream) { + int64_t DeviceNo = 0; + KernelArgsTy Args = {}; + Args.Version = OMP_KERNEL_ARG_VERSION; + Args.DynCGroupMem = sharedMem; + 
Args.NumTeams[0] = gridDim.x; + Args.NumTeams[1] = gridDim.y; + Args.NumTeams[2] = gridDim.z; + Args.ThreadLimit[0] = blockDim.x; + Args.ThreadLimit[1] = blockDim.y; + Args.ThreadLimit[2] = blockDim.z; + Args.ArgPtrs = reinterpret_cast(args); + Args.Flags.IsCUDA = true; + if (stream) + Args.AsyncInfoQueue = stream; + else + Args.AsyncInfoQueue = + __tgt_target_get_default_async_info_queue(nullptr, DeviceNo); + int rv = __tgt_target_kernel(nullptr, DeviceNo, gridDim.x, blockDim.x, func, + &Args); + return rv; +} +} diff --git a/offload/src/Kernels/Sanitizer.cpp b/offload/src/Kernels/Sanitizer.cpp new file mode 100644 index 0000000000000..7b1d73a81aeba --- /dev/null +++ b/offload/src/Kernels/Sanitizer.cpp @@ -0,0 +1,27 @@ +//===-- Kernels/Sanitizer.cpp - Sanitizer Kernel Definitions --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include + +extern "C" { +__device__ void __sanitizer_register_host(void *P, uint64_t Bytes, + uint64_t Slot); +__device__ void __sanitizer_unregister_host(void *P); + +[[clang::disable_sanitizer_instrumentation]] __global__ void +__sanitizer_register(void *P, uint64_t Bytes, uint64_t Slot) { + __sanitizer_register_host(P, Bytes, Slot); +} + +[[clang::disable_sanitizer_instrumentation]] __global__ void +__sanitizer_unregister(void *P) { + __sanitizer_unregister_host(P); +} +} diff --git a/offload/src/OpenMP/API.cpp b/offload/src/OpenMP/API.cpp index 374c54163d6a4..c07cea550c39c 100644 --- a/offload/src/OpenMP/API.cpp +++ b/offload/src/OpenMP/API.cpp @@ -39,7 +39,7 @@ EXTERN void ompx_dump_mapping_tables() { using namespace llvm::omp::target::ompt; #endif -void *targetAllocExplicit(size_t Size, 
int DeviceNum, int Kind, +void *targetAllocExplicit(size_t Size, int64_t DeviceNum, int Kind, const char *Name); void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, const char *Name); diff --git a/offload/src/OpenMP/Mapping.cpp b/offload/src/OpenMP/Mapping.cpp index 595e3456ab54c..407c6cd422ba7 100644 --- a/offload/src/OpenMP/Mapping.cpp +++ b/offload/src/OpenMP/Mapping.cpp @@ -69,7 +69,7 @@ int MappingInfoTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, } // Mapping does not exist, allocate it with refCount=INF - const HostDataToTargetTy &NewEntry = + HostDataToTargetTy &NewEntry = *HDTTMap ->emplace(new HostDataToTargetTy( /*HstPtrBase=*/(uintptr_t)HstPtrBegin, @@ -89,7 +89,8 @@ int MappingInfoTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, (void)NewEntry; // Notify the plugin about the new mapping. - return Device.notifyDataMapped(HstPtrBegin, Size); + return Device.notifyDataMapped(HstPtrBegin, TgtPtrBegin, Size, + NewEntry.FakeTgtPtrBegin); } int MappingInfoTy::disassociatePtr(void *HstPtrBegin) { @@ -120,7 +121,7 @@ int MappingInfoTy::disassociatePtr(void *HstPtrBegin) { if (Event) Device.destroyEvent(Event); HDTTMap->erase(It); - return Device.notifyDataUnmapped(HstPtrBegin); + return Device.notifyDataUnmapped(HstPtrBegin, HDTT.FakeTgtPtrBegin); } REPORT("Trying to disassociate a pointer which was not mapped via " @@ -294,12 +295,13 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer( uintptr_t TgtPtrBegin = TgtAllocBegin + TgtPadding; // Release the mapping table lock only after the entry is locked by // attaching it to TPR. 
- LR.TPR.setEntry(HDTTMap - ->emplace(new HostDataToTargetTy( - (uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin, - (uintptr_t)HstPtrBegin + Size, TgtAllocBegin, - TgtPtrBegin, HasHoldModifier, HstPtrName)) - .first->HDTT); + LR.TPR.setEntry( + HDTTMap + ->emplace(new HostDataToTargetTy( + (uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin, + (uintptr_t)HstPtrBegin + Size, TgtAllocBegin, TgtPtrBegin, + HasHoldModifier, HstPtrName, /*IsINF=*/false)) + .first->HDTT); INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID, "Creating new map entry with HstPtrBase=" DPxMOD ", HstPtrBegin=" DPxMOD ", TgtAllocBegin=" DPxMOD @@ -313,7 +315,8 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer( LR.TPR.TargetPointer = (void *)TgtPtrBegin; // Notify the plugin about the new mapping. - if (Device.notifyDataMapped(HstPtrBegin, Size)) + if (Device.notifyDataMapped(HstPtrBegin, LR.TPR.TargetPointer, Size, + LR.TPR.getEntry()->FakeTgtPtrBegin)) return TargetPointerResultTy{}; } else { // This entry is not present and we did not create a new entry for it. @@ -495,7 +498,8 @@ int MappingInfoTy::deallocTgtPtrAndEntry(HostDataToTargetTy *Entry, int Ret = Device.deleteData((void *)Entry->TgtAllocBegin); // Notify the plugin about the unmapped memory. 
- Ret |= Device.notifyDataUnmapped((void *)Entry->HstPtrBegin); + Ret |= Device.notifyDataUnmapped((void *)Entry->HstPtrBegin, + Entry->FakeTgtPtrBegin); delete Entry; diff --git a/offload/src/device.cpp b/offload/src/device.cpp index 943c778278730..fc41721e6c0a6 100644 --- a/offload/src/device.cpp +++ b/offload/src/device.cpp @@ -191,21 +191,24 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, DstPtr, Size, AsyncInfo); } -int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) { +int32_t DeviceTy::notifyDataMapped(void *HstPtr, void *DevicePtr, int64_t Size, + void *&FakeHstPtr) { DP("Notifying about new mapping: HstPtr=" DPxMOD ", Size=%" PRId64 "\n", DPxPTR(HstPtr), Size); - if (RTL->data_notify_mapped(RTLDeviceID, HstPtr, Size)) { + if (RTL->data_notify_mapped(RTLDeviceID, HstPtr, DevicePtr, Size, + FakeHstPtr)) { REPORT("Notifiying about data mapping failed.\n"); return OFFLOAD_FAIL; } return OFFLOAD_SUCCESS; } -int32_t DeviceTy::notifyDataUnmapped(void *HstPtr) { - DP("Notifying about an unmapping: HstPtr=" DPxMOD "\n", DPxPTR(HstPtr)); +int32_t DeviceTy::notifyDataUnmapped(void *HstPtr, void *FakeHstPtr) { + DP("Notifying about an unmapping: HstPtr=" DPxMOD " FakeHstPtr=" DPxMOD "\n", + DPxPTR(HstPtr), DPxPTR(FakeHstPtr)); - if (RTL->data_notify_unmapped(RTLDeviceID, HstPtr)) { + if (RTL->data_notify_unmapped(RTLDeviceID, HstPtr, FakeHstPtr)) { REPORT("Notifiying about data unmapping failed.\n"); return OFFLOAD_FAIL; } diff --git a/offload/src/exports b/offload/src/exports index f95544ec8329c..829a55fc6398e 100644 --- a/offload/src/exports +++ b/offload/src/exports @@ -29,6 +29,8 @@ VERS1.0 { __tgt_target_kernel; __tgt_target_kernel_nowait; __tgt_target_nowait_query; + __tgt_target_get_default_async_info_queue; + __tgt_target_synchronize_async_info_queue; __tgt_target_kernel_replay; __tgt_activate_record_replay; __tgt_mapper_num_components; @@ -71,6 +73,12 @@ VERS1.0 { __tgt_interop_use; __tgt_interop_destroy; 
ompt_libomptarget_connect; + __llvmPushCallConfiguration; + __llvmPopCallConfiguration; + llvmLaunchKernel; + ompx_new_allocation_host; + ompx_free_allocation_host; + ompx_register_image_functions; local: *; }; diff --git a/offload/src/interface.cpp b/offload/src/interface.cpp index 763b051cc6d77..759fb54a11262 100644 --- a/offload/src/interface.cpp +++ b/offload/src/interface.cpp @@ -14,6 +14,8 @@ #include "OpenMP/OMPT/Interface.h" #include "OpenMP/OMPT/Callback.h" #include "PluginManager.h" +#include "Shared/APITypes.h" +#include "omptarget.h" #include "private.h" #include "Shared/EnvironmentVar.h" @@ -312,7 +314,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, if (!DeviceOrErr) FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); - TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr); + TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr, KernelArgs->AsyncInfoQueue); AsyncInfoTy &AsyncInfo = TargetAsyncInfo; /// RAII to establish tool anchors before and after target region OMPT_IF_BUILT(InterfaceRAII TargetRAII( @@ -510,3 +512,48 @@ EXTERN void __tgt_target_nowait_query(void **AsyncHandle) { delete AsyncInfo; *AsyncHandle = nullptr; } + +EXTERN void *__tgt_target_get_default_async_info_queue(void *Loc, + int64_t DeviceId) { + assert(PM && "Runtime not initialized"); + + static thread_local void **AsyncInfoQueue = nullptr; + + if (!AsyncInfoQueue) + AsyncInfoQueue = reinterpret_cast( + calloc(PM->getNumDevices(), sizeof(AsyncInfoQueue[0]))); + + if (!AsyncInfoQueue[DeviceId]) { + auto DeviceOrErr = PM->getDevice(DeviceId); + if (!DeviceOrErr) + FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); + + __tgt_async_info *AsyncInfo = nullptr; + DeviceOrErr->RTL->init_async_info(DeviceId, &AsyncInfo); + AsyncInfoQueue[DeviceId] = AsyncInfo->Queue; + } + + return AsyncInfoQueue[DeviceId]; +} + +EXTERN int __tgt_target_synchronize_async_info_queue(void *Loc, + int64_t DeviceId, + void *AsyncInfoQueue) 
{ + assert(PM && "Runtime not initialized"); + + auto DeviceOrErr = PM->getDevice(DeviceId); + if (!DeviceOrErr) + FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); + if (!AsyncInfoQueue) + AsyncInfoQueue = __tgt_target_get_default_async_info_queue(Loc, DeviceId); + AsyncInfoTy AsyncInfo(*DeviceOrErr, AsyncInfoQueue, + AsyncInfoTy::SyncTy::BLOCKING); + + if (AsyncInfo.synchronize()) + FATAL_MESSAGE0(1, "Error while querying the async queue for completion.\n"); + [[maybe_unused]] __tgt_async_info *ASI = AsyncInfo; + assert(ASI->Queue); + assert(ASI->Queue && ASI->PersistentQueue); + + return 0; +} diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp index 9bca8529c5ee3..10f9b9a7d9358 100644 --- a/offload/src/omptarget.cpp +++ b/offload/src/omptarget.cpp @@ -49,7 +49,7 @@ int AsyncInfoTy::synchronize() { case SyncTy::BLOCKING: // If we have a queue we need to synchronize it now. Result = Device.synchronize(*this); - assert(AsyncInfo.Queue == nullptr && + assert((AsyncInfo.PersistentQueue || !AsyncInfo.Queue) && "The device plugin should have nulled the queue to indicate there " "are no outstanding actions!"); break; @@ -271,17 +271,22 @@ static int initLibrary(DeviceTy &Device) { ", name \"%s\"\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), CurrDeviceEntry->size, CurrDeviceEntry->name); - HDTTMap->emplace(new HostDataToTargetTy( - (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, - (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, - (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/, - (uintptr_t)CurrDeviceEntryAddr /*TgtAllocBegin*/, - (uintptr_t)CurrDeviceEntryAddr /*TgtPtrBegin*/, - false /*UseHoldRefCount*/, CurrHostEntry->name, - true /*IsRefCountINF*/)); + auto *Entry = HDTTMap + ->emplace(new HostDataToTargetTy( + (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, + (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, + (uintptr_t)CurrHostEntry->addr + + CurrHostEntry->size /*HstPtrEnd*/, + 
(uintptr_t)CurrDeviceEntryAddr /*TgtAllocBegin*/, + (uintptr_t)CurrDeviceEntryAddr /*TgtPtrBegin*/, + false /*UseHoldRefCount*/, CurrHostEntry->name, + true /*IsRefCountINF*/)) + .first->HDTT; // Notify about the new mapping. - if (Device.notifyDataMapped(CurrHostEntry->addr, CurrHostEntry->size)) + if (Device.notifyDataMapped(CurrHostEntry->addr, CurrDeviceEntryAddr, + CurrHostEntry->size, + Entry->FakeTgtPtrBegin)) return OFFLOAD_FAIL; } } @@ -323,8 +328,8 @@ void handleTargetOutcome(bool Success, ident_t *Loc) { for (auto &Image : PM->deviceImages()) { const char *Start = reinterpret_cast( Image.getExecutableImage().ImageStart); - uint64_t Length = llvm::omp::target::getPtrDiff( - Start, Image.getExecutableImage().ImageEnd); + uint64_t Length = + utils::getPtrDiff(Start, Image.getExecutableImage().ImageEnd); llvm::MemoryBufferRef Buffer(llvm::StringRef(Start, Length), /*Identifier=*/""); @@ -415,9 +420,9 @@ static int32_t getParentIndex(int64_t Type) { return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; } -void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind, +void *targetAllocExplicit(size_t Size, int64_t DeviceNum, int Kind, const char *Name) { - DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size); + DP("Call to %s for device %ld requesting %zu bytes\n", Name, DeviceNum, Size); if (Size <= 0) { DP("Call to %s with non-positive length\n", Name); @@ -432,13 +437,21 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind, return Rc; } + if (checkDeviceAndCtors(DeviceNum, nullptr)) { + DP("Not offloading to device %" PRId64 "\n", DeviceNum); + return Rc; + } + auto DeviceOrErr = PM->getDevice(DeviceNum); if (!DeviceOrErr) FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str()); Rc = DeviceOrErr->allocData(Size, nullptr, Kind); DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc)); - return Rc; + void *FakeHstPtr = nullptr; + if (DeviceOrErr->notifyDataMapped(nullptr, Rc, Size, FakeHstPtr)) + 
return nullptr; + return FakeHstPtr ? FakeHstPtr : Rc; } void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, @@ -464,6 +477,8 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, if (DeviceOrErr->deleteData(DevicePtr, Kind) == OFFLOAD_FAIL) FATAL_MESSAGE(DeviceNum, "%s", "Failed to deallocate device ptr"); + DeviceOrErr->notifyDataUnmapped(nullptr, DevicePtr); + DP("omp_target_free deallocated device ptr\n"); } @@ -670,6 +685,9 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, HasFlagTo, HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo, PointerTpr.getEntry()); void *TgtPtrBegin = TPR.TargetPointer; + if (auto *Entry = TPR.getEntry()) + if (auto *FakeTgtPtrBegin = Entry->FakeTgtPtrBegin) + TgtPtrBegin = FakeTgtPtrBegin; IsHostPtr = TPR.Flags.IsHostPointer; // If data_size==0, then the argument could be a zero-length pointer to // NULL, so getOrAlloc() returning NULL is not an error. @@ -1523,11 +1541,16 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, /*UpdateRefCount=*/false, /*UseHoldRefCount=*/false); TgtPtrBegin = TPR.TargetPointer; + if (auto *Entry = TPR.getEntry()) + if (auto *FakeTgtPtrBegin = Entry->FakeTgtPtrBegin) + TgtPtrBegin = FakeTgtPtrBegin; TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; #ifdef OMPTARGET_DEBUG void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); - DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n", - DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); + DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD + " %s\n", + DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin), + TgtPtrBegin != TPR.TargetPointer ? 
"fake" : ""); #endif } TgtArgsPositions[I] = TgtArgs.size(); diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index 6c590603079c4..9053151e44a78 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -66,7 +66,7 @@ def evaluate_bool_env(env): config.name = 'libomptarget :: ' + config.libomptarget_current_target # suffixes: A list of file extensions to treat as test files. -config.suffixes = ['.c', '.cpp', '.cc', '.f90'] +config.suffixes = ['.c', '.cpp', '.cc', '.f90', '.cu'] # excludes: A list of directories to exclude from the testuites. config.excludes = ['Inputs'] diff --git a/offload/test/offloading/CUDA/basic_api_malloc_free.cu b/offload/test/offloading/CUDA/basic_api_malloc_free.cu new file mode 100644 index 0000000000000..60a51e33a5af9 --- /dev/null +++ b/offload/test/offloading/CUDA/basic_api_malloc_free.cu @@ -0,0 +1,42 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +__global__ void kernel(int *A, int *DevPtr, int N) { + for (int i = 0; i < N; ++i) + DevPtr[i] = 1; + for (int i = 0; i < N; ++i) + *A += DevPtr[i]; +} + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast(llvm_omp_target_alloc_shared(4, DevNo)); + int *DevPtr; + auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int)); + if (Err != cudaSuccess) + return -1; + *Ptr = 0; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0 + kernel<<<1, 1>>>(Ptr, DevPtr, 42); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + Err = cudaFree(DevPtr); + if (Err != cudaSuccess) + return -1; + 
llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/offloading/CUDA/basic_api_memcpy.cu b/offload/test/offloading/CUDA/basic_api_memcpy.cu new file mode 100644 index 0000000000000..088e20ffa9e2b --- /dev/null +++ b/offload/test/offloading/CUDA/basic_api_memcpy.cu @@ -0,0 +1,47 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +__global__ void kernel(int *DevPtr, int N) { + for (int i = 0; i < N; ++i) + DevPtr[i]--; +} + +int main(int argc, char **argv) { + int DevNo = 0; + int Res = 0; + int *DevPtr; + auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int)); + if (Err != cudaSuccess) + return -1; + int HstPtr[42]; + for (int i = 0; i < 42; ++i) { + HstPtr[i] = 2; + } + Err = cudaMemcpy(DevPtr, HstPtr, 42 * sizeof(int), cudaMemcpyHostToDevice); + if (Err != cudaSuccess) + return -1; + printf("Res: %i\n", Res); + // CHECK: Res: 0 + kernel<<<1, 1>>>(DevPtr, 42); + cudaDeviceSynchronize(); + Err = cudaMemcpy(HstPtr, DevPtr, 42 * sizeof(int), cudaMemcpyDeviceToHost); + if (Err != cudaSuccess) + return -1; + for (int i = 0; i < 42; ++i) { + printf("%i : %i\n", i, HstPtr[i]); + Res += HstPtr[i]; + } + printf("Res: %i\n", Res); + // CHECK: Res: 42 + Err = cudaFree(DevPtr); + if (Err != cudaSuccess) + return -1; +} diff --git a/offload/test/offloading/CUDA/basic_api_memset.cu b/offload/test/offloading/CUDA/basic_api_memset.cu new file mode 100644 index 0000000000000..474eb2a46f0a2 --- /dev/null +++ b/offload/test/offloading/CUDA/basic_api_memset.cu @@ -0,0 +1,44 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: 
x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +__global__ void kernel(int *A, int *DevPtr, int N) { + for (int i = 0; i < N; ++i) + *A += DevPtr[i]; + *A *= -1; +} + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast(llvm_omp_target_alloc_shared(4, DevNo)); + int *DevPtr; + auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int)); + if (Err != cudaSuccess) + return -1; + Err = cudaMemset(DevPtr, -1, 42 * sizeof(int)); + if (Err != cudaSuccess) + return -1; + *Ptr = 0; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0 + kernel<<<1, 1>>>(Ptr, DevPtr, 42); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + Err = cudaFree(DevPtr); + if (Err != cudaSuccess) + return -1; + llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/offloading/CUDA/basic_launch.cu b/offload/test/offloading/CUDA/basic_launch.cu new file mode 100644 index 0000000000000..298aa7db83bad --- /dev/null +++ b/offload/test/offloading/CUDA/basic_launch.cu @@ -0,0 +1,32 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t -fopenmp +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +__global__ void square(int *A) { *A = 42; } + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast(llvm_omp_target_alloc_shared(4, DevNo)); + *Ptr = 7; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); 
+ // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7 + square<<<1, 1>>>(Ptr); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu new file mode 100644 index 0000000000000..c47b1a1b83bde --- /dev/null +++ b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu @@ -0,0 +1,34 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t -fopenmp +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +__global__ void square(int *A) { + __scoped_atomic_fetch_add(A, 1, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); +} + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast(llvm_omp_target_alloc_shared(4, DevNo)); + *Ptr = 0; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0 + square<<<7, 6>>>(Ptr); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu new file mode 100644 index 0000000000000..58ff89dcd4aac --- /dev/null +++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu @@ -0,0 +1,43 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic +// RUN: %clang++ -foffload-via-llvm --offload-arch=native 
%s -o %t -fopenmp +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +__global__ void square(int *Dst, short Q, int *Src, short P) { + *Dst = (Src[0] + Src[1]) * (Q + P); + Src[0] = Q; + Src[1] = P; +} + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast(llvm_omp_target_alloc_shared(4, DevNo)); + int *Src = reinterpret_cast(llvm_omp_target_alloc_shared(8, DevNo)); + *Ptr = 7; + Src[0] = -2; + Src[1] = 8; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7 + printf("Src: %i : %i\n", Src[0], Src[1]); + // CHECK: Src: -2 : 8 + square<<<1, 1>>>(Ptr, 3, Src, 4); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + printf("Src: %i : %i\n", Src[0], Src[1]); + // CHECK: Src: 3 : 4 + llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/offloading/CUDA/kernel_tu.cu.inc b/offload/test/offloading/CUDA/kernel_tu.cu.inc new file mode 100644 index 0000000000000..d7d28a109dfc5 --- /dev/null +++ b/offload/test/offloading/CUDA/kernel_tu.cu.inc @@ -0,0 +1 @@ +__global__ void square(int *A) { *A = 42; } diff --git a/offload/test/offloading/CUDA/launch_tu.cu b/offload/test/offloading/CUDA/launch_tu.cu new file mode 100644 index 0000000000000..fa6b1d7692de5 --- /dev/null +++ b/offload/test/offloading/CUDA/launch_tu.cu @@ -0,0 +1,34 @@ +// clang-format off +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t.launch_tu.o -c +// RUN: %clang++ -foffload-via-llvm --offload-arch=native -x cuda %S/kernel_tu.cu.inc -o %t.kernel_tu.o -c +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %t.launch_tu.o %t.kernel_tu.o -o %t +// 
RUN: %t | %fcheck-generic +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +extern __global__ void square(int *A); + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast(llvm_omp_target_alloc_shared(4, DevNo)); + *Ptr = 7; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7 + square<<<1, 1>>>(Ptr); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/sanitizer/global_null.c b/offload/test/sanitizer/global_null.c new file mode 100644 index 0000000000000..91be2cb499c45 --- /dev/null +++ b/offload/test/sanitizer/global_null.c @@ -0,0 +1,29 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +int *Null = 0; +#pragma omp declare target(Null) + +int main(void) { + +#pragma omp target + { + // clang-format off + // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:0x.*]] + // CHECK-NEXT: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> + // CHECK-NEXT: #0 [[PC]] main null.c:[[@LINE+3]] + // CHECK-NEXT: 0x0000000000000000 is located 0 bytes inside of 0-byte region [0x0000000000000000,0x0000000000000000) + // 
clang-format on + *Null = 42; + } +} diff --git a/offload/test/sanitizer/heap_null.c b/offload/test/sanitizer/heap_null.c new file mode 100644 index 0000000000000..e13ad234d21c0 --- /dev/null +++ b/offload/test/sanitizer/heap_null.c @@ -0,0 +1,32 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +// Align lines. + +#include +#include + +int main(void) { + + int *Null = 0; +#pragma omp target + { + // clang-format off + // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:0x.*]] + // CHECK-NEXT: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> + // CHECK-NEXT: #0 [[PC]] main null.c:[[@LINE+3]] + // CHECK-NEXT: 0x0000000000000000 is located 0 bytes inside of 0-byte region [0x0000000000000000,0x0000000000000000) + // clang-format on + *Null = 42; + } +} diff --git a/offload/test/sanitizer/heap_out_of_bounds.c b/offload/test/sanitizer/heap_out_of_bounds.c new file mode 100644 index 0000000000000..37a8d99b5241f --- /dev/null +++ b/offload/test/sanitizer/heap_out_of_bounds.c @@ -0,0 +1,27 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +// Align lines. 
+ +#include +#include + +int main(int argc, char **argv) { + int N = argc > 42 ? 1000 : 100; + double A[N]; +#pragma omp target map(from : A[ : N]) + { + // CHECK: is located 7992 bytes inside of a 800-byte region + A[999] = 3.14; + } +} diff --git a/offload/test/sanitizer/heap_partial_out_of_bounds.c b/offload/test/sanitizer/heap_partial_out_of_bounds.c new file mode 100644 index 0000000000000..981cba3a8eebd --- /dev/null +++ b/offload/test/sanitizer/heap_partial_out_of_bounds.c @@ -0,0 +1,31 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +// Align lines. + +#include "omp.h" +#include +#include + +int main(int argc, char **argv) { + int N = argc > 42 ? 
1000 : 100; + double *A = + (double *)omp_target_alloc(N * sizeof(*A), omp_get_default_device()); + char *C = ((char *)&A[N - 1] + 1); +#pragma omp target is_device_ptr(A, C) + { + // CHECK: is located 793 bytes inside of a 800-byte region + double *D = (double *)C; + *D = 3.14; + } +} diff --git a/offload/test/sanitizer/heap_random.c b/offload/test/sanitizer/heap_random.c new file mode 100644 index 0000000000000..265495f910f69 --- /dev/null +++ b/offload/test/sanitizer/heap_random.c @@ -0,0 +1,22 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +// Align lines. + +int main(void) { + + int X = 0; + int *Random = &X; +#pragma omp target + { *Random = 99; } +} diff --git a/offload/test/sanitizer/null_forced_stack.c b/offload/test/sanitizer/null_forced_stack.c new file mode 100644 index 0000000000000..e59e34b3a0cd2 --- /dev/null +++ b/offload/test/sanitizer/null_forced_stack.c @@ -0,0 +1,43 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O1 +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 -g +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// 
UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +[[clang::optnone]] int *deref(int **P) { return *P; } + +int *bar(int **P) { return deref(P); } + +int main(void) { + +#pragma omp target + { + int *NullPtr = 0; + // clang-format off + // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // CHECK: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // CHECK: #0 [[PC]] omp target (main:[[@LINE-6]]) in :0 + // + // CHECK: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // + // DEBUG: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // DEBUG: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // DEBUG: #0 [[PC]] omp target (main:[[@LINE-12]]) in {{.*}}volatile_stack_null.c:[[@LINE+4]] + // + // DEBUG: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // clang-format on + bar(&NullPtr)[10] = 42; + } +} diff --git a/offload/test/sanitizer/stack_trace_1.c b/offload/test/sanitizer/stack_trace_1.c new file mode 100644 index 0000000000000..7129f67c70900 --- /dev/null +++ b/offload/test/sanitizer/stack_trace_1.c @@ -0,0 +1,43 @@ +// clang-format off +// : %libomptarget-compileopt-generic -fsanitize=offload -O1 +// : not %libomptarget-run-generic 2> %t.out +// : %fcheck-generic --check-prefixes=CHECK < %t.out +// : %libomptarget-compileopt-generic -fsanitize=offload -O3 +// : not %libomptarget-run-generic 2> %t.out +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 -g +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// 
UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +[[clang::optnone]] int deref(int *P) { return *P; } + +[[gnu::always_inline]] int bar(int *P) { return deref(P); } + +int main(void) { + +#pragma omp target + { + int *NullPtr = 0; + // clang-format off + // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // CHECK: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // CHECK: #0 [[PC]] omp target (main:[[@LINE-6]]) in :0 + // + // CHECK: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // + // DEBUG: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // DEBUG: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // DEBUG: #0 [[PC]] omp target (main:[[@LINE-12]]) in {{.*}}volatile_stack_null.c:[[@LINE+4]] + // + // DEBUG: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // clang-format on + bar(NullPtr); + } +} diff --git a/offload/test/sanitizer/stack_trace_multi_path_1.c b/offload/test/sanitizer/stack_trace_multi_path_1.c new file mode 100644 index 0000000000000..58717afe64df5 --- /dev/null +++ b/offload/test/sanitizer/stack_trace_multi_path_1.c @@ -0,0 +1,48 @@ +// clang-format off +// : %libomptarget-compileopt-generic -fsanitize=offload -O1 +// : not %libomptarget-run-generic 2> %t.out +// : %fcheck-generic --check-prefixes=CHECK < %t.out +// : %libomptarget-compileopt-generic -fsanitize=offload -O3 +// : not %libomptarget-run-generic 2> %t.out +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 -g +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu 
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include + +[[clang::optnone]] int deref(int *P) { return *P; } + +[[gnu::noinline]] int bar(int *P) { return deref(P); } +[[gnu::noinline]] int baz(int *P) { return deref(P); } + +int main(void) { + + int *Valid = (int *)omp_target_alloc(4, omp_get_default_device()); +#pragma omp target is_device_ptr(Valid) + { + int *NullPtr = 0; + // clang-format off + // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // CHECK: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // CHECK: #0 [[PC]] omp target (main:[[@LINE-6]]) in :0 + // + // CHECK: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // + // DEBUG: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // DEBUG: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // DEBUG: #0 [[PC]] omp target (main:[[@LINE-12]]) in {{.*}}volatile_stack_null.c:[[@LINE+4]] + // + // DEBUG: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // clang-format on + bar(Valid); + baz(NullPtr); + } +} diff --git a/offload/test/sanitizer/stack_trace_multi_path_many.cpp b/offload/test/sanitizer/stack_trace_multi_path_many.cpp new file mode 100644 index 0000000000000..5f6871e825e54 --- /dev/null +++ b/offload/test/sanitizer/stack_trace_multi_path_many.cpp @@ -0,0 +1,42 @@ +// clang-format off +// : %libomptarget-compileoptxx-generic -fsanitize=offload -O1 +// : not %libomptarget-run-generic 2> %t.out +// : %fcheck-generic --check-prefixes=CHECK < %t.out +// : %libomptarget-compileoptxx-generic -fsanitize=offload -O3 +// : not %libomptarget-run-generic 2> %t.out +// RUN: %libomptarget-compileoptxx-generic -fsanitize=offload -O3 -g +// RUN: not 
%libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include + +template [[clang::optnone]] T deref(T *P) { return *P; } + +template [[gnu::noinline]] T level(T *P) { + if constexpr (LEVEL > 1) + return level(P) + level(P); + if constexpr (LEVEL > 0) + return level(P); + return deref(P); +} + +int main(void) { + + int *ValidInt = (int *)omp_target_alloc(4, omp_get_default_device()); +#pragma omp target is_device_ptr(ValidInt) + { + level<12>(ValidInt); + short *ValidShort = ((short *)ValidInt) + 2; + level<12>(ValidShort); + char *Invalid = ((char *)ValidInt) + 4; + level<12>(Invalid); + } +} diff --git a/offload/test/sanitizer/sycl-tests/bad-free/bad-free-host.c b/offload/test/sanitizer/sycl-tests/bad-free/bad-free-host.c new file mode 100644 index 0000000000000..0b5c4c114eea3 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/bad-free/bad-free-host.c @@ -0,0 +1,13 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +int main() { + int FakePtr[3] = {1, 2, 3}; + int Device = omp_get_default_device(); + omp_target_free(FakePtr, Device); + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/bad-free/bad-free-minus1.c b/offload/test/sanitizer/sycl-tests/bad-free/bad-free-minus1.c new file mode 100644 index 0000000000000..c0b23584859fe --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/bad-free/bad-free-minus1.c @@ -0,0 +1,18 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + 
+#include +#include + +int main() { + int N = 100; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + omp_target_free(DevPtr - 1, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/bad-free/bad-free-plus1.c b/offload/test/sanitizer/sycl-tests/bad-free/bad-free-plus1.c new file mode 100644 index 0000000000000..f8f688572c846 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/bad-free/bad-free-plus1.c @@ -0,0 +1,18 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +int main() { + int N = 100; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + omp_target_free(DevPtr + 1, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/double-free/double-free.c b/offload/test/sanitizer/sycl-tests/double-free/double-free.c new file mode 100644 index 0000000000000..9e1e2a6ceb26a --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/double-free/double-free.c @@ -0,0 +1,19 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +int main() { + int N = 100; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + omp_target_free(DevPtr, Device); + omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/DeviceGlobal/device-global.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/DeviceGlobal/device-global.c new file mode 100644 index 0000000000000..8d134ca219df0 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/DeviceGlobal/device-global.c @@ -0,0 +1,20 
@@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +// Port of https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/ +// AddressSanitizer/out-of-bounds/DeviceGlobal/device_global.cpp + +#include +#include + +#define ITEM_COUNT 3 + +char dev_global[5]; +#pragma omp declare target(dev_global) + +int main() { +#pragma omp target + { dev_global[8] = 42; } + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-char.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-char.c new file mode 100644 index 0000000000000..ec6a60d76a382 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-char.c @@ -0,0 +1,27 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +int main() { + int N = 10; + int N_SZ = sizeof(char) * N; + + int Device = omp_get_default_device(); + + char *DevPtr = (char *)omp_target_alloc(N_SZ, Device); + +#pragma omp target is_device_ptr(DevPtr) + { +#pragma omp parallel for + for (int i = 0; i < N; i++) { + DevPtr[i] = '*'; + } + } + + omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-double.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-double.c new file mode 100644 index 0000000000000..f14a277706916 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-double.c @@ -0,0 +1,27 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +int main() { + int N = 10; + int N_SZ = sizeof(double) * N; + + int Device = omp_get_default_device(); + + 
double *DevPtr = (double *)omp_target_alloc(N_SZ, Device); + +#pragma omp target is_device_ptr(DevPtr) + { +#pragma omp parallel for + for (int i = 0; i < N; i++) { + DevPtr[i] = 1.23; + } + } + + omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-func.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-func.c new file mode 100644 index 0000000000000..ca8d9807ce42b --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-func.c @@ -0,0 +1,29 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +__attribute__((noinline)) void foo(int *array, int i) { array[i] = 1; } + +int main() { + int N = 10; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + +#pragma omp target is_device_ptr(DevPtr) + { +#pragma omp parallel for + for (int i = 0; i < N; i++) { + foo(DevPtr, i); + } + } + + omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-int.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-int.c new file mode 100644 index 0000000000000..ed76dc221ae25 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-int.c @@ -0,0 +1,27 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +int main() { + int N = 10; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + +#pragma omp target is_device_ptr(DevPtr) + { +#pragma omp parallel for + for (int i = 0; i < N; i++) { + DevPtr[i] = 2; + } + } + + 
omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-short.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-short.c new file mode 100644 index 0000000000000..79a7f69a0bcf3 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-short.c @@ -0,0 +1,27 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +int main() { + int N = 10; + int N_SZ = sizeof(short) * N; + + int Device = omp_get_default_device(); + + short *DevPtr = (short *)omp_target_alloc(N_SZ, Device); + +#pragma omp target is_device_ptr(DevPtr) + { +#pragma omp parallel for + for (int i = 0; i < N; i++) { + DevPtr[i] = 2; + } + } + + omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer1d.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer1d.c new file mode 100644 index 0000000000000..0423a2dcf0a17 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer1d.c @@ -0,0 +1,30 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include +#include + +#define X 3 + +int main() { + int A[X] = {1, 2, 3}; + +#pragma omp target map(tofrom : A) + { + for (int i = 0; i < X; i++) { + A[i] = A[i] * 2; + } + } + + for (int i = 0; i < X; i++) { + printf("%d\n", A[i]); + } + + return 0; +} + +// CHECK: 2 +// CHECK: 4 +// CHECK: 6 diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer2d.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer2d.c new file mode 100644 index 0000000000000..e6512628c0188 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer2d.c @@ -0,0 
+1,38 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include +#include + +#define X 2 +#define Y 3 + +int main() { + int A[X][Y] = {{1, 2, 3}, {4, 5, 6}}; + +#pragma omp target map(tofrom : A) + { + for (int i = 0; i < X; i++) { + for (int j = 0; j < Y; j++) { + A[i][j] = A[i][j] * 2; + } + } + } + + for (int i = 0; i < X; i++) { + for (int j = 0; j < Y; j++) { + printf("%d", A[i][j]); + if (j + 1 != Y) { + printf(" "); + } + } + printf("\n"); + } + + return 0; +} + +// CHECK: 2 4 6 +// CHECK-NEXT: 8 10 12 diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer3d.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer3d.c new file mode 100644 index 0000000000000..0f4022e2c6cf7 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer3d.c @@ -0,0 +1,55 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include +#include + +#define X 4 +#define Y 3 +#define Z 2 + +int main() { + int A[X][Y][Z] = { + {{0, 1}, {2, 3}, {4, 5}}, + {{6, 7}, {8, 9}, {10, 11}}, + {{12, 13}, {14, 15}, {16, 17}}, + {{18, 19}, {20, 21}, {22, 23}}, + }; + +#pragma omp target map(tofrom : A) + { + for (int i = 0; i < X; i++) { + for (int j = 0; j < Y; j++) { + for (int k = 0; k < Z; k++) { + A[i][j][k] = A[i][j][k] * 2; + } + } + } + } + + for (int i = 0; i < X; i++) { + for (int j = 0; j < Y; j++) { + printf("("); + for (int k = 0; k < Z; k++) { + printf("%d", A[i][j][k]); + if (k + 1 != Z) { + printf(","); + } + } + printf(")"); + if (j + 1 != Y) { + printf(","); + } + } + printf("\n"); + } + + return 0; +} + +// CHECK: (0,2),(4,6),(8,10) +// CHECK: (12,14),(16,18),(20,22) +// CHECK: (24,26),(28,30),(32,34) +// CHECK: (36,38),(40,42),(44,46) diff --git 
a/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer_malloc.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer_malloc.c new file mode 100644 index 0000000000000..a9b84ae62a05a --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer_malloc.c @@ -0,0 +1,41 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include +#include + +int main() { + int N = 10; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *Buffer = (int *)malloc(N_SZ); + +#pragma omp target map(tofrom : Buffer[0 : N]) + { + for (int i = 0; i < N; i++) { + Buffer[i] = i; + } + } + + for (int i = 0; i < N; i++) { + printf("%d\n", Buffer[i]); + } + free(Buffer); + + return 0; +} + +// CHECK: 0 +// CHECK-NEXT: 1 +// CHECK-NEXT: 2 +// CHECK-NEXT: 3 +// CHECK-NEXT: 4 +// CHECK-NEXT: 5 +// CHECK-NEXT: 6 +// CHECK-NEXT: 7 +// CHECK-NEXT: 8 +// CHECK-NEXT: 9 diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/local/local-accessor-basic.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/local/local-accessor-basic.c new file mode 100644 index 0000000000000..72742d43b43e3 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/local/local-accessor-basic.c @@ -0,0 +1,30 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +// Port of https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/ +// AddressSanitizer/out-of-bounds/local/local_accessor_basic.cpp + +#include +#include + +#define ITEM_COUNT 3 + +int main() { + int N_SZ = sizeof(int) * ITEM_COUNT; + + int Device = omp_get_default_device(); + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + +#pragma omp target is_device_ptr(DevPtr1) + { + int T1[ITEM_COUNT] = {0}; + for (int i = 0; i < ITEM_COUNT; i++) 
{ + DevPtr[i] = T1[i + 1]; + } + } + + omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/local/local-accessor-function.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/local/local-accessor-function.c new file mode 100644 index 0000000000000..427d7abe94e70 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/local/local-accessor-function.c @@ -0,0 +1,38 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +// Port of https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/ +// AddressSanitizer/out-of-bounds/local/local_accessor_function.cpp + +#include +#include + +#define ITEM_COUNT 3 + +__attribute__((noinline)) void foo(int *dest, const int *source1, + const int *source2, const int *source3, + int index) { + dest[index] = source1[index] + source2[index] + source3[index + 1]; +} + +int main() { + int N_SZ = sizeof(int) * ITEM_COUNT; + + int Device = omp_get_default_device(); + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + +#pragma omp target is_device_ptr(DevPtr) + { + int T1[ITEM_COUNT] = {0}; + int T2[ITEM_COUNT] = {0}; + int T3[ITEM_COUNT] = {0}; + for (int i = 0; i < ITEM_COUNT; i++) { + foo(DevPtr, T1, T2, T3, i); + } + } + + omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/local/local-accessor-multiargs.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/local/local-accessor-multiargs.c new file mode 100644 index 0000000000000..ae2dde735371e --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/local/local-accessor-multiargs.c @@ -0,0 +1,35 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +// Port of https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/ 
+// AddressSanitizer/out-of-bounds/local/local_accessor_multiargs.cpp + +#include +#include + +#define ITEM_COUNT 3 + +int main() { + int N_SZ = sizeof(int) * ITEM_COUNT; + + int Device = omp_get_default_device(); + int *DevPtr1 = (int *)omp_target_alloc(N_SZ, Device); + int *DevPtr2 = (int *)omp_target_alloc(N_SZ, Device); + +#pragma omp target is_device_ptr(DevPtr1, DevPtr2) + { + int T1[ITEM_COUNT] = {0}; + int T2[ITEM_COUNT] = {0}; + int T3[ITEM_COUNT] = {0}; + for (int i = 0; i < ITEM_COUNT; i++) { + DevPtr1[i] = T1[i] + T2[i] + T3[i]; + DevPtr2[i] = T1[i] + T2[i + 1] + T3[i]; + } + } + + omp_target_free(DevPtr1, Device); + omp_target_free(DevPtr2, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/local/multiple-source.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/local/multiple-source.c new file mode 100644 index 0000000000000..55adfcae18fb4 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/local/multiple-source.c @@ -0,0 +1,41 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +// Port of https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/ +// AddressSanitizer/out-of-bounds/local/multiple_source.cpp + +#include +#include + +#define ITEM_COUNT 3 + +__attribute__((noinline)) void foo(int *DevPtr) { +#pragma omp target is_device_ptr(DevPtr) + { + int T1[ITEM_COUNT] = {0}; + for (int i = 0; i < ITEM_COUNT; i++) { + DevPtr[i] = T1[i + 1]; + } + } +} + +int main() { + int N_SZ = sizeof(int) * ITEM_COUNT; + + int Device = omp_get_default_device(); + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + +#pragma omp target is_device_ptr(DevPtr) + { + int T1[ITEM_COUNT] = {0}; + for (int i = 0; i < ITEM_COUNT; i++) { + DevPtr[i] = T1[i]; + } + } + foo(DevPtr); + + omp_target_free(DevPtr, Device); + + return 0; +} diff --git 
a/offload/test/sanitizer/sycl-tests/use-after-free/quarantine-free.c b/offload/test/sanitizer/sycl-tests/use-after-free/quarantine-free.c new file mode 100644 index 0000000000000..eb62f9e61c79c --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/use-after-free/quarantine-free.c @@ -0,0 +1,28 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +// Port of +// https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/AddressSanitizer +// /use-after-free/quarantine-free.cpp + +#include +#include + +int main() { + int N = 100; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + omp_target_free(DevPtr, Device); + + DevPtr = (int *)omp_target_alloc(N_SZ, Device); + omp_target_free(DevPtr, Device); + + DevPtr = (int *)omp_target_alloc(N_SZ, Device); + omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/use-after-free/quarantine-no-free.c b/offload/test/sanitizer/sycl-tests/use-after-free/quarantine-no-free.c new file mode 100644 index 0000000000000..7af5b8751c68c --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/use-after-free/quarantine-no-free.c @@ -0,0 +1,31 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +// Port of +// https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/AddressSanitizer +// /use-after-free/quarantine-no-free.cpp + +#include +#include + +int main() { + int N = 100; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + omp_target_free(DevPtr, Device); + + DevPtr = (int *)omp_target_alloc(N_SZ, Device); + omp_target_free(DevPtr, Device); + + DevPtr = (int *)omp_target_alloc(N_SZ, Device); + 
omp_target_free(DevPtr, Device); + +#pragma omp target is_device_ptr(DevPtr) + { DevPtr[0] = 0; } + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/use-after-free/use-after-free.c b/offload/test/sanitizer/sycl-tests/use-after-free/use-after-free.c new file mode 100644 index 0000000000000..89ff2d2f1f080 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/use-after-free/use-after-free.c @@ -0,0 +1,23 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +int main() { + int N = 100; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + omp_target_free(DevPtr, Device); + +#pragma omp target is_device_ptr(DevPtr) + for (int i = 0; i < N; i++) { + DevPtr[i] = i; + } + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/use-after-free/use-before-free.c b/offload/test/sanitizer/sycl-tests/use-after-free/use-before-free.c new file mode 100644 index 0000000000000..d27709db8fe3e --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/use-after-free/use-before-free.c @@ -0,0 +1,25 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +int main() { + int N = 100; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); +#pragma omp target is_device_ptr(DevPtr) + for (int i = 0; i < N; i++) { + DevPtr[i] = i; + } + for (int i = 0; i < N; i++) { + printf("%d\n", DevPtr[i]); + } + omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/volatile_stack_null.c b/offload/test/sanitizer/volatile_stack_null.c new file mode 100644 index 0000000000000..08542c3516f28 --- /dev/null +++ 
b/offload/test/sanitizer/volatile_stack_null.c @@ -0,0 +1,39 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O1 +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 -g +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +int main(void) { + +#pragma omp target + { + volatile int *Null = 0; + // clang-format off + // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // CHECK: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // CHECK: #0 [[PC]] omp target (main:[[@LINE-6]]) in :0 + // + // CHECK: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // + // DEBUG: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // DEBUG: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // DEBUG: #0 [[PC]] omp target (main:[[@LINE-12]]) in {{.*}}volatile_stack_null.c:[[@LINE+4]] + // + // DEBUG: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // clang-format on + *Null = 42; + } +}