diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 2dea3cd4d795b..e8d3be7e89dbb 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -288,6 +288,7 @@ LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kern LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP") LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP") LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.") +LANGOPT(OffloadViaLLVM, 1, 0, "target LLVM/Offload as portable offloading runtime.") LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device") LANGOPT(SYCLIsHost , 1, 0, "SYCL host compilation") diff --git a/clang/include/clang/Basic/Sanitizers.def b/clang/include/clang/Basic/Sanitizers.def index bee35e9dca7c3..0b41187f6db52 100644 --- a/clang/include/clang/Basic/Sanitizers.def +++ b/clang/include/clang/Basic/Sanitizers.def @@ -190,6 +190,9 @@ SANITIZER_GROUP("bounds", Bounds, ArrayBounds | LocalBounds) // Scudo hardened allocator SANITIZER("scudo", Scudo) +// LLVM/Offload sanitizer +SANITIZER("offload", Offload) + // Magic group, containing all sanitizers. For example, "-fno-sanitize=all" // can be used to disable all the sanitizers. 
SANITIZER_GROUP("all", All, ~SanitizerMask()) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index d44faa55c456f..b77fd063f5519 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1293,6 +1293,12 @@ def no_offload_compress : Flag<["--"], "no-offload-compress">; def offload_compression_level_EQ : Joined<["--"], "offload-compression-level=">, Flags<[HelpHidden]>, HelpText<"Compression level for offload device binaries (HIP only)">; + +defm offload_via_llvm : BoolFOption<"offload-via-llvm", + LangOpts<"OffloadViaLLVM">, DefaultFalse, + PosFlag, + NegFlag, + BothFlags<[], [ClangOption], " LLVM/Offload as portable offloading runtime.">>; } // CUDA options diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h index 47ef175302679..004d5fbf4af73 100644 --- a/clang/include/clang/Driver/SanitizerArgs.h +++ b/clang/include/clang/Driver/SanitizerArgs.h @@ -80,6 +80,9 @@ class SanitizerArgs { bool needsMemProfRt() const { return NeedsMemProfRt; } bool needsAsanRt() const { return Sanitizers.has(SanitizerKind::Address); } + bool needsOffloadKernels() const { + return Sanitizers.has(SanitizerKind::Offload); + } bool needsHwasanRt() const { return Sanitizers.has(SanitizerKind::HWAddress); } diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index b09680086248d..b1d11b4a6497a 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -70,6 +70,7 @@ #include "llvm/Transforms/Instrumentation/BoundsChecking.h" #include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h" #include "llvm/Transforms/Instrumentation/GCOVProfiler.h" +#include "llvm/Transforms/Instrumentation/GPUSan.h" #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" #include "llvm/Transforms/Instrumentation/InstrProfiling.h" #include "llvm/Transforms/Instrumentation/KCFI.h" diff --git 
a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 43dfbbb90dd52..2ebe0bf802dfa 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -15,10 +15,12 @@ #include "CGCXXABI.h" #include "CodeGenFunction.h" #include "CodeGenModule.h" +#include "clang/AST/CharUnits.h" #include "clang/AST/Decl.h" #include "clang/Basic/Cuda.h" #include "clang/CodeGen/CodeGenABITypes.h" #include "clang/CodeGen/ConstantInitBuilder.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Frontend/Offloading/Utility.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -36,6 +38,11 @@ constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF" class CGNVCUDARuntime : public CGCUDARuntime { + /// The prefix used for function calls and section names (CUDA, HIP, LLVM) + StringRef Prefix; + /// TODO: We should transition the OpenMP section to LLVM/Offload + StringRef SectionPrefix; + private: llvm::IntegerType *IntTy, *SizeTy; llvm::Type *VoidTy; @@ -132,6 +139,9 @@ class CGNVCUDARuntime : public CGCUDARuntime { return DummyFunc; } + Address prepareKernelArgs(CodeGenFunction &CGF, FunctionArgList &Args); + Address prepareKernelArgsLLVMOffload(CodeGenFunction &CGF, + FunctionArgList &Args); void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); std::string getDeviceSideName(const NamedDecl *ND) override; @@ -191,15 +201,11 @@ class CGNVCUDARuntime : public CGCUDARuntime { } // end anonymous namespace std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { - if (CGM.getLangOpts().HIP) - return ((Twine("hip") + Twine(FuncName)).str()); - return ((Twine("cuda") + Twine(FuncName)).str()); + return (Prefix + FuncName).str(); } std::string CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { - if (CGM.getLangOpts().HIP) - return ((Twine("__hip") + Twine(FuncName)).str()); - return ((Twine("__cuda") + 
Twine(FuncName)).str()); + return ("__" + Prefix + FuncName).str(); } static std::unique_ptr InitDeviceMC(CodeGenModule &CGM) { @@ -227,6 +233,14 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) SizeTy = CGM.SizeTy; VoidTy = CGM.VoidTy; PtrTy = CGM.UnqualPtrTy; + + if (CGM.getLangOpts().OffloadViaLLVM) { + Prefix = "llvm"; + SectionPrefix = "omp"; + } else if (CGM.getLangOpts().HIP) + SectionPrefix = Prefix = "hip"; + else + SectionPrefix = Prefix = "cuda"; } llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const { @@ -305,18 +319,58 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, } if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), CudaFeature::CUDA_USES_NEW_LAUNCH) || - (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI)) + (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI) || + (CGF.getLangOpts().OffloadViaLLVM)) emitDeviceStubBodyNew(CGF, Args); else emitDeviceStubBodyLegacy(CGF, Args); } -// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local -// array and kernels are launched using cudaLaunchKernel(). -void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, - FunctionArgList &Args) { - // Build the shadow stack entry at the very start of the function. +/// CUDA passes the arguments with a level of indirection. For example, a +/// (void*, short, void*) is passed as {void **, short *, void **} to the launch +/// function. For the LLVM/offload launch we flatten the arguments into the +/// struct directly. In addition, we include the size of the arguments, thus +/// pass {sizeof({void *, short, void *}), ptr to {void *, short, void *}, +/// nullptr}. The last nullptr needs to be initialized to an array of pointers +/// pointing to the arguments if we want to offload to the host. 
+Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF, + FunctionArgList &Args) { + SmallVector ArgTypes, KernelLaunchParamsTypes; + for (auto &Arg : Args) + ArgTypes.push_back(CGF.ConvertTypeForMem(Arg->getType())); + llvm::StructType *KernelArgsTy = llvm::StructType::create(ArgTypes); + + auto *Int64Ty = CGF.Builder.getInt64Ty(); + KernelLaunchParamsTypes.push_back(Int64Ty); + KernelLaunchParamsTypes.push_back(PtrTy); + KernelLaunchParamsTypes.push_back(PtrTy); + + llvm::StructType *KernelLaunchParamsTy = + llvm::StructType::create(KernelLaunchParamsTypes); + Address KernelArgs = CGF.CreateTempAllocaWithoutCast( + KernelArgsTy, CharUnits::fromQuantity(16), "kernel_args"); + Address KernelLaunchParams = CGF.CreateTempAllocaWithoutCast( + KernelLaunchParamsTy, CharUnits::fromQuantity(16), + "kernel_launch_params"); + + auto KernelArgsSize = CGM.getDataLayout().getTypeAllocSize(KernelArgsTy); + CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSize), + CGF.Builder.CreateStructGEP(KernelLaunchParams, 0)); + CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF), + CGF.Builder.CreateStructGEP(KernelLaunchParams, 1)); + CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy), + CGF.Builder.CreateStructGEP(KernelLaunchParams, 2)); + + for (unsigned i = 0; i < Args.size(); ++i) { + auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i])); + CGF.Builder.CreateStore(ArgVal, CGF.Builder.CreateStructGEP(KernelArgs, i)); + } + return KernelLaunchParams; +} + +Address CGNVCUDARuntime::prepareKernelArgs(CodeGenFunction &CGF, + FunctionArgList &Args) { // Calculate amount of space we will need for all arguments. If we have no // args, allocate a single pointer so we still have a valid pointer to the // argument array that we can pass to runtime, even if it will be unused. 
@@ -331,6 +385,17 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, VoidVarPtr, CGF.Builder.CreateConstGEP1_32( PtrTy, KernelArgs.emitRawPointer(CGF), i)); } + return KernelArgs; +} + +// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local +// array and kernels are launched using cudaLaunchKernel(). +void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, + FunctionArgList &Args) { + // Build the shadow stack entry at the very start of the function. + Address KernelArgs = CGF.getLangOpts().OffloadViaLLVM + ? prepareKernelArgsLLVMOffload(CGF, Args) + : prepareKernelArgs(CGF, Args); llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); @@ -1129,8 +1194,9 @@ void CGNVCUDARuntime::transformManagedVars() { // registered. The linker will provide a pointer to this section so we can // register the symbols with the linked device image. void CGNVCUDARuntime::createOffloadingEntries() { - StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries" - : "cuda_offloading_entries"; + SmallVector Out; + StringRef Section = (SectionPrefix + "_offloading_entries").toStringRef(Out); + llvm::Module &M = CGM.getModule(); for (KernelInfo &I : EmittedKernels) llvm::offloading::emitOffloadingEntry( @@ -1199,7 +1265,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() { } return nullptr; } - if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode) + if (CGM.getLangOpts().OffloadViaLLVM || + (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)) createOffloadingEntries(); else return makeModuleCtorFunction(); diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp index a88bb2af59fee..90ebc638bd45e 100644 --- a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -456,6 +456,10 @@ llvm::Function *CodeGenModule::CreateGlobalInitOrCleanUpFunction( !isInNoSanitizeList(SanitizerKind::Address, Fn, Loc)) Fn->addFnAttr(llvm::Attribute::SanitizeAddress); + if 
(getLangOpts().Sanitize.has(SanitizerKind::Offload) && + !isInNoSanitizeList(SanitizerKind::Offload, Fn, Loc)) + Fn->addFnAttr(llvm::Attribute::SanitizeAddress); + if (getLangOpts().Sanitize.has(SanitizerKind::KernelAddress) && !isInNoSanitizeList(SanitizerKind::KernelAddress, Fn, Loc)) Fn->addFnAttr(llvm::Attribute::SanitizeAddress); diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index f73d32de7c484..450371aef12b9 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -639,27 +639,42 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S, // Build the argument list. bool NeedWrapperFunction = getDebugInfo() && CGM.getCodeGenOpts().hasReducedDebugInfo(); - FunctionArgList Args; - llvm::MapVector> LocalAddrs; - llvm::DenseMap> VLASizes; + FunctionArgList Args, WrapperArgs; + llvm::MapVector> LocalAddrs, + WrapperLocalAddrs; + llvm::DenseMap> VLASizes, + WrapperVLASizes; SmallString<256> Buffer; llvm::raw_svector_ostream Out(Buffer); Out << CapturedStmtInfo->getHelperName(); - if (NeedWrapperFunction) + + CodeGenFunction WrapperCGF(CGM, /*suppressNewContext=*/true); + llvm::Function *WrapperF = nullptr; + if (NeedWrapperFunction) { + // Emit the final kernel early to allow attributes to be added by the + // OpenMPIRBuilder. 
+ FunctionOptions WrapperFO(&S, /*UIntPtrCastRequired=*/true, + /*RegisterCastedArgsOnly=*/true, + CapturedStmtInfo->getHelperName(), Loc); + WrapperCGF.CapturedStmtInfo = CapturedStmtInfo; + WrapperF = + emitOutlinedFunctionPrologue(WrapperCGF, Args, LocalAddrs, VLASizes, + WrapperCGF.CXXThisValue, WrapperFO); Out << "_debug__"; + } FunctionOptions FO(&S, !NeedWrapperFunction, /*RegisterCastedArgsOnly=*/false, Out.str(), Loc); - llvm::Function *F = emitOutlinedFunctionPrologue(*this, Args, LocalAddrs, - VLASizes, CXXThisValue, FO); + llvm::Function *F = emitOutlinedFunctionPrologue( + *this, WrapperArgs, WrapperLocalAddrs, WrapperVLASizes, CXXThisValue, FO); CodeGenFunction::OMPPrivateScope LocalScope(*this); - for (const auto &LocalAddrPair : LocalAddrs) { + for (const auto &LocalAddrPair : WrapperLocalAddrs) { if (LocalAddrPair.second.first) { LocalScope.addPrivate(LocalAddrPair.second.first, LocalAddrPair.second.second); } } (void)LocalScope.Privatize(); - for (const auto &VLASizePair : VLASizes) + for (const auto &VLASizePair : WrapperVLASizes) VLASizeMap[VLASizePair.second.first] = VLASizePair.second.second; PGO.assignRegionCounters(GlobalDecl(CD), F); CapturedStmtInfo->EmitBody(*this, CD->getBody()); @@ -668,17 +683,10 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S, if (!NeedWrapperFunction) return F; - FunctionOptions WrapperFO(&S, /*UIntPtrCastRequired=*/true, - /*RegisterCastedArgsOnly=*/true, - CapturedStmtInfo->getHelperName(), Loc); - CodeGenFunction WrapperCGF(CGM, /*suppressNewContext=*/true); - WrapperCGF.CapturedStmtInfo = CapturedStmtInfo; - Args.clear(); - LocalAddrs.clear(); - VLASizes.clear(); - llvm::Function *WrapperF = - emitOutlinedFunctionPrologue(WrapperCGF, Args, LocalAddrs, VLASizes, - WrapperCGF.CXXThisValue, WrapperFO); + // Reverse the order. 
+ WrapperF->removeFromParent(); + F->getParent()->getFunctionList().insertAfter(F->getIterator(), WrapperF); + llvm::SmallVector CallArgs; auto *PI = F->arg_begin(); for (const auto *Arg : Args) { diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index cea0d84c64bc4..d11c4ed65a597 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -31,6 +31,7 @@ #include "clang/AST/StmtObjC.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/CodeGenOptions.h" +#include "clang/Basic/Sanitizers.h" #include "clang/Basic/TargetBuiltins.h" #include "clang/Basic/TargetInfo.h" #include "clang/CodeGen/CGFunctionInfo.h" @@ -67,7 +68,8 @@ static bool shouldEmitLifetimeMarkers(const CodeGenOptions &CGOpts, // Sanitizers may use markers. if (CGOpts.SanitizeAddressUseAfterScope || LangOpts.Sanitize.has(SanitizerKind::HWAddress) || - LangOpts.Sanitize.has(SanitizerKind::Memory)) + LangOpts.Sanitize.has(SanitizerKind::Memory) || + LangOpts.Sanitize.has(SanitizerKind::Offload)) return true; // For now, only in optimized builds. @@ -791,6 +793,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, SanOpts.set(SanitizerKind::KernelHWAddress, false); if (no_sanitize_mask & SanitizerKind::KernelHWAddress) SanOpts.set(SanitizerKind::HWAddress, false); + if (no_sanitize_mask & SanitizerKind::Offload) + SanOpts.set(SanitizerKind::Offload, false); if (SanitizeBounds && !SanOpts.hasOneOf(SanitizerKind::Bounds)) Fn->addFnAttr(llvm::Attribute::NoSanitizeBounds); @@ -809,7 +813,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, CurFn->addFnAttr(llvm::Attribute::DisableSanitizerInstrumentation); } else { // Apply sanitizer attributes to the function. 
- if (SanOpts.hasOneOf(SanitizerKind::Address | SanitizerKind::KernelAddress)) + if (SanOpts.hasOneOf(SanitizerKind::Address | SanitizerKind::KernelAddress | + SanitizerKind::Offload)) Fn->addFnAttr(llvm::Attribute::SanitizeAddress); if (SanOpts.hasOneOf(SanitizerKind::HWAddress | SanitizerKind::KernelHWAddress)) diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 67bf0604acd6e..0f55b429512c2 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -792,11 +792,13 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, }) || C.getInputArgs().hasArg(options::OPT_hip_link) || C.getInputArgs().hasArg(options::OPT_hipstdpar); + bool UseLLVMOffload = C.getInputArgs().hasArg( + options::OPT_foffload_via_llvm, options::OPT_fno_offload_via_llvm, false); if (IsCuda && IsHIP) { Diag(clang::diag::err_drv_mix_cuda_hip); return; } - if (IsCuda) { + if (IsCuda && !UseLLVMOffload) { const ToolChain *HostTC = C.getSingleOffloadToolChain(); const llvm::Triple &HostTriple = HostTC->getTriple(); auto OFK = Action::OFK_Cuda; @@ -818,7 +820,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, CudaInstallation.WarnIfUnsupportedVersion(); } C.addOffloadDeviceToolChain(CudaTC.get(), OFK); - } else if (IsHIP) { + } else if (IsHIP && !UseLLVMOffload) { if (auto *OMPTargetArg = C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) { Diag(clang::diag::err_drv_unsupported_opt_for_language_mode) @@ -842,10 +844,11 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, // We need to generate an OpenMP toolchain if the user specified targets with // the -fopenmp-targets option or used --offload-arch with OpenMP enabled. 
bool IsOpenMPOffloading = - C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, - options::OPT_fno_openmp, false) && - (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) || - C.getInputArgs().hasArg(options::OPT_offload_arch_EQ)); + ((IsCuda || IsHIP) && UseLLVMOffload) || + (C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, + options::OPT_fno_openmp, false) && + (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) || + C.getInputArgs().hasArg(options::OPT_offload_arch_EQ))); if (IsOpenMPOffloading) { // We expect that -fopenmp-targets is always used in conjunction with the // option -fopenmp specifying a valid runtime with offloading support, i.e. @@ -873,7 +876,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, for (StringRef T : OpenMPTargets->getValues()) OpenMPTriples.insert(T); } else if (C.getInputArgs().hasArg(options::OPT_offload_arch_EQ) && - !IsHIP && !IsCuda) { + ((!IsHIP && !IsCuda) || UseLLVMOffload)) { const ToolChain *HostTC = C.getSingleOffloadToolChain(); auto AMDTriple = getHIPOffloadTargetTriple(*this, C.getInputArgs()); auto NVPTXTriple = getNVIDIAOffloadTargetTriple(*this, C.getInputArgs(), @@ -4146,6 +4149,8 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args, bool UseNewOffloadingDriver = C.isOffloadingHostKind(Action::OFK_OpenMP) || + Args.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false) || Args.hasFlag(options::OPT_offload_new_driver, options::OPT_no_offload_new_driver, false); diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 40ab2e91125d1..05b53c5573a1c 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -1367,7 +1367,8 @@ SanitizerMask ToolChain::getSupportedSanitizers() const { SanitizerKind::CFICastStrict | SanitizerKind::FloatDivideByZero | SanitizerKind::KCFI | SanitizerKind::UnsignedIntegerOverflow | SanitizerKind::UnsignedShiftBase | 
SanitizerKind::ImplicitConversion | - SanitizerKind::Nullability | SanitizerKind::LocalBounds; + SanitizerKind::Nullability | SanitizerKind::LocalBounds | + SanitizerKind::Offload; if (getTriple().getArch() == llvm::Triple::x86 || getTriple().getArch() == llvm::Triple::x86_64 || getTriple().getArch() == llvm::Triple::arm || getTriple().isWasm() || diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index b8d8ff3db5d1f..77d365bad229f 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1125,6 +1125,18 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA, CmdArgs.push_back("__clang_openmp_device_functions.h"); } + if (Args.hasArg(options::OPT_foffload_via_llvm)) { + // Add llvm_wrappers/* to our system include path. This lets us wrap + // standard library headers and other headers. + SmallString<128> P(D.ResourceDir); + llvm::sys::path::append(P, "include", "llvm_offload_wrappers"); + CmdArgs.append({"-internal-isystem", Args.MakeArgString(P), "-include"}); + if (JA.isDeviceOffloading(Action::OFK_OpenMP)) + CmdArgs.push_back("__llvm_offload_device.h"); + else + CmdArgs.push_back("__llvm_offload_host.h"); + } + // Add -i* options, and automatically translate to // -include-pch/-include-pth for transparent PCH support. It's // wonky, but we include looking for .gch so we can support seamless @@ -6598,6 +6610,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, // device offloading action other than OpenMP. 
if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, options::OPT_fno_openmp, false) && + !Args.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false) && (JA.isDeviceOffloading(Action::OFK_None) || JA.isDeviceOffloading(Action::OFK_OpenMP))) { switch (D.getOpenMPRuntime(Args)) { @@ -6675,11 +6689,16 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.addOptOutFlag(CmdArgs, options::OPT_fopenmp_extensions, options::OPT_fno_openmp_extensions); } - - // Forward the new driver to change offloading code generation. - if (Args.hasFlag(options::OPT_offload_new_driver, - options::OPT_no_offload_new_driver, false)) + // Forward the offload runtime change to code generation, liboffload implies + // new driver. Otherwise, check if we should forward the new driver to change + // offloading code generation. + if (Args.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false)) { + CmdArgs.append({"--offload-new-driver", "-foffload-via-llvm"}); + } else if (Args.hasFlag(options::OPT_offload_new_driver, + options::OPT_no_offload_new_driver, false)) { CmdArgs.push_back("--offload-new-driver"); + } SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 2a4c1369f5a73..ecbee87ed6486 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1204,9 +1204,20 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs, const ToolChain &TC, const ArgList &Args, bool ForceStaticHostRuntime, bool IsOffloadingHost, bool GompNeedsRT) { + const SanitizerArgs &SanArgs = TC.getSanitizerArgs(Args); + if (SanArgs.needsOffloadKernels()) { + CmdArgs.push_back("-loffload.kernels"); + CmdArgs.append({"-mllvm", "-enable-offload-sanitizer"}); + } + if (!Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, - options::OPT_fno_openmp, false)) + 
options::OPT_fno_openmp, false)) { + // We need libomptarget (liboffload) if it's the chosen offloading runtime. + if (Args.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false)) + CmdArgs.push_back("-lomptarget"); return false; + } Driver::OpenMPRuntimeKind RTKind = TC.getDriver().getOpenMPRuntime(Args); diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 2dfc7457b0ac7..7d154865ce3f2 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -861,17 +861,15 @@ void CudaToolChain::addClangTargetOptions( DeviceOffloadingKind == Action::OFK_Cuda) && "Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs."); - if (DeviceOffloadingKind == Action::OFK_Cuda) { - CC1Args.append( - {"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"}); - - // Unsized function arguments used for variadics were introduced in CUDA-9.0 - // We still do not support generating code that actually uses variadic - // arguments yet, but we do need to allow parsing them as recent CUDA - // headers rely on that. https://github.com/llvm/llvm-project/issues/58410 - if (CudaInstallation.version() >= CudaVersion::CUDA_90) - CC1Args.push_back("-fcuda-allow-variadic-functions"); - } + CC1Args.append( + {"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"}); + + // Unsized function arguments used for variadics were introduced in CUDA-9.0 + // We still do not support generating code that actually uses variadic + // arguments yet, but we do need to allow parsing them as recent CUDA + // headers rely on that. 
https://github.com/llvm/llvm-project/issues/58410 + if (CudaInstallation.version() >= CudaVersion::CUDA_90) + CC1Args.push_back("-fcuda-allow-variadic-functions"); if (DriverArgs.hasArg(options::OPT_nogpulib)) return; @@ -889,6 +887,13 @@ void CudaToolChain::addClangTargetOptions( CC1Args.push_back("-mlink-builtin-bitcode"); CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile)); + // For now, we don't use any Offload/OpenMP device runtime when we offload + // CUDA via LLVM/Offload. We should split the Offload/OpenMP device runtime + // and include the "generic" (or CUDA-specific) parts. + if (DriverArgs.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false)) + return; + clang::CudaVersion CudaInstallationVersion = CudaInstallation.version(); if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr, diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index d3090e488306f..251e5b0ba2381 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -325,6 +325,13 @@ set(openmp_wrapper_files openmp_wrappers/new ) +set(llvm_offload_wrapper_files + llvm_offload_wrappers/__llvm_offload.h + llvm_offload_wrappers/__llvm_offload_host.h + llvm_offload_wrappers/__llvm_offload_device.h + llvm_offload_wrappers/cuda_runtime.h +) + set(llvm_libc_wrapper_files llvm_libc_wrappers/assert.h llvm_libc_wrappers/stdio.h @@ -375,7 +382,7 @@ endfunction(clang_generate_header) # Copy header files from the source directory to the build directory foreach( f ${files} ${cuda_wrapper_files} ${cuda_wrapper_bits_files} ${ppc_wrapper_files} ${openmp_wrapper_files} ${zos_wrapper_files} ${hlsl_files} - ${llvm_libc_wrapper_files}) + ${llvm_libc_wrapper_files} ${llvm_offload_wrapper_files}) copy_header_to_output_dir(${CMAKE_CURRENT_SOURCE_DIR} ${f}) endforeach( f ) @@ -501,6 +508,7 @@ add_header_target("hlsl-resource-headers" ${hlsl_files}) add_header_target("opencl-resource-headers" ${opencl_files}) 
add_header_target("llvm-libc-resource-headers" ${llvm_libc_wrapper_files}) add_header_target("openmp-resource-headers" ${openmp_wrapper_files}) +add_header_target("llvm-offload-resource-headers" ${llvm_offload_wrapper_files}) add_header_target("windows-resource-headers" ${windows_only_files}) add_header_target("utility-resource-headers" ${utility_files}) @@ -542,6 +550,11 @@ install( DESTINATION ${header_install_dir}/openmp_wrappers COMPONENT clang-resource-headers) +install( + FILES ${llvm_offload_wrapper_files} + DESTINATION ${header_install_dir}/llvm_offload_wrappers + COMPONENT clang-resource-headers) + install( FILES ${zos_wrapper_files} DESTINATION ${header_install_dir}/zos_wrappers @@ -704,8 +717,8 @@ install( COMPONENT openmp-resource-headers) install( - FILES ${openmp_wrapper_files} - DESTINATION ${header_install_dir}/openmp_wrappers + FILES ${llvm_offload_wrapper_files} + DESTINATION ${header_install_dir}/llvm_offload_wrappers EXCLUDE_FROM_ALL COMPONENT openmp-resource-headers) diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h new file mode 100644 index 0000000000000..2898898904e29 --- /dev/null +++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h @@ -0,0 +1,31 @@ +/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#include + +#define __host__ __attribute__((host)) +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) +#define __shared__ __attribute__((shared)) +#define __constant__ __attribute__((constant)) +#define __managed__ __attribute__((managed)) + +extern "C" { + +typedef struct dim3 { + dim3() {} + dim3(unsigned x) : x(x) {} + unsigned x = 0, y = 0, z = 0; +} dim3; + +// TODO: For some reason the CUDA device compilation requires this declaration +// to be present on the device while it is only used on the host. +unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim, + size_t sharedMem = 0, void *stream = 0); +} diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h new file mode 100644 index 0000000000000..1a813b331515b --- /dev/null +++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h @@ -0,0 +1,10 @@ +/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#include "__llvm_offload.h" diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h new file mode 100644 index 0000000000000..160289d169b55 --- /dev/null +++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h @@ -0,0 +1,15 @@ +/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#include "__llvm_offload.h" + +extern "C" { +unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, + void **args, size_t sharedMem = 0, void *stream = 0); +} diff --git a/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h b/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h new file mode 100644 index 0000000000000..2d698e1c14e49 --- /dev/null +++ b/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h @@ -0,0 +1,137 @@ +/*===- __cuda_runtime.h - LLVM/Offload wrappers for CUDA runtime API -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CUDA_RUNTIME_API__ +#define __CUDA_RUNTIME_API__ + +#include +#include +#include + +extern "C" { +int omp_get_initial_device(void); +void omp_target_free(void *Ptr, int Device); +void *omp_target_alloc(size_t Size, int Device); +int omp_target_memcpy(void *Dst, const void *Src, size_t Length, + size_t DstOffset, size_t SrcOffset, int DstDevice, + int SrcDevice); +void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum); +int __tgt_target_synchronize_async_info_queue(void *Loc, int64_t DeviceNum, + void *AsyncInfoQueue); +} + +// TODO: There are many fields missing in this enumeration. 
+typedef enum cudaError { + cudaSuccess = 0, + cudaErrorInvalidValue = 1, + cudaErrorMemoryAllocation = 2, + cudaErrorNoDevice = 100, + cudaErrorInvalidDevice = 101, + cudaErrorOTHER = -1, +} cudaError_t; + +enum cudaMemcpyKind { + cudaMemcpyHostToHost = 0, + cudaMemcpyHostToDevice = 1, + cudaMemcpyDeviceToHost = 2, + cudaMemcpyDeviceToDevice = 3, + cudaMemcpyDefault = 4 +}; + +typedef void *cudaStream_t; + +static thread_local cudaError_t __cudaomp_last_error = cudaSuccess; + +// Returns the last error that has been produced and resets it to cudaSuccess. +inline cudaError_t cudaGetLastError() { + cudaError_t TempError = __cudaomp_last_error; + __cudaomp_last_error = cudaSuccess; + return TempError; +} + +// Returns the last error that has been produced without resetting it. +inline cudaError_t cudaPeekAtLastError() { return __cudaomp_last_error; } + +inline cudaError_t cudaDeviceSynchronize() { + int DeviceNum = 0; + return __cudaomp_last_error = + (cudaError_t)__tgt_target_synchronize_async_info_queue( + /*Loc=*/nullptr, DeviceNum, /*AsyncInfoQueue=*/nullptr); +} + +inline cudaError_t __cudaMalloc(void **devPtr, size_t size) { + int DeviceNum = 0; + *devPtr = omp_target_alloc(size, DeviceNum); + if (*devPtr == NULL) + return __cudaomp_last_error = cudaErrorMemoryAllocation; + + return __cudaomp_last_error = cudaSuccess; +} + +template cudaError_t cudaMalloc(T **devPtr, size_t size) { + return __cudaMalloc((void **)devPtr, size); +} + +inline cudaError_t __cudaFree(void *devPtr) { + int DeviceNum = 0; + omp_target_free(devPtr, DeviceNum); + return __cudaomp_last_error = cudaSuccess; +} + +template inline cudaError_t cudaFree(T *ptr) { + return __cudaFree((void *)ptr); +} + +inline cudaError_t __cudaMemcpy(void *dst, const void *src, size_t count, + cudaMemcpyKind kind) { + // get the host device number (which is the initial device) + int HostDeviceNum = omp_get_initial_device(); + + // use the default device for gpu + int GPUDeviceNum = 0; + + // default to copy 
from host to device + int DstDeviceNum = GPUDeviceNum; + int SrcDeviceNum = HostDeviceNum; + + if (kind == cudaMemcpyDeviceToHost) + std::swap(DstDeviceNum, SrcDeviceNum); + + // omp_target_memcpy returns 0 on success and non-zero on failure + if (omp_target_memcpy(dst, src, count, 0, 0, DstDeviceNum, SrcDeviceNum)) + return __cudaomp_last_error = cudaErrorInvalidValue; + return __cudaomp_last_error = cudaSuccess; +} + +template +inline cudaError_t cudaMemcpy(T *dst, const T *src, size_t count, + cudaMemcpyKind kind) { + return __cudaMemcpy((void *)dst, (const void *)src, count, kind); +} + +inline cudaError_t __cudaMemset(void *devPtr, int value, size_t count, + cudaStream_t stream = 0) { + int DeviceNum = 0; + if (!omp_target_memset(devPtr, value, count, DeviceNum)) + return __cudaomp_last_error = cudaErrorInvalidValue; + return __cudaomp_last_error = cudaSuccess; +} + +template +inline cudaError_t cudaMemset(T *devPtr, int value, size_t count) { + return __cudaMemset((void *)devPtr, value, count); +} + +inline cudaError_t cudaDeviceReset(void) { + cudaDeviceSynchronize(); + // TODO: not implemented. + return __cudaomp_last_error = cudaSuccess; +} + +#endif diff --git a/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h index d5b6846b03488..3e354c63efc66 100644 --- a/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h +++ b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h @@ -10,17 +10,15 @@ #ifndef __CLANG_OPENMP_DEVICE_FUNCTIONS_H__ #define __CLANG_OPENMP_DEVICE_FUNCTIONS_H__ -#ifndef _OPENMP -#error "This file is for OpenMP compilation only." 
-#endif - #ifdef __cplusplus extern "C" { #endif +#ifdef __NVPTX__ #pragma omp begin declare variant match( \ device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) +#pragma push_macro("__CUDA__") #define __CUDA__ #define __OPENMP_NVPTX__ @@ -31,9 +29,10 @@ extern "C" { #include <__clang_cuda_device_functions.h> #undef __OPENMP_NVPTX__ -#undef __CUDA__ +#pragma pop_macro("__CUDA__") #pragma omp end declare variant +#endif #ifdef __AMDGCN__ #pragma omp begin declare variant match(device = {arch(amdgcn)}) diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp index 580b9872c6a1d..ec37c0df56c67 100644 --- a/clang/lib/Sema/SemaCUDA.cpp +++ b/clang/lib/Sema/SemaCUDA.cpp @@ -1068,6 +1068,9 @@ void SemaCUDA::inheritTargetAttrs(FunctionDecl *FD, } std::string SemaCUDA::getConfigureFuncName() const { + if (getLangOpts().OffloadViaLLVM) + return "__llvmPushCallConfiguration"; + if (getLangOpts().HIP) return getLangOpts().HIPUseNewLaunchAPI ? "__hipPushCallConfiguration" : "hipConfigureCall"; diff --git a/clang/test/CodeGenCUDA/offload_via_llvm.cu b/clang/test/CodeGenCUDA/offload_via_llvm.cu new file mode 100644 index 0000000000000..3eb580850fc48 --- /dev/null +++ b/clang/test/CodeGenCUDA/offload_via_llvm.cu @@ -0,0 +1,97 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang -Xclang -triple -Xclang "x86_64-unknown-linux-gnu" -S -c -foffload-via-llvm -emit-llvm -o - %s | FileCheck %s + +// Check that we generate LLVM/Offload calls, including the KERNEL_LAUNCH_PARAMS argument. 
+ +// CHECK-LABEL: define dso_local void @_Z18__device_stub__fooisPvS_( +// CHECK-SAME: i32 noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[TMP0]], align 16 +// CHECK-NEXT: [[KERNEL_LAUNCH_PARAMS:%.*]] = alloca [[TMP1]], align 16 +// CHECK-NEXT: [[GRID_DIM:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 8 +// CHECK-NEXT: [[BLOCK_DIM:%.*]] = alloca [[STRUCT_DIM3]], align 8 +// CHECK-NEXT: [[SHMEM_SIZE:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[STREAM:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[GRID_DIM_COERCE:%.*]] = alloca { i64, i32 }, align 8 +// CHECK-NEXT: [[BLOCK_DIM_COERCE:%.*]] = alloca { i64, i32 }, align 8 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4 +// CHECK-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR3]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 0 +// CHECK-NEXT: store i64 24, ptr [[TMP4]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 1 +// CHECK-NEXT: store ptr [[KERNEL_ARGS]], ptr [[TMP5]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 2 +// CHECK-NEXT: store ptr null, ptr [[TMP6]], align 16 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP8]], align 16 +// CHECK-NEXT: [[TMP9:%.*]] = load 
i16, ptr [[DOTADDR1]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK-NEXT: store i16 [[TMP9]], ptr [[TMP10]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTADDR2]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK-NEXT: store ptr [[TMP11]], ptr [[TMP12]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTADDR3]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK-NEXT: store ptr [[TMP13]], ptr [[TMP14]], align 16 +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__llvmPopCallConfiguration(ptr [[GRID_DIM]], ptr [[BLOCK_DIM]], ptr [[SHMEM_SIZE]], ptr [[STREAM]]) +// CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[SHMEM_SIZE]], align 8 +// CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[STREAM]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[GRID_DIM_COERCE]], ptr align 8 [[GRID_DIM]], i64 12, i1 false) +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[GRID_DIM_COERCE]], i32 0, i32 0 +// CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP18]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[GRID_DIM_COERCE]], i32 0, i32 1 +// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[BLOCK_DIM_COERCE]], ptr align 8 [[BLOCK_DIM]], i64 12, i1 false) +// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[BLOCK_DIM_COERCE]], i32 0, i32 0 +// CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP22]], align 8 +// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[BLOCK_DIM_COERCE]], i32 0, i32 1 +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 8 +// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @llvmLaunchKernel(ptr noundef @_Z18__device_stub__fooisPvS_, i64 [[TMP19]], i32 
[[TMP21]], i64 [[TMP23]], i32 [[TMP25]], ptr noundef [[KERNEL_LAUNCH_PARAMS]], i64 noundef [[TMP16]], ptr noundef [[TMP17]]) +// CHECK-NEXT: br label %[[SETUP_END:.*]] +// CHECK: [[SETUP_END]]: +// CHECK-NEXT: ret void +// +__global__ void foo(int, short, void *, void *) {} + +// CHECK-LABEL: define dso_local void @_Z5test1Pv( +// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 4 +// CHECK-NEXT: [[AGG_TMP1:%.*]] = alloca [[STRUCT_DIM3]], align 4 +// CHECK-NEXT: [[AGG_TMP_COERCE:%.*]] = alloca { i64, i32 }, align 4 +// CHECK-NEXT: [[AGG_TMP1_COERCE:%.*]] = alloca { i64, i32 }, align 4 +// CHECK-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN4dim3C2Ej(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP]], i32 noundef 3) +// CHECK-NEXT: call void @_ZN4dim3C2Ej(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP1]], i32 noundef 7) +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP_COERCE]], ptr align 4 [[AGG_TMP]], i64 12, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP_COERCE]], i32 0, i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[TMP0]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP_COERCE]], i32 0, i32 1 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP1_COERCE]], ptr align 4 [[AGG_TMP1]], i64 12, i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP1_COERCE]], i32 0, i32 0 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP4]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP1_COERCE]], i32 0, i32 1 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +// CHECK-NEXT: [[CALL:%.*]] 
= call i32 @__llvmPushCallConfiguration(i64 [[TMP1]], i32 [[TMP3]], i64 [[TMP5]], i32 [[TMP7]], i64 noundef 0, ptr noundef null) +// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[CALL]], 0 +// CHECK-NEXT: br i1 [[TOBOOL]], label %[[KCALL_END:.*]], label %[[KCALL_CONFIGOK:.*]] +// CHECK: [[KCALL_CONFIGOK]]: +// CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: call void @_Z18__device_stub__fooisPvS_(i32 noundef 13, i16 noundef signext 1, ptr noundef [[TMP8]], ptr noundef [[TMP9]]) #[[ATTR5:[0-9]+]] +// CHECK-NEXT: br label %[[KCALL_END]] +// CHECK: [[KCALL_END]]: +// CHECK-NEXT: ret void +// +void test1(void *Ptr) { + foo<<<3, 7>>>(13, 1, Ptr, Ptr); +} diff --git a/clang/test/Driver/cuda-via-liboffload.cu b/clang/test/Driver/cuda-via-liboffload.cu new file mode 100644 index 0000000000000..68dc963e906b2 --- /dev/null +++ b/clang/test/Driver/cuda-via-liboffload.cu @@ -0,0 +1,23 @@ +// RUN: %clang -### -target x86_64-linux-gnu -foffload-via-llvm -ccc-print-bindings \ +// RUN: --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \ +// RUN: | FileCheck -check-prefix BINDINGS %s + +// BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_BC:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[PTX_SM_35:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_35]]"], output: "[[CUBIN_SM_35:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[PTX_SM_70:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_70:.+]]"], output: "[[CUBIN_SM_70:.+]]" +// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[CUBIN_SM_35]]", "[[CUBIN_SM_70]]"], output: "[[BINARY:.+]]" +// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[BINARY]]"], 
output: "[[HOST_OBJ:.+]]" +// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out" + +// RUN: %clang -### -target x86_64-linux-gnu -foffload-via-llvm -ccc-print-bindings \ +// RUN: --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \ +// RUN: | FileCheck -check-prefix BINDINGS-DEVICE %s + +// BINDINGS-DEVICE: # "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[PTX:.+]]" +// BINDINGS-DEVICE: # "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX]]"], output: "[[CUBIN:.+]]" + +// RUN: %clang -### -target x86_64-linux-gnu -ccc-print-bindings --offload-link -foffload-via-llvm %s 2>&1 | FileCheck -check-prefix DEVICE-LINK %s + +// DEVICE-LINK: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[INPUT:.+]]"], output: "a.out" diff --git a/clang/test/OpenMP/parallel_codegen.cpp b/clang/test/OpenMP/parallel_codegen.cpp index 9082f1c3232af..9cdb1da996152 100644 --- a/clang/test/OpenMP/parallel_codegen.cpp +++ b/clang/test/OpenMP/parallel_codegen.cpp @@ -115,7 +115,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 // CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 // CHECK1-NEXT: invoke void @_Z3fooIiEvT_(i32 noundef [[TMP2]]) -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr @global, align 4 // CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 @@ -123,7 +123,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP4:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP4]], 0 // CHECK1-NEXT: call void @__clang_call_terminate(ptr 
[[TMP5]]) #[[ATTR6:[0-9]+]] // CHECK1-NEXT: unreachable @@ -186,7 +186,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 // CHECK1-NEXT: invoke void @_Z3fooIiEvT_(i32 noundef [[TMP3]]) -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 @@ -194,7 +194,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP5:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP6:%.*]] = extractvalue { ptr, i32 } [[TMP5]], 0 // CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP6]]) #[[ATTR6]] // CHECK1-NEXT: unreachable @@ -233,7 +233,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 // CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 // CHECK1-NEXT: invoke void @_Z3fooIiEvT_(i32 noundef [[TMP2]]) -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr @global, align 4 // CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 @@ -241,7 +241,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP4:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP4]], 0 // CHECK1-NEXT: call void 
@__clang_call_terminate(ptr [[TMP5]]) #[[ATTR6]] // CHECK1-NEXT: unreachable @@ -278,7 +278,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK1-NEXT: invoke void @_Z3fooIPPcEvT_(ptr noundef [[TMP2]]) -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR]], align 8 // CHECK1-NEXT: [[TMP4:%.*]] = mul nsw i64 0, [[TMP1]] @@ -287,7 +287,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP5:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP6:%.*]] = extractvalue { ptr, i32 } [[TMP5]], 0 // CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP6]]) #[[ATTR6]] // CHECK1-NEXT: unreachable @@ -311,20 +311,20 @@ int main (int argc, char **argv) { // CHECK2-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 // CHECK2-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK2-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META18:![0-9]+]], metadata !DIExpression()), !dbg [[DBG19:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META18:![0-9]+]], metadata !DIExpression()), !dbg [[DBG19:![0-9]+]] // CHECK2-NEXT: store ptr [[ARGV]], ptr [[ARGV_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[ARGV_ADDR]], metadata [[META20:![0-9]+]], metadata !DIExpression()), !dbg [[DBG21:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[ARGV_ADDR]], metadata [[META20:![0-9]+]], metadata !DIExpression()), !dbg [[DBG21:![0-9]+]] // CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr 
[[ARGC_ADDR]], align 4, !dbg [[DBG22:![0-9]+]] // CHECK2-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64, !dbg [[DBG23:![0-9]+]] // CHECK2-NEXT: [[TMP2:%.*]] = call ptr @llvm.stacksave.p0(), !dbg [[DBG23]] // CHECK2-NEXT: store ptr [[TMP2]], ptr [[SAVED_STACK]], align 8, !dbg [[DBG23]] // CHECK2-NEXT: [[VLA:%.*]] = alloca i32, i64 [[TMP1]], align 16, !dbg [[DBG23]] // CHECK2-NEXT: store i64 [[TMP1]], ptr [[__VLA_EXPR0]], align 8, !dbg [[DBG23]] -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[__VLA_EXPR0]], metadata [[META24:![0-9]+]], metadata !DIExpression()), !dbg [[DBG26:![0-9]+]] -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[VLA]], metadata [[META27:![0-9]+]], metadata !DIExpression()), !dbg [[DBG31:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[__VLA_EXPR0]], metadata [[META24:![0-9]+]], metadata !DIExpression()), !dbg [[DBG26:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[VLA]], metadata [[META27:![0-9]+]], metadata !DIExpression()), !dbg [[DBG31:![0-9]+]] // CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1:[0-9]+]], i32 2, ptr @main.omp_outlined, i64 [[TMP1]], ptr [[VLA]]), !dbg [[DBG32:![0-9]+]] -// CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB5:[0-9]+]], i32 1, ptr @main.omp_outlined.2, i64 [[TMP1]]), !dbg [[DBG33:![0-9]+]] -// CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB9:[0-9]+]], i32 2, ptr @main.omp_outlined.4, i64 [[TMP1]], ptr [[VLA]]), !dbg [[DBG34:![0-9]+]] +// CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB5:[0-9]+]], i32 1, ptr @main.omp_outlined.1, i64 [[TMP1]]), !dbg [[DBG33:![0-9]+]] +// CHECK2-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB9:[0-9]+]], i32 2, ptr @main.omp_outlined.3, i64 [[TMP1]], ptr [[VLA]]), !dbg [[DBG34:![0-9]+]] // CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8, !dbg [[DBG35:![0-9]+]] // CHECK2-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIPPcEiT_(ptr noundef [[TMP3]]), !dbg [[DBG36:![0-9]+]] // CHECK2-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4, !dbg [[DBG37:![0-9]+]] @@ -342,19 +342,19 @@ int main (int argc, char **argv) { // CHECK2-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META47:![0-9]+]], metadata !DIExpression()), !dbg [[DBG48:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META47:![0-9]+]], metadata !DIExpression()), !dbg [[DBG48:![0-9]+]] // CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META49:![0-9]+]], metadata !DIExpression()), !dbg [[DBG48]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META49:![0-9]+]], metadata !DIExpression()), !dbg [[DBG48]] // CHECK2-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META50:![0-9]+]], metadata !DIExpression()), !dbg [[DBG48]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META50:![0-9]+]], metadata !DIExpression()), !dbg [[DBG48]] // CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META51:![0-9]+]], metadata !DIExpression()), !dbg [[DBG52:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata 
ptr [[A_ADDR]], metadata [[META51:![0-9]+]], metadata !DIExpression()), !dbg [[DBG52:![0-9]+]] // CHECK2-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG53:![0-9]+]] // CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG53]] // CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG54:![0-9]+]] // CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !dbg [[DBG54]] // CHECK2-NEXT: invoke void @_Z3fooIiEvT_(i32 noundef [[TMP2]]) -// CHECK2-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG53]] +// CHECK2-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG53]] // CHECK2: invoke.cont: // CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr @global, align 4, !dbg [[DBG55:![0-9]+]] // CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG56:![0-9]+]] @@ -362,53 +362,53 @@ int main (int argc, char **argv) { // CHECK2-NEXT: ret void, !dbg [[DBG55]] // CHECK2: terminate.lpad: // CHECK2-NEXT: [[TMP4:%.*]] = landingpad { ptr, i32 } -// CHECK2-NEXT: catch ptr null, !dbg [[DBG53]] +// CHECK2-NEXT: catch ptr null, !dbg [[DBG53]] // CHECK2-NEXT: [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP4]], 0, !dbg [[DBG53]] // CHECK2-NEXT: call void @__clang_call_terminate(ptr [[TMP5]]) #[[ATTR7:[0-9]+]], !dbg [[DBG53]] // CHECK2-NEXT: unreachable, !dbg [[DBG53]] // // -// CHECK2-LABEL: define {{[^@]+}}@_Z3fooIiEvT_ -// CHECK2-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR4:[0-9]+]] comdat !dbg [[DBG58:![0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META63:![0-9]+]], metadata !DIExpression()), !dbg [[DBG64:![0-9]+]] -// CHECK2-NEXT: ret void, !dbg [[DBG65:![0-9]+]] -// -// -// CHECK2-LABEL: define {{[^@]+}}@__clang_call_terminate 
-// CHECK2-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] comdat { -// CHECK2-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR6:[0-9]+]] -// CHECK2-NEXT: call void @_ZSt9terminatev() #[[ATTR7]] -// CHECK2-NEXT: unreachable -// -// // CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined -// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] !dbg [[DBG66:![0-9]+]] { +// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] !dbg [[DBG58:![0-9]+]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META67:![0-9]+]], metadata !DIExpression()), !dbg [[DBG68:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META59:![0-9]+]], metadata !DIExpression()), !dbg [[DBG60:![0-9]+]] // CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META69:![0-9]+]], metadata !DIExpression()), !dbg [[DBG68]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META61:![0-9]+]], metadata !DIExpression()), !dbg [[DBG60]] // CHECK2-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META70:![0-9]+]], metadata 
!DIExpression()), !dbg [[DBG68]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META62:![0-9]+]], metadata !DIExpression()), !dbg [[DBG60]] // CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG68]] -// CHECK2-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG72:![0-9]+]] -// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG72]] -// CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG72]] -// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG72]] -// CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG72]] -// CHECK2-NEXT: call void @main.omp_outlined_debug__(ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP0]], ptr [[TMP4]]) #[[ATTR6]], !dbg [[DBG72]] -// CHECK2-NEXT: ret void, !dbg [[DBG72]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META63:![0-9]+]], metadata !DIExpression()), !dbg [[DBG60]] +// CHECK2-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG64:![0-9]+]] +// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG64]] +// CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG64]] +// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG64]] +// CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG64]] +// CHECK2-NEXT: call void @main.omp_outlined_debug__(ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP0]], ptr [[TMP4]]) #[[ATTR6:[0-9]+]], !dbg [[DBG64]] +// CHECK2-NEXT: ret void, !dbg [[DBG64]] // // -// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.1 +// CHECK2-LABEL: define {{[^@]+}}@_Z3fooIiEvT_ +// CHECK2-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR4:[0-9]+]] comdat !dbg [[DBG65:![0-9]+]] { +// 
CHECK2-NEXT: entry: +// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4 +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META70:![0-9]+]], metadata !DIExpression()), !dbg [[DBG71:![0-9]+]] +// CHECK2-NEXT: ret void, !dbg [[DBG72:![0-9]+]] +// +// +// CHECK2-LABEL: define {{[^@]+}}@__clang_call_terminate +// CHECK2-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] comdat { +// CHECK2-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR6]] +// CHECK2-NEXT: call void @_ZSt9terminatev() #[[ATTR7]] +// CHECK2-NEXT: unreachable +// +// +// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.2 // CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]]) #[[ATTR3]] !dbg [[DBG75:![0-9]+]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -418,27 +418,46 @@ int main (int argc, char **argv) { // CHECK2-NEXT: [[SAVED_STACK:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 // CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META78:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META78:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79:![0-9]+]] // CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META80:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META80:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]] // CHECK2-NEXT: store i64 [[VLA]], ptr 
[[VLA_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META81:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META81:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]] // CHECK2-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG82:![0-9]+]] -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[GLOBAL]], metadata [[META83:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[GLOBAL]], metadata [[META83:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]] // CHECK2-NEXT: [[TMP1:%.*]] = call ptr @llvm.stacksave.p0(), !dbg [[DBG82]] // CHECK2-NEXT: store ptr [[TMP1]], ptr [[SAVED_STACK]], align 8, !dbg [[DBG82]] // CHECK2-NEXT: [[VLA1:%.*]] = alloca i32, i64 [[TMP0]], align 16, !dbg [[DBG82]] // CHECK2-NEXT: store i64 [[TMP0]], ptr [[__VLA_EXPR0]], align 8, !dbg [[DBG82]] -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[__VLA_EXPR0]], metadata [[META84:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]] -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[VLA1]], metadata [[META85:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]] -// CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3:[0-9]+]], i32 3, ptr @main.omp_outlined_debug__.1.omp_outlined, i64 [[TMP0]], ptr [[VLA1]], ptr [[GLOBAL]]), !dbg [[DBG82]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[__VLA_EXPR0]], metadata [[META84:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[VLA1]], metadata [[META85:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]] +// CHECK2-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB3:[0-9]+]], i32 3, ptr @main.omp_outlined_debug__.2.omp_outlined, i64 [[TMP0]], ptr [[VLA1]], ptr [[GLOBAL]]), !dbg [[DBG82]] // CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[SAVED_STACK]], align 8, !dbg [[DBG86:![0-9]+]] // CHECK2-NEXT: call void @llvm.stackrestore.p0(ptr [[TMP2]]), !dbg [[DBG86]] // CHECK2-NEXT: ret void, !dbg [[DBG88:![0-9]+]] // // -// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.1.omp_outlined_debug__ -// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GLOBAL:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 !dbg [[DBG89:![0-9]+]] { +// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined.1 +// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]]) #[[ATTR3]] !dbg [[DBG89:![0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META90:![0-9]+]], metadata !DIExpression()), !dbg [[DBG91:![0-9]+]] +// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META92:![0-9]+]], metadata !DIExpression()), !dbg [[DBG91]] +// CHECK2-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8 +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META93:![0-9]+]], metadata !DIExpression()), !dbg [[DBG91]] +// CHECK2-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg 
[[DBG94:![0-9]+]] +// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG94]] +// CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG94]] +// CHECK2-NEXT: call void @main.omp_outlined_debug__.2(ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP0]]) #[[ATTR6]], !dbg [[DBG94]] +// CHECK2-NEXT: ret void, !dbg [[DBG94]] +// +// +// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.2.omp_outlined_debug__ +// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GLOBAL:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 !dbg [[DBG95:![0-9]+]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -446,37 +465,37 @@ int main (int argc, char **argv) { // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[GLOBAL_ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META92:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META98:![0-9]+]], metadata !DIExpression()), !dbg [[DBG99:![0-9]+]] // CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META94:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META100:![0-9]+]], metadata !DIExpression()), !dbg [[DBG99]] // CHECK2-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8 -// CHECK2-NEXT: 
call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META95:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META101:![0-9]+]], metadata !DIExpression()), !dbg [[DBG99]] // CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META96:![0-9]+]], metadata !DIExpression()), !dbg [[DBG97:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META102:![0-9]+]], metadata !DIExpression()), !dbg [[DBG103:![0-9]+]] // CHECK2-NEXT: store ptr [[GLOBAL]], ptr [[GLOBAL_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[GLOBAL_ADDR]], metadata [[META98:![0-9]+]], metadata !DIExpression()), !dbg [[DBG99:![0-9]+]] -// CHECK2-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG100:![0-9]+]] -// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG100]] -// CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG100]] -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG101:![0-9]+]] -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !dbg [[DBG101]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[GLOBAL_ADDR]], metadata [[META104:![0-9]+]], metadata !DIExpression()), !dbg [[DBG105:![0-9]+]] +// CHECK2-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG106:![0-9]+]] +// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG106]] +// CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG106]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG107:![0-9]+]] +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !dbg [[DBG107]] // CHECK2-NEXT: invoke void @_Z3fooIiEvT_(i32 noundef [[TMP3]]) 
-// CHECK2-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG100]] +// CHECK2-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG106]] // CHECK2: invoke.cont: -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG102:![0-9]+]] -// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG103:![0-9]+]] -// CHECK2-NEXT: store i32 [[TMP4]], ptr [[ARRAYIDX1]], align 4, !dbg [[DBG104:![0-9]+]] -// CHECK2-NEXT: ret void, !dbg [[DBG102]] +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG108:![0-9]+]] +// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG109:![0-9]+]] +// CHECK2-NEXT: store i32 [[TMP4]], ptr [[ARRAYIDX1]], align 4, !dbg [[DBG110:![0-9]+]] +// CHECK2-NEXT: ret void, !dbg [[DBG108]] // CHECK2: terminate.lpad: // CHECK2-NEXT: [[TMP5:%.*]] = landingpad { ptr, i32 } -// CHECK2-NEXT: catch ptr null, !dbg [[DBG100]] -// CHECK2-NEXT: [[TMP6:%.*]] = extractvalue { ptr, i32 } [[TMP5]], 0, !dbg [[DBG100]] -// CHECK2-NEXT: call void @__clang_call_terminate(ptr [[TMP6]]) #[[ATTR7]], !dbg [[DBG100]] -// CHECK2-NEXT: unreachable, !dbg [[DBG100]] +// CHECK2-NEXT: catch ptr null, !dbg [[DBG106]] +// CHECK2-NEXT: [[TMP6:%.*]] = extractvalue { ptr, i32 } [[TMP5]], 0, !dbg [[DBG106]] +// CHECK2-NEXT: call void @__clang_call_terminate(ptr [[TMP6]]) #[[ATTR7]], !dbg [[DBG106]] +// CHECK2-NEXT: unreachable, !dbg [[DBG106]] // // -// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.1.omp_outlined -// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GLOBAL:%.*]]) #[[ATTR3]] !dbg [[DBG105:![0-9]+]] { +// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.2.omp_outlined +// CHECK2-SAME: (ptr noalias noundef 
[[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GLOBAL:%.*]]) #[[ATTR3]] !dbg [[DBG111:![0-9]+]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -484,46 +503,27 @@ int main (int argc, char **argv) { // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[GLOBAL_ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META106:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META112:![0-9]+]], metadata !DIExpression()), !dbg [[DBG113:![0-9]+]] // CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META108:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META114:![0-9]+]], metadata !DIExpression()), !dbg [[DBG113]] // CHECK2-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META109:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META115:![0-9]+]], metadata !DIExpression()), !dbg [[DBG113]] // CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META110:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata 
ptr [[A_ADDR]], metadata [[META116:![0-9]+]], metadata !DIExpression()), !dbg [[DBG113]] // CHECK2-NEXT: store ptr [[GLOBAL]], ptr [[GLOBAL_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[GLOBAL_ADDR]], metadata [[META111:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]] -// CHECK2-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG112:![0-9]+]] -// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG112]] -// CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG112]] -// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG112]] -// CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG112]] -// CHECK2-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG112]] -// CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG112]] -// CHECK2-NEXT: call void @main.omp_outlined_debug__.1.omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], i64 [[TMP0]], ptr [[TMP5]], ptr [[TMP6]]) #[[ATTR6]], !dbg [[DBG112]] -// CHECK2-NEXT: ret void, !dbg [[DBG112]] -// -// -// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined.2 -// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]]) #[[ATTR3]] !dbg [[DBG113:![0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 -// CHECK2-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META114:![0-9]+]], metadata !DIExpression()), !dbg [[DBG115:![0-9]+]] -// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr 
[[DOTBOUND_TID__ADDR]], metadata [[META116:![0-9]+]], metadata !DIExpression()), !dbg [[DBG115]] -// CHECK2-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META117:![0-9]+]], metadata !DIExpression()), !dbg [[DBG115]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[GLOBAL_ADDR]], metadata [[META117:![0-9]+]], metadata !DIExpression()), !dbg [[DBG113]] // CHECK2-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG118:![0-9]+]] -// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG118]] -// CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG118]] -// CHECK2-NEXT: call void @main.omp_outlined_debug__.1(ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP0]]) #[[ATTR6]], !dbg [[DBG118]] +// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG118]] +// CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG118]] +// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG118]] +// CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG118]] +// CHECK2-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG118]] +// CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG118]] +// CHECK2-NEXT: call void @main.omp_outlined_debug__.2.omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], i64 [[TMP0]], ptr [[TMP5]], ptr [[TMP6]]) #[[ATTR6]], !dbg [[DBG118]] // CHECK2-NEXT: ret void, !dbg [[DBG118]] // // -// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.3 +// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.4 // CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] !dbg [[DBG119:![0-9]+]] { // CHECK2-NEXT: entry: // 
CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -531,78 +531,78 @@ int main (int argc, char **argv) { // CHECK2-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META120:![0-9]+]], metadata !DIExpression()), !dbg [[DBG121:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META120:![0-9]+]], metadata !DIExpression()), !dbg [[DBG121:![0-9]+]] // CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META122:![0-9]+]], metadata !DIExpression()), !dbg [[DBG121]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META122:![0-9]+]], metadata !DIExpression()), !dbg [[DBG121]] // CHECK2-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META123:![0-9]+]], metadata !DIExpression()), !dbg [[DBG121]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META123:![0-9]+]], metadata !DIExpression()), !dbg [[DBG121]] // CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META124:![0-9]+]], metadata !DIExpression()), !dbg [[DBG125:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META124:![0-9]+]], metadata !DIExpression()), !dbg [[DBG125:![0-9]+]] // CHECK2-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG126:![0-9]+]] // CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG126]] -// CHECK2-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB7:[0-9]+]], i32 2, ptr @main.omp_outlined_debug__.3.omp_outlined, i64 [[TMP0]], ptr [[TMP1]]), !dbg [[DBG126]] +// CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB7:[0-9]+]], i32 2, ptr @main.omp_outlined_debug__.4.omp_outlined, i64 [[TMP0]], ptr [[TMP1]]), !dbg [[DBG126]] // CHECK2-NEXT: ret void, !dbg [[DBG127:![0-9]+]] // // -// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.3.omp_outlined_debug__ -// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 !dbg [[DBG128:![0-9]+]] { +// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined.3 +// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] !dbg [[DBG128:![0-9]+]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META129:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META129:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130:![0-9]+]] // CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META131:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr 
[[DOTBOUND_TID__ADDR]], metadata [[META131:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] // CHECK2-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META132:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META132:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] // CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG134:![0-9]+]] -// CHECK2-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG135:![0-9]+]] -// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG135]] -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG136:![0-9]+]] -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !dbg [[DBG136]] -// CHECK2-NEXT: invoke void @_Z3fooIiEvT_(i32 noundef [[TMP2]]) -// CHECK2-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG135]] -// CHECK2: invoke.cont: -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr @global, align 4, !dbg [[DBG137:![0-9]+]] -// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG138:![0-9]+]] -// CHECK2-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX1]], align 4, !dbg [[DBG139:![0-9]+]] -// CHECK2-NEXT: ret void, !dbg [[DBG137]] -// CHECK2: terminate.lpad: -// CHECK2-NEXT: [[TMP4:%.*]] = landingpad { ptr, i32 } -// CHECK2-NEXT: catch ptr null, !dbg [[DBG135]] -// CHECK2-NEXT: [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP4]], 0, !dbg [[DBG135]] -// CHECK2-NEXT: call void @__clang_call_terminate(ptr [[TMP5]]) #[[ATTR7]], !dbg [[DBG135]] -// CHECK2-NEXT: unreachable, !dbg [[DBG135]] -// -// -// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.3.omp_outlined -// 
CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] !dbg [[DBG140:![0-9]+]] { +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] +// CHECK2-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG134:![0-9]+]] +// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG134]] +// CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG134]] +// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG134]] +// CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG134]] +// CHECK2-NEXT: call void @main.omp_outlined_debug__.4(ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP0]], ptr [[TMP4]]) #[[ATTR6]], !dbg [[DBG134]] +// CHECK2-NEXT: ret void, !dbg [[DBG134]] +// +// +// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.4.omp_outlined_debug__ +// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 !dbg [[DBG135:![0-9]+]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META141:![0-9]+]], metadata !DIExpression()), !dbg [[DBG142:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META136:![0-9]+]], metadata 
!DIExpression()), !dbg [[DBG137:![0-9]+]] // CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META143:![0-9]+]], metadata !DIExpression()), !dbg [[DBG142]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META138:![0-9]+]], metadata !DIExpression()), !dbg [[DBG137]] // CHECK2-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG142]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META139:![0-9]+]], metadata !DIExpression()), !dbg [[DBG137]] // CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META145:![0-9]+]], metadata !DIExpression()), !dbg [[DBG142]] -// CHECK2-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG146:![0-9]+]] -// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG146]] -// CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG146]] -// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG146]] -// CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG146]] -// CHECK2-NEXT: call void @main.omp_outlined_debug__.3.omp_outlined_debug__(ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP0]], ptr [[TMP4]]) #[[ATTR6]], !dbg [[DBG146]] -// CHECK2-NEXT: ret void, !dbg [[DBG146]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META140:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141:![0-9]+]] +// CHECK2-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG142:![0-9]+]] +// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG142]] +// CHECK2-NEXT: 
[[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG143:![0-9]+]] +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !dbg [[DBG143]] +// CHECK2-NEXT: invoke void @_Z3fooIiEvT_(i32 noundef [[TMP2]]) +// CHECK2-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG142]] +// CHECK2: invoke.cont: +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr @global, align 4, !dbg [[DBG144:![0-9]+]] +// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG145:![0-9]+]] +// CHECK2-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX1]], align 4, !dbg [[DBG146:![0-9]+]] +// CHECK2-NEXT: ret void, !dbg [[DBG144]] +// CHECK2: terminate.lpad: +// CHECK2-NEXT: [[TMP4:%.*]] = landingpad { ptr, i32 } +// CHECK2-NEXT: catch ptr null, !dbg [[DBG142]] +// CHECK2-NEXT: [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP4]], 0, !dbg [[DBG142]] +// CHECK2-NEXT: call void @__clang_call_terminate(ptr [[TMP5]]) #[[ATTR7]], !dbg [[DBG142]] +// CHECK2-NEXT: unreachable, !dbg [[DBG142]] // // -// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined.4 +// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.4.omp_outlined // CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] !dbg [[DBG147:![0-9]+]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -610,19 +610,19 @@ int main (int argc, char **argv) { // CHECK2-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META148:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr 
[[DOTGLOBAL_TID__ADDR]], metadata [[META148:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149:![0-9]+]] // CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META150:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META150:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149]] // CHECK2-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META151:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META151:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149]] // CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META152:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META152:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149]] // CHECK2-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG153:![0-9]+]] // CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG153]] // CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG153]] // CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG153]] // CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG153]] -// CHECK2-NEXT: call void @main.omp_outlined_debug__.3(ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP0]], ptr [[TMP4]]) #[[ATTR6]], !dbg [[DBG153]] +// CHECK2-NEXT: call void @main.omp_outlined_debug__.4.omp_outlined_debug__(ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP0]], ptr [[TMP4]]) #[[ATTR6]], !dbg [[DBG153]] // CHECK2-NEXT: ret void, !dbg [[DBG153]] // // 
@@ -631,7 +631,7 @@ int main (int argc, char **argv) { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: store ptr [[ARGC]], ptr [[ARGC_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META159:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META159:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160:![0-9]+]] // CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG161:![0-9]+]] // CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 0, !dbg [[DBG161]] // CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !dbg [[DBG161]] @@ -651,20 +651,20 @@ int main (int argc, char **argv) { // CHECK2-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 // CHECK2-NEXT: [[VAR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META169:![0-9]+]], metadata !DIExpression()), !dbg [[DBG170:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META169:![0-9]+]], metadata !DIExpression()), !dbg [[DBG170:![0-9]+]] // CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META171:![0-9]+]], metadata !DIExpression()), !dbg [[DBG170]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META171:![0-9]+]], metadata !DIExpression()), !dbg [[DBG170]] // CHECK2-NEXT: store ptr [[ARGC]], ptr [[ARGC_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META172:![0-9]+]], metadata !DIExpression()), !dbg [[DBG173:![0-9]+]] +// CHECK2-NEXT: tail 
call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META172:![0-9]+]], metadata !DIExpression()), !dbg [[DBG173:![0-9]+]] // CHECK2-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META174:![0-9]+]], metadata !DIExpression()), !dbg [[DBG170]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META174:![0-9]+]], metadata !DIExpression()), !dbg [[DBG170]] // CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG175:![0-9]+]] // CHECK2-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG175]] // CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP0]], align 8, !dbg [[DBG176:![0-9]+]] // CHECK2-NEXT: invoke void @_Z3fooIPPcEvT_(ptr noundef [[TMP2]]) -// CHECK2-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG178:![0-9]+]] +// CHECK2-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG178:![0-9]+]] // CHECK2: invoke.cont: -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[VAR]], metadata [[META179:![0-9]+]], metadata !DIExpression()), !dbg [[DBG186:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[VAR]], metadata [[META179:![0-9]+]], metadata !DIExpression()), !dbg [[DBG186:![0-9]+]] // CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR]], align 8, !dbg [[DBG187:![0-9]+]] // CHECK2-NEXT: [[TMP4:%.*]] = mul nsw i64 0, [[TMP1]], !dbg [[DBG187]] // CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i64 [[TMP4]], !dbg [[DBG187]] @@ -672,43 +672,43 @@ int main (int argc, char **argv) { // CHECK2-NEXT: ret void, !dbg [[DBG188:![0-9]+]] // CHECK2: terminate.lpad: // CHECK2-NEXT: [[TMP5:%.*]] = landingpad { ptr, i32 } -// CHECK2-NEXT: catch ptr null, !dbg [[DBG178]] +// CHECK2-NEXT: catch ptr null, !dbg [[DBG178]] // CHECK2-NEXT: [[TMP6:%.*]] = extractvalue { ptr, i32 } [[TMP5]], 0, !dbg 
[[DBG178]] // CHECK2-NEXT: call void @__clang_call_terminate(ptr [[TMP6]]) #[[ATTR7]], !dbg [[DBG178]] // CHECK2-NEXT: unreachable, !dbg [[DBG178]] // // -// CHECK2-LABEL: define {{[^@]+}}@_Z3fooIPPcEvT_ -// CHECK2-SAME: (ptr noundef [[ARGC:%.*]]) #[[ATTR4]] comdat !dbg [[DBG189:![0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca ptr, align 8 -// CHECK2-NEXT: store ptr [[ARGC]], ptr [[ARGC_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META192:![0-9]+]], metadata !DIExpression()), !dbg [[DBG193:![0-9]+]] -// CHECK2-NEXT: ret void, !dbg [[DBG194:![0-9]+]] -// -// // CHECK2-LABEL: define {{[^@]+}}@_Z5tmainIPPcEiT_.omp_outlined -// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[ARGC:%.*]], i64 noundef [[VLA:%.*]]) #[[ATTR3]] !dbg [[DBG195:![0-9]+]] { +// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[ARGC:%.*]], i64 noundef [[VLA:%.*]]) #[[ATTR3]] !dbg [[DBG189:![0-9]+]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 // CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META196:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197:![0-9]+]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META190:![0-9]+]], metadata !DIExpression()), !dbg [[DBG191:![0-9]+]] // CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr 
[[DOTBOUND_TID__ADDR]], metadata [[META198:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META192:![0-9]+]], metadata !DIExpression()), !dbg [[DBG191]] // CHECK2-NEXT: store ptr [[ARGC]], ptr [[ARGC_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META199:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META193:![0-9]+]], metadata !DIExpression()), !dbg [[DBG191]] // CHECK2-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8 -// CHECK2-NEXT: call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META200:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197]] -// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG201:![0-9]+]] -// CHECK2-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG201]] -// CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG201]] -// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG201]] -// CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG201]] -// CHECK2-NEXT: call void @_Z5tmainIPPcEiT_.omp_outlined_debug__(ptr [[TMP2]], ptr [[TMP3]], ptr [[TMP4]], i64 [[TMP1]]) #[[ATTR6]], !dbg [[DBG201]] -// CHECK2-NEXT: ret void, !dbg [[DBG201]] +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META194:![0-9]+]], metadata !DIExpression()), !dbg [[DBG191]] +// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG195:![0-9]+]] +// CHECK2-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG195]] +// CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG195]] +// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG195]] +// 
CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG195]] +// CHECK2-NEXT: call void @_Z5tmainIPPcEiT_.omp_outlined_debug__(ptr [[TMP2]], ptr [[TMP3]], ptr [[TMP4]], i64 [[TMP1]]) #[[ATTR6]], !dbg [[DBG195]] +// CHECK2-NEXT: ret void, !dbg [[DBG195]] +// +// +// CHECK2-LABEL: define {{[^@]+}}@_Z3fooIPPcEvT_ +// CHECK2-SAME: (ptr noundef [[ARGC:%.*]]) #[[ATTR4]] comdat !dbg [[DBG196:![0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: store ptr [[ARGC]], ptr [[ARGC_ADDR]], align 8 +// CHECK2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META199:![0-9]+]], metadata !DIExpression()), !dbg [[DBG200:![0-9]+]] +// CHECK2-NEXT: ret void, !dbg [[DBG201:![0-9]+]] // // // CHECK3-LABEL: define {{[^@]+}}@main @@ -812,7 +812,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIPPcEiT_..omp_par -// CHECK3-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] +// CHECK3-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR6:[0-9]+]] { // CHECK3-NEXT: omp.par.entry: // CHECK3-NEXT: [[GEP__RELOADED:%.*]] = getelementptr { ptr, ptr }, ptr [[TMP0]], i32 0, i32 0 // CHECK3-NEXT: [[LOADGEP__RELOADED:%.*]] = load ptr, ptr [[GEP__RELOADED]], align 8 @@ -860,17 +860,17 @@ int main (int argc, char **argv) { // CHECK4-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 // CHECK4-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK4-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META18:![0-9]+]], metadata !DIExpression()), !dbg [[DBG19:![0-9]+]] +// CHECK4-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META18:![0-9]+]], metadata !DIExpression()), !dbg [[DBG19:![0-9]+]] // CHECK4-NEXT: store ptr [[ARGV]], ptr [[ARGV_ADDR]], align 8 -// 
CHECK4-NEXT: call void @llvm.dbg.declare(metadata ptr [[ARGV_ADDR]], metadata [[META20:![0-9]+]], metadata !DIExpression()), !dbg [[DBG19]] +// CHECK4-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[ARGV_ADDR]], metadata [[META20:![0-9]+]], metadata !DIExpression()), !dbg [[DBG19]] // CHECK4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4, !dbg [[DBG21:![0-9]+]] // CHECK4-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64, !dbg [[DBG21]] // CHECK4-NEXT: [[TMP2:%.*]] = call ptr @llvm.stacksave.p0(), !dbg [[DBG21]] // CHECK4-NEXT: store ptr [[TMP2]], ptr [[SAVED_STACK]], align 8, !dbg [[DBG21]] // CHECK4-NEXT: [[VLA:%.*]] = alloca i32, i64 [[TMP1]], align 16, !dbg [[DBG21]] // CHECK4-NEXT: store i64 [[TMP1]], ptr [[__VLA_EXPR0]], align 8, !dbg [[DBG21]] -// CHECK4-NEXT: call void @llvm.dbg.declare(metadata ptr [[__VLA_EXPR0]], metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG24:![0-9]+]] -// CHECK4-NEXT: call void @llvm.dbg.declare(metadata ptr [[VLA]], metadata [[META25:![0-9]+]], metadata !DIExpression()), !dbg [[DBG21]] +// CHECK4-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[__VLA_EXPR0]], metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG24:![0-9]+]] +// CHECK4-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[VLA]], metadata [[META25:![0-9]+]], metadata !DIExpression()), !dbg [[DBG21]] // CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]), !dbg [[DBG29:![0-9]+]] // CHECK4-NEXT: br label [[OMP_PARALLEL:%.*]] // CHECK4: omp_parallel: @@ -921,25 +921,25 @@ int main (int argc, char **argv) { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK4-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META41:![0-9]+]], metadata !DIExpression()), !dbg [[DBG42:![0-9]+]] +// CHECK4-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], 
metadata [[META41:![0-9]+]], metadata !DIExpression()), !dbg [[DBG42:![0-9]+]] // CHECK4-NEXT: ret void, !dbg [[DBG42]] // // // CHECK4-LABEL: define {{[^@]+}}@_Z5tmainIPPcEiT_ -// CHECK4-SAME: (ptr noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat !dbg [[DBG45:![0-9]+]] { +// CHECK4-SAME: (ptr noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat !dbg [[DBG43:![0-9]+]] { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[STRUCTARG:%.*]] = alloca { ptr, ptr }, align 8 // CHECK4-NEXT: [[DOTRELOADED:%.*]] = alloca i64, align 8 // CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca ptr, align 8 // CHECK4-NEXT: store ptr [[ARGC]], ptr [[ARGC_ADDR]], align 8 -// CHECK4-NEXT: call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META50:![0-9]+]], metadata !DIExpression()), !dbg [[DBG51:![0-9]+]] -// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG52:![0-9]+]] -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 0, !dbg [[DBG52]] -// CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !dbg [[DBG52]] -// CHECK4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 0, !dbg [[DBG52]] -// CHECK4-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1, !dbg [[DBG52]] -// CHECK4-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i64, !dbg [[DBG52]] -// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]]), !dbg [[DBG53:![0-9]+]] +// CHECK4-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META48:![0-9]+]], metadata !DIExpression()), !dbg [[DBG49:![0-9]+]] +// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG50:![0-9]+]] +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 0, !dbg [[DBG50]] +// CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !dbg [[DBG50]] +// CHECK4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 0, !dbg [[DBG50]] +// CHECK4-NEXT: 
[[TMP2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1, !dbg [[DBG50]] +// CHECK4-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i64, !dbg [[DBG50]] +// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]]), !dbg [[DBG51:![0-9]+]] // CHECK4-NEXT: store i64 [[TMP3]], ptr [[DOTRELOADED]], align 8 // CHECK4-NEXT: br label [[OMP_PARALLEL:%.*]] // CHECK4: omp_parallel: @@ -947,16 +947,16 @@ int main (int argc, char **argv) { // CHECK4-NEXT: store ptr [[DOTRELOADED]], ptr [[GEP__RELOADED]], align 8 // CHECK4-NEXT: [[GEP_ARGC_ADDR:%.*]] = getelementptr { ptr, ptr }, ptr [[STRUCTARG]], i32 0, i32 1 // CHECK4-NEXT: store ptr [[ARGC_ADDR]], ptr [[GEP_ARGC_ADDR]], align 8 -// CHECK4-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @_Z5tmainIPPcEiT_..omp_par, ptr [[STRUCTARG]]), !dbg [[DBG54:![0-9]+]] +// CHECK4-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @_Z5tmainIPPcEiT_..omp_par, ptr [[STRUCTARG]]), !dbg [[DBG52:![0-9]+]] // CHECK4-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] // CHECK4: omp.par.outlined.exit: // CHECK4-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] // CHECK4: omp.par.exit.split: -// CHECK4-NEXT: ret i32 0, !dbg [[DBG56:![0-9]+]] +// CHECK4-NEXT: ret i32 0, !dbg [[DBG54:![0-9]+]] // // // CHECK4-LABEL: define {{[^@]+}}@_Z5tmainIPPcEiT_..omp_par -// CHECK4-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !dbg [[DBG57:![0-9]+]] { +// CHECK4-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR7:[0-9]+]] !dbg [[DBG55:![0-9]+]] { // CHECK4-NEXT: omp.par.entry: // CHECK4-NEXT: [[GEP__RELOADED:%.*]] = getelementptr { ptr, ptr }, ptr [[TMP0]], i32 0, i32 0 // CHECK4-NEXT: [[LOADGEP__RELOADED:%.*]] = load ptr, ptr [[GEP__RELOADED]], align 8 @@ -970,18 +970,18 @@ int main (int argc, char **argv) { // CHECK4-NEXT: [[TMP2:%.*]] = load i64, ptr [[LOADGEP__RELOADED]], align 
8 // CHECK4-NEXT: br label [[OMP_PAR_REGION:%.*]] // CHECK4: omp.par.region: -// CHECK4-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LOADGEP_ARGC_ADDR]], align 8, !dbg [[DBG58:![0-9]+]] -// CHECK4-NEXT: call void @_Z3fooIPPcEvT_(ptr noundef [[TMP3]]), !dbg [[DBG58]] -// CHECK4-NEXT: call void @llvm.dbg.declare(metadata ptr [[VAR]], metadata [[META60:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67:![0-9]+]] -// CHECK4-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VAR]], align 8, !dbg [[DBG67]] -// CHECK4-NEXT: [[TMP5:%.*]] = mul nsw i64 0, [[TMP2]], !dbg [[DBG67]] -// CHECK4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[TMP4]], i64 [[TMP5]], !dbg [[DBG67]] -// CHECK4-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[ARRAYIDX2]], i64 0, !dbg [[DBG67]] -// CHECK4-NEXT: br label [[OMP_PAR_REGION_PARALLEL_AFTER:%.*]], !dbg [[DBG68:![0-9]+]] +// CHECK4-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LOADGEP_ARGC_ADDR]], align 8, !dbg [[DBG56:![0-9]+]] +// CHECK4-NEXT: call void @_Z3fooIPPcEvT_(ptr noundef [[TMP3]]), !dbg [[DBG56]] +// CHECK4-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[VAR]], metadata [[META58:![0-9]+]], metadata !DIExpression()), !dbg [[DBG65:![0-9]+]] +// CHECK4-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VAR]], align 8, !dbg [[DBG65]] +// CHECK4-NEXT: [[TMP5:%.*]] = mul nsw i64 0, [[TMP2]], !dbg [[DBG65]] +// CHECK4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[TMP4]], i64 [[TMP5]], !dbg [[DBG65]] +// CHECK4-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[ARRAYIDX2]], i64 0, !dbg [[DBG65]] +// CHECK4-NEXT: br label [[OMP_PAR_REGION_PARALLEL_AFTER:%.*]], !dbg [[DBG66:![0-9]+]] // CHECK4: omp.par.region.parallel.after: // CHECK4-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK4: omp.par.pre_finalize: -// CHECK4-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]], !dbg [[DBG68]] +// CHECK4-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]], !dbg [[DBG66]] // CHECK4: omp.par.outlined.exit.exitStub: // 
CHECK4-NEXT: ret void // @@ -991,6 +991,6 @@ int main (int argc, char **argv) { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca ptr, align 8 // CHECK4-NEXT: store ptr [[ARGC]], ptr [[ARGC_ADDR]], align 8 -// CHECK4-NEXT: call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META72:![0-9]+]], metadata !DIExpression()), !dbg [[DBG73:![0-9]+]] +// CHECK4-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META72:![0-9]+]], metadata !DIExpression()), !dbg [[DBG73:![0-9]+]] // CHECK4-NEXT: ret void, !dbg [[DBG73]] // diff --git a/clang/test/OpenMP/target_parallel_debug_codegen.cpp b/clang/test/OpenMP/target_parallel_debug_codegen.cpp index e99c393a4021c..0852a0772b883 100644 --- a/clang/test/OpenMP/target_parallel_debug_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_debug_codegen.cpp @@ -65,7 +65,7 @@ int main() { return 0; } // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug__ -// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG32:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1:[0-9]+]] !dbg [[DBG29:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8 @@ -78,52 +78,84 @@ int main() { // CHECK1-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8 // CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META51:![0-9]+]], metadata !DIExpression()), !dbg 
[[DBG52:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META48:![0-9]+]], metadata !DIExpression()), !dbg [[DBG49:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META53:![0-9]+]], metadata !DIExpression()), !dbg [[DBG54:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META50:![0-9]+]], metadata !DIExpression()), !dbg [[DBG51:![0-9]+]] // CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META55:![0-9]+]], metadata !DIExpression()), !dbg [[DBG56:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META52:![0-9]+]], metadata !DIExpression()), !dbg [[DBG53:![0-9]+]] // CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META57:![0-9]+]], metadata !DIExpression()), !dbg [[DBG58:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META54:![0-9]+]], metadata !DIExpression()), !dbg [[DBG55:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META59:![0-9]+]], metadata !DIExpression()), !dbg [[DBG60:![0-9]+]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG61:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG61]] -// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG61]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG61]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG61]] -// CHECK1-NEXT: store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg 
[[DBG61]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG61]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG61]] -// CHECK1-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG61]] -// CHECK1-NEXT: store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG61]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG61]] -// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG61]] -// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP8]], -1, !dbg [[DBG61]] -// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG61]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META56:![0-9]+]], metadata !DIExpression()), !dbg [[DBG57:![0-9]+]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG58:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG58]] +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG58]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG58]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG58]] +// CHECK1-NEXT: store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG58]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG58]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG58]] +// CHECK1-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG58]] +// CHECK1-NEXT: store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG58]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG58]] +// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_target_init(ptr 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG58]] +// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP8]], -1, !dbg [[DBG58]] +// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG58]] // CHECK1: user_code.entry: -// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]]) -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG62:![0-9]+]] -// CHECK1-NEXT: store i32 [[TMP10]], ptr [[A_CASTED]], align 4, !dbg [[DBG62]] -// CHECK1-NEXT: [[TMP11:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG62]] -// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG62]] -// CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP12]], align 8, !dbg [[DBG62]] -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG62]] -// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG62]] -// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP13]], align 8, !dbg [[DBG62]] -// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG62]] -// CHECK1-NEXT: store ptr [[TMP4]], ptr [[TMP15]], align 8, !dbg [[DBG62]] -// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG62]] -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[TMP16]], align 8, !dbg [[DBG62]] -// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB3]], i32 [[TMP9]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG62]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(), !dbg [[DBG63:![0-9]+]] -// CHECK1-NEXT: ret void, !dbg [[DBG65:![0-9]+]] +// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(ptr 
@[[GLOB3:[0-9]+]]), !dbg [[DBG59:![0-9]+]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG61:![0-9]+]] +// CHECK1-NEXT: store i32 [[TMP10]], ptr [[A_CASTED]], align 4, !dbg [[DBG61]] +// CHECK1-NEXT: [[TMP11:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG61]] +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG61]] +// CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP12]], align 8, !dbg [[DBG61]] +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG61]] +// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG61]] +// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP13]], align 8, !dbg [[DBG61]] +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG61]] +// CHECK1-NEXT: store ptr [[TMP4]], ptr [[TMP15]], align 8, !dbg [[DBG61]] +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG61]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[TMP16]], align 8, !dbg [[DBG61]] +// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB3]], i32 [[TMP9]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG61]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(), !dbg [[DBG62:![0-9]+]] +// CHECK1-NEXT: ret void, !dbg [[DBG63:![0-9]+]] // CHECK1: worker.exit: -// CHECK1-NEXT: ret void, !dbg [[DBG61]] +// CHECK1-NEXT: ret void, !dbg [[DBG58]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23 +// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) 
[[BB:%.*]]) #[[ATTR2:[0-9]+]] !dbg [[DBG64:![0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72:![0-9]+]] +// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META73:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72]] +// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META74:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72]] +// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META75:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72]] +// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META76:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG77:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG77]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG77]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG77]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG77]] +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG77]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG77]] +// 
CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG77]] +// CHECK1-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG77]] +// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG77]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr [[TMP6]], ptr addrspace(1) [[TMP9]]) #[[ATTR4:[0-9]+]], !dbg [[DBG77]] +// CHECK1-NEXT: ret void, !dbg [[DBG77]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug___omp_outlined_debug__ -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2:[0-9]+]] !dbg [[DBG66:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG78:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -140,83 +172,83 @@ int main() { // CHECK1-NEXT: [[H:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[D:%.*]] = alloca i32, align 4 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META73:![0-9]+]], metadata !DIExpression()), !dbg [[DBG74:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META85:![0-9]+]], metadata !DIExpression()), !dbg [[DBG86:![0-9]+]] // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 
-// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META75:![0-9]+]], metadata !DIExpression()), !dbg [[DBG74]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META87:![0-9]+]], metadata !DIExpression()), !dbg [[DBG86]] // CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META76:![0-9]+]], metadata !DIExpression()), !dbg [[DBG77:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META88:![0-9]+]], metadata !DIExpression()), !dbg [[DBG89:![0-9]+]] // CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META78:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META90:![0-9]+]], metadata !DIExpression()), !dbg [[DBG91:![0-9]+]] // CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META80:![0-9]+]], metadata !DIExpression()), !dbg [[DBG81:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META92:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META82:![0-9]+]], metadata !DIExpression()), !dbg [[DBG83:![0-9]+]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG84:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG84]] -// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG84]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG84]] -// CHECK1-NEXT: 
[[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG84]] -// CHECK1-NEXT: store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG84]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG84]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG84]] -// CHECK1-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG84]] -// CHECK1-NEXT: store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG84]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG84]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B3]], metadata [[META85:![0-9]+]], metadata !DIExpression()), !dbg [[DBG74]] -// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B3]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG84]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META86:![0-9]+]], metadata !DIExpression()), !dbg [[DBG89:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG90:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG90]] -// CHECK1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX4]], i64 0, i64 1, !dbg [[DBG90]] -// CHECK1-NEXT: store ptr [[ARRAYIDX5]], ptr [[F]], align 8, !dbg [[DBG89]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META91:![0-9]+]], metadata !DIExpression()), !dbg [[DBG92:![0-9]+]] -// CHECK1-NEXT: store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG92]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META93:![0-9]+]], metadata !DIExpression()), !dbg [[DBG94:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 1, !dbg [[DBG95:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], 
ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG95]] -// CHECK1-NEXT: store ptr [[ARRAYIDX7]], ptr [[H]], align 8, !dbg [[DBG94]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META96:![0-9]+]], metadata !DIExpression()), !dbg [[DBG97:![0-9]+]] -// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG97]] -// CHECK1-NEXT: store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG98:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 0, !dbg [[DBG99:![0-9]+]] -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG100:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP8]] to i64, !dbg [[DBG99]] -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 [[IDXPROM]], !dbg [[DBG99]] -// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX9]], align 4, !dbg [[DBG101:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG102:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX10]], i64 0, i64 0, !dbg [[DBG102]] -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG103:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP9]] to i64, !dbg [[DBG102]] -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM12]], !dbg [[DBG102]] -// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX13]], align 4, !dbg [[DBG104:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG105:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX14]], i64 0, i64 0, !dbg [[DBG105]] -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG106:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM16:%.*]] = sext i32 [[TMP10]] to 
i64, !dbg [[DBG105]] -// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX15]], i64 0, i64 [[IDXPROM16]], !dbg [[DBG105]] -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX17]], align 4, !dbg [[DBG105]] -// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 0, !dbg [[DBG107:![0-9]+]] -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG108:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP12]] to i64, !dbg [[DBG107]] -// CHECK1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG107]] -// CHECK1-NEXT: store i32 [[TMP11]], ptr [[ARRAYIDX20]], align 4, !dbg [[DBG109:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 0, !dbg [[DBG110:![0-9]+]] -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG111:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG110]] -// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG110]] -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX23]], align 4, !dbg [[DBG110]] -// CHECK1-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP7]], align 1, !dbg [[DBG112:![0-9]+]] -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP15]] to i1, !dbg [[DBG112]] -// CHECK1-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG112]] -// CHECK1-NEXT: [[OR:%.*]] = or i32 [[CONV]], [[TMP14]], !dbg [[DBG112]] -// CHECK1-NEXT: [[TOBOOL24:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG112]] -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL24]] to i8, !dbg [[DBG112]] -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[TMP7]], align 1, !dbg [[DBG112]] -// CHECK1-NEXT: ret void, !dbg [[DBG113:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata 
[[META94:![0-9]+]], metadata !DIExpression()), !dbg [[DBG95:![0-9]+]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG96:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG96]] +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG96]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG96]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG96]] +// CHECK1-NEXT: store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG96]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG96]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG96]] +// CHECK1-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG96]] +// CHECK1-NEXT: store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG96]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG96]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B3]], metadata [[META97:![0-9]+]], metadata !DIExpression()), !dbg [[DBG86]] +// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B3]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG96]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META98:![0-9]+]], metadata !DIExpression()), !dbg [[DBG101:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG102:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG102]] +// CHECK1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX4]], i64 0, i64 1, !dbg [[DBG102]] +// CHECK1-NEXT: store ptr [[ARRAYIDX5]], ptr [[F]], align 8, !dbg [[DBG101]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META103:![0-9]+]], metadata 
!DIExpression()), !dbg [[DBG104:![0-9]+]] +// CHECK1-NEXT: store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG104]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META105:![0-9]+]], metadata !DIExpression()), !dbg [[DBG106:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 1, !dbg [[DBG107:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG107]] +// CHECK1-NEXT: store ptr [[ARRAYIDX7]], ptr [[H]], align 8, !dbg [[DBG106]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META108:![0-9]+]], metadata !DIExpression()), !dbg [[DBG109:![0-9]+]] +// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG109]] +// CHECK1-NEXT: store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG110:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 0, !dbg [[DBG111:![0-9]+]] +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG112:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP8]] to i64, !dbg [[DBG111]] +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 [[IDXPROM]], !dbg [[DBG111]] +// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX9]], align 4, !dbg [[DBG113:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG114:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX10]], i64 0, i64 0, !dbg [[DBG114]] +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG115:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP9]] to i64, !dbg [[DBG114]] +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM12]], !dbg [[DBG114]] +// CHECK1-NEXT: 
store i32 11, ptr [[ARRAYIDX13]], align 4, !dbg [[DBG116:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG117:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX14]], i64 0, i64 0, !dbg [[DBG117]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG118:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM16:%.*]] = sext i32 [[TMP10]] to i64, !dbg [[DBG117]] +// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX15]], i64 0, i64 [[IDXPROM16]], !dbg [[DBG117]] +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX17]], align 4, !dbg [[DBG117]] +// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 0, !dbg [[DBG119:![0-9]+]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG120:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP12]] to i64, !dbg [[DBG119]] +// CHECK1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG119]] +// CHECK1-NEXT: store i32 [[TMP11]], ptr [[ARRAYIDX20]], align 4, !dbg [[DBG121:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 0, !dbg [[DBG122:![0-9]+]] +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG123:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG122]] +// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG122]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX23]], align 4, !dbg [[DBG122]] +// CHECK1-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP7]], align 1, !dbg [[DBG124:![0-9]+]] +// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP15]] to i1, !dbg [[DBG124]] +// CHECK1-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] 
to i32, !dbg [[DBG124]] +// CHECK1-NEXT: [[OR:%.*]] = or i32 [[CONV]], [[TMP14]], !dbg [[DBG124]] +// CHECK1-NEXT: [[TOBOOL24:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG124]] +// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL24]] to i8, !dbg [[DBG124]] +// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[TMP7]], align 1, !dbg [[DBG124]] +// CHECK1-NEXT: ret void, !dbg [[DBG125:![0-9]+]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug___omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG114:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG126:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -225,66 +257,34 @@ int main() { // CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META121:![0-9]+]], metadata !DIExpression()), !dbg [[DBG122:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META129:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130:![0-9]+]] // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: call 
void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META123:![0-9]+]], metadata !DIExpression()), !dbg [[DBG122]] -// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META124:![0-9]+]], metadata !DIExpression()), !dbg [[DBG122]] -// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META125:![0-9]+]], metadata !DIExpression()), !dbg [[DBG122]] -// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META126:![0-9]+]], metadata !DIExpression()), !dbg [[DBG122]] -// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META127:![0-9]+]], metadata !DIExpression()), !dbg [[DBG122]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG128:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG128]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG128]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG128]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG128]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG128]] -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG128]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG128]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG128]] -// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG128]] -// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG128]] -// CHECK1-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr [[TMP7]], ptr addrspace(1) [[TMP10]]) #[[ATTR4:[0-9]+]], !dbg [[DBG128]] -// CHECK1-NEXT: ret void, !dbg [[DBG128]] -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23 -// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR6:[0-9]+]] !dbg [[DBG129:![0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META132:![0-9]+]], metadata !DIExpression()), !dbg [[DBG133:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META131:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] // CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META134:![0-9]+]], metadata !DIExpression()), !dbg [[DBG133]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META132:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] // CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META135:![0-9]+]], metadata !DIExpression()), !dbg [[DBG133]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata 
[[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] // CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META136:![0-9]+]], metadata !DIExpression()), !dbg [[DBG133]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META134:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] // CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META137:![0-9]+]], metadata !DIExpression()), !dbg [[DBG133]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG138:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG138]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG138]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG138]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG138]] -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG138]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG138]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG138]] -// CHECK1-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG138]] -// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG138]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr [[TMP6]], ptr addrspace(1) [[TMP9]]) #[[ATTR4]], !dbg [[DBG138]] -// CHECK1-NEXT: ret void, !dbg [[DBG138]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META135:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG136:![0-9]+]] +// CHECK1-NEXT: 
[[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG136]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG136]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG136]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG136]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG136]] +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG136]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG136]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG136]] +// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG136]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG136]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr [[TMP7]], ptr addrspace(1) [[TMP10]]) #[[ATTR4]], !dbg [[DBG136]] +// CHECK1-NEXT: ret void, !dbg [[DBG136]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug__ -// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG139:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG137:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8 @@ -297,32 +297,32 @@ int main() { // CHECK1-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 // 
CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8 // CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG145:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META142:![0-9]+]], metadata !DIExpression()), !dbg [[DBG143:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META146:![0-9]+]], metadata !DIExpression()), !dbg [[DBG147:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG145:![0-9]+]] // CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META148:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META146:![0-9]+]], metadata !DIExpression()), !dbg [[DBG147:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META150:![0-9]+]], metadata !DIExpression()), !dbg [[DBG151:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META148:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META152:![0-9]+]], metadata !DIExpression()), !dbg [[DBG153:![0-9]+]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG154:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG154]] 
-// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG154]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG154]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG154]] -// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG154]] -// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG154]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG154]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG154]] -// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG154]] -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG154]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG154]] -// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG154]] -// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP9]], -1, !dbg [[DBG154]] -// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG154]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META150:![0-9]+]], metadata !DIExpression()), !dbg [[DBG151:![0-9]+]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG152:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG152]] +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG152]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG152]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG152]] +// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG152]] +// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg 
[[DBG152]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG152]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG152]] +// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG152]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG152]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG152]] +// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG152]] +// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP9]], -1, !dbg [[DBG152]] +// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG152]] // CHECK1: user_code.entry: -// CHECK1-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB7:[0-9]+]]) +// CHECK1-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB7:[0-9]+]]), !dbg [[DBG153:![0-9]+]] // CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG155:![0-9]+]] // CHECK1-NEXT: store i32 [[TMP11]], ptr [[A_CASTED]], align 4, !dbg [[DBG155]] // CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG155]] @@ -337,13 +337,46 @@ int main() { // CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP17]], align 8, !dbg [[DBG155]] // CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB7]], i32 [[TMP10]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG155]] // CHECK1-NEXT: call void @__kmpc_target_deinit(), !dbg [[DBG156:![0-9]+]] -// CHECK1-NEXT: ret void, !dbg [[DBG158:![0-9]+]] +// CHECK1-NEXT: ret void, !dbg [[DBG157:![0-9]+]] // CHECK1: worker.exit: -// CHECK1-NEXT: ret void, !dbg [[DBG154]] +// CHECK1-NEXT: ret void, !dbg [[DBG152]] +// +// +// CHECK1-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37 +// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG158:![0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META159:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160:![0-9]+]] +// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META161:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160]] +// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META162:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160]] +// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META163:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160]] +// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META164:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG165:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG165]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG165]] +// CHECK1-NEXT: 
[[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG165]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG165]] +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG165]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG165]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG165]] +// CHECK1-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG165]] +// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG165]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG165]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]]) #[[ATTR4]], !dbg [[DBG165]] +// CHECK1-NEXT: ret void, !dbg [[DBG165]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug___omp_outlined_debug__ -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG159:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG166:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -359,74 +392,74 @@ int main() { // CHECK1-NEXT: [[H:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[D:%.*]] = alloca i32, align 4 // CHECK1-NEXT: store ptr 
[[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META162:![0-9]+]], metadata !DIExpression()), !dbg [[DBG163:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META169:![0-9]+]], metadata !DIExpression()), !dbg [[DBG170:![0-9]+]] // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META164:![0-9]+]], metadata !DIExpression()), !dbg [[DBG163]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META171:![0-9]+]], metadata !DIExpression()), !dbg [[DBG170]] // CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META165:![0-9]+]], metadata !DIExpression()), !dbg [[DBG166:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META172:![0-9]+]], metadata !DIExpression()), !dbg [[DBG173:![0-9]+]] // CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META167:![0-9]+]], metadata !DIExpression()), !dbg [[DBG168:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META174:![0-9]+]], metadata !DIExpression()), !dbg [[DBG175:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META169:![0-9]+]], metadata !DIExpression()), !dbg [[DBG170:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META176:![0-9]+]], metadata !DIExpression()), !dbg [[DBG177:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call 
void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META171:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172:![0-9]+]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG173:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG173]] -// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG173]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG173]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG173]] -// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG173]] -// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG173]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG173]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG173]] -// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG173]] -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG173]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG173]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META174:![0-9]+]], metadata !DIExpression()), !dbg [[DBG176:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG177:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG177]] -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX3]], i64 0, i64 1, !dbg [[DBG177]] -// CHECK1-NEXT: store ptr [[ARRAYIDX4]], ptr [[F]], align 8, !dbg [[DBG176]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META178:![0-9]+]], metadata !DIExpression()), !dbg [[DBG179:![0-9]+]] -// CHECK1-NEXT: store ptr [[A_ADDR]], ptr [[G]], align 
8, !dbg [[DBG179]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META180:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 1, !dbg [[DBG182:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX5]], i64 0, i64 1, !dbg [[DBG182]] -// CHECK1-NEXT: store ptr [[ARRAYIDX6]], ptr [[H]], align 8, !dbg [[DBG181]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META183:![0-9]+]], metadata !DIExpression()), !dbg [[DBG184:![0-9]+]] -// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG184]] -// CHECK1-NEXT: store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG185:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG186:![0-9]+]] -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG187:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64, !dbg [[DBG186]] -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 [[IDXPROM]], !dbg [[DBG186]] -// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX8]], align 4, !dbg [[DBG188:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG189:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX9]], i64 0, i64 0, !dbg [[DBG189]] -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG190:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP10]] to i64, !dbg [[DBG189]] -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX10]], i64 0, i64 [[IDXPROM11]], !dbg [[DBG189]] -// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG191:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = 
getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG192:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG192]] -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG193:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM15:%.*]] = sext i32 [[TMP11]] to i64, !dbg [[DBG192]] -// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG192]] -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG192]] -// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG194:![0-9]+]] -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG195:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM18:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG194]] -// CHECK1-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG194]] -// CHECK1-NEXT: store i32 [[TMP12]], ptr [[ARRAYIDX19]], align 4, !dbg [[DBG196:![0-9]+]] -// CHECK1-NEXT: [[TMP14:%.*]] = load i8, ptr [[TMP8]], align 1, !dbg [[DBG197:![0-9]+]] -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP14]] to i1, !dbg [[DBG197]] -// CHECK1-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG197]] -// CHECK1-NEXT: store i32 [[CONV]], ptr [[D]], align 4, !dbg [[DBG198:![0-9]+]] -// CHECK1-NEXT: ret void, !dbg [[DBG199:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META178:![0-9]+]], metadata !DIExpression()), !dbg [[DBG179:![0-9]+]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG180:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG180]] +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG180]] +// CHECK1-NEXT: [[TMP2:%.*]] = load 
ptr, ptr [[TMP]], align 8, !dbg [[DBG180]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG180]] +// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG180]] +// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG180]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG180]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG180]] +// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG180]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG180]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG180]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META181:![0-9]+]], metadata !DIExpression()), !dbg [[DBG183:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG184:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG184]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX3]], i64 0, i64 1, !dbg [[DBG184]] +// CHECK1-NEXT: store ptr [[ARRAYIDX4]], ptr [[F]], align 8, !dbg [[DBG183]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META185:![0-9]+]], metadata !DIExpression()), !dbg [[DBG186:![0-9]+]] +// CHECK1-NEXT: store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG186]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META187:![0-9]+]], metadata !DIExpression()), !dbg [[DBG188:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 1, !dbg [[DBG189:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX5]], i64 0, i64 1, !dbg [[DBG189]] +// 
CHECK1-NEXT: store ptr [[ARRAYIDX6]], ptr [[H]], align 8, !dbg [[DBG188]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META190:![0-9]+]], metadata !DIExpression()), !dbg [[DBG191:![0-9]+]] +// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG191]] +// CHECK1-NEXT: store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG192:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG193:![0-9]+]] +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG194:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64, !dbg [[DBG193]] +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 [[IDXPROM]], !dbg [[DBG193]] +// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX8]], align 4, !dbg [[DBG195:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG196:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX9]], i64 0, i64 0, !dbg [[DBG196]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG197:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP10]] to i64, !dbg [[DBG196]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX10]], i64 0, i64 [[IDXPROM11]], !dbg [[DBG196]] +// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG198:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG199:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG199]] +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG200:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM15:%.*]] = sext i32 [[TMP11]] to i64, !dbg [[DBG199]] +// CHECK1-NEXT: 
[[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG199]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG199]] +// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG201:![0-9]+]] +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG202:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM18:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG201]] +// CHECK1-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG201]] +// CHECK1-NEXT: store i32 [[TMP12]], ptr [[ARRAYIDX19]], align 4, !dbg [[DBG203:![0-9]+]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i8, ptr [[TMP8]], align 1, !dbg [[DBG204:![0-9]+]] +// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP14]] to i1, !dbg [[DBG204]] +// CHECK1-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG204]] +// CHECK1-NEXT: store i32 [[CONV]], ptr [[D]], align 4, !dbg [[DBG205:![0-9]+]] +// CHECK1-NEXT: ret void, !dbg [[DBG206:![0-9]+]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug___omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG200:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG207:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -435,68 +468,35 @@ int main() { // CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META201:![0-9]+]], metadata !DIExpression()), !dbg [[DBG202:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META208:![0-9]+]], metadata !DIExpression()), !dbg [[DBG209:![0-9]+]] // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META203:![0-9]+]], metadata !DIExpression()), !dbg [[DBG202]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META210:![0-9]+]], metadata !DIExpression()), !dbg [[DBG209]] // CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META204:![0-9]+]], metadata !DIExpression()), !dbg [[DBG202]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META211:![0-9]+]], metadata !DIExpression()), !dbg [[DBG209]] // CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META205:![0-9]+]], metadata !DIExpression()), !dbg [[DBG202]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META212:![0-9]+]], metadata !DIExpression()), !dbg [[DBG209]] // CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META206:![0-9]+]], metadata !DIExpression()), !dbg [[DBG202]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata 
[[META213:![0-9]+]], metadata !DIExpression()), !dbg [[DBG209]] // CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META207:![0-9]+]], metadata !DIExpression()), !dbg [[DBG202]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG208:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG208]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG208]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG208]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG208]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG208]] -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG208]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG208]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG208]] -// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG208]] -// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG208]] -// CHECK1-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG208]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR4]], !dbg [[DBG208]] -// CHECK1-NEXT: ret void, !dbg [[DBG208]] -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37 -// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) 
#[[ATTR6]] !dbg [[DBG209:![0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META210:![0-9]+]], metadata !DIExpression()), !dbg [[DBG211:![0-9]+]] -// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META212:![0-9]+]], metadata !DIExpression()), !dbg [[DBG211]] -// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META213:![0-9]+]], metadata !DIExpression()), !dbg [[DBG211]] -// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META214:![0-9]+]], metadata !DIExpression()), !dbg [[DBG211]] -// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META215:![0-9]+]], metadata !DIExpression()), !dbg [[DBG211]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG216:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG216]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG216]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG216]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG216]] -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG216]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG216]] -// CHECK1-NEXT: [[TMP7:%.*]] = 
load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG216]] -// CHECK1-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG216]] -// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG216]] -// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG216]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]]) #[[ATTR4]], !dbg [[DBG216]] -// CHECK1-NEXT: ret void, !dbg [[DBG216]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META214:![0-9]+]], metadata !DIExpression()), !dbg [[DBG209]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG215:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG215]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG215]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG215]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG215]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG215]] +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG215]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG215]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG215]] +// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG215]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG215]] +// CHECK1-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG215]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 
[[TMP6]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR4]], !dbg [[DBG215]] +// CHECK1-NEXT: ret void, !dbg [[DBG215]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug__ -// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG217:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG216:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8 @@ -509,53 +509,88 @@ int main() { // CHECK1-NEXT: [[_TMP3:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8 // CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META222:![0-9]+]], metadata !DIExpression()), !dbg [[DBG223:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META221:![0-9]+]], metadata !DIExpression()), !dbg [[DBG222:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META224:![0-9]+]], metadata !DIExpression()), !dbg [[DBG225:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META223:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata 
[[META226:![0-9]+]], metadata !DIExpression()), !dbg [[DBG227:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META225:![0-9]+]], metadata !DIExpression()), !dbg [[DBG226:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META228:![0-9]+]], metadata !DIExpression()), !dbg [[DBG229:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META227:![0-9]+]], metadata !DIExpression()), !dbg [[DBG228:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META230:![0-9]+]], metadata !DIExpression()), !dbg [[DBG231:![0-9]+]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG232:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG232]] -// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG232]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG232]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG232]] -// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG232]] -// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG232]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG232]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG232]] -// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG232]] -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG232]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG232]] -// CHECK1-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG232]] -// CHECK1-NEXT: 
[[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG232]] -// CHECK1-NEXT: store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG232]] -// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG232]] -// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG232]] -// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP12]], -1, !dbg [[DBG232]] -// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG232]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META229:![0-9]+]], metadata !DIExpression()), !dbg [[DBG230:![0-9]+]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG231:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG231]] +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG231]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG231]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG231]] +// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG231]] +// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG231]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG231]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG231]] +// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG231]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG231]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG231]] +// CHECK1-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG231]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] 
to ptr, !dbg [[DBG231]] +// CHECK1-NEXT: store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG231]] +// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG231]] +// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG231]] +// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP12]], -1, !dbg [[DBG231]] +// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG231]] // CHECK1: user_code.entry: -// CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB11:[0-9]+]]) -// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG233:![0-9]+]] -// CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP14]], align 8, !dbg [[DBG233]] -// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG233]] -// CHECK1-NEXT: store ptr [[TMP5]], ptr [[TMP15]], align 8, !dbg [[DBG233]] -// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG233]] -// CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP16]], align 8, !dbg [[DBG233]] -// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG233]] -// CHECK1-NEXT: store ptr [[TMP11]], ptr [[TMP17]], align 8, !dbg [[DBG233]] -// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB11]], i32 [[TMP13]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG233]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(), !dbg [[DBG234:![0-9]+]] +// CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB11:[0-9]+]]), !dbg [[DBG232:![0-9]+]] +// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x 
ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG234:![0-9]+]] +// CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP14]], align 8, !dbg [[DBG234]] +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG234]] +// CHECK1-NEXT: store ptr [[TMP5]], ptr [[TMP15]], align 8, !dbg [[DBG234]] +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG234]] +// CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP16]], align 8, !dbg [[DBG234]] +// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG234]] +// CHECK1-NEXT: store ptr [[TMP11]], ptr [[TMP17]], align 8, !dbg [[DBG234]] +// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB11]], i32 [[TMP13]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG234]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(), !dbg [[DBG235:![0-9]+]] // CHECK1-NEXT: ret void, !dbg [[DBG236:![0-9]+]] // CHECK1: worker.exit: -// CHECK1-NEXT: ret void, !dbg [[DBG232]] +// CHECK1-NEXT: ret void, !dbg [[DBG231]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51 +// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG237:![0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: store ptr 
[[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META240:![0-9]+]], metadata !DIExpression()), !dbg [[DBG241:![0-9]+]] +// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META242:![0-9]+]], metadata !DIExpression()), !dbg [[DBG241]] +// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META243:![0-9]+]], metadata !DIExpression()), !dbg [[DBG241]] +// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META244:![0-9]+]], metadata !DIExpression()), !dbg [[DBG241]] +// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META245:![0-9]+]], metadata !DIExpression()), !dbg [[DBG241]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG246:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG246]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG246]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG246]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG246]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG246]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG246]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG246]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG246]] +// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG246]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG246]] +// 
CHECK1-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG246]] +// CHECK1-NEXT: [[TMP12:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG246]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug__(ptr [[TMP4]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]]) #[[ATTR4]], !dbg [[DBG246]] +// CHECK1-NEXT: ret void, !dbg [[DBG246]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug___omp_outlined_debug__ -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG237:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG247:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -572,82 +607,82 @@ int main() { // CHECK1-NEXT: [[H:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[D:%.*]] = alloca i32, align 4 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META240:![0-9]+]], metadata !DIExpression()), !dbg [[DBG241:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META250:![0-9]+]], metadata !DIExpression()), !dbg [[DBG251:![0-9]+]] // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr 
[[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META242:![0-9]+]], metadata !DIExpression()), !dbg [[DBG241]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META252:![0-9]+]], metadata !DIExpression()), !dbg [[DBG251]] // CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META243:![0-9]+]], metadata !DIExpression()), !dbg [[DBG244:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META253:![0-9]+]], metadata !DIExpression()), !dbg [[DBG254:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META245:![0-9]+]], metadata !DIExpression()), !dbg [[DBG246:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META255:![0-9]+]], metadata !DIExpression()), !dbg [[DBG256:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META247:![0-9]+]], metadata !DIExpression()), !dbg [[DBG248:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META257:![0-9]+]], metadata !DIExpression()), !dbg [[DBG258:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META249:![0-9]+]], metadata !DIExpression()), !dbg [[DBG250:![0-9]+]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG251:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG251]] -// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG251]] -// CHECK1-NEXT: 
[[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG251]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG251]] -// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG251]] -// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG251]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG251]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG251]] -// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG251]] -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG251]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG251]] -// CHECK1-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG251]] -// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG251]] -// CHECK1-NEXT: store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG251]] -// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG251]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META252:![0-9]+]], metadata !DIExpression()), !dbg [[DBG254:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG255:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG255]] -// CHECK1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX4]], i64 0, i64 1, !dbg [[DBG255]] -// CHECK1-NEXT: store ptr [[ARRAYIDX5]], ptr [[F]], align 8, !dbg [[DBG254]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META256:![0-9]+]], metadata !DIExpression()), !dbg [[DBG257:![0-9]+]] -// CHECK1-NEXT: store ptr [[TMP5]], ptr [[G]], align 8, !dbg [[DBG257]] -// CHECK1-NEXT: call void 
@llvm.dbg.declare(metadata ptr [[H]], metadata [[META258:![0-9]+]], metadata !DIExpression()), !dbg [[DBG259:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 1, !dbg [[DBG260:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG260]] -// CHECK1-NEXT: store ptr [[ARRAYIDX7]], ptr [[H]], align 8, !dbg [[DBG259]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META261:![0-9]+]], metadata !DIExpression()), !dbg [[DBG262:![0-9]+]] -// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG262]] -// CHECK1-NEXT: store i32 5, ptr [[TMP5]], align 4, !dbg [[DBG263:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG264:![0-9]+]] -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG265:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP12]] to i64, !dbg [[DBG264]] -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 [[IDXPROM]], !dbg [[DBG264]] -// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX9]], align 4, !dbg [[DBG266:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG267:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX10]], i64 0, i64 0, !dbg [[DBG267]] -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG268:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG267]] -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM12]], !dbg [[DBG267]] -// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX13]], align 4, !dbg [[DBG269:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], 
ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG270:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX14]], i64 0, i64 0, !dbg [[DBG270]] -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG271:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM16:%.*]] = sext i32 [[TMP14]] to i64, !dbg [[DBG270]] -// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX15]], i64 0, i64 [[IDXPROM16]], !dbg [[DBG270]] -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX17]], align 4, !dbg [[DBG270]] -// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG272:![0-9]+]] -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG273:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP16]] to i64, !dbg [[DBG272]] -// CHECK1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG272]] -// CHECK1-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX20]], align 4, !dbg [[DBG274:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG275:![0-9]+]] -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG276:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP17]] to i64, !dbg [[DBG275]] -// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX23]], align 4, !dbg [[DBG275]] -// CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP18]], 0, !dbg [[DBG275]] -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG277:![0-9]+]] -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[TMP11]], align 1, !dbg [[DBG277]] -// CHECK1-NEXT: ret void, !dbg [[DBG278:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], 
metadata [[META259:![0-9]+]], metadata !DIExpression()), !dbg [[DBG260:![0-9]+]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG261:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG261]] +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG261]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG261]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG261]] +// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG261]] +// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG261]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG261]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG261]] +// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG261]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG261]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG261]] +// CHECK1-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG261]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG261]] +// CHECK1-NEXT: store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG261]] +// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG261]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META262:![0-9]+]], metadata !DIExpression()), !dbg [[DBG264:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG265:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG265]] +// CHECK1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX4]], 
i64 0, i64 1, !dbg [[DBG265]] +// CHECK1-NEXT: store ptr [[ARRAYIDX5]], ptr [[F]], align 8, !dbg [[DBG264]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META266:![0-9]+]], metadata !DIExpression()), !dbg [[DBG267:![0-9]+]] +// CHECK1-NEXT: store ptr [[TMP5]], ptr [[G]], align 8, !dbg [[DBG267]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META268:![0-9]+]], metadata !DIExpression()), !dbg [[DBG269:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 1, !dbg [[DBG270:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG270]] +// CHECK1-NEXT: store ptr [[ARRAYIDX7]], ptr [[H]], align 8, !dbg [[DBG269]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META271:![0-9]+]], metadata !DIExpression()), !dbg [[DBG272:![0-9]+]] +// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG272]] +// CHECK1-NEXT: store i32 5, ptr [[TMP5]], align 4, !dbg [[DBG273:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG274:![0-9]+]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG275:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP12]] to i64, !dbg [[DBG274]] +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 [[IDXPROM]], !dbg [[DBG274]] +// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX9]], align 4, !dbg [[DBG276:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG277:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX10]], i64 0, i64 0, !dbg [[DBG277]] +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG278:![0-9]+]] +// CHECK1-NEXT: 
[[IDXPROM12:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG277]] +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM12]], !dbg [[DBG277]] +// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX13]], align 4, !dbg [[DBG279:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG280:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX14]], i64 0, i64 0, !dbg [[DBG280]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG281:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM16:%.*]] = sext i32 [[TMP14]] to i64, !dbg [[DBG280]] +// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX15]], i64 0, i64 [[IDXPROM16]], !dbg [[DBG280]] +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX17]], align 4, !dbg [[DBG280]] +// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG282:![0-9]+]] +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG283:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP16]] to i64, !dbg [[DBG282]] +// CHECK1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG282]] +// CHECK1-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX20]], align 4, !dbg [[DBG284:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG285:![0-9]+]] +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG286:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP17]] to i64, !dbg [[DBG285]] +// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX23]], align 4, !dbg [[DBG285]] 
+// CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP18]], 0, !dbg [[DBG285]] +// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG287:![0-9]+]] +// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[TMP11]], align 1, !dbg [[DBG287]] +// CHECK1-NEXT: ret void, !dbg [[DBG288:![0-9]+]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug___omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG279:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG289:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -656,66 +691,31 @@ int main() { // CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META282:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META292:![0-9]+]], metadata !DIExpression()), !dbg [[DBG293:![0-9]+]] // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: call void 
@llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META284:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283]] -// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META285:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283]] -// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META286:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283]] -// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META287:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283]] -// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META288:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG289:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG289]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG289]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG289]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG289]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG289]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG289]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG289]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG289]] -// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG289]] -// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG289]] -// CHECK1-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG289]] -// 
CHECK1-NEXT: [[TMP12:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG289]] -// CHECK1-NEXT: [[TMP13:%.*]] = addrspacecast ptr [[TMP9]] to ptr addrspace(1), !dbg [[DBG289]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug___omp_outlined_debug__(ptr [[TMP4]], ptr [[TMP5]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]], ptr addrspace(1) [[TMP13]]) #[[ATTR4]], !dbg [[DBG289]] -// CHECK1-NEXT: ret void, !dbg [[DBG289]] -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51 -// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR6]] !dbg [[DBG290:![0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META293:![0-9]+]], metadata !DIExpression()), !dbg [[DBG294:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META294:![0-9]+]], metadata !DIExpression()), !dbg [[DBG293]] // CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META295:![0-9]+]], metadata !DIExpression()), !dbg [[DBG294]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META295:![0-9]+]], metadata !DIExpression()), !dbg [[DBG293]] // CHECK1-NEXT: store ptr [[A]], 
ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META296:![0-9]+]], metadata !DIExpression()), !dbg [[DBG294]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META296:![0-9]+]], metadata !DIExpression()), !dbg [[DBG293]] // CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META297:![0-9]+]], metadata !DIExpression()), !dbg [[DBG294]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META297:![0-9]+]], metadata !DIExpression()), !dbg [[DBG293]] // CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META298:![0-9]+]], metadata !DIExpression()), !dbg [[DBG294]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META298:![0-9]+]], metadata !DIExpression()), !dbg [[DBG293]] // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG299:![0-9]+]] // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG299]] // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG299]] // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG299]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG299]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG299]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG299]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG299]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG299]] -// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG299]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG299]] +// CHECK1-NEXT: [[TMP5:%.*]] = 
load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG299]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG299]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG299]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG299]] +// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG299]] // CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG299]] // CHECK1-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG299]] // CHECK1-NEXT: [[TMP12:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG299]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug__(ptr [[TMP4]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]]) #[[ATTR4]], !dbg [[DBG299]] +// CHECK1-NEXT: [[TMP13:%.*]] = addrspacecast ptr [[TMP9]] to ptr addrspace(1), !dbg [[DBG299]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug___omp_outlined_debug__(ptr [[TMP4]], ptr [[TMP5]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]], ptr addrspace(1) [[TMP13]]) #[[ATTR4]], !dbg [[DBG299]] // CHECK1-NEXT: ret void, !dbg [[DBG299]] // diff --git a/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp b/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp index 688456a647764..7f9c7e0c3babb 100644 --- a/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp @@ -55,7 +55,7 @@ int main() { return 0; } // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__ -// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]], i1 noundef zeroext 
[[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG22:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]], i1 noundef zeroext [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1:[0-9]+]] !dbg [[DBG19:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8 @@ -69,58 +69,95 @@ int main() { // CHECK1-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8 // CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META43:![0-9]+]], metadata !DIExpression()), !dbg [[DBG44:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META40:![0-9]+]], metadata !DIExpression()), !dbg [[DBG41:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META45:![0-9]+]], metadata !DIExpression()), !dbg [[DBG46:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META42:![0-9]+]], metadata !DIExpression()), !dbg [[DBG43:![0-9]+]] // CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META47:![0-9]+]], metadata !DIExpression()), !dbg [[DBG48:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META44:![0-9]+]], metadata !DIExpression()), !dbg [[DBG45:![0-9]+]] // CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META49:![0-9]+]], metadata !DIExpression()), !dbg [[DBG50:![0-9]+]] +// 
CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META46:![0-9]+]], metadata !DIExpression()), !dbg [[DBG47:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META51:![0-9]+]], metadata !DIExpression()), !dbg [[DBG52:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META48:![0-9]+]], metadata !DIExpression()), !dbg [[DBG49:![0-9]+]] // CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[DOTCAPTURE_EXPR_]] to i8 // CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTCAPTURE_EXPR__ADDR]], metadata [[META53:![0-9]+]], metadata !DIExpression()), !dbg [[DBG54:![0-9]+]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG55:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG55]] -// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG55]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG55]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG55]] -// CHECK1-NEXT: store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG55]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG55]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG55]] -// CHECK1-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG55]] -// CHECK1-NEXT: store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG55]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG55]] -// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG55]] -// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 
[[TMP8]], -1, !dbg [[DBG55]] -// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG55]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTCAPTURE_EXPR__ADDR]], metadata [[META50:![0-9]+]], metadata !DIExpression()), !dbg [[DBG51:![0-9]+]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG52:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG52]] +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG52]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG52]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG52]] +// CHECK1-NEXT: store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG52]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG52]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG52]] +// CHECK1-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG52]] +// CHECK1-NEXT: store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG52]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG52]] +// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG52]] +// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP8]], -1, !dbg [[DBG52]] +// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG52]] // CHECK1: user_code.entry: -// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB6:[0-9]+]]) -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG56:![0-9]+]] -// CHECK1-NEXT: store i32 [[TMP10]], ptr [[A_CASTED]], align 4, !dbg [[DBG56]] -// CHECK1-NEXT: [[TMP11:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG56]] -// 
CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG56]] -// CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP12]], align 8, !dbg [[DBG56]] -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG56]] -// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG56]] -// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP13]], align 8, !dbg [[DBG56]] -// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG56]] -// CHECK1-NEXT: store ptr [[TMP4]], ptr [[TMP15]], align 8, !dbg [[DBG56]] -// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG56]] -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[TMP16]], align 8, !dbg [[DBG56]] -// CHECK1-NEXT: [[TMP17:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG57:![0-9]+]] -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP17]] to i1, !dbg [[DBG57]] -// CHECK1-NEXT: [[TMP18:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG56]] -// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB6]], i32 [[TMP9]], i32 [[TMP18]], i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG56]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(), !dbg [[DBG59:![0-9]+]] -// CHECK1-NEXT: ret void, !dbg [[DBG60:![0-9]+]] +// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB6:[0-9]+]]), !dbg [[DBG53:![0-9]+]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG55:![0-9]+]] +// CHECK1-NEXT: store i32 [[TMP10]], ptr [[A_CASTED]], align 4, !dbg [[DBG55]] +// CHECK1-NEXT: [[TMP11:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG55]] +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg 
[[DBG55]] +// CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP12]], align 8, !dbg [[DBG55]] +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG55]] +// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG55]] +// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP13]], align 8, !dbg [[DBG55]] +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG55]] +// CHECK1-NEXT: store ptr [[TMP4]], ptr [[TMP15]], align 8, !dbg [[DBG55]] +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG55]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[TMP16]], align 8, !dbg [[DBG55]] +// CHECK1-NEXT: [[TMP17:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG56:![0-9]+]] +// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP17]] to i1, !dbg [[DBG56]] +// CHECK1-NEXT: [[TMP18:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG55]] +// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB6]], i32 [[TMP9]], i32 [[TMP18]], i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG55]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(), !dbg [[DBG57:![0-9]+]] +// CHECK1-NEXT: ret void, !dbg [[DBG58:![0-9]+]] // CHECK1: worker.exit: -// CHECK1-NEXT: ret void, !dbg [[DBG55]] +// CHECK1-NEXT: ret void, !dbg [[DBG52]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13 +// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2:[0-9]+]] !dbg [[DBG59:![0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = 
alloca ptr, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META66:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67:![0-9]+]] +// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META68:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]] +// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META69:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]] +// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META70:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]] +// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]] +// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTCAPTURE_EXPR__ADDR]], metadata [[META72:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG73:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG73]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG73]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG73]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr 
[[C_ADDR]], align 8, !dbg [[DBG73]] +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG73]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG73]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG73]] +// CHECK1-NEXT: [[TMP8:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG73]] +// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP8]] to i1, !dbg [[DBG73]] +// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG73]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG73]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP9]], i32 [[TMP5]], ptr [[TMP6]], ptr addrspace(1) [[TMP10]], i1 [[TOBOOL]]) #[[ATTR4:[0-9]+]], !dbg [[DBG73]] +// CHECK1-NEXT: ret void, !dbg [[DBG73]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__ -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2:[0-9]+]] !dbg [[DBG61:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG74:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -144,149 +181,149 @@ int main() { // CHECK1-NEXT: [[H:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[D:%.*]] = alloca i32, align 4 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: 
call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META68:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META81:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82:![0-9]+]] // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META70:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META83:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82]] // CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META84:![0-9]+]], metadata !DIExpression()), !dbg [[DBG85:![0-9]+]] // CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META73:![0-9]+]], metadata !DIExpression()), !dbg [[DBG74:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META86:![0-9]+]], metadata !DIExpression()), !dbg [[DBG87:![0-9]+]] // CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META75:![0-9]+]], metadata !DIExpression()), !dbg [[DBG76:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META88:![0-9]+]], metadata !DIExpression()), !dbg [[DBG89:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META77:![0-9]+]], metadata !DIExpression()), 
!dbg [[DBG78:![0-9]+]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG79:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG79]] -// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG79]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG79]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG79]] -// CHECK1-NEXT: store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG79]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG79]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG79]] -// CHECK1-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG79]] -// CHECK1-NEXT: store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG79]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG79]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META80:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META81:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG82:![0-9]+]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META83:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]] -// CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META84:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]] -// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META85:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG82]] 
-// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B4]], metadata [[META86:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]] -// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B4]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG79]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META87:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG79]] -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !dbg [[DBG79]] -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP9]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG88:![0-9]+]] -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG79]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META90:![0-9]+]], metadata !DIExpression()), !dbg [[DBG91:![0-9]+]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG92:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG92]] +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG92]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG92]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG92]] +// CHECK1-NEXT: store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG92]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG92]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG92]] +// CHECK1-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG92]] +// CHECK1-NEXT: store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG92]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG92]] +// CHECK1-NEXT: tail call void 
@llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META93:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META94:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG95:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META96:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82]] +// CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META97:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82]] +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META98:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B4]], metadata [[META99:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82]] +// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B4]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG92]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META100:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG92]] +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !dbg [[DBG92]] +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP9]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG101:![0-9]+]] +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG92]] // CHECK1: omp.dispatch.cond: -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], 
align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP10]], 9, !dbg [[DBG82]] -// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG82]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP10]], 9, !dbg [[DBG95]] +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG95]] // CHECK1: cond.true: -// CHECK1-NEXT: br label [[COND_END:%.*]], !dbg [[DBG82]] +// CHECK1-NEXT: br label [[COND_END:%.*]], !dbg [[DBG95]] // CHECK1: cond.false: -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: br label [[COND_END]], !dbg [[DBG82]] +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: br label [[COND_END]], !dbg [[DBG95]] // CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ], !dbg [[DBG82]] -// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]], !dbg [[DBG79]] -// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG79]] +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ], !dbg [[DBG95]] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]] +// 
CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]], !dbg [[DBG92]] +// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG92]] // CHECK1: omp.dispatch.body: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG79]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG92]] // CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]], !dbg [[DBG79]] -// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG79]] +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]], !dbg [[DBG92]] +// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG92]] // CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP17]], 1, !dbg [[DBG89:![0-9]+]] -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG89]] -// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG89]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META90:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG94:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], 
ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG94]] -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG94]] -// CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[DBG93]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META95:![0-9]+]], metadata !DIExpression()), !dbg [[DBG96:![0-9]+]] -// CHECK1-NEXT: store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG96]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META97:![0-9]+]], metadata !DIExpression()), !dbg [[DBG98:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 1, !dbg [[DBG99:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG99]] -// CHECK1-NEXT: store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[DBG98]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META100:![0-9]+]], metadata !DIExpression()), !dbg [[DBG101:![0-9]+]] -// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG101]] -// CHECK1-NEXT: store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG102:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG103:![0-9]+]] -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG104:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP18]] to i64, !dbg [[DBG103]] -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG103]] -// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG105:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG106:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg 
[[DBG106]] -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG107:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM15:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG106]] -// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG106]] -// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG108:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG109:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG109]] -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG110:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG109]] -// CHECK1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG109]] -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG109]] -// CHECK1-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG111:![0-9]+]] -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG112:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG111]] -// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG111]] -// CHECK1-NEXT: store i32 [[TMP21]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG113:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG114:![0-9]+]] -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG115:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM25:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG114]] -// CHECK1-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 
[[IDXPROM25]], !dbg [[DBG114]] -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG114]] -// CHECK1-NEXT: [[TMP25:%.*]] = load i8, ptr [[TMP7]], align 1, !dbg [[DBG116:![0-9]+]] -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP25]] to i1, !dbg [[DBG116]] -// CHECK1-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG116]] -// CHECK1-NEXT: [[OR:%.*]] = or i32 [[CONV]], [[TMP24]], !dbg [[DBG116]] -// CHECK1-NEXT: [[TOBOOL27:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG116]] -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL27]] to i8, !dbg [[DBG116]] -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[TMP7]], align 1, !dbg [[DBG116]] -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG117:![0-9]+]] +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP17]], 1, !dbg [[DBG102:![0-9]+]] +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG102]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG102]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META103:![0-9]+]], metadata !DIExpression()), !dbg [[DBG106:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG107:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG107]] +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG107]] +// CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[DBG106]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META108:![0-9]+]], metadata !DIExpression()), !dbg [[DBG109:![0-9]+]] +// CHECK1-NEXT: store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG109]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META110:![0-9]+]], 
metadata !DIExpression()), !dbg [[DBG111:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 1, !dbg [[DBG112:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG112]] +// CHECK1-NEXT: store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[DBG111]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META113:![0-9]+]], metadata !DIExpression()), !dbg [[DBG114:![0-9]+]] +// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG114]] +// CHECK1-NEXT: store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG115:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG116:![0-9]+]] +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG117:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP18]] to i64, !dbg [[DBG116]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG116]] +// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG118:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG119:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG119]] +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG120:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM15:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG119]] +// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG119]] +// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG121:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG122:![0-9]+]] +// 
CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG122]] +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG123:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG122]] +// CHECK1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG122]] +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG122]] +// CHECK1-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG124:![0-9]+]] +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG125:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG124]] +// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG124]] +// CHECK1-NEXT: store i32 [[TMP21]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG126:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG127:![0-9]+]] +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG128:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM25:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG127]] +// CHECK1-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG127]] +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG127]] +// CHECK1-NEXT: [[TMP25:%.*]] = load i8, ptr [[TMP7]], align 1, !dbg [[DBG129:![0-9]+]] +// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP25]] to i1, !dbg [[DBG129]] +// CHECK1-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG129]] +// CHECK1-NEXT: [[OR:%.*]] = or i32 [[CONV]], [[TMP24]], !dbg [[DBG129]] +// CHECK1-NEXT: [[TOBOOL27:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG129]] +// CHECK1-NEXT: [[FROMBOOL:%.*]] = 
zext i1 [[TOBOOL27]] to i8, !dbg [[DBG129]] +// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[TMP7]], align 1, !dbg [[DBG129]] +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG130:![0-9]+]] // CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG88]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG101]] // CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[ADD28:%.*]] = add nsw i32 [[TMP26]], 1, !dbg [[DBG79]] -// CHECK1-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG79]] -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !dbg [[DBG88]], !llvm.loop [[LOOP118:![0-9]+]] +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[ADD28:%.*]] = add nsw i32 [[TMP26]], 1, !dbg [[DBG92]] +// CHECK1-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG92]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !dbg [[DBG101]], !llvm.loop [[LOOP131:![0-9]+]] // CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG88]] +// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG101]] // CHECK1: omp.dispatch.inc: -// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP27]], [[TMP28]], !dbg [[DBG79]] -// CHECK1-NEXT: store i32 [[ADD29]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG79]] -// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[ADD30:%.*]] = add nsw i32 [[TMP29]], [[TMP30]], !dbg [[DBG79]] -// CHECK1-NEXT: store i32 [[ADD30]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG79]] -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]], !dbg 
[[DBG88]], !llvm.loop [[LOOP120:![0-9]+]] +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP27]], [[TMP28]], !dbg [[DBG92]] +// CHECK1-NEXT: store i32 [[ADD29]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG92]] +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[ADD30:%.*]] = add nsw i32 [[TMP29]], [[TMP30]], !dbg [[DBG92]] +// CHECK1-NEXT: store i32 [[ADD30]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG92]] +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]], !dbg [[DBG101]], !llvm.loop [[LOOP133:![0-9]+]] // CHECK1: omp.dispatch.end: -// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB5:[0-9]+]], i32 [[TMP9]]), !dbg [[DBG119:![0-9]+]] -// CHECK1-NEXT: ret void, !dbg [[DBG121:![0-9]+]] +// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB5:[0-9]+]], i32 [[TMP9]]), !dbg [[DBG132:![0-9]+]] +// CHECK1-NEXT: ret void, !dbg [[DBG134:![0-9]+]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG122:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG135:![0-9]+]] { // CHECK1-NEXT: entry: 
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -295,71 +332,34 @@ int main() { // CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META129:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META138:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139:![0-9]+]] // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META131:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] -// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META132:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] -// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] -// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META134:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] -// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META135:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG136:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG136]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG136]] -// 
CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG136]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG136]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG136]] -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG136]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG136]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG136]] -// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG136]] -// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG136]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr [[TMP7]], ptr addrspace(1) [[TMP10]]) #[[ATTR4:[0-9]+]], !dbg [[DBG136]] -// CHECK1-NEXT: ret void, !dbg [[DBG136]] -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13 -// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR6:[0-9]+]] !dbg [[DBG137:![0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META140:![0-9]+]], metadata 
!DIExpression()), !dbg [[DBG141:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META140:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]] // CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META142:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META141:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]] // CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META143:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META142:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]] // CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META143:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]] // CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META145:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]] -// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTCAPTURE_EXPR__ADDR]], metadata [[META146:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG147:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG147]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG147]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr 
[[DYN_PTR_ADDR]], align 8, !dbg [[DBG147]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG147]] -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG147]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG147]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG147]] -// CHECK1-NEXT: [[TMP8:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG147]] -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP8]] to i1, !dbg [[DBG147]] -// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG147]] -// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG147]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP9]], i32 [[TMP5]], ptr [[TMP6]], ptr addrspace(1) [[TMP10]], i1 [[TOBOOL]]) #[[ATTR4]], !dbg [[DBG147]] -// CHECK1-NEXT: ret void, !dbg [[DBG147]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG145:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG145]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG145]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG145]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG145]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG145]] +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG145]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG145]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG145]] +// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] 
to ptr addrspace(1), !dbg [[DBG145]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG145]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr [[TMP7]], ptr addrspace(1) [[TMP10]]) #[[ATTR4]], !dbg [[DBG145]] +// CHECK1-NEXT: ret void, !dbg [[DBG145]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__ -// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG148:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG146:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8 @@ -372,32 +372,32 @@ int main() { // CHECK1-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8 // CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META153:![0-9]+]], metadata !DIExpression()), !dbg [[DBG154:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META151:![0-9]+]], metadata !DIExpression()), !dbg [[DBG152:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META155:![0-9]+]], metadata !DIExpression()), !dbg [[DBG156:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr 
[[C_ADDR]], metadata [[META153:![0-9]+]], metadata !DIExpression()), !dbg [[DBG154:![0-9]+]] // CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META157:![0-9]+]], metadata !DIExpression()), !dbg [[DBG158:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META155:![0-9]+]], metadata !DIExpression()), !dbg [[DBG156:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META159:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META157:![0-9]+]], metadata !DIExpression()), !dbg [[DBG158:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META161:![0-9]+]], metadata !DIExpression()), !dbg [[DBG162:![0-9]+]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG163:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG163]] -// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG163]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG163]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG163]] -// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG163]] -// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG163]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG163]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG163]] -// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG163]] -// CHECK1-NEXT: store ptr [[TMP7]], ptr 
[[_TMP2]], align 8, !dbg [[DBG163]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG163]] -// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG163]] -// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP9]], -1, !dbg [[DBG163]] -// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG163]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META159:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160:![0-9]+]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG161:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG161]] +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG161]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG161]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG161]] +// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG161]] +// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG161]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG161]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG161]] +// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG161]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG161]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG161]] +// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG161]] +// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP9]], -1, !dbg [[DBG161]] +// CHECK1-NEXT: br i1 
[[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG161]] // CHECK1: user_code.entry: -// CHECK1-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB13:[0-9]+]]) +// CHECK1-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB13:[0-9]+]]), !dbg [[DBG162:![0-9]+]] // CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG164:![0-9]+]] // CHECK1-NEXT: store i32 [[TMP11]], ptr [[A_CASTED]], align 4, !dbg [[DBG164]] // CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG164]] @@ -412,13 +412,46 @@ int main() { // CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP17]], align 8, !dbg [[DBG164]] // CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB13]], i32 [[TMP10]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG164]] // CHECK1-NEXT: call void @__kmpc_target_deinit(), !dbg [[DBG165:![0-9]+]] -// CHECK1-NEXT: ret void, !dbg [[DBG167:![0-9]+]] +// CHECK1-NEXT: ret void, !dbg [[DBG166:![0-9]+]] // CHECK1: worker.exit: -// CHECK1-NEXT: ret void, !dbg [[DBG163]] +// CHECK1-NEXT: ret void, !dbg [[DBG161]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27 +// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG167:![0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// 
CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META170:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171:![0-9]+]] +// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META172:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]] +// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META173:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]] +// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META174:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]] +// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META175:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG176:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG176]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG176]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG176]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG176]] +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG176]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG176]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG176]] +// CHECK1-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG176]] +// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG176]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG176]] +// CHECK1-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]]) #[[ATTR4]], !dbg [[DBG176]] +// CHECK1-NEXT: ret void, !dbg [[DBG176]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__ -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG168:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG177:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -441,140 +474,140 @@ int main() { // CHECK1-NEXT: [[H:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[D:%.*]] = alloca i32, align 4 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META171:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META180:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181:![0-9]+]] // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META173:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata 
[[META182:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181]] // CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META174:![0-9]+]], metadata !DIExpression()), !dbg [[DBG175:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META183:![0-9]+]], metadata !DIExpression()), !dbg [[DBG184:![0-9]+]] // CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META176:![0-9]+]], metadata !DIExpression()), !dbg [[DBG177:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META185:![0-9]+]], metadata !DIExpression()), !dbg [[DBG186:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META178:![0-9]+]], metadata !DIExpression()), !dbg [[DBG179:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META187:![0-9]+]], metadata !DIExpression()), !dbg [[DBG188:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META180:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181:![0-9]+]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG182:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG182]] -// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG182]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG182]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG182]] -// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG182]] -// CHECK1-NEXT: store ptr [[TMP4]], ptr 
[[_TMP1]], align 8, !dbg [[DBG182]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG182]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG182]] -// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG182]] -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG182]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG182]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META183:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META184:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG185:![0-9]+]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META186:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]] -// CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META187:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]] -// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META188:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META189:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]] -// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG182]] -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !dbg [[DBG182]] -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB10:[0-9]+]], i32 [[TMP10]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], 
ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG190:![0-9]+]] -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG182]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META189:![0-9]+]], metadata !DIExpression()), !dbg [[DBG190:![0-9]+]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG191:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG191]] +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG191]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG191]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG191]] +// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG191]] +// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG191]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG191]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG191]] +// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG191]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG191]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG191]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META192:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META193:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG194:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META195:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181]] +// CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: tail call void 
@llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META196:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181]] +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META197:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META198:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181]] +// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG191]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !dbg [[DBG191]] +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB10:[0-9]+]], i32 [[TMP10]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG199:![0-9]+]] +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG191]] // CHECK1: omp.dispatch.cond: -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP11]], 9, !dbg [[DBG185]] -// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG185]] +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP11]], 9, !dbg [[DBG194]] +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG194]] // CHECK1: cond.true: -// CHECK1-NEXT: br label [[COND_END:%.*]], !dbg [[DBG185]] +// CHECK1-NEXT: br label [[COND_END:%.*]], !dbg [[DBG194]] // CHECK1: cond.false: -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: br label [[COND_END]], !dbg [[DBG185]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]] +// 
CHECK1-NEXT: br label [[COND_END]], !dbg [[DBG194]] // CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ], !dbg [[DBG185]] -// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[CMP4:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]], !dbg [[DBG182]] -// CHECK1-NEXT: br i1 [[CMP4]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG182]] +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ], !dbg [[DBG194]] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[CMP4:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]], !dbg [[DBG191]] +// CHECK1-NEXT: br i1 [[CMP4]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG191]] // CHECK1: omp.dispatch.body: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG182]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG191]] // CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]], !dbg [[DBG182]] -// 
CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG182]] +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]], !dbg [[DBG191]] +// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG191]] // CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1, !dbg [[DBG191:![0-9]+]] -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG191]] -// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG191]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META192:![0-9]+]], metadata !DIExpression()), !dbg [[DBG194:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG195:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG195]] -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG195]] -// CHECK1-NEXT: store ptr [[ARRAYIDX7]], ptr [[F]], align 8, !dbg [[DBG194]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META196:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197:![0-9]+]] -// CHECK1-NEXT: store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG197]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META198:![0-9]+]], metadata !DIExpression()), !dbg [[DBG199:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 1, !dbg [[DBG200:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x 
i32], ptr [[ARRAYIDX8]], i64 0, i64 1, !dbg [[DBG200]] -// CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[H]], align 8, !dbg [[DBG199]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META201:![0-9]+]], metadata !DIExpression()), !dbg [[DBG202:![0-9]+]] -// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG202]] -// CHECK1-NEXT: store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG203:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG204:![0-9]+]] -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG205:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG204]] -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX10]], i64 0, i64 [[IDXPROM]], !dbg [[DBG204]] -// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX11]], align 4, !dbg [[DBG206:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG207:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX12]], i64 0, i64 0, !dbg [[DBG207]] -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG208:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM14:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG207]] -// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX13]], i64 0, i64 [[IDXPROM14]], !dbg [[DBG207]] -// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX15]], align 4, !dbg [[DBG209:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG210:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX16]], i64 0, i64 0, !dbg [[DBG210]] -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG211:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM18:%.*]] 
= sext i32 [[TMP21]] to i64, !dbg [[DBG210]] -// CHECK1-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG210]] -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4, !dbg [[DBG210]] -// CHECK1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG212:![0-9]+]] -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG213:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM21:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG212]] -// CHECK1-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX20]], i64 0, i64 [[IDXPROM21]], !dbg [[DBG212]] -// CHECK1-NEXT: store i32 [[TMP22]], ptr [[ARRAYIDX22]], align 4, !dbg [[DBG214:![0-9]+]] -// CHECK1-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP8]], align 1, !dbg [[DBG215:![0-9]+]] -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP24]] to i1, !dbg [[DBG215]] -// CHECK1-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG215]] -// CHECK1-NEXT: store i32 [[CONV]], ptr [[D]], align 4, !dbg [[DBG216:![0-9]+]] -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG217:![0-9]+]] +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1, !dbg [[DBG200:![0-9]+]] +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG200]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG200]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META201:![0-9]+]], metadata !DIExpression()), !dbg [[DBG203:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG204:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG204]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr 
[[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG204]] +// CHECK1-NEXT: store ptr [[ARRAYIDX7]], ptr [[F]], align 8, !dbg [[DBG203]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META205:![0-9]+]], metadata !DIExpression()), !dbg [[DBG206:![0-9]+]] +// CHECK1-NEXT: store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG206]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META207:![0-9]+]], metadata !DIExpression()), !dbg [[DBG208:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 1, !dbg [[DBG209:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 1, !dbg [[DBG209]] +// CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[H]], align 8, !dbg [[DBG208]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META210:![0-9]+]], metadata !DIExpression()), !dbg [[DBG211:![0-9]+]] +// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG211]] +// CHECK1-NEXT: store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG212:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG213:![0-9]+]] +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG214:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG213]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX10]], i64 0, i64 [[IDXPROM]], !dbg [[DBG213]] +// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX11]], align 4, !dbg [[DBG215:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG216:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX12]], i64 0, i64 0, !dbg [[DBG216]] +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg 
[[DBG217:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM14:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG216]] +// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX13]], i64 0, i64 [[IDXPROM14]], !dbg [[DBG216]] +// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX15]], align 4, !dbg [[DBG218:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG219:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX16]], i64 0, i64 0, !dbg [[DBG219]] +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG220:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM18:%.*]] = sext i32 [[TMP21]] to i64, !dbg [[DBG219]] +// CHECK1-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG219]] +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4, !dbg [[DBG219]] +// CHECK1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG221:![0-9]+]] +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG222:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM21:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG221]] +// CHECK1-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX20]], i64 0, i64 [[IDXPROM21]], !dbg [[DBG221]] +// CHECK1-NEXT: store i32 [[TMP22]], ptr [[ARRAYIDX22]], align 4, !dbg [[DBG223:![0-9]+]] +// CHECK1-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP8]], align 1, !dbg [[DBG224:![0-9]+]] +// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP24]] to i1, !dbg [[DBG224]] +// CHECK1-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG224]] +// CHECK1-NEXT: store i32 [[CONV]], ptr [[D]], align 4, !dbg [[DBG225:![0-9]+]] +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG226:![0-9]+]] // CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]], !dbg 
[[DBG190]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG199]] // CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[ADD23:%.*]] = add nsw i32 [[TMP25]], 1, !dbg [[DBG182]] -// CHECK1-NEXT: store i32 [[ADD23]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG182]] -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !dbg [[DBG190]], !llvm.loop [[LOOP218:![0-9]+]] +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[ADD23:%.*]] = add nsw i32 [[TMP25]], 1, !dbg [[DBG191]] +// CHECK1-NEXT: store i32 [[ADD23]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG191]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !dbg [[DBG199]], !llvm.loop [[LOOP227:![0-9]+]] // CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG190]] +// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG199]] // CHECK1: omp.dispatch.inc: -// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[ADD24:%.*]] = add nsw i32 [[TMP26]], [[TMP27]], !dbg [[DBG182]] -// CHECK1-NEXT: store i32 [[ADD24]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG182]] -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[ADD25:%.*]] = add nsw i32 [[TMP28]], [[TMP29]], !dbg [[DBG182]] -// CHECK1-NEXT: store i32 [[ADD25]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG182]] -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]], !dbg [[DBG190]], !llvm.loop [[LOOP220:![0-9]+]] +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[ADD24:%.*]] = add nsw i32 
[[TMP26]], [[TMP27]], !dbg [[DBG191]] +// CHECK1-NEXT: store i32 [[ADD24]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG191]] +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[ADD25:%.*]] = add nsw i32 [[TMP28]], [[TMP29]], !dbg [[DBG191]] +// CHECK1-NEXT: store i32 [[ADD25]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG191]] +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]], !dbg [[DBG199]], !llvm.loop [[LOOP229:![0-9]+]] // CHECK1: omp.dispatch.end: -// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB12:[0-9]+]], i32 [[TMP10]]), !dbg [[DBG219:![0-9]+]] -// CHECK1-NEXT: ret void, !dbg [[DBG221:![0-9]+]] +// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB12:[0-9]+]], i32 [[TMP10]]), !dbg [[DBG228:![0-9]+]] +// CHECK1-NEXT: ret void, !dbg [[DBG230:![0-9]+]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG222:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG231:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -583,68 +616,35 @@ int main() { // CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[BB_ADDR:%.*]] = 
alloca ptr, align 8 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META223:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META232:![0-9]+]], metadata !DIExpression()), !dbg [[DBG233:![0-9]+]] // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META225:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]] -// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META226:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]] -// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META227:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]] -// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META228:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]] -// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META229:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG230:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG230]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG230]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG230]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG230]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg 
[[DBG230]] -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG230]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG230]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG230]] -// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG230]] -// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG230]] -// CHECK1-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG230]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR4]], !dbg [[DBG230]] -// CHECK1-NEXT: ret void, !dbg [[DBG230]] -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27 -// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR6]] !dbg [[DBG231:![0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META234:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META234:![0-9]+]], metadata !DIExpression()), !dbg [[DBG233]] // CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 -// 
CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META236:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META235:![0-9]+]], metadata !DIExpression()), !dbg [[DBG233]] // CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META237:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META236:![0-9]+]], metadata !DIExpression()), !dbg [[DBG233]] // CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META238:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META237:![0-9]+]], metadata !DIExpression()), !dbg [[DBG233]] // CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META239:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG240:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG240]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG240]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG240]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG240]] -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG240]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG240]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG240]] -// CHECK1-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG240]] -// CHECK1-NEXT: [[TMP9:%.*]] = 
addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG240]] -// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG240]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]]) #[[ATTR4]], !dbg [[DBG240]] -// CHECK1-NEXT: ret void, !dbg [[DBG240]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META238:![0-9]+]], metadata !DIExpression()), !dbg [[DBG233]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG239:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG239]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR4]], !dbg [[DBG239]] +// CHECK1-NEXT: ret void, !dbg [[DBG239]] // // // CHECK1-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug__ -// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG241:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG240:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8 @@ -657,53 +657,88 @@ int main() { // CHECK1-NEXT: [[_TMP3:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8 // CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META246:![0-9]+]], metadata !DIExpression()), !dbg [[DBG247:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META245:![0-9]+]], metadata !DIExpression()), !dbg [[DBG246:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META248:![0-9]+]], metadata !DIExpression()), !dbg [[DBG249:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META247:![0-9]+]], metadata !DIExpression()), !dbg [[DBG248:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META250:![0-9]+]], metadata !DIExpression()), !dbg [[DBG251:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata 
[[META249:![0-9]+]], metadata !DIExpression()), !dbg [[DBG250:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META252:![0-9]+]], metadata !DIExpression()), !dbg [[DBG253:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META251:![0-9]+]], metadata !DIExpression()), !dbg [[DBG252:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META254:![0-9]+]], metadata !DIExpression()), !dbg [[DBG255:![0-9]+]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG256:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG256]] -// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG256]] -// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG256]] -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG256]] -// CHECK1-NEXT: store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg 
[[DBG256]] -// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG256]] -// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP12]], -1, !dbg [[DBG256]] -// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG256]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META253:![0-9]+]], metadata !DIExpression()), !dbg [[DBG254:![0-9]+]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG255:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG255]] +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG255]] +// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG255]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG255]] +// CHECK1-NEXT: store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr 
[[_TMP3]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG255]] +// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP12]], -1, !dbg [[DBG255]] +// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG255]] // CHECK1: user_code.entry: -// CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB20:[0-9]+]]) -// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG257:![0-9]+]] -// CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP14]], align 8, !dbg [[DBG257]] -// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG257]] -// CHECK1-NEXT: store ptr [[TMP5]], ptr [[TMP15]], align 8, !dbg [[DBG257]] -// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG257]] -// CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP16]], align 8, !dbg [[DBG257]] -// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG257]] -// CHECK1-NEXT: store ptr [[TMP11]], ptr [[TMP17]], align 8, !dbg [[DBG257]] -// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB20]], i32 [[TMP13]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG257]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(), !dbg [[DBG258:![0-9]+]] +// CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB20:[0-9]+]]), !dbg [[DBG256:![0-9]+]] +// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG258:![0-9]+]] +// CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP14]], align 8, !dbg [[DBG258]] +// 
CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG258]] +// CHECK1-NEXT: store ptr [[TMP5]], ptr [[TMP15]], align 8, !dbg [[DBG258]] +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG258]] +// CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP16]], align 8, !dbg [[DBG258]] +// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG258]] +// CHECK1-NEXT: store ptr [[TMP11]], ptr [[TMP17]], align 8, !dbg [[DBG258]] +// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB20]], i32 [[TMP13]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG258]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(), !dbg [[DBG259:![0-9]+]] // CHECK1-NEXT: ret void, !dbg [[DBG260:![0-9]+]] // CHECK1: worker.exit: -// CHECK1-NEXT: ret void, !dbg [[DBG256]] +// CHECK1-NEXT: ret void, !dbg [[DBG255]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41 +// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG261:![0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META264:![0-9]+]], metadata 
!DIExpression()), !dbg [[DBG265:![0-9]+]] +// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META266:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] +// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META267:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] +// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META268:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] +// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META269:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG270:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP12:%.*]] = addrspacecast ptr [[TMP8]] to ptr 
addrspace(1), !dbg [[DBG270]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug__(ptr [[TMP4]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]]) #[[ATTR4]], !dbg [[DBG270]] +// CHECK1-NEXT: ret void, !dbg [[DBG270]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined_debug__ -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG261:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG271:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -727,148 +762,148 @@ int main() { // CHECK1-NEXT: [[H:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[D:%.*]] = alloca i32, align 4 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META264:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META274:![0-9]+]], metadata !DIExpression()), !dbg [[DBG275:![0-9]+]] // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META266:![0-9]+]], metadata !DIExpression()), !dbg 
[[DBG265]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META276:![0-9]+]], metadata !DIExpression()), !dbg [[DBG275]] // CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META267:![0-9]+]], metadata !DIExpression()), !dbg [[DBG268:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META277:![0-9]+]], metadata !DIExpression()), !dbg [[DBG278:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META269:![0-9]+]], metadata !DIExpression()), !dbg [[DBG270:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META279:![0-9]+]], metadata !DIExpression()), !dbg [[DBG280:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META271:![0-9]+]], metadata !DIExpression()), !dbg [[DBG272:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META281:![0-9]+]], metadata !DIExpression()), !dbg [[DBG282:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META273:![0-9]+]], metadata !DIExpression()), !dbg [[DBG274:![0-9]+]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG275:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG275]] -// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: 
[[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG275]] -// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG275]] -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG275]] -// CHECK1-NEXT: store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META276:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META277:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG278:![0-9]+]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META279:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] -// CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META280:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] -// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META281:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG278]] -// 
CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META282:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] -// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !dbg [[DBG275]] -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB17:[0-9]+]], i32 [[TMP13]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG283:![0-9]+]] -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG275]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META283:![0-9]+]], metadata !DIExpression()), !dbg [[DBG284:![0-9]+]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG285:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG285]] +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG285]] +// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG285]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, 
!dbg [[DBG285]] +// CHECK1-NEXT: store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META286:![0-9]+]], metadata !DIExpression()), !dbg [[DBG275]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META287:![0-9]+]], metadata !DIExpression()), !dbg [[DBG275]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG288:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META289:![0-9]+]], metadata !DIExpression()), !dbg [[DBG275]] +// CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META290:![0-9]+]], metadata !DIExpression()), !dbg [[DBG275]] +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META291:![0-9]+]], metadata !DIExpression()), !dbg [[DBG275]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META292:![0-9]+]], metadata !DIExpression()), !dbg [[DBG275]] +// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !dbg [[DBG285]] +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB17:[0-9]+]], i32 [[TMP13]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG293:![0-9]+]] +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG285]] // CHECK1: omp.dispatch.cond: -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]] -// 
CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP14]], 9, !dbg [[DBG278]] -// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG278]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP14]], 9, !dbg [[DBG288]] +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG288]] // CHECK1: cond.true: -// CHECK1-NEXT: br label [[COND_END:%.*]], !dbg [[DBG278]] +// CHECK1-NEXT: br label [[COND_END:%.*]], !dbg [[DBG288]] // CHECK1: cond.false: -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: br label [[COND_END]], !dbg [[DBG278]] +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: br label [[COND_END]], !dbg [[DBG288]] // CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ], !dbg [[DBG278]] -// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: store i32 [[TMP16]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]], !dbg [[DBG275]] -// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG275]] +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ], !dbg [[DBG288]] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: store i32 [[TMP16]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]] +// 
CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]], !dbg [[DBG285]] +// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG285]] // CHECK1: omp.dispatch.body: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG275]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG285]] // CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP19]], [[TMP20]], !dbg [[DBG275]] -// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG275]] +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP19]], [[TMP20]], !dbg [[DBG285]] +// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG285]] // CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP21]], 1, !dbg [[DBG284:![0-9]+]] -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG284]] -// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG284]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META285:![0-9]+]], metadata !DIExpression()), !dbg [[DBG287:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG288:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds 
[10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG288]] -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG288]] -// CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[DBG287]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META289:![0-9]+]], metadata !DIExpression()), !dbg [[DBG290:![0-9]+]] -// CHECK1-NEXT: store ptr [[TMP5]], ptr [[G]], align 8, !dbg [[DBG290]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META291:![0-9]+]], metadata !DIExpression()), !dbg [[DBG292:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 1, !dbg [[DBG293:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG293]] -// CHECK1-NEXT: store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[DBG292]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META294:![0-9]+]], metadata !DIExpression()), !dbg [[DBG295:![0-9]+]] -// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG295]] -// CHECK1-NEXT: store i32 5, ptr [[TMP5]], align 4, !dbg [[DBG296:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG297:![0-9]+]] -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG298:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG297]] -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG297]] -// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG299:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG300:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr 
[[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG300]] -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG301:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM15:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG300]] -// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG300]] -// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG302:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG303:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG303]] -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG304:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP24]] to i64, !dbg [[DBG303]] -// CHECK1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG303]] -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG303]] -// CHECK1-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG305:![0-9]+]] -// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG306:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP26]] to i64, !dbg [[DBG305]] -// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG305]] -// CHECK1-NEXT: store i32 [[TMP25]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG307:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG308:![0-9]+]] -// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG309:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM25:%.*]] = sext i32 [[TMP27]] to i64, !dbg [[DBG308]] -// CHECK1-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], 
ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG308]] -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG308]] -// CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP28]], 0, !dbg [[DBG308]] -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG310:![0-9]+]] -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[TMP11]], align 1, !dbg [[DBG310]] -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG311:![0-9]+]] +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP21]], 1, !dbg [[DBG294:![0-9]+]] +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG294]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG294]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META295:![0-9]+]], metadata !DIExpression()), !dbg [[DBG297:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG298:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG298]] +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG298]] +// CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[DBG297]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META299:![0-9]+]], metadata !DIExpression()), !dbg [[DBG300:![0-9]+]] +// CHECK1-NEXT: store ptr [[TMP5]], ptr [[G]], align 8, !dbg [[DBG300]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META301:![0-9]+]], metadata !DIExpression()), !dbg [[DBG302:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 1, !dbg [[DBG303:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, 
i64 1, !dbg [[DBG303]] +// CHECK1-NEXT: store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[DBG302]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META304:![0-9]+]], metadata !DIExpression()), !dbg [[DBG305:![0-9]+]] +// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG305]] +// CHECK1-NEXT: store i32 5, ptr [[TMP5]], align 4, !dbg [[DBG306:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG307:![0-9]+]] +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG308:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG307]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG307]] +// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG309:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG310:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG310]] +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG311:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM15:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG310]] +// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG310]] +// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG312:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG313:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG313]] +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG314:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP24]] to i64, !dbg 
[[DBG313]] +// CHECK1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG313]] +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG313]] +// CHECK1-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG315:![0-9]+]] +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG316:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP26]] to i64, !dbg [[DBG315]] +// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG315]] +// CHECK1-NEXT: store i32 [[TMP25]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG317:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG318:![0-9]+]] +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG319:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM25:%.*]] = sext i32 [[TMP27]] to i64, !dbg [[DBG318]] +// CHECK1-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG318]] +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG318]] +// CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP28]], 0, !dbg [[DBG318]] +// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG320:![0-9]+]] +// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[TMP11]], align 1, !dbg [[DBG320]] +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG321:![0-9]+]] // CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG283]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG293]] // CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[ADD27:%.*]] = add nsw i32 [[TMP29]], 1, !dbg [[DBG275]] -// CHECK1-NEXT: store i32 
[[ADD27]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG275]] -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !dbg [[DBG283]], !llvm.loop [[LOOP312:![0-9]+]] +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[ADD27:%.*]] = add nsw i32 [[TMP29]], 1, !dbg [[DBG285]] +// CHECK1-NEXT: store i32 [[ADD27]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG285]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !dbg [[DBG293]], !llvm.loop [[LOOP322:![0-9]+]] // CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG283]] +// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG293]] // CHECK1: omp.dispatch.inc: -// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[ADD28:%.*]] = add nsw i32 [[TMP30]], [[TMP31]], !dbg [[DBG275]] -// CHECK1-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP32]], [[TMP33]], !dbg [[DBG275]] -// CHECK1-NEXT: store i32 [[ADD29]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG275]] -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]], !dbg [[DBG283]], !llvm.loop [[LOOP314:![0-9]+]] +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[ADD28:%.*]] = add nsw i32 [[TMP30]], [[TMP31]], !dbg [[DBG285]] +// CHECK1-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG288]] +// 
CHECK1-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP32]], [[TMP33]], !dbg [[DBG285]] +// CHECK1-NEXT: store i32 [[ADD29]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG285]] +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]], !dbg [[DBG293]], !llvm.loop [[LOOP324:![0-9]+]] // CHECK1: omp.dispatch.end: -// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB19:[0-9]+]], i32 [[TMP13]]), !dbg [[DBG313:![0-9]+]] -// CHECK1-NEXT: ret void, !dbg [[DBG315:![0-9]+]] +// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB19:[0-9]+]], i32 [[TMP13]]), !dbg [[DBG323:![0-9]+]] +// CHECK1-NEXT: ret void, !dbg [[DBG325:![0-9]+]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG316:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG326:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -877,66 +912,31 @@ int main() { // CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META319:![0-9]+]], metadata 
!DIExpression()), !dbg [[DBG320:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META329:![0-9]+]], metadata !DIExpression()), !dbg [[DBG330:![0-9]+]] // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META321:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]] -// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META322:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]] -// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META323:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]] -// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META324:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]] -// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META325:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG326:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr 
[[B_ADDR]], align 8, !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP12:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP13:%.*]] = addrspacecast ptr [[TMP9]] to ptr addrspace(1), !dbg [[DBG326]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined_debug__(ptr [[TMP4]], ptr [[TMP5]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]], ptr addrspace(1) [[TMP13]]) #[[ATTR4]], !dbg [[DBG326]] -// CHECK1-NEXT: ret void, !dbg [[DBG326]] -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41 -// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR6]] !dbg [[DBG327:![0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META330:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META331:![0-9]+]], metadata !DIExpression()), !dbg [[DBG330]] // CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 -// 
CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META332:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META332:![0-9]+]], metadata !DIExpression()), !dbg [[DBG330]] // CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META333:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META333:![0-9]+]], metadata !DIExpression()), !dbg [[DBG330]] // CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META334:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META334:![0-9]+]], metadata !DIExpression()), !dbg [[DBG330]] // CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META335:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META335:![0-9]+]], metadata !DIExpression()), !dbg [[DBG330]] // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG336:![0-9]+]] // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG336]] // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG336]] // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG336]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG336]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG336]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG336]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg 
[[DBG336]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG336]] -// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG336]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG336]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG336]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG336]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG336]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG336]] +// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG336]] // CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG336]] // CHECK1-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG336]] // CHECK1-NEXT: [[TMP12:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG336]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug__(ptr [[TMP4]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]]) #[[ATTR4]], !dbg [[DBG336]] +// CHECK1-NEXT: [[TMP13:%.*]] = addrspacecast ptr [[TMP9]] to ptr addrspace(1), !dbg [[DBG336]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined_debug__(ptr [[TMP4]], ptr [[TMP5]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]], ptr addrspace(1) [[TMP13]]) #[[ATTR4]], !dbg [[DBG336]] // CHECK1-NEXT: ret void, !dbg [[DBG336]] // diff --git a/clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp b/clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp index 5a34dab313419..fd59743fdb07d 100644 --- a/clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp +++ b/clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp @@ -55,7 +55,7 @@ int 
main() { return 0; } // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__ -// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]], i1 noundef zeroext [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG22:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]], i1 noundef zeroext [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1:[0-9]+]] !dbg [[DBG19:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8 @@ -69,58 +69,95 @@ int main() { // CHECK1-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8 // CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META43:![0-9]+]], metadata !DIExpression()), !dbg [[DBG44:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META40:![0-9]+]], metadata !DIExpression()), !dbg [[DBG41:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META45:![0-9]+]], metadata !DIExpression()), !dbg [[DBG46:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META42:![0-9]+]], metadata !DIExpression()), !dbg [[DBG43:![0-9]+]] // CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META47:![0-9]+]], metadata !DIExpression()), !dbg [[DBG48:![0-9]+]] +// CHECK1-NEXT: tail call void 
@llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META44:![0-9]+]], metadata !DIExpression()), !dbg [[DBG45:![0-9]+]] // CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META49:![0-9]+]], metadata !DIExpression()), !dbg [[DBG50:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META46:![0-9]+]], metadata !DIExpression()), !dbg [[DBG47:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META51:![0-9]+]], metadata !DIExpression()), !dbg [[DBG52:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META48:![0-9]+]], metadata !DIExpression()), !dbg [[DBG49:![0-9]+]] // CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[DOTCAPTURE_EXPR_]] to i8 // CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTCAPTURE_EXPR__ADDR]], metadata [[META53:![0-9]+]], metadata !DIExpression()), !dbg [[DBG54:![0-9]+]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG55:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG55]] -// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG55]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG55]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG55]] -// CHECK1-NEXT: store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG55]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG55]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG55]] -// CHECK1-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG55]] -// CHECK1-NEXT: store ptr [[TMP6]], ptr 
[[_TMP2]], align 8, !dbg [[DBG55]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG55]] -// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG55]] -// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP8]], -1, !dbg [[DBG55]] -// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG55]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTCAPTURE_EXPR__ADDR]], metadata [[META50:![0-9]+]], metadata !DIExpression()), !dbg [[DBG51:![0-9]+]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG52:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG52]] +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG52]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG52]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG52]] +// CHECK1-NEXT: store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG52]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG52]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG52]] +// CHECK1-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG52]] +// CHECK1-NEXT: store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG52]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG52]] +// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG52]] +// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP8]], -1, !dbg [[DBG52]] +// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG52]] // CHECK1: user_code.entry: -// 
CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB6:[0-9]+]]) -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG56:![0-9]+]] -// CHECK1-NEXT: store i32 [[TMP10]], ptr [[A_CASTED]], align 4, !dbg [[DBG56]] -// CHECK1-NEXT: [[TMP11:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG56]] -// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG56]] -// CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP12]], align 8, !dbg [[DBG56]] -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG56]] -// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG56]] -// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP13]], align 8, !dbg [[DBG56]] -// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG56]] -// CHECK1-NEXT: store ptr [[TMP4]], ptr [[TMP15]], align 8, !dbg [[DBG56]] -// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG56]] -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[TMP16]], align 8, !dbg [[DBG56]] -// CHECK1-NEXT: [[TMP17:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG57:![0-9]+]] -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP17]] to i1, !dbg [[DBG57]] -// CHECK1-NEXT: [[TMP18:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG56]] -// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB6]], i32 [[TMP9]], i32 [[TMP18]], i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG56]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(), !dbg [[DBG59:![0-9]+]] -// CHECK1-NEXT: ret void, !dbg [[DBG60:![0-9]+]] +// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB6:[0-9]+]]), !dbg [[DBG53:![0-9]+]] +// CHECK1-NEXT: [[TMP10:%.*]] = 
load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG55:![0-9]+]] +// CHECK1-NEXT: store i32 [[TMP10]], ptr [[A_CASTED]], align 4, !dbg [[DBG55]] +// CHECK1-NEXT: [[TMP11:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG55]] +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG55]] +// CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP12]], align 8, !dbg [[DBG55]] +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG55]] +// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG55]] +// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP13]], align 8, !dbg [[DBG55]] +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG55]] +// CHECK1-NEXT: store ptr [[TMP4]], ptr [[TMP15]], align 8, !dbg [[DBG55]] +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG55]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[TMP16]], align 8, !dbg [[DBG55]] +// CHECK1-NEXT: [[TMP17:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG56:![0-9]+]] +// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP17]] to i1, !dbg [[DBG56]] +// CHECK1-NEXT: [[TMP18:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG55]] +// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB6]], i32 [[TMP9]], i32 [[TMP18]], i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG55]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(), !dbg [[DBG57:![0-9]+]] +// CHECK1-NEXT: ret void, !dbg [[DBG58:![0-9]+]] // CHECK1: worker.exit: -// CHECK1-NEXT: ret void, !dbg [[DBG55]] +// CHECK1-NEXT: ret void, !dbg [[DBG52]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13 +// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr 
noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2:[0-9]+]] !dbg [[DBG59:![0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META66:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67:![0-9]+]] +// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META68:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]] +// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META69:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]] +// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META70:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]] +// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]] +// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTCAPTURE_EXPR__ADDR]], metadata [[META72:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr 
[[C_ADDR]], align 8, !dbg [[DBG73:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG73]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG73]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG73]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG73]] +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG73]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG73]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG73]] +// CHECK1-NEXT: [[TMP8:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG73]] +// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP8]] to i1, !dbg [[DBG73]] +// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG73]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG73]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP9]], i32 [[TMP5]], ptr [[TMP6]], ptr addrspace(1) [[TMP10]], i1 [[TOBOOL]]) #[[ATTR4:[0-9]+]], !dbg [[DBG73]] +// CHECK1-NEXT: ret void, !dbg [[DBG73]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__ -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2:[0-9]+]] !dbg [[DBG61:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG74:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -144,149 +181,149 @@ int main() { // CHECK1-NEXT: [[H:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[D:%.*]] = alloca i32, align 4 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META68:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META81:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82:![0-9]+]] // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META70:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META83:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82]] // CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META84:![0-9]+]], metadata !DIExpression()), !dbg [[DBG85:![0-9]+]] // CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META73:![0-9]+]], metadata !DIExpression()), !dbg [[DBG74:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META86:![0-9]+]], metadata !DIExpression()), !dbg [[DBG87:![0-9]+]] // CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META75:![0-9]+]], metadata !DIExpression()), !dbg [[DBG76:![0-9]+]] +// 
CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META88:![0-9]+]], metadata !DIExpression()), !dbg [[DBG89:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META77:![0-9]+]], metadata !DIExpression()), !dbg [[DBG78:![0-9]+]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG79:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG79]] -// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG79]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG79]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG79]] -// CHECK1-NEXT: store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG79]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG79]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG79]] -// CHECK1-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG79]] -// CHECK1-NEXT: store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG79]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG79]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META80:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META81:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG82:![0-9]+]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META83:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]] -// CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata 
[[META84:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]] -// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META85:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B4]], metadata [[META86:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]] -// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B4]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG79]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META87:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG79]] -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !dbg [[DBG79]] -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP9]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG88:![0-9]+]] -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG79]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META90:![0-9]+]], metadata !DIExpression()), !dbg [[DBG91:![0-9]+]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG92:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG92]] +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG92]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG92]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG92]] +// CHECK1-NEXT: store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG92]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG92]] +// CHECK1-NEXT: 
[[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG92]] +// CHECK1-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG92]] +// CHECK1-NEXT: store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG92]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG92]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META93:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META94:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG95:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META96:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82]] +// CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META97:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82]] +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META98:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B4]], metadata [[META99:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82]] +// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B4]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG92]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META100:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG92]] +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !dbg [[DBG92]] +// CHECK1-NEXT: 
call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP9]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG101:![0-9]+]] +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG92]] // CHECK1: omp.dispatch.cond: -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP10]], 9, !dbg [[DBG82]] -// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG82]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP10]], 9, !dbg [[DBG95]] +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG95]] // CHECK1: cond.true: -// CHECK1-NEXT: br label [[COND_END:%.*]], !dbg [[DBG82]] +// CHECK1-NEXT: br label [[COND_END:%.*]], !dbg [[DBG95]] // CHECK1: cond.false: -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: br label [[COND_END]], !dbg [[DBG82]] +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: br label [[COND_END]], !dbg [[DBG95]] // CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ], !dbg [[DBG82]] -// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]], !dbg [[DBG79]] -// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg 
[[DBG79]] +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ], !dbg [[DBG95]] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]], !dbg [[DBG92]] +// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG92]] // CHECK1: omp.dispatch.body: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG79]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG92]] // CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]], !dbg [[DBG79]] -// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG79]] +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]], !dbg [[DBG92]] +// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG92]] // CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP17]], 1, !dbg [[DBG89:![0-9]+]] -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG89]] -// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !dbg 
[[DBG89]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META90:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG94:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG94]] -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG94]] -// CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[DBG93]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META95:![0-9]+]], metadata !DIExpression()), !dbg [[DBG96:![0-9]+]] -// CHECK1-NEXT: store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG96]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META97:![0-9]+]], metadata !DIExpression()), !dbg [[DBG98:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 1, !dbg [[DBG99:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG99]] -// CHECK1-NEXT: store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[DBG98]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META100:![0-9]+]], metadata !DIExpression()), !dbg [[DBG101:![0-9]+]] -// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG101]] -// CHECK1-NEXT: store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG102:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG103:![0-9]+]] -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG104:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP18]] to i64, !dbg [[DBG103]] -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 
[[IDXPROM]], !dbg [[DBG103]] -// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG105:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG106:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG106]] -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG107:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM15:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG106]] -// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG106]] -// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG108:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG109:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG109]] -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG110:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG109]] -// CHECK1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG109]] -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG109]] -// CHECK1-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG111:![0-9]+]] -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG112:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG111]] -// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG111]] -// CHECK1-NEXT: store i32 [[TMP21]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG113:![0-9]+]] -// CHECK1-NEXT: 
[[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG114:![0-9]+]] -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG115:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM25:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG114]] -// CHECK1-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG114]] -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG114]] -// CHECK1-NEXT: [[TMP25:%.*]] = load i8, ptr [[TMP7]], align 1, !dbg [[DBG116:![0-9]+]] -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP25]] to i1, !dbg [[DBG116]] -// CHECK1-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG116]] -// CHECK1-NEXT: [[OR:%.*]] = or i32 [[CONV]], [[TMP24]], !dbg [[DBG116]] -// CHECK1-NEXT: [[TOBOOL27:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG116]] -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL27]] to i8, !dbg [[DBG116]] -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[TMP7]], align 1, !dbg [[DBG116]] -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG117:![0-9]+]] +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP17]], 1, !dbg [[DBG102:![0-9]+]] +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG102]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG102]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META103:![0-9]+]], metadata !DIExpression()), !dbg [[DBG106:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG107:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG107]] +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG107]] +// CHECK1-NEXT: 
store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[DBG106]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META108:![0-9]+]], metadata !DIExpression()), !dbg [[DBG109:![0-9]+]] +// CHECK1-NEXT: store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG109]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META110:![0-9]+]], metadata !DIExpression()), !dbg [[DBG111:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 1, !dbg [[DBG112:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG112]] +// CHECK1-NEXT: store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[DBG111]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META113:![0-9]+]], metadata !DIExpression()), !dbg [[DBG114:![0-9]+]] +// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG114]] +// CHECK1-NEXT: store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG115:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG116:![0-9]+]] +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG117:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP18]] to i64, !dbg [[DBG116]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG116]] +// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG118:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG119:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG119]] +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG120:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM15:%.*]] = sext i32 [[TMP19]] 
to i64, !dbg [[DBG119]] +// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG119]] +// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG121:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG122:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG122]] +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG123:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG122]] +// CHECK1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG122]] +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG122]] +// CHECK1-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG124:![0-9]+]] +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG125:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG124]] +// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG124]] +// CHECK1-NEXT: store i32 [[TMP21]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG126:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG127:![0-9]+]] +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG128:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM25:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG127]] +// CHECK1-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG127]] +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG127]] +// CHECK1-NEXT: [[TMP25:%.*]] = load 
i8, ptr [[TMP7]], align 1, !dbg [[DBG129:![0-9]+]] +// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP25]] to i1, !dbg [[DBG129]] +// CHECK1-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG129]] +// CHECK1-NEXT: [[OR:%.*]] = or i32 [[CONV]], [[TMP24]], !dbg [[DBG129]] +// CHECK1-NEXT: [[TOBOOL27:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG129]] +// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL27]] to i8, !dbg [[DBG129]] +// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[TMP7]], align 1, !dbg [[DBG129]] +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG130:![0-9]+]] // CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG88]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG101]] // CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[ADD28:%.*]] = add nsw i32 [[TMP26]], 1, !dbg [[DBG79]] -// CHECK1-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG79]] -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !dbg [[DBG88]], !llvm.loop [[LOOP118:![0-9]+]] +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[ADD28:%.*]] = add nsw i32 [[TMP26]], 1, !dbg [[DBG92]] +// CHECK1-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG92]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !dbg [[DBG101]], !llvm.loop [[LOOP131:![0-9]+]] // CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG88]] +// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG101]] // CHECK1: omp.dispatch.inc: -// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP27]], [[TMP28]], !dbg [[DBG79]] -// CHECK1-NEXT: store i32 [[ADD29]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG79]] -// 
CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG82]] -// CHECK1-NEXT: [[ADD30:%.*]] = add nsw i32 [[TMP29]], [[TMP30]], !dbg [[DBG79]] -// CHECK1-NEXT: store i32 [[ADD30]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG79]] -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]], !dbg [[DBG88]], !llvm.loop [[LOOP120:![0-9]+]] +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP27]], [[TMP28]], !dbg [[DBG92]] +// CHECK1-NEXT: store i32 [[ADD29]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG92]] +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG95]] +// CHECK1-NEXT: [[ADD30:%.*]] = add nsw i32 [[TMP29]], [[TMP30]], !dbg [[DBG92]] +// CHECK1-NEXT: store i32 [[ADD30]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG92]] +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]], !dbg [[DBG101]], !llvm.loop [[LOOP133:![0-9]+]] // CHECK1: omp.dispatch.end: -// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB5:[0-9]+]], i32 [[TMP9]]), !dbg [[DBG119:![0-9]+]] -// CHECK1-NEXT: ret void, !dbg [[DBG121:![0-9]+]] +// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB5:[0-9]+]], i32 [[TMP9]]), !dbg [[DBG132:![0-9]+]] +// CHECK1-NEXT: ret void, !dbg [[DBG134:![0-9]+]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg 
[[DBG122:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG135:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -295,71 +332,34 @@ int main() { // CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META129:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META138:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139:![0-9]+]] // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META131:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] -// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META132:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] -// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] -// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META134:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] -// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call 
void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META135:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG136:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG136]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG136]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG136]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG136]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG136]] -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG136]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG136]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG136]] -// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG136]] -// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG136]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr [[TMP7]], ptr addrspace(1) [[TMP10]]) #[[ATTR4:[0-9]+]], !dbg [[DBG136]] -// CHECK1-NEXT: ret void, !dbg [[DBG136]] -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13 -// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR6:[0-9]+]] !dbg [[DBG137:![0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = 
alloca i64, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META140:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META140:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]] // CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META142:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META141:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]] // CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META143:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META142:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]] // CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META143:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]] // CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META145:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]] -// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr 
[[DOTCAPTURE_EXPR__ADDR]], metadata [[META146:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG147:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG147]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG147]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG147]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG147]] -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG147]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG147]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG147]] -// CHECK1-NEXT: [[TMP8:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG147]] -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP8]] to i1, !dbg [[DBG147]] -// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG147]] -// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG147]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP9]], i32 [[TMP5]], ptr [[TMP6]], ptr addrspace(1) [[TMP10]], i1 [[TOBOOL]]) #[[ATTR4]], !dbg [[DBG147]] -// CHECK1-NEXT: ret void, !dbg [[DBG147]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG145:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG145]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG145]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG145]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, 
!dbg [[DBG145]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG145]] +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG145]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG145]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG145]] +// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG145]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG145]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr [[TMP7]], ptr addrspace(1) [[TMP10]]) #[[ATTR4]], !dbg [[DBG145]] +// CHECK1-NEXT: ret void, !dbg [[DBG145]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__ -// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG148:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG146:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8 @@ -372,32 +372,32 @@ int main() { // CHECK1-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8 // CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META153:![0-9]+]], metadata !DIExpression()), !dbg [[DBG154:![0-9]+]] +// CHECK1-NEXT: tail call void 
@llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META151:![0-9]+]], metadata !DIExpression()), !dbg [[DBG152:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META155:![0-9]+]], metadata !DIExpression()), !dbg [[DBG156:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META153:![0-9]+]], metadata !DIExpression()), !dbg [[DBG154:![0-9]+]] // CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META157:![0-9]+]], metadata !DIExpression()), !dbg [[DBG158:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META155:![0-9]+]], metadata !DIExpression()), !dbg [[DBG156:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META159:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META157:![0-9]+]], metadata !DIExpression()), !dbg [[DBG158:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META161:![0-9]+]], metadata !DIExpression()), !dbg [[DBG162:![0-9]+]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG163:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG163]] -// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG163]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG163]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG163]] -// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to 
ptr, !dbg [[DBG163]] -// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG163]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG163]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG163]] -// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG163]] -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG163]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG163]] -// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG163]] -// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP9]], -1, !dbg [[DBG163]] -// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG163]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META159:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160:![0-9]+]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG161:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG161]] +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG161]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG161]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG161]] +// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG161]] +// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG161]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG161]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG161]] +// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG161]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr 
[[_TMP2]], align 8, !dbg [[DBG161]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG161]] +// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG161]] +// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP9]], -1, !dbg [[DBG161]] +// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG161]] // CHECK1: user_code.entry: -// CHECK1-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB13:[0-9]+]]) +// CHECK1-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB13:[0-9]+]]), !dbg [[DBG162:![0-9]+]] // CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG164:![0-9]+]] // CHECK1-NEXT: store i32 [[TMP11]], ptr [[A_CASTED]], align 4, !dbg [[DBG164]] // CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG164]] @@ -412,13 +412,46 @@ int main() { // CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP17]], align 8, !dbg [[DBG164]] // CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB13]], i32 [[TMP10]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG164]] // CHECK1-NEXT: call void @__kmpc_target_deinit(), !dbg [[DBG165:![0-9]+]] -// CHECK1-NEXT: ret void, !dbg [[DBG167:![0-9]+]] +// CHECK1-NEXT: ret void, !dbg [[DBG166:![0-9]+]] // CHECK1: worker.exit: -// CHECK1-NEXT: ret void, !dbg [[DBG163]] +// CHECK1-NEXT: ret void, !dbg [[DBG161]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27 +// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg 
[[DBG167:![0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META170:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171:![0-9]+]] +// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META172:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]] +// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META173:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]] +// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META174:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]] +// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META175:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG176:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG176]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG176]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG176]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG176]] +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG176]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG176]] +// CHECK1-NEXT: 
[[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG176]] +// CHECK1-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG176]] +// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG176]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG176]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]]) #[[ATTR4]], !dbg [[DBG176]] +// CHECK1-NEXT: ret void, !dbg [[DBG176]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__ -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG168:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG177:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -441,140 +474,140 @@ int main() { // CHECK1-NEXT: [[H:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[D:%.*]] = alloca i32, align 4 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META171:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META180:![0-9]+]], metadata 
!DIExpression()), !dbg [[DBG181:![0-9]+]] // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META173:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META182:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181]] // CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META174:![0-9]+]], metadata !DIExpression()), !dbg [[DBG175:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META183:![0-9]+]], metadata !DIExpression()), !dbg [[DBG184:![0-9]+]] // CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META176:![0-9]+]], metadata !DIExpression()), !dbg [[DBG177:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META185:![0-9]+]], metadata !DIExpression()), !dbg [[DBG186:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META178:![0-9]+]], metadata !DIExpression()), !dbg [[DBG179:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META187:![0-9]+]], metadata !DIExpression()), !dbg [[DBG188:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META180:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181:![0-9]+]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG182:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG182]] -// CHECK1-NEXT: 
store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG182]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG182]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG182]] -// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG182]] -// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG182]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG182]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG182]] -// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG182]] -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG182]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG182]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META183:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META184:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG185:![0-9]+]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META186:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]] -// CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META187:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]] -// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META188:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META189:![0-9]+]], metadata 
!DIExpression()), !dbg [[DBG172]] -// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG182]] -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !dbg [[DBG182]] -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB10:[0-9]+]], i32 [[TMP10]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG190:![0-9]+]] -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG182]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META189:![0-9]+]], metadata !DIExpression()), !dbg [[DBG190:![0-9]+]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG191:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG191]] +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG191]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG191]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG191]] +// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG191]] +// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG191]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG191]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG191]] +// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG191]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG191]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG191]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META192:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META193:![0-9]+]], metadata 
!DIExpression()), !dbg [[DBG181]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG194:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META195:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181]] +// CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META196:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181]] +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META197:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META198:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181]] +// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG191]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !dbg [[DBG191]] +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB10:[0-9]+]], i32 [[TMP10]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG199:![0-9]+]] +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG191]] // CHECK1: omp.dispatch.cond: -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP11]], 9, !dbg [[DBG185]] -// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG185]] +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP11]], 9, !dbg [[DBG194]] +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG194]] // CHECK1: cond.true: -// 
CHECK1-NEXT: br label [[COND_END:%.*]], !dbg [[DBG185]] +// CHECK1-NEXT: br label [[COND_END:%.*]], !dbg [[DBG194]] // CHECK1: cond.false: -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: br label [[COND_END]], !dbg [[DBG185]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: br label [[COND_END]], !dbg [[DBG194]] // CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ], !dbg [[DBG185]] -// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[CMP4:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]], !dbg [[DBG182]] -// CHECK1-NEXT: br i1 [[CMP4]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG182]] +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ], !dbg [[DBG194]] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[CMP4:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]], !dbg [[DBG191]] +// CHECK1-NEXT: br i1 [[CMP4]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG191]] // CHECK1: omp.dispatch.body: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]], !dbg 
[[DBG182]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG191]] // CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]], !dbg [[DBG182]] -// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG182]] +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]], !dbg [[DBG191]] +// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG191]] // CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1, !dbg [[DBG191:![0-9]+]] -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG191]] -// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG191]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META192:![0-9]+]], metadata !DIExpression()), !dbg [[DBG194:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG195:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG195]] -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG195]] -// CHECK1-NEXT: store ptr [[ARRAYIDX7]], ptr [[F]], align 8, !dbg [[DBG194]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META196:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197:![0-9]+]] -// CHECK1-NEXT: store ptr [[A_ADDR]], ptr 
[[G]], align 8, !dbg [[DBG197]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META198:![0-9]+]], metadata !DIExpression()), !dbg [[DBG199:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 1, !dbg [[DBG200:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 1, !dbg [[DBG200]] -// CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[H]], align 8, !dbg [[DBG199]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META201:![0-9]+]], metadata !DIExpression()), !dbg [[DBG202:![0-9]+]] -// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG202]] -// CHECK1-NEXT: store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG203:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG204:![0-9]+]] -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG205:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG204]] -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX10]], i64 0, i64 [[IDXPROM]], !dbg [[DBG204]] -// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX11]], align 4, !dbg [[DBG206:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG207:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX12]], i64 0, i64 0, !dbg [[DBG207]] -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG208:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM14:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG207]] -// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX13]], i64 0, i64 [[IDXPROM14]], !dbg [[DBG207]] -// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX15]], align 4, !dbg [[DBG209:![0-9]+]] -// CHECK1-NEXT: 
[[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG210:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX16]], i64 0, i64 0, !dbg [[DBG210]] -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG211:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM18:%.*]] = sext i32 [[TMP21]] to i64, !dbg [[DBG210]] -// CHECK1-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG210]] -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4, !dbg [[DBG210]] -// CHECK1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG212:![0-9]+]] -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG213:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM21:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG212]] -// CHECK1-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX20]], i64 0, i64 [[IDXPROM21]], !dbg [[DBG212]] -// CHECK1-NEXT: store i32 [[TMP22]], ptr [[ARRAYIDX22]], align 4, !dbg [[DBG214:![0-9]+]] -// CHECK1-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP8]], align 1, !dbg [[DBG215:![0-9]+]] -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP24]] to i1, !dbg [[DBG215]] -// CHECK1-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG215]] -// CHECK1-NEXT: store i32 [[CONV]], ptr [[D]], align 4, !dbg [[DBG216:![0-9]+]] -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG217:![0-9]+]] +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1, !dbg [[DBG200:![0-9]+]] +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG200]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG200]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META201:![0-9]+]], metadata 
!DIExpression()), !dbg [[DBG203:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG204:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG204]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG204]] +// CHECK1-NEXT: store ptr [[ARRAYIDX7]], ptr [[F]], align 8, !dbg [[DBG203]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META205:![0-9]+]], metadata !DIExpression()), !dbg [[DBG206:![0-9]+]] +// CHECK1-NEXT: store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG206]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META207:![0-9]+]], metadata !DIExpression()), !dbg [[DBG208:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 1, !dbg [[DBG209:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 1, !dbg [[DBG209]] +// CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[H]], align 8, !dbg [[DBG208]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META210:![0-9]+]], metadata !DIExpression()), !dbg [[DBG211:![0-9]+]] +// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG211]] +// CHECK1-NEXT: store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG212:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG213:![0-9]+]] +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG214:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG213]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX10]], i64 0, i64 [[IDXPROM]], !dbg [[DBG213]] +// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX11]], 
align 4, !dbg [[DBG215:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG216:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX12]], i64 0, i64 0, !dbg [[DBG216]] +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG217:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM14:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG216]] +// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX13]], i64 0, i64 [[IDXPROM14]], !dbg [[DBG216]] +// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX15]], align 4, !dbg [[DBG218:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG219:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX16]], i64 0, i64 0, !dbg [[DBG219]] +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG220:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM18:%.*]] = sext i32 [[TMP21]] to i64, !dbg [[DBG219]] +// CHECK1-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG219]] +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4, !dbg [[DBG219]] +// CHECK1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG221:![0-9]+]] +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG222:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM21:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG221]] +// CHECK1-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX20]], i64 0, i64 [[IDXPROM21]], !dbg [[DBG221]] +// CHECK1-NEXT: store i32 [[TMP22]], ptr [[ARRAYIDX22]], align 4, !dbg [[DBG223:![0-9]+]] +// CHECK1-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP8]], align 1, !dbg [[DBG224:![0-9]+]] +// CHECK1-NEXT: 
[[TOBOOL:%.*]] = trunc i8 [[TMP24]] to i1, !dbg [[DBG224]] +// CHECK1-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG224]] +// CHECK1-NEXT: store i32 [[CONV]], ptr [[D]], align 4, !dbg [[DBG225:![0-9]+]] +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG226:![0-9]+]] // CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG190]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG199]] // CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[ADD23:%.*]] = add nsw i32 [[TMP25]], 1, !dbg [[DBG182]] -// CHECK1-NEXT: store i32 [[ADD23]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG182]] -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !dbg [[DBG190]], !llvm.loop [[LOOP218:![0-9]+]] +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[ADD23:%.*]] = add nsw i32 [[TMP25]], 1, !dbg [[DBG191]] +// CHECK1-NEXT: store i32 [[ADD23]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG191]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !dbg [[DBG199]], !llvm.loop [[LOOP227:![0-9]+]] // CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG190]] +// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG199]] // CHECK1: omp.dispatch.inc: -// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[ADD24:%.*]] = add nsw i32 [[TMP26]], [[TMP27]], !dbg [[DBG182]] -// CHECK1-NEXT: store i32 [[ADD24]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG182]] -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG185]] -// CHECK1-NEXT: [[ADD25:%.*]] = add nsw i32 [[TMP28]], [[TMP29]], !dbg [[DBG182]] -// CHECK1-NEXT: store i32 
[[ADD25]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG182]] -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]], !dbg [[DBG190]], !llvm.loop [[LOOP220:![0-9]+]] +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[ADD24:%.*]] = add nsw i32 [[TMP26]], [[TMP27]], !dbg [[DBG191]] +// CHECK1-NEXT: store i32 [[ADD24]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG191]] +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG194]] +// CHECK1-NEXT: [[ADD25:%.*]] = add nsw i32 [[TMP28]], [[TMP29]], !dbg [[DBG191]] +// CHECK1-NEXT: store i32 [[ADD25]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG191]] +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]], !dbg [[DBG199]], !llvm.loop [[LOOP229:![0-9]+]] // CHECK1: omp.dispatch.end: -// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB12:[0-9]+]], i32 [[TMP10]]), !dbg [[DBG219:![0-9]+]] -// CHECK1-NEXT: ret void, !dbg [[DBG221:![0-9]+]] +// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB12:[0-9]+]], i32 [[TMP10]]), !dbg [[DBG228:![0-9]+]] +// CHECK1-NEXT: ret void, !dbg [[DBG230:![0-9]+]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG222:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) 
[[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG231:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -583,68 +616,35 @@ int main() { // CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META223:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META232:![0-9]+]], metadata !DIExpression()), !dbg [[DBG233:![0-9]+]] // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META225:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]] -// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META226:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]] -// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META227:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]] -// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META228:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]] -// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META229:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG230:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr 
[[B_ADDR]], align 8, !dbg [[DBG230]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG230]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG230]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG230]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG230]] -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG230]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG230]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG230]] -// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG230]] -// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG230]] -// CHECK1-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG230]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR4]], !dbg [[DBG230]] -// CHECK1-NEXT: ret void, !dbg [[DBG230]] -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27 -// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR6]] !dbg [[DBG231:![0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 
-// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META234:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META234:![0-9]+]], metadata !DIExpression()), !dbg [[DBG233]] // CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META236:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META235:![0-9]+]], metadata !DIExpression()), !dbg [[DBG233]] // CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META237:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META236:![0-9]+]], metadata !DIExpression()), !dbg [[DBG233]] // CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META238:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META237:![0-9]+]], metadata !DIExpression()), !dbg [[DBG233]] // CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META239:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG240:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG240]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG240]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG240]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG240]] -// 
CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG240]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG240]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG240]] -// CHECK1-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG240]] -// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG240]] -// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG240]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]]) #[[ATTR4]], !dbg [[DBG240]] -// CHECK1-NEXT: ret void, !dbg [[DBG240]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META238:![0-9]+]], metadata !DIExpression()), !dbg [[DBG233]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG239:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG239]] +// CHECK1-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[TMP8]] to ptr 
addrspace(1), !dbg [[DBG239]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR4]], !dbg [[DBG239]] +// CHECK1-NEXT: ret void, !dbg [[DBG239]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug__ -// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG241:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG240:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8 @@ -657,53 +657,88 @@ int main() { // CHECK1-NEXT: [[_TMP3:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8 // CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META246:![0-9]+]], metadata !DIExpression()), !dbg [[DBG247:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META245:![0-9]+]], metadata !DIExpression()), !dbg [[DBG246:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META248:![0-9]+]], metadata !DIExpression()), !dbg [[DBG249:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META247:![0-9]+]], metadata 
!DIExpression()), !dbg [[DBG248:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META250:![0-9]+]], metadata !DIExpression()), !dbg [[DBG251:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META249:![0-9]+]], metadata !DIExpression()), !dbg [[DBG250:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META252:![0-9]+]], metadata !DIExpression()), !dbg [[DBG253:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META251:![0-9]+]], metadata !DIExpression()), !dbg [[DBG252:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META254:![0-9]+]], metadata !DIExpression()), !dbg [[DBG255:![0-9]+]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG256:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG256]] -// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG256]] -// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG256]] -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG256]] -// 
CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG256]] -// CHECK1-NEXT: store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG256]] -// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG256]] -// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP12]], -1, !dbg [[DBG256]] -// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG256]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META253:![0-9]+]], metadata !DIExpression()), !dbg [[DBG254:![0-9]+]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG255:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG255]] +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG255]] +// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG255]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], 
align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG255]] +// CHECK1-NEXT: store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG255]] +// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG255]] +// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP12]], -1, !dbg [[DBG255]] +// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG255]] // CHECK1: user_code.entry: -// CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB20:[0-9]+]]) -// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG257:![0-9]+]] -// CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP14]], align 8, !dbg [[DBG257]] -// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG257]] -// CHECK1-NEXT: store ptr [[TMP5]], ptr [[TMP15]], align 8, !dbg [[DBG257]] -// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG257]] -// CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP16]], align 8, !dbg [[DBG257]] -// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG257]] -// CHECK1-NEXT: store ptr [[TMP11]], ptr [[TMP17]], align 8, !dbg [[DBG257]] -// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB20]], i32 [[TMP13]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG257]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(), !dbg 
[[DBG258:![0-9]+]] +// CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB20:[0-9]+]]), !dbg [[DBG256:![0-9]+]] +// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG258:![0-9]+]] +// CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP14]], align 8, !dbg [[DBG258]] +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG258]] +// CHECK1-NEXT: store ptr [[TMP5]], ptr [[TMP15]], align 8, !dbg [[DBG258]] +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG258]] +// CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP16]], align 8, !dbg [[DBG258]] +// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG258]] +// CHECK1-NEXT: store ptr [[TMP11]], ptr [[TMP17]], align 8, !dbg [[DBG258]] +// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB20]], i32 [[TMP13]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG258]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(), !dbg [[DBG259:![0-9]+]] // CHECK1-NEXT: ret void, !dbg [[DBG260:![0-9]+]] // CHECK1: worker.exit: -// CHECK1-NEXT: ret void, !dbg [[DBG256]] +// CHECK1-NEXT: ret void, !dbg [[DBG255]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41 +// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG261:![0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// 
CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META264:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265:![0-9]+]] +// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META266:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] +// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META267:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] +// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META268:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] +// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META269:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG270:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG270]] +// CHECK1-NEXT: 
[[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG270]] +// CHECK1-NEXT: [[TMP12:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG270]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug__(ptr [[TMP4]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]]) #[[ATTR4]], !dbg [[DBG270]] +// CHECK1-NEXT: ret void, !dbg [[DBG270]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined_debug__ -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG261:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG271:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -727,148 +762,148 @@ int main() { // CHECK1-NEXT: [[H:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[D:%.*]] = alloca i32, align 4 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META264:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265:![0-9]+]] +// CHECK1-NEXT: tail call void 
@llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META274:![0-9]+]], metadata !DIExpression()), !dbg [[DBG275:![0-9]+]] // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META266:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META276:![0-9]+]], metadata !DIExpression()), !dbg [[DBG275]] // CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META267:![0-9]+]], metadata !DIExpression()), !dbg [[DBG268:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META277:![0-9]+]], metadata !DIExpression()), !dbg [[DBG278:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META269:![0-9]+]], metadata !DIExpression()), !dbg [[DBG270:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META279:![0-9]+]], metadata !DIExpression()), !dbg [[DBG280:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META271:![0-9]+]], metadata !DIExpression()), !dbg [[DBG272:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META281:![0-9]+]], metadata !DIExpression()), !dbg [[DBG282:![0-9]+]] // CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META273:![0-9]+]], metadata !DIExpression()), !dbg [[DBG274:![0-9]+]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG275:![0-9]+]] -// 
CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG275]] -// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG275]] -// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG275]] -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG275]] -// CHECK1-NEXT: store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META276:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META277:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG278:![0-9]+]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META279:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] -// CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata 
[[META280:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] -// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META281:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META282:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]] -// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !dbg [[DBG275]] -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB17:[0-9]+]], i32 [[TMP13]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG283:![0-9]+]] -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG275]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META283:![0-9]+]], metadata !DIExpression()), !dbg [[DBG284:![0-9]+]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG285:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG285]] +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG285]] +// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr 
addrspace(1) [[TMP6]] to ptr, !dbg [[DBG285]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG285]] +// CHECK1-NEXT: store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META286:![0-9]+]], metadata !DIExpression()), !dbg [[DBG275]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META287:![0-9]+]], metadata !DIExpression()), !dbg [[DBG275]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG288:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META289:![0-9]+]], metadata !DIExpression()), !dbg [[DBG275]] +// CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META290:![0-9]+]], metadata !DIExpression()), !dbg [[DBG275]] +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META291:![0-9]+]], metadata !DIExpression()), !dbg [[DBG275]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META292:![0-9]+]], metadata !DIExpression()), !dbg [[DBG275]] +// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !dbg [[DBG285]] +// CHECK1-NEXT: call void 
@__kmpc_for_static_init_4(ptr @[[GLOB17:[0-9]+]], i32 [[TMP13]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG293:![0-9]+]] +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG285]] // CHECK1: omp.dispatch.cond: -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP14]], 9, !dbg [[DBG278]] -// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG278]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP14]], 9, !dbg [[DBG288]] +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG288]] // CHECK1: cond.true: -// CHECK1-NEXT: br label [[COND_END:%.*]], !dbg [[DBG278]] +// CHECK1-NEXT: br label [[COND_END:%.*]], !dbg [[DBG288]] // CHECK1: cond.false: -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: br label [[COND_END]], !dbg [[DBG278]] +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: br label [[COND_END]], !dbg [[DBG288]] // CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ], !dbg [[DBG278]] -// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: store i32 [[TMP16]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]], !dbg [[DBG275]] -// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label 
[[OMP_DISPATCH_END:%.*]], !dbg [[DBG275]] +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ], !dbg [[DBG288]] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: store i32 [[TMP16]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]], !dbg [[DBG285]] +// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG285]] // CHECK1: omp.dispatch.body: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG275]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG285]] // CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP19]], [[TMP20]], !dbg [[DBG275]] -// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG275]] +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP19]], [[TMP20]], !dbg [[DBG285]] +// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG285]] // CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP21]], 1, !dbg [[DBG284:![0-9]+]] -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG284]] -// 
CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG284]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META285:![0-9]+]], metadata !DIExpression()), !dbg [[DBG287:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG288:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG288]] -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG288]] -// CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[DBG287]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META289:![0-9]+]], metadata !DIExpression()), !dbg [[DBG290:![0-9]+]] -// CHECK1-NEXT: store ptr [[TMP5]], ptr [[G]], align 8, !dbg [[DBG290]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META291:![0-9]+]], metadata !DIExpression()), !dbg [[DBG292:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 1, !dbg [[DBG293:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG293]] -// CHECK1-NEXT: store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[DBG292]] -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META294:![0-9]+]], metadata !DIExpression()), !dbg [[DBG295:![0-9]+]] -// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG295]] -// CHECK1-NEXT: store i32 5, ptr [[TMP5]], align 4, !dbg [[DBG296:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG297:![0-9]+]] -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG298:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG297]] -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] 
= getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG297]] -// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG299:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG300:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG300]] -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG301:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM15:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG300]] -// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG300]] -// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG302:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG303:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG303]] -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG304:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP24]] to i64, !dbg [[DBG303]] -// CHECK1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG303]] -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG303]] -// CHECK1-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG305:![0-9]+]] -// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG306:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP26]] to i64, !dbg [[DBG305]] -// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG305]] -// CHECK1-NEXT: store i32 [[TMP25]], ptr [[ARRAYIDX23]], 
align 4, !dbg [[DBG307:![0-9]+]] -// CHECK1-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG308:![0-9]+]] -// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG309:![0-9]+]] -// CHECK1-NEXT: [[IDXPROM25:%.*]] = sext i32 [[TMP27]] to i64, !dbg [[DBG308]] -// CHECK1-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG308]] -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG308]] -// CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP28]], 0, !dbg [[DBG308]] -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG310:![0-9]+]] -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[TMP11]], align 1, !dbg [[DBG310]] -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG311:![0-9]+]] +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP21]], 1, !dbg [[DBG294:![0-9]+]] +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG294]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG294]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META295:![0-9]+]], metadata !DIExpression()), !dbg [[DBG297:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG298:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG298]] +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG298]] +// CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[DBG297]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META299:![0-9]+]], metadata !DIExpression()), !dbg [[DBG300:![0-9]+]] +// CHECK1-NEXT: store ptr [[TMP5]], ptr 
[[G]], align 8, !dbg [[DBG300]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META301:![0-9]+]], metadata !DIExpression()), !dbg [[DBG302:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 1, !dbg [[DBG303:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG303]] +// CHECK1-NEXT: store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[DBG302]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META304:![0-9]+]], metadata !DIExpression()), !dbg [[DBG305:![0-9]+]] +// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG305]] +// CHECK1-NEXT: store i32 5, ptr [[TMP5]], align 4, !dbg [[DBG306:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG307:![0-9]+]] +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG308:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG307]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG307]] +// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG309:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG310:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG310]] +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG311:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM15:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG310]] +// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG310]] +// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG312:![0-9]+]] +// 
CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG313:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG313]] +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG314:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP24]] to i64, !dbg [[DBG313]] +// CHECK1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG313]] +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG313]] +// CHECK1-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG315:![0-9]+]] +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG316:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP26]] to i64, !dbg [[DBG315]] +// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG315]] +// CHECK1-NEXT: store i32 [[TMP25]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG317:![0-9]+]] +// CHECK1-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG318:![0-9]+]] +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG319:![0-9]+]] +// CHECK1-NEXT: [[IDXPROM25:%.*]] = sext i32 [[TMP27]] to i64, !dbg [[DBG318]] +// CHECK1-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG318]] +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG318]] +// CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP28]], 0, !dbg [[DBG318]] +// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG320:![0-9]+]] +// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[TMP11]], align 1, !dbg [[DBG320]] +// CHECK1-NEXT: br label 
[[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG321:![0-9]+]] // CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG283]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG293]] // CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[ADD27:%.*]] = add nsw i32 [[TMP29]], 1, !dbg [[DBG275]] -// CHECK1-NEXT: store i32 [[ADD27]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG275]] -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !dbg [[DBG283]], !llvm.loop [[LOOP312:![0-9]+]] +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[ADD27:%.*]] = add nsw i32 [[TMP29]], 1, !dbg [[DBG285]] +// CHECK1-NEXT: store i32 [[ADD27]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG285]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !dbg [[DBG293]], !llvm.loop [[LOOP322:![0-9]+]] // CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG283]] +// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG293]] // CHECK1: omp.dispatch.inc: -// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[ADD28:%.*]] = add nsw i32 [[TMP30]], [[TMP31]], !dbg [[DBG275]] -// CHECK1-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG275]] -// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG278]] -// CHECK1-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP32]], [[TMP33]], !dbg [[DBG275]] -// CHECK1-NEXT: store i32 [[ADD29]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG275]] -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]], !dbg [[DBG283]], !llvm.loop [[LOOP314:![0-9]+]] +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG288]] 
+// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[ADD28:%.*]] = add nsw i32 [[TMP30]], [[TMP31]], !dbg [[DBG285]] +// CHECK1-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG285]] +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG288]] +// CHECK1-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP32]], [[TMP33]], !dbg [[DBG285]] +// CHECK1-NEXT: store i32 [[ADD29]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG285]] +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]], !dbg [[DBG293]], !llvm.loop [[LOOP324:![0-9]+]] // CHECK1: omp.dispatch.end: -// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB19:[0-9]+]], i32 [[TMP13]]), !dbg [[DBG313:![0-9]+]] -// CHECK1-NEXT: ret void, !dbg [[DBG315:![0-9]+]] +// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB19:[0-9]+]], i32 [[TMP13]]), !dbg [[DBG323:![0-9]+]] +// CHECK1-NEXT: ret void, !dbg [[DBG325:![0-9]+]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG316:![0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG326:![0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = 
alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -877,66 +912,31 @@ int main() { // CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META319:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META329:![0-9]+]], metadata !DIExpression()), !dbg [[DBG330:![0-9]+]] // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META321:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]] -// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META322:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]] -// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META323:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]] -// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META324:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]] -// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META325:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG326:![0-9]+]] -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr 
[[BB_ADDR]], align 8, !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP12:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG326]] -// CHECK1-NEXT: [[TMP13:%.*]] = addrspacecast ptr [[TMP9]] to ptr addrspace(1), !dbg [[DBG326]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined_debug__(ptr [[TMP4]], ptr [[TMP5]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]], ptr addrspace(1) [[TMP13]]) #[[ATTR4]], !dbg [[DBG326]] -// CHECK1-NEXT: ret void, !dbg [[DBG326]] -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41 -// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR6]] !dbg [[DBG327:![0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8 -// 
CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META330:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331:![0-9]+]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META331:![0-9]+]], metadata !DIExpression()), !dbg [[DBG330]] // CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META332:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META332:![0-9]+]], metadata !DIExpression()), !dbg [[DBG330]] // CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META333:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META333:![0-9]+]], metadata !DIExpression()), !dbg [[DBG330]] // CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META334:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META334:![0-9]+]], metadata !DIExpression()), !dbg [[DBG330]] // CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8 -// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META335:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]] +// CHECK1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META335:![0-9]+]], metadata !DIExpression()), !dbg [[DBG330]] // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG336:![0-9]+]] // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG336]] // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr 
[[B_ADDR]], align 8, !dbg [[DBG336]] // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG336]] -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG336]] -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG336]] -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG336]] -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG336]] -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG336]] -// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG336]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG336]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG336]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG336]] +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG336]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG336]] +// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG336]] // CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG336]] // CHECK1-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG336]] // CHECK1-NEXT: [[TMP12:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG336]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug__(ptr [[TMP4]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]]) #[[ATTR4]], !dbg [[DBG336]] +// CHECK1-NEXT: [[TMP13:%.*]] = addrspacecast ptr [[TMP9]] to ptr addrspace(1), !dbg [[DBG336]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined_debug__(ptr [[TMP4]], ptr [[TMP5]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) 
[[TMP12]], ptr addrspace(1) [[TMP13]]) #[[ATTR4]], !dbg [[DBG336]] // CHECK1-NEXT: ret void, !dbg [[DBG336]] // diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h index 338b56226f204..a7be3f51fac7d 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h @@ -72,7 +72,7 @@ enum class IdentFlag { #include "llvm/Frontend/OpenMP/OMPKinds.def" // Version of the kernel argument format used by the omp runtime. -#define OMP_KERNEL_ARG_VERSION 3 +#define OMP_KERNEL_ARG_VERSION 4 // Minimum version of the compiler that generates a kernel dynamic pointer. #define OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR 3 diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index fe09bb8177c28..0be3827185e2e 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -90,7 +90,7 @@ __OMP_ARRAY_TYPE(Int32Arr3, Int32, 3) __OMP_STRUCT_TYPE(Ident, ident_t, false, Int32, Int32, Int32, Int32, Int8Ptr) __OMP_STRUCT_TYPE(KernelArgs, __tgt_kernel_arguments, false, Int32, Int32, VoidPtrPtr, VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr, VoidPtrPtr, - Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32) + Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32, VoidPtr) __OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, false, Int8Ptr) __OMP_STRUCT_TYPE(DependInfo, kmp_dep_info, false, SizeTy, SizeTy, Int8) __OMP_STRUCT_TYPE(Task, kmp_task_ompbuilder_t, false, VoidPtr, VoidPtr, Int32, VoidPtr, VoidPtr) diff --git a/llvm/include/llvm/Transforms/Instrumentation/GPUSan.h b/llvm/include/llvm/Transforms/Instrumentation/GPUSan.h new file mode 100644 index 0000000000000..1201b3c47ac06 --- /dev/null +++ b/llvm/include/llvm/Transforms/Instrumentation/GPUSan.h @@ -0,0 +1,26 @@ +//===- Transforms/Instrumentation/GPUSan.h ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache 
License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file provides the interface for the GPU sanitizer (GPUSan) +/// instrumentation pass targeting the LLVM/Offload runtime. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_GPUSAN_H +#define LLVM_TRANSFORMS_INSTRUMENTATION_GPUSAN_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class GPUSanPass : public PassInfoMixin<GPUSanPass> { +public: + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_INSTRUMENTATION_GPUSAN_H diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 92213e19c9d9d..ad383c2548ee6 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -496,6 +496,7 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, auto Int32Ty = Type::getInt32Ty(Builder.getContext()); Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, 3)); Value *Flags = Builder.getInt64(KernelArgs.HasNoWait); + Value *AsyncInfoQueue = Constant::getNullValue(Builder.getPtrTy()); Value *NumTeams3D = Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams, {0}); @@ -514,7 +515,8 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, Flags, NumTeams3D, NumThreads3D, - KernelArgs.DynCGGroupMem}; + KernelArgs.DynCGGroupMem, + AsyncInfoQueue}; } void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) { @@ -4569,7 +4571,17 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD, Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true); Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0); - Function *Kernel =
Builder.GetInsertBlock()->getParent(); + Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent(); + Function *Kernel = DebugKernelWrapper; + + // We need to strip the debug prefix to get the correct kernel name. + StringRef KernelName = Kernel->getName(); + const std::string DebugPrefix = "_debug__"; + if (KernelName.ends_with(DebugPrefix)) { + KernelName = KernelName.drop_back(DebugPrefix.length()); + Kernel = M.getFunction(KernelName); + assert(Kernel && "Expected the real kernel to exist"); + } // Manifest the launch configuration in the metadata matching the kernel // environment. @@ -4591,12 +4603,6 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD, Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0); Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0); - // We need to strip the debug prefix to get the correct kernel name. - StringRef KernelName = Kernel->getName(); - const std::string DebugPrefix = "_debug__"; - if (KernelName.ends_with(DebugPrefix)) - KernelName = KernelName.drop_back(DebugPrefix.length()); - Function *Fn = getOrCreateRuntimeFunctionPtr( omp::RuntimeFunction::OMPRTL___kmpc_target_init); const DataLayout &DL = Fn->getParent()->getDataLayout(); @@ -4648,7 +4654,7 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD, ? 
KernelEnvironmentGV : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV, KernelEnvironmentPtr); - Value *KernelLaunchEnvironment = Kernel->getArg(0); + Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0); CallInst *ThreadKind = Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment}); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 19e8a8ab68a73..42fdb2415c2fe 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -174,6 +174,7 @@ #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h" #include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h" #include "llvm/Transforms/Instrumentation/GCOVProfiler.h" +#include "llvm/Transforms/Instrumentation/GPUSan.h" #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" #include "llvm/Transforms/Instrumentation/InstrOrderFile.h" #include "llvm/Transforms/Instrumentation/InstrProfiling.h" diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 926515c9508a9..410aedd57dc9a 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -71,6 +71,7 @@ #include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/Instrumentation/CGProfile.h" #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h" +#include "llvm/Transforms/Instrumentation/GPUSan.h" #include "llvm/Transforms/Instrumentation/InstrOrderFile.h" #include "llvm/Transforms/Instrumentation/InstrProfiling.h" #include "llvm/Transforms/Instrumentation/MemProfiler.h" @@ -163,6 +164,10 @@ static cl::opt cl::Hidden, cl::desc("Enable inline deferral during PGO")); +static cl::opt + EnableOffloadSanitizer("enable-offload-sanitizer", cl::init(false), + cl::Hidden, cl::desc("Enable offload sanitizer")); + static cl::opt EnableModuleInliner("enable-module-inliner", cl::init(false), cl::Hidden, cl::desc("Enable module inliner")); @@ -1750,6 +1755,9 @@ 
PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // in ICP. MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); + if (EnableOffloadSanitizer) + MPM.addPass(GPUSanPass()); + invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); // Emit annotation remarks. @@ -1828,6 +1836,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // pipeline). MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); + if (EnableOffloadSanitizer) + MPM.addPass(GPUSanPass()); + invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); // Emit annotation remarks. @@ -2040,6 +2051,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, if (PTO.CallGraphProfile) MPM.addPass(CGProfilePass(/*InLTOPostLink=*/true)); + if (EnableOffloadSanitizer) + MPM.addPass(GPUSanPass()); + invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); // Emit annotation remarks. diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 60c517790bcab..741c1d3e1842a 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -78,6 +78,7 @@ MODULE_PASS("inliner-wrapper-no-mandatory-first", MODULE_PASS("insert-gcov-profiling", GCOVProfilerPass()) MODULE_PASS("instrorderfile", InstrOrderFilePass()) MODULE_PASS("instrprof", InstrProfilingLoweringPass()) +MODULE_PASS("gpusan", GPUSanPass()) MODULE_PASS("ctx-instr-lower", PGOCtxProfLoweringPass()) MODULE_PASS("invalidate", InvalidateAllAnalysesPass()) MODULE_PASS("iroutliner", IROutlinerPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 6e7d34f5adaa3..a310038543532 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -2043,7 +2043,9 @@ static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL, // // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some // address spaces have non-zero null values. 
- auto SrcPtrKB = computeKnownBits(V, DL).trunc(DL.getPointerSizeInBits(AS)); + auto SrcPtrKB = computeKnownBits(V, DL); + if (SrcPtrKB.getBitWidth() > DL.getPointerSizeInBits(AS)) + SrcPtrKB = SrcPtrKB.trunc(DL.getPointerSizeInBits(AS)); const auto NullVal = TM.getNullPointerValue(AS); assert((NullVal == 0 || NullVal == -1) && "don't know how to check for this null value!"); diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 05a38cdd7d7b6..ebdc2c3682aab 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -7536,7 +7536,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { // Collect all tail calls in the function as we cannot allow new allocas to // escape into tail recursion. // TODO: Be smarter about new allocas escaping into tail calls. - SmallVector TailCalls; + SmallVector TailCalls; bool UsedAssumedInformation = false; if (!A.checkForAllInstructions( [&](Instruction &I) { @@ -7574,8 +7574,9 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { AI, Arg->getType(), "", IP); Arg->replaceAllUsesWith(AI); - for (CallInst *CI : TailCalls) - CI->setTailCall(false); + for (auto &CI : TailCalls) + if (CI) + cast(CI)->setTailCall(false); }; // Callback to repair a call site of the associated function. The elements diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index e3a4821b8226b..89ce01907deb7 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -620,11 +620,13 @@ struct OMPInformationCache : public InformationCache { // functions, except if `optnone` is present. 
if (isOpenMPDevice(M)) { for (Function &F : M) { - for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"}) - if (F.hasFnAttribute(Attribute::NoInline) && - F.getName().starts_with(Prefix) && - !F.hasFnAttribute(Attribute::OptimizeNone)) - F.removeFnAttr(Attribute::NoInline); + for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"}) { + if (!F.getName().starts_with(Prefix) || + F.hasFnAttribute(Attribute::OptimizeNone)) + continue; + F.removeFnAttr(Attribute::NoInline); + F.addFnAttr(Attribute::AlwaysInline); + } } } diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt index 8d345d394b51a..9aa530229ab7e 100644 --- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt @@ -6,6 +6,7 @@ add_llvm_component_library(LLVMInstrumentation DataFlowSanitizer.cpp GCOVProfiling.cpp BlockCoverageInference.cpp + GPUSan.cpp MemProfiler.cpp MemorySanitizer.cpp IndirectCallPromotion.cpp diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp new file mode 100644 index 0000000000000..10c39ca590ee5 --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp @@ -0,0 +1,2094 @@ +//===-- GPUSan.cpp - GPU sanitizer ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation/GPUSan.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/StringSaver.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "gpusan" + +cl::opt UseTags( + "gpusan-use-tags", + cl::desc( + "Use tags to detect use after if the number of allocations is large"), + cl::init(false)); + +namespace llvm { + +struct LocationInfoTy { + uint64_t LineNo = 0; + uint64_t ColumnNo 
= 0; + uint64_t ParentIdx = -1; + StringRef FileName; + StringRef FunctionName; + bool operator==(const LocationInfoTy &RHS) const { + return LineNo == RHS.LineNo && ColumnNo == RHS.ColumnNo && + FileName == RHS.FileName && FunctionName == RHS.FunctionName; + } +}; +template <> struct DenseMapInfo { + static LocationInfoTy EmptyKey; + static LocationInfoTy TombstoneKey; + static inline LocationInfoTy *getEmptyKey() { return &EmptyKey; } + + static inline LocationInfoTy *getTombstoneKey() { return &TombstoneKey; } + + static unsigned getHashValue(const LocationInfoTy *LI) { + unsigned Hash = DenseMapInfo::getHashValue(LI->LineNo); + Hash = detail::combineHashValue( + Hash, DenseMapInfo::getHashValue(LI->ColumnNo)); + Hash = detail::combineHashValue( + Hash, DenseMapInfo::getHashValue(LI->FileName)); + Hash = detail::combineHashValue( + Hash, DenseMapInfo::getHashValue(LI->FunctionName)); + return Hash; + } + + static bool isEqual(const LocationInfoTy *LHS, const LocationInfoTy *RHS) { + return *LHS == *RHS; + } +}; +LocationInfoTy DenseMapInfo::EmptyKey = + LocationInfoTy{(uint64_t)-1}; +LocationInfoTy DenseMapInfo::TombstoneKey = + LocationInfoTy{(uint64_t)-2}; +} // namespace llvm + +namespace { + +enum PtrOrigin { + UNKNOWN, + LOCAL, + GLOBAL, + SYSTEM, + NONE, +}; + +static std::string getSuffix(PtrOrigin PO) { + switch (PO) { + case UNKNOWN: + return ""; + case LOCAL: + return "_local"; + case GLOBAL: + return "_global"; + default: + break; + } + llvm_unreachable("Bad pointer origin!"); +} + +static StringRef prettifyFunctionName(StringSaver &SS, StringRef Name) { + if (Name.ends_with(".internalized")) + return SS.save(Name.drop_back(sizeof("internalized")) + " (internalized)"); + if (!Name.starts_with("__omp_offloading_")) + return Name; + Name = Name.drop_front(sizeof("__omp_offloading_")); + auto It = Name.find_first_of("_"); + if (It != StringRef::npos && It + 1 < Name.size()) + Name = Name.drop_front(It + 1); + It = Name.find_first_of("_"); + if (It != 
StringRef::npos && It + 1 < Name.size()) + Name = Name.drop_front(It + 1); + if (Name.ends_with("_debug__")) + Name = Name.drop_back(sizeof("debug__")); + if (Name.ends_with("_debug___omp_outlined_debug__")) + Name = Name.drop_back(sizeof("debug___omp_outlined_debug__")); + It = Name.find_last_of("_"); + if (It == StringRef::npos || It + 1 >= Name.size()) + return Name; + if (Name[It + 1] != 'l') + return Name; + int64_t KernelLineNo = 0; + Name.take_back(Name.size() - It - + /* '_' and 'l' */ 2) + .getAsInteger(10, KernelLineNo); + if (KernelLineNo) + Name = SS.save("omp target (" + Name.take_front(It).str() + ":" + + std::to_string(KernelLineNo) + ")"); + return Name; +} + +class GPUSanImpl final { +public: + GPUSanImpl(Module &M, FunctionAnalysisManager &FAM) + : M(M), FAM(FAM), Ctx(M.getContext()) {} + + bool instrument(); + +private: + bool instrumentGlobals(); + bool instrumentFunction(Function &Fn); + Value *instrumentAllocation(Instruction &I, Value &Size, FunctionCallee Fn, + PtrOrigin PO); + Value *instrumentAllocaInst(LoopInfo &LI, AllocaInst &AI); + void instrumentAccess(LoopInfo &LI, Instruction &I, int PtrIdx, + Type &AccessTy, bool IsRead, + SmallVector &GEPs); + void instrumentMultipleAccessPerBasicBlock( + LoopInfo &LI, + SmallVector &AccessCausingInstructionInABasicBlock, + Function &Fn); + void instrumentLoadInst(LoopInfo &LI, LoadInst &LoadI, + SmallVector &GEPs); + void instrumentStoreInst(LoopInfo &LI, StoreInst &StoreI, + SmallVector &GEPs); + void instrumentGEPInst(LoopInfo &LI, GetElementPtrInst &GEP); + bool instrumentCallInst(LoopInfo &LI, CallInst &CI); + void + instrumentReturns(SmallVectorImpl> &Allocas, + SmallVectorImpl &Returns); + + // Function used by access instrumentation to replace all references to + // user global variables with shadow variable references + Value *replaceUserGlobals(IRBuilder<> &IRB, GlobalVariable *ShadowGlobal, + Value *PtrOp, Value *&GlobalRef, + Instruction *InsertBefore = nullptr); + + void addCtor(); + 
void addDtor(); + + // Creates a function that applies a given block to each shadow global that + // satisfies a certain predicate. This predicate is usually whether or not + // the shadow global is set. + Function *createApplyShadowGlobalFn( + const Twine &Name, + llvm::function_ref &, Value *)> Predicate, + llvm::function_ref &, GlobalVariable *, + GlobalVariable *)> + Codegen); + + // Global (un)registration functions + Function *createShadowGlobalRegisterFn(); + Function *createShadowGlobalUnregisterFn(); + + Value *getPC(IRBuilder<> &IRB); + Value *getFunctionName(IRBuilder<> &IRB); + Value *getFileName(IRBuilder<> &IRB); + Value *getLineNo(IRBuilder<> &IRB); + + void getAllocationInfo(Function &Fn, PtrOrigin PO, Value &Object, + Value *&Start, Value *&Length, Value *&Tag); + PtrOrigin getPtrOrigin(LoopInfo &LI, Value *Ptr, + const Value **Object = nullptr); + + FunctionCallee getOrCreateFn(FunctionCallee &FC, StringRef Name, Type *RetTy, + ArrayRef ArgTys) { + if (!FC) { + auto *NewAllocationFnTy = FunctionType::get(RetTy, ArgTys, false); + FC = M.getOrInsertFunction(Name, NewAllocationFnTy); + Function *F = cast(FC.getCallee()); + } + return FC; + } + + PointerType *getPtrTy(PtrOrigin PO) { + if (PO == PtrOrigin::LOCAL) + return PointerType::get(Ctx, 5); + return PtrTy; + } + + FunctionCallee getNewFn(PtrOrigin PO) { + assert(PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn(NewFn[PO], "ompx_new" + getSuffix(PO), getPtrTy(PO), + {getPtrTy(PO), Int64Ty, Int64Ty, Int64Ty, Int64Ty}); + } + FunctionCallee getFreeFn(PtrOrigin PO) { + assert(PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn(FreeFn[PO], "ompx_free" + getSuffix(PO), VoidTy, + {getPtrTy(PO), Int64Ty}); + } + FunctionCallee getFreeNLocalFn() { + return getOrCreateFn(FreeNLocalFn, "ompx_free_local_n", VoidTy, {Int32Ty}); + } + + FunctionCallee getCheckFn(PtrOrigin PO) { + assert(PO <= GLOBAL && "Origin does not need handling."); + return 
getOrCreateFn(CheckFn[PO], "ompx_check" + getSuffix(PO), + getPtrTy(PO), + {getPtrTy(PO), Int64Ty, Int64Ty, Int64Ty, Int64Ty}); + } + + FunctionCallee getCheckVoidFn(PtrOrigin PO) { + assert(PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn(CheckVoidFn[PO], "ompx_check_void" + getSuffix(PO), + Type::getVoidTy(Ctx), + {getPtrTy(PO), Int64Ty, Int64Ty, Int64Ty, Int64Ty}); + } + + FunctionCallee getCheckWithBaseFn(PtrOrigin PO) { + assert(PO >= LOCAL && PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn(CheckWithBaseFn[PO], + "ompx_check_with_base" + getSuffix(PO), getPtrTy(PO), + {getPtrTy(PO), getPtrTy(PO), Int64Ty, Int32Ty, Int64Ty, + Int64Ty, Int64Ty, Int64Ty}); + } + + // check with base void return type + FunctionCallee getCheckWithBaseVoidFn(PtrOrigin PO) { + assert(PO >= LOCAL && PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn(CheckWithBaseVoidFn[PO], + "ompx_check_with_base_void" + getSuffix(PO), + Type::getVoidTy(Ctx), + {getPtrTy(PO), getPtrTy(PO), Int64Ty, Int32Ty, Int64Ty, + Int64Ty, Int64Ty, Int64Ty}); + } + + FunctionCallee getCheckFnVector(uint64_t NumElements) { + return getOrCreateFn(CheckFnVector[0], "ompx_check_global_vec", PtrTy, + { + PtrTy, /*PlainPtrOps*/ + PtrTy, /*Sizes*/ + PtrTy, /*AccessIds*/ + PtrTy, /*SourceIds*/ + Int64Ty, /*PC*/ + Int64Ty /*NumElements*/ + }); + } + + FunctionCallee getCheckWithBaseFnVector(uint64_t NumElements, Type *ArrTy) { + return getOrCreateFn(CheckWithBaseFnVector[0], + "ompx_check_with_base_global_vec", ArrTy, + { + ArrTy, /*PlainPtrOps*/ + ArrTy, /*Starts*/ + ArrTy, /*Lengths*/ + ArrTy, /*Tags*/ + ArrTy, /*Sizes*/ + ArrTy, /*AccessIds*/ + ArrTy, /*SourceIds*/ + Int64Ty, /*PC*/ + Int64Ty /*NumElementsTy*/ + }); + } + + FunctionCallee getCheckRangeWithBaseFn(PtrOrigin PO, Type* UpperBoundType, Type* LowerBoundType) { + return getOrCreateFn(CheckRangeWithBaseFn[PO], + "ompx_check_range_with_base" + getSuffix(PO), + Type::getVoidTy(Ctx), + { + 
UpperBoundType, /*SCEV max computed address*/ + LowerBoundType, /*SCEV min computed address*/ + getPtrTy(PO), /*Start of allocation address*/ + Int64Ty, /*Size of allocation, i.e. Length*/ + Int32Ty, /*Tag*/ + Int64Ty, /*Size of the type that is loaded/stored*/ + Int64Ty, /*AccessId, Read/Write*/ + Int64Ty, /*SourceId, Allocation source ID*/ + Int64Ty /*PC -- Program Counter*/ + }); + } + + FunctionCallee getCheckRangeFn(PtrOrigin PO, Type* UpperBoundType, Type* LowerBoundType) { + return getOrCreateFn(CheckRangeFn[PO], "ompx_check_range" + getSuffix(PO), + Type::getVoidTy(Ctx), + { + UpperBoundType, /*SCEV max computed address*/ + LowerBoundType, /*SCEV min computed address*/ + Int64Ty, /*Size of the type that is loaded/stored*/ + Int64Ty, /*AccessId, Read/Write*/ + Int64Ty, /*SourceId, Allocation source ID*/ + Int64Ty /*PC -- Program Counter*/ + }); + } + + FunctionCallee getAllocationInfoFn(PtrOrigin PO) { + assert(PO >= LOCAL && PO <= GLOBAL && "Origin does not need handling."); + if (auto *F = M.getFunction("ompx_get_allocation_info" + getSuffix(PO))) + return FunctionCallee(F->getFunctionType(), F); + return getOrCreateFn( + AllocationInfoFn[PO], "ompx_get_allocation_info" + getSuffix(PO), + StructType::create({getPtrTy(PO), Int64Ty, Int32Ty}), {getPtrTy(PO)}); + } + FunctionCallee getGEPFn(PtrOrigin PO) { + assert(PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn(GEPFn[PO], "ompx_gep" + getSuffix(PO), getPtrTy(PO), + {getPtrTy(PO), Int64Ty, Int64Ty}); + } + FunctionCallee getUnpackFn(PtrOrigin PO) { + assert(PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn(UnpackFn[PO], "ompx_unpack" + getSuffix(PO), + getPtrTy(PO), {getPtrTy(PO), Int64Ty}); + } + FunctionCallee getLifetimeStart() { + return getOrCreateFn(LifetimeStartFn, "ompx_lifetime_start", VoidTy, + {getPtrTy(LOCAL), Int64Ty}); + } + FunctionCallee getLifetimeEnd() { + return getOrCreateFn(LifetimeEndFn, "ompx_lifetime_end", VoidTy, + {getPtrTy(LOCAL), 
Int64Ty}); + } + FunctionCallee getLeakCheckFn() { + FunctionCallee LeakCheckFn; + return getOrCreateFn(LeakCheckFn, "ompx_leak_check", VoidTy, {}); + } + FunctionCallee getThreadIdFn() { + return getOrCreateFn(ThreadIDFn, "ompx_global_thread_id", Int32Ty, {}); + } + + CallInst *createCall(IRBuilder<> &IRB, FunctionCallee Callee, + ArrayRef Args = std::nullopt, + const Twine &Name = "") { + Calls.push_back(IRB.CreateCall(Callee, Args, Name)); + return Calls.back(); + } + SmallVector Calls; + + Module &M; + FunctionAnalysisManager &FAM; + LLVMContext &Ctx; + bool HasAllocas; + GlobalVariable *LocationsArray; + SmallSetVector AmbiguousCalls; + int AllocationId = 1; + + // Maps user-defined globals to shadow globals + SmallMapVector UserGlobals; + + Type *VoidTy = Type::getVoidTy(Ctx); + Type *IntptrTy = M.getDataLayout().getIntPtrType(Ctx); + PointerType *PtrTy = PointerType::getUnqual(Ctx); + IntegerType *Int8Ty = Type::getInt8Ty(Ctx); + IntegerType *Int32Ty = Type::getInt32Ty(Ctx); + IntegerType *Int64Ty = Type::getInt64Ty(Ctx); + + // Create a pointer to the 8-bit integer type + Type *Int8PtrType = PointerType::get(Int8Ty, 0); + Type *Int32ASPtrType = PointerType::get(Int32Ty, 1); + + const DataLayout &DL = M.getDataLayout(); + + FunctionCallee NewFn[3]; + FunctionCallee GEPFn[3]; + FunctionCallee FreeFn[3]; + FunctionCallee CheckFn[3]; + FunctionCallee CheckVoidFn[3]; + FunctionCallee CheckWithBaseFn[3]; + FunctionCallee CheckWithBaseVoidFn[3]; + FunctionCallee CheckFnVector[1]; + FunctionCallee CheckWithBaseFnVector[1]; + FunctionCallee AllocationInfoFn[3]; + FunctionCallee UnpackFn[3]; + FunctionCallee LifetimeEndFn; + FunctionCallee LifetimeStartFn; + FunctionCallee FreeNLocalFn; + FunctionCallee ThreadIDFn; + FunctionCallee CheckRangeWithBaseFn[3]; + FunctionCallee CheckRangeFn[3]; + + StringMap GlobalStringMap; + struct AllocationInfoTy { + Value *Start; + Value *Length; + Value *Tag; + }; + DenseMap, AllocationInfoTy> AllocationInfoMap; + + DenseMap> + 
LocationMap; + + const std::pair + addLocationInfo(LocationInfoTy *LI, bool &IsNew) { + auto It = LocationMap.insert({LI, LocationMap.size()}); + IsNew = It.second; + if (!IsNew) + delete LI; + return {It.first->first, It.first->second}; + } + + void buildCallTreeInfo(Function &Fn, LocationInfoTy &LI); + ConstantInt *getSourceIndex(Instruction &I, LocationInfoTy *LastLI = nullptr); + ConstantInt *getSourceIndex(const GlobalVariable *G); + + uint64_t addString(StringRef S) { + const auto &It = UniqueStrings.insert({S, ConcatenatedString.size()}); + if (It.second) { + ConcatenatedString += S; + ConcatenatedString.push_back('\0'); + } + return It.first->second; + }; + + void encodeLocationInfo(LocationInfoTy &LI, uint64_t Idx) { + StringRef FunctionName = LI.FunctionName; + if (LI.ParentIdx == (decltype(LI.ParentIdx))-1) + FunctionName = prettifyFunctionName(SS, FunctionName); + + auto FuncIdx = addString(FunctionName); + auto FileIdx = addString(LI.FileName); + if (LocationEncoding.size() < (Idx + 1) * 5) + LocationEncoding.resize((Idx + 1) * 5); + LocationEncoding[Idx * 5 + 0] = ConstantInt::get(Int64Ty, FuncIdx); + LocationEncoding[Idx * 5 + 1] = ConstantInt::get(Int64Ty, FileIdx); + LocationEncoding[Idx * 5 + 2] = ConstantInt::get(Int64Ty, LI.LineNo); + LocationEncoding[Idx * 5 + 3] = ConstantInt::get(Int64Ty, LI.ColumnNo); + LocationEncoding[Idx * 5 + 4] = ConstantInt::get(Int64Ty, LI.ParentIdx); + } + + SmallVector LocationEncoding; + std::string ConcatenatedString; + DenseMap StringIndexMap; + DenseMap UniqueStrings; + + BumpPtrAllocator BPA; + StringSaver SS = StringSaver(BPA); +}; + +} // end anonymous namespace + +ConstantInt *GPUSanImpl::getSourceIndex(Instruction &I, + LocationInfoTy *LastLI) { + LocationInfoTy *LI = new LocationInfoTy(); + auto *DILoc = I.getDebugLoc().get(); + + auto FillLI = [&](LocationInfoTy &LI, DILocation &DIL) { + LI.FileName = DIL.getFilename(); + if (LI.FileName.empty()) + LI.FileName = 
I.getFunction()->getSubprogram()->getFilename(); + LI.FunctionName = DIL.getSubprogramLinkageName(); + if (LI.FunctionName.empty()) + LI.FunctionName = I.getFunction()->getName(); + LI.LineNo = DIL.getLine(); + LI.ColumnNo = DIL.getColumn(); + }; + + DILocation *ParentDILoc = nullptr; + if (DILoc) { + FillLI(*LI, *DILoc); + ParentDILoc = DILoc->getInlinedAt(); + } else { + LI->FunctionName = I.getFunction()->getName(); + } + + bool IsNew; + uint64_t Idx; + std::tie(LI, Idx) = addLocationInfo(LI, IsNew); + if (LastLI) + LastLI->ParentIdx = Idx; + if (!IsNew) + return ConstantInt::get(Int64Ty, Idx); + + uint64_t CurIdx = Idx; + LocationInfoTy *CurLI = LI; + while (ParentDILoc) { + auto *ParentLI = new LocationInfoTy(); + FillLI(*ParentLI, *ParentDILoc); + uint64_t ParentIdx; + std::tie(ParentLI, ParentIdx) = addLocationInfo(ParentLI, IsNew); + CurLI->ParentIdx = ParentIdx; + if (!IsNew) + break; + encodeLocationInfo(*CurLI, CurIdx); + CurLI = ParentLI; + CurIdx = ParentIdx; + ParentDILoc = ParentDILoc->getInlinedAt(); + } + + Function &Fn = *I.getFunction(); + buildCallTreeInfo(Fn, *CurLI); + + encodeLocationInfo(*CurLI, CurIdx); + + return ConstantInt::get(Int64Ty, Idx); +} + +ConstantInt *GPUSanImpl::getSourceIndex(const GlobalVariable *G) { + SmallVector GlobalLocations; + G->getDebugInfo(GlobalLocations); + + if (GlobalLocations.empty()) + return ConstantInt::get(Int64Ty, 0); // Fallback + + const auto *DLVar = GlobalLocations.front()->getVariable(); + + LocationInfoTy *LI = new LocationInfoTy(); + LI->FileName = DLVar->getFilename(); + LI->LineNo = DLVar->getLine(); + LI->FunctionName = DLVar->getName(); + LI->ColumnNo = 0; + + bool IsNew; + uint64_t Idx; + std::tie(LI, Idx) = addLocationInfo(LI, IsNew); + + if (IsNew) + encodeLocationInfo(*LI, Idx); + + return ConstantInt::get(Int64Ty, Idx); +} + +void GPUSanImpl::buildCallTreeInfo(Function &Fn, LocationInfoTy &LI) { + if (Fn.hasFnAttribute("kernel")) + return; + SmallVector Calls; + for (auto &U : Fn.uses()) { 
+ auto *CB = dyn_cast(U.getUser());
+ if (!CB)
+ continue;
+ if (!CB->isCallee(&U))
+ continue;
+ Calls.push_back(CB);
+ }
+ // Exactly one calling context: attribute the location info to that call.
+ if (Calls.size() == 1) {
+ getSourceIndex(*Calls.back(), &LI);
+ return;
+ }
+ // Multiple potential callers: mark the parent index as ambiguous (-2)
+ // and remember all candidate call sites for later resolution.
+ LI.ParentIdx = -2;
+ AmbiguousCalls.insert(Calls.begin(), Calls.end());
+}
+
+// Returns the current program counter as an i64.
+// NOTE(review): amdgcn_s_getpc hard-codes the AMDGPU target; other GPU
+// targets (e.g. NVPTX) would need a different lowering -- confirm the
+// intended target coverage of this pass.
+Value *GPUSanImpl::getPC(IRBuilder<> &IRB) {
+ return IRB.CreateIntrinsic(Int64Ty, Intrinsic::amdgcn_s_getpc, {}, nullptr,
+ "PC");
+}
+// Best-effort function name for diagnostics: prefers the debug-info
+// subprogram linkage name, falls back to the IR function name. The name
+// is truncated to its last 255 bytes, interned as a global string, and
+// cached in GlobalStringMap.
+Value *GPUSanImpl::getFunctionName(IRBuilder<> &IRB) {
+ const auto &DLoc = IRB.getCurrentDebugLocation();
+ StringRef FnName = IRB.GetInsertPoint()->getFunction()->getName();
+ if (DLoc && DLoc.get()) {
+ StringRef SubprogramName = DLoc.get()->getSubprogramLinkageName();
+ if (!SubprogramName.empty())
+ FnName = SubprogramName;
+ }
+ StringRef Name = FnName.take_back(255);
+ Value *&NameVal = GlobalStringMap[Name];
+ if (!NameVal)
+ NameVal = IRB.CreateAddrSpaceCast(
+ IRB.CreateGlobalStringPtr(Name, "", DL.getDefaultGlobalsAddressSpace(),
+ &M),
+ PtrTy);
+ return NameVal;
+}
+// Source file name from the current debug location, or a null pointer
+// when no debug info is attached; cached like getFunctionName.
+Value *GPUSanImpl::getFileName(IRBuilder<> &IRB) {
+ const auto &DLoc = IRB.getCurrentDebugLocation();
+ if (!DLoc || DLoc->getFilename().empty())
+ return ConstantPointerNull::get(PtrTy);
+ StringRef Name = DLoc->getFilename().take_back(255);
+ Value *&NameVal = GlobalStringMap[Name];
+ if (!NameVal)
+ NameVal = IRB.CreateAddrSpaceCast(
+ IRB.CreateGlobalStringPtr(Name, "", DL.getDefaultGlobalsAddressSpace(),
+ &M),
+ PtrTy);
+ return NameVal;
+}
+// Line number from the current debug location; 0 when absent.
+Value *GPUSanImpl::getLineNo(IRBuilder<> &IRB) {
+ const auto &DLoc = IRB.getCurrentDebugLocation();
+ if (!DLoc)
+ return Constant::getNullValue(Int64Ty);
+ return ConstantInt::get(Int64Ty, DLoc.getLine());
+}
+
+// Returns (via out-params) the start/length/tag triple for Object's
+// allocation, querying the runtime at most once per (function, object)
+// pair. The query call is materialized right after Object's defining
+// instruction, or at the function entry if Object is not an instruction.
+void GPUSanImpl::getAllocationInfo(Function &Fn, PtrOrigin PO, Value &Object,
+ Value *&Start, Value *&Length, Value *&Tag) {
+ auto &It = AllocationInfoMap[{&Fn, &Object}];
+ if (!It.Start) {
+ auto *IP = dyn_cast(&Object);
+ if (IP)
+ IP = IP->getNextNode();
+ else
+ IP = &*Fn.getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
+
+ IRBuilder<> IRB(IP);
+ auto *CB = createCall(IRB, getAllocationInfoFn(PO),
+ {IRB.CreateAddrSpaceCast(&Object, getPtrTy(PO))});
+ It.Start = IRB.CreateExtractValue(CB, {0});
+ It.Length = IRB.CreateExtractValue(CB, {1});
+ It.Tag = IRB.CreateExtractValue(CB, {2});
+ }
+ Start = It.Start;
+ Length = It.Length;
+ Tag = It.Tag;
+}
+
+// Classifies the origin of Ptr (LOCAL/GLOBAL/SYSTEM/UNKNOWN/NONE) by
+// walking its underlying objects. Mixed origins collapse to UNKNOWN;
+// AMDGPU implicit-arg/dispatch pointers short-circuit to SYSTEM.
+PtrOrigin GPUSanImpl::getPtrOrigin(LoopInfo &LI, Value *Ptr,
+ const Value **Object) {
+ SmallVector Objects;
+ getUnderlyingObjects(Ptr, Objects, &LI);
+ if (Object && Objects.size() == 1)
+ *Object = Objects.front();
+ PtrOrigin PO = NONE;
+ for (auto *Obj : Objects) {
+ // Without any allocas in the module, everything defaults to GLOBAL.
+ PtrOrigin ObjPO = HasAllocas ? UNKNOWN : GLOBAL;
+ if (isa(Obj)) {
+ ObjPO = LOCAL;
+ } else if (isa(Obj)) {
+ ObjPO = GLOBAL;
+ } else if (auto *II = dyn_cast(Obj)) {
+ if (II->getIntrinsicID() == Intrinsic::amdgcn_implicitarg_ptr ||
+ II->getIntrinsicID() == Intrinsic::amdgcn_dispatch_ptr)
+ return SYSTEM;
+ } else if (auto *CI = dyn_cast(Obj)) {
+ // ompx_*_global / ompx_*_local allocator calls carry their origin
+ // in the callee name.
+ if (auto *Callee = CI->getCalledFunction())
+ if (Callee->getName().starts_with("ompx_")) {
+ if (Callee->getName().ends_with("_global"))
+ ObjPO = GLOBAL;
+ else if (Callee->getName().ends_with("_local"))
+ ObjPO = LOCAL;
+ }
+ } else if (auto *Arg = dyn_cast(Obj)) {
+ if (Arg->getParent()->hasFnAttribute("kernel"))
+ ObjPO = GLOBAL;
+ }
+ if (PO == NONE || PO == ObjPO) {
+ PO = ObjPO;
+ } else {
+ return UNKNOWN;
+ }
+ }
+ return PO;
+}
+
+constexpr StringRef ShadowGlobalPrefix = "__san.global.";
+constexpr StringRef GlobalIgnorePrefix[] = {"__omp_", "llvm.", "_Z",
+ "__sanitizer_", "__san."};
+
+// A "user" global is any named global not matching the runtime/compiler
+// prefixes above; only these get shadow slots.
+bool isUserGlobal(const GlobalVariable &G) {
+ auto Name = G.getName();
+ if (Name.empty())
+ return false;
+ for (const auto &s : GlobalIgnorePrefix) {
+ if (Name.starts_with(s))
+ return false;
+ }
+ return true;
+}
+
+// NOTE(review): returning a Twine built from a temporary StringRef is a
+// dangling-reference hazard -- LLVM documents Twine as unsafe to store or
+// return. Consider returning std::string/SmallString instead (callers
+// also store the result in a local Twine, compounding the issue).
+Twine getShadowGlobalName(const GlobalVariable &G) {
+ return ShadowGlobalPrefix + G.getName();
+}
+
+Function *GPUSanImpl::createApplyShadowGlobalFn(
+ const Twine &Name,
+ llvm::function_ref &, Value *)>
Predicate,
+ llvm::function_ref &, GlobalVariable *, GlobalVariable *)>
+ Codegen) {
+ // Builds a private void() helper "__san.<Name>" that, for every user
+ // global with a shadow slot, loads the shadow pointer, tests it with
+ // Predicate, and runs Codegen on the (user, shadow) pair when the
+ // predicate holds. The CFG is a chain: check_i -> (apply_i ->) check_i+1.
+ Function *RegisterFn =
+ Function::Create(FunctionType::get(VoidTy, false),
+ GlobalValue::PrivateLinkage, "__san." + Name, &M);
+ RegisterFn->addFnAttr(Attribute::DisableSanitizerInstrumentation);
+
+ BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", RegisterFn);
+ IRBuilder<> IRB(Entry);
+
+ // No tracked globals: emit an empty function.
+ if (UserGlobals.empty()) {
+ IRB.CreateRetVoid();
+ return RegisterFn;
+ }
+
+ bool FirstGlobal = true;
+ SmallVector CheckPtrBlocks, ApplicationBlocks;
+ SmallVector Conditions;
+
+ for (auto &[UserGlobal, ShadowGlobal] : UserGlobals) {
+ BasicBlock *CheckBlock;
+ if (FirstGlobal) {
+ // The first check reuses the entry block.
+ CheckBlock = Entry;
+ FirstGlobal = false;
+ } else {
+ auto CheckBlockName = "check_" + UserGlobal->getName() + "_shadow";
+ CheckBlock = BasicBlock::Create(Ctx, CheckBlockName, RegisterFn);
+ IRB.SetInsertPoint(CheckBlock);
+ }
+
+ auto *ShadowVal = IRB.CreateLoad(getPtrTy(GLOBAL), ShadowGlobal);
+ auto *ShadowIntVal = IRB.CreatePtrToInt(ShadowVal, Int64Ty);
+ auto *ShadowPredicate = Predicate(IRB, ShadowIntVal);
+
+ Conditions.push_back(ShadowPredicate);
+ CheckPtrBlocks.push_back(CheckBlock);
+
+ auto AppBlockName = "register_" + UserGlobal->getName() + "_apply";
+ BasicBlock *ApplicationBlock =
+ BasicBlock::Create(Ctx, AppBlockName, RegisterFn);
+ IRB.SetInsertPoint(ApplicationBlock);
+
+ Codegen(IRB, const_cast(UserGlobal), ShadowGlobal);
+ ApplicationBlocks.push_back(ApplicationBlock);
+ }
+
+ BasicBlock *End = BasicBlock::Create(Ctx, Name + "_end", RegisterFn);
+ IRB.SetInsertPoint(End);
+ IRB.CreateRetVoid();
+
+ // Insert block terminators
+ for (size_t i = 0; i < CheckPtrBlocks.size(); i++) {
+ auto NextBlock =
+ (i + 1 < CheckPtrBlocks.size()) ? CheckPtrBlocks[i + 1] : End;
+
+ IRB.SetInsertPoint(CheckPtrBlocks[i]);
+ IRB.CreateCondBr(Conditions[i], ApplicationBlocks[i], NextBlock);
+
+ IRB.SetInsertPoint(ApplicationBlocks[i]);
+ IRB.CreateBr(NextBlock);
+ }
+
+ return RegisterFn;
+}
+
+// Register shadow globals if they haven't already been set by host
+// (predicate: shadow slot is still null).
+Function *GPUSanImpl::createShadowGlobalRegisterFn() {
+ auto PredicateCodegen = [&](IRBuilder<> &IRB, Value *PredicateValue) {
+ return IRB.CreateICmpEQ(PredicateValue, ConstantInt::get(Int64Ty, 0));
+ };
+ auto ShadowFnCodegen = [&](IRBuilder<> &IRB, GlobalVariable *Usr,
+ GlobalVariable *Shadow) {
+ auto *OriginalType = Usr->getValueType();
+ // NOTE(review): "OrginalTypeSize" is misspelled (cannot be renamed in
+ // a comment-only review pass).
+ auto OrginalTypeSize = DL.getTypeAllocSize(OriginalType);
+
+ Value *PlainUserGlobal =
+ IRB.CreatePointerBitCastOrAddrSpaceCast(Usr, getPtrTy(GLOBAL));
+
+ // Register the global with the runtime and store the returned
+ // (sanitizer) pointer into the shadow slot.
+ auto *RegisterGlobalCall =
+ createCall(IRB, getNewFn(GLOBAL),
+ {PlainUserGlobal, ConstantInt::get(Int64Ty, OrginalTypeSize),
+ ConstantInt::get(Int64Ty, AllocationId++),
+ getSourceIndex(Usr), getPC(IRB)});
+ IRB.CreateStore(RegisterGlobalCall, Shadow);
+ };
+ return createApplyShadowGlobalFn("register_globals", PredicateCodegen,
+ ShadowFnCodegen);
+}
+
+// Unregister shadow globals that were registered (predicate: shadow slot
+// is non-null), releasing them through the runtime free function.
+Function *GPUSanImpl::createShadowGlobalUnregisterFn() {
+ auto FreeGlobalFn = getFreeFn(GLOBAL);
+ auto PredicateCodegen = [&](IRBuilder<> &IRB, Value *PredicateValue) {
+ return IRB.CreateICmpNE(PredicateValue, ConstantInt::get(Int64Ty, 0));
+ };
+ auto ShadowFnCodegen = [&](IRBuilder<> &IRB, GlobalVariable *Usr,
+ GlobalVariable *Shadow) {
+ Value *LoadDummyPtr = IRB.CreateLoad(getPtrTy(GLOBAL), Shadow);
+ createCall(IRB, FreeGlobalFn, {LoadDummyPtr, getSourceIndex(Usr)});
+ };
+ return createApplyShadowGlobalFn("unregister_globals", PredicateCodegen,
+ ShadowFnCodegen);
+}
+
+// Emits an (empty) module constructor "__san.ctor".
+// NOTE(review): the call to createShadowGlobalRegisterFn() is commented
+// out, while addDtor() below still calls the unregister helper -- the
+// pair is asymmetric. Presumably registration is done by the host side;
+// confirm, or re-enable the call.
+void GPUSanImpl::addCtor() {
+ Function *CtorFn =
+ Function::Create(FunctionType::get(VoidTy, false),
+ GlobalValue::PrivateLinkage, "__san.ctor", &M);
+ CtorFn->addFnAttr(Attribute::DisableSanitizerInstrumentation);
+
+ BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", CtorFn);
+ IRBuilder<> IRB(Entry);
+
+ // createCall(IRB, createShadowGlobalRegisterFn());
+ IRB.CreateRetVoid();
+
+ appendToGlobalCtors(M, CtorFn, 0, nullptr);
+}
+
+// Emits a module destructor "__san.dtor" that unregisters shadow globals
+// and runs the runtime leak check.
+void GPUSanImpl::addDtor() {
+ Function *DtorFn =
+ Function::Create(FunctionType::get(VoidTy, false),
+ GlobalValue::PrivateLinkage, "__san.dtor", &M);
+ DtorFn->addFnAttr(Attribute::DisableSanitizerInstrumentation);
+ BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", DtorFn);
+ IRBuilder<> IRB(Entry);
+
+ createCall(IRB, createShadowGlobalUnregisterFn());
+ createCall(IRB, getLeakCheckFn());
+
+ IRB.CreateRetVoid();
+ appendToGlobalDtors(M, DtorFn, 0, nullptr);
+}
+
+// Creates one i64 shadow slot "__san.global.<name>" (protected
+// visibility, zero-initialized) per user-visible global.
+// NOTE(review): `Twine ShadowName` stores a Twine built from temporaries
+// (see getShadowGlobalName) -- a documented Twine misuse; hoist to a
+// std::string before this lands.
+bool GPUSanImpl::instrumentGlobals() {
+ bool Changed = false;
+ for (GlobalVariable &V : M.globals()) {
+ if (!isUserGlobal(V))
+ continue;
+ Twine ShadowName = getShadowGlobalName(V);
+ Constant *ShadowInit = Constant::getNullValue(Int64Ty);
+ auto *ShadowVar =
+ new GlobalVariable(M, Int64Ty, false, GlobalValue::ExternalLinkage,
+ ShadowInit, ShadowName);
+ ShadowVar->setVisibility(GlobalValue::ProtectedVisibility);
+ UserGlobals.insert(std::make_pair(&V, ShadowVar));
+ Changed = true;
+ }
+
+ return Changed;
+
+ // NOTE(review): everything below `return Changed;` is dead code kept
+ // from an earlier prototype; drop it or finish it before landing.
+ // Function *DTorFn;
+ // std::tie(DTorFn, std::ignore) = getOrCreateSanitizerCtorAndInitFunctions(
+ // M, "ompx.ctor", "ompx.init",
+ // /*InitArgTypes=*/{},
+ // /*InitArgs=*/{},
+ // This callback is invoked when the functions are created the first
+ // time. Hook them into the global ctors list in that case:
+ // [&](Function *Ctor, FunctionCallee) {
+ // appendToGlobalCtors(M, Ctor, 0, Ctor);
+ // });
+ // return true;
+}
+
+// Wraps an allocation result in a runtime call (Fn) that returns the
+// sanitizer ("fake") pointer, and rewrites all uses -- except lifetime
+// intrinsics, which are forwarded to the sanitizer lifetime hooks.
+Value *GPUSanImpl::instrumentAllocation(Instruction &I, Value &Size,
+ FunctionCallee Fn, PtrOrigin PO) {
+ IRBuilder<> IRB(I.getNextNode());
+ Value *PlainI = IRB.CreatePointerBitCastOrAddrSpaceCast(&I, getPtrTy(PO));
+ auto *CB =
+ createCall(IRB, Fn,
+ {PlainI, &Size, ConstantInt::get(Int64Ty, AllocationId++),
+ getSourceIndex(I), getPC(IRB)},
+ I.getName() + ".san");
+ SmallVector Lifetimes;
+ // Replace uses of the original allocation with the sanitized pointer,
+ // collecting lifetime intrinsics instead of rewriting them.
+ I.replaceUsesWithIf(
+ IRB.CreatePointerBitCastOrAddrSpaceCast(CB, I.getType()), [&](Use &U) {
+ if (auto *LT = dyn_cast(U.getUser())) {
+ Lifetimes.push_back(LT);
+ return false;
+ }
+ return U.getUser() != PlainI && U.getUser() != CB;
+ });
+ if (Lifetimes.empty())
+ return CB;
+
+ // With explicit lifetimes present, the registration call gets size 0;
+ // the sanitizer lifetime hooks below carry the real extents.
+ CB->setArgOperand(1, ConstantInt::get(Int64Ty, 0));
+ for (auto *LT : Lifetimes) {
+ if (LT->getIntrinsicID() == Intrinsic::lifetime_start) {
+ IRB.SetInsertPoint(LT);
+ createCall(IRB, getLifetimeStart(), {CB, LT->getArgOperand(0)});
+ } else {
+ IRB.SetInsertPoint(LT);
+ createCall(IRB, getLifetimeEnd(), {CB, LT->getArgOperand(0)});
+ }
+ }
+ return CB;
+}
+
+// Instruments a (statically sized) alloca as a LOCAL allocation.
+// NOTE(review): dynamically sized allocas hit llvm_unreachable("TODO").
+Value *GPUSanImpl::instrumentAllocaInst(LoopInfo &LI, AllocaInst &AI) {
+ auto SizeOrNone = AI.getAllocationSize(DL);
+ if (!SizeOrNone)
+ llvm_unreachable("TODO");
+ Value *Size = ConstantInt::get(Int64Ty, *SizeOrNone);
+ return instrumentAllocation(AI, *Size, getNewFn(LOCAL), LOCAL);
+}
+
+// Changes GEP instruction PtrOp and ensures instruction type corresponds
+// with new Ptr type. In some cases, the new pointer will not match the
+// original GEP inst's addressspace.
+void changePtrOperand(GetElementPtrInst *GEP, Value *NewPtrOp) {
+ Type *OldType = GEP->getPointerOperandType();
+ GEP->setOperand(GetElementPtrInst::getPointerOperandIndex(), NewPtrOp);
+
+ if (OldType == NewPtrOp->getType())
+ return;
+
+ // Recompute the GEP result type for the new pointer operand (the
+ // address space may differ) and mutate the instruction type to match.
+ SmallVector IdxList;
+ IdxList.reserve(GEP->getNumIndices());
+ for (auto &Usr : GEP->indices())
+ IdxList.push_back(Usr.get());
+
+ auto *ExpectedTy = GetElementPtrInst::getGEPReturnType(NewPtrOp, IdxList);
+
+ if (ExpectedTy != GEP->getType())
+ GEP->mutateType(ExpectedTy);
+}
+
+// Recursively rewrites a pointer chain so that references to a user
+// global are redirected through a load of its shadow slot. GlobalRef
+// receives the materialized shadow load. Handles plain instructions,
+// constant GEP expressions (re-materialized as instructions), and the
+// base case of the global itself.
+Value *GPUSanImpl::replaceUserGlobals(IRBuilder<> &IRB,
+ GlobalVariable *ShadowGlobal,
+ Value *PtrOp, Value *&GlobalRef,
+ Instruction *InsertBefore) {
+ Type *ShadowPtrType = getPtrTy(GLOBAL);
+ auto CreateGlobalRef = [&]() {
+ if (InsertBefore) {
+ GlobalRef =
+ new LoadInst(ShadowPtrType, ShadowGlobal,
+ "load_sg_" + ShadowGlobal->getName(), InsertBefore);
+ } else {
+ GlobalRef = IRB.CreateLoad(ShadowPtrType, ShadowGlobal);
+ }
+ return GlobalRef;
+ };
+
+ if (auto *Inst = dyn_cast(PtrOp)) {
+ // Recurse into the pointer operand, then retarget this instruction.
+ auto *NewOperand = replaceUserGlobals(
+ IRB, ShadowGlobal, Inst->getPointerOperand(), GlobalRef, Inst);
+ changePtrOperand(Inst, NewOperand);
+
+ return Inst;
+ }
+
+ auto *C = dyn_cast(PtrOp);
+ if (C && isa(PtrOp)) {
+ // Constant expression (expected: GEP): lower it to an instruction so
+ // its pointer operand can be replaced with the shadow load.
+ if (auto *Inst = dyn_cast(C->getAsInstruction())) {
+ changePtrOperand(Inst, CreateGlobalRef());
+
+ auto IP = IRB.saveIP();
+ if (InsertBefore)
+ IRB.SetInsertPoint(InsertBefore);
+ auto I = IRB.Insert(Inst);
+ IRB.restoreIP(IP);
+
+ return I;
+ } else {
+ llvm_unreachable("Expected GEP instruction");
+ }
+ }
+
+ return CreateGlobalRef();
+}
+
+// Instruments one memory access: classifies the pointer origin, looks up
+// allocation info when the underlying object is known, emits the runtime
+// check call, and replaces the pointer operand with the checked pointer.
+void GPUSanImpl::instrumentAccess(LoopInfo &LI, Instruction &I, int PtrIdx,
+ Type &AccessTy, bool IsRead,
+ SmallVector &GEPs) {
+ Value *PtrOp = I.getOperand(PtrIdx);
+ const Value *Object = nullptr;
+ PtrOrigin PO = getPtrOrigin(LI, PtrOp, &Object);
+ // Origins past GLOBAL (e.g. SYSTEM) are not instrumented.
+ if (PO > GLOBAL)
+ return;
+
+ Value *Start = nullptr;
+ Value *Length = nullptr;
+ Value *Tag = nullptr;
+ IRBuilder<> IRB(&I);
+
+ if (Object && PO != UNKNOWN) {
+ Value *ObjectRef = const_cast(Object);
+
+ // Replace any references to user-defined global variables
+ // with their respective shadow globals
+ auto *GlobalCast = dyn_cast(ObjectRef);
+ if (GlobalCast && UserGlobals.contains(GlobalCast)) {
+ auto *ShadowGlobal = UserGlobals.lookup(GlobalCast);
+ Value *LoadDummyPtr;
+ PtrOp = replaceUserGlobals(IRB, ShadowGlobal, PtrOp, LoadDummyPtr);
+ ObjectRef = LoadDummyPtr;
+ }
+
+ getAllocationInfo(*I.getFunction(), PO, *ObjectRef, Start, Length, Tag);
+ }
+
+ if (Loop *L = LI.getLoopFor(I.getParent())) {
+
+ // NOTE(review): this unconditional goto disables the entire
+ // SCEV-based check-hoisting path below -- everything from here to
+ // the `handleunhoistable:` label is currently dead (WIP?). Either
+ // remove the goto (and the raw errs() debug output further down,
+ // which must not ship) or delete the dead code before landing.
+ goto handleunhoistable;
+
+ auto &SE = FAM.getResult(*I.getFunction());
+ SCEVExpander Expander = SCEVExpander(SE, DL, "SCEVExpander");
+ const SCEV *PtrExpr = SE.getSCEV(PtrOp);
+
+ const SCEV *ScStart;
+ const SCEV *ScEnd;
+ const SCEV *Step;
+
+ // Loop-invariant pointer: hoist a single check into the preheader.
+ if (SE.isLoopInvariant(PtrExpr, L)) {
+
+ if (!Expander.isSafeToExpand(PtrExpr))
+ goto handleunhoistable;
+
+ // Assumption: Current loop has one unique predecessor
+ // We can insert at the end of the basic block if it
+ // is not a branch instruction.
+ auto *Entry = L->getLoopPreheader();
+
+ if (!Entry)
+ goto handleunhoistable;
+
+ Instruction *PtrOpInst = dyn_cast(PtrOp);
+
+ if (!PtrOpInst)
+ goto handleunhoistable;
+
+ // Get handle to last instruction.
+ auto LoopEnd = --(Entry->end());
+
+ // NOTE(review): function-local statics as access-id counters are
+ // not thread-safe for parallel LLVM pipelines and leak state
+ // across modules -- consider pass-instance members.
+ static int32_t ReadAccessId = -1;
+ static int32_t WriteAccessId = 1;
+ const int32_t &AccessId = IsRead ? ReadAccessId-- : WriteAccessId++;
+
+ auto TySize = DL.getTypeStoreSize(&AccessTy);
+ assert(!TySize.isScalable());
+ Value *Size = ConstantInt::get(Int64Ty, TySize.getFixedValue());
+
+ LoopEnd = --(Entry->end());
+ CallInst *CB;
+ Value *PCVal = getPC(IRB);
+ Instruction *PCInst = dyn_cast(PCVal);
+ if (!PCInst)
+ return;
+
+ Value *AccessIDVal = ConstantInt::get(Int64Ty, AccessId);
+ PCInst->removeFromParent();
+ PCInst->insertBefore(LoopEnd);
+
+ errs() << "PtrOp: " << *PtrOp->getType() << "\n";
+ errs() << "Start: " << *Start->getType() << "\n";
+ errs() << "Length: " << *Length->getType() << "\n";
+ errs() << "Tag: " << *Tag->getType() << "\n";
+ errs() << "Size: " << *Size->getType() << "\n";
+ errs() << "AccessIDVal: " << *AccessIDVal->getType() << "\n";
+
+ FunctionCallee Callee;
+ //Value *PlainPtrOpHoisted =
+ // IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, getPtrTy(PO));
+ //Instruction *PlainPtrOpHoistedIns = dyn_cast(PlainPtrOpHoisted);
+ //if (!PlainPtrOpHoistedIns)
+ // goto handleunhoistable;
+
+ //PlainPtrOpHoistedIns->moveAfter(PCInst);
+ if (Start) {
+ Callee = getCheckWithBaseFn(PO);
+ errs() << "Print Function Callee Signature: " << *Callee.getFunctionType() << "\n";
+ CB = createCall(IRB, Callee,
+ {PtrOp, Start, Length, Tag, Size,
+ ConstantInt::get(Int64Ty, AccessId), getSourceIndex(I),
+ PCVal});
+ } else {
+ Callee = getCheckFn(PO);
+ CB = createCall(IRB, Callee,
+ {PtrOp, Size, ConstantInt::get(Int64Ty, AccessId),
+ getSourceIndex(I), PCVal});
+ }
+ CB->removeFromParent();
+ CB->insertAfter(PCInst);
+
+ // get real pointer from the fake pointer.
+ //Value *PlainPtrOp =
+ // IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, getPtrTy(PO));
+ //auto *CBUnpack =
+ // createCall(IRB, getUnpackFn(PO), {PlainPtrOp, getPC(IRB)},
+ // PtrOp->getName() + ".unpack");
+
+ I.setOperand(PtrIdx, IRB.CreatePointerBitCastOrAddrSpaceCast(CB, PtrOp->getType()));
+
+ return;
+
+ } else {
+ // Affine (add-rec) pointer: hoist a range check over the loop's
+ // [start, end] access interval into the preheader.
+ const SCEVAddRecExpr *AddRecExpr = dyn_cast(PtrExpr);
+ if (AddRecExpr) {
+
+ auto *Entry = L->getLoopPreheader();
+
+ if (!Entry)
+ goto handleunhoistable;
+
+ const SCEV *Ex = SE.getSymbolicMaxBackedgeTakenCount(L);
+
+ ScStart = AddRecExpr->getStart();
+ ScEnd = AddRecExpr->evaluateAtIteration(Ex, SE);
+ Step = AddRecExpr->getStepRecurrence(SE);
+
+ // Negative constant step: the interval bounds are swapped.
+ if (const auto *CStep = dyn_cast(Step)) {
+ if (CStep->getValue()->isNegative())
+ std::swap(ScStart, ScEnd);
+ } else {
+ ScStart = SE.getUMinExpr(ScStart, ScEnd);
+ ScEnd = SE.getUMaxExpr(AddRecExpr->getStart(), ScEnd);
+ }
+
+ if (!Expander.isSafeToExpand(ScStart))
+ goto handleunhoistable;
+
+ if (!Expander.isSafeToExpand(ScEnd))
+ goto handleunhoistable;
+
+ // Get handle to last instruction.
+ auto LoopEnd = --(Entry->end());
+ Instruction *LoopEndInst = &*LoopEnd;
+
+ Type *Int64Ty = Type::getInt64Ty(Ctx);
+ Value *LowerBoundCode =
+ Expander.expandCodeFor(ScStart, nullptr, LoopEnd);
+
+ LoopEnd = --(Entry->end());
+
+ Value *UpperBoundCode = Expander.expandCodeFor(ScEnd, nullptr, LoopEnd);
+ static int32_t ReadAccessId = -1;
+ static int32_t WriteAccessId = 1;
+ const int32_t &AccessId = IsRead ? ReadAccessId-- : WriteAccessId++;
+
+ auto TySize = DL.getTypeStoreSize(&AccessTy);
+ assert(!TySize.isScalable());
+ Value *Size = ConstantInt::get(Int64Ty, TySize.getFixedValue());
+
+ LoopEnd = --(Entry->end());
+
+ CallInst *CB;
+ Value *PCVal = getPC(IRB);
+ Instruction *PCInst = dyn_cast(PCVal);
+
+ if (!PCInst)
+ return;
+
+ Value *AccessIDVal = ConstantInt::get(Int64Ty, AccessId);
+ PCInst->removeFromParent();
+ PCInst->insertBefore(LoopEnd);
+
+ FunctionCallee Callee;
+
+ errs() << "UpperBoundCode: " << *UpperBoundCode->getType() << "\n";
+ errs() << "LowerBoundCode: " << *LowerBoundCode->getType() << "\n";
+ errs() << "Start: " << *Start->getType() << "\n";
+ errs() << "Length: " << *Length->getType() << "\n";
+ errs() << "Tag: " << *Tag->getType() << "\n";
+ errs() << "Size: " << *Size->getType() << "\n";
+ errs() << "AccessIDVal: " << *AccessIDVal->getType() << "\n";
+
+ if (Start) {
+ Callee = getCheckRangeWithBaseFn(PO, UpperBoundCode->getType(), LowerBoundCode->getType());
+ errs() << "Print Function Callee Signature: " << *Callee.getFunctionType() << "\n";
+ CB = createCall(IRB, Callee,
+ {UpperBoundCode, LowerBoundCode, Start, Length, Tag,
+ Size, AccessIDVal, getSourceIndex(I), PCVal});
+ } else {
+ Callee = getCheckRangeFn(PO, UpperBoundCode->getType(), LowerBoundCode->getType());
+ errs() << "Print Function Callee Signature: " << *Callee.getFunctionType() << "\n";
+ CB = createCall(IRB, Callee,
+ {UpperBoundCode, LowerBoundCode, Size, AccessIDVal,
+ getSourceIndex(I), PCVal});
+ }
+ CB->removeFromParent();
+ CB->insertAfter(PCInst);
+
+ // Convert fake pointer to real pointer.
+ Value *PlainPtrOp =
+ IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, getPtrTy(PO));
+ auto *CBUnpack =
+ createCall(IRB, getUnpackFn(PO), {PlainPtrOp, getPC(IRB)},
+ PtrOp->getName() + ".unpack");
+
+ I.setOperand(PtrIdx, IRB.CreatePointerBitCastOrAddrSpaceCast(
+ CBUnpack, PtrOp->getType()));
+
+ return;
+
+ } else {
+ goto handleunhoistable;
+ }
+ }
+ }
+
+handleunhoistable:
+
+ // Fallback: emit a per-access check in place. Read accesses get
+ // negative ids, writes positive ids.
+ static int32_t ReadAccessId = -1;
+ static int32_t WriteAccessId = 1;
+ const int32_t &AccessId = IsRead ? ReadAccessId-- : WriteAccessId++;
+
+ auto TySize = DL.getTypeStoreSize(&AccessTy);
+ assert(!TySize.isScalable());
+ Value *Size = ConstantInt::get(Int64Ty, TySize.getFixedValue());
+
+ Value *PlainPtrOp =
+ IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, getPtrTy(PO));
+
+ CallInst *CB;
+ if (Start) {
+ CB = createCall(IRB, getCheckWithBaseFn(PO),
+ {PlainPtrOp, Start, Length, Tag, Size,
+ ConstantInt::get(Int64Ty, AccessId), getSourceIndex(I),
+ getPC(IRB)},
+ I.getName() + ".san");
+ } else {
+ CB = createCall(IRB, getCheckFn(PO),
+ {PlainPtrOp, Size, ConstantInt::get(Int64Ty, AccessId),
+ getSourceIndex(I), getPC(IRB)},
+ I.getName() + ".san");
+ }
+
+ I.setOperand(PtrIdx,
+ IRB.CreatePointerBitCastOrAddrSpaceCast(CB, PtrOp->getType()));
+}
+
+// Batches the checks for all loads/stores of one basic block, packing
+// their arguments into arrays so a single runtime call can validate
+// several pointers at once.
+void GPUSanImpl::instrumentMultipleAccessPerBasicBlock(
+ LoopInfo &LI,
+ SmallVector &AccessCausingInstructionInABasicBlock,
+ Function &Fn) {
+
+ if (AccessCausingInstructionInABasicBlock.empty())
+ return;
+
+ // Parallel arrays for accesses with known allocation info ("Base").
+ SmallVector InstructionsFromBase;
+ SmallVector PtrIdxListBase;
+ SmallVector PtrOpsBase;
+ SmallVector PlainPtrOpsBase;
+ SmallVector StartsBase;
+ SmallVector LengthsBase;
+ SmallVector TagsBase;
+ SmallVector SizesBase;
+ SmallVector AccessIdsBase;
+ SmallVector SourceIdsBase;
+ SmallVector PointerOriginsBase;
+
+ // Parallel arrays for accesses without allocation info.
+ SmallVector InstructionsWithoutBase;
+ SmallVector PtrIdxList;
+ SmallVector PtrOps;
+ SmallVector PlainPtrOps;
+ SmallVector Starts;
+ SmallVector Lengths;
+ SmallVector Tags;
+ SmallVector Sizes;
+ SmallVector
AccessIds; + SmallVector SourceIds; + SmallVector PointerOrigins; + + IRBuilder<> IRB(AccessCausingInstructionInABasicBlock.front()); + + //Type *PtrOpType; + for (Instruction *I : AccessCausingInstructionInABasicBlock) { + + int PtrIdx = -1; + Type *AccessTy; + bool IsRead; + if (LoadInst *Load = dyn_cast(I)) { + PtrIdx = LoadInst::getPointerOperandIndex(); + AccessTy = Load->getType(); + IsRead = true; + + } else if (StoreInst *Store = dyn_cast(I)) { + PtrIdx = StoreInst::getPointerOperandIndex(); + AccessTy = Store->getValueOperand()->getType(); + IsRead = true; + } else { + continue; + } + + Value *PtrOp = I->getOperand(PtrIdx); + //PtrOpType = PtrOp->getType(); + const Value *Object = nullptr; + PtrOrigin PO = getPtrOrigin(LI, PtrOp, &Object); + + if (PO > GLOBAL) + continue; + + Value *Start = nullptr; + Value *Length = nullptr; + Value *Tag = nullptr; + if (PO != UNKNOWN && Object) + getAllocationInfo(*I->getFunction(), PO, *const_cast(Object), + Start, Length, Tag); + + if (Loop *L = LI.getLoopFor(I->getParent())) { + // auto &SE = FAM.getResult(*I->getFunction()); + // auto *PtrOpScev = SE.getSCEVAtScope(PtrOp, L); + // const auto &LD = SE.getLoopDisposition(PtrOpScev, L); + // SmallVector Preds; + // SmallPtrSet PredsSet; + // for (auto *Pred : Preds) + // PredsSet.insert(Pred); + // auto *Ex = SE.getPredicatedBackedgeTakenCount(L, Preds); + + // errs() << "Loop Disposition: " << LD << "\n"; + // errs() << "ABS Expression: " << SE.getSmallConstantTripCount(L) << + // "\n"; const SCEVAddRecExpr *AR = + // SE.convertSCEVToAddRecWithPredicates(PtrOpScev, L, PredsSet); + + // const SCEV *ScStart = AR->getStart(); + // const SCEV *ScEnd = AR->evaluateAtIteration(Ex, SE); + // const SCEV *Step = AR->getStepRecurrence(SE); + + // // // For expressions with negative step, the upper bound is ScStart + // and + // // the + // // // lower bound is ScEnd. 
+ // if (const SCEVConstant *CStep = dyn_cast(Step)) {
+ // if (CStep->getValue()->isNegative())
+ // std::swap(ScStart, ScEnd);
+ // } else {
+ // // Fallback case: the step is not constant, but the we can still
+ // // get the upper and lower bounds of the interval by using min/max
+ // // expressions.
+ // ScStart = SE.getUMinExpr(ScStart, ScEnd);
+ // ScEnd = SE.getUMaxExpr(AR->getStart(), ScEnd);
+ // }
+
+ // errs() << "SC step: " << *Step << "\n";
+ // errs() << "Sc start: " << *ScStart << "\n";
+ // errs() << "Sc end: " << *ScEnd << "\n";
+ // ScEnd->print(errs());
+ // errs() << "\n";
+ // ScEnd->dump();
+ // errs() << "\n";
+
+ // ArrayRef Ops = ScEnd->operands();
+ // errs() << "\n";
+ // for (auto *Op : Ops) {
+ // errs() << "Operand: " << *Op << "\n";
+ // errs() << "Operand Scev Type: " << Op->getSCEVType() << "\n";
+ // errs() << "Operand Type: " << *Op->getType() << "\n";
+ // }
+ // errs() << "\n";
+
+ // errs() << "Scev Type: " << ScEnd->getSCEVType() << "\n";
+ // errs() << "Type: " << *ScEnd->getType() << "\n";
+ // errs() << "Is Non Constant Negative: " <<
+ // ScEnd->isNonConstantNegative()
+ // << "\n";
+ // errs() << "PtrOp: " << *PtrOp << "\n";
+ }
+
+ // Per-access ids: reads count down from -1, writes count up from 1.
+ // NOTE(review): function-local statics are not thread-safe for
+ // parallel pass pipelines -- consider pass-instance members.
+ static int32_t ReadAccessId = -1;
+ static int32_t WriteAccessId = 1;
+ const int32_t &AccessId = IsRead ? ReadAccessId-- : WriteAccessId++;
+
+ auto TySize = DL.getTypeStoreSize(AccessTy);
+ assert(!TySize.isScalable());
+ Value *Size = ConstantInt::get(Int64Ty, TySize.getFixedValue());
+
+ Value *PlainPtrOp =
+ IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, getPtrTy(PO));
+ if (Start) {
+ if (PO == GLOBAL) {
+ // GLOBAL accesses with allocation info are deferred for batching.
+ InstructionsFromBase.push_back(I);
+ PtrIdxListBase.push_back(PtrIdx);
+ PtrOpsBase.push_back(PtrOp);
+ PlainPtrOpsBase.push_back(PlainPtrOp);
+ StartsBase.push_back(Start);
+ LengthsBase.push_back(Length);
+ TagsBase.push_back(Tag);
+ SizesBase.push_back(Size);
+ AccessIdsBase.push_back(ConstantInt::get(Int64Ty, AccessId));
+ SourceIdsBase.push_back(getSourceIndex(*I));
+ PointerOriginsBase.push_back(PO);
+ } else {
+
+ // Non-GLOBAL: fall back to an individual check call.
+ CallInst *CB;
+ CB = createCall(IRB, getCheckWithBaseFn(PO),
+ {PlainPtrOp, Start, Length, Tag, Size,
+ ConstantInt::get(Int64Ty, AccessId),
+ getSourceIndex(*I), getPC(IRB)},
+ I->getName() + ".san");
+
+ I->setOperand(PtrIdx, IRB.CreatePointerBitCastOrAddrSpaceCast(
+ CB, PtrOp->getType()));
+ }
+ } else {
+ if (PO == GLOBAL) {
+ // GLOBAL accesses without allocation info, also batched.
+ InstructionsWithoutBase.push_back(I);
+ PtrIdxList.push_back(PtrIdx);
+ PtrOps.push_back(PtrOp);
+ PlainPtrOps.push_back(PlainPtrOp);
+ Sizes.push_back(Size);
+ AccessIds.push_back(ConstantInt::get(Int64Ty, AccessId));
+ SourceIds.push_back(getSourceIndex(*I));
+ PointerOrigins.push_back(PO);
+ } else {
+ CallInst *CB;
+ CB = createCall(IRB, getCheckFn(PO),
+ {PlainPtrOp, Size, ConstantInt::get(Int64Ty, AccessId),
+ getSourceIndex(*I), getPC(IRB)},
+ I->getName() + ".san");
+
+ I->setOperand(PtrIdx, IRB.CreatePointerBitCastOrAddrSpaceCast(
+ CB, PtrOp->getType()));
+ }
+ }
+ }
+
+ BasicBlock &EntryBlock = Fn.getEntryBlock();
+ auto EntryBlockEnd = (--EntryBlock.end());
+
+ // // Create the malloc function declaration
+ // FunctionType *MallocType = llvm::FunctionType::get(
+ // Int8PtrType, // Return type: i8*
+ // {llvm::Type::getInt64Ty(Ctx)}, // Parameter type: i64
+ // false // Not variadic
+ // );
+ // llvm::Function *MallocFunc = llvm::Function::Create(
+ // MallocType,
+ // llvm::Function::ExternalLinkage,
+ // "malloc", M );
+
+ // Sanitize multiple pointers in one call.
+ // NOTE(review): the alloca-packing below uses hard-coded address space
+ // 5 (appears to be AMDGPU private -- confirm) and leaves exploratory
+ // malloc-based variants commented out; the raw errs() dumps further
+ // down must not ship.
+ if (!PlainPtrOpsBase.empty()) {
+ CallInst *CB;
+ uint64_t NumElements = PlainPtrOpsBase.size();
+ // ArrayType for array of plain pointer ops from base
+ auto *PlainPtrOpsBaseTy = ArrayType::get(PtrTy, NumElements);
+ // Make Alloca to array type
+ unsigned int Addr = 5;
+ AllocaInst *PlainPtrOpsBaseArr = IRB.CreateAlloca(PlainPtrOpsBaseTy, Addr);
+ PlainPtrOpsBaseArr->moveBefore(&*EntryBlockEnd);
+
+ //create Malloc call
+ // Value *AllocSize = IRB.getInt64(M.getDataLayout().getTypeAllocSize(PtrTy) * PlainPtrOpsBase.size());
+ // CallInst *MallocCallPtrOp = IRB.CreateCall(MallocFunc, AllocSize);
+ // MallocCallPtrOp->moveBefore(&*EntryBlockEnd);
+ // Type *IntPtrPtrTy = PtrTy->getPointerTo();
+ // Instruction *PlainPtrOpsBaseArr = dyn_cast(IRB.CreateBitCast(MallocCallPtrOp, IntPtrPtrTy));
+ // PlainPtrOpsBaseArr->moveAfter(MallocCallPtrOp);
+
+
+ //Constant *NullVal = ConstantAggregateZero::get(PlainPtrOpsBaseTy);
+ //GlobalVariable *PlainPtrOpsBaseArr = new GlobalVariable(M, PlainPtrOpsBaseTy, false, GlobalValue::ExternalLinkage, NullVal,"", nullptr, GlobalValue::NotThreadLocal, 0);
+ int Index = 0;
+ for (auto &Element : PlainPtrOpsBase) {
+ StoreInst *Store = IRB.CreateStore(
+ Element,
+ IRB.CreateGEP(PlainPtrOpsBaseTy, PlainPtrOpsBaseArr,
+ {ConstantInt::get(Type::getInt32Ty(Ctx), Index)}));
+ Index++;
+ }
+
+ auto *StartsBaseTy = ArrayType::get(PtrTy, NumElements);
+ AllocaInst *StartsBaseArr = IRB.CreateAlloca(StartsBaseTy, Addr);
+ StartsBaseArr->moveBefore(&*EntryBlockEnd);
+ //Type *IntPtrPtrTy = PtrTy->getPointerTo();
+ //NullVal = ConstantAggregateZero::get(StartsBaseTy);
+ //GlobalVariable *StartsBaseArr = new GlobalVariable(M, StartsBaseTy, false, GlobalValue::ExternalLinkage, NullVal, "", nullptr, GlobalValue::NotThreadLocal, 0);
+
+ //malloc call
+ // AllocSize = IRB.getInt64(M.getDataLayout().getTypeAllocSize(PtrTy) * StartsBase.size());
+ // CallInst *MallocCallStarts = IRB.CreateCall(MallocFunc, AllocSize);
+ // MallocCallStarts->moveBefore(&*EntryBlockEnd);
+ // Instruction *StartsBaseArr = dyn_cast(IRB.CreateBitCast(MallocCallStarts, IntPtrPtrTy));
+ // StartsBaseArr->moveAfter(MallocCallStarts);
+
+ Index = 0;
+ for (auto &Element : StartsBase) {
+ StoreInst *Store = IRB.CreateStore(
+ Element,
+ IRB.CreateGEP(StartsBaseTy, StartsBaseArr,
+ {ConstantInt::get(Type::getInt32Ty(Ctx), Index)}));
+ Index++;
+ }
+
+ auto *LengthsBaseTy = ArrayType::get(Int64Ty, NumElements);
+ AllocaInst *LengthsBaseArr = IRB.CreateAlloca(LengthsBaseTy, Addr);
+ LengthsBaseArr->moveBefore(&*EntryBlockEnd);
+ //NullVal = ConstantAggregateZero::get(LengthsBaseTy);
+ //GlobalVariable *LengthsBaseArr = new GlobalVariable(M, LengthsBaseTy, false, GlobalValue::ExternalLinkage, NullVal, "", nullptr, GlobalValue::NotThreadLocal, 0);
+
+ //create Malloc call
+ // AllocSize = IRB.getInt64(M.getDataLayout().getTypeAllocSize(Int64Ty) * LengthsBase.size());
+ // CallInst *MallocCallLengths = IRB.CreateCall(MallocFunc, AllocSize);
+ // MallocCallLengths->moveBefore(&*EntryBlockEnd);
+ // Type *PtrToType = Int64Ty->getPointerTo();
+ // Instruction *LengthsBaseArr = dyn_cast(IRB.CreateBitCast(MallocCallLengths, PtrToType));
+ // LengthsBaseArr->moveAfter(MallocCallLengths);
+
+
+ Index = 0;
+ for (auto &Element : LengthsBase) {
+ StoreInst *Store = IRB.CreateStore(
+ Element,
+ IRB.CreateGEP(LengthsBaseTy, LengthsBaseArr,
+ {ConstantInt::get(Type::getInt32Ty(Ctx), Index)}));
+ Index++;
+ }
+
+ auto *TagsBaseTy = ArrayType::get(Int32Ty, NumElements);
+ AllocaInst *TagsBaseArr = IRB.CreateAlloca(TagsBaseTy, Addr);
+ TagsBaseArr->moveBefore(&*EntryBlockEnd);
+ //NullVal = ConstantAggregateZero::get(TagsBaseTy);
+ //GlobalVariable *TagsBaseArr = new GlobalVariable(M, TagsBaseTy, false, GlobalValue::ExternalLinkage, NullVal, "", nullptr, GlobalValue::NotThreadLocal, 0);
+
+ //create Malloc call
+ // AllocSize = IRB.getInt64(M.getDataLayout().getTypeAllocSize(Int32Ty) * TagsBase.size());
+ // CallInst *MallocCallTags = IRB.CreateCall(MallocFunc, AllocSize);
+ // MallocCallTags->moveBefore(&*EntryBlockEnd);
+ // PtrToType = Int32Ty->getPointerTo();
+ // Instruction *TagsBaseArr = dyn_cast(IRB.CreateBitCast(MallocCallTags, PtrToType));
+ // TagsBaseArr->moveAfter(MallocCallTags);
+
+ Index = 0;
+ for (auto &Element : TagsBase) {
+ StoreInst *Store = IRB.CreateStore(
+ Element,
+ IRB.CreateGEP(TagsBaseTy, TagsBaseArr,
+ {ConstantInt::get(Type::getInt32Ty(Ctx), Index)}));
+ Index++;
+ }
+
+ auto *SizesBaseTy = ArrayType::get(Int64Ty, NumElements);
+ auto *SizesBaseArr = IRB.CreateAlloca(SizesBaseTy, Addr);
+ SizesBaseArr->moveBefore(&*EntryBlockEnd);
+ //NullVal = ConstantAggregateZero::get(SizesBaseTy);
+ //GlobalVariable *SizesBaseArr = new GlobalVariable(M, SizesBaseTy, false, GlobalValue::ExternalLinkage, NullVal, "", nullptr, GlobalValue::NotThreadLocal, 0);
+
+ //create Malloc call
+ // AllocSize = IRB.getInt64(M.getDataLayout().getTypeAllocSize(Int64Ty) * SizesBase.size());
+ // CallInst *MallocCallSizes = IRB.CreateCall(MallocFunc, AllocSize);
+ // MallocCallSizes->moveBefore(&*EntryBlockEnd);
+ // PtrToType = Int64Ty->getPointerTo();
+ // Instruction *SizesBaseArr = dyn_cast(IRB.CreateBitCast(MallocCallSizes, PtrToType));
+ // SizesBaseArr->moveAfter(MallocCallSizes);
+
+ Index = 0;
+ for (auto &Element : SizesBase) {
+ StoreInst *Store = IRB.CreateStore(
+ Element,
+ IRB.CreateGEP(SizesBaseTy, SizesBaseArr,
+ {ConstantInt::get(Type::getInt32Ty(Ctx), Index)}));
+ Index++;
+ }
+
+ auto *AccessIdsBaseTy = ArrayType::get(Int64Ty, NumElements);
+ auto *AccessIdsBaseArr = IRB.CreateAlloca(AccessIdsBaseTy, Addr);
+ AccessIdsBaseArr->moveBefore(&*EntryBlockEnd);
+ //NullVal = ConstantAggregateZero::get(AccessIdsBaseTy);
+ //GlobalVariable *AccessIdsBaseArr = new GlobalVariable(M, AccessIdsBaseTy, false, GlobalValue::ExternalLinkage, NullVal, "", nullptr, GlobalValue::NotThreadLocal, 0);
+
+ //create Malloc call
+ // AllocSize = IRB.getInt64(M.getDataLayout().getTypeAllocSize(Int64Ty) * AccessIdsBase.size());
+ // CallInst *MallocCallAccess = IRB.CreateCall(MallocFunc, AllocSize);
+ // MallocCallAccess->moveBefore(&*EntryBlockEnd);
+ // PtrToType = Int64Ty->getPointerTo();
+ // Instruction *AccessIdsBaseArr = dyn_cast(IRB.CreateBitCast(MallocCallAccess, PtrToType));
+ // AccessIdsBaseArr->moveAfter(MallocCallAccess);
+
+ Index = 0;
+ for (auto &Element : AccessIdsBase) {
+ StoreInst *Store = IRB.CreateStore(
+ Element,
+ IRB.CreateGEP(AccessIdsBaseTy, AccessIdsBaseArr,
+ {ConstantInt::get(Type::getInt32Ty(Ctx), Index)}));
+ Index++;
+ }
+
+ auto *SourceIdsBaseTy = ArrayType::get(Int64Ty, NumElements);
+ auto *SourceIdsBaseArr = IRB.CreateAlloca(SourceIdsBaseTy, Addr);
+ SourceIdsBaseArr->moveBefore(&*EntryBlockEnd);
+ //NullVal = ConstantAggregateZero::get(SourceIdsBaseTy);
+ //GlobalVariable *SourceIdsBaseArr = new GlobalVariable(M, SourceIdsBaseTy, false, GlobalValue::ExternalLinkage, NullVal, "", nullptr, GlobalValue::NotThreadLocal, 0);
+
+ //create Malloc call
+ // AllocSize = IRB.getInt64(M.getDataLayout().getTypeAllocSize(Int64Ty) * SourceIdsBase.size());
+ // CallInst *MallocCallSources = IRB.CreateCall(MallocFunc, AllocSize);
+ // MallocCallSources->moveBefore(&*EntryBlockEnd);
+ // PtrToType = Int64Ty->getPointerTo();
+ // Instruction *SourceIdsBaseArr = dyn_cast(IRB.CreateBitCast(MallocCallSources, PtrToType));
+ // SourceIdsBaseArr->moveAfter(MallocCallSources);
+
+ Index = 0;
+ for (auto &Element : SourceIdsBase) {
+ StoreInst *Store = IRB.CreateStore(
+ Element,
+ IRB.CreateGEP(SourceIdsBaseTy, SourceIdsBaseArr,
+ {ConstantInt::get(Type::getInt32Ty(Ctx), Index)}));
+ Index++;
+ }
+
+ FunctionCallee Callee =
+ getCheckWithBaseFnVector(NumElements, PlainPtrOpsBaseArr->getType());
+
+ errs() << "Print Function Callee Signature: " << *Callee.getFunctionType()
+ << "\n";
+
+ errs() << "PlainPtrOpsBaseArr: " << *PlainPtrOpsBaseArr->getType() << "\n";
+ errs() << "StartsBaseArr: " << *StartsBaseArr->getType() << "\n";
+ errs() << "LengthsBaseArr: " << *LengthsBaseArr->getType() << "\n";
+ errs() << "TagsBaseArr: " << *TagsBaseArr->getType() << "\n";
+ errs() << "SizesBaseArr: " << *SizesBaseArr->getType() << "\n";
+ errs() << "AccessIdsBaseArr: " << *AccessIdsBaseArr->getType() << "\n";
+ errs() << "SourceIdsBaseArr: " << *SourceIdsBaseArr->getType() << "\n";
+
+ CB = createCall(IRB, Callee,
+ {PlainPtrOpsBaseArr, StartsBaseArr, LengthsBaseArr,
+ TagsBaseArr, SizesBaseArr, AccessIdsBaseArr,
+ SourceIdsBaseArr, getPC(IRB),
+ ConstantInt::get(Int64Ty, NumElements)},
+ ".san_vector");
+
+ // Set the current operand from the result of the sanitization call.
+ Index = 0;
+ for (auto *I : InstructionsFromBase) {
+ Value *ValueIndex = ConstantInt::get(Type::getInt32Ty(Ctx), Index);
+ // NOTE(review): CB->getType() is used as the GEP *element* type here;
+ // if the vector check returns a pointer to an array of results this
+ // looks wrong -- double-check against the runtime's return type.
+ Value *GEPForLoad = IRB.CreateGEP(CB->getType(), CB, {ValueIndex});
+ LoadInst *Load = IRB.CreateLoad(PtrTy, GEPForLoad);
+ int PtrIdx = PtrIdxListBase[Index];
+ PtrOrigin PO = PointerOriginsBase[Index];
+ Value *PtrOp = PtrOpsBase[Index];
+
+ // Still need to get the real pointer from the pointer op.
+ // Convert fake pointer to real pointer.
+ //Value *PlainPtrOp =
+ // IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, getPtrTy(PO));
+
+ //auto *CBUnpack =
+ // createCall(IRB, getUnpackFn(PO), {PlainPtrOp, getPC(IRB)},
+ // PtrOp->getName() + ".unpack");
+
+ errs() << "PtrOp Type" << *PtrOp->getType() << "\n";
+
+
+ I->setOperand(PtrIdx, IRB.CreatePointerBitCastOrAddrSpaceCast(
+ Load, PtrOp->getType()));
+
+
+ Index++;
+ }
+ }
+
+ // Same packing scheme for accesses without allocation info.
+ if (!PlainPtrOps.empty()) {
+ CallInst *CB;
+ uint64_t NumElements = PlainPtrOps.size();
+ auto *PlainPtrOpsTy = ArrayType::get(PtrTy, NumElements);
+ unsigned int Addr = 5;
+ auto *PlainPtrOpsArr = IRB.CreateAlloca(PlainPtrOpsTy, Addr);
+ PlainPtrOpsArr->moveBefore(&*EntryBlockEnd);
+ //auto *NullVal = ConstantAggregateZero::get(PlainPtrOpsTy);
+ //GlobalVariable *PlainPtrOpsArr = new GlobalVariable(M, PlainPtrOpsTy, false, GlobalValue::ExternalLinkage, NullVal, "", nullptr, GlobalValue::NotThreadLocal, 0);
+ int Index = 0;
+ for (auto &Element : PlainPtrOps) {
+ IRB.CreateStore(
+ Element,
+ IRB.CreateGEP(PlainPtrOpsTy, PlainPtrOpsArr,
+ {ConstantInt::get(Type::getInt32Ty(Ctx), Index)}));
+ Index++;
+ }
+
+ auto *SizesTy = ArrayType::get(Int64Ty, NumElements);
+ auto *SizesArr = IRB.CreateAlloca(SizesTy, Addr);
+ SizesArr->moveBefore(&*EntryBlockEnd);
+ //NullVal = ConstantAggregateZero::get(SizesTy);
+ //GlobalVariable *SizesArr = new GlobalVariable(M, SizesTy, false, GlobalValue::ExternalLinkage, NullVal, "", nullptr, GlobalValue::NotThreadLocal, 0);
+ Index = 0;
+ for (auto &Element : Sizes) {
+ IRB.CreateStore(
+ Element,
+ IRB.CreateGEP(SizesTy, SizesArr,
+ {ConstantInt::get(Type::getInt32Ty(Ctx), Index)}));
+ Index++;
+ }
+
+ auto *AccessIdsTy = ArrayType::get(Int64Ty, NumElements);
+ auto *AccessIdsArr = IRB.CreateAlloca(AccessIdsTy, Addr);
+ AccessIdsArr->moveBefore(&*EntryBlockEnd);
+ //NullVal = ConstantAggregateZero::get(AccessIdsTy);
+ //GlobalVariable *AccessIdsArr = new GlobalVariable(M, AccessIdsTy, false, GlobalValue::ExternalLinkage, NullVal, "", nullptr,
GlobalValue::NotThreadLocal, 0); + Index = 0; + for (auto &Element : AccessIds) { + IRB.CreateStore( + Element, + IRB.CreateGEP(AccessIdsTy, AccessIdsArr, + {ConstantInt::get(Type::getInt32Ty(Ctx), Index)})); + Index++; + } + + auto *SourceIdsTy = ArrayType::get(Int64Ty, NumElements); + auto *SourceIdsArr = IRB.CreateAlloca(SourceIdsTy, Addr); + SourceIdsArr->moveBefore(&*EntryBlockEnd); + //NullVal = ConstantAggregateZero::get(SourceIdsTy); + //GlobalVariable *SourceIdsArr = new GlobalVariable(M, SourceIdsTy, false, GlobalValue::ExternalLinkage, NullVal, "", nullptr, GlobalValue::NotThreadLocal, 0); + Index = 0; + for (auto &Element : SourceIds) { + IRB.CreateStore( + Element, + IRB.CreateGEP(SourceIdsTy, SourceIdsArr, + {ConstantInt::get(Type::getInt32Ty(Ctx), Index)})); + Index++; + } + + CB = createCall(IRB, getCheckFnVector(NumElements), + {PlainPtrOpsArr, SizesArr, AccessIdsArr, SourceIdsArr, + getPC(IRB), ConstantInt::get(Int64Ty, NumElements)}, + ".san_vector"); + + // Set the current operand from the result of the sanitization call. + Index = 0; + for (Instruction *I : InstructionsWithoutBase) { + Value *ValueIndex = ConstantInt::get(Type::getInt32Ty(Ctx), Index); + Value *GEPForLoad = IRB.CreateGEP(CB->getType(), CB, {ValueIndex}); + LoadInst *Load = IRB.CreateLoad(PtrTy, GEPForLoad); + int PtrIdx = PtrIdxList[Index]; + Value *PtrOp = PtrOps[Index]; + PtrOrigin PO = PointerOrigins[Index]; + + // Still need to get the real pointer from the pointer op. + // Convert fake pointer to real pointer. 
+ //Value *PlainPtrOp = + // IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, getPtrTy(PO)); + + //auto *CBUnpack = + // createCall(IRB, getUnpackFn(PO), {PlainPtrOp, getPC(IRB)}, + // PtrOp->getName() + ".unpack"); + + I->setOperand(PtrIdx, Load); + + Index++; + } + } +} + +void GPUSanImpl::instrumentLoadInst(LoopInfo &LI, LoadInst &LoadI, + SmallVector &GEPs) { + instrumentAccess(LI, LoadI, LoadInst::getPointerOperandIndex(), + *LoadI.getType(), + /*IsRead=*/true, GEPs); +} + +void GPUSanImpl::instrumentStoreInst(LoopInfo &LI, StoreInst &StoreI, + SmallVector &GEPs) { + instrumentAccess(LI, StoreI, StoreInst::getPointerOperandIndex(), + *StoreI.getValueOperand()->getType(), /*IsRead=*/false, + GEPs); +} + +void GPUSanImpl::instrumentGEPInst(LoopInfo &LI, GetElementPtrInst &GEP) { + Value *PtrOp = GEP.getPointerOperand(); + PtrOrigin PO = getPtrOrigin(LI, PtrOp); + if (PO > GLOBAL) + return; + + GEP.setOperand(GetElementPtrInst::getPointerOperandIndex(), + Constant::getNullValue(PtrOp->getType())); + IRBuilder<> IRB(GEP.getNextNode()); + Value *PlainPtrOp = + IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, getPtrTy(PO)); + auto *CB = createCall(IRB, getGEPFn(PO), + {PlainPtrOp, UndefValue::get(Int64Ty), getPC(IRB)}, + GEP.getName() + ".san"); + GEP.replaceAllUsesWith( + IRB.CreatePointerBitCastOrAddrSpaceCast(CB, GEP.getType())); + Value *Offset = + new PtrToIntInst(&GEP, Int64Ty, GEP.getName() + ".san.offset", CB); + CB->setArgOperand(1, Offset); +} + +bool GPUSanImpl::instrumentCallInst(LoopInfo &LI, CallInst &CI) { + bool Changed = false; + if (isa(CI)) + return Changed; + if (auto *Fn = CI.getCalledFunction()) { + if (Fn->getName().starts_with("__kmpc_target_init")) + return Changed; + if ((Fn->isDeclaration() || Fn->getName().starts_with("__kmpc") || + Fn->getName().starts_with("rpc_")) && + !Fn->getName().starts_with("ompx")) { + IRBuilder<> IRB(&CI); + for (int I = 0, E = CI.arg_size(); I != E; ++I) { + Value *Op = CI.getArgOperand(I); + if 
(!Op->getType()->isPointerTy()) + continue; + PtrOrigin PO = getPtrOrigin(LI, Op); + if (PO > GLOBAL) + continue; + Value *PlainOp = + IRB.CreatePointerBitCastOrAddrSpaceCast(Op, getPtrTy(PO)); + auto *CB = createCall(IRB, getUnpackFn(PO), {PlainOp, getPC(IRB)}, + Op->getName() + ".unpack"); + CI.setArgOperand( + I, IRB.CreatePointerBitCastOrAddrSpaceCast(CB, Op->getType())); + Changed = true; + } + } + } + return Changed; +} + +bool GPUSanImpl::instrumentFunction(Function &Fn) { + if (Fn.isDeclaration()) + return false; + + bool Changed = false; + LoopInfo &LI = FAM.getResult(Fn); + + for (auto BB = Fn.begin(); BB != Fn.end(); BB++) { + + SmallVector> Allocas; + SmallVector Returns; + SmallVector LoadsStores; + SmallVector Calls; + SmallVector GEPs; + + SmallVector Stores; + SmallVector Loads; + + for (auto I = BB->begin(); I != BB->end(); I++) { + + switch (I->getOpcode()) { + case Instruction::Alloca: { + AllocaInst &AI = cast(*I); + Allocas.push_back({&AI, nullptr}); + Changed = true; + break; + } + case Instruction::Load: + LoadsStores.push_back(&*I); + Loads.push_back(cast(&*I)); + Changed = true; + break; + case Instruction::Store: + LoadsStores.push_back(&*I); + Stores.push_back(cast(&*I)); + Changed = true; + break; + case Instruction::GetElementPtr: + GEPs.push_back(&cast(*I)); + Changed = true; + break; + case Instruction::Call: { + auto &CI = cast(*I); + Calls.push_back(&CI); + if (CI.isIndirectCall()) + AmbiguousCalls.insert(&CI); + break; + } + case Instruction::Ret: + Returns.push_back(&cast(*I)); + break; + default: + break; + } + } + + // // Hoist all address computation in a basic block + // auto GEPCopy = GEPs; + // while (!GEPCopy.empty()) { + // auto *Inst = GEPCopy.pop_back_val(); + // Instruction *LatestDependency = &*Inst->getParent()->begin(); + // for (auto *It = Inst->op_begin(); It != Inst->op_end(); It++) { + + // if (Instruction *ToInstruction = dyn_cast(It)) { + + // if (!LatestDependency) { + // LatestDependency = ToInstruction; + // 
continue; + // } + + // if (ToInstruction->getParent() != Inst->getParent()) + // continue; + + // if (LatestDependency->comesBefore(ToInstruction)) + // LatestDependency = ToInstruction; + // } + // } + + // Inst->moveAfter(LatestDependency); + // } + + // bool CanMergeChecks = true; + // for (auto *GEP : GEPs) { + + // if (GEP->comesBefore(LoadsStores.front())) { + // CanMergeChecks = CanMergeChecks && true; + // } else { + // CanMergeChecks = CanMergeChecks && false; + // } + // } + + // check if you can merge various pointer checks. + //if (CanMergeChecks) { + // instrumentMultipleAccessPerBasicBlock(LI, LoadsStores, Fn); + //} else { + for (auto *Load : Loads) + instrumentLoadInst(LI, *Load, GEPs); + for (auto *Store : Stores) + instrumentStoreInst(LI, *Store, GEPs); + //} + + for (auto *GEP : GEPs) + instrumentGEPInst(LI, *GEP); + for (auto *Call : Calls) + Changed |= instrumentCallInst(LI, *Call); + for (auto &It : Allocas) + It.second = instrumentAllocaInst(LI, *It.first); + + instrumentReturns(Allocas, Returns); + } + + return Changed; +} + +void GPUSanImpl::instrumentReturns( + SmallVectorImpl> &Allocas, + SmallVectorImpl &Returns) { + if (Allocas.empty()) + return; + for (auto *RI : Returns) { + IRBuilder<> IRB(RI); + createCall(IRB, getFreeNLocalFn(), + {ConstantInt::get(Int32Ty, Allocas.size())}); + } +} + +bool GPUSanImpl::instrument() { + bool Changed = instrumentGlobals(); + HasAllocas = [&]() { + for (Function &Fn : M) + for (auto &I : instructions(Fn)) + if (isa(I)) + return true; + return false; + }(); + + SmallVector Kernels; + for (Function &Fn : M) { + if (Fn.hasFnAttribute("kernel")) + Kernels.push_back(&Fn); + if (!Fn.getName().contains("ompx") && !Fn.getName().contains("__kmpc") && + !Fn.getName().starts_with("rpc_")) { + if (!Fn.hasFnAttribute(Attribute::DisableSanitizerInstrumentation)) { + Changed |= instrumentFunction(Fn); + } else if (!Fn.isDeclaration() && + Fn.getName().contains("SanitizerTrapInfoTy")) { + } + } + } + + addCtor(); + 
addDtor(); + + SmallVector AmbiguousCallsOrdered; + SmallVector AmbiguousCallsMapping; + if (LocationMap.empty()) + AmbiguousCalls.clear(); + for (size_t I = 0; I < AmbiguousCalls.size(); ++I) { + CallBase &CB = *AmbiguousCalls[I]; + AmbiguousCallsOrdered.push_back(&CB); + AmbiguousCallsMapping.push_back(getSourceIndex(CB)); + } + + uint64_t AmbiguousCallsBitWidth = + llvm::Log2_64_Ceil(AmbiguousCalls.size() + 1); + + new GlobalVariable(M, Int64Ty, /*isConstant=*/true, + GlobalValue::ExternalLinkage, + ConstantInt::get(Int64Ty, AmbiguousCallsBitWidth), + "__san.num_ambiguous_calls", nullptr, + GlobalValue::ThreadLocalMode::NotThreadLocal, 1); + + if (size_t NumAmbiguousCalls = AmbiguousCalls.size()) { + { + auto *ArrayTy = ArrayType::get(Int64Ty, NumAmbiguousCalls); + auto *GV = new GlobalVariable( + M, ArrayTy, /*isConstant=*/true, GlobalValue::ExternalLinkage, + ConstantArray::get(ArrayTy, AmbiguousCallsMapping), + "__san.ambiguous_calls_mapping", nullptr, + GlobalValue::ThreadLocalMode::NotThreadLocal, 4); + GV->setVisibility(GlobalValue::ProtectedVisibility); + } + + auto *ArrayTy = ArrayType::get(Int64Ty, 1024); + LocationsArray = new GlobalVariable( + M, ArrayTy, /*isConstant=*/false, GlobalValue::PrivateLinkage, + UndefValue::get(ArrayTy), "__san.calls", nullptr, + GlobalValue::ThreadLocalMode::NotThreadLocal, 3); + + auto *OldFn = M.getFunction("__san_get_location_value"); + if (OldFn) + OldFn->setName(""); + Function *LocationGetter = Function::Create( + FunctionType::get(Int64Ty, false), GlobalValue::ExternalLinkage, + "__san_get_location_value", M); + if (OldFn) { + OldFn->replaceAllUsesWith(LocationGetter); + OldFn->eraseFromParent(); + } + auto *EntryBB = BasicBlock::Create(Ctx, "entry", LocationGetter); + IRBuilder<> IRB(EntryBB); + Value *Idx = createCall(IRB, getThreadIdFn(), {}, "san.gtid"); + Value *Ptr = IRB.CreateGEP(Int64Ty, LocationsArray, {Idx}); + auto *LocationValue = IRB.CreateLoad(Int64Ty, Ptr); + IRB.CreateRet(LocationValue); + } + + 
Function *InitSharedFn = + Function::Create(FunctionType::get(VoidTy, false), + GlobalValue::PrivateLinkage, "__san.init_shared", &M); + auto *EntryBB = BasicBlock::Create(Ctx, "entry", InitSharedFn); + IRBuilder<> IRB(EntryBB); + if (!AmbiguousCalls.empty()) { + Value *Idx = createCall(IRB, getThreadIdFn(), {}, "san.gtid"); + Value *Ptr = IRB.CreateGEP(Int64Ty, LocationsArray, {Idx}); + IRB.CreateStore(ConstantInt::get(Int64Ty, 0), Ptr); + + for (auto *KernelFn : Kernels) { + IRBuilder<> IRB( + &*KernelFn->getEntryBlock().getFirstNonPHIOrDbgOrAlloca()); + createCall(IRB, InitSharedFn, {}); + } + } + IRB.CreateRetVoid(); + + for (const auto &It : llvm::enumerate(AmbiguousCallsOrdered)) { + IRBuilder<> IRB(It.value()); + Value *Idx = createCall(IRB, getThreadIdFn(), {}, "san.gtid"); + Value *Ptr = IRB.CreateGEP(Int64Ty, LocationsArray, {Idx}); + Value *OldVal = IRB.CreateLoad(Int64Ty, Ptr); + Value *OldValShifted = IRB.CreateShl( + OldVal, ConstantInt::get(Int64Ty, AmbiguousCallsBitWidth)); + Value *NewVal = IRB.CreateBinOp(Instruction::Or, OldValShifted, + ConstantInt::get(Int64Ty, It.index() + 1)); + IRB.CreateStore(NewVal, Ptr); + IRB.SetInsertPoint(It.value()->getNextNode()); + IRB.CreateStore(OldVal, Ptr); + } + + auto *NamesTy = ArrayType::get(Int8Ty, ConcatenatedString.size() + 1); + auto *Names = new GlobalVariable( + M, NamesTy, /*isConstant=*/true, GlobalValue::ExternalLinkage, + ConstantDataArray::getString(Ctx, ConcatenatedString), + "__san.location_names", nullptr, + GlobalValue::ThreadLocalMode::NotThreadLocal, 4); + Names->setVisibility(GlobalValue::ProtectedVisibility); + + auto *ArrayTy = ArrayType::get(Int64Ty, LocationEncoding.size()); + auto *GV = new GlobalVariable( + M, ArrayTy, /*isConstant=*/true, GlobalValue::ExternalLinkage, + ConstantArray::get(ArrayTy, LocationEncoding), "__san.locations", nullptr, + GlobalValue::ThreadLocalMode::NotThreadLocal, 4); + GV->setVisibility(GlobalValue::ProtectedVisibility); + + for (auto *CI : Calls) { + if 
(!CI->getCalledFunction()) { + CI->dump(); + continue; + } + // if (!CI->getCalledFunction()->getName().contains("gep") && + // !CI->getCalledFunction()->getName().contains("info")) + // continue; + InlineFunctionInfo IFI; + if (InlineFunction(*CI, IFI).isSuccess()) + Changed = true; + } + + return Changed; +} + +PreservedAnalyses GPUSanPass::run(Module &M, ModuleAnalysisManager &AM) { + FunctionAnalysisManager &FAM = + AM.getResult(M).getManager(); + GPUSanImpl Lowerer(M, FAM); + if (!Lowerer.instrument()) + return PreservedAnalyses::all(); + LLVM_DEBUG(M.dump()); + return PreservedAnalyses::none(); +} diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt index d88430a52b8b7..644befc532ab2 100644 --- a/offload/DeviceRTL/CMakeLists.txt +++ b/offload/DeviceRTL/CMakeLists.txt @@ -69,6 +69,7 @@ elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "auto" OR "${LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST};${LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST}") endif() list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES) +set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${LIBOMPTARGET_DEVICE_ARCHITECTURES} PARENT_SCOPE) set(include_files ${include_directory}/Allocator.h @@ -85,6 +86,7 @@ set(include_files set(src_files ${source_directory}/Allocator.cpp + ${source_directory}/Sanitizer.cpp ${source_directory}/Configuration.cpp ${source_directory}/Debug.cpp ${source_directory}/Kernel.cpp diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h index a28eb0fb2977e..6bb1cafac720f 100644 --- a/offload/DeviceRTL/include/Allocator.h +++ b/offload/DeviceRTL/include/Allocator.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_ALLOCATOR_H #define OMPTARGET_ALLOCATOR_H -#include "Types.h" +#include "DeviceTypes.h" // Forward declaration. 
struct KernelEnvironmentTy; diff --git a/offload/DeviceRTL/include/Configuration.h b/offload/DeviceRTL/include/Configuration.h index 8e6f5c89cbf24..f8b7a6c3c6c9d 100644 --- a/offload/DeviceRTL/include/Configuration.h +++ b/offload/DeviceRTL/include/Configuration.h @@ -15,7 +15,7 @@ #include "Shared/Environment.h" -#include "Types.h" +#include "DeviceTypes.h" namespace ompx { namespace config { diff --git a/offload/DeviceRTL/include/Types.h b/offload/DeviceRTL/include/DeviceTypes.h similarity index 99% rename from offload/DeviceRTL/include/Types.h rename to offload/DeviceRTL/include/DeviceTypes.h index 2e12d9da0353b..bf30ba31260f5 100644 --- a/offload/DeviceRTL/include/Types.h +++ b/offload/DeviceRTL/include/DeviceTypes.h @@ -1,4 +1,4 @@ -//===---------- Types.h - OpenMP types ---------------------------- C++ -*-===// +//===---- DeviceTypes.h - OpenMP types ---------------------------- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h new file mode 100644 index 0000000000000..9581ba83604db --- /dev/null +++ b/offload/DeviceRTL/include/DeviceUtils.h @@ -0,0 +1,55 @@ +//===--------- DeviceUtils.h - OpenMP device runtime utility functions -- C++ +//-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_DEVICE_UTILS_H +#define OMPTARGET_DEVICERTL_DEVICE_UTILS_H + +#include "Shared/Utils.h" +#include "DeviceTypes.h" + +#pragma omp begin declare target device_type(nohost) + +namespace utils { + +/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread +/// is identified by \p Mask. +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width); + +int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width); + +int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width); + +uint64_t ballotSync(uint64_t Mask, int32_t Pred); + +/// Return \p LowBits and \p HighBits packed into a single 64 bit value. +uint64_t pack(uint32_t LowBits, uint32_t HighBits); + +/// Unpack \p Val into \p LowBits and \p HighBits. +void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits); + +/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)). +bool isSharedMemPtr(void *Ptr); + +/// Return true iff \p Ptr is pointing into (thread) local memory (AS(5)). +bool isThreadLocalMemPtr(void *Ptr); + +/// A pointer variable that has by design an `undef` value. Use with care. 
+[[clang::loader_uninitialized]] static void *const UndefPtr; + +#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true) +#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false) + +} // namespace utils + +#pragma omp end declare target + +#endif diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h index f4854ed3d1678..12244a0971f49 100644 --- a/offload/DeviceRTL/include/Interface.h +++ b/offload/DeviceRTL/include/Interface.h @@ -14,7 +14,7 @@ #include "Shared/Environment.h" -#include "Types.h" +#include "DeviceTypes.h" /// External API /// diff --git a/offload/DeviceRTL/include/LibC.h b/offload/DeviceRTL/include/LibC.h index dde86af783af9..6e02b4aca462a 100644 --- a/offload/DeviceRTL/include/LibC.h +++ b/offload/DeviceRTL/include/LibC.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_LIBC_H #define OMPTARGET_LIBC_H -#include "Types.h" +#include "DeviceTypes.h" extern "C" { diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h index 165904644dbb9..2fb87abe5418c 100644 --- a/offload/DeviceRTL/include/Mapping.h +++ b/offload/DeviceRTL/include/Mapping.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_MAPPING_H #define OMPTARGET_MAPPING_H -#include "Types.h" +#include "DeviceTypes.h" namespace ompx { diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h index 1a3490394458f..37699529e726f 100644 --- a/offload/DeviceRTL/include/State.h +++ b/offload/DeviceRTL/include/State.h @@ -16,8 +16,8 @@ #include "Debug.h" #include "Mapping.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" // Forward declaration. 
struct KernelEnvironmentTy; diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h index af9e1a673e6a2..874974cc861df 100644 --- a/offload/DeviceRTL/include/Synchronization.h +++ b/offload/DeviceRTL/include/Synchronization.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H #define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H -#include "Types.h" +#include "DeviceTypes.h" namespace ompx { diff --git a/offload/DeviceRTL/include/Utils.h b/offload/DeviceRTL/include/Utils.h deleted file mode 100644 index 82e2397b5958b..0000000000000 --- a/offload/DeviceRTL/include/Utils.h +++ /dev/null @@ -1,100 +0,0 @@ -//===--------- Utils.h - OpenMP device runtime utility functions -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_UTILS_H -#define OMPTARGET_DEVICERTL_UTILS_H - -#include "Types.h" - -#pragma omp begin declare target device_type(nohost) - -namespace ompx { -namespace utils { - -/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread -/// is identified by \p Mask. -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane); - -int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width); - -int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width); - -uint64_t ballotSync(uint64_t Mask, int32_t Pred); - -/// Return \p LowBits and \p HighBits packed into a single 64 bit value. -uint64_t pack(uint32_t LowBits, uint32_t HighBits); - -/// Unpack \p Val into \p LowBits and \p HighBits. 
-void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits); - -/// Round up \p V to a \p Boundary. -template inline Ty roundUp(Ty V, Ty Boundary) { - return (V + Boundary - 1) / Boundary * Boundary; -} - -/// Advance \p Ptr by \p Bytes bytes. -template inline Ty1 *advance(Ty1 Ptr, Ty2 Bytes) { - return reinterpret_cast(reinterpret_cast(Ptr) + Bytes); -} - -/// Return the first bit set in \p V. -inline uint32_t ffs(uint32_t V) { - static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); - return __builtin_ffs(V); -} - -/// Return the first bit set in \p V. -inline uint32_t ffs(uint64_t V) { - static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); - return __builtin_ffsl(V); -} - -/// Return the number of bits set in \p V. -inline uint32_t popc(uint32_t V) { - static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); - return __builtin_popcount(V); -} - -/// Return the number of bits set in \p V. -inline uint32_t popc(uint64_t V) { - static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); - return __builtin_popcountl(V); -} - -/// Return \p V aligned "upwards" according to \p Align. -template inline Ty1 align_up(Ty1 V, Ty2 Align) { - return ((V + Ty1(Align) - 1) / Ty1(Align)) * Ty1(Align); -} -/// Return \p V aligned "downwards" according to \p Align. -template inline Ty1 align_down(Ty1 V, Ty2 Align) { - return V - V % Align; -} - -/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)). -bool isSharedMemPtr(void *Ptr); - -/// Return \p V typed punned as \p DstTy. -template inline DstTy convertViaPun(SrcTy V) { - return *((DstTy *)(&V)); -} - -/// A pointer variable that has by design an `undef` value. Use with care. 
-[[clang::loader_uninitialized]] static void *const UndefPtr; - -#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true) -#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false) - -} // namespace utils -} // namespace ompx - -#pragma omp end declare target - -#endif diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp index c9c940de62c1a..2a85a34d32f6e 100644 --- a/offload/DeviceRTL/src/Allocator.cpp +++ b/offload/DeviceRTL/src/Allocator.cpp @@ -14,8 +14,8 @@ #include "Configuration.h" #include "Mapping.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp index ef0c3663536f5..4d97ad67313aa 100644 --- a/offload/DeviceRTL/src/Configuration.cpp +++ b/offload/DeviceRTL/src/Configuration.cpp @@ -13,7 +13,7 @@ #include "Configuration.h" #include "State.h" -#include "Types.h" +#include "DeviceTypes.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp index 31cd54e3de35c..bf228ec1b32cc 100644 --- a/offload/DeviceRTL/src/Debug.cpp +++ b/offload/DeviceRTL/src/Debug.cpp @@ -17,7 +17,7 @@ #include "Interface.h" #include "Mapping.h" #include "State.h" -#include "Types.h" +#include "DeviceTypes.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp index 95d4c728016d2..afee129c2589c 100644 --- a/offload/DeviceRTL/src/Kernel.cpp +++ b/offload/DeviceRTL/src/Kernel.cpp @@ -18,7 +18,7 @@ #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" +#include "DeviceTypes.h" #include "llvm/Frontend/OpenMP/OMPDeviceConstants.h" diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp index c1ce878746a69..06318635e0170 100644 --- a/offload/DeviceRTL/src/Mapping.cpp +++ 
b/offload/DeviceRTL/src/Mapping.cpp @@ -12,8 +12,8 @@ #include "Mapping.h" #include "Interface.h" #include "State.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #pragma omp begin declare target device_type(nohost) @@ -364,11 +364,30 @@ _TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel) _TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock) _TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel) +extern "C" [[clang::disable_sanitizer_instrumentation, gnu::flatten, + gnu::always_inline, gnu::used, gnu::retain]] int +ompx_global_thread_id() { + return ompx_thread_id(0) + ompx_thread_id(1) * ompx_block_dim(0) + + ompx_thread_id(2) * ompx_block_dim(0) * ompx_block_dim(1); +} + extern "C" { uint64_t ompx_ballot_sync(uint64_t mask, int pred) { return utils::ballotSync(mask, pred); } +#define _TGT_KERNEL_LANGUAGE_SHFL_SYNC(TYPE, TY) \ + int ompx_shfl_sync_##TY(uint64_t mask, TYPE var, int src, int width) { \ + return utils::shuffle(mask, var, src, width); \ + } + +_TGT_KERNEL_LANGUAGE_SHFL_SYNC(int, i) +_TGT_KERNEL_LANGUAGE_SHFL_SYNC(float, f) +_TGT_KERNEL_LANGUAGE_SHFL_SYNC(long, l) +_TGT_KERNEL_LANGUAGE_SHFL_SYNC(double, d) + +#undef _TGT_KERNEL_LANGUAGE_SHFL_SYNC + int ompx_shfl_down_sync_i(uint64_t mask, int var, unsigned delta, int width) { return utils::shuffleDown(mask, var, delta, width); } diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp index c24af9442d16e..ca8b549b28dbf 100644 --- a/offload/DeviceRTL/src/Misc.cpp +++ b/offload/DeviceRTL/src/Misc.cpp @@ -10,7 +10,7 @@ //===----------------------------------------------------------------------===// #include "Configuration.h" -#include "Types.h" +#include "DeviceTypes.h" #include "Debug.h" diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp index 031a5ced25518..2a510e9531657 100644 --- a/offload/DeviceRTL/src/Parallelism.cpp +++ b/offload/DeviceRTL/src/Parallelism.cpp @@ -37,8 +37,8 @@ #include 
"Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp index 744d1a3a231c8..f4e2e0d25bde9 100644 --- a/offload/DeviceRTL/src/Reduction.cpp +++ b/offload/DeviceRTL/src/Reduction.cpp @@ -15,8 +15,8 @@ #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Sanitizer.cpp b/offload/DeviceRTL/src/Sanitizer.cpp new file mode 100644 index 0000000000000..d7e1190dad3e5 --- /dev/null +++ b/offload/DeviceRTL/src/Sanitizer.cpp @@ -0,0 +1,690 @@ +//===------ Sanitizer.cpp - Track allocation for sanitizer checks ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "DeviceTypes.h" +#include "DeviceUtils.h" +#include "Interface.h" +#include "LibC.h" +#include "Mapping.h" +#include "Shared/Environment.h" +#include "Synchronization.h" +//#include "Shared/Debug.h" + +using namespace ompx; +using namespace utils; + +#pragma omp begin declare target device_type(nohost) + +#include "Shared/Sanitizer.h" + +struct AllocationInfoLocalTy { + _AS_PTR(void, AllocationKind::LOCAL) Start; + uint64_t Length; + uint32_t Tag; +}; +struct AllocationInfoGlobalTy { + _AS_PTR(void, AllocationKind::GLOBAL) Start; + uint64_t Length; + uint32_t Tag; +}; + +template struct AllocationInfoTy {}; +template <> struct AllocationInfoTy { + using ASVoidPtrTy = AllocationInfoGlobalTy; +}; +template <> struct AllocationInfoTy { + using ASVoidPtrTy = AllocationInfoLocalTy; +}; + +template <> +AllocationPtrTy +AllocationPtrTy::get(_AS_PTR(void, AllocationKind::LOCAL) + P) { + TypePunUnion TPU; + TPU.P = (void *)P; + return TPU.AP; +} + +template <> +AllocationPtrTy::operator _AS_PTR( + void, AllocationKind::LOCAL)() const { + TypePunUnion TPU; + TPU.AP = *this; + return TPU.AddrP; +} + +template struct AllocationTracker { + static_assert(sizeof(AllocationTy) == sizeof(_AS_PTR(void, AK)) * 2, + "AllocationTy should not exceed two pointers"); + // static_assert(sizeof(AllocationPtrTy) * 8 == + // SanitizerConfig::ADDR_SPACE_PTR_SIZE, + // "AllocationTy pointers should be pointer sized"); + + [[clang::disable_sanitizer_instrumentation]] static + typename AllocationInfoTy::ASVoidPtrTy + getAllocationInfo(_AS_PTR(void, AK) P) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + uint32_t AllocationId = AP.AllocationId; + if (OMP_UNLIKELY(AllocationId >= SanitizerConfig::SLOTS)) + return {P, 0, (uint32_t)-1}; + auto &A = 
getAllocation(AP, /*AccessId=*/0, /*PC=*/0); + return {A.Start, A.Length, (uint32_t)A.Tag}; + } + + [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK) + create(_AS_PTR(void, AK) Start, uint64_t Length, int64_t AllocationId, + int64_t Slot, int64_t SourceId, uint64_t PC) { + if constexpr (SanitizerConfig::OFFSET_BITS < 64) + if (OMP_UNLIKELY(Length >= (1UL << (SanitizerConfig::OFFSET_BITS)))) + __sanitizer_trap_info_ptr->exceedsAllocationLength( + Start, Length, AllocationId, Slot, SourceId, /*PC=*/0); + + // Reserve the 0 element for the null pointer in global space. + auto &AllocArr = getAllocationArray(); + auto &Cnt = AllocArr.Cnt; + if constexpr (AK == AllocationKind::LOCAL) + Slot = ++Cnt; + if (Slot == -1) + Slot = ++Cnt; + + uint64_t NumSlots = SanitizerConfig::SLOTS; + if (OMP_UNLIKELY(Slot >= NumSlots)) + __sanitizer_trap_info_ptr->exceedsAllocationSlots( + Start, Length, AllocationId, Slot, SourceId, /*PC=*/0); + + auto &A = AllocArr.Arr[Slot]; + + A.Start = Start; + A.Length = Length; + A.Id = AllocationId; + + AllocationPtrTy AP; + AP.Offset = 0; + if constexpr (SanitizerConfig::useTags()) { + AP.AllocationTag = ++A.Tag; + } + AP.AllocationId = Slot; + AP.Magic = SanitizerConfig::MAGIC; + AP.Kind = (uint64_t)AK; + return AP; + } + + [[clang::disable_sanitizer_instrumentation]] static void + remove(_AS_PTR(void, AK) P, int64_t SourceId) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + uint64_t AllocationId = AP.AllocationId; + auto &AllocArr = getAllocationArray(); + auto &A = AllocArr.Arr[AllocationId]; + A.Length = 0; + + auto &Cnt = AllocArr.Cnt; + if constexpr (AK == AllocationKind::LOCAL) { + if (Cnt == AllocationId) + --Cnt; + } + } + + [[clang::disable_sanitizer_instrumentation]] static void remove_n(int32_t N) { + static_assert(AK == AllocationKind::LOCAL, ""); + auto &AllocArr = getAllocationArray(); + auto &Cnt = AllocArr.Cnt; + for (int32_t I = 0; I < N; ++I) { + auto &A = AllocArr.Arr[Cnt--]; + A.Length = 0; + } + } + + 
[[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK) + advance(_AS_PTR(void, AK) P, uint64_t Offset, int64_t SourceId) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + AP.Offset += Offset; + return AP; + } + + [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK) + checkWithBase(_AS_PTR(void, AK) P, _AS_PTR(void, AK) Start, + int64_t Length, uint32_t Tag, int64_t Size, + int64_t AccessId, int64_t SourceId, uint64_t PC) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + if constexpr (AK == AllocationKind::LOCAL) + if (Length == 0) + Length = getAllocation(AP, AccessId, PC).Length; + // if constexpr (AK == AllocationKind::GLOBAL) + // if (AP.Magic != SanitizerConfig::MAGIC) + // __sanitizer_trap_info_ptr->garbagePointer(AP, (void *)P, SourceId, + // PC); + int64_t Offset = AP.Offset; + // if (OMP_UNLIKELY( + // Offset > Length - Size || + // (SanitizerConfig::useTags() && Tag != AP.AllocationTag))) { + // __sanitizer_trap_info_ptr->accessError(AP, Size, AccessId, SourceId, + // PC); + // } + return utils::advancePtr(Start, Offset); + } + + [[clang::disable_sanitizer_instrumentation]] static void + checkWithBaseVoid(_AS_PTR(void, AK) P, _AS_PTR(void, AK) Start, + int64_t Length, uint32_t Tag, int64_t Size, + int64_t AccessId, int64_t SourceId, uint64_t PC) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + if constexpr (AK == AllocationKind::LOCAL) + if (Length == 0) + Length = getAllocation(AP, AccessId, PC).Length; + if constexpr (AK == AllocationKind::GLOBAL) + if (AP.Magic != SanitizerConfig::MAGIC) + __sanitizer_trap_info_ptr->garbagePointer(AP, (void *)P, SourceId, + PC); + int64_t Offset = AP.Offset; + if (OMP_UNLIKELY( + Offset > Length - Size || + (SanitizerConfig::useTags() && Tag != AP.AllocationTag))) { + __sanitizer_trap_info_ptr->accessError(AP, Size, AccessId, SourceId, + PC); + } + } + + [[clang::disable_sanitizer_instrumentation]] static void + checkRangeWithBase(_AS_PTR(void, AK) SCEVMax, _AS_PTR(void, AK) 
SCEVMin, + _AS_PTR(void, AK) StartAddress, int64_t AllocationLength, + uint32_t Tag, int64_t AccessTypeSize, int64_t AccessId, + int64_t SourceId, uint64_t PC) { + // Validate a whole [SCEVMin, SCEVMax] access range at once: the upper + // bound must fit inside the allocation, and the lower bound must belong + // to the same allocation (same base and, if enabled, same tag). + AllocationPtrTy APSCEVMax = AllocationPtrTy::get(SCEVMax); + AllocationPtrTy APSCEVMin = AllocationPtrTy::get(SCEVMin); + if constexpr (AK == AllocationKind::LOCAL) + if (AllocationLength == 0) + AllocationLength = getAllocation(APSCEVMax, AccessId, PC).Length; + + if constexpr (AK == AllocationKind::GLOBAL) { + if (APSCEVMax.Magic != SanitizerConfig::MAGIC) + __sanitizer_trap_info_ptr->garbagePointer( + APSCEVMax, (void *)SCEVMax, SourceId, PC); + + if (APSCEVMin.Magic != SanitizerConfig::MAGIC) + __sanitizer_trap_info_ptr->garbagePointer( + APSCEVMin, (void *)SCEVMin, SourceId, PC); + } + + // check upper bound + int64_t MaxOffset = APSCEVMax.Offset; + if (OMP_UNLIKELY(MaxOffset > AllocationLength - AccessTypeSize || + (SanitizerConfig::useTags() && + Tag != APSCEVMax.AllocationTag))) { + __sanitizer_trap_info_ptr->accessError(APSCEVMax, AccessTypeSize, + AccessId, SourceId, PC); + } + + // check lower bound + auto &AllocationOfMinOffset = getAllocation(APSCEVMin, AccessId, PC); + if (OMP_UNLIKELY(AllocationOfMinOffset.Start != StartAddress || + (SanitizerConfig::useTags() && + Tag != APSCEVMin.AllocationTag))) { + __sanitizer_trap_info_ptr->accessError(APSCEVMin, AccessTypeSize, + AccessId, SourceId, PC); + } + } + + [[clang::disable_sanitizer_instrumentation]] static void + checkRange(_AS_PTR(void, AK) SCEVMax, _AS_PTR(void, AK) SCEVMin, + int64_t AccessTypeSize, int64_t AccessId, int64_t SourceId, + uint64_t PC) { + AllocationPtrTy AP = AllocationPtrTy::get(SCEVMax); + auto &Alloc = getAllocation(AP, AccessId, PC); + return checkRangeWithBase(SCEVMax, SCEVMin, Alloc.Start, Alloc.Length, + Alloc.Tag, AccessTypeSize, AccessId, SourceId, + PC); + } + + [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK) + check(_AS_PTR(void, AK) P, int64_t Size, int64_t
AccessId, + int64_t SourceId, uint64_t PC) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + auto &Alloc = getAllocation(AP, AccessId, PC); + return checkWithBase(P, Alloc.Start, Alloc.Length, Alloc.Tag, Size, + AccessId, SourceId, PC); + } + + [[clang::disable_sanitizer_instrumentation]] static void + checkVoid(_AS_PTR(void, AK) P, int64_t Size, int64_t AccessId, + int64_t SourceId, uint64_t PC) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + auto &Alloc = getAllocation(AP, AccessId, PC); + return checkWithBaseVoid(P, Alloc.Start, Alloc.Length, Alloc.Tag, Size, + AccessId, SourceId, PC); + } + + [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK) + unpack(_AS_PTR(void, AK) P, int64_t SourceId, uint64_t PC) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + auto &A = getAllocation(AP, SourceId, PC); + uint64_t Offset = AP.Offset; + _AS_PTR(void, AK) Ptr = utils::advancePtr(A.Start, Offset); + return Ptr; + } + + [[clang::disable_sanitizer_instrumentation]] static void + lifetimeStart(_AS_PTR(void, AK) P, uint64_t Length) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + auto &A = getAllocation(AP, /*AccessId=*/0, /*PC=*/0); + // TODO: Check length + A.Length = Length; + } + + [[clang::disable_sanitizer_instrumentation]] static void + lifetimeEnd(_AS_PTR(void, AK) P, uint64_t Length) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + auto &A = getAllocation(AP, /*AccessId=*/0, /*PC=*/0); + // TODO: Check length + A.Length = 0; + } + + [[clang::disable_sanitizer_instrumentation]] static void leakCheck() { + static_assert(AK == AllocationKind::GLOBAL, ""); + auto &AllocArr = getAllocationArray(); + for (uint64_t Slot = 0; Slot < SanitizerConfig::SLOTS; ++Slot) { + auto &A = AllocArr.Arr[Slot]; + if (OMP_UNLIKELY(A.Length)) + __sanitizer_trap_info_ptr->memoryLeak(A, Slot); + } + } +}; + +template +AllocationArrayTy + Allocations::Arr[SanitizerConfig::NUM_ALLOCATION_ARRAYS]; + +static void checkForMagic(bool IsGlobal, void *P, int64_t 
SourceId, + uint64_t PC) { + if (IsGlobal) { + // NOTE(review): the magic check for global pointers is disabled here — + // confirm this is intentional before shipping. + // auto AP = AllocationPtrTy::get(P); + // if (AP.Magic != SanitizerConfig::MAGIC) + // __sanitizer_trap_info_ptr->garbagePointer( + // AP, P, SourceId, PC); + } else { + auto AP = AllocationPtrTy::get(P); + if (AP.Magic != SanitizerConfig::MAGIC) + __sanitizer_trap_info_ptr->garbagePointer( + AP, P, SourceId, PC); + } +} + +// Variant of checkForMagic that also reports the access id. The check is +// applied uniformly to both address spaces; IsGlobal is kept for interface +// symmetry with checkForMagic. +static void checkForMagic2(bool IsGlobal, void *P, int64_t SourceId, + int64_t AccessId, + uint64_t PC) { + auto AP = AllocationPtrTy::get(P); + if (AP.Magic != SanitizerConfig::MAGIC) + __sanitizer_trap_info_ptr->garbagePointer2( + AP, P, SourceId, AccessId, PC); +} + +extern "C" { + +#define REAL_PTR_IS_LOCAL(PTR) (isThreadLocalMemPtr(PTR)) +#define IS_GLOBAL(PTR) ((uintptr_t)PTR & (1UL << 63)) + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL) + ompx_new_local(_AS_PTR(void, AllocationKind::LOCAL) Start, uint64_t Length, + int64_t AllocationId, int64_t SourceId, uint64_t PC) { + return AllocationTracker::create( + Start, Length, AllocationId, 0, SourceId, PC); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL) + ompx_new_global(_AS_PTR(void, AllocationKind::GLOBAL) Start, + uint64_t Length, int64_t AllocationId, int64_t SourceId, + uint64_t PC) { + return AllocationTracker::create( + Start, Length, AllocationId, -1, SourceId, PC); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +__sanitizer_register_host(_AS_PTR(void, AllocationKind::GLOBAL) Start, + uint64_t Length, uint64_t Slot, int64_t SourceId) { + 
AllocationTracker::create(Start, Length, Slot, Slot, + SourceId, /*PC=*/0); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void * +ompx_new(void *Start, uint64_t Length, int64_t AllocationId, int64_t SourceId, + uint64_t PC) { + if (REAL_PTR_IS_LOCAL(Start)) + return (void *)ompx_new_local((_AS_PTR(void, AllocationKind::LOCAL))Start, + Length, AllocationId, SourceId, PC); + return (void *)ompx_new_global((_AS_PTR(void, AllocationKind::GLOBAL))Start, + Length, AllocationId, SourceId, PC); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_free_local_n(int32_t N) { + return AllocationTracker::remove_n(N); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +__sanitizer_unregister_host(_AS_PTR(void, AllocationKind::GLOBAL) P) { + AllocationTracker::remove(P, /*SourceId=*/0); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_free_local(_AS_PTR(void, AllocationKind::LOCAL) P, int64_t SourceId) { + return AllocationTracker::remove(P, SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_free_global(_AS_PTR(void, AllocationKind::GLOBAL) P, int64_t SourceId) { + return AllocationTracker::remove(P, SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_free(void *P, int64_t SourceId, uint64_t PC) { + bool IsGlobal = IS_GLOBAL(P); + checkForMagic(IsGlobal, P, SourceId, PC); + if (IsGlobal) + return ompx_free_global((_AS_PTR(void, AllocationKind::GLOBAL))P, SourceId); + return ompx_free_local((_AS_PTR(void, AllocationKind::LOCAL))P, SourceId); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, 
AllocationKind::LOCAL) + ompx_gep_local(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t Offset, + int64_t SourceId) { + return AllocationTracker::advance(P, Offset, SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL) + ompx_gep_global(_AS_PTR(void, AllocationKind::GLOBAL) P, uint64_t Offset, + int64_t SourceId) { + return AllocationTracker::advance(P, Offset, + SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void * +ompx_gep(void *P, uint64_t Offset, int64_t SourceId) { + bool IsGlobal = IS_GLOBAL(P); + checkForMagic(IsGlobal, P, SourceId, /*PC=*/0); + if (IsGlobal) + return (void *)ompx_gep_global((_AS_PTR(void, AllocationKind::GLOBAL))P, + Offset, SourceId); + return (void *)ompx_gep_local((_AS_PTR(void, AllocationKind::LOCAL))P, Offset, + SourceId); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL) + ompx_check_local(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t Size, + uint64_t AccessId, int64_t SourceId, uint64_t PC) { + return AllocationTracker::check(P, Size, AccessId, + SourceId, PC); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL) + ompx_check_global(_AS_PTR(void, AllocationKind::GLOBAL) P, uint64_t Size, + uint64_t AccessId, int64_t SourceId, uint64_t PC) { + return AllocationTracker::check(P, Size, AccessId, + SourceId, PC); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void * +ompx_check(void *P, uint64_t Size, uint64_t AccessId, int64_t SourceId, + uint64_t PC) { + bool IsGlobal = IS_GLOBAL(P); + checkForMagic2(IsGlobal, P, SourceId, AccessId, PC); + if (IsGlobal) + return (void *)ompx_check_global((_AS_PTR(void, 
AllocationKind::GLOBAL))P, + Size, AccessId, SourceId, PC); + return (void *)ompx_check_local((_AS_PTR(void, AllocationKind::LOCAL))P, Size, + AccessId, SourceId, PC); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL) + ompx_check_with_base_local(_AS_PTR(void, AllocationKind::LOCAL) P, + _AS_PTR(void, AllocationKind::LOCAL) Start, + uint64_t Length, uint32_t Tag, uint64_t Size, + uint64_t AccessId, int64_t SourceId, + uint64_t PC) { + return AllocationTracker::checkWithBase( + P, Start, Length, Tag, Size, AccessId, SourceId, PC); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL) + ompx_check_with_base_global(_AS_PTR(void, AllocationKind::GLOBAL) P, + _AS_PTR(void, AllocationKind::GLOBAL) Start, + uint64_t Length, uint32_t Tag, uint64_t Size, + uint64_t AccessId, int64_t SourceId, + uint64_t PC) { + return AllocationTracker::checkWithBase( + P, Start, Length, Tag, Size, AccessId, SourceId, PC); +} + +// Void functions for sanitizing a pointer from base offset and without it +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_check_void_local(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t Size, + uint64_t AccessId, int64_t SourceId, uint64_t PC) { + return AllocationTracker::checkVoid(P, Size, AccessId, + SourceId, PC); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_check_void_global(_AS_PTR(void, AllocationKind::GLOBAL) P, uint64_t Size, + uint64_t AccessId, int64_t SourceId, uint64_t PC) { + return AllocationTracker::checkVoid(P, Size, AccessId, + SourceId, PC); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_check_void(void *P, uint64_t Size, uint64_t AccessId, 
int64_t SourceId, + uint64_t PC) { + bool IsGlobal = IS_GLOBAL(P); + checkForMagic2(IsGlobal, P, SourceId, AccessId, PC); + if (IsGlobal) + return ompx_check_void_global((_AS_PTR(void, AllocationKind::GLOBAL))P, + Size, AccessId, SourceId, PC); + return ompx_check_void_local((_AS_PTR(void, AllocationKind::LOCAL))P, Size, + AccessId, SourceId, PC); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_check_with_base_void_local(_AS_PTR(void, AllocationKind::LOCAL) P, + _AS_PTR(void, AllocationKind::LOCAL) Start, + uint64_t Length, uint32_t Tag, uint64_t Size, + uint64_t AccessId, int64_t SourceId, + uint64_t PC) { + return AllocationTracker::checkWithBaseVoid( + P, Start, Length, Tag, Size, AccessId, SourceId, PC); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_check_with_base_void_global(_AS_PTR(void, AllocationKind::GLOBAL) P, + _AS_PTR(void, AllocationKind::GLOBAL) Start, + uint64_t Length, uint32_t Tag, uint64_t Size, + uint64_t AccessId, int64_t SourceId, + uint64_t PC) { + return AllocationTracker::checkWithBaseVoid( + P, Start, Length, Tag, Size, AccessId, SourceId, PC); +} +// End of void functions.
+ +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_check_range_with_base_global(_AS_PTR(void, AllocationKind::GLOBAL) SCEVMax, + _AS_PTR(void, AllocationKind::GLOBAL) SCEVMin, + _AS_PTR(void, AllocationKind::GLOBAL) + StartAddress, + int64_t AllocationLength, uint32_t Tag, + int64_t AccessTypeSize, int64_t AccessId, + int64_t SourceId, uint64_t PC) { + return AllocationTracker::checkRangeWithBase( + SCEVMax, SCEVMin, StartAddress, AllocationLength, Tag, AccessTypeSize, + AccessId, SourceId, PC); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_check_range_with_base_local(_AS_PTR(void, AllocationKind::LOCAL) SCEVMax, + _AS_PTR(void, AllocationKind::LOCAL) SCEVMin, + _AS_PTR(void, AllocationKind::LOCAL) + StartAddress, + int64_t AllocationLength, uint32_t Tag, + int64_t AccessTypeSize, int64_t AccessId, + int64_t SourceId, uint64_t PC) { + return AllocationTracker::checkRangeWithBase( + SCEVMax, SCEVMin, StartAddress, AllocationLength, Tag, AccessTypeSize, + AccessId, SourceId, PC); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_check_range_local(_AS_PTR(void, AllocationKind::LOCAL) SCEVMax, + _AS_PTR(void, AllocationKind::LOCAL) SCEVMin, + int64_t AccessTypeSize, int64_t AccessId, + int64_t SourceId, uint64_t PC) { + return AllocationTracker::checkRange( + SCEVMax, SCEVMin, AccessTypeSize, AccessId, SourceId, PC); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_check_range_global(_AS_PTR(void, AllocationKind::GLOBAL) SCEVMax, + _AS_PTR(void, AllocationKind::GLOBAL) SCEVMin, + int64_t AccessTypeSize, int64_t AccessId, + int64_t SourceId, uint64_t PC) { + return AllocationTracker::checkRange( + SCEVMax, SCEVMin,
AccessTypeSize, AccessId, SourceId, PC); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_check_range(void *SCEVMax, void *SCEVMin, int64_t AccessTypeSize, + int64_t AccessId, int64_t SourceId, uint64_t PC) { + bool IsGlobalMax = IS_GLOBAL(SCEVMax); + bool IsGlobalMin = IS_GLOBAL(SCEVMin); + checkForMagic2(IsGlobalMax, SCEVMax, SourceId, AccessId, PC); + checkForMagic2(IsGlobalMin, SCEVMin, SourceId, AccessId, PC); + if (IsGlobalMax && IsGlobalMin) + return ompx_check_range_global( + (_AS_PTR(void, AllocationKind::GLOBAL))SCEVMax, + (_AS_PTR(void, AllocationKind::GLOBAL))SCEVMin, AccessTypeSize, + AccessId, SourceId, PC); + + // NOTE(review): if only one bound is global the pair falls through to the + // LOCAL path — confirm mixed global/local ranges cannot occur here. + return ompx_check_range_local((_AS_PTR(void, AllocationKind::LOCAL))SCEVMax, + (_AS_PTR(void, AllocationKind::LOCAL))SCEVMin, + AccessTypeSize, AccessId, SourceId, PC); +} + +// Vectorized variant of the with-base check. Currently an unimplemented +// pass-through stub: it performs no checking and returns the input array. +// TODO: implement batched checking over the ArraySize entries. +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] char** +ompx_check_with_base_global_vec(char **Pointers, + char **Starts, + uint64_t *Lengths, + uint32_t *Tags, + uint64_t *Sizes, + uint64_t *AccessIds, + int64_t *SourceIds, + uint64_t PC, + uint64_t ArraySize) { + return Pointers; +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL) + ompx_unpack_local(_AS_PTR(void,
AllocationKind::LOCAL) P, + int64_t SourceId) { + return AllocationTracker::unpack(P, SourceId, + /*PC=*/0); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL) + ompx_unpack_global(_AS_PTR(void, AllocationKind::GLOBAL) P, + int64_t SourceId) { + return AllocationTracker::unpack(P, SourceId, + /*PC=*/0); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void * +ompx_unpack(void *P, int64_t SourceId) { + bool IsGlobal = IS_GLOBAL(P); + checkForMagic(IsGlobal, P, SourceId, /*PC=*/0); + if (IsGlobal) + return (void *)ompx_unpack_global((_AS_PTR(void, AllocationKind::GLOBAL))P, + SourceId); + return (void *)ompx_unpack_local((_AS_PTR(void, AllocationKind::LOCAL))P, + SourceId); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_lifetime_start(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t Length) { + AllocationTracker::lifetimeStart(P, Length); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_lifetime_end(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t Length) { + AllocationTracker::lifetimeEnd(P, Length); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] struct AllocationInfoLocalTy +ompx_get_allocation_info_local(_AS_PTR(void, AllocationKind::LOCAL) P) { + return AllocationTracker::getAllocationInfo(P); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] struct AllocationInfoGlobalTy +ompx_get_allocation_info_global(_AS_PTR(void, AllocationKind::GLOBAL) P) { + return AllocationTracker::getAllocationInfo(P); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_leak_check() { + 
AllocationTracker::leakCheck(); +} + +[[gnu::weak, gnu::noinline, gnu::used, gnu::retain]] int64_t +__san_get_location_value() { + return -1; +} +} + +#pragma omp end declare target diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp index a1e4fa2449d9a..cb83b7839b5b2 100644 --- a/offload/DeviceRTL/src/State.cpp +++ b/offload/DeviceRTL/src/State.cpp @@ -13,13 +13,13 @@ #include "Allocator.h" #include "Configuration.h" #include "Debug.h" +#include "DeviceUtils.h" #include "Interface.h" #include "LibC.h" #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" using namespace ompx; @@ -84,14 +84,14 @@ struct SharedMemorySmartStackTy { /// Deallocate the last allocation made by the encountering thread and pointed /// to by \p Ptr from the stack. Each thread can call this function. - void pop(void *Ptr, uint32_t Bytes); + void pop(void *Ptr, uint64_t Bytes); private: /// Compute the size of the storage space reserved for a thread. uint32_t computeThreadStorageTotal() { uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock(); - return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock), - allocator::ALIGNMENT); + return utils::alignDown((state::SharedScratchpadSize / NumLanesInBlock), + allocator::ALIGNMENT); } /// Return the top address of the warp data stack, that is the first address @@ -121,7 +121,7 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) { // First align the number of requested bytes. /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to /// be passed in as an argument and the stack rewritten to support it. 
- uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT); + uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT); uint32_t StorageTotal = computeThreadStorageTotal(); @@ -148,8 +148,8 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) { return GlobalMemory; } -void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) { - uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT); +void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) { + uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT); if (utils::isSharedMemPtr(Ptr)) { int TId = mapping::getThreadIdInBlock(); Usage[TId] -= AlignedBytes; diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp index 80ba87b300bcd..97a6b080169ad 100644 --- a/offload/DeviceRTL/src/Synchronization.cpp +++ b/offload/DeviceRTL/src/Synchronization.cpp @@ -16,8 +16,8 @@ #include "Interface.h" #include "Mapping.h" #include "State.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #pragma omp begin declare target device_type(nohost) diff --git a/offload/DeviceRTL/src/Tasking.cpp b/offload/DeviceRTL/src/Tasking.cpp index 2dc33562e6d79..34cb67cb1a351 100644 --- a/offload/DeviceRTL/src/Tasking.cpp +++ b/offload/DeviceRTL/src/Tasking.cpp @@ -13,10 +13,10 @@ // //===----------------------------------------------------------------------===// +#include "DeviceUtils.h" #include "Interface.h" #include "State.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" using namespace ompx; @@ -34,7 +34,7 @@ TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t, TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal( TaskSizeTotal, "explicit task descriptor"); TaskDescriptor->Payload = - utils::advance(TaskDescriptor, TaskSizeInclPrivateValuesPadded); + utils::advancePtr(TaskDescriptor, TaskSizeInclPrivateValuesPadded); TaskDescriptor->TaskFn = 
TaskFn; return TaskDescriptor; diff --git a/offload/DeviceRTL/src/Utils.cpp b/offload/DeviceRTL/src/Utils.cpp index 53cc803234867..baefce4c65b68 100644 --- a/offload/DeviceRTL/src/Utils.cpp +++ b/offload/DeviceRTL/src/Utils.cpp @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -#include "Utils.h" +#include "DeviceUtils.h" #include "Debug.h" #include "Interface.h" @@ -22,6 +22,7 @@ using namespace ompx; namespace impl { bool isSharedMemPtr(const void *Ptr) { return false; } +bool isThreadLocalMemPtr(const void *Ptr) { return false; } void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) { static_assert(sizeof(unsigned long) == 8, ""); @@ -33,7 +34,7 @@ uint64_t Pack(uint32_t LowBits, uint32_t HighBits) { return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits; } -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane); +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width); int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta, int32_t Width); @@ -44,8 +45,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred); ///{ #pragma omp begin declare variant match(device = {arch(amdgcn)}) -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) { - int Width = mapping::getWarpSize(); +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) { int Self = mapping::getThreadIdInWarp(); int Index = SrcLane + (Self & ~(Width - 1)); return __builtin_amdgcn_ds_bpermute(Index << 2, Var); @@ -67,6 +67,10 @@ bool isSharedMemPtr(const void *Ptr) { return __builtin_amdgcn_is_shared( (const __attribute__((address_space(0))) void *)Ptr); } +bool isThreadLocalMemPtr(const void *Ptr) { + return __builtin_amdgcn_is_private( + (const __attribute__((address_space(0))) void *)Ptr); +} #pragma omp end declare variant ///} @@ -77,8 +81,8 @@ bool isSharedMemPtr(const void *Ptr) { device = {arch(nvptx, nvptx64)}, \ implementation = {extension(match_any)}) -int32_t 
shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) { - return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f); +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) { + return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width); } int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) { @@ -92,6 +96,8 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred) { bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); } +bool isThreadLocalMemPtr(const void *Ptr) { return __nvvm_isspacep_local(Ptr); } + #pragma omp end declare variant ///} } // namespace impl @@ -104,8 +110,9 @@ void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) { impl::Unpack(Val, &LowBits, &HighBits); } -int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) { - return impl::shuffle(Mask, Var, SrcLane); +int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, + int32_t Width) { + return impl::shuffle(Mask, Var, SrcLane, Width); } int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, @@ -127,6 +134,9 @@ uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) { } bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); } +bool utils::isThreadLocalMemPtr(void *Ptr) { + return impl::isThreadLocalMemPtr(Ptr); +} extern "C" { int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) { diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp index bcb7c5ad50a18..26238e7c7bc82 100644 --- a/offload/DeviceRTL/src/Workshare.cpp +++ b/offload/DeviceRTL/src/Workshare.cpp @@ -17,8 +17,8 @@ #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" using namespace ompx; @@ -348,7 +348,7 @@ template struct omptarget_nvptx_LoopSupport { if (rank == 0) { warp_res = atomic::add(&Cnt, change, atomic::seq_cst); } - warp_res = 
utils::shuffle(active, warp_res, leader); + warp_res = utils::shuffle(active, warp_res, leader, mapping::getWarpSize()); return warp_res + rank; } diff --git a/offload/DeviceRTL/src/exports b/offload/DeviceRTL/src/exports index 288ddf90b4a9f..b725cca7e1c1d 100644 --- a/offload/DeviceRTL/src/exports +++ b/offload/DeviceRTL/src/exports @@ -16,3 +16,5 @@ free memcmp printf __assert_fail +LocalAllocs +LocalCnt diff --git a/offload/include/OpenMP/Mapping.h b/offload/include/OpenMP/Mapping.h index b9f5c16582931..d310c6c707f8f 100644 --- a/offload/include/OpenMP/Mapping.h +++ b/offload/include/OpenMP/Mapping.h @@ -71,6 +71,7 @@ struct HostDataToTargetTy { const uintptr_t TgtAllocBegin; // allocated target memory const uintptr_t TgtPtrBegin; // mapped target memory = TgtAllocBegin + padding + void *FakeTgtPtrBegin = 0; // mapped target memory = TgtAllocBegin + padding private: static const uint64_t INFRefCount = ~(uint64_t)0; @@ -125,9 +126,10 @@ struct HostDataToTargetTy { HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TgtAllocBegin, uintptr_t TgtPtrBegin, bool UseHoldRefCount, map_var_info_t Name = nullptr, - bool IsINF = false) + bool IsINF = false, void *FakeTgtPtrBegin = nullptr) : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), HstPtrName(Name), TgtAllocBegin(TgtAllocBegin), TgtPtrBegin(TgtPtrBegin), + FakeTgtPtrBegin(FakeTgtPtrBegin), States(std::make_unique(UseHoldRefCount ? 0 : IsINF ? INFRefCount : 1, diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h index 5b22bbaac144f..f96b2f9ca259d 100644 --- a/offload/include/Shared/APITypes.h +++ b/offload/include/Shared/APITypes.h @@ -85,6 +85,9 @@ struct __tgt_async_info { /// ensure it is a valid location while the transfer to the device is /// happening. KernelLaunchEnvironmentTy KernelLaunchEnvironment; + + /// Flag to indicate the Queue should be persistent. 
+ bool PersistentQueue = false; }; /// This struct contains all of the arguments to a target kernel region launch. @@ -102,19 +105,24 @@ struct KernelArgsTy { 0; // Tripcount for the teams / distribute loop, 0 otherwise. struct { uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause. - uint64_t Unused : 63; - } Flags = {0, 0}; + uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA. + uint64_t Unused : 62; + } Flags = {0, 0, 0}; // The number of teams (for x,y,z dimension). uint32_t NumTeams[3] = {0, 0, 0}; // The number of threads (for x,y,z dimension). uint32_t ThreadLimit[3] = {0, 0, 0}; uint32_t DynCGroupMem = 0; // Amount of dynamic cgroup memory requested. + // A __tgt_async_info queue pointer to be used for the kernel and all + // associated device interactions. The operations are implicitly made + // non-blocking. + void *AsyncInfoQueue = nullptr; }; static_assert(sizeof(KernelArgsTy().Flags) == sizeof(uint64_t), "Invalid struct size"); static_assert(sizeof(KernelArgsTy) == (8 * sizeof(int32_t) + 3 * sizeof(int64_t) + - 4 * sizeof(void **) + 2 * sizeof(int64_t *)), + 5 * sizeof(void **) + 2 * sizeof(int64_t *)), "Invalid struct size"); /// Flat array of kernel launch parameters and their total size. diff --git a/offload/include/Shared/RefCnt.h b/offload/include/Shared/RefCnt.h new file mode 100644 index 0000000000000..7c615ba167a3d --- /dev/null +++ b/offload/include/Shared/RefCnt.h @@ -0,0 +1,56 @@ +//===-- Shared/RefCnt.h - Helper to keep track of references --- C++ ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_SHARED_REF_CNT_H +#define OMPTARGET_SHARED_REF_CNT_H + +#include +#include +#include +#include + +namespace llvm { +namespace omp { +namespace target { + +/// Utility class for thread-safe reference counting. Any class that needs +/// objects' reference counting can inherit from this entity or have it as a +/// class data member. +template +struct RefCountTy { + /// Create a refcount object initialized to zero. + RefCountTy() : Refs(0) {} + + ~RefCountTy() { assert(Refs == 0 && "Destroying with non-zero refcount"); } + + /// Increase the reference count atomically. + void increase() { Refs.fetch_add(1, MemoryOrder); } + + /// Decrease the reference count and return whether it became zero. Decreasing + /// the counter in more units than it was previously increased results in + /// undefined behavior. + bool decrease() { + Ty Prev = Refs.fetch_sub(1, MemoryOrder); + assert(Prev > 0 && "Invalid refcount"); + return (Prev == 1); + } + + Ty get() const { return Refs.load(MemoryOrder); } + +private: + /// The atomic reference counter. + std::atomic Refs; +}; +} // namespace target +} // namespace omp +} // namespace llvm + +#endif diff --git a/offload/include/Shared/Sanitizer.h b/offload/include/Shared/Sanitizer.h new file mode 100644 index 0000000000000..1e906e184758d --- /dev/null +++ b/offload/include/Shared/Sanitizer.h @@ -0,0 +1,393 @@ +//===-- Shared/SanitizerHost.h - OFfload sanitizer host logic ----- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_SHARED_SANITIZER_HOST_H +#define OMPTARGET_SHARED_SANITIZER_HOST_H + +#include "Types.h" +#include "Utils.h" + +extern "C" { +int ompx_block_id(int Dim); +int ompx_block_dim(int Dim); +int ompx_thread_id(int Dim); +int64_t __san_get_location_value(); +} + +#define INLINE gnu::always_inline +#define NOINLINE gnu::noinline + +enum class AllocationKind { LOCAL, GLOBAL, LAST = GLOBAL }; + +template struct ASTypes { + using INT_TY = uint64_t; +}; +#pragma omp begin declare variant match(device = {arch(amdgcn)}) +template <> struct ASTypes { + using INT_TY = uint32_t; +}; +#pragma omp end declare variant + +template struct SanitizerConfig { + static constexpr uint32_t ADDR_SPACE = AK == AllocationKind::GLOBAL ? 0 : 5; + static constexpr uint32_t ADDR_SPACE_PTR_SIZE = + sizeof(typename ASTypes::INT_TY) * 8; + + static constexpr uint32_t NUM_ALLOCATION_ARRAYS = + AK == AllocationKind::GLOBAL ? 1 : (1024 * 1024 * 2); + static constexpr uint32_t TAG_BITS = AK == AllocationKind::GLOBAL ? 1 : 8; + static constexpr uint32_t MAGIC_BITS = 3; + static constexpr uint32_t MAGIC = 0b101; + + static constexpr uint32_t OBJECT_BITS = AK == AllocationKind::GLOBAL ? 
10 : 7; + static constexpr uint32_t SLOTS = (1 << (OBJECT_BITS)); + static constexpr uint32_t KIND_BITS = 1; + static constexpr uint32_t Id_BITS = 9 - KIND_BITS; + + static constexpr uint32_t LENGTH_BITS = + ADDR_SPACE_PTR_SIZE - TAG_BITS - Id_BITS; + static constexpr uint32_t OFFSET_BITS = + ADDR_SPACE_PTR_SIZE - TAG_BITS - OBJECT_BITS - KIND_BITS - MAGIC_BITS; + + static constexpr bool useTags() { return TAG_BITS > 1; } + + static_assert(LENGTH_BITS + TAG_BITS + Id_BITS == ADDR_SPACE_PTR_SIZE, + "Length, tag, and Id bits should cover one pointer"); + static_assert(OFFSET_BITS + TAG_BITS + OBJECT_BITS + MAGIC_BITS + KIND_BITS == + ADDR_SPACE_PTR_SIZE, + "Offset, tag, object, and kind bits should cover one pointer"); + static_assert((1 << KIND_BITS) >= ((uint64_t)AllocationKind::LAST + 1), + "Kind bits should match allocation kinds"); +}; + +#define _AS_PTR(TY, AK) \ + TY [[clang::address_space(SanitizerConfig::ADDR_SPACE)]] * + +#define _AS_PTR_PTR(TY, AK) \ + TY [[clang::address_space(SanitizerConfig::ADDR_SPACE)]] ** + +template struct AllocationTy { + _AS_PTR(void, AK) Start; + typename ASTypes::INT_TY Length : SanitizerConfig::LENGTH_BITS; + typename ASTypes::INT_TY Tag : SanitizerConfig::TAG_BITS; + typename ASTypes::INT_TY Id : SanitizerConfig::Id_BITS; +}; + +template struct AllocationArrayTy { + AllocationTy Arr[SanitizerConfig::SLOTS]; + uint64_t Cnt; +}; + +template struct AllocationPtrTy { + static AllocationPtrTy get(_AS_PTR(void, AK) P) { + return utils::convertViaPun>(P); + } + static AllocationPtrTy get(void *P) { + return get((_AS_PTR(void, AK))(P)); + } + operator _AS_PTR(void, AK)() const { + return utils::convertViaPun<_AS_PTR(void, AK)>(*this); + } + operator typename ASTypes::INT_TY() const { + return utils::convertViaPun::INT_TY>(*this); + } + typename ASTypes::INT_TY Offset : SanitizerConfig::OFFSET_BITS; + typename ASTypes::INT_TY AllocationTag : SanitizerConfig::TAG_BITS; + typename ASTypes::INT_TY AllocationId : 
SanitizerConfig::OBJECT_BITS; + typename ASTypes::INT_TY Magic : SanitizerConfig::MAGIC_BITS; + // Must be last, TODO: merge into TAG + typename ASTypes::INT_TY Kind : SanitizerConfig::KIND_BITS; +}; +#pragma omp begin declare variant match(device = {arch(amdgcn)}) +static_assert(sizeof(AllocationPtrTy) * 8 == 32); +#pragma omp end declare variant + +union TypePunUnion { + uint64_t I; + void *P; + _AS_PTR(void, AllocationKind::LOCAL) AddrP; + struct { + AllocationPtrTy AP; + uint32_t U; + }; +}; +#pragma omp begin declare variant match(device = {arch(amdgcn)}) +static_assert(sizeof(TypePunUnion) * 8 == 64); +#pragma omp end declare variant + +static inline void *__offload_get_new_sanitizer_ptr(int32_t Slot) { + AllocationPtrTy AP; + AP.Offset = 0; + AP.AllocationId = Slot; + AP.Magic = SanitizerConfig::MAGIC; + AP.Kind = (uint32_t)AllocationKind::GLOBAL; + return (void *)(_AS_PTR(void, AllocationKind::GLOBAL))AP; +} + +template struct Allocations { + static AllocationArrayTy Arr[SanitizerConfig::NUM_ALLOCATION_ARRAYS]; +}; + +struct LocationEncodingTy { + uint64_t FunctionNameIdx; + uint64_t FileNameIdx; + uint64_t LineNo; + uint64_t ColumnNo; + uint64_t ParentIdx; +}; + +struct SanitizerTrapInfoTy { + /// AllocationTy + /// { + void *AllocationStart; + uint64_t AllocationLength; + int32_t AllocationId; + uint32_t AllocationTag; + uint8_t AllocationKind; + ///} + + enum ErrorCodeTy : uint8_t { + None = 0, + ExceedsLength, + ExceedsSlots, + PointerOutsideAllocation, + OutOfBounds, + UseAfterScope, + UseAfterFree, + MemoryLeak, + GarbagePointer, + } ErrorCode; + + /// AllocationTy + /// { + uint64_t PtrOffset; + uint64_t PtrSlot; + uint16_t PtrTag; + uint16_t PtrKind; + ///} + + /// Access + /// { + uint32_t AccessSize; + int64_t AccessId; + /// } + + /// Thread + /// { + uint64_t BlockId[3]; + uint32_t ThreadId[3]; + uint64_t PC; + uint64_t LocationId; + int64_t CallId; + /// } + + [[clang::disable_sanitizer_instrumentation, INLINE]] void + setCoordinates(int64_t 
SourceId, uint64_t ProgramCounter) { + for (int32_t Dim = 0; Dim < 3; ++Dim) { + BlockId[Dim] = ompx_block_id(Dim); + ThreadId[Dim] = ompx_thread_id(Dim); + } + PC = ProgramCounter; + LocationId = SourceId; + CallId = __san_get_location_value(); + } + + template + [[clang::disable_sanitizer_instrumentation, INLINE]] void + allocationError(ErrorCodeTy EC, _AS_PTR(void, AK) Start, uint64_t Length, + int64_t Id, int64_t Tag, uint64_t Slot, int64_t SourceId, + uint64_t PC) { + AllocationStart = (void *)Start; + AllocationLength = Length; + AllocationId = Id; + AllocationTag = Tag; + AllocationKind = (decltype(AllocationKind))AK; + PtrSlot = Slot; + + ErrorCode = EC; + setCoordinates(SourceId, PC); + } + + template + [[clang::disable_sanitizer_instrumentation, INLINE]] void + propagateAccessError(ErrorCodeTy EC, const AllocationTy &A, + const AllocationPtrTy &AP, uint64_t Size, int64_t Id, + int64_t SourceId, uint64_t PC) { + AllocationStart = (void *)A.Start; + AllocationLength = A.Length; + AllocationId = A.Id; + AllocationTag = A.Tag; + AllocationKind = (decltype(AllocationKind))AK; + + ErrorCode = EC; + + PtrOffset = AP.Offset; + PtrSlot = AP.AllocationId; + PtrTag = AP.AllocationTag; + PtrKind = AP.Kind; + + AccessSize = Size; + AccessId = Id; + + setCoordinates(SourceId, PC); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, NOINLINE, + gnu::cold]] void + exceedsAllocationLength(_AS_PTR(void, AK) Start, uint64_t Length, + int64_t AllocationId, uint64_t Slot, int64_t SourceId, + uint64_t PC) { + allocationError(ExceedsLength, Start, Length, AllocationId, /*Tag=*/0, + Slot, SourceId, PC); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, NOINLINE, + gnu::cold]] void + exceedsAllocationSlots(_AS_PTR(void, AK) Start, uint64_t Length, + int64_t AllocationId, uint64_t Slot, int64_t SourceId, + uint64_t PC) { + allocationError(ExceedsSlots, Start, Length, AllocationId, /*Tag=*/0, + Slot, SourceId, PC); + 
__builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, NOINLINE, + gnu::cold]] void + pointerOutsideAllocation(_AS_PTR(void, AK) Start, uint64_t Length, + int64_t AllocationId, uint64_t Slot, uint64_t PC) { + allocationError(PointerOutsideAllocation, Start, Length, AllocationId, + /*Tag=*/0, Slot, /*SourceId=*/0, PC); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void + outOfBoundAccess(const AllocationTy A, const AllocationPtrTy AP, + uint64_t Size, int64_t AccessId, int64_t SourceId, + uint64_t PC) { + propagateAccessError(OutOfBounds, A, AP, Size, AccessId, SourceId, PC); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void + useAfterScope(const AllocationTy A, const AllocationPtrTy AP, + uint64_t Size, int64_t AccessId, int64_t SourceId, + uint64_t PC) { + propagateAccessError(UseAfterScope, A, AP, Size, AccessId, SourceId, PC); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void + useAfterFree(const AllocationTy A, const AllocationPtrTy AP, + uint64_t Size, int64_t AccessId, int64_t SourceId, uint64_t PC) { + propagateAccessError(UseAfterFree, A, AP, Size, AccessId, SourceId, PC); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, NOINLINE, + gnu::cold]] void + accessError(const AllocationPtrTy AP, int64_t Size, int64_t AccessId, + int64_t SourceId, uint64_t PC); + + template + [[clang::disable_sanitizer_instrumentation, noreturn, NOINLINE, + gnu::cold]] void + garbagePointer(const AllocationPtrTy AP, void *P, int64_t SourceId, + uint64_t PC) { + ErrorCode = GarbagePointer; + AllocationStart = P; + AllocationKind = (decltype(AllocationKind))AK; + PtrOffset = AP.Offset; + PtrSlot = AP.AllocationId; + PtrTag = AP.AllocationTag; + PtrKind = AP.Kind; + setCoordinates(SourceId, PC); + 
__builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, NOINLINE, + gnu::cold]] void + garbagePointer2(const AllocationPtrTy AP, void *P, int64_t SourceId, + int64_t Id, + uint64_t PC) { + ErrorCode = GarbagePointer; + AllocationStart = P; + AllocationKind = (decltype(AllocationKind))AK; + PtrOffset = AP.Offset; + PtrSlot = AP.AllocationId; + PtrTag = AP.AllocationTag; + PtrKind = AP.Kind; + AccessId = Id; + setCoordinates(SourceId, PC); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void + memoryLeak(const AllocationTy A, uint64_t Slot) { + allocationError(MemoryLeak, A.Start, A.Length, A.Id, A.Tag, Slot, + /*SourceId=*/0, /*PC=*/0); + __builtin_trap(); + } +}; + +[[gnu::used, gnu::retain, gnu::weak, + gnu::visibility("protected")]] SanitizerTrapInfoTy *__sanitizer_trap_info_ptr; + +template +[[clang::disable_sanitizer_instrumentation, + gnu::always_inline]] AllocationArrayTy & +getAllocationArray() { + uint32_t ThreadId = 0, BlockId = 0; + if constexpr (AK == AllocationKind::LOCAL) { + ThreadId = ompx_thread_id(0); + BlockId = ompx_block_id(0); + } + return Allocations::Arr[ThreadId + BlockId * ompx_block_dim(0)]; +} + +template +[[clang::disable_sanitizer_instrumentation, + gnu::always_inline]] AllocationTy & +getAllocation(const AllocationPtrTy AP, int64_t AccessId, uint64_t PC) { + auto &AllocArr = getAllocationArray(); + uint64_t NumSlots = SanitizerConfig::SLOTS; + uint64_t Slot = AP.AllocationId; + if (Slot >= NumSlots) + __sanitizer_trap_info_ptr->pointerOutsideAllocation(AP, AP.Offset, + AccessId, Slot, PC); + return AllocArr.Arr[Slot]; +} + +template +[[clang::disable_sanitizer_instrumentation, noreturn, NOINLINE, gnu::cold]] void +SanitizerTrapInfoTy::accessError(const AllocationPtrTy AP, int64_t Size, + int64_t AccessId, int64_t SourceId, + uint64_t PC) { + auto &A = getAllocationArray().Arr[AP.AllocationId]; + int64_t Offset = AP.Offset; + int64_t 
Length = A.Length; + if (AK == AllocationKind::LOCAL && Length == 0) + useAfterScope(A, AP, Size, AccessId, SourceId, PC); + else if (Offset > Length - Size) + outOfBoundAccess(A, AP, Size, AccessId, SourceId, PC); + else + useAfterFree(A, AP, Size, AccessId, SourceId, PC); +} + +#endif diff --git a/offload/include/Shared/Types.h b/offload/include/Shared/Types.h new file mode 100644 index 0000000000000..15e3cfefa37ed --- /dev/null +++ b/offload/include/Shared/Types.h @@ -0,0 +1,22 @@ +//===-- Shared/Types.h - Type defs shared between host and device - C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Environments shared between host and device. +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_SHARED_TYPES_H +#define OMPTARGET_SHARED_TYPES_H + +#ifndef OMPTARGET_DEVICE_RUNTIME +#include +#else +#include "DeviceTypes.h" +#endif + +#endif // OMPTARGET_SHARED_TYPES_H diff --git a/offload/include/Shared/Utils.h b/offload/include/Shared/Utils.h index fce14b54edb98..22ea2ba262d2a 100644 --- a/offload/include/Shared/Utils.h +++ b/offload/include/Shared/Utils.h @@ -14,75 +14,65 @@ #ifndef OMPTARGET_SHARED_UTILS_H #define OMPTARGET_SHARED_UTILS_H -#include "llvm/ADT/StringRef.h" +#include "Types.h" -#include "Debug.h" - -#include -#include -#include -#include - -namespace llvm { -namespace omp { -namespace target { - -/// Utility class for thread-safe reference counting. Any class that needs -/// objects' reference counting can inherit from this entity or have it as a -/// class data member. -template -struct RefCountTy { - /// Create a refcount object initialized to zero. 
- RefCountTy() : Refs(0) {} - - ~RefCountTy() { assert(Refs == 0 && "Destroying with non-zero refcount"); } - - /// Increase the reference count atomically. - void increase() { Refs.fetch_add(1, MemoryOrder); } - - /// Decrease the reference count and return whether it became zero. Decreasing - /// the counter in more units than it was previously increased results in - /// undefined behavior. - bool decrease() { - Ty Prev = Refs.fetch_sub(1, MemoryOrder); - assert(Prev > 0 && "Invalid refcount"); - return (Prev == 1); - } - - Ty get() const { return Refs.load(MemoryOrder); } - -private: - /// The atomic reference counter. - std::atomic Refs; -}; +namespace utils { /// Return the difference (in bytes) between \p Begin and \p End. template -ptrdiff_t getPtrDiff(const void *End, const void *Begin) { +auto getPtrDiff(const void *End, const void *Begin) { return reinterpret_cast(End) - reinterpret_cast(Begin); } /// Return \p Ptr advanced by \p Offset bytes. -template Ty *advanceVoidPtr(Ty *Ptr, int64_t Offset) { - static_assert(std::is_void::value); - return const_cast(reinterpret_cast(Ptr) + Offset); +template Ty1 *advancePtr(Ty1 *Ptr, Ty2 Offset) { + return (Ty1 *)(const_cast((const char *)(Ptr)) + Offset); } -/// Return \p Ptr aligned to \p Alignment bytes. -template Ty *alignPtr(Ty *Ptr, int64_t Alignment) { - size_t Space = std::numeric_limits::max(); - return std::align(Alignment, sizeof(char), Ptr, Space); +/// Return \p V aligned "upwards" according to \p Align. +template inline Ty1 alignPtr(Ty1 V, Ty2 Align) { + return reinterpret_cast(((uintptr_t(V) + Align - 1) / Align) * Align); +} +/// Return \p V aligned "downwards" according to \p Align. +template inline Ty1 alignDown(Ty1 V, Ty2 Align) { + return V - V % Align; } /// Round up \p V to a \p Boundary. template inline Ty roundUp(Ty V, Ty Boundary) { - return (V + Boundary - 1) / Boundary * Boundary; + return alignPtr(V, Boundary); +} + +/// Return the first bit set in \p V. 
+inline uint32_t ffs(uint32_t V) { + static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); + return __builtin_ffs(V); +} + +/// Return the first bit set in \p V. +inline uint32_t ffs(uint64_t V) { + static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); + return __builtin_ffsl(V); +} + +/// Return the number of bits set in \p V. +inline uint32_t popc(uint32_t V) { + static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); + return __builtin_popcount(V); +} + +/// Return the number of bits set in \p V. +inline uint32_t popc(uint64_t V) { + static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); + return __builtin_popcountl(V); +} + +template inline DstTy convertViaPun(SrcTy V) { + static_assert(sizeof(DstTy) == sizeof(SrcTy), "Bad conversion"); + return *((DstTy *)(&V)); } -} // namespace target -} // namespace omp -} // namespace llvm +} // namespace utils #endif // OMPTARGET_SHARED_UTILS_H diff --git a/offload/include/device.h b/offload/include/device.h index fd6e5fba5fc53..69954254666b8 100644 --- a/offload/include/device.h +++ b/offload/include/device.h @@ -100,11 +100,15 @@ struct DeviceTy { /// Notify the plugin about a new mapping starting at the host address /// \p HstPtr and \p Size bytes. - int32_t notifyDataMapped(void *HstPtr, int64_t Size); + /// If GPUSan is enabled, \p DevicePtr is registered in each image and + /// \p FakeHstPtr is updated. + int32_t notifyDataMapped(void *HstPtr, void *DevicePtr, int64_t Size, + void *&FakeHstPtr); /// Notify the plugin about an existing mapping being unmapped starting at /// the host address \p HstPtr. - int32_t notifyDataUnmapped(void *HstPtr); + /// If GPUSan is enabled, \p FakeHstPtr is unregistered. + int32_t notifyDataUnmapped(void *HstPtr, void *FakeHstPtr); // Launch the kernel identified by \p TgtEntryPtr with the given arguments. 
int32_t launchKernel(void *TgtEntryPtr, void **TgtVarsPtr, diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h index 323dee41630f2..8730879905984 100644 --- a/offload/include/omptarget.h +++ b/offload/include/omptarget.h @@ -107,7 +107,7 @@ enum TargetAllocTy : int32_t { inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, - 0, {0,0}, {1, 0, 0}, {1, 0, 0}, 0}; + 0, {0,0,0}, {1, 0, 0}, {1, 0, 0}, 0}; struct DeviceTy; @@ -136,8 +136,19 @@ class AsyncInfoTy { /// Synchronization method to be used. SyncTy SyncType; - AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING) + AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING) : Device(Device), SyncType(SyncType) {} + AsyncInfoTy(DeviceTy &Device, void *AsyncInfoQueue) + : Device(Device), SyncType(AsyncInfoQueue ? SyncTy::NON_BLOCKING : SyncTy::BLOCKING) { + AsyncInfo.Queue = AsyncInfoQueue; + AsyncInfo.PersistentQueue = !!AsyncInfoQueue; + } + AsyncInfoTy(DeviceTy &Device, void *AsyncInfoQueue, SyncTy SyncType) + : Device(Device), SyncType(SyncType) { + AsyncInfo.Queue = AsyncInfoQueue; + AsyncInfo.PersistentQueue = !!AsyncInfoQueue; + } + ~AsyncInfoTy() { synchronize(); } /// Implicit conversion to the __tgt_async_info which is used in the @@ -207,8 +218,9 @@ class TaskAsyncInfoWrapperTy { void **TaskAsyncInfoPtr = nullptr; public: - TaskAsyncInfoWrapperTy(DeviceTy &Device) + TaskAsyncInfoWrapperTy(DeviceTy &Device, void *AsyncInfoQueue= nullptr) : ExecThreadID(__kmpc_global_thread_num(NULL)), LocalAsyncInfo(Device) { + assert(!AsyncInfoQueue && "Async tasks do not support predefined async queue pointers!"); // If we failed to acquired the current global thread id, we cannot // re-enqueue the current task. Thus we should use the local blocking async // info. 
@@ -425,6 +437,8 @@ int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, void *VAddr, bool IsRecord, bool SaveOutput, uint64_t &ReqPtrArgOffset); +void *__tgt_target_get_default_queue(void *Loc, int64_t DeviceId); + #ifdef __cplusplus } #endif diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index e678213df18ce..d9edfb0e8eab6 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -23,6 +25,7 @@ #include "Shared/APITypes.h" #include "Shared/Debug.h" #include "Shared/Environment.h" +#include "Shared/RefCnt.h" #include "Shared/Utils.h" #include "Utils/ELF.h" @@ -87,7 +90,7 @@ struct AMDGPUDeviceImageTy; struct AMDGPUMemoryManagerTy; struct AMDGPUMemoryPoolTy; -namespace utils { +namespace hsa_utils { /// Iterate elements using an HSA iterate function. Do not use this function /// directly but the specialized ones below instead. @@ -187,7 +190,7 @@ Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent, Expected getTargetTripleAndFeatures(hsa_agent_t Agent) { std::string Target; - auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) { + auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) { uint32_t Length; hsa_status_t Status; Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length); @@ -208,7 +211,7 @@ Expected getTargetTripleAndFeatures(hsa_agent_t Agent) { return Err; return Target; } -} // namespace utils +} // namespace hsa_utils /// Utility class representing generic resource references to AMDGPU resources. 
template @@ -481,7 +484,7 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy { findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const; /// Get additional info for kernel, e.g., register spill counts - std::optional + std::optional getKernelInfo(StringRef Identifier) const { auto It = KernelInfoMap.find(Identifier); @@ -495,7 +498,7 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy { /// The exectuable loaded on the agent. hsa_executable_t Executable; hsa_code_object_t CodeObject; - StringMap KernelInfoMap; + StringMap KernelInfoMap; uint16_t ELFABIVersion; }; @@ -545,7 +548,8 @@ struct AMDGPUKernelTy : public GenericKernelTy { // TODO: Read the kernel descriptor for the max threads per block. May be // read from the image. - ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion()); + ImplicitArgsSize = + hsa_utils::getImplicitArgsSize(AMDImage.getELFABIVersion()); DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion()); // Get additional kernel info read from image @@ -596,7 +600,7 @@ struct AMDGPUKernelTy : public GenericKernelTy { uint32_t ImplicitArgsSize; /// Additional Info for the AMD GPU Kernel - std::optional KernelInfo; + std::optional KernelInfo; }; /// Class representing an HSA signal. Signals are used to define dependencies @@ -685,12 +689,12 @@ struct AMDGPUQueueTy { AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {} /// Lazily initialize a new queue belonging to a specific agent. - Error init(hsa_agent_t Agent, int32_t QueueSize) { + Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) { if (Queue) return Plugin::success(); hsa_status_t Status = hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError, - nullptr, UINT32_MAX, UINT32_MAX, &Queue); + &Device, UINT32_MAX, UINT32_MAX, &Queue); return Plugin::check(Status, "Error in hsa_queue_create: %s"); } @@ -875,10 +879,8 @@ struct AMDGPUQueueTy { } /// Callack that will be called when an error is detected on the HSA queue. 
- static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) { - auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source); - FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data()); - } + static void callbackError(hsa_status_t Status, hsa_queue_t *Source, + void *Data); /// The HSA queue. hsa_queue_t *Queue; @@ -1268,13 +1270,14 @@ struct AMDGPUStreamTy { // Issue the async memory copy. if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent, - CopySize, 1, &InputSignalRaw, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, + Agent, CopySize, 1, &InputSignalRaw, + OutputSignal->get()); } - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent, - CopySize, 0, nullptr, OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, + Agent, CopySize, 0, nullptr, + OutputSignal->get()); } /// Push an asynchronous memory copy device-to-host involving an unpinned @@ -1308,14 +1311,14 @@ struct AMDGPUStreamTy { // dependency if already satisfied. if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - if (auto Err = utils::asyncMemCopy( + if (auto Err = hsa_utils::asyncMemCopy( UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1, &InputSignalRaw, OutputSignals[0]->get())) return Err; } else { - if (auto Err = utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, Agent, - Src, Agent, CopySize, 0, nullptr, - OutputSignals[0]->get())) + if (auto Err = hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, + Agent, Src, Agent, CopySize, 0, + nullptr, OutputSignals[0]->get())) return Err; } @@ -1406,12 +1409,13 @@ struct AMDGPUStreamTy { // dependency if already satisfied. 
if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, - Agent, CopySize, 1, &InputSignalRaw, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, + Agent, CopySize, 1, &InputSignalRaw, + OutputSignal->get()); } - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, Agent, - CopySize, 0, nullptr, OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, + Agent, CopySize, 0, nullptr, + OutputSignal->get()); } // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead @@ -1435,13 +1439,13 @@ struct AMDGPUStreamTy { if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, - SrcAgent, CopySize, 1, &InputSignalRaw, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, + SrcAgent, CopySize, 1, &InputSignalRaw, + OutputSignal->get()); } - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, - SrcAgent, CopySize, 0, nullptr, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, + SrcAgent, CopySize, 0, nullptr, + OutputSignal->get()); } /// Synchronize with the stream. 
The current thread waits until all operations @@ -1593,8 +1597,9 @@ struct AMDGPUStreamManagerTy final using ResourceRef = AMDGPUResourceRef; using ResourcePoolTy = GenericDeviceResourceManagerTy; + GenericDeviceTy &Device; AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent) - : GenericDeviceResourceManagerTy(Device), + : GenericDeviceResourceManagerTy(Device), Device(Device), OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true), NextQueue(0), Agent(HSAAgent) {} @@ -1603,7 +1608,7 @@ struct AMDGPUStreamManagerTy final QueueSize = HSAQueueSize; MaxNumQueues = NumHSAQueues; // Initialize one queue eagerly - if (auto Err = Queues.front().init(Agent, QueueSize)) + if (auto Err = Queues.front().init(Device, Agent, QueueSize)) return Err; return GenericDeviceResourceManagerTy::init(InitialSize); @@ -1660,7 +1665,7 @@ struct AMDGPUStreamManagerTy final } // Make sure the queue is initialized, then add user & assign. - if (auto Err = Queues[Index].init(Agent, QueueSize)) + if (auto Err = Queues[Index].init(Device, Agent, QueueSize)) return Err; Queues[Index].addUser(); Stream->Queue = &Queues[Index]; @@ -1799,7 +1804,7 @@ struct AMDHostDeviceTy : public AMDGenericDeviceTy { Error retrieveAllMemoryPools() override { // Iterate through the available pools across the host agents. 
for (hsa_agent_t Agent : Agents) { - Error Err = utils::iterateAgentMemoryPools( + Error Err = hsa_utils::iterateAgentMemoryPools( Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { AMDGPUMemoryPoolTy *MemoryPool = new AMDGPUMemoryPoolTy(HSAMemoryPool); @@ -1964,7 +1969,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { // Detect if XNACK is enabled auto TargeTripleAndFeaturesOrError = - utils::getTargetTripleAndFeatures(Agent); + hsa_utils::getTargetTripleAndFeatures(Agent); if (!TargeTripleAndFeaturesOrError) return TargeTripleAndFeaturesOrError.takeError(); if (static_cast(*TargeTripleAndFeaturesOrError) @@ -2208,8 +2213,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Err; // Once the stream is synchronized, return it to stream pool and reset - // AsyncInfo. This is to make sure the synchronization only works for its - // own tasks. + // AsyncInfo if the queue is not persistent. This is to make sure the + // synchronization only works for its own tasks. + if (AsyncInfo.PersistentQueue) + return Plugin::success(); + AsyncInfo.Queue = nullptr; return AMDGPUStreamManager.returnResource(Stream); } @@ -2228,9 +2236,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (!(*CompletedOrErr)) return Plugin::success(); - // Once the stream is completed, return it to stream pool and reset - // AsyncInfo. This is to make sure the synchronization only works for its - // own tasks. + // Once the stream is synchronized, return it to stream pool and reset + // AsyncInfo if the queue is not persistent. This is to make sure the + // synchronization only works for its own tasks. 
+ if (AsyncInfo.PersistentQueue) + return Plugin::success(); + AsyncInfo.Queue = nullptr; return AMDGPUStreamManager.returnResource(Stream); } @@ -2316,9 +2327,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = Signal.init()) return Err; - if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr, - Agent, PinnedPtr, Agent, Size, 0, - nullptr, Signal.get())) + if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr, + Agent, PinnedPtr, Agent, Size, 0, + nullptr, Signal.get())) return Err; if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) @@ -2376,9 +2387,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = Signal.init()) return Err; - if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), PinnedPtr, - Agent, TgtPtr, Agent, Size, 0, nullptr, - Signal.get())) + if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), + PinnedPtr, Agent, TgtPtr, Agent, + Size, 0, nullptr, Signal.get())) return Err; if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) @@ -2420,7 +2431,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = Signal.init()) return Err; - if (auto Err = utils::asyncMemCopy( + if (auto Err = hsa_utils::asyncMemCopy( useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr, getAgent(), (uint64_t)Size, 0, nullptr, Signal.get())) return Err; @@ -2443,7 +2454,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// Initialize the async info for interoperability purposes. Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { - // TODO: Implement this function. 
+ AMDGPUStreamTy *Stream; + if (auto Err = getStream(AsyncInfoWrapper, Stream)) + return Err; + return Plugin::success(); } @@ -2686,7 +2700,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { } Info.add("ISAs"); - auto Err = utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) { + auto Err = hsa_utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) { Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar); if (Status == HSA_STATUS_SUCCESS) Info.add("Name", TmpChar); @@ -2768,7 +2782,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// Retrieve and construct all memory pools of the device agent. Error retrieveAllMemoryPools() override { // Iterate through the available pools of the device agent. - return utils::iterateAgentMemoryPools( + return hsa_utils::iterateAgentMemoryPools( Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { AMDGPUMemoryPoolTy *MemoryPool = Plugin.allocate(); @@ -2954,7 +2968,7 @@ Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) { if (Result) return Plugin::error("Loaded HSA executable does not validate"); - if (auto Err = utils::readAMDGPUMetaDataFromImage( + if (auto Err = hsa_utils::readAMDGPUMetaDataFromImage( getMemoryBuffer(), KernelInfoMap, ELFABIVersion)) return Err; @@ -3083,7 +3097,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy { llvm::SmallVector HostAgents; // Count the number of available agents. - auto Err = utils::iterateAgents([&](hsa_agent_t Agent) { + auto Err = hsa_utils::iterateAgents([&](hsa_agent_t Agent) { // Get the device type of the agent. 
hsa_device_type_t DeviceType; hsa_status_t Status = @@ -3178,12 +3192,12 @@ struct AMDGPUPluginTy final : public GenericPluginTy { return false; auto TargeTripleAndFeaturesOrError = - utils::getTargetTripleAndFeatures(getKernelAgent(DeviceId)); + hsa_utils::getTargetTripleAndFeatures(getKernelAgent(DeviceId)); if (!TargeTripleAndFeaturesOrError) return TargeTripleAndFeaturesOrError.takeError(); - return utils::isImageCompatibleWithEnv(Processor ? *Processor : "", - ElfOrErr->getPlatformFlags(), - *TargeTripleAndFeaturesOrError); + return hsa_utils::isImageCompatibleWithEnv(Processor ? *Processor : "", + ElfOrErr->getPlatformFlags(), + *TargeTripleAndFeaturesOrError); } bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override { @@ -3295,11 +3309,11 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, if (auto Err = GenericDevice.getDeviceStackSize(StackSize)) return Err; - utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr; + hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr; if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) { // Initialize implicit arguments. - ImplArgs = reinterpret_cast( - advanceVoidPtr(AllArgs, LaunchParams.Size)); + ImplArgs = reinterpret_cast( + utils::advancePtr(AllArgs, LaunchParams.Size)); // Initialize the implicit arguments to zero. std::memset(ImplArgs, 0, getImplicitArgsSize()); @@ -3323,7 +3337,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, // Only COV5 implicitargs needs to be set. COV4 implicitargs are not used. 
if (ImplArgs && - getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) { + getImplicitArgsSize() == sizeof(hsa_utils::AMDGPUImplicitArgsTy)) { ImplArgs->BlockCountX = NumBlocks; ImplArgs->BlockCountY = 1; ImplArgs->BlockCountZ = 1; @@ -3480,6 +3494,16 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { return Alloc; } +void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source, + void *Data) { + + auto *Device = reinterpret_cast(Data); + Device->reportSanitizerError(); + + auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source); + FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data()); +} + } // namespace plugin } // namespace target } // namespace omp diff --git a/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h index 58a3b5df00fac..1e99d0a30bdf2 100644 --- a/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h +++ b/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h @@ -33,7 +33,7 @@ namespace llvm { namespace omp { namespace target { namespace plugin { -namespace utils { +namespace hsa_utils { // The implicit arguments of COV5 AMDGPU kernels. 
struct AMDGPUImplicitArgsTy { @@ -310,7 +310,7 @@ readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer, return Error::success(); } -} // namespace utils +} // namespace hsa_utils } // namespace plugin } // namespace target } // namespace omp diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h index 829b4b7291193..b88d3e52bbf43 100644 --- a/offload/plugins-nextgen/common/include/GlobalHandler.h +++ b/offload/plugins-nextgen/common/include/GlobalHandler.h @@ -50,6 +50,9 @@ class GlobalTy { const std::string &getName() const { return Name; } uint32_t getSize() const { return Size; } void *getPtr() const { return Ptr; } + template T *getPtrAs() const { + return reinterpret_cast(Ptr); + } void setSize(int32_t S) { Size = S; } void setPtr(void *P) { Ptr = P; } diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 0d2a36a42d5fa..c6ddaafce54e0 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -19,11 +19,13 @@ #include #include +#include "ExclusiveAccess.h" #include "Shared/APITypes.h" #include "Shared/Debug.h" #include "Shared/Environment.h" #include "Shared/EnvironmentVar.h" #include "Shared/Requirements.h" +#include "Shared/Sanitizer.h" #include "Shared/Utils.h" #include "GlobalHandler.h" @@ -231,7 +233,7 @@ class DeviceImageTy { /// Get the image size. size_t getSize() const { - return getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart); + return utils::getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart); } /// Get a memory buffer reference to the whole image. 
@@ -382,6 +384,14 @@ struct GenericKernelTy { bool IsBareKernel = false; }; +struct AllocationInfoTy { + std::string AllocationTrace; + std::string DeallocationTrace; + void *HostPtr; + uint64_t Size; + TargetAllocTy Kind; +}; + /// Class representing a map of host pinned allocations. We track these pinned /// allocations, so memory tranfers invloving these buffers can be optimized. class PinnedAllocationMapTy { @@ -471,7 +481,7 @@ class PinnedAllocationMapTy { --It; // The buffer is not contained in the pinned allocation. - if (advanceVoidPtr(It->HstPtr, It->Size) > HstPtr) + if (utils::advancePtr(It->HstPtr, It->Size) > HstPtr) return &(*It); // None found. @@ -498,15 +508,15 @@ class PinnedAllocationMapTy { /// Indicate whether the first range A fully contains the second range B. static bool contains(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) { - void *EndA = advanceVoidPtr(PtrA, SizeA); - void *EndB = advanceVoidPtr(PtrB, SizeB); + void *EndA = utils::advancePtr(PtrA, SizeA); + void *EndB = utils::advancePtr(PtrB, SizeB); return (PtrB >= PtrA && EndB <= EndA); } /// Indicate whether the first range A intersects with the second range B. static bool intersects(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) { - void *EndA = advanceVoidPtr(PtrA, SizeA); - void *EndB = advanceVoidPtr(PtrB, SizeB); + void *EndA = utils::advancePtr(PtrA, SizeA); + void *EndB = utils::advancePtr(PtrB, SizeB); return (PtrA < EndB && PtrB < EndA); } @@ -588,8 +598,8 @@ class PinnedAllocationMapTy { if (!Entry) return nullptr; - return advanceVoidPtr(Entry->DevAccessiblePtr, - getPtrDiff(HstPtr, Entry->HstPtr)); + return utils::advancePtr(Entry->DevAccessiblePtr, + utils::getPtrDiff(HstPtr, Entry->HstPtr)); } /// Check whether a buffer belongs to a registered host pinned allocation. 
@@ -601,6 +611,24 @@ class PinnedAllocationMapTy { } }; +struct GPUSanTy { + GPUSanTy(GenericDeviceTy &Device) : Device(Device) {} + Error notifyDataMapped(void *DevicePtr, uint64_t Size, void *&FakeHstPtr); + Error notifyDataUnmapped(void *FakeHstPtr); + + void addGPUSanNewFn(GenericKernelTy &GK) { NewFns.push_back(&GK); } + void addGPUSanFreeFn(GenericKernelTy &GK) { FreeFns.push_back(&GK); } + void checkAndReportError(); + Error transferFakePtrToDevice(const char *GlobalName, void *FakeHstPtr, + SmallVector &Images); + +private: + uint32_t SlotCnt = SanitizerConfig::SLOTS - 1; + GenericDeviceTy &Device; + SmallVector NewFns; + SmallVector FreeFns; +}; + /// Class implementing common functionalities of offload devices. Each plugin /// should define the specific device class, derive from this generic one, and /// implement the necessary virtual function members. @@ -718,17 +746,33 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// buffer (e.g., because a user OpenMP target map) and the buffer may be used /// as source/destination of memory transfers. We can use this information to /// lock the host buffer and optimize its memory transfers. - Error notifyDataMapped(void *HstPtr, int64_t Size) { + Error notifyDataMapped(void *HstPtr, void *DevicePtr, int64_t Size, + void *&FakeHstPtr) { + if (auto Err = GPUSan.notifyDataMapped(DevicePtr, Size, FakeHstPtr)) + return Err; + if (!HstPtr) + return Error::success(); return PinnedAllocs.lockMappedHostBuffer(HstPtr, Size); } /// Mark the host buffer with address \p HstPtr as unmapped. This means that /// libomptarget removed an existing mapping. If the plugin locked the buffer /// in notifyDataMapped, this function should unlock it. 
- Error notifyDataUnmapped(void *HstPtr) { + Error notifyDataUnmapped(void *HstPtr, void *FakeHstPtr) { + if (auto Err = GPUSan.notifyDataUnmapped(FakeHstPtr)) + return Err; + if (!HstPtr) + return Error::success(); return PinnedAllocs.unlockUnmappedHostBuffer(HstPtr); } + /// Transfers a fake pointer to its respective shadow variable to prevent + /// double initializing GPUSan shadow constants. Only runs if GPUSan is + /// enabled + Error transferFakePtrToDevice(const char *GlobalName, void *FakeHstPtr) { + return GPUSan.transferFakePtrToDevice(GlobalName, FakeHstPtr, LoadedImages); + } + /// Check whether the host buffer with address \p HstPtr is pinned by the /// underlying vendor-specific runtime (if any). Retrieve the host pointer, /// the device accessible pointer and the size of the original pinned buffer. @@ -736,6 +780,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy { void *&BaseDevAccessiblePtr, size_t &BaseSize) const = 0; + void addGPUSanNewFn(GenericKernelTy &GK) { GPUSan.addGPUSanNewFn(GK); } + void addGPUSanFreeFn(GenericKernelTy &GK) { GPUSan.addGPUSanFreeFn(GK); } + void reportSanitizerError() { GPUSan.checkAndReportError(); } + /// Submit data to the device (host to device transfer). Error dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size, __tgt_async_info *AsyncInfo); @@ -857,9 +905,13 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// Allocate and construct a kernel object. virtual Expected constructKernel(const char *Name) = 0; + DenseMap SanitizerTrapInfos; + /// Reference to the underlying plugin that created this device. GenericPluginTy &Plugin; + ProtectedObj> AllocationTraces; + private: /// Get and set the stack size and heap size for the device. If not used, the /// plugin can implement the setters as no-op and setting the output @@ -883,7 +935,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// Pointer to the memory manager or nullptr if not available. 
MemoryManagerTy *MemoryManager; - /// Environment variables defined by the OpenMP standard. Int32Envar OMP_TeamLimit; Int32Envar OMP_NumTeams; @@ -950,6 +1001,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy { #endif private: + GPUSanTy GPUSan; + DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0}; DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0}; }; @@ -1118,10 +1171,12 @@ struct GenericPluginTy { int32_t data_unlock(int32_t DeviceId, void *Ptr); /// Notify the runtime about a new mapping that has been created outside. - int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr, int64_t Size); + int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr, void *DevicePtr, + int64_t Size, void *&FakeHstPtr); /// Notify the runtime about a mapping that has been deleted. - int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr); + int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr, + void *FakeHstPtr); /// Copy data to the given device. int32_t data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr, diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp index ba0aa47f8e51c..e18f358af116c 100644 --- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp @@ -152,8 +152,8 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device, HostGlobal.getPtr()); assert(Image.getStart() <= ImageGlobal.getPtr() && - advanceVoidPtr(ImageGlobal.getPtr(), ImageGlobal.getSize()) < - advanceVoidPtr(Image.getStart(), Image.getSize()) && + utils::advancePtr(ImageGlobal.getPtr(), ImageGlobal.getSize()) < + utils::advancePtr(Image.getStart(), Image.getSize()) && "Attempting to read outside the image!"); // Perform the copy from the image to the host memory.
diff --git a/offload/plugins-nextgen/common/src/JIT.cpp b/offload/plugins-nextgen/common/src/JIT.cpp index 9dbba1459839d..9adb62b677b92 100644 --- a/offload/plugins-nextgen/common/src/JIT.cpp +++ b/offload/plugins-nextgen/common/src/JIT.cpp @@ -51,7 +51,7 @@ namespace { bool isImageBitcode(const __tgt_device_image &Image) { StringRef Binary(reinterpret_cast(Image.ImageStart), - target::getPtrDiff(Image.ImageEnd, Image.ImageStart)); + utils::getPtrDiff(Image.ImageEnd, Image.ImageStart)); return identify_magic(Binary) == file_magic::bitcode; } @@ -69,7 +69,7 @@ createModuleFromMemoryBuffer(std::unique_ptr &MB, Expected> createModuleFromImage(const __tgt_device_image &Image, LLVMContext &Context) { StringRef Data((const char *)Image.ImageStart, - target::getPtrDiff(Image.ImageEnd, Image.ImageStart)); + utils::getPtrDiff(Image.ImageEnd, Image.ImageStart)); std::unique_ptr MB = MemoryBuffer::getMemBuffer( Data, /*BufferName=*/"", /*RequiresNullTerminator=*/false); return createModuleFromMemoryBuffer(MB, Context); diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 94f9d4670b672..e7dbd3258d132 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -16,14 +16,22 @@ #include "GlobalHandler.h" #include "JIT.h" +#include "Shared/Sanitizer.h" +#include "Shared/Utils.h" #include "Utils/ELF.h" #include "omptarget.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Signals.h" +#include "llvm/Support/raw_ostream.h" +#include +#include #ifdef OMPT_SUPPORT #include "OpenMP/OMPT/Callback.h" #include "omp-tools.h" #endif +#include "llvm/ADT/StringRef.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Support/Error.h" @@ -74,7 +82,7 @@ struct RecordReplayTy { Device->allocate(1024, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT); Device->free(Addr); // Align Address to 
MaxMemoryAllocation - Addr = (void *)alignPtr((Addr), MaxMemoryAllocation); + Addr = (void *)utils::alignPtr((Addr), MaxMemoryAllocation); return Addr; } @@ -207,8 +215,8 @@ struct RecordReplayTy { if (EC) report_fatal_error("Error saving image : " + StringRef(EC.message())); if (const auto *TgtImageBitcode = Image.getTgtImageBitcode()) { - size_t Size = - getPtrDiff(TgtImageBitcode->ImageEnd, TgtImageBitcode->ImageStart); + size_t Size = utils::getPtrDiff(TgtImageBitcode->ImageEnd, + TgtImageBitcode->ImageStart); MemoryBufferRef MBR = MemoryBufferRef( StringRef((const char *)TgtImageBitcode->ImageStart, Size), ""); OS << MBR.getBuffer(); @@ -241,10 +249,10 @@ struct RecordReplayTy { int32_t NameLength = std::strlen(OffloadEntry.Name) + 1; memcpy(BufferPtr, OffloadEntry.Name, NameLength); - BufferPtr = advanceVoidPtr(BufferPtr, NameLength); + BufferPtr = utils::advancePtr(BufferPtr, NameLength); *((uint32_t *)(BufferPtr)) = OffloadEntry.Size; - BufferPtr = advanceVoidPtr(BufferPtr, sizeof(uint32_t)); + BufferPtr = utils::advancePtr(BufferPtr, sizeof(uint32_t)); auto Err = Plugin::success(); { @@ -254,11 +262,12 @@ struct RecordReplayTy { } if (Err) report_fatal_error("Error retrieving data for global"); - BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.Size); + BufferPtr = utils::advancePtr(BufferPtr, OffloadEntry.Size); } assert(BufferPtr == GlobalsMB->get()->getBufferEnd() && "Buffer over/under-filled."); - assert(Size == getPtrDiff(BufferPtr, GlobalsMB->get()->getBufferStart()) && + assert(Size == utils::getPtrDiff(BufferPtr, + GlobalsMB->get()->getBufferStart()) && "Buffer size mismatch"); StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), Size); @@ -549,9 +558,16 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, if (!KernelLaunchEnvOrErr) return KernelLaunchEnvOrErr.takeError(); - KernelLaunchParamsTy LaunchParams = - prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, Args, - Ptrs, 
*KernelLaunchEnvOrErr); + KernelLaunchParamsTy LaunchParams; + + // Kernel languages don't use indirection. + if (KernelArgs.Flags.IsCUDA) { + LaunchParams = *reinterpret_cast(KernelArgs.ArgPtrs); + } else { + LaunchParams = + prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, + Args, Ptrs, *KernelLaunchEnvOrErr); + } uint32_t NumThreads = getNumThreads(GenericDevice, KernelArgs.ThreadLimit); uint64_t NumBlocks = @@ -723,7 +739,7 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1), DeviceId(DeviceId), GridValues(OMPGridValues), PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(), - PinnedAllocs(*this), RPCServer(nullptr) { + PinnedAllocs(*this), RPCServer(nullptr), GPUSan(*this) { #ifdef OMPT_SUPPORT OmptInitialized.store(false); // Bind the callbacks to this device's member functions @@ -903,7 +919,7 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, #ifdef OMPT_SUPPORT if (ompt::Initialized) { size_t Bytes = - getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart); + utils::getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart); performOmptCallback( device_load, Plugin.getUserId(DeviceId), /*FileName=*/nullptr, /*FileOffset=*/0, /*VmaInFile=*/nullptr, @@ -916,6 +932,25 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, if (auto Err = callGlobalConstructors(Plugin, *Image)) return std::move(Err); + auto GetKernel = [&](StringRef Name) -> GenericKernelTy * { + auto KernelOrErr = constructKernel(Name.data()); + if (Error Err = KernelOrErr.takeError()) { + REPORT("Failure to look up kernel: %s\n", + toString(std::move(Err)).data()); + return nullptr; + } + GenericKernelTy &Kernel = *KernelOrErr; + if (auto Err = Kernel.init(*this, *Image)) { + REPORT("Failure to init kernel: %s\n", toString(std::move(Err)).data()); + return nullptr; + } + return &Kernel; + }; + if (GenericKernelTy *Kernel = 
GetKernel("__sanitizer_register")) + addGPUSanNewFn(*Kernel); + if (GenericKernelTy *Kernel = GetKernel("__sanitizer_unregister")) + addGPUSanFreeFn(*Kernel); + // Return the pointer to the table of entries. return Image; } @@ -994,6 +1029,16 @@ Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin, if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal)) return Err; + auto *&SanitizerTrapInfo = SanitizerTrapInfos[&Image]; + SanitizerTrapInfo = reinterpret_cast(allocate( + sizeof(*SanitizerTrapInfo), &SanitizerTrapInfo, TARGET_ALLOC_HOST)); + memset(SanitizerTrapInfo, '\0', sizeof(SanitizerTrapInfoTy)); + + GlobalTy TrapId("__sanitizer_trap_info_ptr", sizeof(SanitizerTrapInfo), + &SanitizerTrapInfo); + if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrapId)) + return Err; + // Create the metainfo of the device environment global. GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool", sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool); @@ -1131,8 +1176,8 @@ Expected PinnedAllocationMapTy::lockHostBuffer(void *HstPtr, return std::move(Err); // Return the device accessible pointer with the correct offset. - return advanceVoidPtr(Entry->DevAccessiblePtr, - getPtrDiff(HstPtr, Entry->HstPtr)); + return utils::advancePtr(Entry->DevAccessiblePtr, + utils::getPtrDiff(HstPtr, Entry->HstPtr)); } // No intersecting registered allocation found in the map. 
First, lock the @@ -1334,6 +1379,22 @@ Expected GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr, if (auto Err = PinnedAllocs.registerHostBuffer(Alloc, Alloc, Size)) return std::move(Err); + std::string StackTrace; + llvm::raw_string_ostream OS(StackTrace); + llvm::sys::PrintStackTrace(OS); + + AllocationInfoTy *AllocationInfo = new AllocationInfoTy(); + AllocationInfo->AllocationTrace = std::move(StackTrace); + AllocationInfo->HostPtr = HostPtr; + AllocationInfo->Size = Size; + AllocationInfo->Kind = Kind; + + auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor(); + auto *&AI = (*AllocationTraceMap)[Alloc]; + if (AI) + delete AI; + AI = AllocationInfo; + return Alloc; } @@ -1342,6 +1403,32 @@ Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) { if (Plugin.getRecordReplay().isRecordingOrReplaying()) return Plugin::success(); + AllocationInfoTy *AllocationInfo = nullptr; + { + auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor(); + AllocationInfo = (*AllocationTraceMap)[TgtPtr]; + } + + std::string StackTrace; + llvm::raw_string_ostream OS(StackTrace); + llvm::sys::PrintStackTrace(OS); + if (!AllocationInfo) { + fprintf(stderr, "%s", StackTrace.c_str()); + report_fatal_error("Free of non-allocated memory"); + } + + if (!AllocationInfo->DeallocationTrace.empty()) { + fprintf(stderr, "%s", StackTrace.c_str()); + report_fatal_error("double-free"); + } + + if (AllocationInfo->Kind != Kind) { + fprintf(stderr, "%s", StackTrace.c_str()); + report_fatal_error("free of wrong kind of memory"); + } + + AllocationInfo->DeallocationTrace = StackTrace; + int Res; switch (Kind) { case TARGET_ALLOC_DEFAULT: @@ -1428,8 +1515,10 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs, Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) { assert(AsyncInfoPtr && "Invalid async info"); + assert(!(*AsyncInfoPtr) && "Already initialized async info"); *AsyncInfoPtr = new __tgt_async_info(); + 
(*AsyncInfoPtr)->PersistentQueue = true; AsyncInfoWrapperTy AsyncInfoWrapper(*this, *AsyncInfoPtr); @@ -1607,7 +1696,7 @@ int32_t GenericPluginTy::is_initialized() const { return Initialized; } int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) { StringRef Buffer(reinterpret_cast(Image->ImageStart), - target::getPtrDiff(Image->ImageEnd, Image->ImageStart)); + utils::getPtrDiff(Image->ImageEnd, Image->ImageStart)); auto HandleError = [&](Error Err) -> bool { [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); @@ -1639,7 +1728,7 @@ int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) { int32_t GenericPluginTy::is_device_compatible(int32_t DeviceId, __tgt_device_image *Image) { StringRef Buffer(reinterpret_cast(Image->ImageStart), - target::getPtrDiff(Image->ImageEnd, Image->ImageStart)); + utils::getPtrDiff(Image->ImageEnd, Image->ImageStart)); auto HandleError = [&](Error Err) -> bool { [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); @@ -1801,8 +1890,10 @@ int32_t GenericPluginTy::data_unlock(int32_t DeviceId, void *Ptr) { } int32_t GenericPluginTy::data_notify_mapped(int32_t DeviceId, void *HstPtr, - int64_t Size) { - auto Err = getDevice(DeviceId).notifyDataMapped(HstPtr, Size); + void *DevicePtr, int64_t Size, + void *&FakeHstPtr) { + auto Err = + getDevice(DeviceId).notifyDataMapped(HstPtr, DevicePtr, Size, FakeHstPtr); if (Err) { REPORT("Failure to notify data mapped %p: %s\n", HstPtr, toString(std::move(Err)).data()); @@ -1812,8 +1903,9 @@ int32_t GenericPluginTy::data_notify_mapped(int32_t DeviceId, void *HstPtr, return OFFLOAD_SUCCESS; } -int32_t GenericPluginTy::data_notify_unmapped(int32_t DeviceId, void *HstPtr) { - auto Err = getDevice(DeviceId).notifyDataUnmapped(HstPtr); +int32_t GenericPluginTy::data_notify_unmapped(int32_t DeviceId, void *HstPtr, + void *FakeHstPtr) { + auto Err = getDevice(DeviceId).notifyDataUnmapped(HstPtr, FakeHstPtr); if (Err) { REPORT("Failure to notify data 
unmapped %p: %s\n", HstPtr, toString(std::move(Err)).data()); @@ -2088,6 +2180,238 @@ int32_t GenericPluginTy::get_function(__tgt_device_binary Binary, return OFFLOAD_SUCCESS; } +Error GPUSanTy::notifyDataMapped(void *DevicePtr, uint64_t Size, + void *&FakeHstPtr) { + FakeHstPtr = nullptr; + if (NewFns.empty()) + return Plugin::success(); + uint64_t Slot = SlotCnt--; + FakeHstPtr = __offload_get_new_sanitizer_ptr(Slot); + KernelArgsTy Args = {}; + Args.NumTeams[0] = 1; + Args.ThreadLimit[0] = 1; + AsyncInfoWrapperTy AsyncInfoWrapper(Device, nullptr); + for (GenericKernelTy *NewFP : NewFns) { + struct { + void *Ptr; + uint64_t Length; + uint64_t Slot; + } KernelArgs{DevicePtr, Size, Slot}; + KernelLaunchParamsTy ArgPtrs{sizeof(KernelArgs), &KernelArgs, nullptr}; + Args.ArgPtrs = reinterpret_cast(&ArgPtrs); + Args.Flags.IsCUDA = true; + if (auto Err = NewFP->launch(Device, Args.ArgPtrs, nullptr, Args, + AsyncInfoWrapper)) { + AsyncInfoWrapper.finalize(Err); + return Err; + } + } + + Error Err = Plugin::success(); + AsyncInfoWrapper.finalize(Err); + return Err; +} + +Error GPUSanTy::notifyDataUnmapped(void *FakeHstPtr) { + if (!FakeHstPtr) + return Plugin::success(); + KernelArgsTy Args = {}; + Args.NumTeams[0] = 1; + Args.ThreadLimit[0] = 1; + AsyncInfoWrapperTy AsyncInfoWrapper(Device, nullptr); + for (GenericKernelTy *FreeFn : FreeFns) { + KernelLaunchParamsTy ArgPtrs{sizeof(void *), &FakeHstPtr, nullptr}; + Args.ArgPtrs = reinterpret_cast(&ArgPtrs); + Args.Flags.IsCUDA = true; + if (auto Err = FreeFn->launch(Device, Args.ArgPtrs, nullptr, Args, + AsyncInfoWrapper)) { + AsyncInfoWrapper.finalize(Err); + return Err; + } + } + Error Err = Plugin::success(); + AsyncInfoWrapper.finalize(Err); + return Err; +} + +void GPUSanTy::checkAndReportError() { + SanitizerTrapInfoTy *STI; + DeviceImageTy *Image = nullptr; + for (auto &It : Device.SanitizerTrapInfos) { + STI = It.second; + if (!STI || STI->ErrorCode == SanitizerTrapInfoTy::None) + continue; + Image = It.first; + 
break; + } + if (!Image) + return; + + auto Green = []() { return "\033[1m\033[32m"; }; + auto Blue = []() { return "\033[1m\033[34m"; }; + auto Red = []() { return "\033[1m\033[31m"; }; + auto Default = []() { return "\033[1m\033[0m"; }; + + GenericGlobalHandlerTy &GHandler = Device.Plugin.getGlobalHandler(); + auto GetImagePtr = [&](GlobalTy &GV, bool Quiet = false) { + if (auto Err = GHandler.getGlobalMetadataFromImage(Device, *Image, GV)) { + if (Quiet) + consumeError(std::move(Err)); + else + REPORT("WARNING: Failed to read backtrace " + "(%s)\n", + toString(std::move(Err)).data()); + return false; + } + return true; + }; + GlobalTy LocationsGV("__san.locations", -1); + GlobalTy LocationNamesGV("__san.location_names", -1); + GlobalTy AmbiguousCallsBitWidthGV("__san.num_ambiguous_calls", -1); + GlobalTy AmbiguousCallsLocationsGV("__san.ambiguous_calls_mapping", -1); + if (GetImagePtr(LocationsGV)) + GetImagePtr(LocationNamesGV); + GetImagePtr(AmbiguousCallsBitWidthGV, /*Quiet=*/true); + GetImagePtr(AmbiguousCallsLocationsGV, /*Quiet=*/true); + + fprintf(stderr, "============================================================" + "====================\n"); + + auto PrintStackTrace = [&](int64_t LocationId) { + if (!LocationsGV.getPtr() || !LocationNamesGV.getPtr()) { + fprintf(stderr, " no backtrace available\n"); + return; + } + char *LocationNames = LocationNamesGV.getPtrAs(); + LocationEncodingTy *Locations = LocationsGV.getPtrAs(); + uint64_t *AmbiguousCallsBitWidth = + AmbiguousCallsBitWidthGV.getPtrAs(); + uint64_t *AmbiguousCallsLocations = + AmbiguousCallsLocationsGV.getPtrAs(); + int32_t FrameIdx = 0; + do { + LocationEncodingTy &LE = Locations[LocationId]; + fprintf(stderr, " #%i %s in %s:%lu:%lu\n", FrameIdx, + &LocationNames[LE.FunctionNameIdx], + &LocationNames[LE.FileNameIdx], LE.LineNo, LE.ColumnNo); + LocationId = LE.ParentIdx; + FrameIdx++; + if (LocationId < 0 && STI->CallId != 0 && AmbiguousCallsBitWidth && + AmbiguousCallsLocations) { + uint64_t 
LastCallId = + STI->CallId & ((1 << *AmbiguousCallsBitWidth) - 1); + LocationId = AmbiguousCallsLocations[LastCallId - 1]; + STI->CallId >>= (*AmbiguousCallsBitWidth); + } + } while (LocationId >= 0); + fputc('\n', stderr); + }; + + auto DiagnoseAccess = [&](StringRef Name) { + void *PC = reinterpret_cast(STI->PC); + void *Addr = utils::advancePtr(STI->AllocationStart, STI->PtrOffset); + fprintf(stderr, + "%sERROR: OffloadSanitizer %s access on address " DPxMOD + " at pc " DPxMOD "\n%s", + Red(), Name.data(), DPxPTR(Addr), DPxPTR(PC), Default()); + fprintf(stderr, + "%s%s of size %u at " DPxMOD + " thread <%u, %u, %u> block <%lu, %lu, %lu> (acc %li, %s)\n%s", + Blue(), STI->AccessId > 0 ? "WRITE" : "READ", STI->AccessSize, + DPxPTR(Addr), STI->ThreadId[0], STI->ThreadId[1], STI->ThreadId[2], + STI->BlockId[0], STI->BlockId[1], STI->BlockId[2], STI->AccessId, + (STI->AllocationKind ? "heap" : "stack"), Default()); + PrintStackTrace(STI->LocationId); + fprintf( + stderr, + "%s" DPxMOD " is located %lu bytes inside of a %lu-byte region [" DPxMOD + "," DPxMOD ")\n%s", + Green(), DPxPTR(Addr), STI->PtrOffset, STI->AllocationLength, + DPxPTR(STI->AllocationStart), + DPxPTR(utils::advancePtr(STI->AllocationStart, STI->AllocationLength)), + Default()); + fprintf(stderr, + "%s Pointer[slot:%lu,tag:%u,kind:%i] " + "Allocation[slot:%d,tag:%u,kind:%i]\n%s", + Green(), STI->PtrSlot, STI->PtrTag, STI->PtrKind, STI->AllocationId, + STI->AllocationTag, STI->AllocationKind, Default()); + + AllocationInfoTy *AllocationInfo = nullptr; + if (STI->AllocationStart) { + auto AllocationTraceMap = Device.AllocationTraces.getExclusiveAccessor(); + AllocationInfo = (*AllocationTraceMap)[STI->AllocationStart]; + } + if (AllocationInfo) { + fprintf(stderr, "\nAllocated at\n"); + fprintf(stderr, "%s", AllocationInfo->AllocationTrace.c_str()); + + if (!AllocationInfo->DeallocationTrace.empty()) { + fprintf(stderr, "\nDeallocated at\n"); + fprintf(stderr, "%s", 
AllocationInfo->DeallocationTrace.c_str()); + } + } + + }; + + switch (STI->ErrorCode) { + case SanitizerTrapInfoTy::None: + llvm_unreachable("Unexpected exception"); + case SanitizerTrapInfoTy::ExceedsLength: + fprintf(stderr, "%sERROR: OffloadSanitizer %s %d\n%s", Red(), + "exceeds length", STI->PtrSlot, Default()); + break; + case SanitizerTrapInfoTy::ExceedsSlots: + fprintf(stderr, "%sERROR: OffloadSanitizer %s %d\n%s", Red(), + "exceeds slots", STI->PtrSlot, Default()); + break; + case SanitizerTrapInfoTy::PointerOutsideAllocation: + fprintf(stderr, "%sERROR: OffloadSanitizer %s : %p : %i %lu (%s)\n%s", + Red(), "outside allocation", STI->AllocationStart, + STI->AllocationId, STI->PtrSlot, + (STI->AllocationKind ? "heap" : "stack"), Default()); + break; + case SanitizerTrapInfoTy::OutOfBounds: { + DiagnoseAccess("out-of-bounds"); + break; + } + case SanitizerTrapInfoTy::UseAfterScope: + DiagnoseAccess("use-after-scope"); + break; + case SanitizerTrapInfoTy::UseAfterFree: + DiagnoseAccess("use-after-free"); + break; + case SanitizerTrapInfoTy::MemoryLeak: + fprintf(stderr, "%sERROR: OffloadSanitizer memory leak at slot %d\n%s", + Red(), STI->PtrSlot, Default()); + break; + case SanitizerTrapInfoTy::GarbagePointer: + DiagnoseAccess("garbage-pointer"); + break; + } + fflush(stderr); +} + +Error GPUSanTy::transferFakePtrToDevice(const char *GlobalName, + void *FakeHstPtr, + SmallVector &Images) { + if (!FakeHstPtr) + return Plugin::success(); + + std::string ShadowName("__san.global."); + ShadowName.append(GlobalName); + + GenericGlobalHandlerTy &GHandler = Device.Plugin.getGlobalHandler(); + GlobalTy ShadowGlobal(ShadowName, sizeof(void *), &FakeHstPtr); + + int imgCount = 0; + for (auto Img : Images) { + if (GHandler.isSymbolInImage(Device, *Img, ShadowName)) + return GHandler.writeGlobalToDevice(Device, *Img, ShadowGlobal); + } + + return Plugin::error("Shadow global for '%s' not found", GlobalName); +} + bool 
llvm::omp::target::plugin::libomptargetSupportsRPC() { #ifdef LIBOMPTARGET_RPC_SUPPORT return true; diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index b6465d61bd033..6ba51ae0db565 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -643,8 +643,11 @@ struct CUDADeviceTy : public GenericDeviceTy { } // Once the stream is synchronized, return it to stream pool and reset - // AsyncInfo. This is to make sure the synchronization only works for its - // own tasks. + // AsyncInfo if the queue is not persistent. This is to make sure the + // synchronization only works for its own tasks. + if (AsyncInfo.PersistentQueue) + return Plugin::success(); + AsyncInfo.Queue = nullptr; if (auto Err = CUDAStreamManager.returnResource(Stream)) return Err; @@ -705,7 +708,7 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::error("Wrong device Page size"); // Ceil to page size. - Size = roundUp(Size, Granularity); + Size = utils::roundUp(Size, Granularity); // Create a handler of our allocation CUmemGenericAllocationHandle AHandle; @@ -777,9 +780,12 @@ struct CUDADeviceTy : public GenericDeviceTy { if (Res == CUDA_ERROR_NOT_READY) return Plugin::success(); - // Once the stream is synchronized and the operations completed (or an error - // occurs), return it to stream pool and reset AsyncInfo. This is to make - // sure the synchronization only works for its own tasks. + // Once the stream is synchronized, return it to stream pool and reset + // AsyncInfo if the queue is not persistent. This is to make sure the + // synchronization only works for its own tasks. 
+ if (AsyncInfo.PersistentQueue) + return Plugin::success(); + AsyncInfo.Queue = nullptr; if (auto Err = CUDAStreamManager.returnResource(Stream)) return Err; diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt index efa5cdab33ec9..0f30f6028f103 100644 --- a/offload/src/CMakeLists.txt +++ b/offload/src/CMakeLists.txt @@ -22,6 +22,7 @@ add_llvm_library(omptarget OpenMP/InteropAPI.cpp OpenMP/OMPT/Callback.cpp + KernelLanguage/API.cpp ADDITIONAL_HEADER_DIRS ${LIBOMPTARGET_INCLUDE_DIR} @@ -60,6 +61,23 @@ endforeach() target_compile_options(omptarget PRIVATE ${offload_compile_flags}) target_link_options(omptarget PRIVATE ${offload_link_flags}) +add_llvm_library(offload.kernels + STATIC + + Kernels/Sanitizer.cpp + + LINK_LIBS + PUBLIC + omptarget.devicertl + + NO_INSTALL_RPATH + BUILDTREE_ONLY +) + +list(JOIN LIBOMPTARGET_DEVICE_ARCHITECTURES "," KERNEL_OFFLOAD_ARCHS) +target_compile_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -fopenmp-target-jit -foffload-via-llvm ) +target_link_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -fopenmp-target-jit -foffload-via-llvm ) + # libomptarget.so needs to be aware of where the plugins live as they # are now separated in the build directory. 
set_target_properties(omptarget PROPERTIES @@ -67,3 +85,4 @@ set_target_properties(omptarget PROPERTIES INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..") install(TARGETS omptarget LIBRARY COMPONENT omptarget DESTINATION "${OFFLOAD_INSTALL_LIBDIR}") +install(TARGETS offload.kernels LIBRARY COMPONENT offload.kernels DESTINATION "${OFFLOAD_INSTALL_LIBDIR}") diff --git a/offload/src/DeviceImage.cpp b/offload/src/DeviceImage.cpp index e42460b5cca4f..e5b4bf5526437 100644 --- a/offload/src/DeviceImage.cpp +++ b/offload/src/DeviceImage.cpp @@ -27,9 +27,8 @@ DeviceImageTy::DeviceImageTy(__tgt_bin_desc &BinaryDesc, __tgt_device_image &TgtDeviceImage) : BinaryDesc(&BinaryDesc), Image(TgtDeviceImage) { - llvm::StringRef ImageStr( - static_cast<char *>(Image.ImageStart), - llvm::omp::target::getPtrDiff(Image.ImageEnd, Image.ImageStart)); + llvm::StringRef ImageStr(static_cast<char *>(Image.ImageStart), + utils::getPtrDiff(Image.ImageEnd, Image.ImageStart)); auto BinaryOrErr = llvm::object::OffloadBinary::create(llvm::MemoryBufferRef(ImageStr, "")); diff --git a/offload/src/KernelLanguage/API.cpp b/offload/src/KernelLanguage/API.cpp new file mode 100644 index 0000000000000..779751deed661 --- /dev/null +++ b/offload/src/KernelLanguage/API.cpp @@ -0,0 +1,86 @@ +//===------ API.cpp - Kernel Language (CUDA/HIP) entry points ----- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "llvm/Frontend/OpenMP/OMPConstants.h" + +#include "Shared/APITypes.h" + +#include +#include + +struct dim3 { + unsigned x = 0, y = 0, z = 0; +}; + +struct __omp_kernel_t { + dim3 __grid_size; + dim3 __block_size; + size_t __shared_memory; + + void *__stream; +}; + +static __omp_kernel_t __current_kernel = {}; +#pragma omp threadprivate(__current_kernel); + +extern "C" { + +// TODO: There is little reason we need to keep these names or the way calls are +// issued. For now we do to avoid modifying Clang's CUDA codegen. Unclear when +// we actually need to push/pop configurations. +unsigned __llvmPushCallConfiguration(dim3 __grid_size, dim3 __block_size, + size_t __shared_memory, void *__stream) { + __omp_kernel_t &__kernel = __current_kernel; + __kernel.__grid_size = __grid_size; + __kernel.__block_size = __block_size; + __kernel.__shared_memory = __shared_memory; + __kernel.__stream = __stream; + return 0; +} + +unsigned __llvmPopCallConfiguration(dim3 *__grid_size, dim3 *__block_size, + size_t *__shared_memory, void *__stream) { + __omp_kernel_t &__kernel = __current_kernel; + *__grid_size = __kernel.__grid_size; + *__block_size = __kernel.__block_size; + *__shared_memory = __kernel.__shared_memory; + *((void **)__stream) = __kernel.__stream; + return 0; +} + +int __tgt_target_kernel(void *Loc, int64_t DeviceId, int32_t NumTeams, + int32_t ThreadLimit, const void *HostPtr, + KernelArgsTy *Args); +void *__tgt_target_get_default_async_info_queue(void *Loc, int64_t DeviceId); + +unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, + void *args, size_t sharedMem, void *stream) { + int64_t DeviceNo = 0; + KernelArgsTy Args = {}; + Args.Version = OMP_KERNEL_ARG_VERSION; + Args.DynCGroupMem = sharedMem; + 
 Args.NumTeams[0] = gridDim.x; + Args.NumTeams[1] = gridDim.y; + Args.NumTeams[2] = gridDim.z; + Args.ThreadLimit[0] = blockDim.x; + Args.ThreadLimit[1] = blockDim.y; + Args.ThreadLimit[2] = blockDim.z; + Args.ArgPtrs = reinterpret_cast<void **>(args); + Args.Flags.IsCUDA = true; + if (stream) + Args.AsyncInfoQueue = stream; + else + Args.AsyncInfoQueue = + __tgt_target_get_default_async_info_queue(nullptr, DeviceNo); + int rv = __tgt_target_kernel(nullptr, DeviceNo, gridDim.x, blockDim.x, func, + &Args); + return rv; +} +} diff --git a/offload/src/Kernels/Sanitizer.cpp b/offload/src/Kernels/Sanitizer.cpp new file mode 100644 index 0000000000000..7b1d73a81aeba --- /dev/null +++ b/offload/src/Kernels/Sanitizer.cpp @@ -0,0 +1,27 @@ +//===-- Kernels/Sanitizer.cpp - Sanitizer Kernel Definitions --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include <cstdint> + +extern "C" { +__device__ void __sanitizer_register_host(void *P, uint64_t Bytes, + uint64_t Slot); +__device__ void __sanitizer_unregister_host(void *P); + +[[clang::disable_sanitizer_instrumentation]] __global__ void +__sanitizer_register(void *P, uint64_t Bytes, uint64_t Slot) { + __sanitizer_register_host(P, Bytes, Slot); +} + +[[clang::disable_sanitizer_instrumentation]] __global__ void +__sanitizer_unregister(void *P) { + __sanitizer_unregister_host(P); +} +} diff --git a/offload/src/OpenMP/API.cpp b/offload/src/OpenMP/API.cpp index 374c54163d6a4..c07cea550c39c 100644 --- a/offload/src/OpenMP/API.cpp +++ b/offload/src/OpenMP/API.cpp @@ -39,7 +39,7 @@ EXTERN void ompx_dump_mapping_tables() { using namespace llvm::omp::target::ompt; #endif -void *targetAllocExplicit(size_t Size,
int DeviceNum, int Kind, +void *targetAllocExplicit(size_t Size, int64_t DeviceNum, int Kind, const char *Name); void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, const char *Name); diff --git a/offload/src/OpenMP/Mapping.cpp b/offload/src/OpenMP/Mapping.cpp index 595e3456ab54c..407c6cd422ba7 100644 --- a/offload/src/OpenMP/Mapping.cpp +++ b/offload/src/OpenMP/Mapping.cpp @@ -69,7 +69,7 @@ int MappingInfoTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, } // Mapping does not exist, allocate it with refCount=INF - const HostDataToTargetTy &NewEntry = + HostDataToTargetTy &NewEntry = *HDTTMap ->emplace(new HostDataToTargetTy( /*HstPtrBase=*/(uintptr_t)HstPtrBegin, @@ -89,7 +89,8 @@ int MappingInfoTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, (void)NewEntry; // Notify the plugin about the new mapping. - return Device.notifyDataMapped(HstPtrBegin, Size); + return Device.notifyDataMapped(HstPtrBegin, TgtPtrBegin, Size, + NewEntry.FakeTgtPtrBegin); } int MappingInfoTy::disassociatePtr(void *HstPtrBegin) { @@ -120,7 +121,7 @@ int MappingInfoTy::disassociatePtr(void *HstPtrBegin) { if (Event) Device.destroyEvent(Event); HDTTMap->erase(It); - return Device.notifyDataUnmapped(HstPtrBegin); + return Device.notifyDataUnmapped(HstPtrBegin, HDTT.FakeTgtPtrBegin); } REPORT("Trying to disassociate a pointer which was not mapped via " @@ -294,12 +295,13 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer( uintptr_t TgtPtrBegin = TgtAllocBegin + TgtPadding; // Release the mapping table lock only after the entry is locked by // attaching it to TPR. 
- LR.TPR.setEntry(HDTTMap - ->emplace(new HostDataToTargetTy( - (uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin, - (uintptr_t)HstPtrBegin + Size, TgtAllocBegin, - TgtPtrBegin, HasHoldModifier, HstPtrName)) - .first->HDTT); + LR.TPR.setEntry( + HDTTMap + ->emplace(new HostDataToTargetTy( + (uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin, + (uintptr_t)HstPtrBegin + Size, TgtAllocBegin, TgtPtrBegin, + HasHoldModifier, HstPtrName, /*IsINF=*/false)) + .first->HDTT); INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID, "Creating new map entry with HstPtrBase=" DPxMOD ", HstPtrBegin=" DPxMOD ", TgtAllocBegin=" DPxMOD @@ -313,7 +315,8 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer( LR.TPR.TargetPointer = (void *)TgtPtrBegin; // Notify the plugin about the new mapping. - if (Device.notifyDataMapped(HstPtrBegin, Size)) + if (Device.notifyDataMapped(HstPtrBegin, LR.TPR.TargetPointer, Size, + LR.TPR.getEntry()->FakeTgtPtrBegin)) return TargetPointerResultTy{}; } else { // This entry is not present and we did not create a new entry for it. @@ -495,7 +498,8 @@ int MappingInfoTy::deallocTgtPtrAndEntry(HostDataToTargetTy *Entry, int Ret = Device.deleteData((void *)Entry->TgtAllocBegin); // Notify the plugin about the unmapped memory. 
- Ret |= Device.notifyDataUnmapped((void *)Entry->HstPtrBegin); + Ret |= Device.notifyDataUnmapped((void *)Entry->HstPtrBegin, + Entry->FakeTgtPtrBegin); delete Entry; diff --git a/offload/src/device.cpp b/offload/src/device.cpp index 943c778278730..fc41721e6c0a6 100644 --- a/offload/src/device.cpp +++ b/offload/src/device.cpp @@ -191,21 +191,24 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, DstPtr, Size, AsyncInfo); } -int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) { +int32_t DeviceTy::notifyDataMapped(void *HstPtr, void *DevicePtr, int64_t Size, + void *&FakeHstPtr) { DP("Notifying about new mapping: HstPtr=" DPxMOD ", Size=%" PRId64 "\n", DPxPTR(HstPtr), Size); - if (RTL->data_notify_mapped(RTLDeviceID, HstPtr, Size)) { + if (RTL->data_notify_mapped(RTLDeviceID, HstPtr, DevicePtr, Size, + FakeHstPtr)) { REPORT("Notifiying about data mapping failed.\n"); return OFFLOAD_FAIL; } return OFFLOAD_SUCCESS; } -int32_t DeviceTy::notifyDataUnmapped(void *HstPtr) { - DP("Notifying about an unmapping: HstPtr=" DPxMOD "\n", DPxPTR(HstPtr)); +int32_t DeviceTy::notifyDataUnmapped(void *HstPtr, void *FakeHstPtr) { + DP("Notifying about an unmapping: HstPtr=" DPxMOD " FakeHstPtr=" DPxMOD "\n", + DPxPTR(HstPtr), DPxPTR(FakeHstPtr)); - if (RTL->data_notify_unmapped(RTLDeviceID, HstPtr)) { + if (RTL->data_notify_unmapped(RTLDeviceID, HstPtr, FakeHstPtr)) { REPORT("Notifiying about data unmapping failed.\n"); return OFFLOAD_FAIL; } diff --git a/offload/src/exports b/offload/src/exports index f95544ec8329c..829a55fc6398e 100644 --- a/offload/src/exports +++ b/offload/src/exports @@ -29,6 +29,8 @@ VERS1.0 { __tgt_target_kernel; __tgt_target_kernel_nowait; __tgt_target_nowait_query; + __tgt_target_get_default_async_info_queue; + __tgt_target_synchronize_async_info_queue; __tgt_target_kernel_replay; __tgt_activate_record_replay; __tgt_mapper_num_components; @@ -71,6 +73,12 @@ VERS1.0 { __tgt_interop_use; __tgt_interop_destroy; 
 ompt_libomptarget_connect; + __llvmPushCallConfiguration; + __llvmPopCallConfiguration; + llvmLaunchKernel; + ompx_new_allocation_host; + ompx_free_allocation_host; + ompx_register_image_functions; local: *; }; diff --git a/offload/src/interface.cpp b/offload/src/interface.cpp index 763b051cc6d77..759fb54a11262 100644 --- a/offload/src/interface.cpp +++ b/offload/src/interface.cpp @@ -14,6 +14,8 @@ #include "OpenMP/OMPT/Interface.h" #include "OpenMP/OMPT/Callback.h" #include "PluginManager.h" +#include "Shared/APITypes.h" +#include "omptarget.h" #include "private.h" #include "Shared/EnvironmentVar.h" @@ -312,7 +314,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, if (!DeviceOrErr) FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); - TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr); + TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr, KernelArgs->AsyncInfoQueue); AsyncInfoTy &AsyncInfo = TargetAsyncInfo; /// RAII to establish tool anchors before and after target region OMPT_IF_BUILT(InterfaceRAII TargetRAII( @@ -510,3 +512,48 @@ EXTERN void __tgt_target_nowait_query(void **AsyncHandle) { delete AsyncInfo; *AsyncHandle = nullptr; } + +EXTERN void *__tgt_target_get_default_async_info_queue(void *Loc, + int64_t DeviceId) { + assert(PM && "Runtime not initialized"); + + static thread_local void **AsyncInfoQueue = nullptr; + + if (!AsyncInfoQueue) + AsyncInfoQueue = reinterpret_cast<void **>( + calloc(PM->getNumDevices(), sizeof(AsyncInfoQueue[0]))); + + if (!AsyncInfoQueue[DeviceId]) { + auto DeviceOrErr = PM->getDevice(DeviceId); + if (!DeviceOrErr) + FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); + + __tgt_async_info *AsyncInfo = nullptr; + DeviceOrErr->RTL->init_async_info(DeviceId, &AsyncInfo); + AsyncInfoQueue[DeviceId] = AsyncInfo->Queue; + } + + return AsyncInfoQueue[DeviceId]; +} + +EXTERN int __tgt_target_synchronize_async_info_queue(void *Loc, + int64_t DeviceId, + void *AsyncInfoQueue)
{ + assert(PM && "Runtime not initialized"); + + auto DeviceOrErr = PM->getDevice(DeviceId); + if (!DeviceOrErr) + FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); + if (!AsyncInfoQueue) + AsyncInfoQueue = __tgt_target_get_default_async_info_queue(Loc, DeviceId); + AsyncInfoTy AsyncInfo(*DeviceOrErr, AsyncInfoQueue, + AsyncInfoTy::SyncTy::BLOCKING); + + if (AsyncInfo.synchronize()) + FATAL_MESSAGE0(1, "Error while querying the async queue for completion.\n"); + [[maybe_unused]] __tgt_async_info *ASI = AsyncInfo; + assert(ASI->Queue); + assert(ASI->Queue && ASI->PersistentQueue); + + return 0; +} diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp index 9bca8529c5ee3..ef471d086b8c3 100644 --- a/offload/src/omptarget.cpp +++ b/offload/src/omptarget.cpp @@ -49,7 +49,7 @@ int AsyncInfoTy::synchronize() { case SyncTy::BLOCKING: // If we have a queue we need to synchronize it now. Result = Device.synchronize(*this); - assert(AsyncInfo.Queue == nullptr && + assert((AsyncInfo.PersistentQueue || !AsyncInfo.Queue) && "The device plugin should have nulled the queue to indicate there " "are no outstanding actions!"); break; @@ -271,17 +271,27 @@ static int initLibrary(DeviceTy &Device) { ", name \"%s\"\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), CurrDeviceEntry->size, CurrDeviceEntry->name); - HDTTMap->emplace(new HostDataToTargetTy( - (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, - (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, - (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/, - (uintptr_t)CurrDeviceEntryAddr /*TgtAllocBegin*/, - (uintptr_t)CurrDeviceEntryAddr /*TgtPtrBegin*/, - false /*UseHoldRefCount*/, CurrHostEntry->name, - true /*IsRefCountINF*/)); + auto *Entry = HDTTMap + ->emplace(new HostDataToTargetTy( + (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, + (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, + (uintptr_t)CurrHostEntry->addr + + CurrHostEntry->size /*HstPtrEnd*/, + 
(uintptr_t)CurrDeviceEntryAddr /*TgtAllocBegin*/, + (uintptr_t)CurrDeviceEntryAddr /*TgtPtrBegin*/, + false /*UseHoldRefCount*/, CurrHostEntry->name, + true /*IsRefCountINF*/)) + .first->HDTT; // Notify about the new mapping. - if (Device.notifyDataMapped(CurrHostEntry->addr, CurrHostEntry->size)) + if (Device.notifyDataMapped(CurrHostEntry->addr, CurrDeviceEntryAddr, + CurrHostEntry->size, + Entry->FakeTgtPtrBegin)) + return OFFLOAD_FAIL; + + auto &DeviceInterface = Device.RTL->getDevice(DeviceId); + if (DeviceInterface.transferFakePtrToDevice(CurrHostEntry->name, + Entry->FakeTgtPtrBegin)) return OFFLOAD_FAIL; } } @@ -323,8 +333,8 @@ void handleTargetOutcome(bool Success, ident_t *Loc) { for (auto &Image : PM->deviceImages()) { const char *Start = reinterpret_cast( Image.getExecutableImage().ImageStart); - uint64_t Length = llvm::omp::target::getPtrDiff( - Start, Image.getExecutableImage().ImageEnd); + uint64_t Length = + utils::getPtrDiff(Start, Image.getExecutableImage().ImageEnd); llvm::MemoryBufferRef Buffer(llvm::StringRef(Start, Length), /*Identifier=*/""); @@ -415,9 +425,9 @@ static int32_t getParentIndex(int64_t Type) { return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; } -void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind, +void *targetAllocExplicit(size_t Size, int64_t DeviceNum, int Kind, const char *Name) { - DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size); + DP("Call to %s for device %ld requesting %zu bytes\n", Name, DeviceNum, Size); if (Size <= 0) { DP("Call to %s with non-positive length\n", Name); @@ -432,13 +442,21 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind, return Rc; } + if (checkDeviceAndCtors(DeviceNum, nullptr)) { + DP("Not offloading to device %" PRId64 "\n", DeviceNum); + return Rc; + } + auto DeviceOrErr = PM->getDevice(DeviceNum); if (!DeviceOrErr) FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str()); Rc = DeviceOrErr->allocData(Size, nullptr, 
Kind); DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc)); - return Rc; + void *FakeHstPtr = nullptr; + if (DeviceOrErr->notifyDataMapped(nullptr, Rc, Size, FakeHstPtr)) + return nullptr; + return FakeHstPtr ? FakeHstPtr : Rc; } void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, @@ -464,6 +482,8 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, if (DeviceOrErr->deleteData(DevicePtr, Kind) == OFFLOAD_FAIL) FATAL_MESSAGE(DeviceNum, "%s", "Failed to deallocate device ptr"); + DeviceOrErr->notifyDataUnmapped(nullptr, DevicePtr); + DP("omp_target_free deallocated device ptr\n"); } @@ -670,6 +690,9 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, HasFlagTo, HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo, PointerTpr.getEntry()); void *TgtPtrBegin = TPR.TargetPointer; + if (auto *Entry = TPR.getEntry()) + if (auto *FakeTgtPtrBegin = Entry->FakeTgtPtrBegin) + TgtPtrBegin = FakeTgtPtrBegin; IsHostPtr = TPR.Flags.IsHostPointer; // If data_size==0, then the argument could be a zero-length pointer to // NULL, so getOrAlloc() returning NULL is not an error. @@ -1523,11 +1546,16 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, /*UpdateRefCount=*/false, /*UseHoldRefCount=*/false); TgtPtrBegin = TPR.TargetPointer; + if (auto *Entry = TPR.getEntry()) + if (auto *FakeTgtPtrBegin = Entry->FakeTgtPtrBegin) + TgtPtrBegin = FakeTgtPtrBegin; TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; #ifdef OMPTARGET_DEBUG void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); - DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n", - DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); + DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD + " %s\n", + DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin), + TgtPtrBegin != TPR.TargetPointer ? 
"fake" : ""); #endif } TgtArgsPositions[I] = TgtArgs.size(); diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index 6c590603079c4..9053151e44a78 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -66,7 +66,7 @@ def evaluate_bool_env(env): config.name = 'libomptarget :: ' + config.libomptarget_current_target # suffixes: A list of file extensions to treat as test files. -config.suffixes = ['.c', '.cpp', '.cc', '.f90'] +config.suffixes = ['.c', '.cpp', '.cc', '.f90', '.cu'] # excludes: A list of directories to exclude from the testuites. config.excludes = ['Inputs'] diff --git a/offload/test/offloading/CUDA/basic_api_malloc_free.cu b/offload/test/offloading/CUDA/basic_api_malloc_free.cu new file mode 100644 index 0000000000000..60a51e33a5af9 --- /dev/null +++ b/offload/test/offloading/CUDA/basic_api_malloc_free.cu @@ -0,0 +1,42 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +__global__ void kernel(int *A, int *DevPtr, int N) { + for (int i = 0; i < N; ++i) + DevPtr[i] = 1; + for (int i = 0; i < N; ++i) + *A += DevPtr[i]; +} + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast(llvm_omp_target_alloc_shared(4, DevNo)); + int *DevPtr; + auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int)); + if (Err != cudaSuccess) + return -1; + *Ptr = 0; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0 + kernel<<<1, 1>>>(Ptr, DevPtr, 42); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + Err = cudaFree(DevPtr); + if (Err != cudaSuccess) + return -1; + 
llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/offloading/CUDA/basic_api_memcpy.cu b/offload/test/offloading/CUDA/basic_api_memcpy.cu new file mode 100644 index 0000000000000..088e20ffa9e2b --- /dev/null +++ b/offload/test/offloading/CUDA/basic_api_memcpy.cu @@ -0,0 +1,47 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +__global__ void kernel(int *DevPtr, int N) { + for (int i = 0; i < N; ++i) + DevPtr[i]--; +} + +int main(int argc, char **argv) { + int DevNo = 0; + int Res = 0; + int *DevPtr; + auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int)); + if (Err != cudaSuccess) + return -1; + int HstPtr[42]; + for (int i = 0; i < 42; ++i) { + HstPtr[i] = 2; + } + Err = cudaMemcpy(DevPtr, HstPtr, 42 * sizeof(int), cudaMemcpyHostToDevice); + if (Err != cudaSuccess) + return -1; + printf("Res: %i\n", Res); + // CHECK: Res: 0 + kernel<<<1, 1>>>(DevPtr, 42); + cudaDeviceSynchronize(); + Err = cudaMemcpy(HstPtr, DevPtr, 42 * sizeof(int), cudaMemcpyDeviceToHost); + if (Err != cudaSuccess) + return -1; + for (int i = 0; i < 42; ++i) { + printf("%i : %i\n", i, HstPtr[i]); + Res += HstPtr[i]; + } + printf("Res: %i\n", Res); + // CHECK: Res: 42 + Err = cudaFree(DevPtr); + if (Err != cudaSuccess) + return -1; +} diff --git a/offload/test/offloading/CUDA/basic_api_memset.cu b/offload/test/offloading/CUDA/basic_api_memset.cu new file mode 100644 index 0000000000000..474eb2a46f0a2 --- /dev/null +++ b/offload/test/offloading/CUDA/basic_api_memset.cu @@ -0,0 +1,44 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: 
x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +__global__ void kernel(int *A, int *DevPtr, int N) { + for (int i = 0; i < N; ++i) + *A += DevPtr[i]; + *A *= -1; +} + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast(llvm_omp_target_alloc_shared(4, DevNo)); + int *DevPtr; + auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int)); + if (Err != cudaSuccess) + return -1; + Err = cudaMemset(DevPtr, -1, 42 * sizeof(int)); + if (Err != cudaSuccess) + return -1; + *Ptr = 0; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0 + kernel<<<1, 1>>>(Ptr, DevPtr, 42); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + Err = cudaFree(DevPtr); + if (Err != cudaSuccess) + return -1; + llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/offloading/CUDA/basic_launch.cu b/offload/test/offloading/CUDA/basic_launch.cu new file mode 100644 index 0000000000000..298aa7db83bad --- /dev/null +++ b/offload/test/offloading/CUDA/basic_launch.cu @@ -0,0 +1,32 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t -fopenmp +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +__global__ void square(int *A) { *A = 42; } + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast(llvm_omp_target_alloc_shared(4, DevNo)); + *Ptr = 7; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); 
+ // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7 + square<<<1, 1>>>(Ptr); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu new file mode 100644 index 0000000000000..c47b1a1b83bde --- /dev/null +++ b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu @@ -0,0 +1,34 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t -fopenmp +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +__global__ void square(int *A) { + __scoped_atomic_fetch_add(A, 1, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); +} + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast(llvm_omp_target_alloc_shared(4, DevNo)); + *Ptr = 0; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0 + square<<<7, 6>>>(Ptr); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu new file mode 100644 index 0000000000000..58ff89dcd4aac --- /dev/null +++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu @@ -0,0 +1,43 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic +// RUN: %clang++ -foffload-via-llvm --offload-arch=native 
%s -o %t -fopenmp +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +__global__ void square(int *Dst, short Q, int *Src, short P) { + *Dst = (Src[0] + Src[1]) * (Q + P); + Src[0] = Q; + Src[1] = P; +} + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast(llvm_omp_target_alloc_shared(4, DevNo)); + int *Src = reinterpret_cast(llvm_omp_target_alloc_shared(8, DevNo)); + *Ptr = 7; + Src[0] = -2; + Src[1] = 8; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7 + printf("Src: %i : %i\n", Src[0], Src[1]); + // CHECK: Src: -2 : 8 + square<<<1, 1>>>(Ptr, 3, Src, 4); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + printf("Src: %i : %i\n", Src[0], Src[1]); + // CHECK: Src: 3 : 4 + llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/offloading/CUDA/kernel_tu.cu.inc b/offload/test/offloading/CUDA/kernel_tu.cu.inc new file mode 100644 index 0000000000000..d7d28a109dfc5 --- /dev/null +++ b/offload/test/offloading/CUDA/kernel_tu.cu.inc @@ -0,0 +1 @@ +__global__ void square(int *A) { *A = 42; } diff --git a/offload/test/offloading/CUDA/launch_tu.cu b/offload/test/offloading/CUDA/launch_tu.cu new file mode 100644 index 0000000000000..fa6b1d7692de5 --- /dev/null +++ b/offload/test/offloading/CUDA/launch_tu.cu @@ -0,0 +1,34 @@ +// clang-format off +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t.launch_tu.o -c +// RUN: %clang++ -foffload-via-llvm --offload-arch=native -x cuda %S/kernel_tu.cu.inc -o %t.kernel_tu.o -c +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %t.launch_tu.o %t.kernel_tu.o -o %t +// 
RUN: %t | %fcheck-generic +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +extern __global__ void square(int *A); + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast(llvm_omp_target_alloc_shared(4, DevNo)); + *Ptr = 7; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7 + square<<<1, 1>>>(Ptr); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/sanitizer/double_free.c b/offload/test/sanitizer/double_free.c new file mode 100644 index 0000000000000..c3c521b5e9a59 --- /dev/null +++ b/offload/test/sanitizer/double_free.c @@ -0,0 +1,21 @@ + +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include + +int main(void) { + void *Ptr = omp_target_alloc(100, 0); + omp_target_free(Ptr, 0); + omp_target_free(Ptr, 0); +} diff --git a/offload/test/sanitizer/free_wrong_ptr.c b/offload/test/sanitizer/free_wrong_ptr.c new file mode 100644 index 0000000000000..3816214849811 --- /dev/null +++ b/offload/test/sanitizer/free_wrong_ptr.c @@ -0,0 +1,20 @@ + +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out 
+// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include <omp.h> + +int main(void) { + int X; + omp_target_free(&X, 0); +} diff --git a/offload/test/sanitizer/global_null.c b/offload/test/sanitizer/global_null.c new file mode 100644 index 0000000000000..83f0bc1ec408d --- /dev/null +++ b/offload/test/sanitizer/global_null.c @@ -0,0 +1,29 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +int *Null = 0; +#pragma omp declare target(Null) + +int main(void) { + +#pragma omp target + { + // clang-format off + // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:0x.*]] + // CHECK-NEXT: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> + // CHECK-NEXT: #0 [[PC]] main null.c:[[@LINE+3]] + // CHECK-NEXT: 0x0000000000000000 is located 0 bytes inside of 0-byte region [0x0000000000000000,0x0000000000000000) + // clang-format on + *Null = 42; + } +} diff --git a/offload/test/sanitizer/global_variable/global_variable_array_iter_fail.c b/offload/test/sanitizer/global_variable/global_variable_array_iter_fail.c new file mode 100644 index 0000000000000..e029a5fb982a6 --- /dev/null +++ b/offload/test/sanitizer/global_variable/global_variable_array_iter_fail.c @@ -0,0 +1,27 @@ +// RUN: %libomptarget-compileopt-generic -g -fsanitize=offload +// RUN: not
%libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include +#include + +int global_arr[3] = {1, 2, 3}; +#pragma omp declare target(global_arr) + +int main() { +#pragma omp target + { + // CHECK: is located 12 bytes inside of a 12-byte region + for (int i = 0; i < 4; i++) { + global_arr[i] *= 4; + } + } + return 0; +} diff --git a/offload/test/sanitizer/global_variable/global_variable_array_pass.c b/offload/test/sanitizer/global_variable/global_variable_array_pass.c new file mode 100644 index 0000000000000..89695a8dd012a --- /dev/null +++ b/offload/test/sanitizer/global_variable/global_variable_array_pass.c @@ -0,0 +1,26 @@ +// RUN: %libomptarget-compileopt-generic -g -fsanitize=offload +// RUN: %libomptarget-run-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include +#include + +int global_arr[3] = {1, 2, 3}; +#pragma omp declare target(global_arr) + +int main() { +#pragma omp target + { + for (int i = 0; i < 3; i++) { + global_arr[i] *= 4; + } + global_arr[1] = 22; + } + return 0; +} diff --git a/offload/test/sanitizer/global_variable/global_variable_array_static_fail.c b/offload/test/sanitizer/global_variable/global_variable_array_static_fail.c new file mode 100644 index 0000000000000..cca77ef444735 --- /dev/null +++ b/offload/test/sanitizer/global_variable/global_variable_array_static_fail.c @@ -0,0 +1,25 @@ +// RUN: %libomptarget-compileopt-generic -g -fsanitize=offload +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic 
--check-prefixes=CHECK < %t.out + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include +#include + +int global_arr[3] = {1, 2, 3}; +#pragma omp declare target(global_arr) + +int main() { +#pragma omp target + { + // CHECK: is located 20 bytes inside of a 12-byte region + global_arr[5] = 27; + } + return 0; +} diff --git a/offload/test/sanitizer/global_variable/global_variable_pass.c b/offload/test/sanitizer/global_variable/global_variable_pass.c new file mode 100644 index 0000000000000..344a79b97d9c4 --- /dev/null +++ b/offload/test/sanitizer/global_variable/global_variable_pass.c @@ -0,0 +1,25 @@ +// RUN: %libomptarget-compileopt-generic -g -fsanitize=offload +// RUN: %libomptarget-run-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include +#include + +int global1 = 0; +int global2 = -1; +#pragma omp declare target(global1, global2) + +int main() { +#pragma omp target + { + global1 = 72; + global2 = global1 + 75; + } + return 0; +} diff --git a/offload/test/sanitizer/heap_null.c b/offload/test/sanitizer/heap_null.c new file mode 100644 index 0000000000000..f57b2dc56a0ec --- /dev/null +++ b/offload/test/sanitizer/heap_null.c @@ -0,0 +1,32 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// 
UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +// Align lines. + +#include +#include + +int main(void) { + + int *Null = 0; +#pragma omp target + { + // clang-format off + // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:0x.*]] + // CHECK-NEXT: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> + // CHECK-NEXT: #0 [[PC]] main null.c:[[@LINE+3]] + // CHECK-NEXT: 0x0000000000000000 is located 0 bytes inside of 0-byte region [0x0000000000000000,0x0000000000000000) + // clang-format on + *Null = 42; + } +} diff --git a/offload/test/sanitizer/heap_out_of_bounds.c b/offload/test/sanitizer/heap_out_of_bounds.c new file mode 100644 index 0000000000000..37a8d99b5241f --- /dev/null +++ b/offload/test/sanitizer/heap_out_of_bounds.c @@ -0,0 +1,27 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +// Align lines. + +#include +#include + +int main(int argc, char **argv) { + int N = argc > 42 ? 
1000 : 100; + double A[N]; +#pragma omp target map(from : A[ : N]) + { + // CHECK: is located 7992 bytes inside of a 800-byte region + A[999] = 3.14; + } +} diff --git a/offload/test/sanitizer/heap_partial_out_of_bounds.c b/offload/test/sanitizer/heap_partial_out_of_bounds.c new file mode 100644 index 0000000000000..981cba3a8eebd --- /dev/null +++ b/offload/test/sanitizer/heap_partial_out_of_bounds.c @@ -0,0 +1,31 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +// Align lines. + +#include "omp.h" +#include +#include + +int main(int argc, char **argv) { + int N = argc > 42 ? 1000 : 100; + double *A = + (double *)omp_target_alloc(N * sizeof(*A), omp_get_default_device()); + char *C = ((char *)&A[N - 1] + 1); +#pragma omp target is_device_ptr(A, C) + { + // CHECK: is located 793 bytes inside of a 800-byte region + double *D = (double *)C; + *D = 3.14; + } +} diff --git a/offload/test/sanitizer/heap_random.c b/offload/test/sanitizer/heap_random.c new file mode 100644 index 0000000000000..78e92b94a746a --- /dev/null +++ b/offload/test/sanitizer/heap_random.c @@ -0,0 +1,23 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +// Align lines. 
+ +int main(void) { + + int X = 0; + int *Random = &X; +#pragma omp target + { *Random = 99; } + // CHECK: 0x{{[a-f0-9]*}} is located {{[0-9]*}} bytes inside +} diff --git a/offload/test/sanitizer/null_forced_stack.c b/offload/test/sanitizer/null_forced_stack.c new file mode 100644 index 0000000000000..e59e34b3a0cd2 --- /dev/null +++ b/offload/test/sanitizer/null_forced_stack.c @@ -0,0 +1,43 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O1 +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 -g +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +[[clang::optnone]] int *deref(int **P) { return *P; } + +int *bar(int **P) { return deref(P); } + +int main(void) { + +#pragma omp target + { + int *NullPtr = 0; + // clang-format off + // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // CHECK: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // CHECK: #0 [[PC]] omp target (main:[[@LINE-6]]) in :0 + // + // CHECK: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // + // DEBUG: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // DEBUG: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // DEBUG: #0 [[PC]] omp target (main:[[@LINE-12]]) in 
{{.*}}volatile_stack_null.c:[[@LINE+4]] + // + // DEBUG: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // clang-format on + bar(&NullPtr)[10] = 42; + } +} diff --git a/offload/test/sanitizer/optimize_sanitizer_checks/illegal_simple_access_lower.c b/offload/test/sanitizer/optimize_sanitizer_checks/illegal_simple_access_lower.c new file mode 100644 index 0000000000000..0dd0daeab7f14 --- /dev/null +++ b/offload/test/sanitizer/optimize_sanitizer_checks/illegal_simple_access_lower.c @@ -0,0 +1,35 @@ +#include +#include +#include + +typedef int IntTy; + +IntTy *foo(int Size) { + + IntTy *a; + + a = (IntTy *)malloc(sizeof(IntTy) * Size); + +#pragma omp target teams map(from : a [0:Size]) + { + for (IntTy I = -1; I < Size; I++) { + a[I] = I; + } + } + + return a; +} + +void printArray(int *a, int Size) { + + for (IntTy I = 0; I < Size; I++) { + printf("a: %d ", a[I]); + } +} + +int main() { + + int N = 1000; + int *a = foo(N); + printArray(a, N); +} diff --git a/offload/test/sanitizer/optimize_sanitizer_checks/illegal_simple_access_upper.c b/offload/test/sanitizer/optimize_sanitizer_checks/illegal_simple_access_upper.c new file mode 100644 index 0000000000000..3665b8d7eca22 --- /dev/null +++ b/offload/test/sanitizer/optimize_sanitizer_checks/illegal_simple_access_upper.c @@ -0,0 +1,35 @@ +#include +#include +#include + +typedef int IntTy; + +IntTy *foo(int Size) { + + IntTy *a; + + a = (IntTy *)malloc(sizeof(IntTy) * Size); + +#pragma omp target teams map(from : a [0:Size]) + { + for (IntTy I = 0; I < Size + 1; I++) { + a[I] = I; + } + } + + return a; +} + +void printArray(int *a, int Size) { + + for (IntTy I = 0; I < Size; I++) { + printf("a: %d ", a[I]); + } +} + +int main() { + + int N = 1000; + int *a = foo(N); + printArray(a, N); +} diff --git a/offload/test/sanitizer/optimize_sanitizer_checks/legal_access_mergable.c b/offload/test/sanitizer/optimize_sanitizer_checks/legal_access_mergable.c new file mode 
100644 index 0000000000000..bcebc2a442c27 --- /dev/null +++ b/offload/test/sanitizer/optimize_sanitizer_checks/legal_access_mergable.c @@ -0,0 +1,55 @@ +#include +#include +#include + +typedef int IntTy; + +typedef struct { + IntTy *a; + IntTy *b; + IntTy *c; +} Product; + +Product *foo(IntTy N) { + + IntTy *a; + IntTy *b; + IntTy *c; + + a = (IntTy *)malloc(sizeof(IntTy) * N); + b = (IntTy *)malloc(sizeof(IntTy) * N); + c = (IntTy *)malloc(sizeof(IntTy) * N); + +#pragma omp target teams map(from : a [0:N], b [0:N], c [0:N]) + { + for (IntTy I = 0; I < N; I++) { + a[I] = I; + b[I] = I; + c[I] = a[I] + b[I]; + } + } + + Product *P = (Product *)malloc(sizeof(Product)); + P->a = a; + P->b = b; + P->c = c; + return P; +} + +void printProduct(Product *P, IntTy N) { + + IntTy *a = P->a; + IntTy *b = P->b; + IntTy *c = P->c; + + for (IntTy i = 1; i < N; i++) { + printf("a: %d, b:%d, c:%d\n", a[i], b[i], c[i]); + } +} + +int main() { + + IntTy N = 1000; + Product *P = foo(N); + printProduct(P, N); +} diff --git a/offload/test/sanitizer/optimize_sanitizer_checks/legal_simple_access_monotonic.c b/offload/test/sanitizer/optimize_sanitizer_checks/legal_simple_access_monotonic.c new file mode 100644 index 0000000000000..16105cc4f7493 --- /dev/null +++ b/offload/test/sanitizer/optimize_sanitizer_checks/legal_simple_access_monotonic.c @@ -0,0 +1,34 @@ +#include +#include +#include + +typedef int IntTy; + +IntTy *foo(IntTy Size) { + + IntTy *a; + a = (IntTy *)malloc(sizeof(IntTy) * Size); + +#pragma omp target teams map(from : a[0:Size]) + { + for (IntTy I = 0; I < Size; I++) { + a[I] = I; + } + } + + return a; +} + +void printArray(IntTy *a, IntTy Size) { + + for (IntTy I = 0; I < Size; I++) { + printf("a: %d ", a[I]); + } +} + +int main() { + + int N = 1000; + int *a = foo(N); + printArray(a, N); +} diff --git a/offload/test/sanitizer/optimize_sanitizer_checks/legal_simple_access_non_monotonic.c 
b/offload/test/sanitizer/optimize_sanitizer_checks/legal_simple_access_non_monotonic.c new file mode 100644 index 0000000000000..dfe5986eea180 --- /dev/null +++ b/offload/test/sanitizer/optimize_sanitizer_checks/legal_simple_access_non_monotonic.c @@ -0,0 +1,52 @@ +#include +#include +#include + +typedef int IntTy; + +IntTy *foo() { + + IntTy *a; + IntTy *b; + IntTy *c; + + int N = 1000; + + a = (IntTy *)malloc(sizeof(IntTy) * N * N); + b = (IntTy *)malloc(sizeof(IntTy) * N * N); + c = (IntTy *)malloc(sizeof(IntTy) * N * N); + + // I ranges from -N, -N + 1, ..., 0, 1, 2, ... N + // Square ranges from N^2 - 1, (N-1)^2 ..., 0, 1, 4, 9, 16, ... N^2 - 1 +#pragma omp target teams map(from : a [0:N * N]) + { + int Square; + for (IntTy I = -N; I <= N; I++) { + Square = I * I; + // legal boundary access. + if (I == N || I == -N) { + Square = Square - 1; + } + a[Square] = I; + } + } + + return a; +} + +void printArray(int *a, int N) { + + for (IntTy I = -N; I <= N; I++) { + int Index = I * I; + if (I == N || I == -N) { + Index = Index - 1; + } + printf("a: %d ", a[Index]); + } +} + +int main() { + int N = 1000; + int *a = foo(); + printArray(a, N); +} diff --git a/offload/test/sanitizer/stack_trace_1.c b/offload/test/sanitizer/stack_trace_1.c new file mode 100644 index 0000000000000..7129f67c70900 --- /dev/null +++ b/offload/test/sanitizer/stack_trace_1.c @@ -0,0 +1,43 @@ +// clang-format off +// : %libomptarget-compileopt-generic -fsanitize=offload -O1 +// : not %libomptarget-run-generic 2> %t.out +// : %fcheck-generic --check-prefixes=CHECK < %t.out +// : %libomptarget-compileopt-generic -fsanitize=offload -O3 +// : not %libomptarget-run-generic 2> %t.out +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 -g +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu 
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +[[clang::optnone]] int deref(int *P) { return *P; } + +[[gnu::always_inline]] int bar(int *P) { return deref(P); } + +int main(void) { + +#pragma omp target + { + int *NullPtr = 0; + // clang-format off + // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // CHECK: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // CHECK: #0 [[PC]] omp target (main:[[@LINE-6]]) in :0 + // + // CHECK: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // + // DEBUG: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // DEBUG: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // DEBUG: #0 [[PC]] omp target (main:[[@LINE-12]]) in {{.*}}volatile_stack_null.c:[[@LINE+4]] + // + // DEBUG: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // clang-format on + bar(NullPtr); + } +} diff --git a/offload/test/sanitizer/stack_trace_multi_path_1.c b/offload/test/sanitizer/stack_trace_multi_path_1.c new file mode 100644 index 0000000000000..58717afe64df5 --- /dev/null +++ b/offload/test/sanitizer/stack_trace_multi_path_1.c @@ -0,0 +1,48 @@ +// clang-format off +// : %libomptarget-compileopt-generic -fsanitize=offload -O1 +// : not %libomptarget-run-generic 2> %t.out +// : %fcheck-generic --check-prefixes=CHECK < %t.out +// : %libomptarget-compileopt-generic -fsanitize=offload -O3 +// : not %libomptarget-run-generic 2> %t.out +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 -g +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: 
aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include + +[[clang::optnone]] int deref(int *P) { return *P; } + +[[gnu::noinline]] int bar(int *P) { return deref(P); } +[[gnu::noinline]] int baz(int *P) { return deref(P); } + +int main(void) { + + int *Valid = (int *)omp_target_alloc(4, omp_get_default_device()); +#pragma omp target is_device_ptr(Valid) + { + int *NullPtr = 0; + // clang-format off + // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // CHECK: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // CHECK: #0 [[PC]] omp target (main:[[@LINE-6]]) in :0 + // + // CHECK: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // + // DEBUG: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // DEBUG: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // DEBUG: #0 [[PC]] omp target (main:[[@LINE-12]]) in {{.*}}volatile_stack_null.c:[[@LINE+4]] + // + // DEBUG: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // clang-format on + bar(Valid); + baz(NullPtr); + } +} diff --git a/offload/test/sanitizer/stack_trace_multi_path_many.cpp b/offload/test/sanitizer/stack_trace_multi_path_many.cpp new file mode 100644 index 0000000000000..6d113985670ff --- /dev/null +++ b/offload/test/sanitizer/stack_trace_multi_path_many.cpp @@ -0,0 +1,45 @@ +// clang-format off +// : %libomptarget-compileoptxx-generic -fsanitize=offload -O1 +// : not %libomptarget-run-generic 2> %t.out +// : %fcheck-generic --check-prefixes=CHECK < %t.out +// : %libomptarget-compileoptxx-generic -fsanitize=offload -O3 +// : not %libomptarget-run-generic 2> %t.out +// RUN: 
%libomptarget-compileoptxx-generic -fsanitize=offload -O3 -g -DLEVELS=11 +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=DBG11 < %t.out +// %libomptarget-compileoptxx-generic -fsanitize=offload -O3 -g -DLEVELS=12 +// not %libomptarget-run-generic 2> %t.out +// %fcheck-generic --check-prefixes=DBG12 < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include + +template [[clang::optnone]] T deref(T *P) { return *P; } + +template [[gnu::noinline]] T level(T *P) { + if constexpr (LEVEL > 1) + return level(P) + level(P); + if constexpr (LEVEL > 0) + return level(P); + return deref(P); +} + +int main(void) { + + int *ValidInt = (int *)omp_target_alloc(4, omp_get_default_device()); +#pragma omp target is_device_ptr(ValidInt) + { + level(ValidInt); + short *ValidShort = ((short *)ValidInt) + 2; + level(ValidShort); + char *Invalid = ((char *)ValidInt) + 4; + level(Invalid); + } +} diff --git a/offload/test/sanitizer/volatile_stack_null.c b/offload/test/sanitizer/volatile_stack_null.c new file mode 100644 index 0000000000000..08542c3516f28 --- /dev/null +++ b/offload/test/sanitizer/volatile_stack_null.c @@ -0,0 +1,39 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O1 +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 -g +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// 
UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +int main(void) { + +#pragma omp target + { + volatile int *Null = 0; + // clang-format off + // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // CHECK: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // CHECK: #0 [[PC]] omp target (main:[[@LINE-6]]) in :0 + // + // CHECK: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // + // DEBUG: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // DEBUG: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // DEBUG: #0 [[PC]] omp target (main:[[@LINE-12]]) in {{.*}}volatile_stack_null.c:[[@LINE+4]] + // + // DEBUG: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // clang-format on + *Null = 42; + } +} diff --git a/openmp/runtime/src/include/ompx.h.var b/openmp/runtime/src/include/ompx.h.var index 623f0b9c315bd..2d3ad31188ced 100644 --- a/openmp/runtime/src/include/ompx.h.var +++ b/openmp/runtime/src/include/ompx.h.var @@ -139,6 +139,20 @@ _TGT_KERNEL_LANGUAGE_DECL_GRID_C(grid_dim) uint64_t ompx_ballot_sync(uint64_t mask, int pred); +/// ompx_shfl_sync_{i,f,l,d} +///{ +#define _TGT_KERNEL_LANGUAGE_SHFL_SYNC(TYPE, TY) \ + TYPE ompx_shfl_sync_##TY(uint64_t mask, TYPE var, int src, \ + int width); + +_TGT_KERNEL_LANGUAGE_SHFL_SYNC(int, i) +_TGT_KERNEL_LANGUAGE_SHFL_SYNC(float, f) +_TGT_KERNEL_LANGUAGE_SHFL_SYNC(long, l) +_TGT_KERNEL_LANGUAGE_SHFL_SYNC(double, d) + +#undef _TGT_KERNEL_LANGUAGE_SHFL_SYNC +///} + /// ompx_shfl_down_sync_{i,f,l,d} ///{ #define _TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC(TYPE, TY) \ @@ -208,6 +222,22 @@ static inline uint64_t ballot_sync(uint64_t mask, int pred) { return 
ompx_ballot_sync(mask, pred); } +/// shfl_sync +///{ +#define _TGT_KERNEL_LANGUAGE_SHFL_SYNC(TYPE, TY) \ + static inline TYPE shfl_sync(uint64_t mask, TYPE var, int src, \ + int width = __WARP_SIZE) { \ + return ompx_shfl_sync_##TY(mask, var, src, width); \ + } + +_TGT_KERNEL_LANGUAGE_SHFL_SYNC(int, i) +_TGT_KERNEL_LANGUAGE_SHFL_SYNC(float, f) +_TGT_KERNEL_LANGUAGE_SHFL_SYNC(long, l) +_TGT_KERNEL_LANGUAGE_SHFL_SYNC(double, d) + +#undef _TGT_KERNEL_LANGUAGE_SHFL_SYNC +///} + +/// shfl_down_sync +///{ +#define _TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC(TYPE, TY) \