diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 2dea3cd4d795b..e8d3be7e89dbb 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -288,6 +288,7 @@ LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kern LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP") LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP") LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.") +LANGOPT(OffloadViaLLVM, 1, 0, "target LLVM/Offload as portable offloading runtime.") LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device") LANGOPT(SYCLIsHost , 1, 0, "SYCL host compilation") diff --git a/clang/include/clang/Basic/Sanitizers.def b/clang/include/clang/Basic/Sanitizers.def index bee35e9dca7c3..0b41187f6db52 100644 --- a/clang/include/clang/Basic/Sanitizers.def +++ b/clang/include/clang/Basic/Sanitizers.def @@ -190,6 +190,9 @@ SANITIZER_GROUP("bounds", Bounds, ArrayBounds | LocalBounds) // Scudo hardened allocator SANITIZER("scudo", Scudo) +// LLVM/Offload sanitizer +SANITIZER("offload", Offload) + // Magic group, containing all sanitizers. For example, "-fno-sanitize=all" // can be used to disable all the sanitizers. 
SANITIZER_GROUP("all", All, ~SanitizerMask()) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index d44faa55c456f..b77fd063f5519 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1293,6 +1293,12 @@ def no_offload_compress : Flag<["--"], "no-offload-compress">; def offload_compression_level_EQ : Joined<["--"], "offload-compression-level=">, Flags<[HelpHidden]>, HelpText<"Compression level for offload device binaries (HIP only)">; + +defm offload_via_llvm : BoolFOption<"offload-via-llvm", + LangOpts<"OffloadViaLLVM">, DefaultFalse, + PosFlag, + NegFlag, + BothFlags<[], [ClangOption], " LLVM/Offload as portable offloading runtime.">>; } // CUDA options diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h index 47ef175302679..004d5fbf4af73 100644 --- a/clang/include/clang/Driver/SanitizerArgs.h +++ b/clang/include/clang/Driver/SanitizerArgs.h @@ -80,6 +80,9 @@ class SanitizerArgs { bool needsMemProfRt() const { return NeedsMemProfRt; } bool needsAsanRt() const { return Sanitizers.has(SanitizerKind::Address); } + bool needsOffloadKernels() const { + return Sanitizers.has(SanitizerKind::Offload); + } bool needsHwasanRt() const { return Sanitizers.has(SanitizerKind::HWAddress); } diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index b09680086248d..b1d11b4a6497a 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -70,6 +70,7 @@ #include "llvm/Transforms/Instrumentation/BoundsChecking.h" #include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h" #include "llvm/Transforms/Instrumentation/GCOVProfiler.h" +#include "llvm/Transforms/Instrumentation/GPUSan.h" #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" #include "llvm/Transforms/Instrumentation/InstrProfiling.h" #include "llvm/Transforms/Instrumentation/KCFI.h" diff --git 
a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 43dfbbb90dd52..2ebe0bf802dfa 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -15,10 +15,12 @@ #include "CGCXXABI.h" #include "CodeGenFunction.h" #include "CodeGenModule.h" +#include "clang/AST/CharUnits.h" #include "clang/AST/Decl.h" #include "clang/Basic/Cuda.h" #include "clang/CodeGen/CodeGenABITypes.h" #include "clang/CodeGen/ConstantInitBuilder.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Frontend/Offloading/Utility.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -36,6 +38,11 @@ constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF" class CGNVCUDARuntime : public CGCUDARuntime { + /// The prefix used for function calls and section names (CUDA, HIP, LLVM) + StringRef Prefix; + /// TODO: We should transition the OpenMP section to LLVM/Offload + StringRef SectionPrefix; + private: llvm::IntegerType *IntTy, *SizeTy; llvm::Type *VoidTy; @@ -132,6 +139,9 @@ class CGNVCUDARuntime : public CGCUDARuntime { return DummyFunc; } + Address prepareKernelArgs(CodeGenFunction &CGF, FunctionArgList &Args); + Address prepareKernelArgsLLVMOffload(CodeGenFunction &CGF, + FunctionArgList &Args); void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); std::string getDeviceSideName(const NamedDecl *ND) override; @@ -191,15 +201,11 @@ class CGNVCUDARuntime : public CGCUDARuntime { } // end anonymous namespace std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { - if (CGM.getLangOpts().HIP) - return ((Twine("hip") + Twine(FuncName)).str()); - return ((Twine("cuda") + Twine(FuncName)).str()); + return (Prefix + FuncName).str(); } std::string CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { - if (CGM.getLangOpts().HIP) - return ((Twine("__hip") + Twine(FuncName)).str()); - return ((Twine("__cuda") + 
Twine(FuncName)).str()); + return ("__" + Prefix + FuncName).str(); } static std::unique_ptr InitDeviceMC(CodeGenModule &CGM) { @@ -227,6 +233,14 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) SizeTy = CGM.SizeTy; VoidTy = CGM.VoidTy; PtrTy = CGM.UnqualPtrTy; + + if (CGM.getLangOpts().OffloadViaLLVM) { + Prefix = "llvm"; + SectionPrefix = "omp"; + } else if (CGM.getLangOpts().HIP) + SectionPrefix = Prefix = "hip"; + else + SectionPrefix = Prefix = "cuda"; } llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const { @@ -305,18 +319,58 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, } if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), CudaFeature::CUDA_USES_NEW_LAUNCH) || - (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI)) + (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI) || + (CGF.getLangOpts().OffloadViaLLVM)) emitDeviceStubBodyNew(CGF, Args); else emitDeviceStubBodyLegacy(CGF, Args); } -// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local -// array and kernels are launched using cudaLaunchKernel(). -void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, - FunctionArgList &Args) { - // Build the shadow stack entry at the very start of the function. +/// CUDA passes the arguments with a level of indirection. For example, a +/// (void*, short, void*) is passed as {void **, short *, void **} to the launch +/// function. For the LLVM/offload launch we flatten the arguments into the +/// struct directly. In addition, we include the size of the arguments, thus +/// pass {sizeof({void *, short, void *}), ptr to {void *, short, void *}, +/// nullptr}. The last nullptr needs to be initialized to an array of pointers +/// pointing to the arguments if we want to offload to the host. 
+Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF, + FunctionArgList &Args) { + SmallVector ArgTypes, KernelLaunchParamsTypes; + for (auto &Arg : Args) + ArgTypes.push_back(CGF.ConvertTypeForMem(Arg->getType())); + llvm::StructType *KernelArgsTy = llvm::StructType::create(ArgTypes); + + auto *Int64Ty = CGF.Builder.getInt64Ty(); + KernelLaunchParamsTypes.push_back(Int64Ty); + KernelLaunchParamsTypes.push_back(PtrTy); + KernelLaunchParamsTypes.push_back(PtrTy); + + llvm::StructType *KernelLaunchParamsTy = + llvm::StructType::create(KernelLaunchParamsTypes); + Address KernelArgs = CGF.CreateTempAllocaWithoutCast( + KernelArgsTy, CharUnits::fromQuantity(16), "kernel_args"); + Address KernelLaunchParams = CGF.CreateTempAllocaWithoutCast( + KernelLaunchParamsTy, CharUnits::fromQuantity(16), + "kernel_launch_params"); + + auto KernelArgsSize = CGM.getDataLayout().getTypeAllocSize(KernelArgsTy); + CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSize), + CGF.Builder.CreateStructGEP(KernelLaunchParams, 0)); + CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF), + CGF.Builder.CreateStructGEP(KernelLaunchParams, 1)); + CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy), + CGF.Builder.CreateStructGEP(KernelLaunchParams, 2)); + + for (unsigned i = 0; i < Args.size(); ++i) { + auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i])); + CGF.Builder.CreateStore(ArgVal, CGF.Builder.CreateStructGEP(KernelArgs, i)); + } + return KernelLaunchParams; +} + +Address CGNVCUDARuntime::prepareKernelArgs(CodeGenFunction &CGF, + FunctionArgList &Args) { // Calculate amount of space we will need for all arguments. If we have no // args, allocate a single pointer so we still have a valid pointer to the // argument array that we can pass to runtime, even if it will be unused. 
@@ -331,6 +385,17 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, VoidVarPtr, CGF.Builder.CreateConstGEP1_32( PtrTy, KernelArgs.emitRawPointer(CGF), i)); } + return KernelArgs; +} + +// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local +// array and kernels are launched using cudaLaunchKernel(). +void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, + FunctionArgList &Args) { + // Build the shadow stack entry at the very start of the function. + Address KernelArgs = CGF.getLangOpts().OffloadViaLLVM + ? prepareKernelArgsLLVMOffload(CGF, Args) + : prepareKernelArgs(CGF, Args); llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); @@ -1129,8 +1194,9 @@ void CGNVCUDARuntime::transformManagedVars() { // registered. The linker will provide a pointer to this section so we can // register the symbols with the linked device image. void CGNVCUDARuntime::createOffloadingEntries() { - StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries" - : "cuda_offloading_entries"; + SmallVector Out; + StringRef Section = (SectionPrefix + "_offloading_entries").toStringRef(Out); + llvm::Module &M = CGM.getModule(); for (KernelInfo &I : EmittedKernels) llvm::offloading::emitOffloadingEntry( @@ -1199,7 +1265,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() { } return nullptr; } - if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode) + if (CGM.getLangOpts().OffloadViaLLVM || + (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)) createOffloadingEntries(); else return makeModuleCtorFunction(); diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp index a88bb2af59fee..90ebc638bd45e 100644 --- a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -456,6 +456,10 @@ llvm::Function *CodeGenModule::CreateGlobalInitOrCleanUpFunction( !isInNoSanitizeList(SanitizerKind::Address, Fn, Loc)) Fn->addFnAttr(llvm::Attribute::SanitizeAddress); + if 
(getLangOpts().Sanitize.has(SanitizerKind::Offload) && + !isInNoSanitizeList(SanitizerKind::Offload, Fn, Loc)) + Fn->addFnAttr(llvm::Attribute::SanitizeAddress); + if (getLangOpts().Sanitize.has(SanitizerKind::KernelAddress) && !isInNoSanitizeList(SanitizerKind::KernelAddress, Fn, Loc)) Fn->addFnAttr(llvm::Attribute::SanitizeAddress); diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index cea0d84c64bc4..d11c4ed65a597 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -31,6 +31,7 @@ #include "clang/AST/StmtObjC.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/CodeGenOptions.h" +#include "clang/Basic/Sanitizers.h" #include "clang/Basic/TargetBuiltins.h" #include "clang/Basic/TargetInfo.h" #include "clang/CodeGen/CGFunctionInfo.h" @@ -67,7 +68,8 @@ static bool shouldEmitLifetimeMarkers(const CodeGenOptions &CGOpts, // Sanitizers may use markers. if (CGOpts.SanitizeAddressUseAfterScope || LangOpts.Sanitize.has(SanitizerKind::HWAddress) || - LangOpts.Sanitize.has(SanitizerKind::Memory)) + LangOpts.Sanitize.has(SanitizerKind::Memory) || + LangOpts.Sanitize.has(SanitizerKind::Offload)) return true; // For now, only in optimized builds. @@ -791,6 +793,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, SanOpts.set(SanitizerKind::KernelHWAddress, false); if (no_sanitize_mask & SanitizerKind::KernelHWAddress) SanOpts.set(SanitizerKind::HWAddress, false); + if (no_sanitize_mask & SanitizerKind::Offload) + SanOpts.set(SanitizerKind::Offload, false); if (SanitizeBounds && !SanOpts.hasOneOf(SanitizerKind::Bounds)) Fn->addFnAttr(llvm::Attribute::NoSanitizeBounds); @@ -809,7 +813,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, CurFn->addFnAttr(llvm::Attribute::DisableSanitizerInstrumentation); } else { // Apply sanitizer attributes to the function. 
- if (SanOpts.hasOneOf(SanitizerKind::Address | SanitizerKind::KernelAddress)) + if (SanOpts.hasOneOf(SanitizerKind::Address | SanitizerKind::KernelAddress | + SanitizerKind::Offload)) Fn->addFnAttr(llvm::Attribute::SanitizeAddress); if (SanOpts.hasOneOf(SanitizerKind::HWAddress | SanitizerKind::KernelHWAddress)) diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 67bf0604acd6e..0f55b429512c2 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -792,11 +792,13 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, }) || C.getInputArgs().hasArg(options::OPT_hip_link) || C.getInputArgs().hasArg(options::OPT_hipstdpar); + bool UseLLVMOffload = C.getInputArgs().hasArg( + options::OPT_foffload_via_llvm, options::OPT_fno_offload_via_llvm, false); if (IsCuda && IsHIP) { Diag(clang::diag::err_drv_mix_cuda_hip); return; } - if (IsCuda) { + if (IsCuda && !UseLLVMOffload) { const ToolChain *HostTC = C.getSingleOffloadToolChain(); const llvm::Triple &HostTriple = HostTC->getTriple(); auto OFK = Action::OFK_Cuda; @@ -818,7 +820,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, CudaInstallation.WarnIfUnsupportedVersion(); } C.addOffloadDeviceToolChain(CudaTC.get(), OFK); - } else if (IsHIP) { + } else if (IsHIP && !UseLLVMOffload) { if (auto *OMPTargetArg = C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) { Diag(clang::diag::err_drv_unsupported_opt_for_language_mode) @@ -842,10 +844,11 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, // We need to generate an OpenMP toolchain if the user specified targets with // the -fopenmp-targets option or used --offload-arch with OpenMP enabled. 
bool IsOpenMPOffloading = - C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, - options::OPT_fno_openmp, false) && - (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) || - C.getInputArgs().hasArg(options::OPT_offload_arch_EQ)); + ((IsCuda || IsHIP) && UseLLVMOffload) || + (C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, + options::OPT_fno_openmp, false) && + (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) || + C.getInputArgs().hasArg(options::OPT_offload_arch_EQ))); if (IsOpenMPOffloading) { // We expect that -fopenmp-targets is always used in conjunction with the // option -fopenmp specifying a valid runtime with offloading support, i.e. @@ -873,7 +876,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, for (StringRef T : OpenMPTargets->getValues()) OpenMPTriples.insert(T); } else if (C.getInputArgs().hasArg(options::OPT_offload_arch_EQ) && - !IsHIP && !IsCuda) { + ((!IsHIP && !IsCuda) || UseLLVMOffload)) { const ToolChain *HostTC = C.getSingleOffloadToolChain(); auto AMDTriple = getHIPOffloadTargetTriple(*this, C.getInputArgs()); auto NVPTXTriple = getNVIDIAOffloadTargetTriple(*this, C.getInputArgs(), @@ -4146,6 +4149,8 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args, bool UseNewOffloadingDriver = C.isOffloadingHostKind(Action::OFK_OpenMP) || + Args.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false) || Args.hasFlag(options::OPT_offload_new_driver, options::OPT_no_offload_new_driver, false); diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 40ab2e91125d1..05b53c5573a1c 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -1367,7 +1367,8 @@ SanitizerMask ToolChain::getSupportedSanitizers() const { SanitizerKind::CFICastStrict | SanitizerKind::FloatDivideByZero | SanitizerKind::KCFI | SanitizerKind::UnsignedIntegerOverflow | SanitizerKind::UnsignedShiftBase | 
SanitizerKind::ImplicitConversion | - SanitizerKind::Nullability | SanitizerKind::LocalBounds; + SanitizerKind::Nullability | SanitizerKind::LocalBounds | + SanitizerKind::Offload; if (getTriple().getArch() == llvm::Triple::x86 || getTriple().getArch() == llvm::Triple::x86_64 || getTriple().getArch() == llvm::Triple::arm || getTriple().isWasm() || diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index b8d8ff3db5d1f..77d365bad229f 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1125,6 +1125,18 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA, CmdArgs.push_back("__clang_openmp_device_functions.h"); } + if (Args.hasArg(options::OPT_foffload_via_llvm)) { + // Add llvm_wrappers/* to our system include path. This lets us wrap + // standard library headers and other headers. + SmallString<128> P(D.ResourceDir); + llvm::sys::path::append(P, "include", "llvm_offload_wrappers"); + CmdArgs.append({"-internal-isystem", Args.MakeArgString(P), "-include"}); + if (JA.isDeviceOffloading(Action::OFK_OpenMP)) + CmdArgs.push_back("__llvm_offload_device.h"); + else + CmdArgs.push_back("__llvm_offload_host.h"); + } + // Add -i* options, and automatically translate to // -include-pch/-include-pth for transparent PCH support. It's // wonky, but we include looking for .gch so we can support seamless @@ -6598,6 +6610,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, // device offloading action other than OpenMP. 
if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, options::OPT_fno_openmp, false) && + !Args.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false) && (JA.isDeviceOffloading(Action::OFK_None) || JA.isDeviceOffloading(Action::OFK_OpenMP))) { switch (D.getOpenMPRuntime(Args)) { @@ -6675,11 +6689,16 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.addOptOutFlag(CmdArgs, options::OPT_fopenmp_extensions, options::OPT_fno_openmp_extensions); } - - // Forward the new driver to change offloading code generation. - if (Args.hasFlag(options::OPT_offload_new_driver, - options::OPT_no_offload_new_driver, false)) + // Forward the offload runtime change to code generation, liboffload implies + // new driver. Otherwise, check if we should forward the new driver to change + // offloading code generation. + if (Args.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false)) { + CmdArgs.append({"--offload-new-driver", "-foffload-via-llvm"}); + } else if (Args.hasFlag(options::OPT_offload_new_driver, + options::OPT_no_offload_new_driver, false)) { CmdArgs.push_back("--offload-new-driver"); + } SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 2a4c1369f5a73..ecbee87ed6486 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1204,9 +1204,20 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs, const ToolChain &TC, const ArgList &Args, bool ForceStaticHostRuntime, bool IsOffloadingHost, bool GompNeedsRT) { + const SanitizerArgs &SanArgs = TC.getSanitizerArgs(Args); + if (SanArgs.needsOffloadKernels()) { + CmdArgs.push_back("-loffload.kernels"); + CmdArgs.append({"-mllvm", "-enable-offload-sanitizer"}); + } + if (!Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, - options::OPT_fno_openmp, false)) + 
options::OPT_fno_openmp, false)) { + // We need libomptarget (liboffload) if it's the chosen offloading runtime. + if (Args.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false)) + CmdArgs.push_back("-lomptarget"); return false; + } Driver::OpenMPRuntimeKind RTKind = TC.getDriver().getOpenMPRuntime(Args); diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 2dfc7457b0ac7..7d154865ce3f2 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -861,17 +861,15 @@ void CudaToolChain::addClangTargetOptions( DeviceOffloadingKind == Action::OFK_Cuda) && "Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs."); - if (DeviceOffloadingKind == Action::OFK_Cuda) { - CC1Args.append( - {"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"}); - - // Unsized function arguments used for variadics were introduced in CUDA-9.0 - // We still do not support generating code that actually uses variadic - // arguments yet, but we do need to allow parsing them as recent CUDA - // headers rely on that. https://github.com/llvm/llvm-project/issues/58410 - if (CudaInstallation.version() >= CudaVersion::CUDA_90) - CC1Args.push_back("-fcuda-allow-variadic-functions"); - } + CC1Args.append( + {"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"}); + + // Unsized function arguments used for variadics were introduced in CUDA-9.0 + // We still do not support generating code that actually uses variadic + // arguments yet, but we do need to allow parsing them as recent CUDA + // headers rely on that. 
https://github.com/llvm/llvm-project/issues/58410 + if (CudaInstallation.version() >= CudaVersion::CUDA_90) + CC1Args.push_back("-fcuda-allow-variadic-functions"); if (DriverArgs.hasArg(options::OPT_nogpulib)) return; @@ -889,6 +887,13 @@ void CudaToolChain::addClangTargetOptions( CC1Args.push_back("-mlink-builtin-bitcode"); CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile)); + // For now, we don't use any Offload/OpenMP device runtime when we offload + // CUDA via LLVM/Offload. We should split the Offload/OpenMP device runtime + // and include the "generic" (or CUDA-specific) parts. + if (DriverArgs.hasFlag(options::OPT_foffload_via_llvm, + options::OPT_fno_offload_via_llvm, false)) + return; + clang::CudaVersion CudaInstallationVersion = CudaInstallation.version(); if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr, diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index d3090e488306f..251e5b0ba2381 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -325,6 +325,13 @@ set(openmp_wrapper_files openmp_wrappers/new ) +set(llvm_offload_wrapper_files + llvm_offload_wrappers/__llvm_offload.h + llvm_offload_wrappers/__llvm_offload_host.h + llvm_offload_wrappers/__llvm_offload_device.h + llvm_offload_wrappers/cuda_runtime.h +) + set(llvm_libc_wrapper_files llvm_libc_wrappers/assert.h llvm_libc_wrappers/stdio.h @@ -375,7 +382,7 @@ endfunction(clang_generate_header) # Copy header files from the source directory to the build directory foreach( f ${files} ${cuda_wrapper_files} ${cuda_wrapper_bits_files} ${ppc_wrapper_files} ${openmp_wrapper_files} ${zos_wrapper_files} ${hlsl_files} - ${llvm_libc_wrapper_files}) + ${llvm_libc_wrapper_files} ${llvm_offload_wrapper_files}) copy_header_to_output_dir(${CMAKE_CURRENT_SOURCE_DIR} ${f}) endforeach( f ) @@ -501,6 +508,7 @@ add_header_target("hlsl-resource-headers" ${hlsl_files}) add_header_target("opencl-resource-headers" ${opencl_files}) 
add_header_target("llvm-libc-resource-headers" ${llvm_libc_wrapper_files}) add_header_target("openmp-resource-headers" ${openmp_wrapper_files}) +add_header_target("llvm-offload-resource-headers" ${llvm_offload_wrapper_files}) add_header_target("windows-resource-headers" ${windows_only_files}) add_header_target("utility-resource-headers" ${utility_files}) @@ -542,6 +550,11 @@ install( DESTINATION ${header_install_dir}/openmp_wrappers COMPONENT clang-resource-headers) +install( + FILES ${llvm_offload_wrapper_files} + DESTINATION ${header_install_dir}/llvm_offload_wrappers + COMPONENT clang-resource-headers) + install( FILES ${zos_wrapper_files} DESTINATION ${header_install_dir}/zos_wrappers @@ -704,8 +717,8 @@ install( COMPONENT openmp-resource-headers) install( - FILES ${openmp_wrapper_files} - DESTINATION ${header_install_dir}/openmp_wrappers + FILES ${llvm_offload_wrapper_files} + DESTINATION ${header_install_dir}/llvm_offload_wrappers EXCLUDE_FROM_ALL COMPONENT openmp-resource-headers) diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h new file mode 100644 index 0000000000000..2898898904e29 --- /dev/null +++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h @@ -0,0 +1,31 @@ +/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#include + +#define __host__ __attribute__((host)) +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) +#define __shared__ __attribute__((shared)) +#define __constant__ __attribute__((constant)) +#define __managed__ __attribute__((managed)) + +extern "C" { + +typedef struct dim3 { + dim3() {} + dim3(unsigned x) : x(x) {} + unsigned x = 0, y = 0, z = 0; +} dim3; + +// TODO: For some reason the CUDA device compilation requires this declaration +// to be present on the device while it is only used on the host. +unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim, + size_t sharedMem = 0, void *stream = 0); +} diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h new file mode 100644 index 0000000000000..1a813b331515b --- /dev/null +++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h @@ -0,0 +1,10 @@ +/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#include "__llvm_offload.h" diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h new file mode 100644 index 0000000000000..160289d169b55 --- /dev/null +++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h @@ -0,0 +1,15 @@ +/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#include "__llvm_offload.h" + +extern "C" { +unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, + void **args, size_t sharedMem = 0, void *stream = 0); +} diff --git a/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h b/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h new file mode 100644 index 0000000000000..2d698e1c14e49 --- /dev/null +++ b/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h @@ -0,0 +1,137 @@ +/*===- __cuda_runtime.h - LLVM/Offload wrappers for CUDA runtime API -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CUDA_RUNTIME_API__ +#define __CUDA_RUNTIME_API__ + +#include +#include +#include + +extern "C" { +int omp_get_initial_device(void); +void omp_target_free(void *Ptr, int Device); +void *omp_target_alloc(size_t Size, int Device); +int omp_target_memcpy(void *Dst, const void *Src, size_t Length, + size_t DstOffset, size_t SrcOffset, int DstDevice, + int SrcDevice); +void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum); +int __tgt_target_synchronize_async_info_queue(void *Loc, int64_t DeviceNum, + void *AsyncInfoQueue); +} + +// TODO: There are many fields missing in this enumeration. 
+typedef enum cudaError { + cudaSuccess = 0, + cudaErrorInvalidValue = 1, + cudaErrorMemoryAllocation = 2, + cudaErrorNoDevice = 100, + cudaErrorInvalidDevice = 101, + cudaErrorOTHER = -1, +} cudaError_t; + +enum cudaMemcpyKind { + cudaMemcpyHostToHost = 0, + cudaMemcpyHostToDevice = 1, + cudaMemcpyDeviceToHost = 2, + cudaMemcpyDeviceToDevice = 3, + cudaMemcpyDefault = 4 +}; + +typedef void *cudaStream_t; + +static thread_local cudaError_t __cudaomp_last_error = cudaSuccess; + +// Returns the last error that has been produced and resets it to cudaSuccess. +inline cudaError_t cudaGetLastError() { + cudaError_t TempError = __cudaomp_last_error; + __cudaomp_last_error = cudaSuccess; + return TempError; +} + +// Returns the last error that has been produced without resetting it. +inline cudaError_t cudaPeekAtLastError() { return __cudaomp_last_error; } + +inline cudaError_t cudaDeviceSynchronize() { + int DeviceNum = 0; + return __cudaomp_last_error = + (cudaError_t)__tgt_target_synchronize_async_info_queue( + /*Loc=*/nullptr, DeviceNum, /*AsyncInfoQueue=*/nullptr); +} + +inline cudaError_t __cudaMalloc(void **devPtr, size_t size) { + int DeviceNum = 0; + *devPtr = omp_target_alloc(size, DeviceNum); + if (*devPtr == NULL) + return __cudaomp_last_error = cudaErrorMemoryAllocation; + + return __cudaomp_last_error = cudaSuccess; +} + +template <class T> cudaError_t cudaMalloc(T **devPtr, size_t size) { + return __cudaMalloc((void **)devPtr, size); +} + +inline cudaError_t __cudaFree(void *devPtr) { + int DeviceNum = 0; + omp_target_free(devPtr, DeviceNum); + return __cudaomp_last_error = cudaSuccess; +} + +template <class T> inline cudaError_t cudaFree(T *ptr) { + return __cudaFree((void *)ptr); +} + +inline cudaError_t __cudaMemcpy(void *dst, const void *src, size_t count, + cudaMemcpyKind kind) { + // get the host device number (which is the initial device) + int HostDeviceNum = omp_get_initial_device(); + + // use the default device for gpu + int GPUDeviceNum = 0; + + // default to copy 
 from host to device + int DstDeviceNum = GPUDeviceNum; + int SrcDeviceNum = HostDeviceNum; + + if (kind == cudaMemcpyDeviceToHost) + std::swap(DstDeviceNum, SrcDeviceNum); + + // omp_target_memcpy returns 0 on success and non-zero on failure + if (omp_target_memcpy(dst, src, count, 0, 0, DstDeviceNum, SrcDeviceNum)) + return __cudaomp_last_error = cudaErrorInvalidValue; + return __cudaomp_last_error = cudaSuccess; +} + +template <class T> +inline cudaError_t cudaMemcpy(T *dst, const T *src, size_t count, + cudaMemcpyKind kind) { + return __cudaMemcpy((void *)dst, (const void *)src, count, kind); +} + +inline cudaError_t __cudaMemset(void *devPtr, int value, size_t count, + cudaStream_t stream = 0) { + int DeviceNum = 0; + if (!omp_target_memset(devPtr, value, count, DeviceNum)) + return __cudaomp_last_error = cudaErrorInvalidValue; + return __cudaomp_last_error = cudaSuccess; +} + +template <class T> +inline cudaError_t cudaMemset(T *devPtr, int value, size_t count) { + return __cudaMemset((void *)devPtr, value, count); +} + +inline cudaError_t cudaDeviceReset(void) { + cudaDeviceSynchronize(); + // TODO: not implemented. + return __cudaomp_last_error = cudaSuccess; +} + +#endif diff --git a/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h index d5b6846b03488..3e354c63efc66 100644 --- a/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h +++ b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h @@ -10,17 +10,15 @@ #ifndef __CLANG_OPENMP_DEVICE_FUNCTIONS_H__ #define __CLANG_OPENMP_DEVICE_FUNCTIONS_H__ -#ifndef _OPENMP -#error "This file is for OpenMP compilation only."
-#endif - #ifdef __cplusplus extern "C" { #endif +#ifdef __NVPTX__ #pragma omp begin declare variant match( \ device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) +#pragma push_macro("__CUDA__") #define __CUDA__ #define __OPENMP_NVPTX__ @@ -31,9 +29,10 @@ extern "C" { #include <__clang_cuda_device_functions.h> #undef __OPENMP_NVPTX__ -#undef __CUDA__ +#pragma pop_macro("__CUDA__") #pragma omp end declare variant +#endif #ifdef __AMDGCN__ #pragma omp begin declare variant match(device = {arch(amdgcn)}) diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp index 580b9872c6a1d..ec37c0df56c67 100644 --- a/clang/lib/Sema/SemaCUDA.cpp +++ b/clang/lib/Sema/SemaCUDA.cpp @@ -1068,6 +1068,9 @@ void SemaCUDA::inheritTargetAttrs(FunctionDecl *FD, } std::string SemaCUDA::getConfigureFuncName() const { + if (getLangOpts().OffloadViaLLVM) + return "__llvmPushCallConfiguration"; + if (getLangOpts().HIP) return getLangOpts().HIPUseNewLaunchAPI ? "__hipPushCallConfiguration" : "hipConfigureCall"; diff --git a/clang/test/CodeGenCUDA/offload_via_llvm.cu b/clang/test/CodeGenCUDA/offload_via_llvm.cu new file mode 100644 index 0000000000000..3eb580850fc48 --- /dev/null +++ b/clang/test/CodeGenCUDA/offload_via_llvm.cu @@ -0,0 +1,97 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang -Xclang -triple -Xclang "x86_64-unknown-linux-gnu" -S -c -foffload-via-llvm -emit-llvm -o - %s | FileCheck %s + +// Check that we generate LLVM/Offload calls, including the KERNEL_LAUNCH_PARAMS argument. 
+ +// CHECK-LABEL: define dso_local void @_Z18__device_stub__fooisPvS_( +// CHECK-SAME: i32 noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[TMP0]], align 16 +// CHECK-NEXT: [[KERNEL_LAUNCH_PARAMS:%.*]] = alloca [[TMP1]], align 16 +// CHECK-NEXT: [[GRID_DIM:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 8 +// CHECK-NEXT: [[BLOCK_DIM:%.*]] = alloca [[STRUCT_DIM3]], align 8 +// CHECK-NEXT: [[SHMEM_SIZE:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[STREAM:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[GRID_DIM_COERCE:%.*]] = alloca { i64, i32 }, align 8 +// CHECK-NEXT: [[BLOCK_DIM_COERCE:%.*]] = alloca { i64, i32 }, align 8 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4 +// CHECK-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR3]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 0 +// CHECK-NEXT: store i64 24, ptr [[TMP4]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 1 +// CHECK-NEXT: store ptr [[KERNEL_ARGS]], ptr [[TMP5]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 2 +// CHECK-NEXT: store ptr null, ptr [[TMP6]], align 16 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP8]], align 16 +// CHECK-NEXT: [[TMP9:%.*]] = load 
i16, ptr [[DOTADDR1]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK-NEXT: store i16 [[TMP9]], ptr [[TMP10]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTADDR2]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK-NEXT: store ptr [[TMP11]], ptr [[TMP12]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTADDR3]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK-NEXT: store ptr [[TMP13]], ptr [[TMP14]], align 16 +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__llvmPopCallConfiguration(ptr [[GRID_DIM]], ptr [[BLOCK_DIM]], ptr [[SHMEM_SIZE]], ptr [[STREAM]]) +// CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[SHMEM_SIZE]], align 8 +// CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[STREAM]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[GRID_DIM_COERCE]], ptr align 8 [[GRID_DIM]], i64 12, i1 false) +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[GRID_DIM_COERCE]], i32 0, i32 0 +// CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP18]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[GRID_DIM_COERCE]], i32 0, i32 1 +// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[BLOCK_DIM_COERCE]], ptr align 8 [[BLOCK_DIM]], i64 12, i1 false) +// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[BLOCK_DIM_COERCE]], i32 0, i32 0 +// CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP22]], align 8 +// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[BLOCK_DIM_COERCE]], i32 0, i32 1 +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 8 +// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @llvmLaunchKernel(ptr noundef @_Z18__device_stub__fooisPvS_, i64 [[TMP19]], i32 
[[TMP21]], i64 [[TMP23]], i32 [[TMP25]], ptr noundef [[KERNEL_LAUNCH_PARAMS]], i64 noundef [[TMP16]], ptr noundef [[TMP17]]) +// CHECK-NEXT: br label %[[SETUP_END:.*]] +// CHECK: [[SETUP_END]]: +// CHECK-NEXT: ret void +// +__global__ void foo(int, short, void *, void *) {} + +// CHECK-LABEL: define dso_local void @_Z5test1Pv( +// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 4 +// CHECK-NEXT: [[AGG_TMP1:%.*]] = alloca [[STRUCT_DIM3]], align 4 +// CHECK-NEXT: [[AGG_TMP_COERCE:%.*]] = alloca { i64, i32 }, align 4 +// CHECK-NEXT: [[AGG_TMP1_COERCE:%.*]] = alloca { i64, i32 }, align 4 +// CHECK-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN4dim3C2Ej(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP]], i32 noundef 3) +// CHECK-NEXT: call void @_ZN4dim3C2Ej(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP1]], i32 noundef 7) +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP_COERCE]], ptr align 4 [[AGG_TMP]], i64 12, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP_COERCE]], i32 0, i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[TMP0]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP_COERCE]], i32 0, i32 1 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP1_COERCE]], ptr align 4 [[AGG_TMP1]], i64 12, i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP1_COERCE]], i32 0, i32 0 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP4]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP1_COERCE]], i32 0, i32 1 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +// CHECK-NEXT: [[CALL:%.*]] 
= call i32 @__llvmPushCallConfiguration(i64 [[TMP1]], i32 [[TMP3]], i64 [[TMP5]], i32 [[TMP7]], i64 noundef 0, ptr noundef null) +// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[CALL]], 0 +// CHECK-NEXT: br i1 [[TOBOOL]], label %[[KCALL_END:.*]], label %[[KCALL_CONFIGOK:.*]] +// CHECK: [[KCALL_CONFIGOK]]: +// CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: call void @_Z18__device_stub__fooisPvS_(i32 noundef 13, i16 noundef signext 1, ptr noundef [[TMP8]], ptr noundef [[TMP9]]) #[[ATTR5:[0-9]+]] +// CHECK-NEXT: br label %[[KCALL_END]] +// CHECK: [[KCALL_END]]: +// CHECK-NEXT: ret void +// +void test1(void *Ptr) { + foo<<<3, 7>>>(13, 1, Ptr, Ptr); +} diff --git a/clang/test/Driver/cuda-via-liboffload.cu b/clang/test/Driver/cuda-via-liboffload.cu new file mode 100644 index 0000000000000..68dc963e906b2 --- /dev/null +++ b/clang/test/Driver/cuda-via-liboffload.cu @@ -0,0 +1,23 @@ +// RUN: %clang -### -target x86_64-linux-gnu -foffload-via-llvm -ccc-print-bindings \ +// RUN: --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \ +// RUN: | FileCheck -check-prefix BINDINGS %s + +// BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_BC:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[PTX_SM_35:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_35]]"], output: "[[CUBIN_SM_35:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[PTX_SM_70:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_70:.+]]"], output: "[[CUBIN_SM_70:.+]]" +// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[CUBIN_SM_35]]", "[[CUBIN_SM_70]]"], output: "[[BINARY:.+]]" +// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[BINARY]]"], 
output: "[[HOST_OBJ:.+]]" +// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out" + +// RUN: %clang -### -target x86_64-linux-gnu -foffload-via-llvm -ccc-print-bindings \ +// RUN: --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \ +// RUN: | FileCheck -check-prefix BINDINGS-DEVICE %s + +// BINDINGS-DEVICE: # "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[PTX:.+]]" +// BINDINGS-DEVICE: # "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX]]"], output: "[[CUBIN:.+]]" + +// RUN: %clang -### -target x86_64-linux-gnu -ccc-print-bindings --offload-link -foffload-via-llvm %s 2>&1 | FileCheck -check-prefix DEVICE-LINK %s + +// DEVICE-LINK: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[INPUT:.+]]"], output: "a.out" diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h index 338b56226f204..a7be3f51fac7d 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h @@ -72,7 +72,7 @@ enum class IdentFlag { #include "llvm/Frontend/OpenMP/OMPKinds.def" // Version of the kernel argument format used by the omp runtime. -#define OMP_KERNEL_ARG_VERSION 3 +#define OMP_KERNEL_ARG_VERSION 4 // Minimum version of the compiler that generates a kernel dynamic pointer. 
#define OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR 3 diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index fe09bb8177c28..0be3827185e2e 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -90,7 +90,7 @@ __OMP_ARRAY_TYPE(Int32Arr3, Int32, 3) __OMP_STRUCT_TYPE(Ident, ident_t, false, Int32, Int32, Int32, Int32, Int8Ptr) __OMP_STRUCT_TYPE(KernelArgs, __tgt_kernel_arguments, false, Int32, Int32, VoidPtrPtr, VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr, VoidPtrPtr, - Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32) + Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32, VoidPtr) __OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, false, Int8Ptr) __OMP_STRUCT_TYPE(DependInfo, kmp_dep_info, false, SizeTy, SizeTy, Int8) __OMP_STRUCT_TYPE(Task, kmp_task_ompbuilder_t, false, VoidPtr, VoidPtr, Int32, VoidPtr, VoidPtr) diff --git a/llvm/include/llvm/Transforms/Instrumentation/GPUSan.h b/llvm/include/llvm/Transforms/Instrumentation/GPUSan.h new file mode 100644 index 0000000000000..1201b3c47ac06 --- /dev/null +++ b/llvm/include/llvm/Transforms/Instrumentation/GPUSan.h @@ -0,0 +1,26 @@ +//===- Transforms/Instrumentation/GPUSan.h ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file provides the interface for LLVM's PGO Instrumentation lowering +/// pass. 
+//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_GPUSAN_H +#define LLVM_TRANSFORMS_INSTRUMENTATION_GPUSAN_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class GPUSanPass : public PassInfoMixin<GPUSanPass> { +public: + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_INSTRUMENTATION_GPUSAN_H diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 92213e19c9d9d..ba73d48bc9a8b 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -496,6 +496,7 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, auto Int32Ty = Type::getInt32Ty(Builder.getContext()); Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, 3)); Value *Flags = Builder.getInt64(KernelArgs.HasNoWait); + Value *AsyncInfoQueue = Constant::getNullValue(Builder.getPtrTy()); Value *NumTeams3D = Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams, {0}); @@ -514,7 +515,8 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, Flags, NumTeams3D, NumThreads3D, - KernelArgs.DynCGGroupMem}; + KernelArgs.DynCGGroupMem, + AsyncInfoQueue}; } void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) { diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 19e8a8ab68a73..42fdb2415c2fe 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -174,6 +174,7 @@ #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h" #include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h" #include "llvm/Transforms/Instrumentation/GCOVProfiler.h" +#include "llvm/Transforms/Instrumentation/GPUSan.h" #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" #include "llvm/Transforms/Instrumentation/InstrOrderFile.h" #include
"llvm/Transforms/Instrumentation/InstrProfiling.h" diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 926515c9508a9..410aedd57dc9a 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -71,6 +71,7 @@ #include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/Instrumentation/CGProfile.h" #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h" +#include "llvm/Transforms/Instrumentation/GPUSan.h" #include "llvm/Transforms/Instrumentation/InstrOrderFile.h" #include "llvm/Transforms/Instrumentation/InstrProfiling.h" #include "llvm/Transforms/Instrumentation/MemProfiler.h" @@ -163,6 +164,10 @@ static cl::opt cl::Hidden, cl::desc("Enable inline deferral during PGO")); +static cl::opt + EnableOffloadSanitizer("enable-offload-sanitizer", cl::init(false), + cl::Hidden, cl::desc("Enable offload sanitizer")); + static cl::opt EnableModuleInliner("enable-module-inliner", cl::init(false), cl::Hidden, cl::desc("Enable module inliner")); @@ -1750,6 +1755,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // in ICP. MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); + if (EnableOffloadSanitizer) + MPM.addPass(GPUSanPass()); + invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); // Emit annotation remarks. @@ -1828,6 +1836,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // pipeline). MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); + if (EnableOffloadSanitizer) + MPM.addPass(GPUSanPass()); + invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); // Emit annotation remarks. @@ -2040,6 +2051,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, if (PTO.CallGraphProfile) MPM.addPass(CGProfilePass(/*InLTOPostLink=*/true)); + if (EnableOffloadSanitizer) + MPM.addPass(GPUSanPass()); + invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); // Emit annotation remarks. 
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 60c517790bcab..741c1d3e1842a 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -78,6 +78,7 @@ MODULE_PASS("inliner-wrapper-no-mandatory-first", MODULE_PASS("insert-gcov-profiling", GCOVProfilerPass()) MODULE_PASS("instrorderfile", InstrOrderFilePass()) MODULE_PASS("instrprof", InstrProfilingLoweringPass()) +MODULE_PASS("gpusan", GPUSanPass()) MODULE_PASS("ctx-instr-lower", PGOCtxProfLoweringPass()) MODULE_PASS("invalidate", InvalidateAllAnalysesPass()) MODULE_PASS("iroutliner", IROutlinerPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 6e7d34f5adaa3..a310038543532 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -2043,7 +2043,9 @@ static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL, // // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some // address spaces have non-zero null values. - auto SrcPtrKB = computeKnownBits(V, DL).trunc(DL.getPointerSizeInBits(AS)); + auto SrcPtrKB = computeKnownBits(V, DL); + if (SrcPtrKB.getBitWidth() > DL.getPointerSizeInBits(AS)) + SrcPtrKB = SrcPtrKB.trunc(DL.getPointerSizeInBits(AS)); const auto NullVal = TM.getNullPointerValue(AS); assert((NullVal == 0 || NullVal == -1) && "don't know how to check for this null value!"); diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 05a38cdd7d7b6..ebdc2c3682aab 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -7536,7 +7536,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { // Collect all tail calls in the function as we cannot allow new allocas to // escape into tail recursion. 
// TODO: Be smarter about new allocas escaping into tail calls. - SmallVector TailCalls; + SmallVector TailCalls; bool UsedAssumedInformation = false; if (!A.checkForAllInstructions( [&](Instruction &I) { @@ -7574,8 +7574,9 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { AI, Arg->getType(), "", IP); Arg->replaceAllUsesWith(AI); - for (CallInst *CI : TailCalls) - CI->setTailCall(false); + for (auto &CI : TailCalls) + if (CI) + cast(CI)->setTailCall(false); }; // Callback to repair a call site of the associated function. The elements diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index e3a4821b8226b..89ce01907deb7 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -620,11 +620,13 @@ struct OMPInformationCache : public InformationCache { // functions, except if `optnone` is present. if (isOpenMPDevice(M)) { for (Function &F : M) { - for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"}) - if (F.hasFnAttribute(Attribute::NoInline) && - F.getName().starts_with(Prefix) && - !F.hasFnAttribute(Attribute::OptimizeNone)) - F.removeFnAttr(Attribute::NoInline); + for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"}) { + if (!F.getName().starts_with(Prefix) || + F.hasFnAttribute(Attribute::OptimizeNone)) + continue; + F.removeFnAttr(Attribute::NoInline); + F.addFnAttr(Attribute::AlwaysInline); + } } } diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt index 8d345d394b51a..9aa530229ab7e 100644 --- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt @@ -6,6 +6,7 @@ add_llvm_component_library(LLVMInstrumentation DataFlowSanitizer.cpp GCOVProfiling.cpp BlockCoverageInference.cpp + GPUSan.cpp MemProfiler.cpp MemorySanitizer.cpp IndirectCallPromotion.cpp diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp 
b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp new file mode 100644 index 0000000000000..611ec9c888782 --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp @@ -0,0 +1,904 @@ +//===-- GPUSan.cpp - GPU sanitizer ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation/GPUSan.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/StringSaver.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "gpusan" + +cl::opt UseTags( + "gpusan-use-tags", + cl::desc( + "Use tags to detect use after if the 
number of allocations is large"), + cl::init(false)); + +namespace llvm { + +struct LocationInfoTy { + uint64_t LineNo = 0; + uint64_t ColumnNo = 0; + uint64_t ParentIdx = -1; + StringRef FileName; + StringRef FunctionName; + bool operator==(const LocationInfoTy &RHS) const { + return LineNo == RHS.LineNo && ColumnNo == RHS.ColumnNo && + FileName == RHS.FileName && FunctionName == RHS.FunctionName; + } +}; +template <> struct DenseMapInfo { + static LocationInfoTy EmptyKey; + static LocationInfoTy TombstoneKey; + static inline LocationInfoTy *getEmptyKey() { return &EmptyKey; } + + static inline LocationInfoTy *getTombstoneKey() { return &TombstoneKey; } + + static unsigned getHashValue(const LocationInfoTy *LI) { + unsigned Hash = DenseMapInfo::getHashValue(LI->LineNo); + Hash = detail::combineHashValue( + Hash, DenseMapInfo::getHashValue(LI->ColumnNo)); + Hash = detail::combineHashValue( + Hash, DenseMapInfo::getHashValue(LI->FileName)); + Hash = detail::combineHashValue( + Hash, DenseMapInfo::getHashValue(LI->FunctionName)); + return Hash; + } + + static bool isEqual(const LocationInfoTy *LHS, const LocationInfoTy *RHS) { + return *LHS == *RHS; + } +}; +LocationInfoTy DenseMapInfo::EmptyKey = + LocationInfoTy{(uint64_t)-1}; +LocationInfoTy DenseMapInfo::TombstoneKey = + LocationInfoTy{(uint64_t)-2}; +} // namespace llvm + +namespace { + +enum PtrOrigin { + UNKNOWN, + LOCAL, + GLOBAL, + SYSTEM, + NONE, +}; + +static std::string getSuffix(PtrOrigin PO) { + switch (PO) { + case UNKNOWN: + return ""; + case LOCAL: + return "_local"; + case GLOBAL: + return "_global"; + default: + break; + } + llvm_unreachable("Bad pointer origin!"); +} + +static StringRef prettifyFunctionName(StringSaver &SS, StringRef Name) { + if (Name.ends_with(".internalized")) + return SS.save(Name.drop_back(sizeof("internalized")) + " (internalized)"); + if (!Name.starts_with("__omp_offloading_")) + return Name; + Name = Name.drop_front(sizeof("__omp_offloading_")); + auto It = 
Name.find_first_of("_"); + if (It != StringRef::npos && It + 1 < Name.size()) + Name = Name.drop_front(It + 1); + It = Name.find_first_of("_"); + if (It != StringRef::npos && It + 1 < Name.size()) + Name = Name.drop_front(It + 1); + if (Name.ends_with("_debug__")) + Name = Name.drop_back(sizeof("debug__")); + if (Name.ends_with("_debug___omp_outlined_debug__")) + Name = Name.drop_back(sizeof("debug___omp_outlined_debug__")); + It = Name.find_last_of("_"); + if (It == StringRef::npos || It + 1 >= Name.size()) + return Name; + if (Name[It + 1] != 'l') + return Name; + int64_t KernelLineNo = 0; + Name.take_back(Name.size() - It - + /* '_' and 'l' */ 2) + .getAsInteger(10, KernelLineNo); + if (KernelLineNo) + Name = SS.save("omp target (" + Name.take_front(It).str() + ":" + + std::to_string(KernelLineNo) + ")"); + return Name; +} + +class GPUSanImpl final { +public: + GPUSanImpl(Module &M, FunctionAnalysisManager &FAM) + : M(M), FAM(FAM), Ctx(M.getContext()) {} + + bool instrument(); + +private: + bool instrumentGlobals(); + bool instrumentFunction(Function &Fn); + Value *instrumentAllocation(Instruction &I, Value &Size, FunctionCallee Fn, + PtrOrigin PO); + Value *instrumentAllocaInst(LoopInfo &LI, AllocaInst &AI); + void instrumentAccess(LoopInfo &LI, Instruction &I, int PtrIdx, + Type &AccessTy, bool IsRead); + void instrumentLoadInst(LoopInfo &LI, LoadInst &LoadI); + void instrumentStoreInst(LoopInfo &LI, StoreInst &StoreI); + void instrumentGEPInst(LoopInfo &LI, GetElementPtrInst &GEP); + bool instrumentCallInst(LoopInfo &LI, CallInst &CI); + void + instrumentReturns(SmallVectorImpl> &Allocas, + SmallVectorImpl &Returns); + + Value *getPC(IRBuilder<> &IRB); + Value *getFunctionName(IRBuilder<> &IRB); + Value *getFileName(IRBuilder<> &IRB); + Value *getLineNo(IRBuilder<> &IRB); + + void getAllocationInfo(Function &Fn, PtrOrigin PO, Value &Object, + Value *&Start, Value *&Length, Value *&Tag); + PtrOrigin getPtrOrigin(LoopInfo &LI, Value *Ptr, + const Value **Object 
= nullptr); + + FunctionCallee getOrCreateFn(FunctionCallee &FC, StringRef Name, Type *RetTy, + ArrayRef ArgTys) { + if (!FC) { + auto *NewAllocationFnTy = FunctionType::get(RetTy, ArgTys, false); + FC = M.getOrInsertFunction(Name, NewAllocationFnTy); + } + return FC; + } + + PointerType *getPtrTy(PtrOrigin PO) { + if (PO == PtrOrigin::LOCAL) + return PointerType::get(Ctx, 5); + return PtrTy; + } + + FunctionCallee getNewFn(PtrOrigin PO) { + assert(PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn(NewFn[PO], "ompx_new" + getSuffix(PO), getPtrTy(PO), + {getPtrTy(PO), Int64Ty, Int64Ty, Int64Ty}); + } + FunctionCallee getFreeFn(PtrOrigin PO) { + assert(PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn(FreeFn[PO], "ompx_free" + getSuffix(PO), VoidTy, + {getPtrTy(PO), Int64Ty}); + } + FunctionCallee getFreeNLocalFn() { + return getOrCreateFn(FreeNLocalFn, "ompx_free_local_n", VoidTy, {Int32Ty}); + } + FunctionCallee getCheckFn(PtrOrigin PO) { + assert(PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn(CheckFn[PO], "ompx_check" + getSuffix(PO), + getPtrTy(PO), + {getPtrTy(PO), Int64Ty, Int64Ty, Int64Ty}); + } + FunctionCallee getCheckWithBaseFn(PtrOrigin PO) { + assert(PO >= LOCAL && PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn(CheckWithBaseFn[PO], + "ompx_check_with_base" + getSuffix(PO), getPtrTy(PO), + {getPtrTy(PO), getPtrTy(PO), Int64Ty, Int32Ty, Int64Ty, + Int64Ty, Int64Ty}); + } + FunctionCallee getAllocationInfoFn(PtrOrigin PO) { + assert(PO >= LOCAL && PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn( + AllocationInfoFn[PO], "ompx_get_allocation_info" + getSuffix(PO), + StructType::create({getPtrTy(PO), Int64Ty, Int32Ty}), {getPtrTy(PO)}); + } + FunctionCallee getGEPFn(PtrOrigin PO) { + assert(PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn(GEPFn[PO], "ompx_gep" + getSuffix(PO), getPtrTy(PO), + {getPtrTy(PO), 
Int64Ty, Int64Ty}); + } + FunctionCallee getUnpackFn(PtrOrigin PO) { + assert(PO <= GLOBAL && "Origin does not need handling."); + return getOrCreateFn(UnpackFn[PO], "ompx_unpack" + getSuffix(PO), + getPtrTy(PO), {getPtrTy(PO), Int64Ty}); + } + FunctionCallee getLifetimeStart() { + return getOrCreateFn(LifetimeStartFn, "ompx_lifetime_start", VoidTy, + {getPtrTy(LOCAL), Int64Ty}); + } + FunctionCallee getLifetimeEnd() { + return getOrCreateFn(LifetimeEndFn, "ompx_lifetime_end", VoidTy, + {getPtrTy(LOCAL), Int64Ty}); + } + FunctionCallee getLeakCheckFn() { + FunctionCallee LeakCheckFn; + return getOrCreateFn(LeakCheckFn, "ompx_leak_check", VoidTy, {}); + } + FunctionCallee getThreadIdFn() { + return getOrCreateFn(ThreadIDFn, "ompx_global_thread_id", Int32Ty, {}); + } + + Module &M; + FunctionAnalysisManager &FAM; + LLVMContext &Ctx; + bool HasAllocas; + GlobalVariable *LocationsArray; + SmallSetVector AmbiguousCalls; + + Type *VoidTy = Type::getVoidTy(Ctx); + Type *IntptrTy = M.getDataLayout().getIntPtrType(Ctx); + PointerType *PtrTy = PointerType::getUnqual(Ctx); + IntegerType *Int8Ty = Type::getInt8Ty(Ctx); + IntegerType *Int32Ty = Type::getInt32Ty(Ctx); + IntegerType *Int64Ty = Type::getInt64Ty(Ctx); + + const DataLayout &DL = M.getDataLayout(); + + FunctionCallee NewFn[3]; + FunctionCallee GEPFn[3]; + FunctionCallee FreeFn[3]; + FunctionCallee CheckFn[3]; + FunctionCallee CheckWithBaseFn[3]; + FunctionCallee AllocationInfoFn[3]; + FunctionCallee UnpackFn[3]; + FunctionCallee LifetimeEndFn; + FunctionCallee LifetimeStartFn; + FunctionCallee FreeNLocalFn; + FunctionCallee ThreadIDFn; + + StringMap GlobalStringMap; + struct AllocationInfoTy { + Value *Start; + Value *Length; + Value *Tag; + }; + DenseMap, AllocationInfoTy> AllocationInfoMap; + + DenseMap> + LocationMap; + + const std::pair + addLocationInfo(LocationInfoTy *LI, bool &IsNew) { + auto It = LocationMap.insert({LI, LocationMap.size()}); + IsNew = It.second; + if (!IsNew) + delete LI; + return 
{It.first->first, It.first->second}; + } + + void buildCallTreeInfo(Function &Fn, LocationInfoTy &LI); + ConstantInt *getSourceIndex(Instruction &I, LocationInfoTy *LastLI = nullptr); + + uint64_t addString(StringRef S) { + const auto &It = UniqueStrings.insert({S, ConcatenatedString.size()}); + if (It.second) { + ConcatenatedString += S; + ConcatenatedString.push_back('\0'); + } + return It.first->second; + }; + + void encodeLocationInfo(LocationInfoTy &LI, uint64_t Idx) { + StringRef FunctionName = LI.FunctionName; + if (LI.ParentIdx == (decltype(LI.ParentIdx))-1) + FunctionName = prettifyFunctionName(SS, FunctionName); + + auto FuncIdx = addString(FunctionName); + auto FileIdx = addString(LI.FileName); + if (LocationEncoding.size() < (Idx + 1) * 5) + LocationEncoding.resize((Idx + 1) * 5); + LocationEncoding[Idx * 5 + 0] = ConstantInt::get(Int64Ty, FuncIdx); + LocationEncoding[Idx * 5 + 1] = ConstantInt::get(Int64Ty, FileIdx); + LocationEncoding[Idx * 5 + 2] = ConstantInt::get(Int64Ty, LI.LineNo); + LocationEncoding[Idx * 5 + 3] = ConstantInt::get(Int64Ty, LI.ColumnNo); + LocationEncoding[Idx * 5 + 4] = ConstantInt::get(Int64Ty, LI.ParentIdx); + } + + SmallVector LocationEncoding; + std::string ConcatenatedString; + DenseMap StringIndexMap; + DenseMap UniqueStrings; + + BumpPtrAllocator BPA; + StringSaver SS = StringSaver(BPA); +}; + +} // end anonymous namespace + +ConstantInt *GPUSanImpl::getSourceIndex(Instruction &I, + LocationInfoTy *LastLI) { + LocationInfoTy *LI = new LocationInfoTy(); + auto *DILoc = I.getDebugLoc().get(); + + auto FillLI = [&](LocationInfoTy &LI, DILocation &DIL) { + LI.FileName = DIL.getFilename(); + if (LI.FileName.empty()) + LI.FileName = I.getFunction()->getSubprogram()->getFilename(); + LI.FunctionName = DIL.getSubprogramLinkageName(); + if (LI.FunctionName.empty()) + LI.FunctionName = I.getFunction()->getName(); + LI.LineNo = DIL.getLine(); + LI.ColumnNo = DIL.getColumn(); + }; + + DILocation *ParentDILoc = nullptr; + if (DILoc) { 
+ FillLI(*LI, *DILoc); + ParentDILoc = DILoc->getInlinedAt(); + } else { + LI->FunctionName = I.getFunction()->getName(); + } + + bool IsNew; + uint64_t Idx; + std::tie(LI, Idx) = addLocationInfo(LI, IsNew); + if (LastLI) + LastLI->ParentIdx = Idx; + if (!IsNew) + return ConstantInt::get(Int64Ty, Idx); + + uint64_t CurIdx = Idx; + LocationInfoTy *CurLI = LI; + while (ParentDILoc) { + auto *ParentLI = new LocationInfoTy(); + FillLI(*ParentLI, *ParentDILoc); + uint64_t ParentIdx; + std::tie(ParentLI, ParentIdx) = addLocationInfo(ParentLI, IsNew); + CurLI->ParentIdx = ParentIdx; + if (!IsNew) + break; + encodeLocationInfo(*CurLI, CurIdx); + CurLI = ParentLI; + CurIdx = ParentIdx; + ParentDILoc = ParentDILoc->getInlinedAt(); + } + + Function &Fn = *I.getFunction(); + buildCallTreeInfo(Fn, *CurLI); + + encodeLocationInfo(*CurLI, CurIdx); + + return ConstantInt::get(Int64Ty, Idx); +} + +void GPUSanImpl::buildCallTreeInfo(Function &Fn, LocationInfoTy &LI) { + if (Fn.hasFnAttribute("kernel")) + return; + SmallVector Calls; + for (auto &U : Fn.uses()) { + auto *CB = dyn_cast(U.getUser()); + if (!CB) + continue; + if (!CB->isCallee(&U)) + continue; + Calls.push_back(CB); + } + if (Calls.size() == 1) { + getSourceIndex(*Calls.back(), &LI); + return; + } + LI.ParentIdx = -2; + AmbiguousCalls.insert(Calls.begin(), Calls.end()); +} + +Value *GPUSanImpl::getPC(IRBuilder<> &IRB) { + return IRB.CreateIntrinsic(Int64Ty, Intrinsic::amdgcn_s_getpc, {}, nullptr, + "PC"); +} +Value *GPUSanImpl::getFunctionName(IRBuilder<> &IRB) { + const auto &DLoc = IRB.getCurrentDebugLocation(); + StringRef FnName = IRB.GetInsertPoint()->getFunction()->getName(); + if (DLoc && DLoc.get()) { + StringRef SubprogramName = DLoc.get()->getSubprogramLinkageName(); + if (!SubprogramName.empty()) + FnName = SubprogramName; + } + StringRef Name = FnName.take_back(255); + Value *&NameVal = GlobalStringMap[Name]; + if (!NameVal) + NameVal = IRB.CreateAddrSpaceCast( + IRB.CreateGlobalStringPtr(Name, "", 
DL.getDefaultGlobalsAddressSpace(), + &M), + PtrTy); + return NameVal; +} +Value *GPUSanImpl::getFileName(IRBuilder<> &IRB) { + const auto &DLoc = IRB.getCurrentDebugLocation(); + if (!DLoc || DLoc->getFilename().empty()) + return ConstantPointerNull::get(PtrTy); + StringRef Name = DLoc->getFilename().take_back(255); + Value *&NameVal = GlobalStringMap[Name]; + if (!NameVal) + NameVal = IRB.CreateAddrSpaceCast( + IRB.CreateGlobalStringPtr(Name, "", DL.getDefaultGlobalsAddressSpace(), + &M), + PtrTy); + return NameVal; +} +Value *GPUSanImpl::getLineNo(IRBuilder<> &IRB) { + const auto &DLoc = IRB.getCurrentDebugLocation(); + if (!DLoc) + return Constant::getNullValue(Int64Ty); + return ConstantInt::get(Int64Ty, DLoc.getLine()); +} + +void GPUSanImpl::getAllocationInfo(Function &Fn, PtrOrigin PO, Value &Object, + Value *&Start, Value *&Length, Value *&Tag) { + auto &It = AllocationInfoMap[{&Fn, &Object}]; + if (!It.Start) { + auto *IP = dyn_cast(&Object); + if (IP) + IP = IP->getNextNode(); + else + IP = &*Fn.getEntryBlock().getFirstNonPHIOrDbgOrAlloca(); + IRBuilder<> IRB(IP); + auto *CB = IRB.CreateCall(getAllocationInfoFn(PO), + {IRB.CreateAddrSpaceCast(&Object, getPtrTy(PO))}); + It.Start = IRB.CreateExtractValue(CB, {0}); + It.Length = IRB.CreateExtractValue(CB, {1}); + It.Tag = IRB.CreateExtractValue(CB, {2}); + } + Start = It.Start; + Length = It.Length; + Tag = It.Tag; +} + +PtrOrigin GPUSanImpl::getPtrOrigin(LoopInfo &LI, Value *Ptr, + const Value **Object) { + SmallVector Objects; + getUnderlyingObjects(Ptr, Objects, &LI); + if (Object && Objects.size() == 1) + *Object = Objects.front(); + PtrOrigin PO = NONE; + for (auto *Obj : Objects) { + PtrOrigin ObjPO = HasAllocas ? 
UNKNOWN : GLOBAL; + if (isa(Obj)) { + ObjPO = LOCAL; + } else if (isa(Obj)) { + ObjPO = GLOBAL; + } else if (auto *II = dyn_cast(Obj)) { + if (II->getIntrinsicID() == Intrinsic::amdgcn_implicitarg_ptr || + II->getIntrinsicID() == Intrinsic::amdgcn_dispatch_ptr) + return SYSTEM; + } else if (auto *CI = dyn_cast(Obj)) { + if (auto *Callee = CI->getCalledFunction()) + if (Callee->getName().starts_with("ompx_")) { + if (Callee->getName().ends_with("_global")) + ObjPO = GLOBAL; + else if (Callee->getName().ends_with("_local")) + ObjPO = LOCAL; + } + } else if (auto *Arg = dyn_cast(Obj)) { + if (Arg->getParent()->hasFnAttribute("kernel")) + ObjPO = GLOBAL; + } + if (PO == NONE || PO == ObjPO) { + PO = ObjPO; + } else { + return UNKNOWN; + } + } + return PO; +} + +bool GPUSanImpl::instrumentGlobals() { + Function *DtorFn = + Function::Create(FunctionType::get(VoidTy, false), + GlobalValue::PrivateLinkage, "__san.dtor", &M); + BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", DtorFn); + IRBuilder<> IRB(Entry); + IRB.CreateCall(getLeakCheckFn()); + IRB.CreateRetVoid(); + appendToGlobalDtors(M, DtorFn, 0, nullptr); + + return true; + + Function *DTorFn; + std::tie(DTorFn, std::ignore) = getOrCreateSanitizerCtorAndInitFunctions( + M, "ompx.ctor", "ompx.init", + /*InitArgTypes=*/{}, + /*InitArgs=*/{}, + // This callback is invoked when the functions are created the first + // time. 
Hook them into the global ctors list in that case: + [&](Function *Ctor, FunctionCallee) { + appendToGlobalCtors(M, Ctor, 0, Ctor); + }); + return true; +} + +Value *GPUSanImpl::instrumentAllocation(Instruction &I, Value &Size, + FunctionCallee Fn, PtrOrigin PO) { + IRBuilder<> IRB(I.getNextNode()); + Value *PlainI = IRB.CreatePointerBitCastOrAddrSpaceCast(&I, getPtrTy(PO)); + static int AllocationId = 1; + auto *CB = + IRB.CreateCall(Fn, + {PlainI, &Size, ConstantInt::get(Int64Ty, AllocationId++), + getSourceIndex(I)}, + I.getName() + ".san"); + SmallVector Lifetimes; + I.replaceUsesWithIf( + IRB.CreatePointerBitCastOrAddrSpaceCast(CB, I.getType()), [&](Use &U) { + if (auto *LT = dyn_cast(U.getUser())) { + Lifetimes.push_back(LT); + return false; + } + return U.getUser() != PlainI && U.getUser() != CB; + }); + if (Lifetimes.empty()) + return CB; + + CB->setArgOperand(1, ConstantInt::get(Int64Ty, 0)); + for (auto *LT : Lifetimes) { + if (LT->getIntrinsicID() == Intrinsic::lifetime_start) { + IRB.SetInsertPoint(LT); + IRB.CreateCall(getLifetimeStart(), {CB, LT->getArgOperand(0)}); + } else { + IRB.SetInsertPoint(LT); + IRB.CreateCall(getLifetimeEnd(), {CB, LT->getArgOperand(0)}); + } + } + return CB; +} + +Value *GPUSanImpl::instrumentAllocaInst(LoopInfo &LI, AllocaInst &AI) { + auto SizeOrNone = AI.getAllocationSize(DL); + if (!SizeOrNone) + llvm_unreachable("TODO"); + Value *Size = ConstantInt::get(Int64Ty, *SizeOrNone); + return instrumentAllocation(AI, *Size, getNewFn(LOCAL), LOCAL); +} + +void GPUSanImpl::instrumentAccess(LoopInfo &LI, Instruction &I, int PtrIdx, + Type &AccessTy, bool IsRead) { + Value *PtrOp = I.getOperand(PtrIdx); + const Value *Object = nullptr; + PtrOrigin PO = getPtrOrigin(LI, PtrOp, &Object); + if (PO > GLOBAL) + return; + + Value *Start = nullptr; + Value *Length = nullptr; + Value *Tag = nullptr; + if (PO != UNKNOWN && Object) + getAllocationInfo(*I.getFunction(), PO, *const_cast(Object), Start, + Length, Tag); + + static int32_t 
ReadAccessId = -1; + static int32_t WriteAccessId = 1; + const int32_t &AccessId = IsRead ? ReadAccessId-- : WriteAccessId++; + + auto TySize = DL.getTypeStoreSize(&AccessTy); + assert(!TySize.isScalable()); + Value *Size = ConstantInt::get(Int64Ty, TySize.getFixedValue()); + IRBuilder<> IRB(&I); + Value *PlainPtrOp = + IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, getPtrTy(PO)); + CallInst *CB; + if (Start) { + CB = + IRB.CreateCall(getCheckWithBaseFn(PO), + {PlainPtrOp, Start, Length, Tag, Size, + ConstantInt::get(Int64Ty, AccessId), getSourceIndex(I)}, + I.getName() + ".san"); + } else { + CB = IRB.CreateCall(getCheckFn(PO), + {PlainPtrOp, Size, ConstantInt::get(Int64Ty, AccessId), + getSourceIndex(I)}, + I.getName() + ".san"); + } + I.setOperand(PtrIdx, + IRB.CreatePointerBitCastOrAddrSpaceCast(CB, PtrOp->getType())); +} + +void GPUSanImpl::instrumentLoadInst(LoopInfo &LI, LoadInst &LoadI) { + instrumentAccess(LI, LoadI, LoadInst::getPointerOperandIndex(), + *LoadI.getType(), + /*IsRead=*/true); +} + +void GPUSanImpl::instrumentStoreInst(LoopInfo &LI, StoreInst &StoreI) { + instrumentAccess(LI, StoreI, StoreInst::getPointerOperandIndex(), + *StoreI.getValueOperand()->getType(), /*IsRead=*/false); +} + +void GPUSanImpl::instrumentGEPInst(LoopInfo &LI, GetElementPtrInst &GEP) { + Value *PtrOp = GEP.getPointerOperand(); + PtrOrigin PO = getPtrOrigin(LI, PtrOp); + if (PO > GLOBAL) + return; + + GEP.setOperand(GetElementPtrInst::getPointerOperandIndex(), + Constant::getNullValue(PtrOp->getType())); + IRBuilder<> IRB(GEP.getNextNode()); + Value *PlainPtrOp = + IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, getPtrTy(PO)); + auto *CB = IRB.CreateCall(getGEPFn(PO), + {PlainPtrOp, UndefValue::get(Int64Ty), getPC(IRB)}, + GEP.getName() + ".san"); + GEP.replaceAllUsesWith( + IRB.CreatePointerBitCastOrAddrSpaceCast(CB, GEP.getType())); + Value *Offset = + new PtrToIntInst(&GEP, Int64Ty, GEP.getName() + ".san.offset", CB); + CB->setArgOperand(1, Offset); +} + +bool 
GPUSanImpl::instrumentCallInst(LoopInfo &LI, CallInst &CI) { + bool Changed = false; + if (isa(CI)) + return Changed; + if (auto *Fn = CI.getCalledFunction()) { + if (Fn->getName().starts_with("__kmpc_target_init")) + return Changed; + if ((Fn->isDeclaration() || Fn->getName().starts_with("__kmpc") || + Fn->getName().starts_with("rpc_")) && + !Fn->getName().starts_with("ompx")) { + IRBuilder<> IRB(&CI); + for (int I = 0, E = CI.arg_size(); I != E; ++I) { + Value *Op = CI.getArgOperand(I); + if (!Op->getType()->isPointerTy()) + continue; + PtrOrigin PO = getPtrOrigin(LI, Op); + if (PO > GLOBAL) + continue; + Value *PlainOp = + IRB.CreatePointerBitCastOrAddrSpaceCast(Op, getPtrTy(PO)); + auto *CB = IRB.CreateCall(getUnpackFn(PO), {PlainOp, getPC(IRB)}, + Op->getName() + ".unpack"); + CI.setArgOperand( + I, IRB.CreatePointerBitCastOrAddrSpaceCast(CB, Op->getType())); + Changed = true; + } + } + } + return Changed; +} + +bool GPUSanImpl::instrumentFunction(Function &Fn) { + if (Fn.isDeclaration()) + return false; + + bool Changed = false; + LoopInfo &LI = FAM.getResult(Fn); + SmallVector> Allocas; + SmallVector Returns; + SmallVector Loads; + SmallVector Stores; + SmallVector Calls; + SmallVector GEPs; + + for (auto &I : instructions(Fn)) { + switch (I.getOpcode()) { + case Instruction::Alloca: { + AllocaInst &AI = cast(I); + Allocas.push_back({&AI, nullptr}); + Changed = true; + break; + } + case Instruction::Load: + Loads.push_back(&cast(I)); + Changed = true; + break; + case Instruction::Store: + Stores.push_back(&cast(I)); + Changed = true; + break; + case Instruction::GetElementPtr: + GEPs.push_back(&cast(I)); + Changed = true; + break; + case Instruction::Call: { + auto &CI = cast(I); + Calls.push_back(&CI); + if (CI.isIndirectCall()) + AmbiguousCalls.insert(&CI); + break; + } + case Instruction::Ret: + Returns.push_back(&cast(I)); + break; + default: + break; + } + } + + for (auto *Load : Loads) + instrumentLoadInst(LI, *Load); + for (auto *Store : Stores) + 
instrumentStoreInst(LI, *Store); + for (auto *GEP : GEPs) + instrumentGEPInst(LI, *GEP); + for (auto *Call : Calls) + Changed |= instrumentCallInst(LI, *Call); + for (auto &It : Allocas) + It.second = instrumentAllocaInst(LI, *It.first); + + instrumentReturns(Allocas, Returns); + + return Changed; +} + +void GPUSanImpl::instrumentReturns( + SmallVectorImpl> &Allocas, + SmallVectorImpl &Returns) { + if (Allocas.empty()) + return; + for (auto *RI : Returns) { + IRBuilder<> IRB(RI); + IRB.CreateCall(getFreeNLocalFn(), + {ConstantInt::get(Int32Ty, Allocas.size())}); + } +} + +bool GPUSanImpl::instrument() { + bool Changed = instrumentGlobals(); + HasAllocas = [&]() { + for (Function &Fn : M) + for (auto &I : instructions(Fn)) + if (isa(I)) + return true; + return false; + }(); + + SmallVector Kernels; + for (Function &Fn : M) { + if (Fn.hasFnAttribute("kernel")) + Kernels.push_back(&Fn); + if (!Fn.getName().contains("ompx") && !Fn.getName().contains("__kmpc") && + !Fn.getName().starts_with("rpc_")) + if (!Fn.hasFnAttribute(Attribute::DisableSanitizerInstrumentation)) + Changed |= instrumentFunction(Fn); + } + + SmallVector AmbiguousCallsOrdered; + SmallVector AmbiguousCallsMapping; + if (LocationMap.empty()) + AmbiguousCalls.clear(); + for (size_t I = 0; I < AmbiguousCalls.size(); ++I) { + CallBase &CB = *AmbiguousCalls[I]; + AmbiguousCallsOrdered.push_back(&CB); + AmbiguousCallsMapping.push_back(getSourceIndex(CB)); + } + + uint64_t AmbiguousCallsBitWidth = + llvm::Log2_64_Ceil(AmbiguousCalls.size() + 1); + + new GlobalVariable(M, Int64Ty, /*isConstant=*/true, + GlobalValue::ExternalLinkage, + ConstantInt::get(Int64Ty, AmbiguousCallsBitWidth), + "__san.num_ambiguous_calls", nullptr, + GlobalValue::ThreadLocalMode::NotThreadLocal, 1); + + if (size_t NumAmbiguousCalls = AmbiguousCalls.size()) { + { + auto *ArrayTy = ArrayType::get(Int64Ty, NumAmbiguousCalls); + auto *GV = new GlobalVariable( + M, ArrayTy, /*isConstant=*/true, GlobalValue::ExternalLinkage, + 
ConstantArray::get(ArrayTy, AmbiguousCallsMapping), + "__san.ambiguous_calls_mapping", nullptr, + GlobalValue::ThreadLocalMode::NotThreadLocal, 4); + GV->setVisibility(GlobalValue::ProtectedVisibility); + } + + auto *ArrayTy = ArrayType::get(Int64Ty, 1024); + LocationsArray = new GlobalVariable( + M, ArrayTy, /*isConstant=*/false, GlobalValue::PrivateLinkage, + UndefValue::get(ArrayTy), "__san.calls", nullptr, + GlobalValue::ThreadLocalMode::NotThreadLocal, 3); + + auto *OldFn = M.getFunction("__san_get_location_value"); + if (OldFn) + OldFn->setName(""); + Function *LocationGetter = Function::Create( + FunctionType::get(Int64Ty, false), GlobalValue::ExternalLinkage, + "__san_get_location_value", M); + if (OldFn) { + OldFn->replaceAllUsesWith(LocationGetter); + OldFn->eraseFromParent(); + } + auto *EntryBB = BasicBlock::Create(Ctx, "entry", LocationGetter); + IRBuilder<> IRB(EntryBB); + Value *Idx = IRB.CreateCall(getThreadIdFn(), {}, "san.gtid"); + Value *Ptr = IRB.CreateGEP(Int64Ty, LocationsArray, {Idx}); + auto *LocationValue = IRB.CreateLoad(Int64Ty, Ptr); + IRB.CreateRet(LocationValue); + } + + Function *InitSharedFn = + Function::Create(FunctionType::get(VoidTy, false), + GlobalValue::PrivateLinkage, "__san.init_shared", &M); + auto *EntryBB = BasicBlock::Create(Ctx, "entry", InitSharedFn); + IRBuilder<> IRB(EntryBB); + if (!AmbiguousCalls.empty()) { + Value *Idx = IRB.CreateCall(getThreadIdFn(), {}, "san.gtid"); + Value *Ptr = IRB.CreateGEP(Int64Ty, LocationsArray, {Idx}); + IRB.CreateStore(ConstantInt::get(Int64Ty, 0), Ptr); + } + IRB.CreateRetVoid(); + + for (auto *KernelFn : Kernels) { + IRBuilder<> IRB(&*KernelFn->getEntryBlock().getFirstNonPHIOrDbgOrAlloca()); + IRB.CreateCall(InitSharedFn, {}); + } + + for (const auto &It : llvm::enumerate(AmbiguousCallsOrdered)) { + IRBuilder<> IRB(It.value()); + Value *Idx = IRB.CreateCall(getThreadIdFn(), {}, "san.gtid"); + Value *Ptr = IRB.CreateGEP(Int64Ty, LocationsArray, {Idx}); + Value *OldVal = 
IRB.CreateLoad(Int64Ty, Ptr); + Value *OldValShifted = IRB.CreateShl( + OldVal, ConstantInt::get(Int64Ty, AmbiguousCallsBitWidth)); + Value *NewVal = IRB.CreateBinOp(Instruction::Or, OldValShifted, + ConstantInt::get(Int64Ty, It.index() + 1)); + IRB.CreateStore(NewVal, Ptr); + IRB.SetInsertPoint(It.value()->getNextNode()); + IRB.CreateStore(OldVal, Ptr); + } + + auto *NamesTy = ArrayType::get(Int8Ty, ConcatenatedString.size() + 1); + auto *Names = new GlobalVariable( + M, NamesTy, /*isConstant=*/true, GlobalValue::ExternalLinkage, + ConstantDataArray::getString(Ctx, ConcatenatedString), + "__san.location_names", nullptr, + GlobalValue::ThreadLocalMode::NotThreadLocal, 4); + Names->setVisibility(GlobalValue::ProtectedVisibility); + + auto *ArrayTy = ArrayType::get(Int64Ty, LocationEncoding.size()); + auto *GV = new GlobalVariable( + M, ArrayTy, /*isConstant=*/true, GlobalValue::ExternalLinkage, + ConstantArray::get(ArrayTy, LocationEncoding), "__san.locations", nullptr, + GlobalValue::ThreadLocalMode::NotThreadLocal, 4); + GV->setVisibility(GlobalValue::ProtectedVisibility); + + M.dump(); + return Changed; +} + +PreservedAnalyses GPUSanPass::run(Module &M, ModuleAnalysisManager &AM) { + FunctionAnalysisManager &FAM = + AM.getResult(M).getManager(); + GPUSanImpl Lowerer(M, FAM); + if (!Lowerer.instrument()) + return PreservedAnalyses::all(); + LLVM_DEBUG(M.dump()); + + return PreservedAnalyses::none(); +} diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt index d88430a52b8b7..644befc532ab2 100644 --- a/offload/DeviceRTL/CMakeLists.txt +++ b/offload/DeviceRTL/CMakeLists.txt @@ -69,6 +69,7 @@ elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "auto" OR "${LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST};${LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST}") endif() list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES) +set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${LIBOMPTARGET_DEVICE_ARCHITECTURES} PARENT_SCOPE) set(include_files ${include_directory}/Allocator.h 
@@ -85,6 +86,7 @@ set(include_files set(src_files ${source_directory}/Allocator.cpp + ${source_directory}/Sanitizer.cpp ${source_directory}/Configuration.cpp ${source_directory}/Debug.cpp ${source_directory}/Kernel.cpp diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h index a28eb0fb2977e..6bb1cafac720f 100644 --- a/offload/DeviceRTL/include/Allocator.h +++ b/offload/DeviceRTL/include/Allocator.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_ALLOCATOR_H #define OMPTARGET_ALLOCATOR_H -#include "Types.h" +#include "DeviceTypes.h" // Forward declaration. struct KernelEnvironmentTy; diff --git a/offload/DeviceRTL/include/Configuration.h b/offload/DeviceRTL/include/Configuration.h index 8e6f5c89cbf24..f8b7a6c3c6c9d 100644 --- a/offload/DeviceRTL/include/Configuration.h +++ b/offload/DeviceRTL/include/Configuration.h @@ -15,7 +15,7 @@ #include "Shared/Environment.h" -#include "Types.h" +#include "DeviceTypes.h" namespace ompx { namespace config { diff --git a/offload/DeviceRTL/include/Types.h b/offload/DeviceRTL/include/DeviceTypes.h similarity index 99% rename from offload/DeviceRTL/include/Types.h rename to offload/DeviceRTL/include/DeviceTypes.h index 2e12d9da0353b..bf30ba31260f5 100644 --- a/offload/DeviceRTL/include/Types.h +++ b/offload/DeviceRTL/include/DeviceTypes.h @@ -1,4 +1,4 @@ -//===---------- Types.h - OpenMP types ---------------------------- C++ -*-===// +//===---- DeviceTypes.h - OpenMP types ---------------------------- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h new file mode 100644 index 0000000000000..7b8871a766161 --- /dev/null +++ b/offload/DeviceRTL/include/DeviceUtils.h @@ -0,0 +1,55 @@ +//===--------- DeviceUtils.h - OpenMP device runtime utility functions -- C++ +//-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_DEVICE_UTILS_H +#define OMPTARGET_DEVICERTL_DEVICE_UTILS_H + +#include "Shared/Utils.h" +#include "DeviceTypes.h" + +#pragma omp begin declare target device_type(nohost) + +namespace utils { + +/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread +/// is identified by \p Mask. +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane); + +int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width); + +int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width); + +uint64_t ballotSync(uint64_t Mask, int32_t Pred); + +/// Return \p LowBits and \p HighBits packed into a single 64 bit value. +uint64_t pack(uint32_t LowBits, uint32_t HighBits); + +/// Unpack \p Val into \p LowBits and \p HighBits. +void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits); + +/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)). +bool isSharedMemPtr(void *Ptr); + +/// Return true iff \p Ptr is pointing into (thread) local memory (AS(5)). +bool isThreadLocalMemPtr(void *Ptr); + +/// A pointer variable that has by design an `undef` value. Use with care. 
+[[clang::loader_uninitialized]] static void *const UndefPtr; + +#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true) +#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false) + +} // namespace utils + +#pragma omp end declare target + +#endif diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h index f4854ed3d1678..12244a0971f49 100644 --- a/offload/DeviceRTL/include/Interface.h +++ b/offload/DeviceRTL/include/Interface.h @@ -14,7 +14,7 @@ #include "Shared/Environment.h" -#include "Types.h" +#include "DeviceTypes.h" /// External API /// diff --git a/offload/DeviceRTL/include/LibC.h b/offload/DeviceRTL/include/LibC.h index dde86af783af9..6e02b4aca462a 100644 --- a/offload/DeviceRTL/include/LibC.h +++ b/offload/DeviceRTL/include/LibC.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_LIBC_H #define OMPTARGET_LIBC_H -#include "Types.h" +#include "DeviceTypes.h" extern "C" { diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h index 165904644dbb9..2fb87abe5418c 100644 --- a/offload/DeviceRTL/include/Mapping.h +++ b/offload/DeviceRTL/include/Mapping.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_MAPPING_H #define OMPTARGET_MAPPING_H -#include "Types.h" +#include "DeviceTypes.h" namespace ompx { diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h index 1a3490394458f..37699529e726f 100644 --- a/offload/DeviceRTL/include/State.h +++ b/offload/DeviceRTL/include/State.h @@ -16,8 +16,8 @@ #include "Debug.h" #include "Mapping.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" // Forward declaration. 
struct KernelEnvironmentTy; diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h index af9e1a673e6a2..874974cc861df 100644 --- a/offload/DeviceRTL/include/Synchronization.h +++ b/offload/DeviceRTL/include/Synchronization.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H #define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H -#include "Types.h" +#include "DeviceTypes.h" namespace ompx { diff --git a/offload/DeviceRTL/include/Utils.h b/offload/DeviceRTL/include/Utils.h deleted file mode 100644 index 82e2397b5958b..0000000000000 --- a/offload/DeviceRTL/include/Utils.h +++ /dev/null @@ -1,100 +0,0 @@ -//===--------- Utils.h - OpenMP device runtime utility functions -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_UTILS_H -#define OMPTARGET_DEVICERTL_UTILS_H - -#include "Types.h" - -#pragma omp begin declare target device_type(nohost) - -namespace ompx { -namespace utils { - -/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread -/// is identified by \p Mask. -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane); - -int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width); - -int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width); - -uint64_t ballotSync(uint64_t Mask, int32_t Pred); - -/// Return \p LowBits and \p HighBits packed into a single 64 bit value. -uint64_t pack(uint32_t LowBits, uint32_t HighBits); - -/// Unpack \p Val into \p LowBits and \p HighBits. 
-void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits); - -/// Round up \p V to a \p Boundary. -template inline Ty roundUp(Ty V, Ty Boundary) { - return (V + Boundary - 1) / Boundary * Boundary; -} - -/// Advance \p Ptr by \p Bytes bytes. -template inline Ty1 *advance(Ty1 Ptr, Ty2 Bytes) { - return reinterpret_cast(reinterpret_cast(Ptr) + Bytes); -} - -/// Return the first bit set in \p V. -inline uint32_t ffs(uint32_t V) { - static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); - return __builtin_ffs(V); -} - -/// Return the first bit set in \p V. -inline uint32_t ffs(uint64_t V) { - static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); - return __builtin_ffsl(V); -} - -/// Return the number of bits set in \p V. -inline uint32_t popc(uint32_t V) { - static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); - return __builtin_popcount(V); -} - -/// Return the number of bits set in \p V. -inline uint32_t popc(uint64_t V) { - static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); - return __builtin_popcountl(V); -} - -/// Return \p V aligned "upwards" according to \p Align. -template inline Ty1 align_up(Ty1 V, Ty2 Align) { - return ((V + Ty1(Align) - 1) / Ty1(Align)) * Ty1(Align); -} -/// Return \p V aligned "downwards" according to \p Align. -template inline Ty1 align_down(Ty1 V, Ty2 Align) { - return V - V % Align; -} - -/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)). -bool isSharedMemPtr(void *Ptr); - -/// Return \p V typed punned as \p DstTy. -template inline DstTy convertViaPun(SrcTy V) { - return *((DstTy *)(&V)); -} - -/// A pointer variable that has by design an `undef` value. Use with care. 
-[[clang::loader_uninitialized]] static void *const UndefPtr; - -#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true) -#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false) - -} // namespace utils -} // namespace ompx - -#pragma omp end declare target - -#endif diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp index c9c940de62c1a..2a85a34d32f6e 100644 --- a/offload/DeviceRTL/src/Allocator.cpp +++ b/offload/DeviceRTL/src/Allocator.cpp @@ -14,8 +14,8 @@ #include "Configuration.h" #include "Mapping.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp index ef0c3663536f5..4d97ad67313aa 100644 --- a/offload/DeviceRTL/src/Configuration.cpp +++ b/offload/DeviceRTL/src/Configuration.cpp @@ -13,7 +13,7 @@ #include "Configuration.h" #include "State.h" -#include "Types.h" +#include "DeviceTypes.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp index 31cd54e3de35c..bf228ec1b32cc 100644 --- a/offload/DeviceRTL/src/Debug.cpp +++ b/offload/DeviceRTL/src/Debug.cpp @@ -17,7 +17,7 @@ #include "Interface.h" #include "Mapping.h" #include "State.h" -#include "Types.h" +#include "DeviceTypes.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp index 95d4c728016d2..afee129c2589c 100644 --- a/offload/DeviceRTL/src/Kernel.cpp +++ b/offload/DeviceRTL/src/Kernel.cpp @@ -18,7 +18,7 @@ #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" +#include "DeviceTypes.h" #include "llvm/Frontend/OpenMP/OMPDeviceConstants.h" diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp index c1ce878746a69..4d89c11ee7e0f 100644 --- a/offload/DeviceRTL/src/Mapping.cpp +++ 
b/offload/DeviceRTL/src/Mapping.cpp @@ -12,8 +12,8 @@ #include "Mapping.h" #include "Interface.h" #include "State.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #pragma omp begin declare target device_type(nohost) @@ -364,6 +364,13 @@ _TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel) _TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock) _TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel) +extern "C" [[clang::disable_sanitizer_instrumentation, gnu::flatten, + gnu::always_inline, gnu::used, gnu::retain]] int +ompx_global_thread_id() { + return ompx_thread_id(0) + ompx_thread_id(1) * ompx_block_dim(0) + + ompx_thread_id(2) * ompx_block_dim(0) * ompx_block_dim(1); +} + extern "C" { uint64_t ompx_ballot_sync(uint64_t mask, int pred) { return utils::ballotSync(mask, pred); diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp index c24af9442d16e..ca8b549b28dbf 100644 --- a/offload/DeviceRTL/src/Misc.cpp +++ b/offload/DeviceRTL/src/Misc.cpp @@ -10,7 +10,7 @@ //===----------------------------------------------------------------------===// #include "Configuration.h" -#include "Types.h" +#include "DeviceTypes.h" #include "Debug.h" diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp index 031a5ced25518..2a510e9531657 100644 --- a/offload/DeviceRTL/src/Parallelism.cpp +++ b/offload/DeviceRTL/src/Parallelism.cpp @@ -37,8 +37,8 @@ #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp index 744d1a3a231c8..f4e2e0d25bde9 100644 --- a/offload/DeviceRTL/src/Reduction.cpp +++ b/offload/DeviceRTL/src/Reduction.cpp @@ -15,8 +15,8 @@ #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" +#include 
"DeviceTypes.h" +#include "DeviceUtils.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Sanitizer.cpp b/offload/DeviceRTL/src/Sanitizer.cpp new file mode 100644 index 0000000000000..2acf15c22d992 --- /dev/null +++ b/offload/DeviceRTL/src/Sanitizer.cpp @@ -0,0 +1,426 @@ +//===------ Sanitizer.cpp - Track allocation for sanitizer checks ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "DeviceTypes.h" +#include "DeviceUtils.h" +#include "Interface.h" +#include "LibC.h" +#include "Shared/Environment.h" +#include "Synchronization.h" + +using namespace ompx; +using namespace utils; + +#pragma omp begin declare target device_type(nohost) + +#include "Shared/Sanitizer.h" + +struct AllocationInfoLocalTy { + _AS_PTR(void, AllocationKind::LOCAL) Start; + uint64_t Length; + uint32_t Tag; +}; +struct AllocationInfoGlobalTy { + _AS_PTR(void, AllocationKind::GLOBAL) Start; + uint64_t Length; + uint32_t Tag; +}; + +template struct AllocationInfoTy {}; +template <> struct AllocationInfoTy { + using ASVoidPtrTy = AllocationInfoGlobalTy; +}; +template <> struct AllocationInfoTy { + using ASVoidPtrTy = AllocationInfoLocalTy; +}; + +template <> +AllocationPtrTy +AllocationPtrTy::get(_AS_PTR(void, AllocationKind::LOCAL) + P) { + TypePunUnion TPU; + TPU.P = (void *)P; + return TPU.AP; +} + +template <> +AllocationPtrTy::operator _AS_PTR( + void, AllocationKind::LOCAL)() const { + TypePunUnion TPU; + TPU.AP = *this; + return TPU.AddrP; +} + +template struct AllocationTracker { + static_assert(sizeof(AllocationTy) == sizeof(_AS_PTR(void, AK)) * 2, + "AllocationTy should not exceed two pointers"); + // 
static_assert(sizeof(AllocationPtrTy) * 8 == + // SanitizerConfig::ADDR_SPACE_PTR_SIZE, + // "AllocationTy pointers should be pointer sized"); + + [[clang::disable_sanitizer_instrumentation]] static + typename AllocationInfoTy::ASVoidPtrTy + getAllocationInfo(_AS_PTR(void, AK) P) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + uint32_t AllocationId = AP.AllocationId; + if (OMP_UNLIKELY(AllocationId >= SanitizerConfig::SLOTS)) + return {P, 0, (uint32_t)-1}; + auto &A = getAllocation(AP); + return {A.Start, A.Length, (uint32_t)A.Tag}; + } + + [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK) + create(_AS_PTR(void, AK) Start, uint64_t Length, int64_t AllocationId, + uint64_t Slot, int64_t SourceId) { + if constexpr (SanitizerConfig::OFFSET_BITS < 64) + if (OMP_UNLIKELY(Length >= (1UL << (SanitizerConfig::OFFSET_BITS)))) + __sanitizer_trap_info_ptr->exceedsAllocationLength( + Start, Length, AllocationId, Slot, SourceId); + + // Reserve the 0 element for the null pointer in global space. 
+ auto &AllocArr = getAllocationArray(); + auto &Cnt = AllocArr.Cnt; + if constexpr (AK == AllocationKind::LOCAL) + Slot = ++Cnt; + + uint64_t NumSlots = SanitizerConfig::SLOTS; + if (OMP_UNLIKELY(Slot >= NumSlots)) + __sanitizer_trap_info_ptr->exceedsAllocationSlots( + Start, Length, AllocationId, Slot, SourceId); + + auto &A = AllocArr.Arr[Slot]; + + A.Start = Start; + A.Length = Length; + A.Id = AllocationId; + + AllocationPtrTy AP; + AP.Offset = 0; + if constexpr (SanitizerConfig::useTags()) { + AP.AllocationTag = ++A.Tag; + } + AP.AllocationId = Slot; + AP.Magic = SanitizerConfig::MAGIC; + AP.Kind = (uint64_t)AK; + return AP; + } + + [[clang::disable_sanitizer_instrumentation]] static void + remove(_AS_PTR(void, AK) P, int64_t SourceId) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + uint64_t AllocationId = AP.AllocationId; + auto &AllocArr = getAllocationArray(); + auto &A = AllocArr.Arr[AllocationId]; + A.Length = 0; + + auto &Cnt = AllocArr.Cnt; + if constexpr (AK == AllocationKind::LOCAL) { + if (Cnt == AllocationId) + --Cnt; + } + } + + [[clang::disable_sanitizer_instrumentation]] static void remove_n(int32_t N) { + static_assert(AK == AllocationKind::LOCAL, ""); + auto &AllocArr = getAllocationArray(); + auto &Cnt = AllocArr.Cnt; + for (int32_t I = 0; I < N; ++I) { + auto &A = AllocArr.Arr[Cnt--]; + A.Length = 0; + } + } + + [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK) + advance(_AS_PTR(void, AK) P, uint64_t Offset, int64_t SourceId) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + AP.Offset += Offset; + return AP; + } + + [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK) + checkWithBase(_AS_PTR(void, AK) P, _AS_PTR(void, AK) Start, + int64_t Length, uint32_t Tag, int64_t Size, + int64_t AccessId, int64_t SourceId) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + if constexpr (AK == AllocationKind::LOCAL) + if (Length == 0) + Length = getAllocation(AP, AccessId).Length; + if constexpr (AK == 
AllocationKind::GLOBAL) + if (AP.Magic != SanitizerConfig::MAGIC) + __sanitizer_trap_info_ptr->garbagePointer(AP, (void *)P, SourceId); + int64_t Offset = AP.Offset; + if (OMP_UNLIKELY( + Offset > Length - Size || + (SanitizerConfig::useTags() && Tag != AP.AllocationTag))) { + __sanitizer_trap_info_ptr->accessError(AP, Size, AccessId, SourceId); + } + return utils::advancePtr(Start, Offset); + } + + [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK) + check(_AS_PTR(void, AK) P, int64_t Size, int64_t AccessId, + int64_t SourceId) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + auto &Alloc = getAllocation(AP, AccessId); + return checkWithBase(P, Alloc.Start, Alloc.Length, Alloc.Tag, Size, + AccessId, SourceId); + } + + [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK) + unpack(_AS_PTR(void, AK) P, int64_t SourceId = 0) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + auto &A = getAllocation(AP); + uint64_t Offset = AP.Offset; + _AS_PTR(void, AK) Ptr = utils::advancePtr(A.Start, Offset); + return Ptr; + } + + [[clang::disable_sanitizer_instrumentation]] static void + lifetimeStart(_AS_PTR(void, AK) P, uint64_t Length) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + auto &A = getAllocation(AP); + // TODO: Check length + A.Length = Length; + } + + [[clang::disable_sanitizer_instrumentation]] static void + lifetimeEnd(_AS_PTR(void, AK) P, uint64_t Length) { + AllocationPtrTy AP = AllocationPtrTy::get(P); + auto &A = getAllocation(AP); + // TODO: Check length + A.Length = 0; + } + + [[clang::disable_sanitizer_instrumentation]] static void leakCheck() { + static_assert(AK == AllocationKind::GLOBAL, ""); + auto &AllocArr = getAllocationArray(); + for (uint64_t Slot = 0; Slot < SanitizerConfig::SLOTS; ++Slot) { + auto &A = AllocArr.Arr[Slot]; + if (OMP_UNLIKELY(A.Length)) + __sanitizer_trap_info_ptr->memoryLeak(A, Slot); + } + } +}; + +template +AllocationArrayTy + Allocations::Arr[SanitizerConfig::NUM_ALLOCATION_ARRAYS]; + 
+static void checkForMagic(bool IsGlobal, void *P, int64_t SourceId) { + if (IsGlobal) { + auto AP = AllocationPtrTy::get(P); + if (AP.Magic != SanitizerConfig::MAGIC) + __sanitizer_trap_info_ptr->garbagePointer( + AP, P, SourceId); + } else { + auto AP = AllocationPtrTy::get(P); + if (AP.Magic != SanitizerConfig::MAGIC) + __sanitizer_trap_info_ptr->garbagePointer( + AP, P, SourceId); + } +} + +extern "C" { + +#define REAL_PTR_IS_LOCAL(PTR) (isThreadLocalMemPtr(PTR)) +#define IS_GLOBAL(PTR) ((uintptr_t)PTR & (1UL << 63)) + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL) + ompx_new_local(_AS_PTR(void, AllocationKind::LOCAL) Start, uint64_t Length, + int64_t AllocationId, uint32_t Slot, int64_t SourceId) { + return AllocationTracker::create( + Start, Length, AllocationId, Slot, SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL) + ompx_new_global(_AS_PTR(void, AllocationKind::GLOBAL) Start, + uint64_t Length, int64_t AllocationId, uint32_t Slot, + int64_t SourceId) { + return AllocationTracker::create( + Start, Length, AllocationId, Slot, SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +__sanitizer_register_host(_AS_PTR(void, AllocationKind::GLOBAL) Start, + uint64_t Length, uint64_t Slot, int64_t SourceId) { + AllocationTracker::create(Start, Length, Slot, Slot, + SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void * +ompx_new(void *Start, uint64_t Length, int64_t AllocationId, uint32_t Slot, + int64_t SourceId) { + if (REAL_PTR_IS_LOCAL(Start)) + return (void *)ompx_new_local((_AS_PTR(void, AllocationKind::LOCAL))Start, + Length, AllocationId, Slot, SourceId); + return (void *)ompx_new_global((_AS_PTR(void, 
AllocationKind::GLOBAL))Start, + Length, AllocationId, Slot, SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_free_local_n(int32_t N) { + return AllocationTracker::remove_n(N); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +__sanitizer_unregister_host(_AS_PTR(void, AllocationKind::GLOBAL) P) { + AllocationTracker::remove(P, /*SourceId=*/0); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_free_local(_AS_PTR(void, AllocationKind::LOCAL) P, int64_t SourceId) { + return AllocationTracker::remove(P, SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_free_global(_AS_PTR(void, AllocationKind::GLOBAL) P, int64_t SourceId) { + return AllocationTracker::remove(P, SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_free(void *P, int64_t SourceId) { + bool IsGlobal = IS_GLOBAL(P); + checkForMagic(IsGlobal, P, SourceId); + if (IsGlobal) + return ompx_free_global((_AS_PTR(void, AllocationKind::GLOBAL))P, SourceId); + return ompx_free_local((_AS_PTR(void, AllocationKind::LOCAL))P, SourceId); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL) + ompx_gep_local(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t Offset, + int64_t SourceId) { + return AllocationTracker::advance(P, Offset, SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL) + ompx_gep_global(_AS_PTR(void, AllocationKind::GLOBAL) P, uint64_t Offset, + int64_t SourceId) { + return AllocationTracker::advance(P, Offset, + SourceId); +} 
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void * +ompx_gep(void *P, uint64_t Offset, int64_t SourceId) { + bool IsGlobal = IS_GLOBAL(P); + checkForMagic(IsGlobal, P, SourceId); + if (IsGlobal) + return (void *)ompx_gep_global((_AS_PTR(void, AllocationKind::GLOBAL))P, + Offset, SourceId); + return (void *)ompx_gep_local((_AS_PTR(void, AllocationKind::LOCAL))P, Offset, + SourceId); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL) + ompx_check_local(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t Size, + uint64_t AccessId, int64_t SourceId) { + return AllocationTracker::check(P, Size, AccessId, + SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL) + ompx_check_global(_AS_PTR(void, AllocationKind::GLOBAL) P, uint64_t Size, + uint64_t AccessId, int64_t SourceId) { + return AllocationTracker::check(P, Size, AccessId, + SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void * +ompx_check(void *P, uint64_t Size, uint64_t AccessId, int64_t SourceId) { + bool IsGlobal = IS_GLOBAL(P); + checkForMagic(IsGlobal, P, SourceId); + if (IsGlobal) + return (void *)ompx_check_global((_AS_PTR(void, AllocationKind::GLOBAL))P, + Size, AccessId, SourceId); + return (void *)ompx_check_local((_AS_PTR(void, AllocationKind::LOCAL))P, Size, + AccessId, SourceId); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL) + ompx_check_with_base_local(_AS_PTR(void, AllocationKind::LOCAL) P, + _AS_PTR(void, AllocationKind::LOCAL) Start, + uint64_t Length, uint32_t Tag, uint64_t Size, + uint64_t AccessId, int64_t SourceId) { + return AllocationTracker::checkWithBase( + P, 
Start, Length, Tag, Size, AccessId, SourceId); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL) + ompx_check_with_base_global(_AS_PTR(void, AllocationKind::GLOBAL) P, + _AS_PTR(void, AllocationKind::GLOBAL) Start, + uint64_t Length, uint32_t Tag, uint64_t Size, + uint64_t AccessId, int64_t SourceId) { + return AllocationTracker::checkWithBase( + P, Start, Length, Tag, Size, AccessId, SourceId); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL) + ompx_unpack_local(_AS_PTR(void, AllocationKind::LOCAL) P, + int64_t SourceId) { + return AllocationTracker::unpack(P, SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL) + ompx_unpack_global(_AS_PTR(void, AllocationKind::GLOBAL) P, + int64_t SourceId) { + return AllocationTracker::unpack(P, SourceId); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void * +ompx_unpack(void *P, int64_t SourceId) { + bool IsGlobal = IS_GLOBAL(P); + checkForMagic(IsGlobal, P, SourceId); + if (IsGlobal) + return (void *)ompx_unpack_global((_AS_PTR(void, AllocationKind::GLOBAL))P, + SourceId); + return (void *)ompx_unpack_local((_AS_PTR(void, AllocationKind::LOCAL))P, + SourceId); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_lifetime_start(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t Length) { + AllocationTracker::lifetimeStart(P, Length); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_lifetime_end(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t Length) { + AllocationTracker::lifetimeEnd(P, Length); +} + 
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] struct AllocationInfoLocalTy +ompx_get_allocation_info_local(_AS_PTR(void, AllocationKind::LOCAL) P) { + return AllocationTracker::getAllocationInfo(P); +} +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] struct AllocationInfoGlobalTy +ompx_get_allocation_info_global(_AS_PTR(void, AllocationKind::GLOBAL) P) { + return AllocationTracker::getAllocationInfo(P); +} + +[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline, + gnu::used, gnu::retain]] void +ompx_leak_check() { + AllocationTracker::leakCheck(); +} + +[[gnu::weak, gnu::noinline, gnu::used, gnu::retain]] int64_t +__san_get_location_value() { + return -1; +} +} + +#pragma omp end declare target diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp index a1e4fa2449d9a..cb83b7839b5b2 100644 --- a/offload/DeviceRTL/src/State.cpp +++ b/offload/DeviceRTL/src/State.cpp @@ -13,13 +13,13 @@ #include "Allocator.h" #include "Configuration.h" #include "Debug.h" +#include "DeviceUtils.h" #include "Interface.h" #include "LibC.h" #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" using namespace ompx; @@ -84,14 +84,14 @@ struct SharedMemorySmartStackTy { /// Deallocate the last allocation made by the encountering thread and pointed /// to by \p Ptr from the stack. Each thread can call this function. - void pop(void *Ptr, uint32_t Bytes); + void pop(void *Ptr, uint64_t Bytes); private: /// Compute the size of the storage space reserved for a thread. 
uint32_t computeThreadStorageTotal() { uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock(); - return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock), - allocator::ALIGNMENT); + return utils::alignDown((state::SharedScratchpadSize / NumLanesInBlock), + allocator::ALIGNMENT); } /// Return the top address of the warp data stack, that is the first address @@ -121,7 +121,7 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) { // First align the number of requested bytes. /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to /// be passed in as an argument and the stack rewritten to support it. - uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT); + uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT); uint32_t StorageTotal = computeThreadStorageTotal(); @@ -148,8 +148,8 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) { return GlobalMemory; } -void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) { - uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT); +void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) { + uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT); if (utils::isSharedMemPtr(Ptr)) { int TId = mapping::getThreadIdInBlock(); Usage[TId] -= AlignedBytes; diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp index 80ba87b300bcd..97a6b080169ad 100644 --- a/offload/DeviceRTL/src/Synchronization.cpp +++ b/offload/DeviceRTL/src/Synchronization.cpp @@ -16,8 +16,8 @@ #include "Interface.h" #include "Mapping.h" #include "State.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #pragma omp begin declare target device_type(nohost) diff --git a/offload/DeviceRTL/src/Tasking.cpp b/offload/DeviceRTL/src/Tasking.cpp index 2dc33562e6d79..34cb67cb1a351 100644 --- a/offload/DeviceRTL/src/Tasking.cpp +++ b/offload/DeviceRTL/src/Tasking.cpp @@ 
-13,10 +13,10 @@ // //===----------------------------------------------------------------------===// +#include "DeviceUtils.h" #include "Interface.h" #include "State.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" using namespace ompx; @@ -34,7 +34,7 @@ TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t, TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal( TaskSizeTotal, "explicit task descriptor"); TaskDescriptor->Payload = - utils::advance(TaskDescriptor, TaskSizeInclPrivateValuesPadded); + utils::advancePtr(TaskDescriptor, TaskSizeInclPrivateValuesPadded); TaskDescriptor->TaskFn = TaskFn; return TaskDescriptor; diff --git a/offload/DeviceRTL/src/Utils.cpp b/offload/DeviceRTL/src/Utils.cpp index 53cc803234867..956e6200ffd5c 100644 --- a/offload/DeviceRTL/src/Utils.cpp +++ b/offload/DeviceRTL/src/Utils.cpp @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -#include "Utils.h" +#include "DeviceUtils.h" #include "Debug.h" #include "Interface.h" @@ -22,6 +22,7 @@ using namespace ompx; namespace impl { bool isSharedMemPtr(const void *Ptr) { return false; } +bool isThreadLocalMemPtr(const void *Ptr) { return false; } void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) { static_assert(sizeof(unsigned long) == 8, ""); @@ -67,6 +68,10 @@ bool isSharedMemPtr(const void *Ptr) { return __builtin_amdgcn_is_shared( (const __attribute__((address_space(0))) void *)Ptr); } +bool isThreadLocalMemPtr(const void *Ptr) { + return __builtin_amdgcn_is_private( + (const __attribute__((address_space(0))) void *)Ptr); +} #pragma omp end declare variant ///} @@ -92,6 +97,8 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred) { bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); } +bool isThreadLocalMemPtr(const void *Ptr) { return __nvvm_isspacep_local(Ptr); } + #pragma omp end declare variant ///} } // namespace impl @@ -127,6 +134,9 @@ uint64_t 
utils::ballotSync(uint64_t Mask, int32_t Pred) { } bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); } +bool utils::isThreadLocalMemPtr(void *Ptr) { + return impl::isThreadLocalMemPtr(Ptr); +} extern "C" { int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) { diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp index bcb7c5ad50a18..fd835f2af677e 100644 --- a/offload/DeviceRTL/src/Workshare.cpp +++ b/offload/DeviceRTL/src/Workshare.cpp @@ -17,8 +17,8 @@ #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/exports b/offload/DeviceRTL/src/exports index 288ddf90b4a9f..b725cca7e1c1d 100644 --- a/offload/DeviceRTL/src/exports +++ b/offload/DeviceRTL/src/exports @@ -16,3 +16,5 @@ free memcmp printf __assert_fail +LocalAllocs +LocalCnt diff --git a/offload/include/OpenMP/Mapping.h b/offload/include/OpenMP/Mapping.h index b9f5c16582931..d310c6c707f8f 100644 --- a/offload/include/OpenMP/Mapping.h +++ b/offload/include/OpenMP/Mapping.h @@ -71,6 +71,7 @@ struct HostDataToTargetTy { const uintptr_t TgtAllocBegin; // allocated target memory const uintptr_t TgtPtrBegin; // mapped target memory = TgtAllocBegin + padding + void *FakeTgtPtrBegin = 0; // mapped target memory = TgtAllocBegin + padding private: static const uint64_t INFRefCount = ~(uint64_t)0; @@ -125,9 +126,10 @@ struct HostDataToTargetTy { HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TgtAllocBegin, uintptr_t TgtPtrBegin, bool UseHoldRefCount, map_var_info_t Name = nullptr, - bool IsINF = false) + bool IsINF = false, void *FakeTgtPtrBegin = nullptr) : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), HstPtrName(Name), TgtAllocBegin(TgtAllocBegin), TgtPtrBegin(TgtPtrBegin), + FakeTgtPtrBegin(FakeTgtPtrBegin), States(std::make_unique(UseHoldRefCount ? 
0 : IsINF ? INFRefCount : 1, diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h index 5b22bbaac144f..f96b2f9ca259d 100644 --- a/offload/include/Shared/APITypes.h +++ b/offload/include/Shared/APITypes.h @@ -85,6 +85,9 @@ struct __tgt_async_info { /// ensure it is a valid location while the transfer to the device is /// happening. KernelLaunchEnvironmentTy KernelLaunchEnvironment; + + /// Flag to indicate the Queue should be persistent. + bool PersistentQueue = false; }; /// This struct contains all of the arguments to a target kernel region launch. @@ -102,19 +105,24 @@ struct KernelArgsTy { 0; // Tripcount for the teams / distribute loop, 0 otherwise. struct { uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause. - uint64_t Unused : 63; - } Flags = {0, 0}; + uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA. + uint64_t Unused : 62; + } Flags = {0, 0, 0}; // The number of teams (for x,y,z dimension). uint32_t NumTeams[3] = {0, 0, 0}; // The number of threads (for x,y,z dimension). uint32_t ThreadLimit[3] = {0, 0, 0}; uint32_t DynCGroupMem = 0; // Amount of dynamic cgroup memory requested. + // A __tgt_async_info queue pointer to be used for the kernel and all + // associated device interactions. The operations are implicitly made + // non-blocking. + void *AsyncInfoQueue = nullptr; }; static_assert(sizeof(KernelArgsTy().Flags) == sizeof(uint64_t), "Invalid struct size"); static_assert(sizeof(KernelArgsTy) == (8 * sizeof(int32_t) + 3 * sizeof(int64_t) + - 4 * sizeof(void **) + 2 * sizeof(int64_t *)), + 5 * sizeof(void **) + 2 * sizeof(int64_t *)), "Invalid struct size"); /// Flat array of kernel launch parameters and their total size. 
diff --git a/offload/include/Shared/RefCnt.h b/offload/include/Shared/RefCnt.h new file mode 100644 index 0000000000000..7c615ba167a3d --- /dev/null +++ b/offload/include/Shared/RefCnt.h @@ -0,0 +1,56 @@ +//===-- Shared/RefCnt.h - Helper to keep track of references --- C++ ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_SHARED_REF_CNT_H +#define OMPTARGET_SHARED_REF_CNT_H + +#include +#include +#include +#include + +namespace llvm { +namespace omp { +namespace target { + +/// Utility class for thread-safe reference counting. Any class that needs +/// objects' reference counting can inherit from this entity or have it as a +/// class data member. +template +struct RefCountTy { + /// Create a refcount object initialized to zero. + RefCountTy() : Refs(0) {} + + ~RefCountTy() { assert(Refs == 0 && "Destroying with non-zero refcount"); } + + /// Increase the reference count atomically. + void increase() { Refs.fetch_add(1, MemoryOrder); } + + /// Decrease the reference count and return whether it became zero. Decreasing + /// the counter in more units than it was previously increased results in + /// undefined behavior. + bool decrease() { + Ty Prev = Refs.fetch_sub(1, MemoryOrder); + assert(Prev > 0 && "Invalid refcount"); + return (Prev == 1); + } + + Ty get() const { return Refs.load(MemoryOrder); } + +private: + /// The atomic reference counter. 
+ std::atomic Refs; +}; +} // namespace target +} // namespace omp +} // namespace llvm + +#endif diff --git a/offload/include/Shared/Sanitizer.h b/offload/include/Shared/Sanitizer.h new file mode 100644 index 0000000000000..79e3af2d33bbd --- /dev/null +++ b/offload/include/Shared/Sanitizer.h @@ -0,0 +1,358 @@ +//===-- Shared/SanitizerHost.h - OFfload sanitizer host logic ----- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_SHARED_SANITIZER_HOST_H +#define OMPTARGET_SHARED_SANITIZER_HOST_H + +#include "Types.h" +#include "Utils.h" + +extern "C" { +int ompx_block_id(int Dim); +int ompx_block_dim(int Dim); +int ompx_thread_id(int Dim); +int64_t __san_get_location_value(); +} + +enum class AllocationKind { LOCAL, GLOBAL, LAST = GLOBAL }; + +template struct ASTypes { + using INT_TY = uint64_t; +}; +#pragma omp begin declare variant match(device = {arch(amdgcn)}) +template <> struct ASTypes { + using INT_TY = uint32_t; +}; +#pragma omp end declare variant + +template struct SanitizerConfig { + static constexpr uint32_t ADDR_SPACE = AK == AllocationKind::GLOBAL ? 0 : 5; + static constexpr uint32_t ADDR_SPACE_PTR_SIZE = + sizeof(typename ASTypes::INT_TY) * 8; + + static constexpr uint32_t NUM_ALLOCATION_ARRAYS = + AK == AllocationKind::GLOBAL ? 1 : (1024 * 1024 * 2); + static constexpr uint32_t TAG_BITS = AK == AllocationKind::GLOBAL ? 1 : 8; + static constexpr uint32_t MAGIC_BITS = 3; + static constexpr uint32_t MAGIC = 0b101; + + static constexpr uint32_t OBJECT_BITS = AK == AllocationKind::GLOBAL ? 
10 : 7; + static constexpr uint32_t SLOTS = (1 << (OBJECT_BITS)); + static constexpr uint32_t KIND_BITS = 1; + static constexpr uint32_t Id_BITS = 9 - KIND_BITS; + + static constexpr uint32_t LENGTH_BITS = + ADDR_SPACE_PTR_SIZE - TAG_BITS - Id_BITS; + static constexpr uint32_t OFFSET_BITS = + ADDR_SPACE_PTR_SIZE - TAG_BITS - OBJECT_BITS - KIND_BITS - MAGIC_BITS; + + static constexpr bool useTags() { return TAG_BITS > 1; } + + static_assert(LENGTH_BITS + TAG_BITS + Id_BITS == ADDR_SPACE_PTR_SIZE, + "Length, tag, and Id bits should cover one pointer"); + static_assert(OFFSET_BITS + TAG_BITS + OBJECT_BITS + MAGIC_BITS + KIND_BITS == + ADDR_SPACE_PTR_SIZE, + "Offset, tag, object, and kind bits should cover one pointer"); + static_assert((1 << KIND_BITS) >= ((uint64_t)AllocationKind::LAST + 1), + "Kind bits should match allocation kinds"); +}; + +#define _AS_PTR(TY, AK) \ + TY [[clang::address_space(SanitizerConfig::ADDR_SPACE)]] * + +template struct AllocationTy { + _AS_PTR(void, AK) Start; + typename ASTypes::INT_TY Length : SanitizerConfig::LENGTH_BITS; + typename ASTypes::INT_TY Tag : SanitizerConfig::TAG_BITS; + typename ASTypes::INT_TY Id : SanitizerConfig::Id_BITS; +}; + +template struct AllocationArrayTy { + AllocationTy Arr[SanitizerConfig::SLOTS]; + uint64_t Cnt; +}; + +template struct AllocationPtrTy { + static AllocationPtrTy get(_AS_PTR(void, AK) P) { + return utils::convertViaPun>(P); + } + static AllocationPtrTy get(void *P) { + return get((_AS_PTR(void, AK))(P)); + } + operator _AS_PTR(void, AK)() const { + return utils::convertViaPun<_AS_PTR(void, AK)>(*this); + } + operator typename ASTypes::INT_TY() const { + return utils::convertViaPun::INT_TY>(*this); + } + typename ASTypes::INT_TY Offset : SanitizerConfig::OFFSET_BITS; + typename ASTypes::INT_TY AllocationTag : SanitizerConfig::TAG_BITS; + typename ASTypes::INT_TY AllocationId : SanitizerConfig::OBJECT_BITS; + typename ASTypes::INT_TY Magic : SanitizerConfig::MAGIC_BITS; + // Must be last, TODO: 
merge into TAG + typename ASTypes::INT_TY Kind : SanitizerConfig::KIND_BITS; +}; +#pragma omp begin declare variant match(device = {arch(amdgcn)}) +static_assert(sizeof(AllocationPtrTy) * 8 == 32); +#pragma omp end declare variant + +union TypePunUnion { + uint64_t I; + void *P; + _AS_PTR(void, AllocationKind::LOCAL) AddrP; + struct { + AllocationPtrTy AP; + uint32_t U; + }; +}; +#pragma omp begin declare variant match(device = {arch(amdgcn)}) +static_assert(sizeof(TypePunUnion) * 8 == 64); +#pragma omp end declare variant + +static inline void *__offload_get_new_sanitizer_ptr(int32_t Slot) { + AllocationPtrTy AP; + AP.Offset = 0; + AP.AllocationId = Slot; + AP.Magic = SanitizerConfig::MAGIC; + AP.Kind = (uint32_t)AllocationKind::GLOBAL; + return (void *)(_AS_PTR(void, AllocationKind::GLOBAL))AP; +} + +template struct Allocations { + static AllocationArrayTy Arr[SanitizerConfig::NUM_ALLOCATION_ARRAYS]; +}; + +struct LocationEncodingTy { + uint64_t FunctionNameIdx; + uint64_t FileNameIdx; + uint64_t LineNo; + uint64_t ColumnNo; + uint64_t ParentIdx; +}; + +struct SanitizerTrapInfoTy { + /// AllocationTy + /// { + void *AllocationStart; + uint64_t AllocationLength; + int32_t AllocationId; + uint32_t AllocationTag; + uint8_t AllocationKind; + ///} + + enum ErrorCodeTy : uint8_t { + None = 0, + ExceedsLength, + ExceedsSlots, + PointerOutsideAllocation, + OutOfBounds, + UseAfterScope, + UseAfterFree, + MemoryLeak, + GarbagePointer, + } ErrorCode; + + /// AllocationTy + /// { + uint64_t PtrOffset; + uint64_t PtrSlot; + uint16_t PtrTag; + uint16_t PtrKind; + ///} + + /// Access + /// { + uint32_t AccessSize; + int64_t AccessId; + /// } + + /// Thread + /// { + uint64_t BlockId[3]; + uint32_t ThreadId[3]; + uint64_t PC; + uint64_t LocationId; + int64_t CallId; + /// } + + [[clang::disable_sanitizer_instrumentation]] void + setCoordinates(int64_t SourceId) { + for (int32_t Dim = 0; Dim < 3; ++Dim) { + BlockId[Dim] = ompx_block_id(Dim); + ThreadId[Dim] = ompx_thread_id(Dim); 
+ } + LocationId = SourceId; + CallId = __san_get_location_value(); + } + + template + [[clang::disable_sanitizer_instrumentation, gnu::always_inline]] void + allocationError(ErrorCodeTy EC, _AS_PTR(void, AK) Start, uint64_t Length, + int64_t Id, int64_t Tag, uint64_t Slot, int64_t SourceId) { + AllocationStart = (void *)Start; + AllocationLength = Length; + AllocationId = Id; + AllocationTag = Tag; + AllocationKind = (decltype(AllocationKind))AK; + PtrSlot = Slot; + + ErrorCode = EC; + setCoordinates(SourceId); + } + + template + [[clang::disable_sanitizer_instrumentation, gnu::always_inline]] void + propagateAccessError(ErrorCodeTy EC, const AllocationTy &A, + const AllocationPtrTy &AP, uint64_t Size, int64_t Id, + int64_t SourceId) { + AllocationStart = (void *)A.Start; + AllocationLength = A.Length; + AllocationId = A.Id; + AllocationTag = A.Tag; + AllocationKind = (decltype(AllocationKind))AK; + + ErrorCode = EC; + + PtrOffset = AP.Offset; + PtrSlot = AP.AllocationId; + PtrTag = AP.AllocationTag; + PtrKind = AP.Kind; + + AccessSize = Size; + AccessId = Id; + + setCoordinates(SourceId); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void + exceedsAllocationLength(_AS_PTR(void, AK) Start, uint64_t Length, + int64_t AllocationId, uint64_t Slot, + int64_t SourceId) { + allocationError(ExceedsLength, Start, Length, AllocationId, /*Tag=*/0, + Slot, SourceId); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void + exceedsAllocationSlots(_AS_PTR(void, AK) Start, uint64_t Length, + int64_t AllocationId, uint64_t Slot, + int64_t SourceId) { + allocationError(ExceedsSlots, Start, Length, AllocationId, /*Tag=*/0, + Slot, SourceId); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void + pointerOutsideAllocation(_AS_PTR(void, AK) Start, uint64_t Length, + int64_t AllocationId, uint64_t Slot, uint64_t PC) { + 
allocationError(PointerOutsideAllocation, Start, Length, AllocationId, + /*Tag=*/0, Slot, PC); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void + outOfBoundAccess(const AllocationTy A, const AllocationPtrTy AP, + uint64_t Size, int64_t AccessId, int64_t SourceId) { + propagateAccessError(OutOfBounds, A, AP, Size, AccessId, SourceId); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void + useAfterScope(const AllocationTy A, const AllocationPtrTy AP, + uint64_t Size, int64_t AccessId, int64_t SourceId) { + propagateAccessError(UseAfterScope, A, AP, Size, AccessId, SourceId); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void + useAfterFree(const AllocationTy A, const AllocationPtrTy AP, + uint64_t Size, int64_t AccessId, int64_t SourceId) { + propagateAccessError(UseAfterFree, A, AP, Size, AccessId, SourceId); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void + accessError(const AllocationPtrTy AP, int64_t Size, int64_t AccessId, + int64_t SourceId); + + template + [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void + garbagePointer(const AllocationPtrTy AP, void *P, int64_t SourceId) { + ErrorCode = GarbagePointer; + AllocationStart = P; + AllocationKind = (decltype(AllocationKind))AK; + PtrOffset = AP.Offset; + PtrSlot = AP.AllocationId; + PtrTag = AP.AllocationTag; + PtrKind = AP.Kind; + setCoordinates(SourceId); + __builtin_trap(); + } + + template + [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void + memoryLeak(const AllocationTy A, uint64_t Slot) { + allocationError(MemoryLeak, A.Start, A.Length, A.Id, A.Tag, Slot, + /*SourceId=*/-1); + __builtin_trap(); + } +}; + +[[gnu::used, gnu::retain, gnu::weak, + gnu::visibility("protected")]] SanitizerTrapInfoTy 
*__sanitizer_trap_info_ptr; + +template +[[clang::disable_sanitizer_instrumentation, + gnu::always_inline]] AllocationArrayTy & +getAllocationArray() { + uint32_t ThreadId = 0, BlockId = 0; + if constexpr (AK == AllocationKind::LOCAL) { + ThreadId = ompx_thread_id(0); + BlockId = ompx_block_id(0); + } + return Allocations::Arr[ThreadId + BlockId * ompx_block_dim(0)]; +} + +template +[[clang::disable_sanitizer_instrumentation, + gnu::always_inline]] AllocationTy & +getAllocation(const AllocationPtrTy AP, int64_t AccessId = 0) { + auto &AllocArr = getAllocationArray(); + uint64_t NumSlots = SanitizerConfig::SLOTS; + uint64_t Slot = AP.AllocationId; + if (Slot >= NumSlots) + __sanitizer_trap_info_ptr->pointerOutsideAllocation(AP, AP.Offset, + AccessId, Slot, 0); + return AllocArr.Arr[Slot]; +} + +template +[[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void +SanitizerTrapInfoTy::accessError(const AllocationPtrTy AP, int64_t Size, + int64_t AccessId, int64_t SourceId) { + auto &A = getAllocationArray().Arr[AP.AllocationId]; + int64_t Offset = AP.Offset; + int64_t Length = A.Length; + if (AK == AllocationKind::LOCAL && Length == 0) + useAfterScope(A, AP, Size, AccessId, SourceId); + else if (Offset > Length - Size) + outOfBoundAccess(A, AP, Size, AccessId, SourceId); + else + useAfterFree(A, AP, Size, AccessId, SourceId); +} + +#endif diff --git a/offload/include/Shared/Types.h b/offload/include/Shared/Types.h new file mode 100644 index 0000000000000..1503a4b2a1437 --- /dev/null +++ b/offload/include/Shared/Types.h @@ -0,0 +1,22 @@ +//===-- Shared/Types.h - Type defs shared between host and device - C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Environments shared between host and device.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_SHARED_TYPES_H
+#define OMPTARGET_SHARED_TYPES_H
+
+#ifdef OMPTARGET_DEVICE_RUNTIME
+#include "DeviceTypes.h"
+#else
+#include <cstdint>
+#endif
+
+#endif // OMPTARGET_SHARED_TYPES_H
diff --git a/offload/include/Shared/Utils.h b/offload/include/Shared/Utils.h
index fce14b54edb98..22ea2ba262d2a 100644
--- a/offload/include/Shared/Utils.h
+++ b/offload/include/Shared/Utils.h
@@ -14,75 +14,65 @@
 #ifndef OMPTARGET_SHARED_UTILS_H
 #define OMPTARGET_SHARED_UTILS_H
 
-#include "llvm/ADT/StringRef.h"
+#include "Types.h"
 
-#include "Debug.h"
-
-#include <atomic>
-#include <cassert>
-#include <limits>
-#include <memory>
-
-namespace llvm {
-namespace omp {
-namespace target {
-
-/// Utility class for thread-safe reference counting. Any class that needs
-/// objects' reference counting can inherit from this entity or have it as a
-/// class data member.
-template <typename Ty,
-          std::memory_order MemoryOrder = std::memory_order_relaxed>
-struct RefCountTy {
-  /// Create a refcount object initialized to zero.
-  RefCountTy() : Refs(0) {}
-
-  ~RefCountTy() { assert(Refs == 0 && "Destroying with non-zero refcount"); }
-
-  /// Increase the reference count atomically.
-  void increase() { Refs.fetch_add(1, MemoryOrder); }
-
-  /// Decrease the reference count and return whether it became zero. Decreasing
-  /// the counter in more units than it was previously increased results in
-  /// undefined behavior.
-  bool decrease() {
-    Ty Prev = Refs.fetch_sub(1, MemoryOrder);
-    assert(Prev > 0 && "Invalid refcount");
-    return (Prev == 1);
-  }
-
-  Ty get() const { return Refs.load(MemoryOrder); }
-
-private:
-  /// The atomic reference counter.
-  std::atomic<Ty> Refs;
-};
+namespace utils {
 
 /// Return the difference (in bytes) between \p Begin and \p End.
 template <typename Ty = char>
-ptrdiff_t getPtrDiff(const void *End, const void *Begin) {
+auto getPtrDiff(const void *End, const void *Begin) {
   return reinterpret_cast<const Ty *>(End) -
          reinterpret_cast<const Ty *>(Begin);
 }
 
 /// Return \p Ptr advanced by \p Offset bytes.
-template <typename Ty> Ty *advanceVoidPtr(Ty *Ptr, int64_t Offset) {
-  static_assert(std::is_void<Ty>::value);
-  return const_cast<char *>(reinterpret_cast<const char *>(Ptr) + Offset);
+template <typename Ty1, typename Ty2> Ty1 *advancePtr(Ty1 *Ptr, Ty2 Offset) {
+  return (Ty1 *)(const_cast<char *>((const char *)(Ptr)) + Offset);
 }
 
-/// Return \p Ptr aligned to \p Alignment bytes.
-template <typename Ty> Ty *alignPtr(Ty *Ptr, int64_t Alignment) {
-  size_t Space = std::numeric_limits<size_t>::max();
-  return std::align(Alignment, sizeof(char), Ptr, Space);
+/// Return \p V aligned "upwards" according to \p Align.
+template <typename Ty1, typename Ty2> inline Ty1 alignPtr(Ty1 V, Ty2 Align) {
+  return reinterpret_cast<Ty1>(((uintptr_t(V) + Align - 1) / Align) * Align);
+}
+/// Return \p V aligned "downwards" according to \p Align.
+template <typename Ty1, typename Ty2> inline Ty1 alignDown(Ty1 V, Ty2 Align) {
+  return V - V % Align;
 }
 
 /// Round up \p V to a \p Boundary.
 template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
-  return (V + Boundary - 1) / Boundary * Boundary;
+  return alignPtr(V, Boundary);
+}
+
+/// Return the first bit set in \p V.
+inline uint32_t ffs(uint32_t V) {
+  static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch");
+  return __builtin_ffs(V);
+}
+
+/// Return the first bit set in \p V.
+inline uint32_t ffs(uint64_t V) {
+  static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch");
+  return __builtin_ffsl(V);
+}
+
+/// Return the number of bits set in \p V.
+inline uint32_t popc(uint32_t V) {
+  static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch");
+  return __builtin_popcount(V);
+}
+
+/// Return the number of bits set in \p V.
+inline uint32_t popc(uint64_t V) { + static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); + return __builtin_popcountl(V); +} + +template inline DstTy convertViaPun(SrcTy V) { + static_assert(sizeof(DstTy) == sizeof(SrcTy), "Bad conversion"); + return *((DstTy *)(&V)); } -} // namespace target -} // namespace omp -} // namespace llvm +} // namespace utils #endif // OMPTARGET_SHARED_UTILS_H diff --git a/offload/include/device.h b/offload/include/device.h index fd6e5fba5fc53..69954254666b8 100644 --- a/offload/include/device.h +++ b/offload/include/device.h @@ -100,11 +100,15 @@ struct DeviceTy { /// Notify the plugin about a new mapping starting at the host address /// \p HstPtr and \p Size bytes. - int32_t notifyDataMapped(void *HstPtr, int64_t Size); + /// If GPUSan is enabled, \p DevicePtr is registered in each image and + /// \p FakeHstPtr is updated. + int32_t notifyDataMapped(void *HstPtr, void *DevicePtr, int64_t Size, + void *&FakeHstPtr); /// Notify the plugin about an existing mapping being unmapped starting at /// the host address \p HstPtr. - int32_t notifyDataUnmapped(void *HstPtr); + /// If GPUSan is enabled, \p FakeHstPtr is unregistered. + int32_t notifyDataUnmapped(void *HstPtr, void *FakeHstPtr); // Launch the kernel identified by \p TgtEntryPtr with the given arguments. int32_t launchKernel(void *TgtEntryPtr, void **TgtVarsPtr, diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h index 323dee41630f2..8730879905984 100644 --- a/offload/include/omptarget.h +++ b/offload/include/omptarget.h @@ -107,7 +107,7 @@ enum TargetAllocTy : int32_t { inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, - 0, {0,0}, {1, 0, 0}, {1, 0, 0}, 0}; + 0, {0,0,0}, {1, 0, 0}, {1, 0, 0}, 0}; struct DeviceTy; @@ -136,8 +136,19 @@ class AsyncInfoTy { /// Synchronization method to be used. 
SyncTy SyncType; - AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING) + AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING) : Device(Device), SyncType(SyncType) {} + AsyncInfoTy(DeviceTy &Device, void *AsyncInfoQueue) + : Device(Device), SyncType(AsyncInfoQueue ? SyncTy::NON_BLOCKING : SyncTy::BLOCKING) { + AsyncInfo.Queue = AsyncInfoQueue; + AsyncInfo.PersistentQueue = !!AsyncInfoQueue; + } + AsyncInfoTy(DeviceTy &Device, void *AsyncInfoQueue, SyncTy SyncType) + : Device(Device), SyncType(SyncType) { + AsyncInfo.Queue = AsyncInfoQueue; + AsyncInfo.PersistentQueue = !!AsyncInfoQueue; + } + ~AsyncInfoTy() { synchronize(); } /// Implicit conversion to the __tgt_async_info which is used in the @@ -207,8 +218,9 @@ class TaskAsyncInfoWrapperTy { void **TaskAsyncInfoPtr = nullptr; public: - TaskAsyncInfoWrapperTy(DeviceTy &Device) + TaskAsyncInfoWrapperTy(DeviceTy &Device, void *AsyncInfoQueue= nullptr) : ExecThreadID(__kmpc_global_thread_num(NULL)), LocalAsyncInfo(Device) { + assert(!AsyncInfoQueue && "Async tasks do not support predefined async queue pointers!"); // If we failed to acquired the current global thread id, we cannot // re-enqueue the current task. Thus we should use the local blocking async // info. 
@@ -425,6 +437,8 @@ int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, void *VAddr, bool IsRecord, bool SaveOutput, uint64_t &ReqPtrArgOffset); +void *__tgt_target_get_default_queue(void *Loc, int64_t DeviceId); + #ifdef __cplusplus } #endif diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index e678213df18ce..d9edfb0e8eab6 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -23,6 +25,7 @@ #include "Shared/APITypes.h" #include "Shared/Debug.h" #include "Shared/Environment.h" +#include "Shared/RefCnt.h" #include "Shared/Utils.h" #include "Utils/ELF.h" @@ -87,7 +90,7 @@ struct AMDGPUDeviceImageTy; struct AMDGPUMemoryManagerTy; struct AMDGPUMemoryPoolTy; -namespace utils { +namespace hsa_utils { /// Iterate elements using an HSA iterate function. Do not use this function /// directly but the specialized ones below instead. @@ -187,7 +190,7 @@ Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent, Expected getTargetTripleAndFeatures(hsa_agent_t Agent) { std::string Target; - auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) { + auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) { uint32_t Length; hsa_status_t Status; Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length); @@ -208,7 +211,7 @@ Expected getTargetTripleAndFeatures(hsa_agent_t Agent) { return Err; return Target; } -} // namespace utils +} // namespace hsa_utils /// Utility class representing generic resource references to AMDGPU resources. 
template @@ -481,7 +484,7 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy { findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const; /// Get additional info for kernel, e.g., register spill counts - std::optional + std::optional getKernelInfo(StringRef Identifier) const { auto It = KernelInfoMap.find(Identifier); @@ -495,7 +498,7 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy { /// The exectuable loaded on the agent. hsa_executable_t Executable; hsa_code_object_t CodeObject; - StringMap KernelInfoMap; + StringMap KernelInfoMap; uint16_t ELFABIVersion; }; @@ -545,7 +548,8 @@ struct AMDGPUKernelTy : public GenericKernelTy { // TODO: Read the kernel descriptor for the max threads per block. May be // read from the image. - ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion()); + ImplicitArgsSize = + hsa_utils::getImplicitArgsSize(AMDImage.getELFABIVersion()); DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion()); // Get additional kernel info read from image @@ -596,7 +600,7 @@ struct AMDGPUKernelTy : public GenericKernelTy { uint32_t ImplicitArgsSize; /// Additional Info for the AMD GPU Kernel - std::optional KernelInfo; + std::optional KernelInfo; }; /// Class representing an HSA signal. Signals are used to define dependencies @@ -685,12 +689,12 @@ struct AMDGPUQueueTy { AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {} /// Lazily initialize a new queue belonging to a specific agent. - Error init(hsa_agent_t Agent, int32_t QueueSize) { + Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) { if (Queue) return Plugin::success(); hsa_status_t Status = hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError, - nullptr, UINT32_MAX, UINT32_MAX, &Queue); + &Device, UINT32_MAX, UINT32_MAX, &Queue); return Plugin::check(Status, "Error in hsa_queue_create: %s"); } @@ -875,10 +879,8 @@ struct AMDGPUQueueTy { } /// Callack that will be called when an error is detected on the HSA queue. 
- static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) { - auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source); - FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data()); - } + static void callbackError(hsa_status_t Status, hsa_queue_t *Source, + void *Data); /// The HSA queue. hsa_queue_t *Queue; @@ -1268,13 +1270,14 @@ struct AMDGPUStreamTy { // Issue the async memory copy. if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent, - CopySize, 1, &InputSignalRaw, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, + Agent, CopySize, 1, &InputSignalRaw, + OutputSignal->get()); } - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent, - CopySize, 0, nullptr, OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, + Agent, CopySize, 0, nullptr, + OutputSignal->get()); } /// Push an asynchronous memory copy device-to-host involving an unpinned @@ -1308,14 +1311,14 @@ struct AMDGPUStreamTy { // dependency if already satisfied. if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - if (auto Err = utils::asyncMemCopy( + if (auto Err = hsa_utils::asyncMemCopy( UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1, &InputSignalRaw, OutputSignals[0]->get())) return Err; } else { - if (auto Err = utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, Agent, - Src, Agent, CopySize, 0, nullptr, - OutputSignals[0]->get())) + if (auto Err = hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, + Agent, Src, Agent, CopySize, 0, + nullptr, OutputSignals[0]->get())) return Err; } @@ -1406,12 +1409,13 @@ struct AMDGPUStreamTy { // dependency if already satisfied. 
if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, - Agent, CopySize, 1, &InputSignalRaw, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, + Agent, CopySize, 1, &InputSignalRaw, + OutputSignal->get()); } - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, Agent, - CopySize, 0, nullptr, OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, + Agent, CopySize, 0, nullptr, + OutputSignal->get()); } // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead @@ -1435,13 +1439,13 @@ struct AMDGPUStreamTy { if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, - SrcAgent, CopySize, 1, &InputSignalRaw, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, + SrcAgent, CopySize, 1, &InputSignalRaw, + OutputSignal->get()); } - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, - SrcAgent, CopySize, 0, nullptr, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, + SrcAgent, CopySize, 0, nullptr, + OutputSignal->get()); } /// Synchronize with the stream. 
The current thread waits until all operations @@ -1593,8 +1597,9 @@ struct AMDGPUStreamManagerTy final using ResourceRef = AMDGPUResourceRef; using ResourcePoolTy = GenericDeviceResourceManagerTy; + GenericDeviceTy &Device; AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent) - : GenericDeviceResourceManagerTy(Device), + : GenericDeviceResourceManagerTy(Device), Device(Device), OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true), NextQueue(0), Agent(HSAAgent) {} @@ -1603,7 +1608,7 @@ struct AMDGPUStreamManagerTy final QueueSize = HSAQueueSize; MaxNumQueues = NumHSAQueues; // Initialize one queue eagerly - if (auto Err = Queues.front().init(Agent, QueueSize)) + if (auto Err = Queues.front().init(Device, Agent, QueueSize)) return Err; return GenericDeviceResourceManagerTy::init(InitialSize); @@ -1660,7 +1665,7 @@ struct AMDGPUStreamManagerTy final } // Make sure the queue is initialized, then add user & assign. - if (auto Err = Queues[Index].init(Agent, QueueSize)) + if (auto Err = Queues[Index].init(Device, Agent, QueueSize)) return Err; Queues[Index].addUser(); Stream->Queue = &Queues[Index]; @@ -1799,7 +1804,7 @@ struct AMDHostDeviceTy : public AMDGenericDeviceTy { Error retrieveAllMemoryPools() override { // Iterate through the available pools across the host agents. 
for (hsa_agent_t Agent : Agents) { - Error Err = utils::iterateAgentMemoryPools( + Error Err = hsa_utils::iterateAgentMemoryPools( Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { AMDGPUMemoryPoolTy *MemoryPool = new AMDGPUMemoryPoolTy(HSAMemoryPool); @@ -1964,7 +1969,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { // Detect if XNACK is enabled auto TargeTripleAndFeaturesOrError = - utils::getTargetTripleAndFeatures(Agent); + hsa_utils::getTargetTripleAndFeatures(Agent); if (!TargeTripleAndFeaturesOrError) return TargeTripleAndFeaturesOrError.takeError(); if (static_cast(*TargeTripleAndFeaturesOrError) @@ -2208,8 +2213,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Err; // Once the stream is synchronized, return it to stream pool and reset - // AsyncInfo. This is to make sure the synchronization only works for its - // own tasks. + // AsyncInfo if the queue is not persistent. This is to make sure the + // synchronization only works for its own tasks. + if (AsyncInfo.PersistentQueue) + return Plugin::success(); + AsyncInfo.Queue = nullptr; return AMDGPUStreamManager.returnResource(Stream); } @@ -2228,9 +2236,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (!(*CompletedOrErr)) return Plugin::success(); - // Once the stream is completed, return it to stream pool and reset - // AsyncInfo. This is to make sure the synchronization only works for its - // own tasks. + // Once the stream is synchronized, return it to stream pool and reset + // AsyncInfo if the queue is not persistent. This is to make sure the + // synchronization only works for its own tasks. 
+ if (AsyncInfo.PersistentQueue) + return Plugin::success(); + AsyncInfo.Queue = nullptr; return AMDGPUStreamManager.returnResource(Stream); } @@ -2316,9 +2327,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = Signal.init()) return Err; - if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr, - Agent, PinnedPtr, Agent, Size, 0, - nullptr, Signal.get())) + if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr, + Agent, PinnedPtr, Agent, Size, 0, + nullptr, Signal.get())) return Err; if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) @@ -2376,9 +2387,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = Signal.init()) return Err; - if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), PinnedPtr, - Agent, TgtPtr, Agent, Size, 0, nullptr, - Signal.get())) + if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), + PinnedPtr, Agent, TgtPtr, Agent, + Size, 0, nullptr, Signal.get())) return Err; if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) @@ -2420,7 +2431,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = Signal.init()) return Err; - if (auto Err = utils::asyncMemCopy( + if (auto Err = hsa_utils::asyncMemCopy( useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr, getAgent(), (uint64_t)Size, 0, nullptr, Signal.get())) return Err; @@ -2443,7 +2454,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// Initialize the async info for interoperability purposes. Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { - // TODO: Implement this function. 
+ AMDGPUStreamTy *Stream; + if (auto Err = getStream(AsyncInfoWrapper, Stream)) + return Err; + return Plugin::success(); } @@ -2686,7 +2700,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { } Info.add("ISAs"); - auto Err = utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) { + auto Err = hsa_utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) { Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar); if (Status == HSA_STATUS_SUCCESS) Info.add("Name", TmpChar); @@ -2768,7 +2782,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// Retrieve and construct all memory pools of the device agent. Error retrieveAllMemoryPools() override { // Iterate through the available pools of the device agent. - return utils::iterateAgentMemoryPools( + return hsa_utils::iterateAgentMemoryPools( Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { AMDGPUMemoryPoolTy *MemoryPool = Plugin.allocate(); @@ -2954,7 +2968,7 @@ Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) { if (Result) return Plugin::error("Loaded HSA executable does not validate"); - if (auto Err = utils::readAMDGPUMetaDataFromImage( + if (auto Err = hsa_utils::readAMDGPUMetaDataFromImage( getMemoryBuffer(), KernelInfoMap, ELFABIVersion)) return Err; @@ -3083,7 +3097,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy { llvm::SmallVector HostAgents; // Count the number of available agents. - auto Err = utils::iterateAgents([&](hsa_agent_t Agent) { + auto Err = hsa_utils::iterateAgents([&](hsa_agent_t Agent) { // Get the device type of the agent. 
hsa_device_type_t DeviceType; hsa_status_t Status = @@ -3178,12 +3192,12 @@ struct AMDGPUPluginTy final : public GenericPluginTy { return false; auto TargeTripleAndFeaturesOrError = - utils::getTargetTripleAndFeatures(getKernelAgent(DeviceId)); + hsa_utils::getTargetTripleAndFeatures(getKernelAgent(DeviceId)); if (!TargeTripleAndFeaturesOrError) return TargeTripleAndFeaturesOrError.takeError(); - return utils::isImageCompatibleWithEnv(Processor ? *Processor : "", - ElfOrErr->getPlatformFlags(), - *TargeTripleAndFeaturesOrError); + return hsa_utils::isImageCompatibleWithEnv(Processor ? *Processor : "", + ElfOrErr->getPlatformFlags(), + *TargeTripleAndFeaturesOrError); } bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override { @@ -3295,11 +3309,11 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, if (auto Err = GenericDevice.getDeviceStackSize(StackSize)) return Err; - utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr; + hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr; if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) { // Initialize implicit arguments. - ImplArgs = reinterpret_cast( - advanceVoidPtr(AllArgs, LaunchParams.Size)); + ImplArgs = reinterpret_cast( + utils::advancePtr(AllArgs, LaunchParams.Size)); // Initialize the implicit arguments to zero. std::memset(ImplArgs, 0, getImplicitArgsSize()); @@ -3323,7 +3337,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, // Only COV5 implicitargs needs to be set. COV4 implicitargs are not used. 
if (ImplArgs && - getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) { + getImplicitArgsSize() == sizeof(hsa_utils::AMDGPUImplicitArgsTy)) { ImplArgs->BlockCountX = NumBlocks; ImplArgs->BlockCountY = 1; ImplArgs->BlockCountZ = 1; @@ -3480,6 +3494,16 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { return Alloc; } +void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source, + void *Data) { + + auto *Device = reinterpret_cast(Data); + Device->reportSanitizerError(); + + auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source); + FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data()); +} + } // namespace plugin } // namespace target } // namespace omp diff --git a/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h index 58a3b5df00fac..1e99d0a30bdf2 100644 --- a/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h +++ b/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h @@ -33,7 +33,7 @@ namespace llvm { namespace omp { namespace target { namespace plugin { -namespace utils { +namespace hsa_utils { // The implicit arguments of COV5 AMDGPU kernels. 
struct AMDGPUImplicitArgsTy { @@ -310,7 +310,7 @@ readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer, return Error::success(); } -} // namespace utils +} // namespace hsa_utils } // namespace plugin } // namespace target } // namespace omp diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h index 829b4b7291193..b88d3e52bbf43 100644 --- a/offload/plugins-nextgen/common/include/GlobalHandler.h +++ b/offload/plugins-nextgen/common/include/GlobalHandler.h @@ -50,6 +50,9 @@ class GlobalTy { const std::string &getName() const { return Name; } uint32_t getSize() const { return Size; } void *getPtr() const { return Ptr; } + template T *getPtrAs() const { + return reinterpret_cast(Ptr); + } void setSize(int32_t S) { Size = S; } void setPtr(void *P) { Ptr = P; } diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 0d2a36a42d5fa..e1e5b5e05acdc 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -24,6 +24,7 @@ #include "Shared/Environment.h" #include "Shared/EnvironmentVar.h" #include "Shared/Requirements.h" +#include "Shared/Sanitizer.h" #include "Shared/Utils.h" #include "GlobalHandler.h" @@ -231,7 +232,7 @@ class DeviceImageTy { /// Get the image size. size_t getSize() const { - return getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart); + return utils::getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart); } /// Get a memory buffer reference to the whole image. @@ -471,7 +472,7 @@ class PinnedAllocationMapTy { --It; // The buffer is not contained in the pinned allocation. - if (advanceVoidPtr(It->HstPtr, It->Size) > HstPtr) + if (utils::advancePtr(It->HstPtr, It->Size) > HstPtr) return &(*It); // None found. @@ -498,15 +499,15 @@ class PinnedAllocationMapTy { /// Indicate whether the first range A fully contains the second range B. 
static bool contains(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) { - void *EndA = advanceVoidPtr(PtrA, SizeA); - void *EndB = advanceVoidPtr(PtrB, SizeB); + void *EndA = utils::advancePtr(PtrA, SizeA); + void *EndB = utils::advancePtr(PtrB, SizeB); return (PtrB >= PtrA && EndB <= EndA); } /// Indicate whether the first range A intersects with the second range B. static bool intersects(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) { - void *EndA = advanceVoidPtr(PtrA, SizeA); - void *EndB = advanceVoidPtr(PtrB, SizeB); + void *EndA = utils::advancePtr(PtrA, SizeA); + void *EndB = utils::advancePtr(PtrB, SizeB); return (PtrA < EndB && PtrB < EndA); } @@ -588,8 +589,8 @@ class PinnedAllocationMapTy { if (!Entry) return nullptr; - return advanceVoidPtr(Entry->DevAccessiblePtr, - getPtrDiff(HstPtr, Entry->HstPtr)); + return utils::advancePtr(Entry->DevAccessiblePtr, + utils::getPtrDiff(HstPtr, Entry->HstPtr)); } /// Check whether a buffer belongs to a registered host pinned allocation. @@ -601,6 +602,22 @@ class PinnedAllocationMapTy { } }; +struct GPUSanTy { + GPUSanTy(GenericDeviceTy &Device) : Device(Device) {} + Error notifyDataMapped(void *DevicePtr, uint64_t Size, void *&FakeHstPtr); + Error notifyDataUnmapped(void *FakeHstPtr); + + void addGPUSanNewFn(GenericKernelTy &GK) { NewFns.push_back(&GK); } + void addGPUSanFreeFn(GenericKernelTy &GK) { FreeFns.push_back(&GK); } + void checkAndReportError(); + +private: + uint32_t SlotCnt = SanitizerConfig::SLOTS - 1; + GenericDeviceTy &Device; + SmallVector NewFns; + SmallVector FreeFns; +}; + /// Class implementing common functionalities of offload devices. Each plugin /// should define the specific device class, derive from this generic one, and /// implement the necessary virtual function members. @@ -718,14 +735,23 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// buffer (e.g., because a user OpenMP target map) and the buffer may be used /// as source/destination of memory transfers. 
We can use this information to /// lock the host buffer and optimize its memory transfers. - Error notifyDataMapped(void *HstPtr, int64_t Size) { + Error notifyDataMapped(void *HstPtr, void *DevicePtr, int64_t Size, + void *&FakeHstPtr) { + if (auto Err = GPUSan.notifyDataMapped(DevicePtr, Size, FakeHstPtr)) + return Err; + if (!HstPtr) + return Error::success(); return PinnedAllocs.lockMappedHostBuffer(HstPtr, Size); } /// Mark the host buffer with address \p HstPtr as unmapped. This means that /// libomptarget removed an existing mapping. If the plugin locked the buffer /// in notifyDataMapped, this function should unlock it. - Error notifyDataUnmapped(void *HstPtr) { + Error notifyDataUnmapped(void *HstPtr, void *FakeHstPtr) { + if (auto Err = GPUSan.notifyDataUnmapped(FakeHstPtr)) + return Err; + if (!HstPtr) + return Error::success(); return PinnedAllocs.unlockUnmappedHostBuffer(HstPtr); } @@ -736,6 +762,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy { void *&BaseDevAccessiblePtr, size_t &BaseSize) const = 0; + void addGPUSanNewFn(GenericKernelTy &GK) { GPUSan.addGPUSanNewFn(GK); } + void addGPUSanFreeFn(GenericKernelTy &GK) { GPUSan.addGPUSanFreeFn(GK); } + void reportSanitizerError() { GPUSan.checkAndReportError(); } + /// Submit data to the device (host to device transfer). Error dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size, __tgt_async_info *AsyncInfo); @@ -857,6 +887,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// Allocate and construct a kernel object. virtual Expected constructKernel(const char *Name) = 0; + DenseMap SanitizerTrapInfos; + /// Reference to the underlying plugin that created this device. 
GenericPluginTy &Plugin; @@ -950,6 +982,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy { #endif private: + GPUSanTy GPUSan; + DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0}; DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0}; }; @@ -1118,10 +1152,12 @@ struct GenericPluginTy { int32_t data_unlock(int32_t DeviceId, void *Ptr); /// Notify the runtime about a new mapping that has been created outside. - int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr, int64_t Size); + int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr, void *DevicePtr, + int64_t Size, void *&FakeHstPtr); /// Notify t he runtime about a mapping that has been deleted. - int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr); + int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr, + void *FakeHstPtr); /// Copy data to the given device. int32_t data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr, diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp index ba0aa47f8e51c..e18f358af116c 100644 --- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp @@ -152,8 +152,8 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device, HostGlobal.getPtr()); assert(Image.getStart() <= ImageGlobal.getPtr() && - advanceVoidPtr(ImageGlobal.getPtr(), ImageGlobal.getSize()) < - advanceVoidPtr(Image.getStart(), Image.getSize()) && + utils::advancePtr(ImageGlobal.getPtr(), ImageGlobal.getSize()) < + utils::advancePtr(Image.getStart(), Image.getSize()) && "Attempting to read outside the image!"); // Perform the copy from the image to the host memory. 
diff --git a/offload/plugins-nextgen/common/src/JIT.cpp b/offload/plugins-nextgen/common/src/JIT.cpp index 9dbba1459839d..9adb62b677b92 100644 --- a/offload/plugins-nextgen/common/src/JIT.cpp +++ b/offload/plugins-nextgen/common/src/JIT.cpp @@ -51,7 +51,7 @@ namespace { bool isImageBitcode(const __tgt_device_image &Image) { StringRef Binary(reinterpret_cast(Image.ImageStart), - target::getPtrDiff(Image.ImageEnd, Image.ImageStart)); + utils::getPtrDiff(Image.ImageEnd, Image.ImageStart)); return identify_magic(Binary) == file_magic::bitcode; } @@ -69,7 +69,7 @@ createModuleFromMemoryBuffer(std::unique_ptr &MB, Expected> createModuleFromImage(const __tgt_device_image &Image, LLVMContext &Context) { StringRef Data((const char *)Image.ImageStart, - target::getPtrDiff(Image.ImageEnd, Image.ImageStart)); + utils::getPtrDiff(Image.ImageEnd, Image.ImageStart)); std::unique_ptr MB = MemoryBuffer::getMemBuffer( Data, /*BufferName=*/"", /*RequiresNullTerminator=*/false); return createModuleFromMemoryBuffer(MB, Context); diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 94f9d4670b672..fe194a42acaab 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -16,14 +16,20 @@ #include "GlobalHandler.h" #include "JIT.h" +#include "Shared/Sanitizer.h" +#include "Shared/Utils.h" #include "Utils/ELF.h" #include "omptarget.h" +#include "llvm/Support/ErrorHandling.h" +#include +#include #ifdef OMPT_SUPPORT #include "OpenMP/OMPT/Callback.h" #include "omp-tools.h" #endif +#include "llvm/ADT/StringRef.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Support/Error.h" @@ -74,7 +80,7 @@ struct RecordReplayTy { Device->allocate(1024, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT); Device->free(Addr); // Align Address to MaxMemoryAllocation - Addr = (void *)alignPtr((Addr), 
MaxMemoryAllocation); + Addr = (void *)utils::alignPtr((Addr), MaxMemoryAllocation); return Addr; } @@ -207,8 +213,8 @@ struct RecordReplayTy { if (EC) report_fatal_error("Error saving image : " + StringRef(EC.message())); if (const auto *TgtImageBitcode = Image.getTgtImageBitcode()) { - size_t Size = - getPtrDiff(TgtImageBitcode->ImageEnd, TgtImageBitcode->ImageStart); + size_t Size = utils::getPtrDiff(TgtImageBitcode->ImageEnd, + TgtImageBitcode->ImageStart); MemoryBufferRef MBR = MemoryBufferRef( StringRef((const char *)TgtImageBitcode->ImageStart, Size), ""); OS << MBR.getBuffer(); @@ -241,10 +247,10 @@ struct RecordReplayTy { int32_t NameLength = std::strlen(OffloadEntry.Name) + 1; memcpy(BufferPtr, OffloadEntry.Name, NameLength); - BufferPtr = advanceVoidPtr(BufferPtr, NameLength); + BufferPtr = utils::advancePtr(BufferPtr, NameLength); *((uint32_t *)(BufferPtr)) = OffloadEntry.Size; - BufferPtr = advanceVoidPtr(BufferPtr, sizeof(uint32_t)); + BufferPtr = utils::advancePtr(BufferPtr, sizeof(uint32_t)); auto Err = Plugin::success(); { @@ -254,11 +260,12 @@ struct RecordReplayTy { } if (Err) report_fatal_error("Error retrieving data for global"); - BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.Size); + BufferPtr = utils::advancePtr(BufferPtr, OffloadEntry.Size); } assert(BufferPtr == GlobalsMB->get()->getBufferEnd() && "Buffer over/under-filled."); - assert(Size == getPtrDiff(BufferPtr, GlobalsMB->get()->getBufferStart()) && + assert(Size == utils::getPtrDiff(BufferPtr, + GlobalsMB->get()->getBufferStart()) && "Buffer size mismatch"); StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), Size); @@ -549,9 +556,16 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, if (!KernelLaunchEnvOrErr) return KernelLaunchEnvOrErr.takeError(); - KernelLaunchParamsTy LaunchParams = - prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, Args, - Ptrs, *KernelLaunchEnvOrErr); + KernelLaunchParamsTy LaunchParams; + + // Kernel 
languages don't use indirection. + if (KernelArgs.Flags.IsCUDA) { + LaunchParams = *reinterpret_cast(KernelArgs.ArgPtrs); + } else { + LaunchParams = + prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, + Args, Ptrs, *KernelLaunchEnvOrErr); + } uint32_t NumThreads = getNumThreads(GenericDevice, KernelArgs.ThreadLimit); uint64_t NumBlocks = @@ -723,7 +737,7 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1), DeviceId(DeviceId), GridValues(OMPGridValues), PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(), - PinnedAllocs(*this), RPCServer(nullptr) { + PinnedAllocs(*this), RPCServer(nullptr), GPUSan(*this) { #ifdef OMPT_SUPPORT OmptInitialized.store(false); // Bind the callbacks to this device's member functions @@ -903,7 +917,7 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, #ifdef OMPT_SUPPORT if (ompt::Initialized) { size_t Bytes = - getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart); + utils::getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart); performOmptCallback( device_load, Plugin.getUserId(DeviceId), /*FileName=*/nullptr, /*FileOffset=*/0, /*VmaInFile=*/nullptr, @@ -916,6 +930,25 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, if (auto Err = callGlobalConstructors(Plugin, *Image)) return std::move(Err); + auto GetKernel = [&](StringRef Name) -> GenericKernelTy * { + auto KernelOrErr = constructKernel(Name.data()); + if (Error Err = KernelOrErr.takeError()) { + REPORT("Failure to look up kernel: %s\n", + toString(std::move(Err)).data()); + return nullptr; + } + GenericKernelTy &Kernel = *KernelOrErr; + if (auto Err = Kernel.init(*this, *Image)) { + REPORT("Failure to init kernel: %s\n", toString(std::move(Err)).data()); + return nullptr; + } + return &Kernel; + }; + if (GenericKernelTy *Kernel = GetKernel("__sanitizer_register")) + addGPUSanNewFn(*Kernel); + if (GenericKernelTy *Kernel = 
GetKernel("__sanitizer_unregister")) + addGPUSanFreeFn(*Kernel); + // Return the pointer to the table of entries. return Image; } @@ -994,6 +1027,16 @@ Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin, if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal)) return Err; + auto *&SanitizerTrapInfo = SanitizerTrapInfos[&Image]; + SanitizerTrapInfo = reinterpret_cast(allocate( + sizeof(*SanitizerTrapInfo), &SanitizerTrapInfo, TARGET_ALLOC_HOST)); + memset(SanitizerTrapInfo, '\0', sizeof(SanitizerTrapInfoTy)); + + GlobalTy TrapId("__sanitizer_trap_info_ptr", sizeof(SanitizerTrapInfo), + &SanitizerTrapInfo); + if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrapId)) + return Err; + // Create the metainfo of the device environment global. GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool", sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool); @@ -1131,8 +1174,8 @@ Expected PinnedAllocationMapTy::lockHostBuffer(void *HstPtr, return std::move(Err); // Return the device accessible pointer with the correct offset. - return advanceVoidPtr(Entry->DevAccessiblePtr, - getPtrDiff(HstPtr, Entry->HstPtr)); + return utils::advancePtr(Entry->DevAccessiblePtr, + utils::getPtrDiff(HstPtr, Entry->HstPtr)); } // No intersecting registered allocation found in the map. 
First, lock the @@ -1428,8 +1471,10 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs, Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) { assert(AsyncInfoPtr && "Invalid async info"); + assert(!(*AsyncInfoPtr) && "Already initialized async info"); *AsyncInfoPtr = new __tgt_async_info(); + (*AsyncInfoPtr)->PersistentQueue = true; AsyncInfoWrapperTy AsyncInfoWrapper(*this, *AsyncInfoPtr); @@ -1607,7 +1652,7 @@ int32_t GenericPluginTy::is_initialized() const { return Initialized; } int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) { StringRef Buffer(reinterpret_cast(Image->ImageStart), - target::getPtrDiff(Image->ImageEnd, Image->ImageStart)); + utils::getPtrDiff(Image->ImageEnd, Image->ImageStart)); auto HandleError = [&](Error Err) -> bool { [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); @@ -1639,7 +1684,7 @@ int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) { int32_t GenericPluginTy::is_device_compatible(int32_t DeviceId, __tgt_device_image *Image) { StringRef Buffer(reinterpret_cast(Image->ImageStart), - target::getPtrDiff(Image->ImageEnd, Image->ImageStart)); + utils::getPtrDiff(Image->ImageEnd, Image->ImageStart)); auto HandleError = [&](Error Err) -> bool { [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); @@ -1801,8 +1846,10 @@ int32_t GenericPluginTy::data_unlock(int32_t DeviceId, void *Ptr) { } int32_t GenericPluginTy::data_notify_mapped(int32_t DeviceId, void *HstPtr, - int64_t Size) { - auto Err = getDevice(DeviceId).notifyDataMapped(HstPtr, Size); + void *DevicePtr, int64_t Size, + void *&FakeHstPtr) { + auto Err = + getDevice(DeviceId).notifyDataMapped(HstPtr, DevicePtr, Size, FakeHstPtr); if (Err) { REPORT("Failure to notify data mapped %p: %s\n", HstPtr, toString(std::move(Err)).data()); @@ -1812,8 +1859,9 @@ int32_t GenericPluginTy::data_notify_mapped(int32_t DeviceId, void *HstPtr, return OFFLOAD_SUCCESS; } -int32_t 
GenericPluginTy::data_notify_unmapped(int32_t DeviceId, void *HstPtr) { - auto Err = getDevice(DeviceId).notifyDataUnmapped(HstPtr); +int32_t GenericPluginTy::data_notify_unmapped(int32_t DeviceId, void *HstPtr, + void *FakeHstPtr) { + auto Err = getDevice(DeviceId).notifyDataUnmapped(HstPtr, FakeHstPtr); if (Err) { REPORT("Failure to notify data unmapped %p: %s\n", HstPtr, toString(std::move(Err)).data()); @@ -2088,6 +2136,201 @@ int32_t GenericPluginTy::get_function(__tgt_device_binary Binary, return OFFLOAD_SUCCESS; } +Error GPUSanTy::notifyDataMapped(void *DevicePtr, uint64_t Size, + void *&FakeHstPtr) { + FakeHstPtr = nullptr; + if (NewFns.empty()) + return Plugin::success(); + uint64_t Slot = SlotCnt--; + FakeHstPtr = __offload_get_new_sanitizer_ptr(Slot); + KernelArgsTy Args = {}; + Args.NumTeams[0] = 1; + Args.ThreadLimit[0] = 1; + AsyncInfoWrapperTy AsyncInfoWrapper(Device, nullptr); + for (GenericKernelTy *NewFP : NewFns) { + struct { + void *Ptr; + uint64_t Length; + uint64_t Slot; + } KernelArgs{DevicePtr, Size, Slot}; + KernelLaunchParamsTy ArgPtrs{sizeof(KernelArgs), &KernelArgs, nullptr}; + Args.ArgPtrs = reinterpret_cast(&ArgPtrs); + Args.Flags.IsCUDA = true; + if (auto Err = NewFP->launch(Device, Args.ArgPtrs, nullptr, Args, + AsyncInfoWrapper)) { + AsyncInfoWrapper.finalize(Err); + return Err; + } + } + + Error Err = Plugin::success(); + AsyncInfoWrapper.finalize(Err); + return Err; +} + +Error GPUSanTy::notifyDataUnmapped(void *FakeHstPtr) { + if (!FakeHstPtr) + return Plugin::success(); + KernelArgsTy Args = {}; + Args.NumTeams[0] = 1; + Args.ThreadLimit[0] = 1; + AsyncInfoWrapperTy AsyncInfoWrapper(Device, nullptr); + for (GenericKernelTy *FreeFn : FreeFns) { + KernelLaunchParamsTy ArgPtrs{sizeof(void *), &FakeHstPtr, nullptr}; + Args.ArgPtrs = reinterpret_cast(&ArgPtrs); + Args.Flags.IsCUDA = true; + if (auto Err = FreeFn->launch(Device, Args.ArgPtrs, nullptr, Args, + AsyncInfoWrapper)) { + AsyncInfoWrapper.finalize(Err); + return Err; + } + } 
+ Error Err = Plugin::success(); + AsyncInfoWrapper.finalize(Err); + return Err; +} + +void GPUSanTy::checkAndReportError() { + SanitizerTrapInfoTy *STI; + DeviceImageTy *Image = nullptr; + for (auto &It : Device.SanitizerTrapInfos) { + STI = It.second; + if (!STI || STI->ErrorCode == SanitizerTrapInfoTy::None) + continue; + Image = It.first; + break; + } + if (!Image) + return; + + auto Green = []() { return "\033[1m\033[32m"; }; + auto Blue = []() { return "\033[1m\033[34m"; }; + auto Red = []() { return "\033[1m\033[31m"; }; + auto Default = []() { return "\033[1m\033[0m"; }; + + GenericGlobalHandlerTy &GHandler = Device.Plugin.getGlobalHandler(); + auto GetImagePtr = [&](GlobalTy &GV, bool Quiet = false) { + if (auto Err = GHandler.getGlobalMetadataFromImage(Device, *Image, GV)) { + if (Quiet) + consumeError(std::move(Err)); + else + REPORT("WARNING: Failed to read backtrace " + "(%s)\n", + toString(std::move(Err)).data()); + return false; + } + return true; + }; + GlobalTy LocationsGV("__san.locations", -1); + GlobalTy LocationNamesGV("__san.location_names", -1); + GlobalTy AmbiguousCallsBitWidthGV("__san.num_ambiguous_calls", -1); + GlobalTy AmbiguousCallsLocationsGV("__san.ambiguous_calls_mapping", -1); + if (GetImagePtr(LocationsGV)) + GetImagePtr(LocationNamesGV); + GetImagePtr(AmbiguousCallsBitWidthGV, /*Quiet=*/true); + GetImagePtr(AmbiguousCallsLocationsGV, /*Quiet=*/true); + + fprintf(stderr, "============================================================" + "====================\n"); + + auto PrintStackTrace = [&](int64_t LocationId) { + if (!LocationsGV.getPtr() || !LocationNamesGV.getPtr()) { + fprintf(stderr, " no backtrace available\n"); + return; + } + char *LocationNames = LocationNamesGV.getPtrAs(); + LocationEncodingTy *Locations = LocationsGV.getPtrAs(); + uint64_t *AmbiguousCallsBitWidth = + AmbiguousCallsBitWidthGV.getPtrAs(); + uint64_t *AmbiguousCallsLocations = + AmbiguousCallsLocationsGV.getPtrAs(); + int32_t FrameIdx = 0; + do { + 
LocationEncodingTy &LE = Locations[LocationId]; + fprintf(stderr, " #%i %s in %s:%lu:%lu\n", FrameIdx, + &LocationNames[LE.FunctionNameIdx], + &LocationNames[LE.FileNameIdx], LE.LineNo, LE.ColumnNo); + LocationId = LE.ParentIdx; + FrameIdx++; + if (LocationId < 0 && STI->CallId != 0 && AmbiguousCallsBitWidth && + AmbiguousCallsLocations) { + uint64_t LastCallId = + STI->CallId & ((1 << *AmbiguousCallsBitWidth) - 1); + LocationId = AmbiguousCallsLocations[LastCallId - 1]; + STI->CallId >>= (*AmbiguousCallsBitWidth); + } + } while (LocationId >= 0); + fputc('\n', stderr); + }; + + auto DiagnoseAccess = [&](StringRef Name) { + void *PC = reinterpret_cast(STI->PC); + void *Addr = utils::advancePtr(STI->AllocationStart, STI->PtrOffset); + fprintf(stderr, + "%sERROR: OffloadSanitizer %s access on address " DPxMOD + " at pc " DPxMOD "\n%s", + Red(), Name.data(), DPxPTR(Addr), DPxPTR(PC), Default()); + fprintf(stderr, + "%s%s of size %u at " DPxMOD + " thread <%u, %u, %u> block <%lu, %lu, %lu> (acc %li, %s)\n%s", + Blue(), STI->AccessId > 0 ? "WRITE" : "READ", STI->AccessSize, + DPxPTR(Addr), STI->ThreadId[0], STI->ThreadId[1], STI->ThreadId[2], + STI->BlockId[0], STI->BlockId[1], STI->BlockId[2], STI->AccessId, + (STI->AllocationKind ? 
"heap" : "stack"), Default()); + PrintStackTrace(STI->LocationId); + fprintf( + stderr, + "%s" DPxMOD " is located %lu bytes inside of a %lu-byte region [" DPxMOD + "," DPxMOD ")\n%s", + Green(), DPxPTR(Addr), STI->PtrOffset, STI->AllocationLength, + DPxPTR(STI->AllocationStart), + DPxPTR(utils::advancePtr(STI->AllocationStart, STI->AllocationLength)), + Default()); + fprintf(stderr, + "%s Pointer[slot:%lu,tag:%u,kind:%i] " + "Allocation[slot:%d,tag:%u,kind:%i]\n%s", + Green(), STI->PtrSlot, STI->PtrTag, STI->PtrKind, STI->AllocationId, + STI->AllocationTag, STI->AllocationKind, Default()); + }; + + switch (STI->ErrorCode) { + case SanitizerTrapInfoTy::None: + llvm_unreachable("Unexpected exception"); + case SanitizerTrapInfoTy::ExceedsLength: + fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(), "exceeds length", + Default()); + break; + case SanitizerTrapInfoTy::ExceedsSlots: + fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(), "exceeds slots", + Default()); + break; + case SanitizerTrapInfoTy::PointerOutsideAllocation: + fprintf(stderr, "%sERROR: OffloadSanitizer %s : %p : %i %lu (%s)\n%s", + Red(), "outside allocation", STI->AllocationStart, + STI->AllocationId, STI->PtrSlot, + (STI->AllocationKind ? 
"heap" : "stack"), Default()); + break; + case SanitizerTrapInfoTy::OutOfBounds: { + DiagnoseAccess("out-of-bounds"); + break; + } + case SanitizerTrapInfoTy::UseAfterScope: + DiagnoseAccess("use-after-scope"); + break; + case SanitizerTrapInfoTy::UseAfterFree: + DiagnoseAccess("use-after-free"); + break; + case SanitizerTrapInfoTy::MemoryLeak: + fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(), "memory leak", + Default()); + break; + case SanitizerTrapInfoTy::GarbagePointer: + DiagnoseAccess("garbage-pointer"); + break; + } + fflush(stderr); +} + bool llvm::omp::target::plugin::libomptargetSupportsRPC() { #ifdef LIBOMPTARGET_RPC_SUPPORT return true; diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index b6465d61bd033..6ba51ae0db565 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -643,8 +643,11 @@ struct CUDADeviceTy : public GenericDeviceTy { } // Once the stream is synchronized, return it to stream pool and reset - // AsyncInfo. This is to make sure the synchronization only works for its - // own tasks. + // AsyncInfo if the queue is not persistent. This is to make sure the + // synchronization only works for its own tasks. + if (AsyncInfo.PersistentQueue) + return Plugin::success(); + AsyncInfo.Queue = nullptr; if (auto Err = CUDAStreamManager.returnResource(Stream)) return Err; @@ -705,7 +708,7 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::error("Wrong device Page size"); // Ceil to page size. - Size = roundUp(Size, Granularity); + Size = utils::roundUp(Size, Granularity); // Create a handler of our allocation CUmemGenericAllocationHandle AHandle; @@ -777,9 +780,12 @@ struct CUDADeviceTy : public GenericDeviceTy { if (Res == CUDA_ERROR_NOT_READY) return Plugin::success(); - // Once the stream is synchronized and the operations completed (or an error - // occurs), return it to stream pool and reset AsyncInfo. 
This is to make - // sure the synchronization only works for its own tasks. + // Once the stream is synchronized, return it to stream pool and reset + // AsyncInfo if the queue is not persistent. This is to make sure the + // synchronization only works for its own tasks. + if (AsyncInfo.PersistentQueue) + return Plugin::success(); + AsyncInfo.Queue = nullptr; if (auto Err = CUDAStreamManager.returnResource(Stream)) return Err; diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt index efa5cdab33ec9..0f30f6028f103 100644 --- a/offload/src/CMakeLists.txt +++ b/offload/src/CMakeLists.txt @@ -22,6 +22,7 @@ add_llvm_library(omptarget OpenMP/InteropAPI.cpp OpenMP/OMPT/Callback.cpp + KernelLanguage/API.cpp ADDITIONAL_HEADER_DIRS ${LIBOMPTARGET_INCLUDE_DIR} @@ -60,6 +61,23 @@ endforeach() target_compile_options(omptarget PRIVATE ${offload_compile_flags}) target_link_options(omptarget PRIVATE ${offload_link_flags}) +add_llvm_library(offload.kernels + STATIC + + Kernels/Sanitizer.cpp + + LINK_LIBS + PUBLIC + omptarget.devicertl + + NO_INSTALL_RPATH + BUILDTREE_ONLY +) + +list(JOIN LIBOMPTARGET_DEVICE_ARCHITECTURES "," KERNEL_OFFLOAD_ARCHS) +target_compile_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -fopenmp-target-jit -foffload-via-llvm ) +target_link_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -fopenmp-target-jit -foffload-via-llvm ) + # libomptarget.so needs to be aware of where the plugins live as they # are now separated in the build directory. 
set_target_properties(omptarget PROPERTIES @@ -67,3 +85,4 @@ set_target_properties(omptarget PROPERTIES INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..") install(TARGETS omptarget LIBRARY COMPONENT omptarget DESTINATION "${OFFLOAD_INSTALL_LIBDIR}") +install(TARGETS offload.kernels LIBRARY COMPONENT offload.kernels DESTINATION "${OFFLOAD_INSTALL_LIBDIR}") diff --git a/offload/src/DeviceImage.cpp b/offload/src/DeviceImage.cpp index e42460b5cca4f..e5b4bf5526437 100644 --- a/offload/src/DeviceImage.cpp +++ b/offload/src/DeviceImage.cpp @@ -27,9 +27,8 @@ DeviceImageTy::DeviceImageTy(__tgt_bin_desc &BinaryDesc, __tgt_device_image &TgtDeviceImage) : BinaryDesc(&BinaryDesc), Image(TgtDeviceImage) { - llvm::StringRef ImageStr( - static_cast(Image.ImageStart), - llvm::omp::target::getPtrDiff(Image.ImageEnd, Image.ImageStart)); + llvm::StringRef ImageStr(static_cast(Image.ImageStart), + utils::getPtrDiff(Image.ImageEnd, Image.ImageStart)); auto BinaryOrErr = llvm::object::OffloadBinary::create(llvm::MemoryBufferRef(ImageStr, "")); diff --git a/offload/src/KernelLanguage/API.cpp b/offload/src/KernelLanguage/API.cpp new file mode 100644 index 0000000000000..779751deed661 --- /dev/null +++ b/offload/src/KernelLanguage/API.cpp @@ -0,0 +1,86 @@ +//===------ API.cpp - Kernel Language (CUDA/HIP) entry points ----- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "llvm/Frontend/OpenMP/OMPConstants.h" + +#include "Shared/APITypes.h" + +#include +#include + +struct dim3 { + unsigned x = 0, y = 0, z = 0; +}; + +struct __omp_kernel_t { + dim3 __grid_size; + dim3 __block_size; + size_t __shared_memory; + + void *__stream; +}; + +static __omp_kernel_t __current_kernel = {}; +#pragma omp threadprivate(__current_kernel); + +extern "C" { + +// TODO: There is little reason we need to keep these names or the way calls are +// issued. For now we do to avoid modifying Clang's CUDA codegen. Unclear when +// we actually need to push/pop configurations. +unsigned __llvmPushCallConfiguration(dim3 __grid_size, dim3 __block_size, + size_t __shared_memory, void *__stream) { + __omp_kernel_t &__kernel = __current_kernel; + __kernel.__grid_size = __grid_size; + __kernel.__block_size = __block_size; + __kernel.__shared_memory = __shared_memory; + __kernel.__stream = __stream; + return 0; +} + +unsigned __llvmPopCallConfiguration(dim3 *__grid_size, dim3 *__block_size, + size_t *__shared_memory, void *__stream) { + __omp_kernel_t &__kernel = __current_kernel; + *__grid_size = __kernel.__grid_size; + *__block_size = __kernel.__block_size; + *__shared_memory = __kernel.__shared_memory; + *((void **)__stream) = __kernel.__stream; + return 0; +} + +int __tgt_target_kernel(void *Loc, int64_t DeviceId, int32_t NumTeams, + int32_t ThreadLimit, const void *HostPtr, + KernelArgsTy *Args); +void *__tgt_target_get_default_async_info_queue(void *Loc, int64_t DeviceId); + +unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, + void *args, size_t sharedMem, void *stream) { + int64_t DeviceNo = 0; + KernelArgsTy Args = {}; + Args.Version = OMP_KERNEL_ARG_VERSION; + Args.DynCGroupMem = sharedMem; + 
Args.NumTeams[0] = gridDim.x; + Args.NumTeams[1] = gridDim.y; + Args.NumTeams[2] = gridDim.z; + Args.ThreadLimit[0] = blockDim.x; + Args.ThreadLimit[1] = blockDim.y; + Args.ThreadLimit[2] = blockDim.z; + Args.ArgPtrs = reinterpret_cast(args); + Args.Flags.IsCUDA = true; + if (stream) + Args.AsyncInfoQueue = stream; + else + Args.AsyncInfoQueue = + __tgt_target_get_default_async_info_queue(nullptr, DeviceNo); + int rv = __tgt_target_kernel(nullptr, DeviceNo, gridDim.x, blockDim.x, func, + &Args); + return rv; +} +} diff --git a/offload/src/Kernels/Sanitizer.cpp b/offload/src/Kernels/Sanitizer.cpp new file mode 100644 index 0000000000000..7b1d73a81aeba --- /dev/null +++ b/offload/src/Kernels/Sanitizer.cpp @@ -0,0 +1,27 @@ +//===-- Kernels/Sanitizer.cpp - Sanitizer Kernel Definitions --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include + +extern "C" { +__device__ void __sanitizer_register_host(void *P, uint64_t Bytes, + uint64_t Slot); +__device__ void __sanitizer_unregister_host(void *P); + +[[clang::disable_sanitizer_instrumentation]] __global__ void +__sanitizer_register(void *P, uint64_t Bytes, uint64_t Slot) { + __sanitizer_register_host(P, Bytes, Slot); +} + +[[clang::disable_sanitizer_instrumentation]] __global__ void +__sanitizer_unregister(void *P) { + __sanitizer_unregister_host(P); +} +} diff --git a/offload/src/OpenMP/API.cpp b/offload/src/OpenMP/API.cpp index 374c54163d6a4..c07cea550c39c 100644 --- a/offload/src/OpenMP/API.cpp +++ b/offload/src/OpenMP/API.cpp @@ -39,7 +39,7 @@ EXTERN void ompx_dump_mapping_tables() { using namespace llvm::omp::target::ompt; #endif -void *targetAllocExplicit(size_t Size, 
int DeviceNum, int Kind, +void *targetAllocExplicit(size_t Size, int64_t DeviceNum, int Kind, const char *Name); void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, const char *Name); diff --git a/offload/src/OpenMP/Mapping.cpp b/offload/src/OpenMP/Mapping.cpp index 595e3456ab54c..407c6cd422ba7 100644 --- a/offload/src/OpenMP/Mapping.cpp +++ b/offload/src/OpenMP/Mapping.cpp @@ -69,7 +69,7 @@ int MappingInfoTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, } // Mapping does not exist, allocate it with refCount=INF - const HostDataToTargetTy &NewEntry = + HostDataToTargetTy &NewEntry = *HDTTMap ->emplace(new HostDataToTargetTy( /*HstPtrBase=*/(uintptr_t)HstPtrBegin, @@ -89,7 +89,8 @@ int MappingInfoTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, (void)NewEntry; // Notify the plugin about the new mapping. - return Device.notifyDataMapped(HstPtrBegin, Size); + return Device.notifyDataMapped(HstPtrBegin, TgtPtrBegin, Size, + NewEntry.FakeTgtPtrBegin); } int MappingInfoTy::disassociatePtr(void *HstPtrBegin) { @@ -120,7 +121,7 @@ int MappingInfoTy::disassociatePtr(void *HstPtrBegin) { if (Event) Device.destroyEvent(Event); HDTTMap->erase(It); - return Device.notifyDataUnmapped(HstPtrBegin); + return Device.notifyDataUnmapped(HstPtrBegin, HDTT.FakeTgtPtrBegin); } REPORT("Trying to disassociate a pointer which was not mapped via " @@ -294,12 +295,13 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer( uintptr_t TgtPtrBegin = TgtAllocBegin + TgtPadding; // Release the mapping table lock only after the entry is locked by // attaching it to TPR. 
- LR.TPR.setEntry(HDTTMap - ->emplace(new HostDataToTargetTy( - (uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin, - (uintptr_t)HstPtrBegin + Size, TgtAllocBegin, - TgtPtrBegin, HasHoldModifier, HstPtrName)) - .first->HDTT); + LR.TPR.setEntry( + HDTTMap + ->emplace(new HostDataToTargetTy( + (uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin, + (uintptr_t)HstPtrBegin + Size, TgtAllocBegin, TgtPtrBegin, + HasHoldModifier, HstPtrName, /*IsINF=*/false)) + .first->HDTT); INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID, "Creating new map entry with HstPtrBase=" DPxMOD ", HstPtrBegin=" DPxMOD ", TgtAllocBegin=" DPxMOD @@ -313,7 +315,8 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer( LR.TPR.TargetPointer = (void *)TgtPtrBegin; // Notify the plugin about the new mapping. - if (Device.notifyDataMapped(HstPtrBegin, Size)) + if (Device.notifyDataMapped(HstPtrBegin, LR.TPR.TargetPointer, Size, + LR.TPR.getEntry()->FakeTgtPtrBegin)) return TargetPointerResultTy{}; } else { // This entry is not present and we did not create a new entry for it. @@ -495,7 +498,8 @@ int MappingInfoTy::deallocTgtPtrAndEntry(HostDataToTargetTy *Entry, int Ret = Device.deleteData((void *)Entry->TgtAllocBegin); // Notify the plugin about the unmapped memory. 
- Ret |= Device.notifyDataUnmapped((void *)Entry->HstPtrBegin); + Ret |= Device.notifyDataUnmapped((void *)Entry->HstPtrBegin, + Entry->FakeTgtPtrBegin); delete Entry; diff --git a/offload/src/device.cpp b/offload/src/device.cpp index 943c778278730..fc41721e6c0a6 100644 --- a/offload/src/device.cpp +++ b/offload/src/device.cpp @@ -191,21 +191,24 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, DstPtr, Size, AsyncInfo); } -int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) { +int32_t DeviceTy::notifyDataMapped(void *HstPtr, void *DevicePtr, int64_t Size, + void *&FakeHstPtr) { DP("Notifying about new mapping: HstPtr=" DPxMOD ", Size=%" PRId64 "\n", DPxPTR(HstPtr), Size); - if (RTL->data_notify_mapped(RTLDeviceID, HstPtr, Size)) { + if (RTL->data_notify_mapped(RTLDeviceID, HstPtr, DevicePtr, Size, + FakeHstPtr)) { REPORT("Notifiying about data mapping failed.\n"); return OFFLOAD_FAIL; } return OFFLOAD_SUCCESS; } -int32_t DeviceTy::notifyDataUnmapped(void *HstPtr) { - DP("Notifying about an unmapping: HstPtr=" DPxMOD "\n", DPxPTR(HstPtr)); +int32_t DeviceTy::notifyDataUnmapped(void *HstPtr, void *FakeHstPtr) { + DP("Notifying about an unmapping: HstPtr=" DPxMOD " FakeHstPtr=" DPxMOD "\n", + DPxPTR(HstPtr), DPxPTR(FakeHstPtr)); - if (RTL->data_notify_unmapped(RTLDeviceID, HstPtr)) { + if (RTL->data_notify_unmapped(RTLDeviceID, HstPtr, FakeHstPtr)) { REPORT("Notifiying about data unmapping failed.\n"); return OFFLOAD_FAIL; } diff --git a/offload/src/exports b/offload/src/exports index f95544ec8329c..829a55fc6398e 100644 --- a/offload/src/exports +++ b/offload/src/exports @@ -29,6 +29,8 @@ VERS1.0 { __tgt_target_kernel; __tgt_target_kernel_nowait; __tgt_target_nowait_query; + __tgt_target_get_default_async_info_queue; + __tgt_target_synchronize_async_info_queue; __tgt_target_kernel_replay; __tgt_activate_record_replay; __tgt_mapper_num_components; @@ -71,6 +73,12 @@ VERS1.0 { __tgt_interop_use; __tgt_interop_destroy; 
ompt_libomptarget_connect; + __llvmPushCallConfiguration; + __llvmPopCallConfiguration; + llvmLaunchKernel; + ompx_new_allocation_host; + ompx_free_allocation_host; + ompx_register_image_functions; local: *; }; diff --git a/offload/src/interface.cpp b/offload/src/interface.cpp index 763b051cc6d77..759fb54a11262 100644 --- a/offload/src/interface.cpp +++ b/offload/src/interface.cpp @@ -14,6 +14,8 @@ #include "OpenMP/OMPT/Interface.h" #include "OpenMP/OMPT/Callback.h" #include "PluginManager.h" +#include "Shared/APITypes.h" +#include "omptarget.h" #include "private.h" #include "Shared/EnvironmentVar.h" @@ -312,7 +314,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, if (!DeviceOrErr) FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); - TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr); + TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr, KernelArgs->AsyncInfoQueue); AsyncInfoTy &AsyncInfo = TargetAsyncInfo; /// RAII to establish tool anchors before and after target region OMPT_IF_BUILT(InterfaceRAII TargetRAII( @@ -510,3 +512,48 @@ EXTERN void __tgt_target_nowait_query(void **AsyncHandle) { delete AsyncInfo; *AsyncHandle = nullptr; } + +EXTERN void *__tgt_target_get_default_async_info_queue(void *Loc, + int64_t DeviceId) { + assert(PM && "Runtime not initialized"); + + static thread_local void **AsyncInfoQueue = nullptr; + + if (!AsyncInfoQueue) + AsyncInfoQueue = reinterpret_cast( + calloc(PM->getNumDevices(), sizeof(AsyncInfoQueue[0]))); + + if (!AsyncInfoQueue[DeviceId]) { + auto DeviceOrErr = PM->getDevice(DeviceId); + if (!DeviceOrErr) + FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); + + __tgt_async_info *AsyncInfo = nullptr; + DeviceOrErr->RTL->init_async_info(DeviceId, &AsyncInfo); + AsyncInfoQueue[DeviceId] = AsyncInfo->Queue; + } + + return AsyncInfoQueue[DeviceId]; +} + +EXTERN int __tgt_target_synchronize_async_info_queue(void *Loc, + int64_t DeviceId, + void *AsyncInfoQueue) 
{ + assert(PM && "Runtime not initialized"); + + auto DeviceOrErr = PM->getDevice(DeviceId); + if (!DeviceOrErr) + FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); + if (!AsyncInfoQueue) + AsyncInfoQueue = __tgt_target_get_default_async_info_queue(Loc, DeviceId); + AsyncInfoTy AsyncInfo(*DeviceOrErr, AsyncInfoQueue, + AsyncInfoTy::SyncTy::BLOCKING); + + if (AsyncInfo.synchronize()) + FATAL_MESSAGE0(1, "Error while querying the async queue for completion.\n"); + [[maybe_unused]] __tgt_async_info *ASI = AsyncInfo; + assert(ASI->Queue); + assert(ASI->Queue && ASI->PersistentQueue); + + return 0; +} diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp index 9bca8529c5ee3..10f9b9a7d9358 100644 --- a/offload/src/omptarget.cpp +++ b/offload/src/omptarget.cpp @@ -49,7 +49,7 @@ int AsyncInfoTy::synchronize() { case SyncTy::BLOCKING: // If we have a queue we need to synchronize it now. Result = Device.synchronize(*this); - assert(AsyncInfo.Queue == nullptr && + assert((AsyncInfo.PersistentQueue || !AsyncInfo.Queue) && "The device plugin should have nulled the queue to indicate there " "are no outstanding actions!"); break; @@ -271,17 +271,22 @@ static int initLibrary(DeviceTy &Device) { ", name \"%s\"\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), CurrDeviceEntry->size, CurrDeviceEntry->name); - HDTTMap->emplace(new HostDataToTargetTy( - (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, - (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, - (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/, - (uintptr_t)CurrDeviceEntryAddr /*TgtAllocBegin*/, - (uintptr_t)CurrDeviceEntryAddr /*TgtPtrBegin*/, - false /*UseHoldRefCount*/, CurrHostEntry->name, - true /*IsRefCountINF*/)); + auto *Entry = HDTTMap + ->emplace(new HostDataToTargetTy( + (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, + (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, + (uintptr_t)CurrHostEntry->addr + + CurrHostEntry->size /*HstPtrEnd*/, + 
(uintptr_t)CurrDeviceEntryAddr /*TgtAllocBegin*/, + (uintptr_t)CurrDeviceEntryAddr /*TgtPtrBegin*/, + false /*UseHoldRefCount*/, CurrHostEntry->name, + true /*IsRefCountINF*/)) + .first->HDTT; // Notify about the new mapping. - if (Device.notifyDataMapped(CurrHostEntry->addr, CurrHostEntry->size)) + if (Device.notifyDataMapped(CurrHostEntry->addr, CurrDeviceEntryAddr, + CurrHostEntry->size, + Entry->FakeTgtPtrBegin)) return OFFLOAD_FAIL; } } @@ -323,8 +328,8 @@ void handleTargetOutcome(bool Success, ident_t *Loc) { for (auto &Image : PM->deviceImages()) { const char *Start = reinterpret_cast( Image.getExecutableImage().ImageStart); - uint64_t Length = llvm::omp::target::getPtrDiff( - Start, Image.getExecutableImage().ImageEnd); + uint64_t Length = + utils::getPtrDiff(Start, Image.getExecutableImage().ImageEnd); llvm::MemoryBufferRef Buffer(llvm::StringRef(Start, Length), /*Identifier=*/""); @@ -415,9 +420,9 @@ static int32_t getParentIndex(int64_t Type) { return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; } -void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind, +void *targetAllocExplicit(size_t Size, int64_t DeviceNum, int Kind, const char *Name) { - DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size); + DP("Call to %s for device %ld requesting %zu bytes\n", Name, DeviceNum, Size); if (Size <= 0) { DP("Call to %s with non-positive length\n", Name); @@ -432,13 +437,21 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind, return Rc; } + if (checkDeviceAndCtors(DeviceNum, nullptr)) { + DP("Not offloading to device %" PRId64 "\n", DeviceNum); + return Rc; + } + auto DeviceOrErr = PM->getDevice(DeviceNum); if (!DeviceOrErr) FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str()); Rc = DeviceOrErr->allocData(Size, nullptr, Kind); DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc)); - return Rc; + void *FakeHstPtr = nullptr; + if (DeviceOrErr->notifyDataMapped(nullptr, Rc, Size, FakeHstPtr)) + 
return nullptr; + return FakeHstPtr ? FakeHstPtr : Rc; } void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, @@ -464,6 +477,8 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, if (DeviceOrErr->deleteData(DevicePtr, Kind) == OFFLOAD_FAIL) FATAL_MESSAGE(DeviceNum, "%s", "Failed to deallocate device ptr"); + DeviceOrErr->notifyDataUnmapped(nullptr, DevicePtr); + DP("omp_target_free deallocated device ptr\n"); } @@ -670,6 +685,9 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, HasFlagTo, HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo, PointerTpr.getEntry()); void *TgtPtrBegin = TPR.TargetPointer; + if (auto *Entry = TPR.getEntry()) + if (auto *FakeTgtPtrBegin = Entry->FakeTgtPtrBegin) + TgtPtrBegin = FakeTgtPtrBegin; IsHostPtr = TPR.Flags.IsHostPointer; // If data_size==0, then the argument could be a zero-length pointer to // NULL, so getOrAlloc() returning NULL is not an error. @@ -1523,11 +1541,16 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, /*UpdateRefCount=*/false, /*UseHoldRefCount=*/false); TgtPtrBegin = TPR.TargetPointer; + if (auto *Entry = TPR.getEntry()) + if (auto *FakeTgtPtrBegin = Entry->FakeTgtPtrBegin) + TgtPtrBegin = FakeTgtPtrBegin; TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; #ifdef OMPTARGET_DEBUG void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); - DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n", - DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); + DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD + " %s\n", + DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin), + TgtPtrBegin != TPR.TargetPointer ? 
"fake" : ""); #endif } TgtArgsPositions[I] = TgtArgs.size(); diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index 6c590603079c4..9053151e44a78 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -66,7 +66,7 @@ def evaluate_bool_env(env): config.name = 'libomptarget :: ' + config.libomptarget_current_target # suffixes: A list of file extensions to treat as test files. -config.suffixes = ['.c', '.cpp', '.cc', '.f90'] +config.suffixes = ['.c', '.cpp', '.cc', '.f90', '.cu'] # excludes: A list of directories to exclude from the testuites. config.excludes = ['Inputs'] diff --git a/offload/test/offloading/CUDA/basic_api_malloc_free.cu b/offload/test/offloading/CUDA/basic_api_malloc_free.cu new file mode 100644 index 0000000000000..60a51e33a5af9 --- /dev/null +++ b/offload/test/offloading/CUDA/basic_api_malloc_free.cu @@ -0,0 +1,42 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +__global__ void kernel(int *A, int *DevPtr, int N) { + for (int i = 0; i < N; ++i) + DevPtr[i] = 1; + for (int i = 0; i < N; ++i) + *A += DevPtr[i]; +} + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast(llvm_omp_target_alloc_shared(4, DevNo)); + int *DevPtr; + auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int)); + if (Err != cudaSuccess) + return -1; + *Ptr = 0; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0 + kernel<<<1, 1>>>(Ptr, DevPtr, 42); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + Err = cudaFree(DevPtr); + if (Err != cudaSuccess) + return -1; + 
llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/offloading/CUDA/basic_api_memcpy.cu b/offload/test/offloading/CUDA/basic_api_memcpy.cu new file mode 100644 index 0000000000000..088e20ffa9e2b --- /dev/null +++ b/offload/test/offloading/CUDA/basic_api_memcpy.cu @@ -0,0 +1,47 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +__global__ void kernel(int *DevPtr, int N) { + for (int i = 0; i < N; ++i) + DevPtr[i]--; +} + +int main(int argc, char **argv) { + int DevNo = 0; + int Res = 0; + int *DevPtr; + auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int)); + if (Err != cudaSuccess) + return -1; + int HstPtr[42]; + for (int i = 0; i < 42; ++i) { + HstPtr[i] = 2; + } + Err = cudaMemcpy(DevPtr, HstPtr, 42 * sizeof(int), cudaMemcpyHostToDevice); + if (Err != cudaSuccess) + return -1; + printf("Res: %i\n", Res); + // CHECK: Res: 0 + kernel<<<1, 1>>>(DevPtr, 42); + cudaDeviceSynchronize(); + Err = cudaMemcpy(HstPtr, DevPtr, 42 * sizeof(int), cudaMemcpyDeviceToHost); + if (Err != cudaSuccess) + return -1; + for (int i = 0; i < 42; ++i) { + printf("%i : %i\n", i, HstPtr[i]); + Res += HstPtr[i]; + } + printf("Res: %i\n", Res); + // CHECK: Res: 42 + Err = cudaFree(DevPtr); + if (Err != cudaSuccess) + return -1; +} diff --git a/offload/test/offloading/CUDA/basic_api_memset.cu b/offload/test/offloading/CUDA/basic_api_memset.cu new file mode 100644 index 0000000000000..474eb2a46f0a2 --- /dev/null +++ b/offload/test/offloading/CUDA/basic_api_memset.cu @@ -0,0 +1,44 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: 
x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +__global__ void kernel(int *A, int *DevPtr, int N) { + for (int i = 0; i < N; ++i) + *A += DevPtr[i]; + *A *= -1; +} + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast(llvm_omp_target_alloc_shared(4, DevNo)); + int *DevPtr; + auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int)); + if (Err != cudaSuccess) + return -1; + Err = cudaMemset(DevPtr, -1, 42 * sizeof(int)); + if (Err != cudaSuccess) + return -1; + *Ptr = 0; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0 + kernel<<<1, 1>>>(Ptr, DevPtr, 42); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + Err = cudaFree(DevPtr); + if (Err != cudaSuccess) + return -1; + llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/offloading/CUDA/basic_launch.cu b/offload/test/offloading/CUDA/basic_launch.cu new file mode 100644 index 0000000000000..298aa7db83bad --- /dev/null +++ b/offload/test/offloading/CUDA/basic_launch.cu @@ -0,0 +1,32 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t -fopenmp +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +__global__ void square(int *A) { *A = 42; } + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast(llvm_omp_target_alloc_shared(4, DevNo)); + *Ptr = 7; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); 
+ // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7 + square<<<1, 1>>>(Ptr); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu new file mode 100644 index 0000000000000..c47b1a1b83bde --- /dev/null +++ b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu @@ -0,0 +1,34 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t -fopenmp +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +__global__ void square(int *A) { + __scoped_atomic_fetch_add(A, 1, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); +} + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast(llvm_omp_target_alloc_shared(4, DevNo)); + *Ptr = 0; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0 + square<<<7, 6>>>(Ptr); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu new file mode 100644 index 0000000000000..58ff89dcd4aac --- /dev/null +++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu @@ -0,0 +1,43 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic +// RUN: %clang++ -foffload-via-llvm --offload-arch=native 
%s -o %t -fopenmp +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +__global__ void square(int *Dst, short Q, int *Src, short P) { + *Dst = (Src[0] + Src[1]) * (Q + P); + Src[0] = Q; + Src[1] = P; +} + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast(llvm_omp_target_alloc_shared(4, DevNo)); + int *Src = reinterpret_cast(llvm_omp_target_alloc_shared(8, DevNo)); + *Ptr = 7; + Src[0] = -2; + Src[1] = 8; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7 + printf("Src: %i : %i\n", Src[0], Src[1]); + // CHECK: Src: -2 : 8 + square<<<1, 1>>>(Ptr, 3, Src, 4); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + printf("Src: %i : %i\n", Src[0], Src[1]); + // CHECK: Src: 3 : 4 + llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/offloading/CUDA/kernel_tu.cu.inc b/offload/test/offloading/CUDA/kernel_tu.cu.inc new file mode 100644 index 0000000000000..d7d28a109dfc5 --- /dev/null +++ b/offload/test/offloading/CUDA/kernel_tu.cu.inc @@ -0,0 +1 @@ +__global__ void square(int *A) { *A = 42; } diff --git a/offload/test/offloading/CUDA/launch_tu.cu b/offload/test/offloading/CUDA/launch_tu.cu new file mode 100644 index 0000000000000..fa6b1d7692de5 --- /dev/null +++ b/offload/test/offloading/CUDA/launch_tu.cu @@ -0,0 +1,34 @@ +// clang-format off +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t.launch_tu.o -c +// RUN: %clang++ -foffload-via-llvm --offload-arch=native -x cuda %S/kernel_tu.cu.inc -o %t.kernel_tu.o -c +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %t.launch_tu.o %t.kernel_tu.o -o %t +// 
RUN: %t | %fcheck-generic +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +extern __global__ void square(int *A); + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast(llvm_omp_target_alloc_shared(4, DevNo)); + *Ptr = 7; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7 + square<<<1, 1>>>(Ptr); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/sanitizer/global_null.c b/offload/test/sanitizer/global_null.c new file mode 100644 index 0000000000000..91be2cb499c45 --- /dev/null +++ b/offload/test/sanitizer/global_null.c @@ -0,0 +1,29 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +int *Null = 0; +#pragma omp declare target(Null) + +int main(void) { + +#pragma omp target + { + // clang-format off + // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:0x.*]] + // CHECK-NEXT: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> + // CHECK-NEXT: #0 [[PC]] main null.c:[[@LINE+3]] + // CHECK-NEXT: 0x0000000000000000 is located 0 bytes inside of 0-byte region [0x0000000000000000,0x0000000000000000) + // 
clang-format on + *Null = 42; + } +} diff --git a/offload/test/sanitizer/heap_null.c b/offload/test/sanitizer/heap_null.c new file mode 100644 index 0000000000000..e13ad234d21c0 --- /dev/null +++ b/offload/test/sanitizer/heap_null.c @@ -0,0 +1,32 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +// Align lines. + +#include +#include + +int main(void) { + + int *Null = 0; +#pragma omp target + { + // clang-format off + // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:0x.*]] + // CHECK-NEXT: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> + // CHECK-NEXT: #0 [[PC]] main null.c:[[@LINE+3]] + // CHECK-NEXT: 0x0000000000000000 is located 0 bytes inside of 0-byte region [0x0000000000000000,0x0000000000000000) + // clang-format on + *Null = 42; + } +} diff --git a/offload/test/sanitizer/heap_out_of_bounds.c b/offload/test/sanitizer/heap_out_of_bounds.c new file mode 100644 index 0000000000000..37a8d99b5241f --- /dev/null +++ b/offload/test/sanitizer/heap_out_of_bounds.c @@ -0,0 +1,27 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +// Align lines. 
+ +#include +#include + +int main(int argc, char **argv) { + int N = argc > 42 ? 1000 : 100; + double A[N]; +#pragma omp target map(from : A[ : N]) + { + // CHECK: is located 7992 bytes inside of a 800-byte region + A[999] = 3.14; + } +} diff --git a/offload/test/sanitizer/heap_partial_out_of_bounds.c b/offload/test/sanitizer/heap_partial_out_of_bounds.c new file mode 100644 index 0000000000000..981cba3a8eebd --- /dev/null +++ b/offload/test/sanitizer/heap_partial_out_of_bounds.c @@ -0,0 +1,31 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +// Align lines. + +#include "omp.h" +#include +#include + +int main(int argc, char **argv) { + int N = argc > 42 ? 
1000 : 100; + double *A = + (double *)omp_target_alloc(N * sizeof(*A), omp_get_default_device()); + char *C = ((char *)&A[N - 1] + 1); +#pragma omp target is_device_ptr(A, C) + { + // CHECK: is located 793 bytes inside of a 800-byte region + double *D = (double *)C; + *D = 3.14; + } +} diff --git a/offload/test/sanitizer/heap_random.c b/offload/test/sanitizer/heap_random.c new file mode 100644 index 0000000000000..265495f910f69 --- /dev/null +++ b/offload/test/sanitizer/heap_random.c @@ -0,0 +1,22 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +// Align lines. + +int main(void) { + + int X = 0; + int *Random = &X; +#pragma omp target + { *Random = 99; } +} diff --git a/offload/test/sanitizer/null_forced_stack.c b/offload/test/sanitizer/null_forced_stack.c new file mode 100644 index 0000000000000..e59e34b3a0cd2 --- /dev/null +++ b/offload/test/sanitizer/null_forced_stack.c @@ -0,0 +1,43 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O1 +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 -g +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// 
UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +[[clang::optnone]] int *deref(int **P) { return *P; } + +int *bar(int **P) { return deref(P); } + +int main(void) { + +#pragma omp target + { + int *NullPtr = 0; + // clang-format off + // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // CHECK: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // CHECK: #0 [[PC]] omp target (main:[[@LINE-6]]) in :0 + // + // CHECK: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // + // DEBUG: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // DEBUG: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // DEBUG: #0 [[PC]] omp target (main:[[@LINE-12]]) in {{.*}}volatile_stack_null.c:[[@LINE+4]] + // + // DEBUG: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // clang-format on + bar(&NullPtr)[10] = 42; + } +} diff --git a/offload/test/sanitizer/stack_trace_1.c b/offload/test/sanitizer/stack_trace_1.c new file mode 100644 index 0000000000000..7129f67c70900 --- /dev/null +++ b/offload/test/sanitizer/stack_trace_1.c @@ -0,0 +1,43 @@ +// clang-format off +// : %libomptarget-compileopt-generic -fsanitize=offload -O1 +// : not %libomptarget-run-generic 2> %t.out +// : %fcheck-generic --check-prefixes=CHECK < %t.out +// : %libomptarget-compileopt-generic -fsanitize=offload -O3 +// : not %libomptarget-run-generic 2> %t.out +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 -g +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// 
UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +[[clang::optnone]] int deref(int *P) { return *P; } + +[[gnu::always_inline]] int bar(int *P) { return deref(P); } + +int main(void) { + +#pragma omp target + { + int *NullPtr = 0; + // clang-format off + // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // CHECK: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // CHECK: #0 [[PC]] omp target (main:[[@LINE-6]]) in :0 + // + // CHECK: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // + // DEBUG: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // DEBUG: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // DEBUG: #0 [[PC]] omp target (main:[[@LINE-12]]) in {{.*}}volatile_stack_null.c:[[@LINE+4]] + // + // DEBUG: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // clang-format on + bar(NullPtr); + } +} diff --git a/offload/test/sanitizer/stack_trace_multi_path_1.c b/offload/test/sanitizer/stack_trace_multi_path_1.c new file mode 100644 index 0000000000000..58717afe64df5 --- /dev/null +++ b/offload/test/sanitizer/stack_trace_multi_path_1.c @@ -0,0 +1,48 @@ +// clang-format off +// : %libomptarget-compileopt-generic -fsanitize=offload -O1 +// : not %libomptarget-run-generic 2> %t.out +// : %fcheck-generic --check-prefixes=CHECK < %t.out +// : %libomptarget-compileopt-generic -fsanitize=offload -O3 +// : not %libomptarget-run-generic 2> %t.out +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 -g +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu 
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include + +[[clang::optnone]] int deref(int *P) { return *P; } + +[[gnu::noinline]] int bar(int *P) { return deref(P); } +[[gnu::noinline]] int baz(int *P) { return deref(P); } + +int main(void) { + + int *Valid = (int *)omp_target_alloc(4, omp_get_default_device()); +#pragma omp target is_device_ptr(Valid) + { + int *NullPtr = 0; + // clang-format off + // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // CHECK: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // CHECK: #0 [[PC]] omp target (main:[[@LINE-6]]) in :0 + // + // CHECK: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // + // DEBUG: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // DEBUG: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // DEBUG: #0 [[PC]] omp target (main:[[@LINE-12]]) in {{.*}}volatile_stack_null.c:[[@LINE+4]] + // + // DEBUG: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // clang-format on + bar(Valid); + baz(NullPtr); + } +} diff --git a/offload/test/sanitizer/stack_trace_multi_path_many.cpp b/offload/test/sanitizer/stack_trace_multi_path_many.cpp new file mode 100644 index 0000000000000..5f6871e825e54 --- /dev/null +++ b/offload/test/sanitizer/stack_trace_multi_path_many.cpp @@ -0,0 +1,42 @@ +// clang-format off +// : %libomptarget-compileoptxx-generic -fsanitize=offload -O1 +// : not %libomptarget-run-generic 2> %t.out +// : %fcheck-generic --check-prefixes=CHECK < %t.out +// : %libomptarget-compileoptxx-generic -fsanitize=offload -O3 +// : not %libomptarget-run-generic 2> %t.out +// RUN: %libomptarget-compileoptxx-generic -fsanitize=offload -O3 -g +// RUN: not 
%libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include + +template [[clang::optnone]] T deref(T *P) { return *P; } + +template [[gnu::noinline]] T level(T *P) { + if constexpr (LEVEL > 1) + return level(P) + level(P); + if constexpr (LEVEL > 0) + return level(P); + return deref(P); +} + +int main(void) { + + int *ValidInt = (int *)omp_target_alloc(4, omp_get_default_device()); +#pragma omp target is_device_ptr(ValidInt) + { + level<12>(ValidInt); + short *ValidShort = ((short *)ValidInt) + 2; + level<12>(ValidShort); + char *Invalid = ((char *)ValidInt) + 4; + level<12>(Invalid); + } +} diff --git a/offload/test/sanitizer/sycl-tests/bad-free/bad-free-host.c b/offload/test/sanitizer/sycl-tests/bad-free/bad-free-host.c new file mode 100644 index 0000000000000..0b5c4c114eea3 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/bad-free/bad-free-host.c @@ -0,0 +1,13 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +int main() { + int FakePtr[3] = {1, 2, 3}; + int Device = omp_get_default_device(); + omp_target_free(FakePtr, Device); + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/bad-free/bad-free-minus1.c b/offload/test/sanitizer/sycl-tests/bad-free/bad-free-minus1.c new file mode 100644 index 0000000000000..c0b23584859fe --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/bad-free/bad-free-minus1.c @@ -0,0 +1,18 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + 
+#include +#include + +int main() { + int N = 100; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + omp_target_free(DevPtr - 1, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/bad-free/bad-free-plus1.c b/offload/test/sanitizer/sycl-tests/bad-free/bad-free-plus1.c new file mode 100644 index 0000000000000..f8f688572c846 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/bad-free/bad-free-plus1.c @@ -0,0 +1,18 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +int main() { + int N = 100; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + omp_target_free(DevPtr + 1, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/double-free/double-free.c b/offload/test/sanitizer/sycl-tests/double-free/double-free.c new file mode 100644 index 0000000000000..9e1e2a6ceb26a --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/double-free/double-free.c @@ -0,0 +1,19 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +int main() { + int N = 100; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + omp_target_free(DevPtr, Device); + omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/DeviceGlobal/device-global.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/DeviceGlobal/device-global.c new file mode 100644 index 0000000000000..8d134ca219df0 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/DeviceGlobal/device-global.c @@ -0,0 +1,20 
@@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +// Port of https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/ +// AddressSanitizer/out-of-bounds/DeviceGlobal/device_global.cpp + +#include +#include + +#define ITEM_COUNT 3 + +char dev_global[5]; +#pragma omp declare target(dev_global) + +int main() { +#pragma omp target + { dev_global[8] = 42; } + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-char.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-char.c new file mode 100644 index 0000000000000..ec6a60d76a382 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-char.c @@ -0,0 +1,27 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +int main() { + int N = 10; + int N_SZ = sizeof(char) * N; + + int Device = omp_get_default_device(); + + char *DevPtr = (char *)omp_target_alloc(N_SZ, Device); + +#pragma omp target is_device_ptr(DevPtr) + { +#pragma omp parallel for + for (int i = 0; i < N; i++) { + DevPtr[i] = '*'; + } + } + + omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-double.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-double.c new file mode 100644 index 0000000000000..f14a277706916 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-double.c @@ -0,0 +1,27 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +int main() { + int N = 10; + int N_SZ = sizeof(double) * N; + + int Device = omp_get_default_device(); + + 
double *DevPtr = (double *)omp_target_alloc(N_SZ, Device); + +#pragma omp target is_device_ptr(DevPtr) + { +#pragma omp parallel for + for (int i = 0; i < N; i++) { + DevPtr[i] = 1.23; + } + } + + omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-func.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-func.c new file mode 100644 index 0000000000000..ca8d9807ce42b --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-func.c @@ -0,0 +1,29 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +__attribute__((noinline)) void foo(int *array, int i) { array[i] = 1; } + +int main() { + int N = 10; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + +#pragma omp target is_device_ptr(DevPtr) + { +#pragma omp parallel for + for (int i = 0; i < N; i++) { + foo(DevPtr, i); + } + } + + omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-int.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-int.c new file mode 100644 index 0000000000000..ed76dc221ae25 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-int.c @@ -0,0 +1,27 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +int main() { + int N = 10; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + +#pragma omp target is_device_ptr(DevPtr) + { +#pragma omp parallel for + for (int i = 0; i < N; i++) { + DevPtr[i] = 2; + } + } + + 
omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-short.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-short.c new file mode 100644 index 0000000000000..79a7f69a0bcf3 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/USM/parallel-for-short.c @@ -0,0 +1,27 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +int main() { + int N = 10; + int N_SZ = sizeof(short) * N; + + int Device = omp_get_default_device(); + + short *DevPtr = (short *)omp_target_alloc(N_SZ, Device); + +#pragma omp target is_device_ptr(DevPtr) + { +#pragma omp parallel for + for (int i = 0; i < N; i++) { + DevPtr[i] = 2; + } + } + + omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer1d.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer1d.c new file mode 100644 index 0000000000000..0423a2dcf0a17 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer1d.c @@ -0,0 +1,30 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include +#include + +#define X 3 + +int main() { + int A[X] = {1, 2, 3}; + +#pragma omp target map(tofrom : A) + { + for (int i = 0; i < X; i++) { + A[i] = A[i] * 2; + } + } + + for (int i = 0; i < X; i++) { + printf("%d\n", A[i]); + } + + return 0; +} + +// CHECK: 2 +// CHECK: 4 +// CHECK: 6 diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer2d.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer2d.c new file mode 100644 index 0000000000000..e6512628c0188 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer2d.c @@ -0,0 
+1,38 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include +#include + +#define X 2 +#define Y 3 + +int main() { + int A[X][Y] = {{1, 2, 3}, {4, 5, 6}}; + +#pragma omp target map(tofrom : A) + { + for (int i = 0; i < X; i++) { + for (int j = 0; j < Y; j++) { + A[i][j] = A[i][j] * 2; + } + } + } + + for (int i = 0; i < X; i++) { + for (int j = 0; j < Y; j++) { + printf("%d", A[i][j]); + if (j + 1 != Y) { + printf(" "); + } + } + printf("\n"); + } + + return 0; +} + +// CHECK: 2 4 6 +// CHECK-NEXT: 8 10 12 diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer3d.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer3d.c new file mode 100644 index 0000000000000..0f4022e2c6cf7 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer3d.c @@ -0,0 +1,55 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include +#include + +#define X 4 +#define Y 3 +#define Z 2 + +int main() { + int A[X][Y][Z] = { + {{0, 1}, {2, 3}, {4, 5}}, + {{6, 7}, {8, 9}, {10, 11}}, + {{12, 13}, {14, 15}, {16, 17}}, + {{18, 19}, {20, 21}, {22, 23}}, + }; + +#pragma omp target map(tofrom : A) + { + for (int i = 0; i < X; i++) { + for (int j = 0; j < Y; j++) { + for (int k = 0; k < Z; k++) { + A[i][j][k] = A[i][j][k] * 2; + } + } + } + } + + for (int i = 0; i < X; i++) { + for (int j = 0; j < Y; j++) { + printf("("); + for (int k = 0; k < Z; k++) { + printf("%d", A[i][j][k]); + if (k + 1 != Z) { + printf(","); + } + } + printf(")"); + if (j + 1 != Y) { + printf(","); + } + } + printf("\n"); + } + + return 0; +} + +// CHECK: (0,2),(4,6),(8,10) +// CHECK: (12,14),(16,18),(20,22) +// CHECK: (24,26),(28,30),(32,34) +// CHECK: (36,38),(40,42),(44,46) diff --git 
a/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer_malloc.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer_malloc.c new file mode 100644 index 0000000000000..a9b84ae62a05a --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/buffer/buffer_malloc.c @@ -0,0 +1,41 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include +#include + +int main() { + int N = 10; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *Buffer = (int *)malloc(N_SZ); + +#pragma omp target map(tofrom : Buffer[0 : N]) + { + for (int i = 0; i < N; i++) { + Buffer[i] = i; + } + } + + for (int i = 0; i < N; i++) { + printf("%d\n", Buffer[i]); + } + free(Buffer); + + return 0; +} + +// CHECK: 0 +// CHECK-NEXT: 1 +// CHECK-NEXT: 2 +// CHECK-NEXT: 3 +// CHECK-NEXT: 4 +// CHECK-NEXT: 5 +// CHECK-NEXT: 6 +// CHECK-NEXT: 7 +// CHECK-NEXT: 8 +// CHECK-NEXT: 9 diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/local/local-accessor-basic.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/local/local-accessor-basic.c new file mode 100644 index 0000000000000..72742d43b43e3 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/local/local-accessor-basic.c @@ -0,0 +1,30 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +// Port of https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/ +// AddressSanitizer/out-of-bounds/local/local_accessor_basic.cpp + +#include +#include + +#define ITEM_COUNT 3 + +int main() { + int N_SZ = sizeof(int) * ITEM_COUNT; + + int Device = omp_get_default_device(); + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + +#pragma omp target is_device_ptr(DevPtr1) + { + int T1[ITEM_COUNT] = {0}; + for (int i = 0; i < ITEM_COUNT; i++) 
{ + DevPtr[i] = T1[i + 1]; + } + } + + omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/local/local-accessor-function.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/local/local-accessor-function.c new file mode 100644 index 0000000000000..427d7abe94e70 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/local/local-accessor-function.c @@ -0,0 +1,38 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +// Port of https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/ +// AddressSanitizer/out-of-bounds/local/local_accessor_function.cpp + +#include +#include + +#define ITEM_COUNT 3 + +__attribute__((noinline)) void foo(int *dest, const int *source1, + const int *source2, const int *source3, + int index) { + dest[index] = source1[index] + source2[index] + source3[index + 1]; +} + +int main() { + int N_SZ = sizeof(int) * ITEM_COUNT; + + int Device = omp_get_default_device(); + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + +#pragma omp target is_device_ptr(DevPtr) + { + int T1[ITEM_COUNT] = {0}; + int T2[ITEM_COUNT] = {0}; + int T3[ITEM_COUNT] = {0}; + for (int i = 0; i < ITEM_COUNT; i++) { + foo(DevPtr, T1, T2, T3, i); + } + } + + omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/local/local-accessor-multiargs.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/local/local-accessor-multiargs.c new file mode 100644 index 0000000000000..ae2dde735371e --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/local/local-accessor-multiargs.c @@ -0,0 +1,35 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +// Port of https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/ 
+// AddressSanitizer/out-of-bounds/local/local_accessor_multiargs.cpp + +#include +#include + +#define ITEM_COUNT 3 + +int main() { + int N_SZ = sizeof(int) * ITEM_COUNT; + + int Device = omp_get_default_device(); + int *DevPtr1 = (int *)omp_target_alloc(N_SZ, Device); + int *DevPtr2 = (int *)omp_target_alloc(N_SZ, Device); + +#pragma omp target is_device_ptr(DevPtr1, DevPtr2) + { + int T1[ITEM_COUNT] = {0}; + int T2[ITEM_COUNT] = {0}; + int T3[ITEM_COUNT] = {0}; + for (int i = 0; i < ITEM_COUNT; i++) { + DevPtr1[i] = T1[i] + T2[i] + T3[i]; + DevPtr2[i] = T1[i] + T2[i + 1] + T3[i]; + } + } + + omp_target_free(DevPtr1, Device); + omp_target_free(DevPtr2, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/out-of-bounds/local/multiple-source.c b/offload/test/sanitizer/sycl-tests/out-of-bounds/local/multiple-source.c new file mode 100644 index 0000000000000..55adfcae18fb4 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/out-of-bounds/local/multiple-source.c @@ -0,0 +1,41 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +// Port of https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/ +// AddressSanitizer/out-of-bounds/local/multiple_source.cpp + +#include +#include + +#define ITEM_COUNT 3 + +__attribute__((noinline)) void foo(int *DevPtr) { +#pragma omp target is_device_ptr(DevPtr) + { + int T1[ITEM_COUNT] = {0}; + for (int i = 0; i < ITEM_COUNT; i++) { + DevPtr[i] = T1[i + 1]; + } + } +} + +int main() { + int N_SZ = sizeof(int) * ITEM_COUNT; + + int Device = omp_get_default_device(); + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + +#pragma omp target is_device_ptr(DevPtr) + { + int T1[ITEM_COUNT] = {0}; + for (int i = 0; i < ITEM_COUNT; i++) { + DevPtr[i] = T1[i]; + } + } + foo(DevPtr); + + omp_target_free(DevPtr, Device); + + return 0; +} diff --git 
a/offload/test/sanitizer/sycl-tests/use-after-free/quarantine-free.c b/offload/test/sanitizer/sycl-tests/use-after-free/quarantine-free.c new file mode 100644 index 0000000000000..eb62f9e61c79c --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/use-after-free/quarantine-free.c @@ -0,0 +1,28 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +// Port of +// https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/AddressSanitizer +// /use-after-free/quarantine-free.cpp + +#include +#include + +int main() { + int N = 100; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + omp_target_free(DevPtr, Device); + + DevPtr = (int *)omp_target_alloc(N_SZ, Device); + omp_target_free(DevPtr, Device); + + DevPtr = (int *)omp_target_alloc(N_SZ, Device); + omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/use-after-free/quarantine-no-free.c b/offload/test/sanitizer/sycl-tests/use-after-free/quarantine-no-free.c new file mode 100644 index 0000000000000..7af5b8751c68c --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/use-after-free/quarantine-no-free.c @@ -0,0 +1,31 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +// Port of +// https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/AddressSanitizer +// /use-after-free/quarantine-no-free.cpp + +#include +#include + +int main() { + int N = 100; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + omp_target_free(DevPtr, Device); + + DevPtr = (int *)omp_target_alloc(N_SZ, Device); + omp_target_free(DevPtr, Device); + + DevPtr = (int *)omp_target_alloc(N_SZ, Device); + 
omp_target_free(DevPtr, Device); + +#pragma omp target is_device_ptr(DevPtr) + { DevPtr[0] = 0; } + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/use-after-free/use-after-free.c b/offload/test/sanitizer/sycl-tests/use-after-free/use-after-free.c new file mode 100644 index 0000000000000..89ff2d2f1f080 --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/use-after-free/use-after-free.c @@ -0,0 +1,23 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: not %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +int main() { + int N = 100; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); + omp_target_free(DevPtr, Device); + +#pragma omp target is_device_ptr(DevPtr) + for (int i = 0; i < N; i++) { + DevPtr[i] = i; + } + + return 0; +} diff --git a/offload/test/sanitizer/sycl-tests/use-after-free/use-before-free.c b/offload/test/sanitizer/sycl-tests/use-after-free/use-before-free.c new file mode 100644 index 0000000000000..d27709db8fe3e --- /dev/null +++ b/offload/test/sanitizer/sycl-tests/use-after-free/use-before-free.c @@ -0,0 +1,25 @@ +// RUN: %libomptarget-compileopt-generic -fsanitize=offload +// RUN: %libomptarget-run-generic 2>&1 > %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out + +#include +#include + +int main() { + int N = 100; + int N_SZ = sizeof(int) * N; + + int Device = omp_get_default_device(); + + int *DevPtr = (int *)omp_target_alloc(N_SZ, Device); +#pragma omp target is_device_ptr(DevPtr) + for (int i = 0; i < N; i++) { + DevPtr[i] = i; + } + for (int i = 0; i < N; i++) { + printf("%d\n", DevPtr[i]); + } + omp_target_free(DevPtr, Device); + + return 0; +} diff --git a/offload/test/sanitizer/volatile_stack_null.c b/offload/test/sanitizer/volatile_stack_null.c new file mode 100644 index 0000000000000..08542c3516f28 --- /dev/null +++ 
b/offload/test/sanitizer/volatile_stack_null.c @@ -0,0 +1,39 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O1 +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 -g +// RUN: not %libomptarget-run-generic 2> %t.out +// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +int main(void) { + +#pragma omp target + { + volatile int *Null = 0; + // clang-format off + // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // CHECK: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // CHECK: #0 [[PC]] omp target (main:[[@LINE-6]]) in :0 + // + // CHECK: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // + // DEBUG: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]] + // DEBUG: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap) + // DEBUG: #0 [[PC]] omp target (main:[[@LINE-12]]) in {{.*}}volatile_stack_null.c:[[@LINE+4]] + // + // DEBUG: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000) + // clang-format on + *Null = 42; + } +}