From 817a0c9b4471f92d1ec4f94930447f31fd163788 Mon Sep 17 00:00:00 2001
From: "Sarnie, Nick" <nick.sarnie@intel.com>
Date: Mon, 15 Jul 2024 12:58:31 -0700
Subject: [PATCH 1/9] weak

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
---
 libdevice/device.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/libdevice/device.h b/libdevice/device.h
index 360af54f9b4c4..e2f0da6e3c082 100644
--- a/libdevice/device.h
+++ b/libdevice/device.h
@@ -17,9 +17,9 @@
 
 #if defined(__SPIR__) || defined(__SPIRV__) || defined(__NVPTX__)
 #ifdef __SYCL_DEVICE_ONLY__
-#define DEVICE_EXTERNAL SYCL_EXTERNAL __attribute__((weak))
+#define DEVICE_EXTERNAL SYCL_EXTERNAL
 #else // __SYCL_DEVICE_ONLY__
-#define DEVICE_EXTERNAL __attribute__((weak))
+#define DEVICE_EXTERNAL
 #endif // __SYCL_DEVICE_ONLY__
 
 #define DEVICE_EXTERN_C DEVICE_EXTERNAL EXTERN_C

From 70ee0e04c960c06fc69323f7597232700cb34957 Mon Sep 17 00:00:00 2001
From: "Sarnie, Nick" <nick.sarnie@intel.com>
Date: Mon, 15 Jul 2024 13:23:09 -0700
Subject: [PATCH 2/9] thin

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
---
 clang/lib/Driver/ToolChains/Clang.cpp         |   6 +-
 .../tools/clang-linker-wrapper/CMakeLists.txt |   1 +
 .../ClangLinkerWrapper.cpp                    | 510 ++++++++++++++----
 llvm/include/llvm/Object/OffloadBinary.h      |  11 +
 .../include/llvm/SYCLLowerIR/ModuleSplitter.h |   1 +
 llvm/lib/LTO/LTO.cpp                          |   4 +-
 llvm/lib/SYCLLowerIR/ModuleSplitter.cpp       |  52 +-
 7 files changed, 448 insertions(+), 137 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 8df597de8f5ff..216b216ba797b 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -11231,8 +11231,12 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
 
     bool IsUsingLTO = D.isUsingLTO(/*IsDeviceOffloadAction=*/true);
     auto LTOMode = D.getLTOMode(/*IsDeviceOffloadAction=*/true);
-    if (IsUsingLTO && LTOMode == LTOK_Thin)
+    if (IsUsingLTO && LTOMode == LTOK_Thin) {
       CmdArgs.push_back(Args.MakeArgString("-sycl-thin-lto"));
+      // TODO: Pass the same value for this argument once we start using it
+      // for non-thinLTO.
+      CmdArgs.push_back(Args.MakeArgString("-sycl-module-split-mode=auto"));
+    }
 
     if (Args.hasArg(options::OPT_fsycl_embed_ir))
       CmdArgs.push_back(Args.MakeArgString("-sycl-embed-ir"));
diff --git a/clang/tools/clang-linker-wrapper/CMakeLists.txt b/clang/tools/clang-linker-wrapper/CMakeLists.txt
index 9dc1f244f2802..8ad197d85d535 100644
--- a/clang/tools/clang-linker-wrapper/CMakeLists.txt
+++ b/clang/tools/clang-linker-wrapper/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
+  BitReader
   BitWriter
   Core
   BinaryFormat
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index e7294a314bc8a..7b38cebe77eef 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -42,6 +42,7 @@
 #include "llvm/Option/Option.h"
 #include "llvm/Passes/PassPlugin.h"
 #include "llvm/Remarks/HotnessThresholdParser.h"
+#include "llvm/SYCLLowerIR/ComputeModuleRuntimeInfo.h"
 #include "llvm/SYCLLowerIR/ModuleSplitter.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Errc.h"
@@ -590,6 +591,58 @@ static Error getSYCLDeviceLibs(SmallVector<std::string, 16> &DeviceLibFiles,
   return Error::success();
 }
 
+/// Appends device-library OffloadFiles matching \p Triple to \p DeviceLibs.
+static Error getDeviceLibsForLTO(SmallVector<OffloadFile> &DeviceLibs,
+                                 const ArgList &Args,
+                                 const llvm::Triple Triple) {
+  // TODO: Fix copy paste
+  SmallVector<std::string, 16> DeviceLibFiles;
+  if (Error Err = sycl::getSYCLDeviceLibs(DeviceLibFiles, Args))
+    return Err;
+
+  auto processFile = [&](StringRef File) {
+    auto BufferOrErr = MemoryBuffer::getFile(File);
+    if (!BufferOrErr)
+      return createFileError(File, BufferOrErr.getError());
+    auto Buffer = std::move(*BufferOrErr);
+    SmallVector<OffloadFile> Candidates;
+    if (Error Err =
+            extractOffloadBinaries(Buffer->getMemBufferRef(), Candidates))
+      return Err;
+    for (OffloadFile &OffF : Candidates)
+      if (llvm::Triple(OffF.getBinary()->getTriple()) == Triple)
+        DeviceLibs.emplace_back(std::move(OffF));
+    return Error(Error::success());
+  };
+
+  for (auto &File : DeviceLibFiles) {
+    if (Error Err = processFile(File))
+      return Err;
+  }
+
+  // For NVPTX backend we need to also link libclc and CUDA libdevice.
+  if (Triple.isNVPTX()) {
+    if (Arg *A = Args.getLastArg(OPT_sycl_nvptx_device_lib_EQ)) {
+      if (A->getValues().size() == 0)
+        return createStringError(
+            inconvertibleErrorCode(),
+            "Number of device library files cannot be zero.");
+      for (StringRef Val : A->getValues()) {
+        SmallString<128> LibName(Val);
+        if (llvm::sys::fs::exists(LibName)) {
+          if (auto Err = processFile(LibName))
+            return Err;
+        } else
+          return createStringError(
+              inconvertibleErrorCode(),
+              std::string(LibName) +
+                  " SYCL device library file for NVPTX is not found.");
+      }
+    }
+  }
+  return Error::success();
+}
+
 /// This routine is used to convert SPIR-V input files into LLVM IR files.
 /// 'llvm-spirv -r' command is used for this purpose.
 /// If input is not a SPIR-V file, then the original file is returned.
@@ -625,6 +678,25 @@ static Expected<StringRef> convertSPIRVToIR(StringRef Filename,
   return *TempFileOrErr;
 }
 
+static bool considerOnlyKernelsAsEntryPoints(const ArgList &Args,
+                                             const llvm::Triple Triple) {
+  // The device triple matching the host triple is treated as SYCL native CPU.
+  const bool IsNativeCPU =
+      llvm::Triple(Args.getLastArgValue(OPT_host_triple_EQ)) == Triple;
+  const bool KeepUnusedExternalFuncs =
+      Args.hasFlag(OPT_no_sycl_remove_unused_external_funcs,
+                   OPT_sycl_remove_unused_external_funcs, false);
+  // On Intel targets non-kernel entry points only add code for the device
+  // compiler to handle. TODO: Try to extend this feature for non-Intel GPUs.
+  const bool NotNvidiaOrAMD = !Triple.isNVPTX() && !Triple.isAMDGPU();
+  return !KeepUnusedExternalFuncs && !IsNativeCPU && NotNvidiaOrAMD;
+}
+
+bool isSYCLThinLTO(const ArgList &Args, const llvm::Triple Triple) {
+  // Only SPIR/SPIR-V triples qualify for now. TODO: Support CUDA/HIP
+  return Triple.isSPIROrSPIRV() ? Args.hasArg(OPT_sycl_thin_lto) : false;
+}
+
 /// Add any sycl-post-link options that rely on a specific Triple in addition
 /// to user supplied options.
 /// NOTE: Any changes made here should be reflected in the similarly named
@@ -661,10 +733,7 @@ getTripleBasedSYCLPostLinkOpts(const ArgList &Args,
   // because it only increases amount of code for device compiler to handle,
   // without any actual benefits.
   // TODO: Try to extend this feature for non-Intel GPUs.
-  if ((!Args.hasFlag(OPT_no_sycl_remove_unused_external_funcs,
-                     OPT_sycl_remove_unused_external_funcs, false) &&
-       !SYCLNativeCPU) &&
-      !Triple.isNVPTX() && !Triple.isAMDGPU())
+  if (considerOnlyKernelsAsEntryPoints(Args, Triple))
     PostLinkArgs.push_back("-emit-only-kernels-as-entry-points");
 
   if (!Triple.isAMDGCN())
@@ -677,7 +746,7 @@ getTripleBasedSYCLPostLinkOpts(const ArgList &Args,
   bool SplitEsimd =
       Args.hasFlag(OPT_sycl_device_code_split_esimd,
                    OPT_no_sycl_device_code_split_esimd, SplitEsimdByDefault);
-  if (!Args.hasArg(OPT_sycl_thin_lto))
+  if (!isSYCLThinLTO(Args, Triple))
     PostLinkArgs.push_back("-symbols");
   // Specialization constant info generation is mandatory -
   // add options unconditionally
@@ -881,27 +950,33 @@ getTripleBasedSPIRVTransOpts(const ArgList &Args,
   TranslatorArgs.push_back(Args.MakeArgString(ExtArg));
 }
 
+void computeLLVMToSPIRVTranslationToolArgs(const ArgList &Args,
+                                           SmallVector<StringRef, 8> &CmdArgs) {
+  // Triple-dependent options first, then the space-separated user options.
+  getTripleBasedSPIRVTransOpts(
+      Args, CmdArgs, llvm::Triple(Args.getLastArgValue(OPT_triple_EQ)));
+  const Arg *UserOpts = Args.getLastArg(OPT_llvm_spirv_options_EQ);
+  const StringRef UserOptStr = UserOpts ? UserOpts->getValue() : StringRef();
+  UserOptStr.split(CmdArgs, " ", /* MaxSplit = */ -1,
+                   /* KeepEmpty = */ false);
+}
+
 /// Run LLVM to SPIR-V translation.
 /// Converts 'File' from LLVM bitcode to SPIR-V format using llvm-spirv tool.
 /// 'Args' encompasses all arguments required for linking and wrapping device
 /// code and will be parsed to generate options required to be passed into the
 /// llvm-spirv tool.
-static Expected<StringRef> runLLVMToSPIRVTranslation(StringRef File,
-                                                     const ArgList &Args) {
+
+static Expected<StringRef>
+runLLVMToSPIRVTranslation(StringRef File,
+                          SmallVectorImpl<StringRef> &&CmdArgs) {
   Expected<std::string> LLVMToSPIRVPath =
       findProgram("llvm-spirv", {getMainExecutable("llvm-spirv")});
   if (!LLVMToSPIRVPath)
     return LLVMToSPIRVPath.takeError();
 
-  SmallVector<StringRef, 8> CmdArgs;
-  CmdArgs.push_back(*LLVMToSPIRVPath);
-  const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
-  getTripleBasedSPIRVTransOpts(Args, CmdArgs, Triple);
-  StringRef LLVMToSPIRVOptions;
-  if (Arg *A = Args.getLastArg(OPT_llvm_spirv_options_EQ))
-    LLVMToSPIRVOptions = A->getValue();
-  LLVMToSPIRVOptions.split(CmdArgs, " ", /* MaxSplit = */ -1,
-                           /* KeepEmpty = */ false);
+  CmdArgs.insert(CmdArgs.begin(), (*LLVMToSPIRVPath));
+
   CmdArgs.push_back("-o");
 
   // Create a new file to write the translated file to.
@@ -941,6 +1016,13 @@ static Expected<StringRef> runLLVMToSPIRVTranslation(StringRef File,
   return *TempFileOrErr;
 }
 
+static Expected<StringRef> runLLVMToSPIRVTranslation(StringRef File,
+                                                     const ArgList &Args) {
+  SmallVector<StringRef, 8> TranslatorArgs; // options only; callee adds tool
+  computeLLVMToSPIRVTranslationToolArgs(Args, TranslatorArgs);
+  return runLLVMToSPIRVTranslation(File, std::move(TranslatorArgs));
+}
+
 /// Adds all AOT backend options required for SYCL AOT compilation step to
 /// 'CmdArgs'.
 /// 'Args' encompasses all arguments required for linking and wrapping device
@@ -1350,6 +1432,44 @@ static Expected<StringRef> linkDevice(ArrayRef<StringRef> InputFiles,
   return *DeviceLinkedFile;
 }
 
+/// Derives the global binary image properties from the post-link options.
+llvm::sycl::GlobalBinImageProps
+computeGlobalBinProps(const ArgList &Args, const llvm::Triple Triple) {
+  // Most properties follow from the triple-based sycl-post-link options.
+  SmallVector<StringRef, 8> PostLinkOpts;
+  getTripleBasedSYCLPostLinkOpts(Args, PostLinkOpts, Triple);
+  auto HasOpt = [&PostLinkOpts](StringRef Opt) {
+    for (StringRef Candidate : PostLinkOpts)
+      if (Candidate == Opt)
+        return true;
+    return false;
+  };
+
+  // DeviceGlobals is not triple-based, so it will be present in Args.
+  const Arg *UserOpts = Args.getLastArg(OPT_sycl_post_link_options_EQ);
+  const bool DeviceGlobals =
+      UserOpts && StringRef(UserOpts->getValue()).contains("-device-globals");
+
+  return {HasOpt("-emit-param-info"), HasOpt("-emit-program-metadata"),
+          HasOpt("-emit-exported-symbols"), HasOpt("-emit-imported-symbols"),
+          DeviceGlobals};
+}
+
+/// Fails unless \p M's LTO info marks it as thinLTO; also rejects -O0.
+Error validateThinLTOModule(BitcodeModule &M, const ArgList &Args) {
+  Expected<BitcodeLTOInfo> LTOInfo = M.getLTOInfo();
+  if (!LTOInfo) // Take the Error: a failed Expected must not be dropped.
+    return LTOInfo.takeError();
+  if (!LTOInfo->IsThinLTO)
+    return createStringError(
+        "All code must be compiled with -foffload-lto=thin");
+  // For O0 we don't run function importing so it defeats the whole point of
+  // thinLTO. Maybe lift this by enabling only the passes required to import.
+  if (Args.getLastArgValue(OPT_opt_level, "") == "O0")
+    return createStringError("O0 is not supported");
+  return Error::success();
+}
+
 } // namespace sycl
 
 namespace generic {
@@ -1566,6 +1686,8 @@ std::vector<std::string> getTargetFeatures(ArrayRef<OffloadFile> InputFiles) {
 template <typename ModuleHook = function_ref<bool(size_t, const Module &)>>
 std::unique_ptr<lto::LTO> createLTO(
     const ArgList &Args, const std::vector<std::string> &Features,
+    SmallVectorImpl<OffloadFile> &BitcodeInputFiles,
+    std::vector<std::string> ModulesToCompile = {},
     ModuleHook Hook = [](size_t, const Module &) { return true; }) {
   const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
   // We need to remove AMD's target-id from the processor if present.
@@ -1595,6 +1717,14 @@ std::unique_ptr<lto::LTO> createLTO(
   Conf.OptLevel = OptLevel[1] - '0';
   Conf.DefaultTriple = Triple.getTriple();
 
+  // We need to set up the backend to use thinLTO
+  // even if we don't actually use it, and there is no
+  // backend for the spir64 triple, so override it to
+  // the SPIR-V backend.
+  // TODO: Remove once SYCL uses the SPIR-V backend.
+  if (sycl::isSYCLThinLTO(Args, Triple))
+    Conf.OverrideTriple = "spirv64-unknown-unknown";
+
   // TODO: Should we complain about combining --opt-level and -passes, as opt
   // does?  That might be too limiting in clang-linker-wrapper, so for now we
   // just warn in the help entry for -passes that the default<O?> corresponding
@@ -1611,11 +1741,21 @@ std::unique_ptr<lto::LTO> createLTO(
 
   Conf.PTO.LoopVectorization = Conf.OptLevel > 1;
   Conf.PTO.SLPVectorization = Conf.OptLevel > 1;
-
+  std::string TempName = (sys::path::filename(ExecutableName) + "." +
+                          Triple.getTriple() + "." + Arch)
+                             .str();
+  auto PreCodeGenSaveTemps = [=](size_t Task, const Module &M) {
+    std::string File =
+        !Task ? TempName + ".postopt.bc"
+              : TempName + "." + std::to_string(Task) + ".postopt.bc";
+    error_code EC;
+    raw_fd_ostream LinkedBitcode(File, EC, sys::fs::OF_None);
+    if (EC)
+      reportError(errorCodeToError(EC));
+    WriteBitcodeToFile(M, LinkedBitcode);
+    return true;
+  };
   if (SaveTemps) {
-    std::string TempName = (sys::path::filename(ExecutableName) + "." +
-                            Triple.getTriple() + "." + TargetID)
-                               .str();
     Conf.PostInternalizeModuleHook = [=](size_t Task, const Module &M) {
       std::string File =
           !Task ? TempName + ".postlink.bc"
@@ -1627,17 +1767,7 @@ std::unique_ptr<lto::LTO> createLTO(
       WriteBitcodeToFile(M, LinkedBitcode);
       return true;
     };
-    Conf.PreCodeGenModuleHook = [=](size_t Task, const Module &M) {
-      std::string File =
-          !Task ? TempName + ".postopt.bc"
-                : TempName + "." + std::to_string(Task) + ".postopt.bc";
-      error_code EC;
-      raw_fd_ostream LinkedBitcode(File, EC, sys::fs::OF_None);
-      if (EC)
-        reportError(errorCodeToError(EC));
-      WriteBitcodeToFile(M, LinkedBitcode);
-      return true;
-    };
+    Conf.PreCodeGenModuleHook = PreCodeGenSaveTemps;
   }
   Conf.PostOptModuleHook = Hook;
   Conf.CGFileType = (Triple.isNVPTX() || SaveTemps)
@@ -1646,6 +1776,97 @@ std::unique_ptr<lto::LTO> createLTO(
 
   // TODO: Handle remark files
   Conf.HasWholeProgramVisibility = Args.hasArg(OPT_whole_program);
+  if (sycl::isSYCLThinLTO(Args, Triple)) {
+    // Passing Args to each thinLTO thread causes crashes, so compute everything
+    // we can here.
+    const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
+    bool OnlyKernelsAsEntryPoints =
+        sycl::considerOnlyKernelsAsEntryPoints(Args, Triple);
+    auto GlobalBinProps = sycl::computeGlobalBinProps(Args, Triple);
+    SmallVector<StringRef, 8> SPIRVArgs;
+    sycl::computeLLVMToSPIRVTranslationToolArgs(Args, SPIRVArgs);
+    Conf.PreCodeGenModuleHook = [=, &BitcodeInputFiles](unsigned Task,
+                                                        const Module &M) {
+      // This is the main part of SYCL LTO handling.
+      // Here we process the IR from each BC file, compute module
+      // properties and the module symbol table, convert to SPV (using the
+      // translator for now) and save required information for binary created
+      // inside the OffloadFile.
+
+      assert(Task != 0 && "Unexpected task");
+      auto &OffloadF = BitcodeInputFiles[Task - 1];
+      if (OffloadF.getBinary()->getOffloadKind() != OFK_SYCL) {
+        if (SaveTemps)
+          PreCodeGenSaveTemps(Task, M);
+        return true;
+      }
+
+      llvm::sycl::EntryPointSet EntryPoints;
+
+      for (const Function &F : M.functions()) {
+        if (llvm::module_split::isEntryPoint(F, OnlyKernelsAsEntryPoints))
+          EntryPoints.insert(const_cast<Function *>(&F));
+      }
+      // No entry points, don't proceed
+      if (EntryPoints.empty())
+        return false;
+
+      if (SaveTemps)
+        PreCodeGenSaveTemps(Task, M);
+
+      // TODO: Handle spec constants.
+
+      // TODO: Handle internalization of non-entry-points, we don't do it during
+      // early split anymore.
+      // One problem is that the modules are passed in as `const Module&`, and
+      // ideally we want to delete non-entry point functions, but const-casting
+      // and modifying the module from here seems wrong.
+
+      auto ModuleProps = llvm::sycl::computeModuleProperties(
+          M, EntryPoints, GlobalBinProps,
+          /*SpecConstsMet=*/false, /*IsSpecConstantDefault=*/false);
+      std::string ModulePropsStr;
+      raw_string_ostream SCOut(ModulePropsStr);
+      ModuleProps.write(SCOut);
+      std::string ModuleSyms =
+          llvm::sycl::computeModuleSymbolTable(M, EntryPoints);
+      // This part is the hackiest part of this change. However, this code is
+      // run on multiple threads, so the data structures we can use are more
+      // limited. We can't use StringRef because we would need a StringSaver to
+      // keep the values around, but StringSaver is not thread safe.
+      OffloadF.getBinary()->addTmpString(ModulePropsStr);
+      OffloadF.getBinary()->addTmpString(ModuleSyms);
+      // TODO: Use SPIR-V backend instead of SPIR-V translator once the backend
+      // is mature.
+      auto IRFile = createOutputFile(sys::path::filename(ExecutableName) + "." +
+                                         std::to_string(Task) + ".to.spv",
+                                     "spv");
+      if (!IRFile)
+        reportError(IRFile.takeError());
+      error_code EC;
+      raw_fd_ostream LinkedBitcode(*IRFile, EC, sys::fs::OF_None);
+      if (EC)
+        reportError(errorCodeToError(EC));
+      WriteBitcodeToFile(M, LinkedBitcode);
+      LinkedBitcode.close();
+      // We need this copy to prevent data corruption of the arguments when
+      // calling llvm-spirv: the callee modifies the vector it is given and
+      // this hook runs on multiple threads. Not deeply investigated yet.
+      SmallVector<StringRef, 8> SPIRVArgsCopy = SPIRVArgs;
+      auto SPVFile =
+          sycl::runLLVMToSPIRVTranslation(*IRFile, std::move(SPIRVArgsCopy));
+      if (!SPVFile)
+        reportError(SPVFile.takeError());
+      OffloadF.getBinary()->addTmpString((*SPVFile).str());
+      // Return false so the thinLTO backend doesn't continue to process this
+      // module. We already emitted SPIR-V ourselves, so we don't need to do
+      // anything else. Once the SPIR-V backend is ready, we can remove the
+      // manual SPIR-V translator call and return true here.
+      return false;
+    };
+    // Only compile user modules to SPV, not device libraries.
+    Conf.ThinLTOModulesToCompile = ModulesToCompile;
+  }
 
   return std::make_unique<lto::LTO>(std::move(Conf), Backend);
 }
@@ -1660,16 +1881,16 @@ bool isValidCIdentifier(StringRef S) {
 
 Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
                        SmallVectorImpl<StringRef> &OutputFiles,
+                       SmallVector<OffloadFile, 4> &BitcodeInputFiles,
                        const ArgList &Args) {
   llvm::TimeTraceScope TimeScope("Link bitcode files");
   const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
   StringRef Arch = Args.getLastArgValue(OPT_arch_EQ);
 
-  // Early exit for SPIR targets
-  if (Triple.isSPIROrSPIRV())
+  // Early exit for non-thin-LTO SPIR targets
+  if (Triple.isSPIROrSPIRV() && !sycl::isSYCLThinLTO(Args, Triple))
     return Error::success();
 
-  SmallVector<OffloadFile, 4> BitcodeInputFiles;
   DenseSet<StringRef> StrongResolutions;
   DenseSet<StringRef> UsedInRegularObj;
   DenseSet<StringRef> UsedInSharedLib;
@@ -1732,6 +1953,17 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
 
   // LTO Module hook to output bitcode without running the backend.
   SmallVector<StringRef> BitcodeOutput;
+  std::vector<std::string> ModulesToCompile;
+  if (sycl::isSYCLThinLTO(Args, Triple)) {
+    for (const OffloadFile &BitcodeInput : BitcodeInputFiles) {
+      auto ModuleName = BitcodeInput.getBinary()->getFileName();
+      // TODO: This is pretty hacky, maybe we could check some module metadata
+      // or something.
+      if (ModuleName.find("libsycl-") == std::string::npos)
+        ModulesToCompile.push_back(ModuleName.str());
+    }
+  }
+
   auto OutputBitcode = [&](size_t, const Module &M) {
     auto TempFileOrErr = createOutputFile(sys::path::filename(ExecutableName) +
                                               "-jit-" + Triple.getTriple(),
@@ -1750,11 +1982,11 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
 
   // We assume visibility of the whole program if every input file was bitcode.
   auto Features = getTargetFeatures(BitcodeInputFiles);
-  auto LTOBackend = Args.hasArg(OPT_embed_bitcode) ||
-                            Args.hasArg(OPT_builtin_bitcode_EQ) ||
-                            Args.hasArg(OPT_clang_backend)
-                        ? createLTO(Args, Features, OutputBitcode)
-                        : createLTO(Args, Features);
+  auto LTOBackend =
+      Args.hasArg(OPT_embed_bitcode) || Args.hasArg(OPT_builtin_bitcode_EQ) ||
+              Args.hasArg(OPT_clang_backend)
+          ? createLTO(Args, Features, BitcodeInputFiles, {}, OutputBitcode)
+          : createLTO(Args, Features, BitcodeInputFiles, ModulesToCompile);
 
   // We need to resolve the symbols so the LTO backend knows which symbols need
   // to be kept or can be internalized. This is a simplified symbol resolution
@@ -1773,6 +2005,17 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
     if (!BitcodeFileOrErr)
       return BitcodeFileOrErr.takeError();
 
+    if (sycl::isSYCLThinLTO(Args, Triple)) {
+      // Error if any module was not compiled with thinLTO. Other platforms
+      // can fall back to binary linking if thinLTO fails, but we don't have
+      // that for SPIR-V (besides spirv-link). In the future we may be able to
+      // fall back to normal SYCL processing and throw a warning instead of a
+      // fatal error.
+      if (auto Err = sycl::validateThinLTOModule(
+              (*BitcodeFileOrErr)->getSingleBitcodeModule(), Args))
+        return Err;
+    }
+
     // Save the input file and the buffer associated with its memory.
     const auto Symbols = (*BitcodeFileOrErr)->symbols();
     SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
@@ -2175,76 +2418,125 @@ Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
     }
     if (HasSYCLOffloadKind) {
       SmallVector<StringRef> InputFiles;
-      // Write device inputs to an output file for the linker.
-      for (const OffloadFile &File : Input) {
-        auto FileNameOrErr = writeOffloadFile(File);
-        if (!FileNameOrErr)
-          return FileNameOrErr.takeError();
-        InputFiles.emplace_back(*FileNameOrErr);
+      SmallVector<OffloadFile, 4> BitcodeInputFiles;
+      StringRef TmpOutput;
+      llvm::Triple Triple(LinkerArgs.getLastArgValue(OPT_triple_EQ));
+      if (sycl::isSYCLThinLTO(Args, Triple)) {
+        // For thinLTO, we consider device libs as normal compiler input
+        // and add them to the files to be processed by the LTO backend.
+        // Later we set ModulesToCompile so that we don't
+        // actually emit code for them, we just link in their functions in
+        // modules that use them.
+        if (auto Err = sycl::getDeviceLibsForLTO(Input, LinkerArgs, Triple))
+          return Err;
+        if (auto Err = linkBitcodeFiles(Input, InputFiles, BitcodeInputFiles,
+                                        LinkerArgs))
+          return Err;
+      } else {
+        // Write device inputs to an output file for the linker.
+        for (const OffloadFile &File : Input) {
+          auto FileNameOrErr = writeOffloadFile(File);
+          if (!FileNameOrErr)
+            return FileNameOrErr.takeError();
+          InputFiles.emplace_back(*FileNameOrErr);
+        }
+        // Link the input device files using the device linker for SYCL
+        // offload.
+        auto TmpOutputOrErr = sycl::linkDevice(InputFiles, LinkerArgs);
+        if (!TmpOutputOrErr)
+          return TmpOutputOrErr.takeError();
+        TmpOutput = *TmpOutputOrErr;
       }
-      // Link the input device files using the device linker for SYCL
-      // offload.
-      auto TmpOutputOrErr = sycl::linkDevice(InputFiles, LinkerArgs);
-      if (!TmpOutputOrErr)
-        return TmpOutputOrErr.takeError();
       SmallVector<StringRef> InputFilesSYCL;
-      InputFilesSYCL.emplace_back(*TmpOutputOrErr);
-      auto SplitModulesOrErr =
-          SYCLModuleSplitMode
-              ? sycl::runSYCLSplitLibrary(InputFilesSYCL, LinkerArgs,
-                                          *SYCLModuleSplitMode)
-              : sycl::runSYCLPostLinkTool(InputFilesSYCL, LinkerArgs);
-      if (!SplitModulesOrErr)
-        return SplitModulesOrErr.takeError();
-
-      auto &SplitModules = *SplitModulesOrErr;
-      const llvm::Triple Triple(LinkerArgs.getLastArgValue(OPT_triple_EQ));
-      if ((Triple.isNVPTX() || Triple.isAMDGCN()) &&
-          LinkerArgs.hasArg(OPT_sycl_embed_ir)) {
-        // When compiling for Nvidia/AMD devices and the user requested the
-        // IR to be embedded in the application (via option), run the output
-        // of sycl-post-link (filetable referencing LLVM Bitcode + symbols)
-        // through the offload wrapper and link the resulting object to the
-        // application.
-        auto OutputFile =
-            sycl::runWrapperAndCompile(SplitModules, LinkerArgs, /* IsEmbeddedIR */ true);
-        if (!OutputFile)
-          return OutputFile.takeError();
-        WrappedOutput.push_back(*OutputFile);
-      }
-      for (size_t I = 0, E = SplitModules.size(); I != E; ++I) {
-        SmallVector<StringRef> Files = {SplitModules[I].ModuleFilePath};
-        StringRef Arch = LinkerArgs.getLastArgValue(OPT_arch_EQ);
-        if (Arch.empty())
-          Arch = "native";
-        SmallVector<std::pair<StringRef, StringRef>, 4> BundlerInputFiles;
-        auto ClangOutputOrErr =
-            linkDevice(Files, LinkerArgs, true /* IsSYCLKind */);
-        if (!ClangOutputOrErr)
-          return ClangOutputOrErr.takeError();
-        if (Triple.isNVPTX()) {
-          auto VirtualArch = StringRef(clang::OffloadArchToVirtualArchString(
-              clang::StringToOffloadArch(Arch)));
-          auto PtxasOutputOrErr =
-              nvptx::ptxas(*ClangOutputOrErr, LinkerArgs, Arch);
-          if (!PtxasOutputOrErr)
-            return PtxasOutputOrErr.takeError();
-          BundlerInputFiles.emplace_back(*ClangOutputOrErr, VirtualArch);
-          BundlerInputFiles.emplace_back(*PtxasOutputOrErr, Arch);
-          auto BundledFileOrErr =
-              nvptx::fatbinary(BundlerInputFiles, LinkerArgs);
-          if (!BundledFileOrErr)
-            return BundledFileOrErr.takeError();
-          SplitModules[I].ModuleFilePath = *BundledFileOrErr;
-        } else if (Triple.isAMDGCN()) {
-          BundlerInputFiles.emplace_back(*ClangOutputOrErr, Arch);
-          auto BundledFileOrErr =
-              amdgcn::fatbinary(BundlerInputFiles, LinkerArgs);
-          if (!BundledFileOrErr)
-            return BundledFileOrErr.takeError();
-          SplitModules[I].ModuleFilePath = *BundledFileOrErr;
-        } else {
-          SplitModules[I].ModuleFilePath = *ClangOutputOrErr;
+      std::vector<module_split::SplitModule> SplitModules;
+      if (sycl::isSYCLThinLTO(Args, Triple)) {
+        for (size_t FileIdx = 0; FileIdx < BitcodeInputFiles.size();
+             FileIdx++) {
+          // After we have run the LTO backend, extract the information computed
+          // in the backend (module props/symbol table/spv file path) and set it
+          // up to be used by SYCL image creation.
+          // TODO: Once SYCL image creation is reconciled with the non-SYCL
+          // path, we can move all of the thinLTO handling to be more in-line
+          // with community code.
+          const OffloadFile &F = BitcodeInputFiles[FileIdx];
+          const auto &SYCLInfo = F.getBinary()->getTmpStrings();
+          if (SYCLInfo.size() != 3)
+            continue;
+          // The hardcoded vector indexes are very hacky,
+          // but I feel the most controversial part of this change is how we
+          // store the required information for later and it's likely to change
+          // based on feedback, so I didn't completely design that part yet.
+          StringRef CodegenPath = SYCLInfo[2];
+          assert(!CodegenPath.empty() && "Codegen failed");
+          const auto &Props = SYCLInfo[0];
+          auto MB = MemoryBuffer::getMemBuffer(Props);
+          auto PropSetOrErr = llvm::util::PropertySetRegistry::read(MB.get());
+          if (!PropSetOrErr)
+            return PropSetOrErr.takeError();
+          llvm::util::PropertySetRegistry Properties =
+              std::move(**PropSetOrErr);
+          const auto &Syms = SYCLInfo[1];
+          SplitModules.emplace_back(CodegenPath, std::move(Properties), Syms);
+        }
+        // We don't need the OffloadFiles anymore, so free them from memory.
+        BitcodeInputFiles.clear();
+      } else {
+        InputFilesSYCL.emplace_back(TmpOutput);
+        auto SplitModulesOrErr =
+            SYCLModuleSplitMode
+                ? sycl::runSYCLSplitLibrary(InputFilesSYCL, LinkerArgs,
+                                            *SYCLModuleSplitMode)
+                : sycl::runSYCLPostLinkTool(InputFilesSYCL, LinkerArgs);
+        if (!SplitModulesOrErr)
+          return SplitModulesOrErr.takeError();
+        SplitModules = std::move(*SplitModulesOrErr);
+        if ((Triple.isNVPTX() || Triple.isAMDGCN()) &&
+            LinkerArgs.hasArg(OPT_sycl_embed_ir)) {
+          // When compiling for Nvidia/AMD devices and the user requested the
+          // IR to be embedded in the application (via option), run the output
+          // of sycl-post-link (filetable referencing LLVM Bitcode + symbols)
+          // through the offload wrapper and link the resulting object to the
+          // application.
+          auto OutputFile = sycl::runWrapperAndCompile(SplitModules, LinkerArgs,
+                                                       /* IsEmbeddedIR */ true);
+          if (!OutputFile)
+            return OutputFile.takeError();
+          WrappedOutput.push_back(*OutputFile);
+        }
+        for (size_t I = 0, E = SplitModules.size(); I != E; ++I) {
+          SmallVector<StringRef> Files = {SplitModules[I].ModuleFilePath};
+          StringRef Arch = LinkerArgs.getLastArgValue(OPT_arch_EQ);
+          if (Arch.empty())
+            Arch = "native";
+          SmallVector<std::pair<StringRef, StringRef>, 4> BundlerInputFiles;
+          auto ClangOutputOrErr =
+              linkDevice(Files, LinkerArgs, true /* IsSYCLKind */);
+          if (!ClangOutputOrErr)
+            return ClangOutputOrErr.takeError();
+          if (Triple.isNVPTX()) {
+            auto VirtualArch = StringRef(clang::OffloadArchToVirtualArchString(
+                clang::StringToOffloadArch(Arch)));
+            auto PtxasOutputOrErr =
+                nvptx::ptxas(*ClangOutputOrErr, LinkerArgs, Arch);
+            if (!PtxasOutputOrErr)
+              return PtxasOutputOrErr.takeError();
+            BundlerInputFiles.emplace_back(*ClangOutputOrErr, VirtualArch);
+            BundlerInputFiles.emplace_back(*PtxasOutputOrErr, Arch);
+            auto BundledFileOrErr =
+                nvptx::fatbinary(BundlerInputFiles, LinkerArgs);
+            if (!BundledFileOrErr)
+              return BundledFileOrErr.takeError();
+            SplitModules[I].ModuleFilePath = *BundledFileOrErr;
+          } else if (Triple.isAMDGCN()) {
+            BundlerInputFiles.emplace_back(*ClangOutputOrErr, Arch);
+            auto BundledFileOrErr =
+                amdgcn::fatbinary(BundlerInputFiles, LinkerArgs);
+            if (!BundledFileOrErr)
+              return BundledFileOrErr.takeError();
+            SplitModules[I].ModuleFilePath = *BundledFileOrErr;
+          } else {
+            SplitModules[I].ModuleFilePath = *ClangOutputOrErr;
+          }
         }
       }
       // TODO(NOM7): Remove this call and use community flow for bundle/wrap
@@ -2263,7 +2555,9 @@ Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
     if (HasNonSYCLOffloadKinds) {
       // First link and remove all the input files containing bitcode.
       SmallVector<StringRef> InputFiles;
-      if (Error Err = linkBitcodeFiles(Input, InputFiles, LinkerArgs))
+      SmallVector<OffloadFile, 4> BitcodeInputFiles;
+      if (Error Err = linkBitcodeFiles(Input, InputFiles, BitcodeInputFiles,
+                                       LinkerArgs))
         return Err;
 
       // Write any remaining device inputs to an output file for the linker.
diff --git a/llvm/include/llvm/Object/OffloadBinary.h b/llvm/include/llvm/Object/OffloadBinary.h
index d590110ffa598..3d1b18b7f3418 100644
--- a/llvm/include/llvm/Object/OffloadBinary.h
+++ b/llvm/include/llvm/Object/OffloadBinary.h
@@ -103,6 +103,14 @@ class OffloadBinary : public Binary {
 
   StringRef getString(StringRef Key) const { return StringData.lookup(Key); }
 
+  /// XXX: Hack
+  const SmallVectorImpl<std::string> &getTmpStrings() const {
+    return TmpStringData;
+  }
+
+  /// XXX: Hack
+  void addTmpString(std::string Value) { TmpStringData.push_back(Value); }
+
   static bool classof(const Binary *V) { return V->isOffloadFile(); }
 
   struct Header {
@@ -151,6 +159,9 @@ class OffloadBinary : public Binary {
   const Header *TheHeader;
   /// Location of the metadata entries within the binary.
   const Entry *TheEntry;
+
+  /// XXX: Hack
+  SmallVector<std::string, 8> TmpStringData;
 };
 
 /// A class to contain the binary information for a single OffloadBinary that
diff --git a/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h
index 0da3706ad3626..cb6049910b934 100644
--- a/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h
+++ b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h
@@ -321,6 +321,7 @@ splitSYCLModule(std::unique_ptr<Module> M, ModuleSplitterSettings Settings);
 
 bool isESIMDFunction(const Function &F);
 bool canBeImportedFunction(const Function &F);
+bool isEntryPoint(const Function &F, bool EmitOnlyKernelsAsEntryPoints);
 
 } // namespace module_split
 
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index d303f228aa72c..9cdcf2738a6d7 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1077,8 +1077,8 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
     for (const std::string &Name : Conf.ThinLTOModulesToCompile) {
       if (BM.getModuleIdentifier().contains(Name)) {
         ThinLTO.ModulesToCompile->insert({BM.getModuleIdentifier(), BM});
-        llvm::errs() << "[ThinLTO] Selecting " << BM.getModuleIdentifier()
-                     << " to compile\n";
+        LLVM_DEBUG(dbgs() << "[ThinLTO] Selecting " << BM.getModuleIdentifier()
+                          << " to compile\n");
       }
     }
   }
diff --git a/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp
index b9eda1376663f..a012d8a68e2f2 100644
--- a/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp
+++ b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp
@@ -117,32 +117,6 @@ bool isKernel(const Function &F) {
          F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
 }
 
-bool isEntryPoint(const Function &F, bool EmitOnlyKernelsAsEntryPoints) {
-  // Skip declarations, if any: they should not be included into a vector of
-  // entry points groups or otherwise we will end up with incorrectly generated
-  // list of symbols.
-  if (F.isDeclaration())
-    return false;
-
-  // Kernels are always considered to be entry points
-  if (isKernel(F))
-    return true;
-
-  if (!EmitOnlyKernelsAsEntryPoints) {
-    // If not disabled, SYCL_EXTERNAL functions with sycl-module-id attribute
-    // are also considered as entry points (except __spirv_* and __sycl_*
-    // functions)
-    return llvm::sycl::utils::isSYCLExternalFunction(&F) &&
-           !isSpirvSyclBuiltin(F.getName()) && !isESIMDBuiltin(F.getName()) &&
-           !isGenericBuiltin(F.getName());
-  }
-
-  // Even if we are emitting only kernels as entry points, virtual functions
-  // should still be treated as entry points, because they are going to be
-  // outlined into separate device images and linked in later.
-  return F.hasFnAttribute("indirectly-callable");
-}
-
 // Represents "dependency" or "use" graph of global objects (functions and
 // global variables) in a module. It is used during device code split to
 // understand which global variables and functions (other than entry points)
@@ -445,6 +419,32 @@ class ModuleSplitter : public ModuleSplitterBase {
 namespace llvm {
 namespace module_split {
 
+bool isEntryPoint(const Function &F, bool EmitOnlyKernelsAsEntryPoints) {
+  // Skip declarations, if any: they should not be included into a vector of
+  // entry points groups or otherwise we will end up with incorrectly generated
+  // list of symbols.
+  if (F.isDeclaration())
+    return false;
+
+  // Kernels are always considered to be entry points
+  if (isKernel(F))
+    return true;
+
+  if (!EmitOnlyKernelsAsEntryPoints) {
+    // If not disabled, SYCL_EXTERNAL functions with sycl-module-id attribute
+    // are also considered as entry points (except __spirv_* and __sycl_*
+    // functions)
+    return llvm::sycl::utils::isSYCLExternalFunction(&F) &&
+           !isSpirvSyclBuiltin(F.getName()) && !isESIMDBuiltin(F.getName()) &&
+           !isGenericBuiltin(F.getName());
+  }
+
+  // Even if we are emitting only kernels as entry points, virtual functions
+  // should still be treated as entry points, because they are going to be
+  // outlined into separate device images and linked in later.
+  return F.hasFnAttribute("indirectly-callable");
+}
+
 std::optional<IRSplitMode> convertStringToSplitMode(StringRef S) {
   static const StringMap<IRSplitMode> Values = {{"kernel", SPLIT_PER_KERNEL},
                                                 {"source", SPLIT_PER_TU},

From 00a28c1276881300306cfe2b880153c4f977d061 Mon Sep 17 00:00:00 2001
From: "Sarnie, Nick" <nick.sarnie@intel.com>
Date: Wed, 18 Sep 2024 08:16:14 -0700
Subject: [PATCH 3/9] add design doc, rework prototype with new design

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
---
 clang/lib/Driver/ToolChains/Clang.cpp         |   6 +-
 .../ClangLinkerWrapper.cpp                    | 180 ++++++++++--------
 .../SYCLLowerIR/SYCLLinkedModuleProcessor.h   |  22 +++
 llvm/lib/SYCLLowerIR/CMakeLists.txt           |   1 +
 .../SYCLLowerIR/SYCLLinkedModuleProcessor.cpp |  45 +++++
 sycl/doc/design/CompilerAndRuntimeDesign.md   |   3 +
 sycl/doc/design/ThinLTO.md                    | 147 ++++++++++++++
 .../design/images/ThinLTOCommunityFlow.svg    |   1 +
 sycl/doc/design/images/ThinLTOSYCLFlow.svg    |   1 +
 .../images/ThinLTOSYCLSPIRVBackendFlow.svg    |   1 +
 10 files changed, 326 insertions(+), 81 deletions(-)
 create mode 100644 llvm/include/llvm/SYCLLowerIR/SYCLLinkedModuleProcessor.h
 create mode 100644 llvm/lib/SYCLLowerIR/SYCLLinkedModuleProcessor.cpp
 create mode 100644 sycl/doc/design/ThinLTO.md
 create mode 100644 sycl/doc/design/images/ThinLTOCommunityFlow.svg
 create mode 100644 sycl/doc/design/images/ThinLTOSYCLFlow.svg
 create mode 100644 sycl/doc/design/images/ThinLTOSYCLSPIRVBackendFlow.svg

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 216b216ba797b..8df597de8f5ff 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -11231,12 +11231,8 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
 
     bool IsUsingLTO = D.isUsingLTO(/*IsDeviceOffloadAction=*/true);
     auto LTOMode = D.getLTOMode(/*IsDeviceOffloadAction=*/true);
-    if (IsUsingLTO && LTOMode == LTOK_Thin) {
+    if (IsUsingLTO && LTOMode == LTOK_Thin)
       CmdArgs.push_back(Args.MakeArgString("-sycl-thin-lto"));
-      // TODO: Pass the same value for this argument once we start using it
-      // for non-thinLTO.
-      CmdArgs.push_back(Args.MakeArgString("-sycl-module-split-mode=auto"));
-    }
 
     if (Args.hasArg(options::OPT_fsycl_embed_ir))
       CmdArgs.push_back(Args.MakeArgString("-sycl-embed-ir"));
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 7b38cebe77eef..f799d3cf05d46 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -44,6 +44,7 @@
 #include "llvm/Remarks/HotnessThresholdParser.h"
 #include "llvm/SYCLLowerIR/ComputeModuleRuntimeInfo.h"
 #include "llvm/SYCLLowerIR/ModuleSplitter.h"
+#include "llvm/SYCLLowerIR/SYCLLinkedModuleProcessor.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/FileOutputBuffer.h"
@@ -697,6 +698,13 @@ bool isSYCLThinLTO(const ArgList &Args, const llvm::Triple Triple) {
   return Triple.isSPIROrSPIRV() && Args.hasArg(OPT_sycl_thin_lto);
 }
 
+bool areSpecConstsSupported(const ArgList &Args, const llvm::Triple Triple) {
+  const llvm::Triple HostTriple(Args.getLastArgValue(OPT_host_triple_EQ));
+  bool SYCLNativeCPU = (HostTriple == Triple);
+  return (!Triple.isNVPTX() && !Triple.isAMDGCN() && !Triple.isSPIRAOT() &&
+          !SYCLNativeCPU);
+}
+
 /// Add any sycl-post-link options that rely on a specific Triple in addition
 /// to user supplied options.
 /// NOTE: Any changes made here should be reflected in the similarly named
@@ -1687,6 +1695,7 @@ template <typename ModuleHook = function_ref<bool(size_t, const Module &)>>
 std::unique_ptr<lto::LTO> createLTO(
     const ArgList &Args, const std::vector<std::string> &Features,
     SmallVectorImpl<OffloadFile> &BitcodeInputFiles,
+    SmallVectorImpl<StringRef> &Files,
     std::vector<std::string> ModulesToCompile = {},
     ModuleHook Hook = [](size_t, const Module &) { return true; }) {
   const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
@@ -1780,18 +1789,16 @@ std::unique_ptr<lto::LTO> createLTO(
     // Passing Args to each thinLTO thread causes crashes, so compute everything
     // we can here.
     const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
-    bool OnlyKernelsAsEntryPoints =
-        sycl::considerOnlyKernelsAsEntryPoints(Args, Triple);
-    auto GlobalBinProps = sycl::computeGlobalBinProps(Args, Triple);
     SmallVector<StringRef, 8> SPIRVArgs;
     sycl::computeLLVMToSPIRVTranslationToolArgs(Args, SPIRVArgs);
-    Conf.PreCodeGenModuleHook = [=, &BitcodeInputFiles](unsigned Task,
-                                                        const Module &M) {
-      // This is the main part of SYCL LTO handling.
-      // Here we process the IR from each BC file, compute module
-      // properties and the module symbol table, convert to SPV (using the
-      // translator for now) and save required information for binary created
-      // inside the OffloadFile.
+    auto SpecConstArg = sycl::areSpecConstsSupported(Args, Triple)
+                            ? SpecConstantsPass::HandlingMode::native
+                            : SpecConstantsPass::HandlingMode::emulation;
+    Conf.PreCodeGenModuleHook = [=, &BitcodeInputFiles, &Files](
+                                    unsigned Task, const Module &M) mutable {
+      // Here we process the IR from each BC file, save the module for later
+      // use, convert to SPV (using the translator for now) and save the path to
+      // the output file.
 
       assert(Task != 0 && "Unexpected task");
       auto &OffloadF = BitcodeInputFiles[Task - 1];
@@ -1801,43 +1808,18 @@ std::unique_ptr<lto::LTO> createLTO(
         return true;
       }
 
-      llvm::sycl::EntryPointSet EntryPoints;
-
-      for (const Function &F : M.functions()) {
-        if (llvm::module_split::isEntryPoint(F, OnlyKernelsAsEntryPoints))
-          EntryPoints.insert(const_cast<Function *>(&F));
-      }
-      // No entry points, don't proceed
-      if (EntryPoints.empty())
-        return false;
-
       if (SaveTemps)
         PreCodeGenSaveTemps(Task, M);
 
-      // TODO: Handle spec constants.
-
-      // TODO: Handle internalization of non-entry-points, we don't do it during
-      // early split anymore.
-      // One problem is that the modules are pased in as `const Module&`, and
-      // ideally we want to delete non-entry point functions, but const-casting
-      // and modifying the module seems from here seems wrong.
-
-      auto ModuleProps = llvm::sycl::computeModuleProperties(
-          M, EntryPoints, GlobalBinProps,
-          /*SpecConstsMet=*/false, /*SpecConstsMet=*/false);
-      std::string ModulePropsStr;
-      raw_string_ostream SCOut(ModulePropsStr);
-      ModuleProps.write(SCOut);
-      std::string ModuleSyms =
-          llvm::sycl::computeModuleSymbolTable(M, EntryPoints);
-      // This part is the hackiest part of this change. However, this code is
-      // run on multiple threads, so the data structures we can use are more
-      // limited. We can't use StringRef because we would need a StringSaver to
-      // keep the values around, but StringSaver is not thread safe.
-      OffloadF.getBinary()->addTmpString(ModulePropsStr);
-      OffloadF.getBinary()->addTmpString(ModuleSyms);
-      // TODO: Use SPIR-V backend instead of SPIR-V translator once the backend
-      // is mature.
+      // Use the legacy PM because eventually we will use the
+      // PreCodeGenPassesHook field of LTOConfig which requires the legacy PM.
+      legacy::PassManager PM;
+
+      // LTO does not continue processing the module after this
+      // function finishes, so it's safe to modify the module.
+      PM.add(createSYCLLinkedModuleProcessorPass(SpecConstArg));
+      PM.run(const_cast<Module &>(M));
+
       auto IRFile = createOutputFile(sys::path::filename(ExecutableName) + "." +
                                          std::to_string(Task) + ".to.spv",
                                      "spv");
@@ -1849,15 +1831,32 @@ std::unique_ptr<lto::LTO> createLTO(
         reportError(errorCodeToError(EC));
       WriteBitcodeToFile(M, LinkedBitcode);
       LinkedBitcode.close();
-      // We need this copy to prevent data corruption of the arguments when
-      // calling llvm-spirv. Probably some multithreading thing, I didn't deeply
-      // investigate it yet.
-      SmallVector<StringRef, 8> SPIRVArgsCopy = SPIRVArgs;
+      {
+        // Overwrite the fully linked module in BitcodeInputFiles
+        // so we can compute the module properties and symbol table.
+        // We need a fully linked module to accurately compute these.
+        llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> ImageOrError =
+            llvm::MemoryBuffer::getFileOrSTDIN(*IRFile);
+        assert(ImageOrError);
+        OffloadingImage Image{};
+        Image.TheImageKind = IMG_Bitcode;
+        Image.TheOffloadKind = OffloadF.getBinary()->getOffloadKind();
+        Image.StringData["triple"] = OffloadF.getBinary()->getTriple();
+        Image.StringData["arch"] = OffloadF.getBinary()->getArch();
+        Image.Image = std::move(*ImageOrError);
+
+        std::unique_ptr<MemoryBuffer> Binary =
+            MemoryBuffer::getMemBufferCopy(OffloadBinary::write(Image));
+        auto NewBinaryOrErr = OffloadBinary::create(*Binary);
+        assert(NewBinaryOrErr);
+        BitcodeInputFiles[Task - 1] =
+            OffloadFile(std::move(*NewBinaryOrErr), std::move(Binary));
+      }
       auto SPVFile =
-          sycl::runLLVMToSPIRVTranslation(*IRFile, std::move(SPIRVArgsCopy));
+          sycl::runLLVMToSPIRVTranslation(*IRFile, std::move(SPIRVArgs));
       if (!SPVFile)
         reportError(SPVFile.takeError());
-      OffloadF.getBinary()->addTmpString((*SPVFile).str());
+      Files[Task] = *SPVFile;
       // Return false so the thinLTO backend doesn't continue to process this
       // module. We already emitted SPIR-V ourselves, so we don't need to do
       // anything else. Once the SPIR-V backend is ready, we can remove the
@@ -1981,12 +1980,15 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
   };
 
   // We assume visibility of the whole program if every input file was bitcode.
+  SmallVector<StringRef> Files;
   auto Features = getTargetFeatures(BitcodeInputFiles);
-  auto LTOBackend =
-      Args.hasArg(OPT_embed_bitcode) || Args.hasArg(OPT_builtin_bitcode_EQ) ||
-              Args.hasArg(OPT_clang_backend)
-          ? createLTO(Args, Features, BitcodeInputFiles, {}, OutputBitcode)
-          : createLTO(Args, Features, BitcodeInputFiles, ModulesToCompile);
+  auto LTOBackend = Args.hasArg(OPT_embed_bitcode) ||
+                            Args.hasArg(OPT_builtin_bitcode_EQ) ||
+                            Args.hasArg(OPT_clang_backend)
+                        ? createLTO(Args, Features, BitcodeInputFiles, Files,
+                                    {}, OutputBitcode)
+                        : createLTO(Args, Features, BitcodeInputFiles, Files,
+                                    ModulesToCompile);
 
   // We need to resolve the symbols so the LTO backend knows which symbols need
   // to be kept or can be internalized. This is a simplified symbol resolution
@@ -2067,7 +2069,7 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
 
   // Run the LTO job to compile the bitcode.
   size_t MaxTasks = LTOBackend->getMaxTasks();
-  SmallVector<StringRef> Files(MaxTasks);
+  Files.resize(MaxTasks);
   auto AddStream =
       [&](size_t Task,
           const Twine &ModuleName) -> std::unique_ptr<CachedFileStream> {
@@ -2450,33 +2452,59 @@ Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
       SmallVector<StringRef> InputFilesSYCL;
       std::vector<module_split::SplitModule> SplitModules;
       if (sycl::isSYCLThinLTO(Args, Triple)) {
+        size_t LastSPVFilePath = 0;
         for (size_t FileIdx = 0; FileIdx < BitcodeInputFiles.size();
              FileIdx++) {
-          // After we have run the LTO backend, extract the information computed
-          // in the backend (module props/symbol table/spv file path) and set it
-          // up to be used by SYCL image creation.
+          // After we have run the LTO backend, compute module props/symbol
+          // table/spv file path and set it up to be used by SYCL image
+          // creation.
           // TODO: Once SYCL image creation is reconsiled with the non-SYCL
           // path, we can move all of the thinLTO handling to be more in-line
           // with community code.
+
+          // This is a bit hacky but not every BitcodeInputFile will end up as a
+          // SPV file in InputFiles, for example if it is a device library file.
+          // If the file name is empty, that means we didn't generate SPV for
+          // it, so just find the next non-empty file name. Should be easy to
+          // clean this up later if we go with this overall design.
           const OffloadFile &F = BitcodeInputFiles[FileIdx];
-          const auto &SYCLInfo = F.getBinary()->getTmpStrings();
-          if (SYCLInfo.size() != 3)
+          StringRef CodegenPath;
+          for (size_t OutputNum = LastSPVFilePath;
+               OutputNum < InputFiles.size(); OutputNum++) {
+            auto SPVFilePath = InputFiles[OutputNum];
+            if (!SPVFilePath.empty()) {
+              LastSPVFilePath = OutputNum + 1;
+              CodegenPath = SPVFilePath;
+              break;
+            }
+          }
+          if (CodegenPath.empty())
             continue;
-          // The hardcoded vector indexes are very hacky,
-          // but I feel the most controversial part of this hcange is how we
-          // store the required information for later and it's likely to change
-          // based on feedback, so I didn't completely design that part yet.
-          StringRef CodegenPath = SYCLInfo[2];
-          assert(!CodegenPath.empty() && "Codegen failed");
-          const auto &Props = SYCLInfo[0];
-          auto MB = MemoryBuffer::getMemBuffer(Props);
-          auto PropSetOrErr = llvm::util::PropertySetRegistry::read(MB.get());
-          if (!PropSetOrErr)
-            return PropSetOrErr.takeError();
-          llvm::util::PropertySetRegistry Properties =
-              std::move(**PropSetOrErr);
-          const auto &Syms = SYCLInfo[1];
-          SplitModules.emplace_back(CodegenPath, std::move(Properties), Syms);
+          LLVMContext Context;
+          auto Buf = MemoryBuffer::getMemBuffer(F.getBinary()->getImage());
+          auto ModOrErr = parseBitcodeFile(*Buf, Context);
+          if (!ModOrErr)
+            return ModOrErr.takeError();
+          auto &M = **ModOrErr;
+
+          llvm::sycl::EntryPointSet EntryPoints;
+          bool OnlyKernelsAsEntryPoints =
+              sycl::considerOnlyKernelsAsEntryPoints(Args, Triple);
+          auto GlobalBinProps = sycl::computeGlobalBinProps(Args, Triple);
+          for (const Function &F : M.functions()) {
+            if (llvm::module_split::isEntryPoint(F, OnlyKernelsAsEntryPoints))
+              EntryPoints.insert(const_cast<Function *>(&F));
+          }
+          if (EntryPoints.empty())
+            continue;
+
+          auto Properties = llvm::sycl::computeModuleProperties(
+              M, EntryPoints, GlobalBinProps, true, true);
+
+          std::string ModuleSyms =
+              llvm::sycl::computeModuleSymbolTable(M, EntryPoints);
+          SplitModules.emplace_back(CodegenPath, std::move(Properties),
+                                    ModuleSyms);
         }
         // We don't need the OffloadFiles anymore, so free them from memory.
         BitcodeInputFiles.clear();
diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLLinkedModuleProcessor.h b/llvm/include/llvm/SYCLLowerIR/SYCLLinkedModuleProcessor.h
new file mode 100644
index 0000000000000..171992dfb2586
--- /dev/null
+++ b/llvm/include/llvm/SYCLLowerIR/SYCLLinkedModuleProcessor.h
@@ -0,0 +1,22 @@
+//===-- SYCLLinkedModuleProcessor.h - finalize a fully linked module ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The file contains a number of functions to create a pass that can be called
+// by the LTO backend that will finalize a fully-linked module.
+//===----------------------------------------------------------------------===//
+#pragma once
+#include "SpecConstants.h"
+namespace llvm {
+
+class PassRegistry;
+class ModulePass;
+ModulePass *
+    createSYCLLinkedModuleProcessorPass(llvm::SpecConstantsPass::HandlingMode);
+void initializeSYCLLinkedModuleProcessorPass(PassRegistry &);
+
+} // namespace llvm
diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt
index 3eb84ba9864cb..9e7543d898a2a 100644
--- a/llvm/lib/SYCLLowerIR/CMakeLists.txt
+++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt
@@ -64,6 +64,7 @@ add_llvm_component_library(LLVMSYCLLowerIR
   SYCLDeviceLibReqMask.cpp
   SYCLDeviceRequirements.cpp
   SYCLKernelParamOptInfo.cpp
+  SYCLLinkedModuleProcessor.cpp
   SYCLPropagateAspectsUsage.cpp
   SYCLPropagateJointMatrixUsage.cpp
   SYCLVirtualFunctionsAnalysis.cpp
diff --git a/llvm/lib/SYCLLowerIR/SYCLLinkedModuleProcessor.cpp b/llvm/lib/SYCLLowerIR/SYCLLinkedModuleProcessor.cpp
new file mode 100644
index 0000000000000..672d49d6ad161
--- /dev/null
+++ b/llvm/lib/SYCLLowerIR/SYCLLinkedModuleProcessor.cpp
@@ -0,0 +1,45 @@
+//===-- SYCLLinkedModuleProcessor.cpp - finalize a fully linked module ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// See comments in the header.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/SYCLLowerIR/SYCLLinkedModuleProcessor.h"
+
+#include "llvm/Pass.h"
+
+#define DEBUG_TYPE "sycl-linked-module-processor"
+using namespace llvm;
+
+namespace {
+class SYCLLinkedModuleProcessor : public ModulePass {
+public:
+  static char ID;
+  SYCLLinkedModuleProcessor(SpecConstantsPass::HandlingMode Mode)
+      : ModulePass(ID), Mode(Mode) {
+    initializeSYCLLinkedModuleProcessorPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override {
+    // TODO: determine if we need to run other passes
+    ModuleAnalysisManager MAM;
+    SpecConstantsPass SCP(Mode);
+    auto PA = SCP.run(M, MAM);
+    return !PA.areAllPreserved();
+  }
+
+private:
+  SpecConstantsPass::HandlingMode Mode;
+};
+} // namespace
+char SYCLLinkedModuleProcessor::ID = 0;
+INITIALIZE_PASS(SYCLLinkedModuleProcessor, "SYCLLinkedModuleProcessor",
+                "Finalize a fully linked SYCL module", false, false)
+ModulePass *llvm::createSYCLLinkedModuleProcessorPass(
+    SpecConstantsPass::HandlingMode Mode) {
+  return new SYCLLinkedModuleProcessor(Mode);
+}
diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md
index 52ae88a2c0ef1..954f66109c390 100644
--- a/sycl/doc/design/CompilerAndRuntimeDesign.md
+++ b/sycl/doc/design/CompilerAndRuntimeDesign.md
@@ -550,6 +550,9 @@ unit)
 - `off` - disables device code split. If `-fno-sycl-rdc` is specified, the behavior is
    the same as `per_source`
 
+If ThinLTO is enabled, device code splitting is run during the compilation stage.
+See [here](ThinLTO.md) for more information.
+
 ##### Symbol table generation
 
 TBD
diff --git a/sycl/doc/design/ThinLTO.md b/sycl/doc/design/ThinLTO.md
new file mode 100644
index 0000000000000..3ffb7819bef68
--- /dev/null
+++ b/sycl/doc/design/ThinLTO.md
@@ -0,0 +1,147 @@
+# ThinLTO for SYCL
+
+This document describes the purpose and design of ThinLTO for SYCL.
+
+**NOTE**: This is not the final version. The document is still in progress.
+
+## Background
+
+With traditional SYCL device code linking, all user code is linked together 
+along with device libraries into a single huge module and then split and 
+processed by `sycl-post-link`. This requires sequential processing, has a large 
+memory footprint, and differs from the linking flow for AMD and NVIDIA devices.
+
+## Summary
+SYCL ThinLTO will hook into the existing community mechanism to run LTO as part 
+of device linking inside `clang-linker-wrapper`. We split the device images 
+early at compilation time, and at link time we use ThinLTO's function importing 
+feature
+to bring in the definitions for referenced functions. Only the new offload model
+is supported.
+
+## Device code compilation time changes
+Most of the changes for ThinLTO occur during device link time, however there is 
+one major change during compilation (-c) time: we now run device code split 
+during compilation instead of linking.
+The main reason for doing this is increased parallelization. Many compilation 
+jobs can be run at the same time, but linking happens once total for the 
+application. Device code split is currently a common source of performance 
+issues.
+
+Splitting early means that the resulting IR after splitting is not complete, it 
+still may contain calls to functions (user code and/or the SYCL device 
+libraries) from other object files.
+
+We rely on the assumption that all function definitions matching a declaration 
+will be the same and we can let ThinLTO pull in any one.
+
+For example, let's start with user device code that defines a `SYCL_EXTERNAL` 
+function `foo` in translation unit `tu_foo`. There is also another translation 
+unit `tu_bar` that references `foo`.
+During the early device code splitting run of `tu_foo`, we may find that more 
+than one of the resultant device images contain a definition for `foo`.
+
+We assert that any function definition for `foo` that is deemed a match by the 
+ThinLTO infrastructure during the processing of `tu_bar` is valid.
+
+As a result of running early device code split, the fat object file generated 
+as part of device compilation may contain multiple device code images.
+
+## Device code link time changes
+
+Before we go into the link time changes for SYCL, let's understand the device 
+linking flow for community devices (AMD/NVIDIA):
+
+![Community linking flow](images/ThinLTOCommunityFlow.svg)
+
+SYCL has two differentiating requirements:
+1) The SPIR-V backend is not production ready and the SPIR-V translator is used.
+2) The SYCL runtime requires metadata (module properties and module symbol 
+table) computed from device images that will be stored along the device images 
+in the fat executable.
+
+The effect of requirement 1) is that instead of letting ThinLTO call the SPIR-V 
+backend, we add a callback that runs right before codegen would run.
+In that callback, we call the SPIR-V translator and store the resultant file 
+path for use later, and we instruct the ThinLTO framework to not
+perform codegen.
+
+An interesting additional fact about requirement 2) is that we actually need to 
+process a fully linked module to accurately compute the module properties. One 
+example where we need the full module is to [compute the required devicelib mask](https://github.com/intel/llvm/blob/sycl/llvm/lib/SYCLLowerIR/SYCLDeviceLibReqMask.cpp).
+If we only process the device code that was included in the 
+original fat object input to `clang-linker-wrapper`, we will miss devicelib 
+calls in referenced `SYCL_EXTERNAL` functions.
+
+The effect of requirement 2) is that we store the fully linked device image for 
+metadata computation in the SYCL-specific handling code after the ThinLTO 
+framework has completed. Another option would be to try to compute the metadata 
+inside the ThinLTO framework callbacks, but this would require SYCL-specific 
+arguments to many caller functions in the stack and pollute community code.
+
+Here is the current ThinLTO flow for SYCL:
+
+![SYCL linking flow](images/ThinLTOSYCLFlow.svg)
+
+We add a `PreCodeGenModuleHook` function to the `LTOConfig` object so that we 
+can process the fully linked module without running the backend.
+
+However, the flow is not ideal for many reasons:
+1) We are relying on the external `llvm-spirv` tool instead of the SPIR-V 
+backend. We could slightly improve this issue by using a library call to the 
+SPIR-V translator instead of the tool, however the library API requires setting 
+up an object to represent the arguments while we only have strings, and it's 
+non-trivial to parse the strings to figure out how to create the argument 
+object. Since we plan to use the SPIR-V backend in the long term, this does not 
+seem to be worth the effort.
+
+2) We manually run passes inside `PreCodeGenModuleHook`. This is because we 
+don't run codegen, so we can't take advantage of the `PreCodeGenPassesHook` 
+field of `LTOConfig` to run some custom passes, as those passes are only run 
+when we actually are going to run codegen.
+
+3) We have to store the fully linked module. This is needed because we need a 
+fully linked module to accurately compute metadata, see the above explanation 
+of SYCL requirement 2). We could get around storing the module by computing the 
+metadata inside the LTO framework and storing it for later use by the SYCL 
+bundling code, but doing this would require SYCL-only customizations including 
+even more new function arguments and modifications of the `OffloadFile` class. 
+It's also complicated because the LTO framework is multithreaded, and not all 
+LLVM data structures are thread safe.
+
+The proposed long-term SYCL ThinLTO flow is as follows:
+
+![SYCL SPIR-V backend linking flow](images/ThinLTOSYCLSPIRVBackendFlow.svg)
+
+The biggest difference here is that we are running codegen using the SPIR-V 
+backend.
+
+Also, instead of using a lambda function in the `PreCodeGenModuleHook` 
+callback, we can take advantage of the `PreCodeGenPassesHook` field to add 
+passes to the pass manager that the LTO framework will run.
+
+It is possible that the number of device images in the fat executable
+and which device image contains which kernel is different with ThinLTO
+enabled, but we do not expect this to have any impact on correctness or
+performance, nor do we expect users to care.
+
+
+## Current limitations
+
+`-O0`: Compiling with `-O0` prevents clang from generating ThinLTO metadata 
+during the compilation phase. In the current implementation, this is an error. 
+In the final version, we could either silently fall back to full LTO or 
+generate ThinLTO metadata even for `-O0`.
+
+SYCL libdevice: Currently all `libdevice` functions are explicitly marked to be 
+weak symbols. The ThinLTO framework does not consider a definition of a function 
+with weak linkage as it cannot be sure that this definition is the correct one. 
+Ideally we could remove the weak symbol annotation.
+
+No binary linkage: The SPIR-V target does not currently have a production 
+quality binary linker. This means that we must generate a fully linked image as 
+part of device linkage. At least for AMD devices, this is not a requirement as 
+`lld` is used for the final link which can resolve any unresolved symbols. 
+`-fno-gpu-rdc` is default for AMD, so in that case it can call `lld` during 
+compile, but if `-fgpu-rdc` is passed, the `lld` call happens as part of 
+`clang-linker-wrapper` to resolve any symbols not resolved by ThinLTO.
\ No newline at end of file
diff --git a/sycl/doc/design/images/ThinLTOCommunityFlow.svg b/sycl/doc/design/images/ThinLTOCommunityFlow.svg
new file mode 100644
index 0000000000000..f2fe257a07af8
--- /dev/null
+++ b/sycl/doc/design/images/ThinLTOCommunityFlow.svg
@@ -0,0 +1 @@
+<svg width="1280" height="720" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" overflow="hidden"><g><rect x="0" y="0" width="1280" height="720" fill="#FFFFFF"/><rect x="221" y="57" width="752" height="640" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#7F7F7F"/><path d="M478 98.5C478 81.6553 531.502 68 597.5 68 663.498 68 717 81.6553 717 98.5 717 115.345 663.498 129 597.5 129 531.502 129 478 115.345 478 98.5Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(530.336 95)">Extract device code</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(557.336 114)">from inputs</text><text font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(484.018 53)">clang</text><text font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(541.018 53)">-</text><text font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(549.018 53)">linker</text><text font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(605.018 53)">-</text><text font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(613.018 53)">wrapper</text><path d="M468 201C468 175.595 525.755 155 597 155 668.245 155 726 175.595 726 201 726 226.405 668.245 247 597 247 525.755 247 468 226.405 468 201Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(523.416 187)">Process symbols and</text><text fill="#FFFFFF" 
font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(516.416 206)">add device code to LTO</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(559.416 225)">framework</text><rect x="428" y="275" width="339" height="243" stroke="#000000" stroke-width="2" stroke-miterlimit="8" fill="#83CBEB"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(495.438 297)">T</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(506.438 297)">hinLTO</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(582.438 297)">framework</text><path d="M502 327C502 312.641 544.757 301 597.5 301 650.243 301 693 312.641 693 327 693 341.359 650.243 353 597.5 353 544.757 353 502 341.359 502 327Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(543.25 323)">Import function</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(560.25 342)">definitions</text><path d="M502 412C502 397.641 544.757 386 597.5 386 650.243 386 693 397.641 693 412 693 426.359 650.243 438 597.5 438 544.757 438 502 426.359 502 412Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(551.5 417)">Optimization</text><path d="M502 488C502 473.641 544.757 462 597.5 462 650.243 462 693 473.641 693 488 693 502.359 650.243 514 597.5 514 544.757 514 502 502.359 502 488Z" 
stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(566.25 493)">Codegen</text><path d="M499 564.5C499 551.521 542.876 541 597 541 651.124 541 695 551.521 695 564.5 695 577.479 651.124 588 597 588 542.876 588 499 577.479 499 564.5Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(566.416 570)">Bundling</text><path d="M488 646C488 627.222 536.801 612 597 612 657.199 612 706 627.222 706 646 706 664.778 657.199 680 597 680 536.801 680 488 664.778 488 646Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(545.836 632)">Call host linker</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(540.836 651)">with bundled fat</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(575.836 670)">object</text><path d="M584 143 590.5 143 590.5 132 603.5 132 603.5 143 610 143 597 154Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M584 260.5 590.5 260.5 590.5 252 603.5 252 603.5 260.5 610 260.5 597 269Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M584 370.5 590.5 370.5 590.5 359 603.5 359 603.5 370.5 610 370.5 597 382Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M584 529.5 590.5 529.5 590.5 519 603.5 519 603.5 529.5 610 529.5 597 540Z" stroke="#042433" 
stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M584 451 590.5 451 590.5 443 603.5 443 603.5 451 610 451 597 459Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M584 599.5 590.5 599.5 590.5 590 603.5 590 603.5 599.5 610 599.5 597 609Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/></g></svg>
\ No newline at end of file
diff --git a/sycl/doc/design/images/ThinLTOSYCLFlow.svg b/sycl/doc/design/images/ThinLTOSYCLFlow.svg
new file mode 100644
index 0000000000000..622f4d20d158c
--- /dev/null
+++ b/sycl/doc/design/images/ThinLTOSYCLFlow.svg
@@ -0,0 +1 @@
+<svg width="1280" height="720" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" overflow="hidden"><defs><clipPath id="clip0"><rect x="0" y="0" width="1280" height="720"/></clipPath></defs><g clip-path="url(#clip0)"><rect x="0" y="0" width="1280" height="720" fill="#FFFFFF"/><rect x="220" y="20" width="752" height="683" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#7F7F7F"/><path d="M485 57.5C485 43.4167 534.696 32 596 32 657.304 32 707 43.4167 707 57.5 707 71.5833 657.304 83 596 83 534.696 83 485 71.5833 485 57.5Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(529.17 53)">Extract</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(580.17 53)">device</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(629.17 53)">code</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(556.17 72)">from</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(592.17 72)">input</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(628.17 72)">s</text><text font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(483.018 16)">c</text><text font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(496.018 16)">l</text><text font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(502.018 16)">ang</text><text 
font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(540.018 16)">-</text><text font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(548.018 16)">linker</text><text font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(604.018 16)">-</text><text font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(612.018 16)">wrapper</text><path d="M476 157C476 136.013 529.726 119 596 119 662.274 119 716 136.013 716 157 716 177.987 662.274 195 596 195 529.726 195 476 177.987 476 157Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(528.753 145)">Process</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(582.753 145)">symbols</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(639.753 145)">and</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(522.753 162)">add</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(549.753 162)">device</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(594.753 162)">code</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(628.753 162)">to</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(644.753 162)">LTO</text><text 
fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(560.753 180)">framewor</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(623.753 180)">k</text><rect x="409" y="213" width="374" height="273" stroke="#000000" stroke-width="2" stroke-miterlimit="8" fill="#83CBEB"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(494.438 234)">T</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(505.438 234)">hinLTO</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(581.438 234)">framework</text><path d="M555 250.5C555 243.044 573.58 237 596.5 237 619.42 237 638 243.044 638 250.5 638 257.956 619.42 264 596.5 264 573.58 264 555 257.956 555 250.5Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="6" transform="translate(587.586 246)">Import</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="6" transform="translate(585.586 253)">function</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="6" transform="translate(583.586 260)">definitions</text><path d="M559 289.5C559 283.701 575.565 279 596 279 616.435 279 633 283.701 633 289.5 633 295.299 616.435 300 596 300 575.565 300 559 295.299 559 289.5Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="5" transform="translate(581.836 292)">Optimization</text><path d="M518 
594C518 578.536 553.146 566 596.5 566 639.854 566 675 578.536 675 594 675 609.464 639.854 622 596.5 622 553.146 622 518 609.464 518 594Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(565.336 599)">Bundling</text><path d="M534 670C534 654.536 561.982 642 596.5 642 631.018 642 659 654.536 659 670 659 685.464 631.018 698 596.5 698 561.982 698 534 685.464 534 670Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="11" transform="translate(561.67 661)">Call host linker</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="11" transform="translate(564.67 674)">with bundled</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="11" transform="translate(572.67 686)">fat object</text><path d="M583 106.5 589.5 106.5 589.5 95 602.5 95 602.5 106.5 609 106.5 596 118Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M589 204 592.5 204 592.5 197 599.5 197 599.5 204 603 204 596 211Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M591 271.5 593.75 271.5 593.75 266 599.25 266 599.25 271.5 602 271.5 596.5 277Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M586 472.5 591.25 472.5 591.25 463 601.75 463 601.75 472.5 607 472.5 596.5 482Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M589 558 592.5 558 592.5 548 599.5 548 599.5 558 603 558 596 565Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M591 307.5 
593.5 307.5 593.5 303 598.5 303 598.5 307.5 601 307.5 596 312Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><rect x="518" y="314" width="156" height="116" stroke="#000000" stroke-width="2" stroke-miterlimit="8" fill="#4E95D9"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="700" font-size="8" transform="translate(543.606 322)">PreCodeGenModuleHook</text><path d="M568 405C568 400.582 580.536 397 596 397 611.464 397 624 400.582 624 405 624 409.418 611.464 413 596 413 580.536 413 568 409.418 568 405Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="4" transform="translate(587.166 404)">Call SPIR</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="4" transform="translate(602.166 404)">-</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="4" transform="translate(603.166 404)">V</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="4" transform="translate(588.166 408)">translator</text><path d="M591 386.08 593.5 386.08 593.5 382 598.5 382 598.5 386.08 601 386.08 596 393Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M569 371C569 366.582 581.088 363 596 363 610.912 363 623 366.582 623 371 623 375.418 610.912 379 596 379 581.088 379 569 375.418 569 371Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="4" transform="translate(587.92 368)">Store fully</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="4" transform="translate(590.92 
372)">linked</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="4" transform="translate(589.92 377)">module</text><path d="M551 446.5C551 439.044 571.371 433 596.5 433 621.629 433 642 439.044 642 446.5 642 453.956 621.629 460 596.5 460 571.371 460 551 453.956 551 446.5Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="8" transform="translate(579.586 449)">Earl</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="8" transform="translate(592.586 449)">y</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="8" transform="translate(598.586 449)">exi</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="8" transform="translate(608.586 449)">t</text><path d="M591 422.5 593.75 422.5 593.75 417 599.25 417 599.25 422.5 602 422.5 596.5 428Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M518 517C518 501.536 553.146 489 596.5 489 639.854 489 675 501.536 675 517 675 532.464 639.854 545 596.5 545 553.146 545 518 532.464 518 517Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="12" transform="translate(571.333 507)">Compute</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="12" transform="translate(550.333 521)">metadata for fully</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="12" transform="translate(556.333 536)">linked modules</text><path d="M583 631.5 589.5 631.5 589.5 622 602.5 622 602.5 631.5 609 631.5 596 
641Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M566 336.5C566 331.253 579.655 327 596.5 327 613.345 327 627 331.253 627 336.5 627 341.747 613.345 346 596.5 346 579.655 346 566 341.747 566 336.5Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="4" transform="translate(587.17 331)">Run linked</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="4" transform="translate(590.17 335)">module</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="4" transform="translate(587.17 340)">finalization</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="4" transform="translate(590.17 345)">passes</text><path d="M591 352.772 593.5 352.772 593.5 350 598.5 350 598.5 352.772 601 352.772 596 359Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/></g></svg>
\ No newline at end of file
diff --git a/sycl/doc/design/images/ThinLTOSYCLSPIRVBackendFlow.svg b/sycl/doc/design/images/ThinLTOSYCLSPIRVBackendFlow.svg
new file mode 100644
index 0000000000000..2b048f531b8b7
--- /dev/null
+++ b/sycl/doc/design/images/ThinLTOSYCLSPIRVBackendFlow.svg
@@ -0,0 +1 @@
+<svg width="1280" height="720" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" overflow="hidden"><defs><clipPath id="clip0"><rect x="0" y="0" width="1280" height="720"/></clipPath></defs><g clip-path="url(#clip0)"><rect x="0" y="0" width="1280" height="720" fill="#FFFFFF"/><rect x="220" y="20" width="752" height="683" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#7F7F7F"/><path d="M498 59C498 44.0883 542.1 32 596.5 32 650.9 32 695 44.0883 695 59 695 73.9117 650.9 86 596.5 86 542.1 86 498 73.9117 498 59Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(548.166 55)">Extract</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(599.166 55)">device</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(537.166 74)">code</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(575.166 74)">from</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(611.166 74)">input</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(647.166 74)">s</text><text font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(483.018 16)">c</text><text font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(496.018 16)">l</text><text font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(502.018 16)">ang</text><text 
font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(540.018 16)">-</text><text font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(548.018 16)">linker</text><text font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(604.018 16)">-</text><text font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(612.018 16)">wrapper</text><path d="M489 155.5C489 135.342 537.129 119 596.5 119 655.871 119 704 135.342 704 155.5 704 175.658 655.871 192 596.5 192 537.129 192 489 175.658 489 155.5Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(541.833 143)">Process</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(595.833 143)">symbols</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(530.833 160)">and</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(557.833 160)">add</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(584.833 160)">device</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(629.833 160)">code</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(538.833 178)">to</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(554.833 178)">LTO</text><text 
fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(582.833 178)">framewor</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(645.833 178)">k</text><rect x="409" y="213" width="374" height="273" stroke="#000000" stroke-width="2" stroke-miterlimit="8" fill="#83CBEB"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(494.438 234)">T</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(505.438 234)">hinLTO</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="24" transform="translate(581.438 234)">framework</text><path d="M555 250.5C555 243.044 573.58 237 596.5 237 619.42 237 638 243.044 638 250.5 638 257.956 619.42 264 596.5 264 573.58 264 555 257.956 555 250.5Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="6" transform="translate(587.586 246)">Import</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="6" transform="translate(585.586 253)">function</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="6" transform="translate(583.586 260)">definitions</text><path d="M559 289.5C559 283.701 575.565 279 596 279 616.435 279 633 283.701 633 289.5 633 295.299 616.435 300 596 300 575.565 300 559 295.299 559 289.5Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="5" transform="translate(581.836 292)">Optimization</text><path d="M518 
593.5C518 579.417 553.146 568 596.5 568 639.854 568 675 579.417 675 593.5 675 607.583 639.854 619 596.5 619 553.146 619 518 607.583 518 593.5Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="16" transform="translate(565.336 599)">Bundling</text><path d="M534 672C534 656.536 561.982 644 596.5 644 631.018 644 659 656.536 659 672 659 687.464 631.018 700 596.5 700 561.982 700 534 687.464 534 672Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="11" transform="translate(561.67 663)">Call host linker</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="11" transform="translate(564.67 676)">with bundled</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="11" transform="translate(572.67 688)">fat object</text><path d="M583 102 589.5 102 589.5 91 602.5 91 602.5 102 609 102 596 113Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M589 204 592.5 204 592.5 194 599.5 194 599.5 204 603 204 596 211Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M591 271.5 593.75 271.5 593.75 266 599.25 266 599.25 271.5 602 271.5 596.5 277Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M586 473 591.25 473 591.25 463 601.75 463 601.75 473 607 473 596.5 483Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M589 558 592.5 558 592.5 548 599.5 548 599.5 558 603 558 596 565Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M591 307.5 593.5 
307.5 593.5 303 598.5 303 598.5 307.5 601 307.5 596 312Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><rect x="518" y="314" width="156" height="116" stroke="#000000" stroke-width="2" stroke-miterlimit="8" fill="#4E95D9"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="7" transform="translate(554 321)">PreCodeGenPassesHook</text><path d="M549 391C549 381.059 570.043 373 596 373 621.957 373 643 381.059 643 391 643 400.941 621.957 409 596 409 570.043 409 549 400.941 549 391Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="9" transform="translate(576.666 383)">Store fully</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="9" transform="translate(584.666 394)">linked</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="9" transform="translate(580.666 405)">module</text><path d="M551 446.5C551 439.044 571.371 433 596.5 433 621.629 433 642 439.044 642 446.5 642 453.956 621.629 460 596.5 460 571.371 460 551 453.956 551 446.5Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="8" transform="translate(580.583 449)">Codegen</text><path d="M518 517C518 501.536 553.146 489 596.5 489 639.854 489 675 501.536 675 517 675 532.464 639.854 545 596.5 545 553.146 545 518 532.464 518 517Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="12" transform="translate(571.333 507)">Compute</text><text fill="#FFFFFF" 
font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="12" transform="translate(550.333 521)">metadata for fully</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="12" transform="translate(556.333 536)">linked modules</text><path d="M587 633 591.5 633 591.5 625 600.5 625 600.5 633 605 633 596 641Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M561 338.5C561 330.492 576.67 324 596 324 615.33 324 631 330.492 631 338.5 631 346.508 615.33 353 596 353 576.67 353 561 346.508 561 338.5Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="5" transform="translate(584 331)">Run linked</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="5" transform="translate(587 337)">module</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="5" transform="translate(583 344)">finalization</text><text fill="#FFFFFF" font-family="Aptos,Aptos_MSFontService,sans-serif" font-weight="400" font-size="5" transform="translate(588 350)">passes</text><path d="M590 362.003 593.25 362.003 593.25 356 599.75 356 599.75 362.003 603 362.003 596.5 371Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/><path d="M591 421.08 593.5 421.08 593.5 412 598.5 412 598.5 421.08 601 421.08 596 428Z" stroke="#042433" stroke-width="2" stroke-miterlimit="8" fill="#156082" fill-rule="evenodd"/></g></svg>
\ No newline at end of file

From 8c4edb31c2afcfd0f046d47c3cd6913c9551fb26 Mon Sep 17 00:00:00 2001
From: "Sarnie, Nick" <nick.sarnie@intel.com>
Date: Wed, 18 Sep 2024 10:09:18 -0700
Subject: [PATCH 4/9] fix merge

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
---
 .../ClangLinkerWrapper.cpp                    |  7 --
 foo                                           | 68 -------------------
 llvm/include/llvm/Object/OffloadBinary.h      | 11 ---
 3 files changed, 86 deletions(-)
 delete mode 100644 foo

diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 165f1d253cd55..9bf3b7d7bb594 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -2641,13 +2641,6 @@ Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
     if (HasSYCLOffloadKind) {
       StringRef TmpOutput;
       if (!sycl::isSYCLThinLTO(Args, Triple)) {
-        // Write device inputs to an output file for the linker.
-        for (const OffloadFile &File : Input) {
-          auto FileNameOrErr = writeOffloadFile(File);
-          if (!FileNameOrErr)
-            return FileNameOrErr.takeError();
-          InputFiles.emplace_back(*FileNameOrErr);
-        }
         // Link the input device files using the device linker for SYCL
         // offload.
         auto TmpOutputOrErr = sycl::linkDevice(InputFiles, LinkerArgs);
diff --git a/foo b/foo
deleted file mode 100644
index 36206fcd9b1f0..0000000000000
--- a/foo
+++ /dev/null
@@ -1,68 +0,0 @@
-diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
-index 147b6010228a..9173ff2ac48b 100644
---- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
-+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
-@@ -154,6 +154,8 @@ static std::atomic<bool> LTOError;
- 
- static std::optional<llvm::module_split::IRSplitMode> SYCLModuleSplitMode;
- 
-+static bool UseSYCLPostLinkTool;
-+
- SmallString<128> SPIRVDumpDir;
- 
- using OffloadingImage = OffloadBinary::OffloadingImage;
-@@ -2392,10 +2394,10 @@ Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
-       SmallVector<StringRef> InputFilesSYCL;
-       InputFilesSYCL.emplace_back(*TmpOutputOrErr);
-       auto SplitModulesOrErr =
--          SYCLModuleSplitMode
--              ? sycl::runSYCLSplitLibrary(InputFilesSYCL, LinkerArgs,
--                                          *SYCLModuleSplitMode)
--              : sycl::runSYCLPostLinkTool(InputFilesSYCL, LinkerArgs);
-+          UseSYCLPostLinkTool
-+              ? sycl::runSYCLPostLinkTool(InputFilesSYCL, LinkerArgs)
-+              : sycl::runSYCLSplitLibrary(InputFilesSYCL, LinkerArgs,
-+                                          *SYCLModuleSplitMode);
-       if (!SplitModulesOrErr)
-         return SplitModulesOrErr.takeError();
- 
-@@ -2961,7 +2963,19 @@ int main(int Argc, char **Argv) {
-     timeTraceProfilerInitialize(Granularity, Argv[0]);
-   }
- 
-+  UseSYCLPostLinkTool = Args.hasFlag(OPT_use_sycl_post_link_tool,
-+                                     OPT_no_use_sycl_post_link_tool, true);
-+  if (!UseSYCLPostLinkTool && Args.hasArg(OPT_use_sycl_post_link_tool))
-+    reportError(createStringError("-use-sycl-post-link-tool and "
-+                                  "-no-use-sycl-post-link-tool options can't "
-+                                  "be used together."));
-+
-   if (Args.hasArg(OPT_sycl_module_split_mode_EQ)) {
-+    if (UseSYCLPostLinkTool)
-+      reportError(createStringError(
-+          "-sycl-module-split-mode should be used with "
-+          "the -no-use-sycl-post-link-tool command line option."));
-+
-     StringRef StrMode = Args.getLastArgValue(OPT_sycl_module_split_mode_EQ);
-     SYCLModuleSplitMode = module_split::convertStringToSplitMode(StrMode);
-     if (!SYCLModuleSplitMode)
-diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
-index 2926a08c8759..60a13b23ba30 100644
---- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
-+++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
-@@ -184,6 +184,15 @@ def sycl_module_split_mode_EQ :
-   Flags<[WrapperOnlyOption]>,
-   HelpText<"Option that turns on split library with the given split mode">;
- 
-+// TODO: Options will be removed when the sycl-post-link tool becomes removed.
-+def use_sycl_post_link_tool : Flag<["--", "-"], "use-sycl-post-link-tool">,
-+  Flags<[WrapperOnlyOption]>,
-+  HelpText<"Use the sycl-post-link tool. On by default">;
-+
-+def no_use_sycl_post_link_tool : Flag<["--", "-"], "no-use-sycl-post-link-tool">,
-+  Flags<[WrapperOnlyOption]>,
-+  HelpText<"Use a SYCL library instead of sycl-post-link tool. (experimental)">;
-+
- // Special option to pass in llvm-spirv options
- def llvm_spirv_options_EQ : Joined<["--", "-"], "llvm-spirv-options=">,
-   Flags<[WrapperOnlyOption]>,
diff --git a/llvm/include/llvm/Object/OffloadBinary.h b/llvm/include/llvm/Object/OffloadBinary.h
index 3d1b18b7f3418..d590110ffa598 100644
--- a/llvm/include/llvm/Object/OffloadBinary.h
+++ b/llvm/include/llvm/Object/OffloadBinary.h
@@ -103,14 +103,6 @@ class OffloadBinary : public Binary {
 
   StringRef getString(StringRef Key) const { return StringData.lookup(Key); }
 
-  /// XXX: Hack
-  const SmallVectorImpl<std::string> &getTmpStrings() const {
-    return TmpStringData;
-  }
-
-  /// XXX: Hack
-  void addTmpString(std::string Value) { TmpStringData.push_back(Value); }
-
   static bool classof(const Binary *V) { return V->isOffloadFile(); }
 
   struct Header {
@@ -159,9 +151,6 @@ class OffloadBinary : public Binary {
   const Header *TheHeader;
   /// Location of the metadata entries within the binary.
   const Entry *TheEntry;
-
-  /// XXX: Hack
-  SmallVector<std::string, 8> TmpStringData;
 };
 
 /// A class to contain the binary information for a single OffloadBinary that

From 33226848b7f23d96f4e5ca41085887e529e92ab0 Mon Sep 17 00:00:00 2001
From: Sarnie <nick.sarnie@intel.com>
Date: Thu, 19 Sep 2024 07:12:51 -0700
Subject: [PATCH 5/9] fix nvptx libdevice duplicate symbol error, investigate
 later

Signed-off-by: Sarnie <nick.sarnie@intel.com>
---
 libdevice/fallback-cassert.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/libdevice/fallback-cassert.cpp b/libdevice/fallback-cassert.cpp
index 5d3c99d63c556..1c685737002f4 100644
--- a/libdevice/fallback-cassert.cpp
+++ b/libdevice/fallback-cassert.cpp
@@ -114,9 +114,4 @@ DEVICE_EXTERN_C void __devicelib_assert_fail(const char *expr, const char *file,
   __assertfail(expr, file, line, func, 1);
 }
 
-DEVICE_EXTERN_C void _wassert(const char *_Message, const char *_File,
-                              unsigned _Line) {
-  __assertfail(_Message, _File, _Line, 0, 1);
-}
-
 #endif

From 43df8c948764749a7a40421ddfc14acb69721f6e Mon Sep 17 00:00:00 2001
From: Nick Sarnie <nick.sarnie@intel.com>
Date: Thu, 19 Sep 2024 10:39:39 -0700
Subject: [PATCH 6/9] typos

Signed-off-by: Nick Sarnie <nick.sarnie@intel.com>
---
 sycl/doc/design/ThinLTO.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sycl/doc/design/ThinLTO.md b/sycl/doc/design/ThinLTO.md
index 3ffb7819bef68..b1cf4d4082698 100644
--- a/sycl/doc/design/ThinLTO.md
+++ b/sycl/doc/design/ThinLTO.md
@@ -91,7 +91,7 @@ However, the flow is not ideal for many reasons:
 backend. We could slightly improve this issue by using a library call to the 
 SPIR-V translator instead of the tool, however the library API requires setting 
 up an object to represent the arguments while we only have strings, and it's 
-non-trivial to parse the trings to figure out how to create the argument 
+non-trivial to parse the strings to figure out how to create the argument 
 object. Since we plan to use the SPIR-V backend in the long term, this does not 
 seem to be worth the effort.
 
@@ -104,9 +104,9 @@ when we actually are going to run codegen.
 fully linked module to accurately compute metadata, see the above explanation 
 of SYCL requirement 2). We could get around storing the module by computing the 
 metadata inside the LTO framework and storing it for late use by the SYCL 
-bundling code, but doing this would require SYCL-only customizations including 
+bundling code, but doing this would require even more SYCL-only customizations including 
 even more new function arguments and modifications of the `OffloadFile` class. 
-It's also complicated because the LTO framework is multithreaded, and not all 
+There are also compliations because the LTO framework is multithreaded, and not all 
 LLVM data structures are thread safe.
 
 The proposed long-term SYCL ThinLTO flow is as follows:
@@ -117,7 +117,7 @@ The biggest difference here is that we are running codegen using the SPIR-V
 backend.
 
 Also, instead of using a lambda function in the `PreCodeGenModuleHook` 
-callback, we can take advantage of the `PreCodeGenPassesHook` field to add 
+callback to run SYCL finalization passes, we can take advantage of the `PreCodeGenPassesHook` field to add 
 passes to the pass manager that the LTO framework will run.
 
 It is possible that the number of device images in the fat executable

From 9922d3093eea9545aea04d6ba06443e37e57c1ed Mon Sep 17 00:00:00 2001
From: Alexey Bader <alexey.bader@intel.com>
Date: Thu, 19 Sep 2024 16:48:55 -0700
Subject: [PATCH 7/9] [NFC] Fix typos, markdown linter issues

Among other changes:
- removed trailing spaces
- fixed 80-char line limitations
---
 sycl/doc/design/CompilerAndRuntimeDesign.md |   4 +-
 sycl/doc/design/ThinLTO.md                  | 190 ++++++++++----------
 2 files changed, 98 insertions(+), 96 deletions(-)

diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md
index e22492970d99e..7843b5d3b88e6 100644
--- a/sycl/doc/design/CompilerAndRuntimeDesign.md
+++ b/sycl/doc/design/CompilerAndRuntimeDesign.md
@@ -550,8 +550,8 @@ unit)
 - `off` - disables device code split. If `-fno-sycl-rdc` is specified, the behavior is
    the same as `per_source`
 
-If ThinLTO is enabled, device code splitting is run during the compilation stage.
-See [here](ThinLTO.md) for more information.
+If ThinLTO is enabled, device code splitting is run during the compilation
+stage. See [here](ThinLTO.md) for more information.
 
 ##### Symbol table generation
 
diff --git a/sycl/doc/design/ThinLTO.md b/sycl/doc/design/ThinLTO.md
index b1cf4d4082698..41a0a00786bdf 100644
--- a/sycl/doc/design/ThinLTO.md
+++ b/sycl/doc/design/ThinLTO.md
@@ -6,142 +6,144 @@ This document describes the purpose and design of ThinLTO for SYCL.
 
 ## Background
 
-With traditional SYCL device code linking, all user code is linked together 
-along with device libraries into a single huge module and then split and 
-processed by `sycl-post-link`. This requires sequential processing, has a large 
+With traditional SYCL device code linking, all user code is linked together
+along with device libraries into a single huge module and then split and
+processed by `sycl-post-link`. This requires sequential processing, has a large
 memory footprint, and differs from the linking flow for AMD and NVIDIA devices.
 
 ## Summary
-SYCL ThinLTO will hook into the existing community mechanism to run LTO as part 
-of device linking inside `clang-linker-wrapper`. We split the device images 
-early at compilation time, and at link time we use ThinLTO's function importing 
-feature
-to bring in the defintions for referenced functions. Only the new offload model
-is supported.
+
+SYCL ThinLTO will hook into the existing community mechanism to run LTO as part
+of device linking inside `clang-linker-wrapper`. We split the device images
+early at compilation time, and at link time we use ThinLTO's function importing
+feature to bring in the definitions for referenced functions. Only the new
+offload model is supported.
 
 ## Device code compilation time changes
-Most of the changes for ThinLTO occur during device link time, however there is 
-one major change during compilation (-c) time: we now run device code split 
-during compilaton instead of linking.
-The main reason for doing this is increased parallelization. Many compilation 
-jobs can be run at the same time, but linking happens once total for the 
-application. Device code split is currently a common source of performance 
-issues.
-
-Splitting early means that the resulting IR after splitting is not complete, it 
-still may contain calls to functions (user code and/or the SYCL device 
+
+Most of the changes for ThinLTO occur during device link time, however there is
+one major change during compilation (-c) time: we now run device code split
+during compilation instead of linking. The main reason for doing this is
+increased parallelization. Many compilation jobs can be run at the same time,
+but linking happens once total for the application. Device code split is
+currently a common source of performance issues.
+
+Splitting early means that the resulting IR after splitting is not complete, it
+still may contain calls to functions (user code and/or the SYCL device
 libraries) from other object files.
 
-We rely on the assumption that all function defintions matching a declaration 
+We rely on the assumption that all function definitions matching a declaration
 will be the same and we can let ThinLTO pull in any one.
 
-For example, let's start with user device code that defines a `SYCL_EXTERNAL` 
-function `foo` in translation unit `tu_foo`. There is also another translation 
-unit `tu_bar` that references `foo`.
-During the early device code splitting run of `tu_foo`, we may find that more 
-than one of the resultant device images contain a defintion for `foo`.
+For example, let's start with user device code that defines a `SYCL_EXTERNAL`
+function `foo` in translation unit `tu_foo`. There is also another translation
+unit `tu_bar` that references `foo`. During the early device code splitting run
+of `tu_foo`, we may find that more than one of the resultant device images
+contain a definition for `foo`.
 
-We assert that any function defintion for `foo` that is deemed a match by the 
-ThinLTO infrastruction during the processing of `tu_bar` is valid.
+We assert that any function definition for `foo` that is deemed a match by the
+ThinLTO infrastructure during the processing of `tu_bar` is valid.
 
-As a result of running early device code split, the fat object file generated 
-as part of device compilation may contain multiple device code images.
+As a result of running early device code split, the fat object file generated as
+part of device compilation may contain multiple device code images.
 
-# Device code link time changes
+## Device code link time changes
 
-Before we go into the link time changes for SYCL, let's understand the device 
+Before we go into the link time changes for SYCL, let's understand the device
 linking flow for community devices (AMD/NVIDIA):
 
 ![Community linking flow](images/ThinLTOCommunityFlow.svg)
 
-SYCL has two differenting requirements:
+SYCL has two differentiating requirements:
+
 1) The SPIR-V backend is not production ready and the SPIR-V translator is used.
-2) The SYCL runtime requires metadata (module properties and module symbol 
-table) computed from device images that will be stored along the device images 
+2) The SYCL runtime requires metadata (module properties and module symbol
+table) computed from device images that will be stored along the device images
 in the fat executable.
 
-The effect of requirement 1) is that instead of letting ThinLTO call the SPIR-V 
-backend, we add a callback that runs right before codegen would run.
-In that callback, we call the SPIR-V translator and store the resultant file 
-path for use later, and we instruct the ThinLTO framework to not
-perform codegen.
-
-An interesting additional fact about requirement 2) is that we actually need to 
-process fully linked module to accurate compute the module properties. One 
-example where we need the full module is to [compute the required devicelib mask](https://github.com/intel/llvm/blob/sycl/llvm/lib/SYCLLowerIR/SYCLDeviceLibReqMask.cpp).
-If we only process the device code that was included in the 
-original fat object input to `clang-linker-wrapper`, we will miss devicelib 
-calls in referenced `SYCL_EXTERNAL` functions.
-
-The effect of requirement 2) is that we store the fully linked device image for 
-metadata computation in the SYCL-specific handing code after the ThinLTO 
-framework has completed. Another option would be to try to compute the metadata 
-inside the ThinLTO framework callbacks, but this would require SYCL-specific 
+The effect of requirement 1) is that instead of letting ThinLTO call the SPIR-V
+backend, we add a callback that runs right before CodeGen would run. In that
+callback, we call the SPIR-V translator and store the resultant file path for
+use later, and we instruct the ThinLTO framework to not perform CodeGen.
+
+An interesting additional fact about requirement 2) is that we actually need to
+process a fully linked module to accurately compute the module properties. One
+example where we need the full module is to [compute the required devicelib
+mask](https://github.com/intel/llvm/blob/sycl/llvm/lib/SYCLLowerIR/SYCLDeviceLibReqMask.cpp).
+If we only process the device code that was included in the original fat object
+input to `clang-linker-wrapper`, we will miss devicelib calls in referenced
+`SYCL_EXTERNAL` functions.
+
+The effect of requirement 2) is that we store the fully linked device image for
+metadata computation in the SYCL-specific handling code after the ThinLTO
+framework has completed. Another option would be to try to compute the metadata
+inside the ThinLTO framework callbacks, but this would require SYCL-specific
 arguments to many caller functions in the stack and pollute community code.
 
 Here is the current ThinLTO flow for SYCL:
 
 ![SYCL linking flow](images/ThinLTOSYCLFlow.svg)
 
-We add a `PreCodeGenModuleHook` function to the `LTOConfig` object so that we 
+We add a `PreCodeGenModuleHook` function to the `LTOConfig` object so that we
 can process the fully linked module without running the backend.
 
 However, the flow is not ideal for many reasons:
-1) We are relying on the external `llvm-spirv` tool instead of the SPIR-V 
-backend. We could slightly improve this issue by using a library call to the 
-SPIR-V translator instead of the tool, however the library API requires setting 
-up an object to represent the arguments while we only have strings, and it's 
-non-trivial to parse the strings to figure out how to create the argument 
-object. Since we plan to use the SPIR-V backend in the long term, this does not 
+
+1) We are relying on the external `llvm-spirv` tool instead of the SPIR-V
+backend. We could slightly improve this issue by using a library call to the
+SPIR-V translator instead of the tool, however the library API requires setting
+up an object to represent the arguments while we only have strings, and it's
+non-trivial to parse the strings to figure out how to create the argument
+object. Since we plan to use the SPIR-V backend in the long term, this does not
 seem to be worth the effort.
 
-2) We manually run passes inside `PreCodeGenModuleHook`. This is because we 
-don't run codegen, so we can't take advantage of the `PreCodeGenPassesHook` 
-field of `LTOConfig` to run some custom passes, as those passes are only run 
-when we actually are going to run codegen.
+2) We manually run passes inside `PreCodeGenModuleHook`. This is because we
+don't run CodeGen, so we can't take advantage of the `PreCodeGenPassesHook`
+field of `LTOConfig` to run some custom passes, as those passes are only run
+when we actually are going to run CodeGen.
 
-3) We have to store the fully linked module. This is needed because we need a 
-fully linked module to accurately compute metadata, see the above explanation 
-of SYCL requirement 2). We could get around storing the module by computing the 
-metadata inside the LTO framework and storing it for late use by the SYCL 
-bundling code, but doing this would require even more SYCL-only customizations including 
-even more new function arguments and modifications of the `OffloadFile` class. 
-There are also compliations because the LTO framework is multithreaded, and not all 
-LLVM data structures are thread safe.
+3) We have to store the fully linked module. This is needed because we need a
+fully linked module to accurately compute metadata, see the above explanation of
+SYCL requirement 2). We could get around storing the module by computing the
+metadata inside the LTO framework and storing it for late use by the SYCL
+bundling code, but doing this would require even more SYCL-only customizations
+including even more new function arguments and modifications of the
+`OffloadFile` class. There are also complications because the LTO framework is
+multithreaded, and not all LLVM data structures are thread safe.
 
 The proposed long-term SYCL ThinLTO flow is as follows:
 
 ![SYCL SPIR-V backend linking flow](images/ThinLTOSYCLSPIRVBackendFlow.svg)
 
-The biggest difference here is that we are running codegen using the SPIR-V 
+The biggest difference here is that we are running CodeGen using the SPIR-V
 backend.
 
-Also, instead of using a lambda function in the `PreCodeGenModuleHook` 
-callback to run SYCL finalization passes, we can take advantage of the `PreCodeGenPassesHook` field to add 
-passes to the pass manager that the LTO framework will run.
-
-It is possible that the number of device images in the fat executable
-and which device image contains which kernel is different with ThinLTO
-enabled, but we do expect this to have any impact on correctness or
-performance, nor we do expect users to care.
+Also, instead of using a lambda function in the `PreCodeGenModuleHook` callback
+to run SYCL finalization passes, we can take advantage of the
+`PreCodeGenPassesHook` field to add passes to the pass manager that the LTO
+framework will run.
 
+It is possible that the number of device images in the fat executable and which
+device image contains which kernel is different with ThinLTO enabled, but we do
+not expect this to have any impact on correctness or performance, nor do we
+expect users to care.
 
-# Current limitations
+## Current limitations
 
-`-O0`: Compiling with `-O0` prevent clang from generating ThinLTO metadata 
-during the compilation phase. In the current implementation, this is an error. 
-In the final version, we could either silently fall back to full LTO or 
-generate ThinLTO metadata even for `-O0`.
+`-O0`: Compiling with `-O0` prevents clang from generating ThinLTO metadata
+during the compilation phase. In the current implementation, this is an error.
+In the final version, we could either silently fall back to full LTO or generate
+ThinLTO metadata even for `-O0`.
 
-SYCL libdevice: Current all `libdevice` functions are explicitly marked to be 
-weak symbols. The ThinLTO framework does not consider a defintion of function 
-with weak linkage as it cannot be sure that this definiton is the correct one. 
+SYCL libdevice: Currently all `libdevice` functions are explicitly marked to be
+weak symbols. The ThinLTO framework does not consider a definition of function
+with weak linkage as it cannot be sure that this definition is the correct one.
 Ideally we could remove the weak symbol annotation.
 
-No binary linkage: The SPIR-V target does not currently have a production 
-quality binary linker. This means that we must generate a fully linked image as 
-part of device linkage. At least for AMD devices, this is not a requirement as 
-`lld` is used for the final link which can resolve any unresolved symbols. 
-`-fno-gpu-rdc` is default for AMD, so in that case it can call `lld` during 
-compile, but if `-fno-gpu-rdc` is passed, the lld call happens as part of 
-`clang-linker-wrapper` to resolve any symbols not resolved by ThinLTO.
\ No newline at end of file
+No binary linkage: The SPIR-V target does not currently have a production
+quality binary linker. This means that we must generate a fully linked image as
+part of device linkage. At least for AMD devices, this is not a requirement as
+`lld` is used for the final link which can resolve any unresolved symbols.
+`-fno-gpu-rdc` is default for AMD, so in that case it can call `lld` during
+compile, but if `-fgpu-rdc` is passed, the lld call happens as part of
+`clang-linker-wrapper` to resolve any symbols not resolved by ThinLTO.

From 1deaea2839c0bef5fd0963e7dd253779b22b6162 Mon Sep 17 00:00:00 2001
From: Alexey Bader <alexey.bader@intel.com>
Date: Thu, 19 Sep 2024 17:23:36 -0700
Subject: [PATCH 8/9] [NFC] Small word tweaking.

---
 sycl/doc/design/ThinLTO.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/doc/design/ThinLTO.md b/sycl/doc/design/ThinLTO.md
index 41a0a00786bdf..58232f0aff157 100644
--- a/sycl/doc/design/ThinLTO.md
+++ b/sycl/doc/design/ThinLTO.md
@@ -30,7 +30,7 @@ currently a common source of performance issues.
 
 Splitting early means that the resulting IR after splitting is not complete, it
 still may contain calls to functions (user code and/or the SYCL device
-libraries) from other object files.
+libraries) defined in other translation units.
 
 We rely on the assumption that all function definitions matching a declaration
 will be the same and we can let ThinLTO pull in any one.
@@ -50,7 +50,7 @@ part of device compilation may contain multiple device code images.
 ## Device code link time changes
 
 Before we go into the link time changes for SYCL, let's understand the device
-linking flow for community devices (AMD/NVIDIA):
+linking flow for AMD/NVIDIA devices:
 
 ![Community linking flow](images/ThinLTOCommunityFlow.svg)
 

From c68e7976bf6a49d983767db21b004f4083e2bc0a Mon Sep 17 00:00:00 2001
From: "Sarnie, Nick" <nick.sarnie@intel.com>
Date: Mon, 23 Sep 2024 08:42:46 -0700
Subject: [PATCH 9/9] address initial feedback

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
---
 .../ClangLinkerWrapper.cpp                    | 25 +++++++------------
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 9bf3b7d7bb594..e58883a932efe 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -640,14 +640,13 @@ static Error getDeviceLibsForLTO(SmallVector<OffloadFile> &DeviceLibs,
             "Number of device library files cannot be zero.");
       for (StringRef Val : A->getValues()) {
         SmallString<128> LibName(Val);
-        if (llvm::sys::fs::exists(LibName)) {
-          if (auto Err = processFile(LibName))
-            return Err;
-        } else
+        if (!llvm::sys::fs::exists(LibName))
           return createStringError(
               inconvertibleErrorCode(),
               std::string(LibName) +
                   " SYCL device library file for NVPTX is not found.");
+        if (auto Err = processFile(LibName))
+          return Err;
       }
     }
   }
@@ -692,19 +691,17 @@ static Expected<StringRef> convertSPIRVToIR(StringRef Filename,
 static bool considerOnlyKernelsAsEntryPoints(const ArgList &Args,
                                              const llvm::Triple Triple) {
   const llvm::Triple HostTriple(Args.getLastArgValue(OPT_host_triple_EQ));
-  bool SYCLNativeCPU = (HostTriple == Triple);
   // On Intel targets we don't need non-kernel functions as entry points,
   // because it only increases amount of code for device compiler to handle,
   // without any actual benefits.
   // TODO: Try to extend this feature for non-Intel GPUs.
-  return (!Args.hasFlag(OPT_no_sycl_remove_unused_external_funcs,
-                        OPT_sycl_remove_unused_external_funcs, false) &&
-          !SYCLNativeCPU) &&
-         !Triple.isNVPTX() && !Triple.isAMDGPU();
+  return !Args.hasFlag(OPT_no_sycl_remove_unused_external_funcs,
+                       OPT_sycl_remove_unused_external_funcs, false) &&
+         Triple.isSPIROrSPIRV();
 }
 
 bool isSYCLThinLTO(const ArgList &Args, const llvm::Triple Triple) {
-  // TODO: Support CUDA/HIP
+  // TODO: Support AMDGPU/NVPTX targets
   return Triple.isSPIROrSPIRV() && Args.hasArg(OPT_sycl_thin_lto);
 }
 
@@ -747,10 +744,6 @@ getTripleBasedSYCLPostLinkOpts(const ArgList &Args,
   if (NoSplit && (Triple.getSubArch() != llvm::Triple::SPIRSubArch_fpga))
     PostLinkArgs.push_back("-split=auto");
 
-  // On Intel targets we don't need non-kernel functions as entry points,
-  // because it only increases amount of code for device compiler to handle,
-  // without any actual benefits.
-  // TODO: Try to extend this feature for non-Intel GPUs.
   if (considerOnlyKernelsAsEntryPoints(Args, Triple))
     PostLinkArgs.push_back("-emit-only-kernels-as-entry-points");
 
@@ -1917,8 +1910,8 @@ std::unique_ptr<lto::LTO> createLTO(
                              .str();
   auto PreCodeGenSaveTemps = [=](size_t Task, const Module &M) {
     std::string File =
-        !Task ? TempName + ".postopt.bc"
-              : TempName + "." + std::to_string(Task) + ".postopt.bc";
+        !Task ? TempName + ".precodegen.bc"
+              : TempName + "." + std::to_string(Task) + ".precodegen.bc";
     error_code EC;
     raw_fd_ostream LinkedBitcode(File, EC, sys::fs::OF_None);
     if (EC)