From 817a0c9b4471f92d1ec4f94930447f31fd163788 Mon Sep 17 00:00:00 2001 From: "Sarnie, Nick" Date: Mon, 15 Jul 2024 12:58:31 -0700 Subject: [PATCH 1/9] weak Signed-off-by: Sarnie, Nick --- libdevice/device.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libdevice/device.h b/libdevice/device.h index 360af54f9b4c4..e2f0da6e3c082 100644 --- a/libdevice/device.h +++ b/libdevice/device.h @@ -17,9 +17,9 @@ #if defined(__SPIR__) || defined(__SPIRV__) || defined(__NVPTX__) #ifdef __SYCL_DEVICE_ONLY__ -#define DEVICE_EXTERNAL SYCL_EXTERNAL __attribute__((weak)) +#define DEVICE_EXTERNAL SYCL_EXTERNAL #else // __SYCL_DEVICE_ONLY__ -#define DEVICE_EXTERNAL __attribute__((weak)) +#define DEVICE_EXTERNAL #endif // __SYCL_DEVICE_ONLY__ #define DEVICE_EXTERN_C DEVICE_EXTERNAL EXTERN_C From 70ee0e04c960c06fc69323f7597232700cb34957 Mon Sep 17 00:00:00 2001 From: "Sarnie, Nick" Date: Mon, 15 Jul 2024 13:23:09 -0700 Subject: [PATCH 2/9] thin Signed-off-by: Sarnie, Nick --- clang/lib/Driver/ToolChains/Clang.cpp | 6 +- .../tools/clang-linker-wrapper/CMakeLists.txt | 1 + .../ClangLinkerWrapper.cpp | 510 ++++++++++++++---- llvm/include/llvm/Object/OffloadBinary.h | 11 + .../include/llvm/SYCLLowerIR/ModuleSplitter.h | 1 + llvm/lib/LTO/LTO.cpp | 4 +- llvm/lib/SYCLLowerIR/ModuleSplitter.cpp | 52 +- 7 files changed, 448 insertions(+), 137 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 8df597de8f5ff..216b216ba797b 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -11231,8 +11231,12 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, bool IsUsingLTO = D.isUsingLTO(/*IsDeviceOffloadAction=*/true); auto LTOMode = D.getLTOMode(/*IsDeviceOffloadAction=*/true); - if (IsUsingLTO && LTOMode == LTOK_Thin) + if (IsUsingLTO && LTOMode == LTOK_Thin) { CmdArgs.push_back(Args.MakeArgString("-sycl-thin-lto")); + // TODO: Pass the same value for this argument once we start using it + // for non-thinLTO. + CmdArgs.push_back(Args.MakeArgString("-sycl-module-split-mode=auto")); + } if (Args.hasArg(options::OPT_fsycl_embed_ir)) CmdArgs.push_back(Args.MakeArgString("-sycl-embed-ir")); diff --git a/clang/tools/clang-linker-wrapper/CMakeLists.txt b/clang/tools/clang-linker-wrapper/CMakeLists.txt index 9dc1f244f2802..8ad197d85d535 100644 --- a/clang/tools/clang-linker-wrapper/CMakeLists.txt +++ b/clang/tools/clang-linker-wrapper/CMakeLists.txt @@ -1,5 +1,6 @@ set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} + BitReader BitWriter Core BinaryFormat diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index e7294a314bc8a..7b38cebe77eef 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -42,6 +42,7 @@ #include "llvm/Option/Option.h" #include "llvm/Passes/PassPlugin.h" #include "llvm/Remarks/HotnessThresholdParser.h" +#include "llvm/SYCLLowerIR/ComputeModuleRuntimeInfo.h" #include "llvm/SYCLLowerIR/ModuleSplitter.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Errc.h" @@ -590,6 +591,58 @@ static Error getSYCLDeviceLibs(SmallVector &DeviceLibFiles, return Error::success(); } +static Error getDeviceLibsForLTO(SmallVector &DeviceLibs, + const ArgList &Args, + const llvm::Triple Triple) { + // TODO: Fix copy paste + SmallVector DeviceLibFiles; + if (Error Err = sycl::getSYCLDeviceLibs(DeviceLibFiles, Args)) + return Err; + + auto processFile = [&](StringRef File) { + auto BufferOrErr = MemoryBuffer::getFile(File); + if (!BufferOrErr) + return createFileError(File, BufferOrErr.getError()); + auto Buffer = std::move(*BufferOrErr); + SmallVector Candidates; + if (Error Err = + extractOffloadBinaries(Buffer->getMemBufferRef(), Candidates)) + return Err; + for (OffloadFile &OffF : Candidates) + if (llvm::Triple(OffF.getBinary()->getTriple()) == Triple) + DeviceLibs.emplace_back(std::move(OffF)); + return Error(Error::success()); + }; + + for (auto &File : DeviceLibFiles) { + + if (Error Err = processFile(File)) + return Err; + } + + // For NVPTX backend we need to also link libclc and CUDA libdevice. + if (Triple.isNVPTX()) { + if (Arg *A = Args.getLastArg(OPT_sycl_nvptx_device_lib_EQ)) { + if (A->getValues().size() == 0) + return createStringError( + inconvertibleErrorCode(), + "Number of device library files cannot be zero."); + for (StringRef Val : A->getValues()) { + SmallString<128> LibName(Val); + if (llvm::sys::fs::exists(LibName)) { + if (auto Err = processFile(LibName)) + return Err; + } else + return createStringError( + inconvertibleErrorCode(), + std::string(LibName) + + " SYCL device library file for NVPTX is not found."); + } + } + } + return Error::success(); +} + /// This routine is used to convert SPIR-V input files into LLVM IR files. /// 'llvm-spirv -r' command is used for this purpose. /// If input is not a SPIR-V file, then the original file is returned. @@ -625,6 +678,25 @@ static Expected convertSPIRVToIR(StringRef Filename, return *TempFileOrErr; } +static bool considerOnlyKernelsAsEntryPoints(const ArgList &Args, + const llvm::Triple Triple) { + const llvm::Triple HostTriple(Args.getLastArgValue(OPT_host_triple_EQ)); + bool SYCLNativeCPU = (HostTriple == Triple); + // On Intel targets we don't need non-kernel functions as entry points, + // because it only increases amount of code for device compiler to handle, + // without any actual benefits. + // TODO: Try to extend this feature for non-Intel GPUs. + return (!Args.hasFlag(OPT_no_sycl_remove_unused_external_funcs, + OPT_sycl_remove_unused_external_funcs, false) && + !SYCLNativeCPU) && + !Triple.isNVPTX() && !Triple.isAMDGPU(); +} + +bool isSYCLThinLTO(const ArgList &Args, const llvm::Triple Triple) { + // TODO: Support CUDA/HIP + return Triple.isSPIROrSPIRV() && Args.hasArg(OPT_sycl_thin_lto); +} + /// Add any sycl-post-link options that rely on a specific Triple in addition /// to user supplied options. /// NOTE: Any changes made here should be reflected in the similarly named @@ -661,10 +733,7 @@ getTripleBasedSYCLPostLinkOpts(const ArgList &Args, // because it only increases amount of code for device compiler to handle, // without any actual benefits. // TODO: Try to extend this feature for non-Intel GPUs. - if ((!Args.hasFlag(OPT_no_sycl_remove_unused_external_funcs, - OPT_sycl_remove_unused_external_funcs, false) && - !SYCLNativeCPU) && - !Triple.isNVPTX() && !Triple.isAMDGPU()) + if (considerOnlyKernelsAsEntryPoints(Args, Triple)) PostLinkArgs.push_back("-emit-only-kernels-as-entry-points"); if (!Triple.isAMDGCN()) @@ -677,7 +746,7 @@ getTripleBasedSYCLPostLinkOpts(const ArgList &Args, bool SplitEsimd = Args.hasFlag(OPT_sycl_device_code_split_esimd, OPT_no_sycl_device_code_split_esimd, SplitEsimdByDefault); - if (!Args.hasArg(OPT_sycl_thin_lto)) + if (!isSYCLThinLTO(Args, Triple)) PostLinkArgs.push_back("-symbols"); // Specialization constant info generation is mandatory - // add options unconditionally @@ -881,27 +950,33 @@ getTripleBasedSPIRVTransOpts(const ArgList &Args, TranslatorArgs.push_back(Args.MakeArgString(ExtArg)); } +void computeLLVMToSPIRVTranslationToolArgs(const ArgList &Args, + SmallVector &CmdArgs) { + const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); + getTripleBasedSPIRVTransOpts(Args, CmdArgs, Triple); + StringRef LLVMToSPIRVOptions; + if (Arg *A = Args.getLastArg(OPT_llvm_spirv_options_EQ)) + LLVMToSPIRVOptions = A->getValue(); + LLVMToSPIRVOptions.split(CmdArgs, " ", /* MaxSplit = */ -1, + /* KeepEmpty = */ false); +} + /// Run LLVM to SPIR-V translation. /// Converts 'File' from LLVM bitcode to SPIR-V format using llvm-spirv tool. /// 'Args' encompasses all arguments required for linking and wrapping device /// code and will be parsed to generate options required to be passed into the /// llvm-spirv tool. -static Expected runLLVMToSPIRVTranslation(StringRef File, - const ArgList &Args) { + +static Expected +runLLVMToSPIRVTranslation(StringRef File, + SmallVectorImpl &&CmdArgs) { Expected LLVMToSPIRVPath = findProgram("llvm-spirv", {getMainExecutable("llvm-spirv")}); if (!LLVMToSPIRVPath) return LLVMToSPIRVPath.takeError(); - SmallVector CmdArgs; - CmdArgs.push_back(*LLVMToSPIRVPath); - const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); - getTripleBasedSPIRVTransOpts(Args, CmdArgs, Triple); - StringRef LLVMToSPIRVOptions; - if (Arg *A = Args.getLastArg(OPT_llvm_spirv_options_EQ)) - LLVMToSPIRVOptions = A->getValue(); - LLVMToSPIRVOptions.split(CmdArgs, " ", /* MaxSplit = */ -1, - /* KeepEmpty = */ false); + CmdArgs.insert(CmdArgs.begin(), (*LLVMToSPIRVPath)); + CmdArgs.push_back("-o"); // Create a new file to write the translated file to. @@ -941,6 +1016,13 @@ static Expected runLLVMToSPIRVTranslation(StringRef File, return *TempFileOrErr; } +static Expected runLLVMToSPIRVTranslation(StringRef File, + const ArgList &Args) { + SmallVector ToolArgs; + computeLLVMToSPIRVTranslationToolArgs(Args, ToolArgs); + return runLLVMToSPIRVTranslation(File, std::move(ToolArgs)); +} + /// Adds all AOT backend options required for SYCL AOT compilation step to /// 'CmdArgs'. /// 'Args' encompasses all arguments required for linking and wrapping device @@ -1350,6 +1432,44 @@ static Expected linkDevice(ArrayRef InputFiles, return *DeviceLinkedFile; } +llvm::sycl::GlobalBinImageProps +computeGlobalBinProps(const ArgList &Args, const llvm::Triple Triple) { + auto findParam = [](const SmallVectorImpl &Vec, StringRef Param) { + for (auto El : Vec) + if (Param == El) + return true; + return false; + }; + SmallVector CmdArgs; + getTripleBasedSYCLPostLinkOpts(Args, CmdArgs, Triple); + bool EmitKernelParamInfo = findParam(CmdArgs, "-emit-param-info"); + bool EmitProgramMetadata = findParam(CmdArgs, "-emit-program-metadata"); + bool EmitExportedSymbols = findParam(CmdArgs, "-emit-exported-symbols"); + bool EmitImportedSymbols = findParam(CmdArgs, "-emit-imported-symbols"); + // DeviceGlobals is not triple-based, so it will be present in Args. + bool DeviceGlobals = false; + if (Arg *A = Args.getLastArg(OPT_sycl_post_link_options_EQ)) + DeviceGlobals = StringRef(A->getValue()).contains("-device-globals"); + + return {EmitKernelParamInfo, EmitProgramMetadata, EmitExportedSymbols, + EmitImportedSymbols, DeviceGlobals}; +} + +Error validateThinLTOModule(BitcodeModule &M, const ArgList &Args) { + Expected LTOInfo = M.getLTOInfo(); + if (!LTOInfo || !(*LTOInfo).IsThinLTO) + return createStringError( + "All code must be compiled with -foffload-lto=thin"); + + // For O0 we don't run function importing so it defeats + // the whole point of thinLTO. Maybe we could lift this + // restriction by enabling only required passes for importing for O0. + if (Args.getLastArgValue(OPT_opt_level, "") == "O0") + return createStringError("O0 is not supported"); + + return Error::success(); +} + } // namespace sycl namespace generic { @@ -1566,6 +1686,8 @@ std::vector getTargetFeatures(ArrayRef InputFiles) { template > std::unique_ptr createLTO( const ArgList &Args, const std::vector &Features, + SmallVectorImpl &BitcodeInputFiles, + std::vector ModulesToCompile = {}, ModuleHook Hook = [](size_t, const Module &) { return true; }) { const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); // We need to remove AMD's target-id from the processor if present. @@ -1595,6 +1717,14 @@ std::unique_ptr createLTO( Conf.OptLevel = OptLevel[1] - '0'; Conf.DefaultTriple = Triple.getTriple(); + // We need to set up the backend to use thinLTO + // even if we don't actually use it, and there is no + // backend for the spir64 triple, so override it to + // the SPIR-V backlend. + // TODO: Remove once SYCL uses the SPIR-V backend. + if (sycl::isSYCLThinLTO(Args, Triple)) + Conf.OverrideTriple = "spirv64-unknown-unknown"; + // TODO: Should we complain about combining --opt-level and -passes, as opt // does? That might be too limiting in clang-linker-wrapper, so for now we // just warn in the help entry for -passes that the default corresponding @@ -1611,11 +1741,21 @@ std::unique_ptr createLTO( Conf.PTO.LoopVectorization = Conf.OptLevel > 1; Conf.PTO.SLPVectorization = Conf.OptLevel > 1; - + std::string TempName = (sys::path::filename(ExecutableName) + "." + + Triple.getTriple() + "." + Arch) + .str(); + auto PreCodeGenSaveTemps = [=](size_t Task, const Module &M) { + std::string File = + !Task ? TempName + ".postopt.bc" + : TempName + "." + std::to_string(Task) + ".postopt.bc"; + error_code EC; + raw_fd_ostream LinkedBitcode(File, EC, sys::fs::OF_None); + if (EC) + reportError(errorCodeToError(EC)); + WriteBitcodeToFile(M, LinkedBitcode); + return true; + }; if (SaveTemps) { - std::string TempName = (sys::path::filename(ExecutableName) + "." + - Triple.getTriple() + "." + TargetID) - .str(); Conf.PostInternalizeModuleHook = [=](size_t Task, const Module &M) { std::string File = !Task ? TempName + ".postlink.bc" @@ -1627,17 +1767,7 @@ std::unique_ptr createLTO( WriteBitcodeToFile(M, LinkedBitcode); return true; }; - Conf.PreCodeGenModuleHook = [=](size_t Task, const Module &M) { - std::string File = - !Task ? TempName + ".postopt.bc" - : TempName + "." + std::to_string(Task) + ".postopt.bc"; - error_code EC; - raw_fd_ostream LinkedBitcode(File, EC, sys::fs::OF_None); - if (EC) - reportError(errorCodeToError(EC)); - WriteBitcodeToFile(M, LinkedBitcode); - return true; - }; + Conf.PreCodeGenModuleHook = PreCodeGenSaveTemps; } Conf.PostOptModuleHook = Hook; Conf.CGFileType = (Triple.isNVPTX() || SaveTemps) @@ -1646,6 +1776,97 @@ std::unique_ptr createLTO( // TODO: Handle remark files Conf.HasWholeProgramVisibility = Args.hasArg(OPT_whole_program); + if (sycl::isSYCLThinLTO(Args, Triple)) { + // Passing Args to each thinLTO thread causes crashes, so compute everything + // we can here. + const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); + bool OnlyKernelsAsEntryPoints = + sycl::considerOnlyKernelsAsEntryPoints(Args, Triple); + auto GlobalBinProps = sycl::computeGlobalBinProps(Args, Triple); + SmallVector SPIRVArgs; + sycl::computeLLVMToSPIRVTranslationToolArgs(Args, SPIRVArgs); + Conf.PreCodeGenModuleHook = [=, &BitcodeInputFiles](unsigned Task, + const Module &M) { + // This is the main part of SYCL LTO handling. + // Here we process the IR from each BC file, compute module + // properties and the module symbol table, convert to SPV (using the + // translator for now) and save required information for binary created + // inside the OffloadFile. + + assert(Task != 0 && "Unexpected task"); + auto &OffloadF = BitcodeInputFiles[Task - 1]; + if (OffloadF.getBinary()->getOffloadKind() != OFK_SYCL) { + if (SaveTemps) + PreCodeGenSaveTemps(Task, M); + return true; + } + + llvm::sycl::EntryPointSet EntryPoints; + + for (const Function &F : M.functions()) { + if (llvm::module_split::isEntryPoint(F, OnlyKernelsAsEntryPoints)) + EntryPoints.insert(const_cast(&F)); + } + // No entry points, don't proceed + if (EntryPoints.empty()) + return false; + + if (SaveTemps) + PreCodeGenSaveTemps(Task, M); + + // TODO: Handle spec constants. + + // TODO: Handle internalization of non-entry-points, we don't do it during + // early split anymore. + // One problem is that the modules are pased in as `const Module&`, and + // ideally we want to delete non-entry point functions, but const-casting + // and modifying the module seems from here seems wrong. + + auto ModuleProps = llvm::sycl::computeModuleProperties( + M, EntryPoints, GlobalBinProps, + /*SpecConstsMet=*/false, /*SpecConstsMet=*/false); + std::string ModulePropsStr; + raw_string_ostream SCOut(ModulePropsStr); + ModuleProps.write(SCOut); + std::string ModuleSyms = + llvm::sycl::computeModuleSymbolTable(M, EntryPoints); + // This part is the hackiest part of this change. However, this code is + // run on multiple threads, so the data structures we can use are more + // limited. We can't use StringRef because we would need a StringSaver to + // keep the values around, but StringSaver is not thread safe. + OffloadF.getBinary()->addTmpString(ModulePropsStr); + OffloadF.getBinary()->addTmpString(ModuleSyms); + // TODO: Use SPIR-V backend instead of SPIR-V translator once the backend + // is mature. + auto IRFile = createOutputFile(sys::path::filename(ExecutableName) + "." + + std::to_string(Task) + ".to.spv", + "spv"); + if (!IRFile) + reportError(IRFile.takeError()); + error_code EC; + raw_fd_ostream LinkedBitcode(*IRFile, EC, sys::fs::OF_None); + if (EC) + reportError(errorCodeToError(EC)); + WriteBitcodeToFile(M, LinkedBitcode); + LinkedBitcode.close(); + // We need this copy to prevent data corruption of the arguments when + // calling llvm-spirv. Probably some multithreading thing, I didn't deeply + // investigate it yet. + SmallVector SPIRVArgsCopy = SPIRVArgs; + auto SPVFile = + sycl::runLLVMToSPIRVTranslation(*IRFile, std::move(SPIRVArgsCopy)); + if (!SPVFile) + reportError(SPVFile.takeError()); + OffloadF.getBinary()->addTmpString((*SPVFile).str()); + // Return false so the thinLTO backend doesn't continue to process this + // module. We already emitted SPIR-V ourselves, so we don't need to do + // anything else. Once the SPIR-V backend is ready, we can remove the + // manual SPIR-V translator call and return true here. + return false; + }; + // Only compile user modules to SPV, not device libraries. + Conf.ThinLTOModulesToCompile = ModulesToCompile; + } return std::make_unique(std::move(Conf), Backend); } @@ -1660,16 +1881,16 @@ bool isValidCIdentifier(StringRef S) { Error linkBitcodeFiles(SmallVectorImpl &InputFiles, SmallVectorImpl &OutputFiles, + SmallVector &BitcodeInputFiles, const ArgList &Args) { llvm::TimeTraceScope TimeScope("Link bitcode files"); const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); StringRef Arch = Args.getLastArgValue(OPT_arch_EQ); - // Early exit for SPIR targets - if (Triple.isSPIROrSPIRV()) + // Early exit for non-thin-LTO SPIR targets + if (Triple.isSPIROrSPIRV() && !sycl::isSYCLThinLTO(Args, Triple)) return Error::success(); - SmallVector BitcodeInputFiles; DenseSet StrongResolutions; DenseSet UsedInRegularObj; DenseSet UsedInSharedLib; @@ -1732,6 +1953,17 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, // LTO Module hook to output bitcode without running the backend. SmallVector BitcodeOutput; + std::vector ModulesToCompile; + if (sycl::isSYCLThinLTO(Args, Triple)) { + for (const OffloadFile &BitcodeInput : BitcodeInputFiles) { + auto ModuleName = BitcodeInput.getBinary()->getFileName(); + // TODO: This is pretty hacky, maybe we could check some module metadata + // or something. + if (ModuleName.find("libsycl-") == std::string::npos) + ModulesToCompile.push_back(ModuleName.str()); + } + } + auto OutputBitcode = [&](size_t, const Module &M) { auto TempFileOrErr = createOutputFile(sys::path::filename(ExecutableName) + "-jit-" + Triple.getTriple(), @@ -1750,11 +1982,11 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, // We assume visibility of the whole program if every input file was bitcode. auto Features = getTargetFeatures(BitcodeInputFiles); - auto LTOBackend = Args.hasArg(OPT_embed_bitcode) || - Args.hasArg(OPT_builtin_bitcode_EQ) || - Args.hasArg(OPT_clang_backend) - ? createLTO(Args, Features, OutputBitcode) - : createLTO(Args, Features); + auto LTOBackend = + Args.hasArg(OPT_embed_bitcode) || Args.hasArg(OPT_builtin_bitcode_EQ) || + Args.hasArg(OPT_clang_backend) + ? createLTO(Args, Features, BitcodeInputFiles, {}, OutputBitcode) + : createLTO(Args, Features, BitcodeInputFiles, ModulesToCompile); // We need to resolve the symbols so the LTO backend knows which symbols need // to be kept or can be internalized. This is a simplified symbol resolution @@ -1773,6 +2005,17 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, if (!BitcodeFileOrErr) return BitcodeFileOrErr.takeError(); + if (sycl::isSYCLThinLTO(Args, Triple)) { + // Error if any module was not compiled with thinLTO. Other platforms + // can fall back to binary linking if thinLTO fails, but we don't have + // that for SPIR-V (besides spirv-link). In the future we may be able to + // fall back to normal SYCL processing and throw a warning instead of a + // fatal error. + if (auto Err = sycl::validateThinLTOModule( + (*BitcodeFileOrErr)->getSingleBitcodeModule(), Args)) + return Err; + } + // Save the input file and the buffer associated with its memory. const auto Symbols = (*BitcodeFileOrErr)->symbols(); SmallVector Resolutions(Symbols.size()); @@ -2175,76 +2418,125 @@ Expected> linkAndWrapDeviceFiles( } if (HasSYCLOffloadKind) { SmallVector InputFiles; - // Write device inputs to an output file for the linker. - for (const OffloadFile &File : Input) { - auto FileNameOrErr = writeOffloadFile(File); - if (!FileNameOrErr) - return FileNameOrErr.takeError(); - InputFiles.emplace_back(*FileNameOrErr); + SmallVector BitcodeInputFiles; + StringRef TmpOutput; + llvm::Triple Triple(LinkerArgs.getLastArgValue(OPT_triple_EQ)); + if (sycl::isSYCLThinLTO(Args, Triple)) { + // For thinLTO, we consider device libs as normal compiler input + // and add them to the files to be processed by the LTO backend. + // Later we set ModulesToCompile so that we don't + // actually emit code for them, we just link in their functions in + // modules that use them. + if (auto Err = sycl::getDeviceLibsForLTO(Input, LinkerArgs, Triple)) + return Err; + if (auto Err = linkBitcodeFiles(Input, InputFiles, BitcodeInputFiles, + LinkerArgs)) + return Err; + } else { + // Write device inputs to an output file for the linker. + for (const OffloadFile &File : Input) { + auto FileNameOrErr = writeOffloadFile(File); + if (!FileNameOrErr) + return FileNameOrErr.takeError(); + InputFiles.emplace_back(*FileNameOrErr); + } + // Link the input device files using the device linker for SYCL + // offload. + auto TmpOutputOrErr = sycl::linkDevice(InputFiles, LinkerArgs); + if (!TmpOutputOrErr) + return TmpOutputOrErr.takeError(); + TmpOutput = *TmpOutputOrErr; } - // Link the input device files using the device linker for SYCL - // offload. - auto TmpOutputOrErr = sycl::linkDevice(InputFiles, LinkerArgs); - if (!TmpOutputOrErr) - return TmpOutputOrErr.takeError(); SmallVector InputFilesSYCL; - InputFilesSYCL.emplace_back(*TmpOutputOrErr); - auto SplitModulesOrErr = - SYCLModuleSplitMode - ? sycl::runSYCLSplitLibrary(InputFilesSYCL, LinkerArgs, - *SYCLModuleSplitMode) - : sycl::runSYCLPostLinkTool(InputFilesSYCL, LinkerArgs); - if (!SplitModulesOrErr) - return SplitModulesOrErr.takeError(); - - auto &SplitModules = *SplitModulesOrErr; - const llvm::Triple Triple(LinkerArgs.getLastArgValue(OPT_triple_EQ)); - if ((Triple.isNVPTX() || Triple.isAMDGCN()) && - LinkerArgs.hasArg(OPT_sycl_embed_ir)) { - // When compiling for Nvidia/AMD devices and the user requested the - // IR to be embedded in the application (via option), run the output - // of sycl-post-link (filetable referencing LLVM Bitcode + symbols) - // through the offload wrapper and link the resulting object to the - // application. - auto OutputFile = - sycl::runWrapperAndCompile(SplitModules, LinkerArgs, /* IsEmbeddedIR */ true); - if (!OutputFile) - return OutputFile.takeError(); - WrappedOutput.push_back(*OutputFile); - } - for (size_t I = 0, E = SplitModules.size(); I != E; ++I) { - SmallVector Files = {SplitModules[I].ModuleFilePath}; - StringRef Arch = LinkerArgs.getLastArgValue(OPT_arch_EQ); - if (Arch.empty()) - Arch = "native"; - SmallVector, 4> BundlerInputFiles; - auto ClangOutputOrErr = - linkDevice(Files, LinkerArgs, true /* IsSYCLKind */); - if (!ClangOutputOrErr) - return ClangOutputOrErr.takeError(); - if (Triple.isNVPTX()) { - auto VirtualArch = StringRef(clang::OffloadArchToVirtualArchString( - clang::StringToOffloadArch(Arch))); - auto PtxasOutputOrErr = - nvptx::ptxas(*ClangOutputOrErr, LinkerArgs, Arch); - if (!PtxasOutputOrErr) - return PtxasOutputOrErr.takeError(); - BundlerInputFiles.emplace_back(*ClangOutputOrErr, VirtualArch); - BundlerInputFiles.emplace_back(*PtxasOutputOrErr, Arch); - auto BundledFileOrErr = - nvptx::fatbinary(BundlerInputFiles, LinkerArgs); - if (!BundledFileOrErr) - return BundledFileOrErr.takeError(); - SplitModules[I].ModuleFilePath = *BundledFileOrErr; - } else if (Triple.isAMDGCN()) { - BundlerInputFiles.emplace_back(*ClangOutputOrErr, Arch); - auto BundledFileOrErr = - amdgcn::fatbinary(BundlerInputFiles, LinkerArgs); - if (!BundledFileOrErr) - return BundledFileOrErr.takeError(); - SplitModules[I].ModuleFilePath = *BundledFileOrErr; - } else { - SplitModules[I].ModuleFilePath = *ClangOutputOrErr; + std::vector SplitModules; + if (sycl::isSYCLThinLTO(Args, Triple)) { + for (size_t FileIdx = 0; FileIdx < BitcodeInputFiles.size(); + FileIdx++) { + // After we have run the LTO backend, extract the information computed + // in the backend (module props/symbol table/spv file path) and set it + // up to be used by SYCL image creation. + // TODO: Once SYCL image creation is reconsiled with the non-SYCL + // path, we can move all of the thinLTO handling to be more in-line + // with community code. + const OffloadFile &F = BitcodeInputFiles[FileIdx]; + const auto &SYCLInfo = F.getBinary()->getTmpStrings(); + if (SYCLInfo.size() != 3) + continue; + // The hardcoded vector indexes are very hacky, + // but I feel the most controversial part of this hcange is how we + // store the required information for later and it's likely to change + // based on feedback, so I didn't completely design that part yet. + StringRef CodegenPath = SYCLInfo[2]; + assert(!CodegenPath.empty() && "Codegen failed"); + const auto &Props = SYCLInfo[0]; + auto MB = MemoryBuffer::getMemBuffer(Props); + auto PropSetOrErr = llvm::util::PropertySetRegistry::read(MB.get()); + if (!PropSetOrErr) + return PropSetOrErr.takeError(); + llvm::util::PropertySetRegistry Properties = + std::move(**PropSetOrErr); + const auto &Syms = SYCLInfo[1]; + SplitModules.emplace_back(CodegenPath, std::move(Properties), Syms); + } + // We don't need the OffloadFiles anymore, so free them from memory. + BitcodeInputFiles.clear(); + } else { + InputFilesSYCL.emplace_back(TmpOutput); + auto SplitModulesOrErr = + SYCLModuleSplitMode + ? sycl::runSYCLSplitLibrary(InputFilesSYCL, LinkerArgs, + *SYCLModuleSplitMode) + : sycl::runSYCLPostLinkTool(InputFilesSYCL, LinkerArgs); + if (!SplitModulesOrErr) + return SplitModulesOrErr.takeError(); + SplitModules = std::move(*SplitModulesOrErr); + if ((Triple.isNVPTX() || Triple.isAMDGCN()) && + LinkerArgs.hasArg(OPT_sycl_embed_ir)) { + // When compiling for Nvidia/AMD devices and the user requested the + // IR to be embedded in the application (via option), run the output + // of sycl-post-link (filetable referencing LLVM Bitcode + symbols) + // through the offload wrapper and link the resulting object to the + // application. + auto OutputFile = sycl::runWrapperAndCompile(SplitModules, LinkerArgs, + /* IsEmbeddedIR */ true); + if (!OutputFile) + return OutputFile.takeError(); + WrappedOutput.push_back(*OutputFile); + } + for (size_t I = 0, E = SplitModules.size(); I != E; ++I) { + SmallVector Files = {SplitModules[I].ModuleFilePath}; + StringRef Arch = LinkerArgs.getLastArgValue(OPT_arch_EQ); + if (Arch.empty()) + Arch = "native"; + SmallVector, 4> BundlerInputFiles; + auto ClangOutputOrErr = + linkDevice(Files, LinkerArgs, true /* IsSYCLKind */); + if (!ClangOutputOrErr) + return ClangOutputOrErr.takeError(); + if (Triple.isNVPTX()) { + auto VirtualArch = StringRef(clang::OffloadArchToVirtualArchString( + clang::StringToOffloadArch(Arch))); + auto PtxasOutputOrErr = + nvptx::ptxas(*ClangOutputOrErr, LinkerArgs, Arch); + if (!PtxasOutputOrErr) + return PtxasOutputOrErr.takeError(); + BundlerInputFiles.emplace_back(*ClangOutputOrErr, VirtualArch); + BundlerInputFiles.emplace_back(*PtxasOutputOrErr, Arch); + auto BundledFileOrErr = + nvptx::fatbinary(BundlerInputFiles, LinkerArgs); + if (!BundledFileOrErr) + return BundledFileOrErr.takeError(); + SplitModules[I].ModuleFilePath = *BundledFileOrErr; + } else if (Triple.isAMDGCN()) { + BundlerInputFiles.emplace_back(*ClangOutputOrErr, Arch); + auto BundledFileOrErr = + amdgcn::fatbinary(BundlerInputFiles, LinkerArgs); + if (!BundledFileOrErr) + return BundledFileOrErr.takeError(); + SplitModules[I].ModuleFilePath = *BundledFileOrErr; + } else { + SplitModules[I].ModuleFilePath = *ClangOutputOrErr; + } } } // TODO(NOM7): Remove this call and use community flow for bundle/wrap @@ -2263,7 +2555,9 @@ Expected> linkAndWrapDeviceFiles( if (HasNonSYCLOffloadKinds) { // First link and remove all the input files containing bitcode. SmallVector InputFiles; - if (Error Err = linkBitcodeFiles(Input, InputFiles, LinkerArgs)) + SmallVector BitcodeInputFiles; + if (Error Err = linkBitcodeFiles(Input, InputFiles, BitcodeInputFiles, + LinkerArgs)) return Err; // Write any remaining device inputs to an output file for the linker. diff --git a/llvm/include/llvm/Object/OffloadBinary.h b/llvm/include/llvm/Object/OffloadBinary.h index d590110ffa598..3d1b18b7f3418 100644 --- a/llvm/include/llvm/Object/OffloadBinary.h +++ b/llvm/include/llvm/Object/OffloadBinary.h @@ -103,6 +103,14 @@ class OffloadBinary : public Binary { StringRef getString(StringRef Key) const { return StringData.lookup(Key); } + /// XXX: Hack + const SmallVectorImpl &getTmpStrings() const { + return TmpStringData; + } + + /// XXX: Hack + void addTmpString(std::string Value) { TmpStringData.push_back(Value); } + static bool classof(const Binary *V) { return V->isOffloadFile(); } struct Header { @@ -151,6 +159,9 @@ class OffloadBinary : public Binary { const Header *TheHeader; /// Location of the metadata entries within the binary. const Entry *TheEntry; + + /// XXX: Hack + SmallVector TmpStringData; }; /// A class to contain the binary information for a single OffloadBinary that diff --git a/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h index 0da3706ad3626..cb6049910b934 100644 --- a/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h +++ b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h @@ -321,6 +321,7 @@ splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings); bool isESIMDFunction(const Function &F); bool canBeImportedFunction(const Function &F); +bool isEntryPoint(const Function &F, bool EmitOnlyKernelsAsEntryPoints); } // namespace module_split diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index d303f228aa72c..9cdcf2738a6d7 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1077,8 +1077,8 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef Syms, for (const std::string &Name : Conf.ThinLTOModulesToCompile) { if (BM.getModuleIdentifier().contains(Name)) { ThinLTO.ModulesToCompile->insert({BM.getModuleIdentifier(), BM}); - llvm::errs() << "[ThinLTO] Selecting " << BM.getModuleIdentifier() - << " to compile\n"; + LLVM_DEBUG(dbgs() << "[ThinLTO] Selecting " << BM.getModuleIdentifier() + << " to compile\n"); } } } diff --git a/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp index b9eda1376663f..a012d8a68e2f2 100644 --- a/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp +++ b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp @@ -117,32 +117,6 @@ bool isKernel(const Function &F) { F.getCallingConv() == CallingConv::AMDGPU_KERNEL; } -bool isEntryPoint(const Function &F, bool EmitOnlyKernelsAsEntryPoints) { - // Skip declarations, if any: they should not be included into a vector of - // entry points groups or otherwise we will end up with incorrectly generated - // list of symbols. - if (F.isDeclaration()) - return false; - - // Kernels are always considered to be entry points - if (isKernel(F)) - return true; - - if (!EmitOnlyKernelsAsEntryPoints) { - // If not disabled, SYCL_EXTERNAL functions with sycl-module-id attribute - // are also considered as entry points (except __spirv_* and __sycl_* - // functions) - return llvm::sycl::utils::isSYCLExternalFunction(&F) && - !isSpirvSyclBuiltin(F.getName()) && !isESIMDBuiltin(F.getName()) && - !isGenericBuiltin(F.getName()); - } - - // Even if we are emitting only kernels as entry points, virtual functions - // should still be treated as entry points, because they are going to be - // outlined into separate device images and linked in later. - return F.hasFnAttribute("indirectly-callable"); -} - // Represents "dependency" or "use" graph of global objects (functions and // global variables) in a module. It is used during device code split to // understand which global variables and functions (other than entry points) @@ -445,6 +419,32 @@ class ModuleSplitter : public ModuleSplitterBase { namespace llvm { namespace module_split { +bool isEntryPoint(const Function &F, bool EmitOnlyKernelsAsEntryPoints) { + // Skip declarations, if any: they should not be included into a vector of + // entry points groups or otherwise we will end up with incorrectly generated + // list of symbols. + if (F.isDeclaration()) + return false; + + // Kernels are always considered to be entry points + if (isKernel(F)) + return true; + + if (!EmitOnlyKernelsAsEntryPoints) { + // If not disabled, SYCL_EXTERNAL functions with sycl-module-id attribute + // are also considered as entry points (except __spirv_* and __sycl_* + // functions) + return llvm::sycl::utils::isSYCLExternalFunction(&F) && + !isSpirvSyclBuiltin(F.getName()) && !isESIMDBuiltin(F.getName()) && + !isGenericBuiltin(F.getName()); + } + + // Even if we are emitting only kernels as entry points, virtual functions + // should still be treated as entry points, because they are going to be + // outlined into separate device images and linked in later. + return F.hasFnAttribute("indirectly-callable"); +} + std::optional convertStringToSplitMode(StringRef S) { static const StringMap Values = {{"kernel", SPLIT_PER_KERNEL}, {"source", SPLIT_PER_TU}, From 00a28c1276881300306cfe2b880153c4f977d061 Mon Sep 17 00:00:00 2001 From: "Sarnie, Nick" Date: Wed, 18 Sep 2024 08:16:14 -0700 Subject: [PATCH 3/9] add design doc, rework prototype with new design Signed-off-by: Sarnie, Nick --- clang/lib/Driver/ToolChains/Clang.cpp | 6 +- .../ClangLinkerWrapper.cpp | 180 ++++++++++-------- .../SYCLLowerIR/SYCLLinkedModuleProcessor.h | 22 +++ llvm/lib/SYCLLowerIR/CMakeLists.txt | 1 + .../SYCLLowerIR/SYCLLinkedModuleProcessor.cpp | 45 +++++ sycl/doc/design/CompilerAndRuntimeDesign.md | 3 + sycl/doc/design/ThinLTO.md | 147 ++++++++++++++ .../design/images/ThinLTOCommunityFlow.svg | 1 + sycl/doc/design/images/ThinLTOSYCLFlow.svg | 1 + .../images/ThinLTOSYCLSPIRVBackendFlow.svg | 1 + 10 files changed, 326 insertions(+), 81 deletions(-) create mode 100644 llvm/include/llvm/SYCLLowerIR/SYCLLinkedModuleProcessor.h create mode 100644 llvm/lib/SYCLLowerIR/SYCLLinkedModuleProcessor.cpp create mode 100644 sycl/doc/design/ThinLTO.md create mode 100644 sycl/doc/design/images/ThinLTOCommunityFlow.svg create mode 100644 sycl/doc/design/images/ThinLTOSYCLFlow.svg create mode 100644 sycl/doc/design/images/ThinLTOSYCLSPIRVBackendFlow.svg diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 216b216ba797b..8df597de8f5ff 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -11231,12 +11231,8 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, bool IsUsingLTO = D.isUsingLTO(/*IsDeviceOffloadAction=*/true); auto LTOMode = D.getLTOMode(/*IsDeviceOffloadAction=*/true); - if (IsUsingLTO && LTOMode == LTOK_Thin) { + if (IsUsingLTO && LTOMode == LTOK_Thin) CmdArgs.push_back(Args.MakeArgString("-sycl-thin-lto")); - // TODO: Pass the same value for this argument once we start using it - // for non-thinLTO. - CmdArgs.push_back(Args.MakeArgString("-sycl-module-split-mode=auto")); - } if (Args.hasArg(options::OPT_fsycl_embed_ir)) CmdArgs.push_back(Args.MakeArgString("-sycl-embed-ir")); diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 7b38cebe77eef..f799d3cf05d46 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -44,6 +44,7 @@ #include "llvm/Remarks/HotnessThresholdParser.h" #include "llvm/SYCLLowerIR/ComputeModuleRuntimeInfo.h" #include "llvm/SYCLLowerIR/ModuleSplitter.h" +#include "llvm/SYCLLowerIR/SYCLLinkedModuleProcessor.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Errc.h" #include "llvm/Support/FileOutputBuffer.h" @@ -697,6 +698,13 @@ bool isSYCLThinLTO(const ArgList &Args, const llvm::Triple Triple) { return Triple.isSPIROrSPIRV() && Args.hasArg(OPT_sycl_thin_lto); } +bool areSpecConstsSupported(const ArgList &Args, const llvm::Triple Triple) { + const llvm::Triple HostTriple(Args.getLastArgValue(OPT_host_triple_EQ)); + bool SYCLNativeCPU = (HostTriple == Triple); + return (!Triple.isNVPTX() && !Triple.isAMDGCN() && !Triple.isSPIRAOT() && + !SYCLNativeCPU); +} + /// Add any sycl-post-link options that rely on a specific Triple in addition /// to user supplied options. /// NOTE: Any changes made here should be reflected in the similarly named @@ -1687,6 +1695,7 @@ template > std::unique_ptr createLTO( const ArgList &Args, const std::vector &Features, SmallVectorImpl &BitcodeInputFiles, + SmallVectorImpl &Files, std::vector ModulesToCompile = {}, ModuleHook Hook = [](size_t, const Module &) { return true; }) { const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); @@ -1780,18 +1789,16 @@ std::unique_ptr createLTO( // Passing Args to each thinLTO thread causes crashes, so compute everything // we can here. const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); - bool OnlyKernelsAsEntryPoints = - sycl::considerOnlyKernelsAsEntryPoints(Args, Triple); - auto GlobalBinProps = sycl::computeGlobalBinProps(Args, Triple); SmallVector SPIRVArgs; sycl::computeLLVMToSPIRVTranslationToolArgs(Args, SPIRVArgs); - Conf.PreCodeGenModuleHook = [=, &BitcodeInputFiles](unsigned Task, - const Module &M) { - // This is the main part of SYCL LTO handling. - // Here we process the IR from each BC file, compute module - // properties and the module symbol table, convert to SPV (using the - // translator for now) and save required information for binary created - // inside the OffloadFile. + auto SpecConstArg = sycl::areSpecConstsSupported(Args, Triple) + ? SpecConstantsPass::HandlingMode::native + : SpecConstantsPass::HandlingMode::emulation; + Conf.PreCodeGenModuleHook = [=, &BitcodeInputFiles, &Files]( + unsigned Task, const Module &M) mutable { + // Here we process the IR from each BC file, save the module for later + // use, convert to SPV (using the translator for now) and save the path to + // the output file. assert(Task != 0 && "Unexpected task"); auto &OffloadF = BitcodeInputFiles[Task - 1]; @@ -1801,43 +1808,18 @@ std::unique_ptr createLTO( return true; } - llvm::sycl::EntryPointSet EntryPoints; - - for (const Function &F : M.functions()) { - if (llvm::module_split::isEntryPoint(F, OnlyKernelsAsEntryPoints)) - EntryPoints.insert(const_cast(&F)); - } - // No entry points, don't proceed - if (EntryPoints.empty()) - return false; - if (SaveTemps) PreCodeGenSaveTemps(Task, M); - // TODO: Handle spec constants. - - // TODO: Handle internalization of non-entry-points, we don't do it during - // early split anymore. - // One problem is that the modules are pased in as `const Module&`, and - // ideally we want to delete non-entry point functions, but const-casting - // and modifying the module seems from here seems wrong. - - auto ModuleProps = llvm::sycl::computeModuleProperties( - M, EntryPoints, GlobalBinProps, - /*SpecConstsMet=*/false, /*SpecConstsMet=*/false); - std::string ModulePropsStr; - raw_string_ostream SCOut(ModulePropsStr); - ModuleProps.write(SCOut); - std::string ModuleSyms = - llvm::sycl::computeModuleSymbolTable(M, EntryPoints); - // This part is the hackiest part of this change. However, this code is - // run on multiple threads, so the data structures we can use are more - // limited. We can't use StringRef because we would need a StringSaver to - // keep the values around, but StringSaver is not thread safe. - OffloadF.getBinary()->addTmpString(ModulePropsStr); - OffloadF.getBinary()->addTmpString(ModuleSyms); - // TODO: Use SPIR-V backend instead of SPIR-V translator once the backend - // is mature. + // Use the legacy PM because eventually we will use the + // PreCodeGenPassesHook field of LTOConfig which requires the legacy PM. + legacy::PassManager PM; + + // LTO does not continue processing the module after this + // function finishes, so it's safe to modify the module. + PM.add(createSYCLLinkedModuleProcessorPass(SpecConstArg)); + PM.run(const_cast(M)); + auto IRFile = createOutputFile(sys::path::filename(ExecutableName) + "." + std::to_string(Task) + ".to.spv", "spv"); @@ -1849,15 +1831,32 @@ std::unique_ptr createLTO( reportError(errorCodeToError(EC)); WriteBitcodeToFile(M, LinkedBitcode); LinkedBitcode.close(); - // We need this copy to prevent data corruption of the arguments when - // calling llvm-spirv. Probably some multithreading thing, I didn't deeply - // investigate it yet. - SmallVector SPIRVArgsCopy = SPIRVArgs; + { + // Overwrite the fully linked module in BitcodeInputFiles + // so we can compute the module properties and symbol table. + // We need a fully linked module to accurately compute these. + llvm::ErrorOr> ImageOrError = + llvm::MemoryBuffer::getFileOrSTDIN(*IRFile); + assert(ImageOrError); + OffloadingImage Image{}; + Image.TheImageKind = IMG_Bitcode; + Image.TheOffloadKind = OffloadF.getBinary()->getOffloadKind(); + Image.StringData["triple"] = OffloadF.getBinary()->getTriple(); + Image.StringData["arch"] = OffloadF.getBinary()->getArch(); + Image.Image = std::move(*ImageOrError); + + std::unique_ptr Binary = + MemoryBuffer::getMemBufferCopy(OffloadBinary::write(Image)); + auto NewBinaryOrErr = OffloadBinary::create(*Binary); + assert(NewBinaryOrErr); + BitcodeInputFiles[Task - 1] = + OffloadFile(std::move(*NewBinaryOrErr), std::move(Binary)); + } auto SPVFile = - sycl::runLLVMToSPIRVTranslation(*IRFile, std::move(SPIRVArgsCopy)); + sycl::runLLVMToSPIRVTranslation(*IRFile, std::move(SPIRVArgs)); if (!SPVFile) reportError(SPVFile.takeError()); - OffloadF.getBinary()->addTmpString((*SPVFile).str()); + Files[Task] = *SPVFile; // Return false so the thinLTO backend doesn't continue to process this // module. We already emitted SPIR-V ourselves, so we don't need to do // anything else. Once the SPIR-V backend is ready, we can remove the @@ -1981,12 +1980,15 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, }; // We assume visibility of the whole program if every input file was bitcode. + SmallVector Files; auto Features = getTargetFeatures(BitcodeInputFiles); - auto LTOBackend = - Args.hasArg(OPT_embed_bitcode) || Args.hasArg(OPT_builtin_bitcode_EQ) || - Args.hasArg(OPT_clang_backend) - ? createLTO(Args, Features, BitcodeInputFiles, {}, OutputBitcode) - : createLTO(Args, Features, BitcodeInputFiles, ModulesToCompile); + auto LTOBackend = Args.hasArg(OPT_embed_bitcode) || + Args.hasArg(OPT_builtin_bitcode_EQ) || + Args.hasArg(OPT_clang_backend) + ? createLTO(Args, Features, BitcodeInputFiles, Files, + {}, OutputBitcode) + : createLTO(Args, Features, BitcodeInputFiles, Files, + ModulesToCompile); // We need to resolve the symbols so the LTO backend knows which symbols need // to be kept or can be internalized. This is a simplified symbol resolution @@ -2067,7 +2069,7 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, // Run the LTO job to compile the bitcode. size_t MaxTasks = LTOBackend->getMaxTasks(); - SmallVector Files(MaxTasks); + Files.resize(MaxTasks); auto AddStream = [&](size_t Task, const Twine &ModuleName) -> std::unique_ptr { @@ -2450,33 +2452,59 @@ Expected> linkAndWrapDeviceFiles( SmallVector InputFilesSYCL; std::vector SplitModules; if (sycl::isSYCLThinLTO(Args, Triple)) { + size_t LastSPVFilePath = 0; for (size_t FileIdx = 0; FileIdx < BitcodeInputFiles.size(); FileIdx++) { - // After we have run the LTO backend, extract the information computed - // in the backend (module props/symbol table/spv file path) and set it - // up to be used by SYCL image creation. + // After we have run the LTO backend, compute module props/symbol + // table/spv file path and set it up to be used by SYCL image + // creation. // TODO: Once SYCL image creation is reconsiled with the non-SYCL // path, we can move all of the thinLTO handling to be more in-line // with community code. + + // This is a bit hacky but not every BitcodeInputFile will end up as a + // SPV file in InputFiles, for example if it is a device library file. + // If the file name is empty, that means we didn't generate SPV for + // it, so just find the next non-empty file name. Should be easy to + // clean this up later if we go with this overall design. const OffloadFile &F = BitcodeInputFiles[FileIdx]; - const auto &SYCLInfo = F.getBinary()->getTmpStrings(); - if (SYCLInfo.size() != 3) + StringRef CodegenPath; + for (size_t OutputNum = LastSPVFilePath; + OutputNum < InputFiles.size(); OutputNum++) { + auto SPVFilePath = InputFiles[OutputNum]; + if (!SPVFilePath.empty()) { + LastSPVFilePath = OutputNum + 1; + CodegenPath = SPVFilePath; + break; + } + } + if (CodegenPath.empty()) continue; - // The hardcoded vector indexes are very hacky, - // but I feel the most controversial part of this hcange is how we - // store the required information for later and it's likely to change - // based on feedback, so I didn't completely design that part yet. - StringRef CodegenPath = SYCLInfo[2]; - assert(!CodegenPath.empty() && "Codegen failed"); - const auto &Props = SYCLInfo[0]; - auto MB = MemoryBuffer::getMemBuffer(Props); - auto PropSetOrErr = llvm::util::PropertySetRegistry::read(MB.get()); - if (!PropSetOrErr) - return PropSetOrErr.takeError(); - llvm::util::PropertySetRegistry Properties = - std::move(**PropSetOrErr); - const auto &Syms = SYCLInfo[1]; - SplitModules.emplace_back(CodegenPath, std::move(Properties), Syms); + LLVMContext Context; + auto Buf = MemoryBuffer::getMemBuffer(F.getBinary()->getImage()); + auto ModOrErr = parseBitcodeFile(*Buf, Context); + if (!ModOrErr) + return ModOrErr.takeError(); + auto &M = **ModOrErr; + + llvm::sycl::EntryPointSet EntryPoints; + bool OnlyKernelsAsEntryPoints = + sycl::considerOnlyKernelsAsEntryPoints(Args, Triple); + auto GlobalBinProps = sycl::computeGlobalBinProps(Args, Triple); + for (const Function &F : M.functions()) { + if (llvm::module_split::isEntryPoint(F, OnlyKernelsAsEntryPoints)) + EntryPoints.insert(const_cast(&F)); + } + if (EntryPoints.empty()) + continue; + + auto Properties = llvm::sycl::computeModuleProperties( + M, EntryPoints, GlobalBinProps, true, true); + + std::string ModuleSyms = + llvm::sycl::computeModuleSymbolTable(M, EntryPoints); + SplitModules.emplace_back(CodegenPath, std::move(Properties), + ModuleSyms); } // We don't need the OffloadFiles anymore, so free them from memory. BitcodeInputFiles.clear(); diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLLinkedModuleProcessor.h b/llvm/include/llvm/SYCLLowerIR/SYCLLinkedModuleProcessor.h new file mode 100644 index 0000000000000..171992dfb2586 --- /dev/null +++ b/llvm/include/llvm/SYCLLowerIR/SYCLLinkedModuleProcessor.h @@ -0,0 +1,22 @@ +//===-- SYCLLinkedModuleProcessor.h - finalize a fully linked module ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The file contains a number of functions to create a pass that can be called +// by the LTO backend that will finalize a fully-linked module. +//===----------------------------------------------------------------------===// +#pragma once +#include "SpecConstants.h" +namespace llvm { + +class PassRegistry; +class ModulePass; +ModulePass * + createSYCLLinkedModuleProcessorPass(llvm::SpecConstantsPass::HandlingMode); +void initializeSYCLLinkedModuleProcessorPass(PassRegistry &); + +} // namespace llvm diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt index 3eb84ba9864cb..9e7543d898a2a 100644 --- a/llvm/lib/SYCLLowerIR/CMakeLists.txt +++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt @@ -64,6 +64,7 @@ add_llvm_component_library(LLVMSYCLLowerIR SYCLDeviceLibReqMask.cpp SYCLDeviceRequirements.cpp SYCLKernelParamOptInfo.cpp + SYCLLinkedModuleProcessor.cpp SYCLPropagateAspectsUsage.cpp SYCLPropagateJointMatrixUsage.cpp SYCLVirtualFunctionsAnalysis.cpp diff --git a/llvm/lib/SYCLLowerIR/SYCLLinkedModuleProcessor.cpp b/llvm/lib/SYCLLowerIR/SYCLLinkedModuleProcessor.cpp new file mode 100644 index 0000000000000..672d49d6ad161 --- /dev/null +++ b/llvm/lib/SYCLLowerIR/SYCLLinkedModuleProcessor.cpp @@ -0,0 +1,45 @@ +//===-- SYCLLinkedModuleProcessor.cpp - finalize a fully linked module ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// See comments in the header. +//===----------------------------------------------------------------------===// + +#include "llvm/SYCLLowerIR/SYCLLinkedModuleProcessor.h" + +#include "llvm/Pass.h" + +#define DEBUG_TYPE "sycl-linked-module-processor" +using namespace llvm; + +namespace { +class SYCLLinkedModuleProcessor : public ModulePass { +public: + static char ID; + SYCLLinkedModuleProcessor(SpecConstantsPass::HandlingMode Mode) + : ModulePass(ID), Mode(Mode) { + initializeSYCLLinkedModuleProcessorPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + // TODO: determine if we need to run other passes + ModuleAnalysisManager MAM; + SpecConstantsPass SCP(Mode); + auto PA = SCP.run(M, MAM); + return !PA.areAllPreserved(); + } + +private: + SpecConstantsPass::HandlingMode Mode; +}; +} // namespace +char SYCLLinkedModuleProcessor::ID = 0; +INITIALIZE_PASS(SYCLLinkedModuleProcessor, "SYCLLinkedModuleProcessor", + "Finalize a fully linked SYCL module", false, false) +ModulePass *llvm::createSYCLLinkedModuleProcessorPass( + SpecConstantsPass::HandlingMode Mode) { + return new SYCLLinkedModuleProcessor(Mode); +} diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md index 52ae88a2c0ef1..954f66109c390 100644 --- a/sycl/doc/design/CompilerAndRuntimeDesign.md +++ b/sycl/doc/design/CompilerAndRuntimeDesign.md @@ -550,6 +550,9 @@ unit) - `off` - disables device code split. If `-fno-sycl-rdc` is specified, the behavior is the same as `per_source` +If ThinLTO is enabled, device code splitting is run during the compilation stage. +See [here](ThinLTO.md) for more information. + ##### Symbol table generation TBD diff --git a/sycl/doc/design/ThinLTO.md b/sycl/doc/design/ThinLTO.md new file mode 100644 index 0000000000000..3ffb7819bef68 --- /dev/null +++ b/sycl/doc/design/ThinLTO.md @@ -0,0 +1,147 @@ +# ThinLTO for SYCL + +This document describes the purpose and design of ThinLTO for SYCL. + +**NOTE**: This is not the final version. The document is still in progress. + +## Background + +With traditional SYCL device code linking, all user code is linked together +along with device libraries into a single huge module and then split and +processed by `sycl-post-link`. This requires sequential processing, has a large +memory footprint, and differs from the linking flow for AMD and NVIDIA devices. + +## Summary +SYCL ThinLTO will hook into the existing community mechanism to run LTO as part +of device linking inside `clang-linker-wrapper`. We split the device images +early at compilation time, and at link time we use ThinLTO's function importing +feature +to bring in the defintions for referenced functions. Only the new offload model +is supported. + +## Device code compilation time changes +Most of the changes for ThinLTO occur during device link time, however there is +one major change during compilation (-c) time: we now run device code split +during compilaton instead of linking. +The main reason for doing this is increased parallelization. Many compilation +jobs can be run at the same time, but linking happens once total for the +application. Device code split is currently a common source of performance +issues. + +Splitting early means that the resulting IR after splitting is not complete, it +still may contain calls to functions (user code and/or the SYCL device +libraries) from other object files. + +We rely on the assumption that all function defintions matching a declaration +will be the same and we can let ThinLTO pull in any one. + +For example, let's start with user device code that defines a `SYCL_EXTERNAL` +function `foo` in translation unit `tu_foo`. There is also another translation +unit `tu_bar` that references `foo`. +During the early device code splitting run of `tu_foo`, we may find that more +than one of the resultant device images contain a defintion for `foo`. + +We assert that any function defintion for `foo` that is deemed a match by the +ThinLTO infrastruction during the processing of `tu_bar` is valid. + +As a result of running early device code split, the fat object file generated +as part of device compilation may contain multiple device code images. + +# Device code link time changes + +Before we go into the link time changes for SYCL, let's understand the device +linking flow for community devices (AMD/NVIDIA): + +![Community linking flow](images/ThinLTOCommunityFlow.svg) + +SYCL has two differenting requirements: +1) The SPIR-V backend is not production ready and the SPIR-V translator is used. +2) The SYCL runtime requires metadata (module properties and module symbol +table) computed from device images that will be stored along the device images +in the fat executable. + +The effect of requirement 1) is that instead of letting ThinLTO call the SPIR-V +backend, we add a callback that runs right before codegen would run. +In that callback, we call the SPIR-V translator and store the resultant file +path for use later, and we instruct the ThinLTO framework to not +perform codegen. + +An interesting additional fact about requirement 2) is that we actually need to +process fully linked module to accurate compute the module properties. One +example where we need the full module is to [compute the required devicelib mask](https://github.com/intel/llvm/blob/sycl/llvm/lib/SYCLLowerIR/SYCLDeviceLibReqMask.cpp). +If we only process the device code that was included in the +original fat object input to `clang-linker-wrapper`, we will miss devicelib +calls in referenced `SYCL_EXTERNAL` functions. + +The effect of requirement 2) is that we store the fully linked device image for +metadata computation in the SYCL-specific handing code after the ThinLTO +framework has completed. Another option would be to try to compute the metadata +inside the ThinLTO framework callbacks, but this would require SYCL-specific +arguments to many caller functions in the stack and pollute community code. + +Here is the current ThinLTO flow for SYCL: + +![SYCL linking flow](images/ThinLTOSYCLFlow.svg) + +We add a `PreCodeGenModuleHook` function to the `LTOConfig` object so that we +can process the fully linked module without running the backend. + +However, the flow is not ideal for many reasons: +1) We are relying on the external `llvm-spirv` tool instead of the SPIR-V +backend. We could slightly improve this issue by using a library call to the +SPIR-V translator instead of the tool, however the library API requires setting +up an object to represent the arguments while we only have strings, and it's +non-trivial to parse the trings to figure out how to create the argument +object. Since we plan to use the SPIR-V backend in the long term, this does not +seem to be worth the effort. + +2) We manually run passes inside `PreCodeGenModuleHook`. This is because we +don't run codegen, so we can't take advantage of the `PreCodeGenPassesHook` +field of `LTOConfig` to run some custom passes, as those passes are only run +when we actually are going to run codegen. + +3) We have to store the fully linked module. This is needed because we need a +fully linked module to accurately compute metadata, see the above explanation +of SYCL requirement 2). We could get around storing the module by computing the +metadata inside the LTO framework and storing it for late use by the SYCL +bundling code, but doing this would require SYCL-only customizations including +even more new function arguments and modifications of the `OffloadFile` class. +It's also complicated because the LTO framework is multithreaded, and not all +LLVM data structures are thread safe. + +The proposed long-term SYCL ThinLTO flow is as follows: + +![SYCL SPIR-V backend linking flow](images/ThinLTOSYCLSPIRVBackendFlow.svg) + +The biggest difference here is that we are running codegen using the SPIR-V +backend. + +Also, instead of using a lambda function in the `PreCodeGenModuleHook` +callback, we can take advantage of the `PreCodeGenPassesHook` field to add +passes to the pass manager that the LTO framework will run. + +It is possible that the number of device images in the fat executable +and which device image contains which kernel is different with ThinLTO +enabled, but we do expect this to have any impact on correctness or +performance, nor we do expect users to care. + + +# Current limitations + +`-O0`: Compiling with `-O0` prevent clang from generating ThinLTO metadata +during the compilation phase. In the current implementation, this is an error. +In the final version, we could either silently fall back to full LTO or +generate ThinLTO metadata even for `-O0`. + +SYCL libdevice: Current all `libdevice` functions are explicitly marked to be +weak symbols. The ThinLTO framework does not consider a defintion of function +with weak linkage as it cannot be sure that this definiton is the correct one. +Ideally we could remove the weak symbol annotation. + +No binary linkage: The SPIR-V target does not currently have a production +quality binary linker. This means that we must generate a fully linked image as +part of device linkage. At least for AMD devices, this is not a requirement as +`lld` is used for the final link which can resolve any unresolved symbols. +`-fno-gpu-rdc` is default for AMD, so in that case it can call `lld` during +compile, but if `-fno-gpu-rdc` is passed, the lld call happens as part of +`clang-linker-wrapper` to resolve any symbols not resolved by ThinLTO. \ No newline at end of file diff --git a/sycl/doc/design/images/ThinLTOCommunityFlow.svg b/sycl/doc/design/images/ThinLTOCommunityFlow.svg new file mode 100644 index 0000000000000..f2fe257a07af8 --- /dev/null +++ b/sycl/doc/design/images/ThinLTOCommunityFlow.svg @@ -0,0 +1 @@ +Extract device codefrom inputsclang-linker-wrapperProcess symbols andadd device code to LTOframeworkThinLTOframeworkImport functiondefinitionsOptimizationCodegenBundlingCall host linkerwith bundled fatobject \ No newline at end of file diff --git a/sycl/doc/design/images/ThinLTOSYCLFlow.svg b/sycl/doc/design/images/ThinLTOSYCLFlow.svg new file mode 100644 index 0000000000000..622f4d20d158c --- /dev/null +++ b/sycl/doc/design/images/ThinLTOSYCLFlow.svg @@ -0,0 +1 @@ +Extractdevicecodefrominputsclang-linker-wrapperProcesssymbolsandadddevicecodetoLTOframeworkThinLTOframeworkImportfunctiondefinitionsOptimizationBundlingCall host linkerwith bundledfat objectPreCodeGenModuleHookCall SPIR-VtranslatorStore fullylinkedmoduleEarlyexitComputemetadata for fullylinked modulesRun linkedmodulefinalizationpasses \ No newline at end of file diff --git a/sycl/doc/design/images/ThinLTOSYCLSPIRVBackendFlow.svg b/sycl/doc/design/images/ThinLTOSYCLSPIRVBackendFlow.svg new file mode 100644 index 0000000000000..2b048f531b8b7 --- /dev/null +++ b/sycl/doc/design/images/ThinLTOSYCLSPIRVBackendFlow.svg @@ -0,0 +1 @@ +Extractdevicecodefrominputsclang-linker-wrapperProcesssymbolsandadddevicecodetoLTOframeworkThinLTOframeworkImportfunctiondefinitionsOptimizationBundlingCall host linkerwith bundledfat objectPreCodeGenPassesHookStore fullylinkedmoduleCodegenComputemetadata for fullylinked modulesRun linkedmodulefinalizationpasses \ No newline at end of file From 8c4edb31c2afcfd0f046d47c3cd6913c9551fb26 Mon Sep 17 00:00:00 2001 From: "Sarnie, Nick" Date: Wed, 18 Sep 2024 10:09:18 -0700 Subject: [PATCH 4/9] fix merge Signed-off-by: Sarnie, Nick --- .../ClangLinkerWrapper.cpp | 7 -- foo | 68 ------------------- llvm/include/llvm/Object/OffloadBinary.h | 11 --- 3 files changed, 86 deletions(-) delete mode 100644 foo diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 165f1d253cd55..9bf3b7d7bb594 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -2641,13 +2641,6 @@ Expected> linkAndWrapDeviceFiles( if (HasSYCLOffloadKind) { StringRef TmpOutput; if (!sycl::isSYCLThinLTO(Args, Triple)) { - // Write device inputs to an output file for the linker. - for (const OffloadFile &File : Input) { - auto FileNameOrErr = writeOffloadFile(File); - if (!FileNameOrErr) - return FileNameOrErr.takeError(); - InputFiles.emplace_back(*FileNameOrErr); - } // Link the input device files using the device linker for SYCL // offload. auto TmpOutputOrErr = sycl::linkDevice(InputFiles, LinkerArgs); diff --git a/foo b/foo deleted file mode 100644 index 36206fcd9b1f0..0000000000000 --- a/foo +++ /dev/null @@ -1,68 +0,0 @@ -diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp -index 147b6010228a..9173ff2ac48b 100644 ---- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp -+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp -@@ -154,6 +154,8 @@ static std::atomic LTOError; - - static std::optional SYCLModuleSplitMode; - -+static bool UseSYCLPostLinkTool; -+ - SmallString<128> SPIRVDumpDir; - - using OffloadingImage = OffloadBinary::OffloadingImage; -@@ -2392,10 +2394,10 @@ Expected> linkAndWrapDeviceFiles( - SmallVector InputFilesSYCL; - InputFilesSYCL.emplace_back(*TmpOutputOrErr); - auto SplitModulesOrErr = -- SYCLModuleSplitMode -- ? sycl::runSYCLSplitLibrary(InputFilesSYCL, LinkerArgs, -- *SYCLModuleSplitMode) -- : sycl::runSYCLPostLinkTool(InputFilesSYCL, LinkerArgs); -+ UseSYCLPostLinkTool -+ ? sycl::runSYCLPostLinkTool(InputFilesSYCL, LinkerArgs) -+ : sycl::runSYCLSplitLibrary(InputFilesSYCL, LinkerArgs, -+ *SYCLModuleSplitMode); - if (!SplitModulesOrErr) - return SplitModulesOrErr.takeError(); - -@@ -2961,7 +2963,19 @@ int main(int Argc, char **Argv) { - timeTraceProfilerInitialize(Granularity, Argv[0]); - } - -+ UseSYCLPostLinkTool = Args.hasFlag(OPT_use_sycl_post_link_tool, -+ OPT_no_use_sycl_post_link_tool, true); -+ if (!UseSYCLPostLinkTool && Args.hasArg(OPT_use_sycl_post_link_tool)) -+ reportError(createStringError("-use-sycl-post-link-tool and " -+ "-no-use-sycl-post-link-tool options can't " -+ "be used together.")); -+ - if (Args.hasArg(OPT_sycl_module_split_mode_EQ)) { -+ if (UseSYCLPostLinkTool) -+ reportError(createStringError( -+ "-sycl-module-split-mode should be used with " -+ "the -no-use-sycl-post-link-tool command line option.")); -+ - StringRef StrMode = Args.getLastArgValue(OPT_sycl_module_split_mode_EQ); - SYCLModuleSplitMode = module_split::convertStringToSplitMode(StrMode); - if (!SYCLModuleSplitMode) -diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td -index 2926a08c8759..60a13b23ba30 100644 ---- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td -+++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td -@@ -184,6 +184,15 @@ def sycl_module_split_mode_EQ : - Flags<[WrapperOnlyOption]>, - HelpText<"Option that turns on split library with the given split mode">; - -+// TODO: Options will be removed when the sycl-post-link tool becomes removed. -+def use_sycl_post_link_tool : Flag<["--", "-"], "use-sycl-post-link-tool">, -+ Flags<[WrapperOnlyOption]>, -+ HelpText<"Use the sycl-post-link tool. On by default">; -+ -+def no_use_sycl_post_link_tool : Flag<["--", "-"], "no-use-sycl-post-link-tool">, -+ Flags<[WrapperOnlyOption]>, -+ HelpText<"Use a SYCL library instead of sycl-post-link tool. (experimental)">; -+ - // Special option to pass in llvm-spirv options - def llvm_spirv_options_EQ : Joined<["--", "-"], "llvm-spirv-options=">, - Flags<[WrapperOnlyOption]>, diff --git a/llvm/include/llvm/Object/OffloadBinary.h b/llvm/include/llvm/Object/OffloadBinary.h index 3d1b18b7f3418..d590110ffa598 100644 --- a/llvm/include/llvm/Object/OffloadBinary.h +++ b/llvm/include/llvm/Object/OffloadBinary.h @@ -103,14 +103,6 @@ class OffloadBinary : public Binary { StringRef getString(StringRef Key) const { return StringData.lookup(Key); } - /// XXX: Hack - const SmallVectorImpl &getTmpStrings() const { - return TmpStringData; - } - - /// XXX: Hack - void addTmpString(std::string Value) { TmpStringData.push_back(Value); } - static bool classof(const Binary *V) { return V->isOffloadFile(); } struct Header { @@ -159,9 +151,6 @@ class OffloadBinary : public Binary { const Header *TheHeader; /// Location of the metadata entries within the binary. const Entry *TheEntry; - - /// XXX: Hack - SmallVector TmpStringData; }; /// A class to contain the binary information for a single OffloadBinary that From 33226848b7f23d96f4e5ca41085887e529e92ab0 Mon Sep 17 00:00:00 2001 From: Sarnie Date: Thu, 19 Sep 2024 07:12:51 -0700 Subject: [PATCH 5/9] fix nvptx libdevice duplicate symbol error, investigate later Signed-off-by: Sarnie --- libdevice/fallback-cassert.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/libdevice/fallback-cassert.cpp b/libdevice/fallback-cassert.cpp index 5d3c99d63c556..1c685737002f4 100644 --- a/libdevice/fallback-cassert.cpp +++ b/libdevice/fallback-cassert.cpp @@ -114,9 +114,4 @@ DEVICE_EXTERN_C void __devicelib_assert_fail(const char *expr, const char *file, __assertfail(expr, file, line, func, 1); } -DEVICE_EXTERN_C void _wassert(const char *_Message, const char *_File, - unsigned _Line) { - __assertfail(_Message, _File, _Line, 0, 1); -} - #endif From 43df8c948764749a7a40421ddfc14acb69721f6e Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Thu, 19 Sep 2024 10:39:39 -0700 Subject: [PATCH 6/9] typos Signed-off-by: Nick Sarnie --- sycl/doc/design/ThinLTO.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/doc/design/ThinLTO.md b/sycl/doc/design/ThinLTO.md index 3ffb7819bef68..b1cf4d4082698 100644 --- a/sycl/doc/design/ThinLTO.md +++ b/sycl/doc/design/ThinLTO.md @@ -91,7 +91,7 @@ However, the flow is not ideal for many reasons: backend. We could slightly improve this issue by using a library call to the SPIR-V translator instead of the tool, however the library API requires setting up an object to represent the arguments while we only have strings, and it's -non-trivial to parse the trings to figure out how to create the argument +non-trivial to parse the strings to figure out how to create the argument object. Since we plan to use the SPIR-V backend in the long term, this does not seem to be worth the effort. @@ -104,9 +104,9 @@ when we actually are going to run codegen. fully linked module to accurately compute metadata, see the above explanation of SYCL requirement 2). We could get around storing the module by computing the metadata inside the LTO framework and storing it for late use by the SYCL -bundling code, but doing this would require SYCL-only customizations including +bundling code, but doing this would require even more SYCL-only customizations including even more new function arguments and modifications of the `OffloadFile` class. -It's also complicated because the LTO framework is multithreaded, and not all +There are also compliations because the LTO framework is multithreaded, and not all LLVM data structures are thread safe. The proposed long-term SYCL ThinLTO flow is as follows: @@ -117,7 +117,7 @@ The biggest difference here is that we are running codegen using the SPIR-V backend. Also, instead of using a lambda function in the `PreCodeGenModuleHook` -callback, we can take advantage of the `PreCodeGenPassesHook` field to add +callback to run SYCL finalization passes, we can take advantage of the `PreCodeGenPassesHook` field to add passes to the pass manager that the LTO framework will run. It is possible that the number of device images in the fat executable From 9922d3093eea9545aea04d6ba06443e37e57c1ed Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Thu, 19 Sep 2024 16:48:55 -0700 Subject: [PATCH 7/9] [NFC] Fix typos, markdown linter issues Among other changes: - removed trailing spaces - fixed 80-char line limitations --- sycl/doc/design/CompilerAndRuntimeDesign.md | 4 +- sycl/doc/design/ThinLTO.md | 190 ++++++++++---------- 2 files changed, 98 insertions(+), 96 deletions(-) diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md index e22492970d99e..7843b5d3b88e6 100644 --- a/sycl/doc/design/CompilerAndRuntimeDesign.md +++ b/sycl/doc/design/CompilerAndRuntimeDesign.md @@ -550,8 +550,8 @@ unit) - `off` - disables device code split. If `-fno-sycl-rdc` is specified, the behavior is the same as `per_source` -If ThinLTO is enabled, device code splitting is run during the compilation stage. -See [here](ThinLTO.md) for more information. +If ThinLTO is enabled, device code splitting is run during the compilation +stage. See [here](ThinLTO.md) for more information. ##### Symbol table generation diff --git a/sycl/doc/design/ThinLTO.md b/sycl/doc/design/ThinLTO.md index b1cf4d4082698..41a0a00786bdf 100644 --- a/sycl/doc/design/ThinLTO.md +++ b/sycl/doc/design/ThinLTO.md @@ -6,142 +6,144 @@ This document describes the purpose and design of ThinLTO for SYCL. ## Background -With traditional SYCL device code linking, all user code is linked together -along with device libraries into a single huge module and then split and -processed by `sycl-post-link`. This requires sequential processing, has a large +With traditional SYCL device code linking, all user code is linked together +along with device libraries into a single huge module and then split and +processed by `sycl-post-link`. This requires sequential processing, has a large memory footprint, and differs from the linking flow for AMD and NVIDIA devices. ## Summary -SYCL ThinLTO will hook into the existing community mechanism to run LTO as part -of device linking inside `clang-linker-wrapper`. We split the device images -early at compilation time, and at link time we use ThinLTO's function importing -feature -to bring in the defintions for referenced functions. Only the new offload model -is supported. + +SYCL ThinLTO will hook into the existing community mechanism to run LTO as part +of device linking inside `clang-linker-wrapper`. We split the device images +early at compilation time, and at link time we use ThinLTO's function importing +feature to bring in the definitions for referenced functions. Only the new +offload model is supported. ## Device code compilation time changes -Most of the changes for ThinLTO occur during device link time, however there is -one major change during compilation (-c) time: we now run device code split -during compilaton instead of linking. -The main reason for doing this is increased parallelization. Many compilation -jobs can be run at the same time, but linking happens once total for the -application. Device code split is currently a common source of performance -issues. - -Splitting early means that the resulting IR after splitting is not complete, it -still may contain calls to functions (user code and/or the SYCL device + +Most of the changes for ThinLTO occur during device link time, however there is +one major change during compilation (-c) time: we now run device code split +during compilation instead of linking. The main reason for doing this is +increased parallelization. Many compilation jobs can be run at the same time, +but linking happens once total for the application. Device code split is +currently a common source of performance issues. + +Splitting early means that the resulting IR after splitting is not complete, it +still may contain calls to functions (user code and/or the SYCL device libraries) from other object files. -We rely on the assumption that all function defintions matching a declaration +We rely on the assumption that all function definitions matching a declaration will be the same and we can let ThinLTO pull in any one. -For example, let's start with user device code that defines a `SYCL_EXTERNAL` -function `foo` in translation unit `tu_foo`. There is also another translation -unit `tu_bar` that references `foo`. -During the early device code splitting run of `tu_foo`, we may find that more -than one of the resultant device images contain a defintion for `foo`. +For example, let's start with user device code that defines a `SYCL_EXTERNAL` +function `foo` in translation unit `tu_foo`. There is also another translation +unit `tu_bar` that references `foo`. During the early device code splitting run +of `tu_foo`, we may find that more than one of the resultant device images +contain a definition for `foo`. -We assert that any function defintion for `foo` that is deemed a match by the -ThinLTO infrastruction during the processing of `tu_bar` is valid. +We assert that any function definition for `foo` that is deemed a match by the +ThinLTO infrastructure during the processing of `tu_bar` is valid. -As a result of running early device code split, the fat object file generated -as part of device compilation may contain multiple device code images. +As a result of running early device code split, the fat object file generated as +part of device compilation may contain multiple device code images. -# Device code link time changes +## Device code link time changes -Before we go into the link time changes for SYCL, let's understand the device +Before we go into the link time changes for SYCL, let's understand the device linking flow for community devices (AMD/NVIDIA): ![Community linking flow](images/ThinLTOCommunityFlow.svg) -SYCL has two differenting requirements: +SYCL has two differentiating requirements: + 1) The SPIR-V backend is not production ready and the SPIR-V translator is used. -2) The SYCL runtime requires metadata (module properties and module symbol -table) computed from device images that will be stored along the device images +2) The SYCL runtime requires metadata (module properties and module symbol +table) computed from device images that will be stored along the device images in the fat executable. -The effect of requirement 1) is that instead of letting ThinLTO call the SPIR-V -backend, we add a callback that runs right before codegen would run. -In that callback, we call the SPIR-V translator and store the resultant file -path for use later, and we instruct the ThinLTO framework to not -perform codegen. - -An interesting additional fact about requirement 2) is that we actually need to -process fully linked module to accurate compute the module properties. One -example where we need the full module is to [compute the required devicelib mask](https://github.com/intel/llvm/blob/sycl/llvm/lib/SYCLLowerIR/SYCLDeviceLibReqMask.cpp). -If we only process the device code that was included in the -original fat object input to `clang-linker-wrapper`, we will miss devicelib -calls in referenced `SYCL_EXTERNAL` functions. - -The effect of requirement 2) is that we store the fully linked device image for -metadata computation in the SYCL-specific handing code after the ThinLTO -framework has completed. Another option would be to try to compute the metadata -inside the ThinLTO framework callbacks, but this would require SYCL-specific +The effect of requirement 1) is that instead of letting ThinLTO call the SPIR-V +backend, we add a callback that runs right before CodeGen would run. In that +callback, we call the SPIR-V translator and store the resultant file path for +use later, and we instruct the ThinLTO framework to not perform CodeGen. + +An interesting additional fact about requirement 2) is that we actually need to +process fully linked module to accurate compute the module properties. One +example where we need the full module is to [compute the required devicelib +mask](https://github.com/intel/llvm/blob/sycl/llvm/lib/SYCLLowerIR/SYCLDeviceLibReqMask.cpp). +If we only process the device code that was included in the original fat object +input to `clang-linker-wrapper`, we will miss devicelib calls in referenced +`SYCL_EXTERNAL` functions. + +The effect of requirement 2) is that we store the fully linked device image for +metadata computation in the SYCL-specific handing code after the ThinLTO +framework has completed. Another option would be to try to compute the metadata +inside the ThinLTO framework callbacks, but this would require SYCL-specific arguments to many caller functions in the stack and pollute community code. Here is the current ThinLTO flow for SYCL: ![SYCL linking flow](images/ThinLTOSYCLFlow.svg) -We add a `PreCodeGenModuleHook` function to the `LTOConfig` object so that we +We add a `PreCodeGenModuleHook` function to the `LTOConfig` object so that we can process the fully linked module without running the backend. However, the flow is not ideal for many reasons: -1) We are relying on the external `llvm-spirv` tool instead of the SPIR-V -backend. We could slightly improve this issue by using a library call to the -SPIR-V translator instead of the tool, however the library API requires setting -up an object to represent the arguments while we only have strings, and it's -non-trivial to parse the strings to figure out how to create the argument -object. Since we plan to use the SPIR-V backend in the long term, this does not + +1) We are relying on the external `llvm-spirv` tool instead of the SPIR-V +backend. We could slightly improve this issue by using a library call to the +SPIR-V translator instead of the tool, however the library API requires setting +up an object to represent the arguments while we only have strings, and it's +non-trivial to parse the strings to figure out how to create the argument +object. Since we plan to use the SPIR-V backend in the long term, this does not seem to be worth the effort. -2) We manually run passes inside `PreCodeGenModuleHook`. This is because we -don't run codegen, so we can't take advantage of the `PreCodeGenPassesHook` -field of `LTOConfig` to run some custom passes, as those passes are only run -when we actually are going to run codegen. +2) We manually run passes inside `PreCodeGenModuleHook`. This is because we +don't run CodeGen, so we can't take advantage of the `PreCodeGenPassesHook` +field of `LTOConfig` to run some custom passes, as those passes are only run +when we actually are going to run CodeGen. -3) We have to store the fully linked module. This is needed because we need a -fully linked module to accurately compute metadata, see the above explanation -of SYCL requirement 2). We could get around storing the module by computing the -metadata inside the LTO framework and storing it for late use by the SYCL -bundling code, but doing this would require even more SYCL-only customizations including -even more new function arguments and modifications of the `OffloadFile` class. -There are also compliations because the LTO framework is multithreaded, and not all -LLVM data structures are thread safe. +3) We have to store the fully linked module. This is needed because we need a +fully linked module to accurately compute metadata, see the above explanation of +SYCL requirement 2). We could get around storing the module by computing the +metadata inside the LTO framework and storing it for late use by the SYCL +bundling code, but doing this would require even more SYCL-only customizations +including even more new function arguments and modifications of the +`OffloadFile` class. There are also compilations because the LTO framework is +multithreaded, and not all LLVM data structures are thread safe. The proposed long-term SYCL ThinLTO flow is as follows: ![SYCL SPIR-V backend linking flow](images/ThinLTOSYCLSPIRVBackendFlow.svg) -The biggest difference here is that we are running codegen using the SPIR-V +The biggest difference here is that we are running CodeGen using the SPIR-V backend. -Also, instead of using a lambda function in the `PreCodeGenModuleHook` -callback to run SYCL finalization passes, we can take advantage of the `PreCodeGenPassesHook` field to add -passes to the pass manager that the LTO framework will run. - -It is possible that the number of device images in the fat executable -and which device image contains which kernel is different with ThinLTO -enabled, but we do expect this to have any impact on correctness or -performance, nor we do expect users to care. +Also, instead of using a lambda function in the `PreCodeGenModuleHook` callback +to run SYCL finalization passes, we can take advantage of the +`PreCodeGenPassesHook` field to add passes to the pass manager that the LTO +framework will run. +It is possible that the number of device images in the fat executable and which +device image contains which kernel is different with ThinLTO enabled, but we do +expect this to have any impact on correctness or performance, nor we do expect +users to care. -# Current limitations +## Current limitations -`-O0`: Compiling with `-O0` prevent clang from generating ThinLTO metadata -during the compilation phase. In the current implementation, this is an error. -In the final version, we could either silently fall back to full LTO or -generate ThinLTO metadata even for `-O0`. +`-O0`: Compiling with `-O0` prevent clang from generating ThinLTO metadata +during the compilation phase. In the current implementation, this is an error. +In the final version, we could either silently fall back to full LTO or generate +ThinLTO metadata even for `-O0`. -SYCL libdevice: Current all `libdevice` functions are explicitly marked to be -weak symbols. The ThinLTO framework does not consider a defintion of function -with weak linkage as it cannot be sure that this definiton is the correct one. +SYCL libdevice: Current all `libdevice` functions are explicitly marked to be +weak symbols. The ThinLTO framework does not consider a definition of function +with weak linkage as it cannot be sure that this definition is the correct one. Ideally we could remove the weak symbol annotation. -No binary linkage: The SPIR-V target does not currently have a production -quality binary linker. This means that we must generate a fully linked image as -part of device linkage. At least for AMD devices, this is not a requirement as -`lld` is used for the final link which can resolve any unresolved symbols. -`-fno-gpu-rdc` is default for AMD, so in that case it can call `lld` during -compile, but if `-fno-gpu-rdc` is passed, the lld call happens as part of -`clang-linker-wrapper` to resolve any symbols not resolved by ThinLTO. \ No newline at end of file +No binary linkage: The SPIR-V target does not currently have a production +quality binary linker. This means that we must generate a fully linked image as +part of device linkage. At least for AMD devices, this is not a requirement as +`lld` is used for the final link which can resolve any unresolved symbols. +`-fno-gpu-rdc` is default for AMD, so in that case it can call `lld` during +compile, but if `-fno-gpu-rdc` is passed, the lld call happens as part of +`clang-linker-wrapper` to resolve any symbols not resolved by ThinLTO. From 1deaea2839c0bef5fd0963e7dd253779b22b6162 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Thu, 19 Sep 2024 17:23:36 -0700 Subject: [PATCH 8/9] [NFC] Small word tweaking. --- sycl/doc/design/ThinLTO.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/doc/design/ThinLTO.md b/sycl/doc/design/ThinLTO.md index 41a0a00786bdf..58232f0aff157 100644 --- a/sycl/doc/design/ThinLTO.md +++ b/sycl/doc/design/ThinLTO.md @@ -30,7 +30,7 @@ currently a common source of performance issues. Splitting early means that the resulting IR after splitting is not complete, it still may contain calls to functions (user code and/or the SYCL device -libraries) from other object files. +libraries) defined in other translation units. We rely on the assumption that all function definitions matching a declaration will be the same and we can let ThinLTO pull in any one. @@ -50,7 +50,7 @@ part of device compilation may contain multiple device code images. ## Device code link time changes Before we go into the link time changes for SYCL, let's understand the device -linking flow for community devices (AMD/NVIDIA): +linking flow for AMD/NVIDIA devices: ![Community linking flow](images/ThinLTOCommunityFlow.svg) From c68e7976bf6a49d983767db21b004f4083e2bc0a Mon Sep 17 00:00:00 2001 From: "Sarnie, Nick" Date: Mon, 23 Sep 2024 08:42:46 -0700 Subject: [PATCH 9/9] address initial feedback Signed-off-by: Sarnie, Nick --- .../ClangLinkerWrapper.cpp | 25 +++++++------------ 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 9bf3b7d7bb594..e58883a932efe 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -640,14 +640,13 @@ static Error getDeviceLibsForLTO(SmallVector &DeviceLibs, "Number of device library files cannot be zero."); for (StringRef Val : A->getValues()) { SmallString<128> LibName(Val); - if (llvm::sys::fs::exists(LibName)) { - if (auto Err = processFile(LibName)) - return Err; - } else + if (!llvm::sys::fs::exists(LibName)) return createStringError( inconvertibleErrorCode(), std::string(LibName) + " SYCL device library file for NVPTX is not found."); + if (auto Err = processFile(LibName)) + return Err; } } } @@ -692,19 +691,17 @@ static Expected convertSPIRVToIR(StringRef Filename, static bool considerOnlyKernelsAsEntryPoints(const ArgList &Args, const llvm::Triple Triple) { const llvm::Triple HostTriple(Args.getLastArgValue(OPT_host_triple_EQ)); - bool SYCLNativeCPU = (HostTriple == Triple); // On Intel targets we don't need non-kernel functions as entry points, // because it only increases amount of code for device compiler to handle, // without any actual benefits. // TODO: Try to extend this feature for non-Intel GPUs. - return (!Args.hasFlag(OPT_no_sycl_remove_unused_external_funcs, - OPT_sycl_remove_unused_external_funcs, false) && - !SYCLNativeCPU) && - !Triple.isNVPTX() && !Triple.isAMDGPU(); + return !Args.hasFlag(OPT_no_sycl_remove_unused_external_funcs, + OPT_sycl_remove_unused_external_funcs, false) && + Triple.isSPIROrSPIRV(); } bool isSYCLThinLTO(const ArgList &Args, const llvm::Triple Triple) { - // TODO: Support CUDA/HIP + // TODO: Support AMDGPU/NVPTX targets return Triple.isSPIROrSPIRV() && Args.hasArg(OPT_sycl_thin_lto); } @@ -747,10 +744,6 @@ getTripleBasedSYCLPostLinkOpts(const ArgList &Args, if (NoSplit && (Triple.getSubArch() != llvm::Triple::SPIRSubArch_fpga)) PostLinkArgs.push_back("-split=auto"); - // On Intel targets we don't need non-kernel functions as entry points, - // because it only increases amount of code for device compiler to handle, - // without any actual benefits. - // TODO: Try to extend this feature for non-Intel GPUs. if (considerOnlyKernelsAsEntryPoints(Args, Triple)) PostLinkArgs.push_back("-emit-only-kernels-as-entry-points"); @@ -1917,8 +1910,8 @@ std::unique_ptr createLTO( .str(); auto PreCodeGenSaveTemps = [=](size_t Task, const Module &M) { std::string File = - !Task ? TempName + ".postopt.bc" - : TempName + "." + std::to_string(Task) + ".postopt.bc"; + !Task ? TempName + ".precodegen.bc" + : TempName + "." + std::to_string(Task) + ".precodegen.bc"; error_code EC; raw_fd_ostream LinkedBitcode(File, EC, sys::fs::OF_None); if (EC)