diff --git a/clang/tools/clang-linker-wrapper/CMakeLists.txt b/clang/tools/clang-linker-wrapper/CMakeLists.txt index 9dc1f244f2802..8ad197d85d535 100644 --- a/clang/tools/clang-linker-wrapper/CMakeLists.txt +++ b/clang/tools/clang-linker-wrapper/CMakeLists.txt @@ -1,5 +1,6 @@ set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} + BitReader BitWriter Core BinaryFormat diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 9173ff2ac48bd..e58883a932efe 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -42,7 +42,9 @@ #include "llvm/Option/Option.h" #include "llvm/Passes/PassPlugin.h" #include "llvm/Remarks/HotnessThresholdParser.h" +#include "llvm/SYCLLowerIR/ComputeModuleRuntimeInfo.h" #include "llvm/SYCLLowerIR/ModuleSplitter.h" +#include "llvm/SYCLLowerIR/SYCLLinkedModuleProcessor.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Errc.h" #include "llvm/Support/FileOutputBuffer.h" @@ -600,6 +602,57 @@ static Error getSYCLDeviceLibs(SmallVector &DeviceLibFiles, return Error::success(); } +static Error getDeviceLibsForLTO(SmallVector &DeviceLibs, + const ArgList &Args, + const llvm::Triple Triple) { + // TODO: Fix copy paste + SmallVector DeviceLibFiles; + if (Error Err = sycl::getSYCLDeviceLibs(DeviceLibFiles, Args)) + return Err; + + auto processFile = [&](StringRef File) { + auto BufferOrErr = MemoryBuffer::getFile(File); + if (!BufferOrErr) + return createFileError(File, BufferOrErr.getError()); + auto Buffer = std::move(*BufferOrErr); + SmallVector Candidates; + if (Error Err = + extractOffloadBinaries(Buffer->getMemBufferRef(), Candidates)) + return Err; + for (OffloadFile &OffF : Candidates) + if (llvm::Triple(OffF.getBinary()->getTriple()) == Triple) + DeviceLibs.emplace_back(std::move(OffF)); + return Error(Error::success()); + }; + + for (auto &File : DeviceLibFiles) { + + if (Error Err = processFile(File)) + return Err; + } + + // For NVPTX backend we need to also link libclc and CUDA libdevice. + if (Triple.isNVPTX()) { + if (Arg *A = Args.getLastArg(OPT_sycl_nvptx_device_lib_EQ)) { + if (A->getValues().size() == 0) + return createStringError( + inconvertibleErrorCode(), + "Number of device library files cannot be zero."); + for (StringRef Val : A->getValues()) { + SmallString<128> LibName(Val); + if (!llvm::sys::fs::exists(LibName)) + return createStringError( + inconvertibleErrorCode(), + std::string(LibName) + + " SYCL device library file for NVPTX is not found."); + if (auto Err = processFile(LibName)) + return Err; + } + } + } + return Error::success(); +} + /// This routine is used to convert SPIR-V input files into LLVM IR files. /// 'llvm-spirv -r' command is used for this purpose. /// If input is not a SPIR-V file, then the original file is returned. @@ -635,6 +688,30 @@ static Expected convertSPIRVToIR(StringRef Filename, return *TempFileOrErr; } +static bool considerOnlyKernelsAsEntryPoints(const ArgList &Args, + const llvm::Triple Triple) { + const llvm::Triple HostTriple(Args.getLastArgValue(OPT_host_triple_EQ)); + // On Intel targets we don't need non-kernel functions as entry points, + // because it only increases amount of code for device compiler to handle, + // without any actual benefits. + // TODO: Try to extend this feature for non-Intel GPUs. + return !Args.hasFlag(OPT_no_sycl_remove_unused_external_funcs, + OPT_sycl_remove_unused_external_funcs, false) && + Triple.isSPIROrSPIRV(); +} + +bool isSYCLThinLTO(const ArgList &Args, const llvm::Triple Triple) { + // TODO: Support AMDGPU/NVPTX targets + return Triple.isSPIROrSPIRV() && Args.hasArg(OPT_sycl_thin_lto); +} + +bool areSpecConstsSupported(const ArgList &Args, const llvm::Triple Triple) { + const llvm::Triple HostTriple(Args.getLastArgValue(OPT_host_triple_EQ)); + bool SYCLNativeCPU = (HostTriple == Triple); + return (!Triple.isNVPTX() && !Triple.isAMDGCN() && !Triple.isSPIRAOT() && + !SYCLNativeCPU); +} + /// Add any sycl-post-link options that rely on a specific Triple in addition /// to user supplied options. /// NOTE: Any changes made here should be reflected in the similarly named @@ -667,14 +744,7 @@ getTripleBasedSYCLPostLinkOpts(const ArgList &Args, if (NoSplit && (Triple.getSubArch() != llvm::Triple::SPIRSubArch_fpga)) PostLinkArgs.push_back("-split=auto"); - // On Intel targets we don't need non-kernel functions as entry points, - // because it only increases amount of code for device compiler to handle, - // without any actual benefits. - // TODO: Try to extend this feature for non-Intel GPUs. - if ((!Args.hasFlag(OPT_no_sycl_remove_unused_external_funcs, - OPT_sycl_remove_unused_external_funcs, false) && - !SYCLNativeCPU) && - !Triple.isNVPTX() && !Triple.isAMDGPU()) + if (considerOnlyKernelsAsEntryPoints(Args, Triple)) PostLinkArgs.push_back("-emit-only-kernels-as-entry-points"); if (!Triple.isAMDGCN()) @@ -687,7 +757,7 @@ getTripleBasedSYCLPostLinkOpts(const ArgList &Args, bool SplitEsimd = Args.hasFlag(OPT_sycl_device_code_split_esimd, OPT_no_sycl_device_code_split_esimd, SplitEsimdByDefault); - if (!Args.hasArg(OPT_sycl_thin_lto)) + if (!isSYCLThinLTO(Args, Triple)) PostLinkArgs.push_back("-symbols"); // Specialization constant info generation is mandatory - // add options unconditionally @@ -891,27 +961,33 @@ getTripleBasedSPIRVTransOpts(const ArgList &Args, TranslatorArgs.push_back(Args.MakeArgString(ExtArg)); } +void computeLLVMToSPIRVTranslationToolArgs(const ArgList &Args, + SmallVector &CmdArgs) { + const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); + getTripleBasedSPIRVTransOpts(Args, CmdArgs, Triple); + StringRef LLVMToSPIRVOptions; + if (Arg *A = Args.getLastArg(OPT_llvm_spirv_options_EQ)) + LLVMToSPIRVOptions = A->getValue(); + LLVMToSPIRVOptions.split(CmdArgs, " ", /* MaxSplit = */ -1, + /* KeepEmpty = */ false); +} + /// Run LLVM to SPIR-V translation. /// Converts 'File' from LLVM bitcode to SPIR-V format using llvm-spirv tool. /// 'Args' encompasses all arguments required for linking and wrapping device /// code and will be parsed to generate options required to be passed into the /// llvm-spirv tool. -static Expected runLLVMToSPIRVTranslation(StringRef File, - const ArgList &Args) { + +static Expected +runLLVMToSPIRVTranslation(StringRef File, + SmallVectorImpl &&CmdArgs) { Expected LLVMToSPIRVPath = findProgram("llvm-spirv", {getMainExecutable("llvm-spirv")}); if (!LLVMToSPIRVPath) return LLVMToSPIRVPath.takeError(); - SmallVector CmdArgs; - CmdArgs.push_back(*LLVMToSPIRVPath); - const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); - getTripleBasedSPIRVTransOpts(Args, CmdArgs, Triple); - StringRef LLVMToSPIRVOptions; - if (Arg *A = Args.getLastArg(OPT_llvm_spirv_options_EQ)) - LLVMToSPIRVOptions = A->getValue(); - LLVMToSPIRVOptions.split(CmdArgs, " ", /* MaxSplit = */ -1, - /* KeepEmpty = */ false); + CmdArgs.insert(CmdArgs.begin(), (*LLVMToSPIRVPath)); + CmdArgs.push_back("-o"); // Create a new file to write the translated file to. @@ -951,6 +1027,13 @@ static Expected runLLVMToSPIRVTranslation(StringRef File, return *TempFileOrErr; } +static Expected runLLVMToSPIRVTranslation(StringRef File, + const ArgList &Args) { + SmallVector ToolArgs; + computeLLVMToSPIRVTranslationToolArgs(Args, ToolArgs); + return runLLVMToSPIRVTranslation(File, std::move(ToolArgs)); +} + /// Adds all AOT backend options required for SYCL AOT compilation step to /// 'CmdArgs'. /// 'Args' encompasses all arguments required for linking and wrapping device @@ -1497,6 +1580,43 @@ Error extractBundledObjects(StringRef Filename, const ArgList &Args, return Error::success(); } +llvm::sycl::GlobalBinImageProps +computeGlobalBinProps(const ArgList &Args, const llvm::Triple Triple) { + auto findParam = [](const SmallVectorImpl &Vec, StringRef Param) { + for (auto El : Vec) + if (Param == El) + return true; + return false; + }; + SmallVector CmdArgs; + getTripleBasedSYCLPostLinkOpts(Args, CmdArgs, Triple); + bool EmitKernelParamInfo = findParam(CmdArgs, "-emit-param-info"); + bool EmitProgramMetadata = findParam(CmdArgs, "-emit-program-metadata"); + bool EmitExportedSymbols = findParam(CmdArgs, "-emit-exported-symbols"); + bool EmitImportedSymbols = findParam(CmdArgs, "-emit-imported-symbols"); + // DeviceGlobals is not triple-based, so it will be present in Args. + bool DeviceGlobals = false; + if (Arg *A = Args.getLastArg(OPT_sycl_post_link_options_EQ)) + DeviceGlobals = StringRef(A->getValue()).contains("-device-globals"); + + return {EmitKernelParamInfo, EmitProgramMetadata, EmitExportedSymbols, + EmitImportedSymbols, DeviceGlobals}; +} + +Error validateThinLTOModule(BitcodeModule &M, const ArgList &Args) { + Expected LTOInfo = M.getLTOInfo(); + if (!LTOInfo || !(*LTOInfo).IsThinLTO) + return createStringError( + "All code must be compiled with -foffload-lto=thin"); + + // For O0 we don't run function importing so it defeats + // the whole point of thinLTO. Maybe we could lift this + // restriction by enabling only required passes for importing for O0. + if (Args.getLastArgValue(OPT_opt_level, "") == "O0") + return createStringError("O0 is not supported"); + return Error::success(); +} + } // namespace sycl namespace generic { @@ -1729,6 +1849,9 @@ std::vector getTargetFeatures(ArrayRef InputFiles) { template > std::unique_ptr createLTO( const ArgList &Args, const std::vector &Features, + SmallVectorImpl &BitcodeInputFiles, + SmallVectorImpl &Files, + std::vector ModulesToCompile = {}, ModuleHook Hook = [](size_t, const Module &) { return true; }) { const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); // We need to remove AMD's target-id from the processor if present. @@ -1758,6 +1881,14 @@ std::unique_ptr createLTO( Conf.OptLevel = OptLevel[1] - '0'; Conf.DefaultTriple = Triple.getTriple(); + // We need to set up the backend to use thinLTO + // even if we don't actually use it, and there is no + // backend for the spir64 triple, so override it to + // the SPIR-V backlend. + // TODO: Remove once SYCL uses the SPIR-V backend. + if (sycl::isSYCLThinLTO(Args, Triple)) + Conf.OverrideTriple = "spirv64-unknown-unknown"; + // TODO: Should we complain about combining --opt-level and -passes, as opt // does? That might be too limiting in clang-linker-wrapper, so for now we // just warn in the help entry for -passes that the default corresponding @@ -1774,11 +1905,21 @@ std::unique_ptr createLTO( Conf.PTO.LoopVectorization = Conf.OptLevel > 1; Conf.PTO.SLPVectorization = Conf.OptLevel > 1; - + std::string TempName = (sys::path::filename(ExecutableName) + "." + + Triple.getTriple() + "." + Arch) + .str(); + auto PreCodeGenSaveTemps = [=](size_t Task, const Module &M) { + std::string File = + !Task ? TempName + ".precodegen.bc" + : TempName + "." + std::to_string(Task) + ".precodegen.bc"; + error_code EC; + raw_fd_ostream LinkedBitcode(File, EC, sys::fs::OF_None); + if (EC) + reportError(errorCodeToError(EC)); + WriteBitcodeToFile(M, LinkedBitcode); + return true; + }; if (SaveTemps) { - std::string TempName = (sys::path::filename(ExecutableName) + "." + - Triple.getTriple() + "." + TargetID) - .str(); Conf.PostInternalizeModuleHook = [=](size_t Task, const Module &M) { std::string File = !Task ? TempName + ".postlink.bc" @@ -1790,17 +1931,7 @@ std::unique_ptr createLTO( WriteBitcodeToFile(M, LinkedBitcode); return true; }; - Conf.PreCodeGenModuleHook = [=](size_t Task, const Module &M) { - std::string File = - !Task ? TempName + ".postopt.bc" - : TempName + "." + std::to_string(Task) + ".postopt.bc"; - error_code EC; - raw_fd_ostream LinkedBitcode(File, EC, sys::fs::OF_None); - if (EC) - reportError(errorCodeToError(EC)); - WriteBitcodeToFile(M, LinkedBitcode); - return true; - }; + Conf.PreCodeGenModuleHook = PreCodeGenSaveTemps; } Conf.PostOptModuleHook = Hook; Conf.CGFileType = (Triple.isNVPTX() || SaveTemps) @@ -1809,6 +1940,90 @@ std::unique_ptr createLTO( // TODO: Handle remark files Conf.HasWholeProgramVisibility = Args.hasArg(OPT_whole_program); + if (sycl::isSYCLThinLTO(Args, Triple)) { + // Passing Args to each thinLTO thread causes crashes, so compute everything + // we can here. + const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); + SmallVector SPIRVArgs; + sycl::computeLLVMToSPIRVTranslationToolArgs(Args, SPIRVArgs); + auto SpecConstArg = sycl::areSpecConstsSupported(Args, Triple) + ? SpecConstantsPass::HandlingMode::native + : SpecConstantsPass::HandlingMode::emulation; + Conf.PreCodeGenModuleHook = [=, &BitcodeInputFiles, + &Files](unsigned Task, const Module &M) { + // Here we process the IR from each BC file, save the module for later + // use, convert to SPV (using the translator for now) and save the path to + // the output file. + + assert(Task != 0 && "Unexpected task"); + auto &OffloadF = BitcodeInputFiles[Task - 1]; + if (OffloadF.getBinary()->getOffloadKind() != OFK_SYCL) { + if (SaveTemps) + PreCodeGenSaveTemps(Task, M); + return true; + } + + if (SaveTemps) + PreCodeGenSaveTemps(Task, M); + + // Use the legacy PM because eventually we will use the + // PreCodeGenPassesHook field of LTOConfig which requires the legacy PM. + legacy::PassManager PM; + + // LTO does not continue processing the module after this + // function finishes, so it's safe to modify the module. + PM.add(createSYCLLinkedModuleProcessorPass(SpecConstArg)); + PM.run(const_cast(M)); + + auto IRFile = createOutputFile(sys::path::filename(ExecutableName) + "." + + std::to_string(Task) + ".to.spv", + "spv"); + if (!IRFile) + reportError(IRFile.takeError()); + error_code EC; + raw_fd_ostream LinkedBitcode(*IRFile, EC, sys::fs::OF_None); + if (EC) + reportError(errorCodeToError(EC)); + WriteBitcodeToFile(M, LinkedBitcode); + LinkedBitcode.close(); + { + // Overwrite the fully linked module in BitcodeInputFiles + // so we can compute the module properties and symbol table. + // We need a fully linked module to accurately compute these. + llvm::ErrorOr> ImageOrError = + llvm::MemoryBuffer::getFileOrSTDIN(*IRFile); + assert(ImageOrError); + OffloadingImage Image{}; + Image.TheImageKind = IMG_Bitcode; + Image.TheOffloadKind = OffloadF.getBinary()->getOffloadKind(); + Image.StringData["triple"] = OffloadF.getBinary()->getTriple(); + Image.StringData["arch"] = OffloadF.getBinary()->getArch(); + Image.Image = std::move(*ImageOrError); + + std::unique_ptr Binary = + MemoryBuffer::getMemBufferCopy(OffloadBinary::write(Image)); + auto NewBinaryOrErr = OffloadBinary::create(*Binary); + assert(NewBinaryOrErr); + BitcodeInputFiles[Task - 1] = + OffloadFile(std::move(*NewBinaryOrErr), std::move(Binary)); + } + // runLLVMToSPIRVTranslation takes in an r-value ref of arguments + // and writes to it, so we need to copy the arguments. + SmallVector SPIRVArgsCopy = SPIRVArgs; + auto SPVFile = + sycl::runLLVMToSPIRVTranslation(*IRFile, std::move(SPIRVArgsCopy)); + if (!SPVFile) + reportError(SPVFile.takeError()); + Files[Task] = *SPVFile; + // Return false so the thinLTO backend doesn't continue to process this + // module. We already emitted SPIR-V ourselves, so we don't need to do + // anything else. Once the SPIR-V backend is ready, we can remove the + // manual SPIR-V translator call and return true here. + return false; + }; + // Only compile user modules to SPV, not device libraries. + Conf.ThinLTOModulesToCompile = ModulesToCompile; + } return std::make_unique(std::move(Conf), Backend); } @@ -1823,16 +2038,16 @@ bool isValidCIdentifier(StringRef S) { Error linkBitcodeFiles(SmallVectorImpl &InputFiles, SmallVectorImpl &OutputFiles, + SmallVector &BitcodeInputFiles, const ArgList &Args) { llvm::TimeTraceScope TimeScope("Link bitcode files"); const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); StringRef Arch = Args.getLastArgValue(OPT_arch_EQ); - // Early exit for SPIR targets - if (Triple.isSPIROrSPIRV()) + // Early exit for non-thin-LTO SPIR targets + if (Triple.isSPIROrSPIRV() && !sycl::isSYCLThinLTO(Args, Triple)) return Error::success(); - SmallVector BitcodeInputFiles; DenseSet StrongResolutions; DenseSet UsedInRegularObj; DenseSet UsedInSharedLib; @@ -1897,6 +2112,18 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, // LTO Module hook to output bitcode without running the backend. SmallVector BitcodeOutput; + std::vector ModulesToCompile; + SmallVector Files; + if (sycl::isSYCLThinLTO(Args, Triple)) { + for (const OffloadFile &BitcodeInput : BitcodeInputFiles) { + auto ModuleName = BitcodeInput.getBinary()->getFileName(); + // TODO: This is pretty hacky, maybe we could check some module metadata + // or something. + if (ModuleName.find("libsycl-") == std::string::npos) + ModulesToCompile.push_back(ModuleName.str()); + } + } + auto OutputBitcode = [&](size_t, const Module &M) { auto TempFileOrErr = createOutputFile(sys::path::filename(ExecutableName) + "-jit-" + Triple.getTriple(), @@ -1919,8 +2146,10 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, auto LTOBackend = Args.hasArg(OPT_embed_bitcode) || Args.hasArg(OPT_builtin_bitcode_EQ) || Args.hasArg(OPT_clang_backend) - ? createLTO(Args, Features, OutputBitcode) - : createLTO(Args, Features); + ? createLTO(Args, Features, BitcodeInputFiles, Files, + {}, OutputBitcode) + : createLTO(Args, Features, BitcodeInputFiles, Files, + ModulesToCompile); // We need to resolve the symbols so the LTO backend knows which symbols // need to be kept or can be internalized. This is a simplified symbol @@ -1939,6 +2168,17 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, if (!BitcodeFileOrErr) return BitcodeFileOrErr.takeError(); + if (sycl::isSYCLThinLTO(Args, Triple)) { + // Error if any module was not compiled with thinLTO. Other platforms + // can fall back to binary linking if thinLTO fails, but we don't have + // that for SPIR-V (besides spirv-link). In the future we may be able to + // fall back to normal SYCL processing and throw a warning instead of a + // fatal error. + if (auto Err = sycl::validateThinLTOModule( + (*BitcodeFileOrErr)->getSingleBitcodeModule(), Args)) + return Err; + } + // Save the input file and the buffer associated with its memory. const auto Symbols = (*BitcodeFileOrErr)->symbols(); SmallVector Resolutions(Symbols.size()); @@ -1991,7 +2231,7 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, // Run the LTO job to compile the bitcode. size_t MaxTasks = LTOBackend->getMaxTasks(); - SmallVector Files(MaxTasks); + Files.resize(MaxTasks); auto AddStream = [&](size_t Task, const Twine &ModuleName) -> std::unique_ptr { @@ -2353,6 +2593,8 @@ Expected> linkAndWrapDeviceFiles( DenseSet ActiveOffloadKinds; bool HasSYCLOffloadKind = false; bool HasNonSYCLOffloadKinds = false; + SmallVector BitcodeInputFiles; + for (const auto &File : Input) { if (File.getBinary()->getOffloadKind() != OFK_None) ActiveOffloadKinds.insert(File.getBinary()->getOffloadKind()); @@ -2365,9 +2607,21 @@ Expected> linkAndWrapDeviceFiles( // First link and remove all the input files containing bitcode if // the target linker does not support it natively. SmallVector InputFiles; - if (!linkerSupportsLTO(LinkerArgs)) - if (Error Err = linkBitcodeFiles(Input, InputFiles, LinkerArgs)) + llvm::Triple Triple(LinkerArgs.getLastArgValue(OPT_triple_EQ)); + if (!linkerSupportsLTO(LinkerArgs)) { + if (sycl::isSYCLThinLTO(Args, Triple)) { + // For thinLTO, we consider device libs as normal compiler input + // and add them to the files to be processed by the LTO backend. + // Later we set ModulesToCompile so that we don't + // actually emit code for them, we just link in their functions in + // modules that use them. + if (auto Err = sycl::getDeviceLibsForLTO(Input, LinkerArgs, Triple)) + return Err; + } + if (Error Err = linkBitcodeFiles(Input, InputFiles, BitcodeInputFiles, + LinkerArgs)) return Err; + } // Write any remaining device inputs to an output file for the // linker. @@ -2378,77 +2632,131 @@ Expected> linkAndWrapDeviceFiles( InputFiles.emplace_back(*FileNameOrErr); } if (HasSYCLOffloadKind) { - SmallVector InputFiles; - // Write device inputs to an output file for the linker. - for (const OffloadFile &File : Input) { - auto FileNameOrErr = writeOffloadFile(File); - if (!FileNameOrErr) - return FileNameOrErr.takeError(); - InputFiles.emplace_back(*FileNameOrErr); + StringRef TmpOutput; + if (!sycl::isSYCLThinLTO(Args, Triple)) { + // Link the input device files using the device linker for SYCL + // offload. + auto TmpOutputOrErr = sycl::linkDevice(InputFiles, LinkerArgs); + if (!TmpOutputOrErr) + return TmpOutputOrErr.takeError(); + TmpOutput = *TmpOutputOrErr; } - // Link the input device files using the device linker for SYCL - // offload. - auto TmpOutputOrErr = sycl::linkDevice(InputFiles, LinkerArgs); - if (!TmpOutputOrErr) - return TmpOutputOrErr.takeError(); SmallVector InputFilesSYCL; - InputFilesSYCL.emplace_back(*TmpOutputOrErr); - auto SplitModulesOrErr = - UseSYCLPostLinkTool - ? sycl::runSYCLPostLinkTool(InputFilesSYCL, LinkerArgs) - : sycl::runSYCLSplitLibrary(InputFilesSYCL, LinkerArgs, - *SYCLModuleSplitMode); - if (!SplitModulesOrErr) - return SplitModulesOrErr.takeError(); - - auto &SplitModules = *SplitModulesOrErr; - const llvm::Triple Triple(LinkerArgs.getLastArgValue(OPT_triple_EQ)); - if ((Triple.isNVPTX() || Triple.isAMDGCN()) && - LinkerArgs.hasArg(OPT_sycl_embed_ir)) { - // When compiling for Nvidia/AMD devices and the user requested the - // IR to be embedded in the application (via option), run the output - // of sycl-post-link (filetable referencing LLVM Bitcode + symbols) - // through the offload wrapper and link the resulting object to the - // application. - auto OutputFile = - sycl::runWrapperAndCompile(SplitModules, LinkerArgs, /* IsEmbeddedIR */ true); - if (!OutputFile) - return OutputFile.takeError(); - WrappedOutput.push_back(*OutputFile); - } - for (size_t I = 0, E = SplitModules.size(); I != E; ++I) { - SmallVector Files = {SplitModules[I].ModuleFilePath}; - StringRef Arch = LinkerArgs.getLastArgValue(OPT_arch_EQ); - if (Arch.empty()) - Arch = "native"; - SmallVector, 4> BundlerInputFiles; - auto ClangOutputOrErr = - linkDevice(Files, LinkerArgs, true /* IsSYCLKind */); - if (!ClangOutputOrErr) - return ClangOutputOrErr.takeError(); - if (Triple.isNVPTX()) { - auto VirtualArch = StringRef(clang::OffloadArchToVirtualArchString( - clang::StringToOffloadArch(Arch))); - auto PtxasOutputOrErr = - nvptx::ptxas(*ClangOutputOrErr, LinkerArgs, Arch); - if (!PtxasOutputOrErr) - return PtxasOutputOrErr.takeError(); - BundlerInputFiles.emplace_back(*ClangOutputOrErr, VirtualArch); - BundlerInputFiles.emplace_back(*PtxasOutputOrErr, Arch); - auto BundledFileOrErr = - nvptx::fatbinary(BundlerInputFiles, LinkerArgs); - if (!BundledFileOrErr) - return BundledFileOrErr.takeError(); - SplitModules[I].ModuleFilePath = *BundledFileOrErr; - } else if (Triple.isAMDGCN()) { - BundlerInputFiles.emplace_back(*ClangOutputOrErr, Arch); - auto BundledFileOrErr = - amdgcn::fatbinary(BundlerInputFiles, LinkerArgs); - if (!BundledFileOrErr) - return BundledFileOrErr.takeError(); - SplitModules[I].ModuleFilePath = *BundledFileOrErr; - } else { - SplitModules[I].ModuleFilePath = *ClangOutputOrErr; + std::vector SplitModules; + if (sycl::isSYCLThinLTO(Args, Triple)) { + size_t LastSPVFilePath = 0; + for (size_t FileIdx = 0; FileIdx < BitcodeInputFiles.size(); + FileIdx++) { + // After we have run the LTO backend, compute module props/symbol + // table/spv file path and set it up to be used by SYCL image + // creation. + // TODO: Once SYCL image creation is reconsiled with the non-SYCL + // path, we can move all of the thinLTO handling to be more in-line + // with community code. + + // This is a bit hacky but not every BitcodeInputFile will end up as a + // SPV file in InputFiles, for example if it is a device library file. + // If the file name is empty, that means we didn't generate SPV for + // it, so just find the next non-empty file name. Should be easy to + // clean this up later if we go with this overall design. + const OffloadFile &F = BitcodeInputFiles[FileIdx]; + StringRef CodegenPath; + for (size_t OutputNum = LastSPVFilePath; + OutputNum < InputFiles.size(); OutputNum++) { + auto SPVFilePath = InputFiles[OutputNum]; + if (!SPVFilePath.empty()) { + LastSPVFilePath = OutputNum + 1; + CodegenPath = SPVFilePath; + break; + } + } + if (CodegenPath.empty()) + continue; + LLVMContext Context; + auto Buf = MemoryBuffer::getMemBuffer(F.getBinary()->getImage()); + auto ModOrErr = parseBitcodeFile(*Buf, Context); + if (!ModOrErr) + return ModOrErr.takeError(); + auto &M = **ModOrErr; + + llvm::sycl::EntryPointSet EntryPoints; + bool OnlyKernelsAsEntryPoints = + sycl::considerOnlyKernelsAsEntryPoints(Args, Triple); + auto GlobalBinProps = sycl::computeGlobalBinProps(Args, Triple); + for (const Function &F : M.functions()) { + if (llvm::module_split::isEntryPoint(F, OnlyKernelsAsEntryPoints)) + EntryPoints.insert(const_cast(&F)); + } + if (EntryPoints.empty()) + continue; + + auto Properties = llvm::sycl::computeModuleProperties(M, EntryPoints, + GlobalBinProps); + + std::string ModuleSyms = + llvm::sycl::computeModuleSymbolTable(M, EntryPoints); + SplitModules.emplace_back(CodegenPath, std::move(Properties), + ModuleSyms); + } + // We don't need the OffloadFiles anymore, so free them from memory. + BitcodeInputFiles.clear(); + } else { + InputFilesSYCL.emplace_back(TmpOutput); + auto SplitModulesOrErr = + UseSYCLPostLinkTool + ? sycl::runSYCLPostLinkTool(InputFilesSYCL, LinkerArgs) + : sycl::runSYCLSplitLibrary(InputFilesSYCL, LinkerArgs, + *SYCLModuleSplitMode); + if (!SplitModulesOrErr) + return SplitModulesOrErr.takeError(); + SplitModules = std::move(*SplitModulesOrErr); + if ((Triple.isNVPTX() || Triple.isAMDGCN()) && + LinkerArgs.hasArg(OPT_sycl_embed_ir)) { + // When compiling for Nvidia/AMD devices and the user requested the + // IR to be embedded in the application (via option), run the output + // of sycl-post-link (filetable referencing LLVM Bitcode + symbols) + // through the offload wrapper and link the resulting object to the + // application. + auto OutputFile = sycl::runWrapperAndCompile(SplitModules, LinkerArgs, + /* IsEmbeddedIR */ true); + if (!OutputFile) + return OutputFile.takeError(); + WrappedOutput.push_back(*OutputFile); + } + for (size_t I = 0, E = SplitModules.size(); I != E; ++I) { + SmallVector Files = {SplitModules[I].ModuleFilePath}; + StringRef Arch = LinkerArgs.getLastArgValue(OPT_arch_EQ); + if (Arch.empty()) + Arch = "native"; + SmallVector, 4> BundlerInputFiles; + auto ClangOutputOrErr = + linkDevice(Files, LinkerArgs, true /* IsSYCLKind */); + if (!ClangOutputOrErr) + return ClangOutputOrErr.takeError(); + if (Triple.isNVPTX()) { + auto VirtualArch = StringRef(clang::OffloadArchToVirtualArchString( + clang::StringToOffloadArch(Arch))); + auto PtxasOutputOrErr = + nvptx::ptxas(*ClangOutputOrErr, LinkerArgs, Arch); + if (!PtxasOutputOrErr) + return PtxasOutputOrErr.takeError(); + BundlerInputFiles.emplace_back(*ClangOutputOrErr, VirtualArch); + BundlerInputFiles.emplace_back(*PtxasOutputOrErr, Arch); + auto BundledFileOrErr = + nvptx::fatbinary(BundlerInputFiles, LinkerArgs); + if (!BundledFileOrErr) + return BundledFileOrErr.takeError(); + SplitModules[I].ModuleFilePath = *BundledFileOrErr; + } else if (Triple.isAMDGCN()) { + BundlerInputFiles.emplace_back(*ClangOutputOrErr, Arch); + auto BundledFileOrErr = + amdgcn::fatbinary(BundlerInputFiles, LinkerArgs); + if (!BundledFileOrErr) + return BundledFileOrErr.takeError(); + SplitModules[I].ModuleFilePath = *BundledFileOrErr; + } else { + SplitModules[I].ModuleFilePath = *ClangOutputOrErr; + } } } @@ -2468,7 +2776,9 @@ Expected> linkAndWrapDeviceFiles( if (HasNonSYCLOffloadKinds) { // First link and remove all the input files containing bitcode. SmallVector InputFiles; - if (Error Err = linkBitcodeFiles(Input, InputFiles, LinkerArgs)) + SmallVector BitcodeInputFiles; + if (Error Err = linkBitcodeFiles(Input, InputFiles, BitcodeInputFiles, + LinkerArgs)) return Err; // Write any remaining device inputs to an output file for the linker. diff --git a/libdevice/device.h b/libdevice/device.h index 360af54f9b4c4..e2f0da6e3c082 100644 --- a/libdevice/device.h +++ b/libdevice/device.h @@ -17,9 +17,9 @@ #if defined(__SPIR__) || defined(__SPIRV__) || defined(__NVPTX__) #ifdef __SYCL_DEVICE_ONLY__ -#define DEVICE_EXTERNAL SYCL_EXTERNAL __attribute__((weak)) +#define DEVICE_EXTERNAL SYCL_EXTERNAL #else // __SYCL_DEVICE_ONLY__ -#define DEVICE_EXTERNAL __attribute__((weak)) +#define DEVICE_EXTERNAL #endif // __SYCL_DEVICE_ONLY__ #define DEVICE_EXTERN_C DEVICE_EXTERNAL EXTERN_C diff --git a/libdevice/fallback-cassert.cpp b/libdevice/fallback-cassert.cpp index 5d3c99d63c556..1c685737002f4 100644 --- a/libdevice/fallback-cassert.cpp +++ b/libdevice/fallback-cassert.cpp @@ -114,9 +114,4 @@ DEVICE_EXTERN_C void __devicelib_assert_fail(const char *expr, const char *file, __assertfail(expr, file, line, func, 1); } -DEVICE_EXTERN_C void _wassert(const char *_Message, const char *_File, - unsigned _Line) { - __assertfail(_Message, _File, _Line, 0, 1); -} - #endif diff --git a/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h index 0da3706ad3626..cb6049910b934 100644 --- a/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h +++ b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h @@ -321,6 +321,7 @@ splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings); bool isESIMDFunction(const Function &F); bool canBeImportedFunction(const Function &F); +bool isEntryPoint(const Function &F, bool EmitOnlyKernelsAsEntryPoints); } // namespace module_split diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLLinkedModuleProcessor.h b/llvm/include/llvm/SYCLLowerIR/SYCLLinkedModuleProcessor.h new file mode 100644 index 0000000000000..171992dfb2586 --- /dev/null +++ b/llvm/include/llvm/SYCLLowerIR/SYCLLinkedModuleProcessor.h @@ -0,0 +1,22 @@ +//===-- SYCLLinkedModuleProcessor.h - finalize a fully linked module ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The file contains a number of functions to create a pass that can be called +// by the LTO backend that will finalize a fully-linked module. +//===----------------------------------------------------------------------===// +#pragma once +#include "SpecConstants.h" +namespace llvm { + +class PassRegistry; +class ModulePass; +ModulePass * + createSYCLLinkedModuleProcessorPass(llvm::SpecConstantsPass::HandlingMode); +void initializeSYCLLinkedModuleProcessorPass(PassRegistry &); + +} // namespace llvm diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index bb3c9f7acdb8e..924fe1389f311 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1077,8 +1077,8 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef Syms, for (const std::string &Name : Conf.ThinLTOModulesToCompile) { if (BM.getModuleIdentifier().contains(Name)) { ThinLTO.ModulesToCompile->insert({BM.getModuleIdentifier(), BM}); - llvm::errs() << "[ThinLTO] Selecting " << BM.getModuleIdentifier() - << " to compile\n"; + LLVM_DEBUG(dbgs() << "[ThinLTO] Selecting " << BM.getModuleIdentifier() + << " to compile\n"); } } } diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt index ed578aaeb21c0..bfa3d3f6dba7f 100644 --- a/llvm/lib/SYCLLowerIR/CMakeLists.txt +++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt @@ -65,6 +65,7 @@ add_llvm_component_library(LLVMSYCLLowerIR SYCLDeviceRequirements.cpp SYCLKernelParamOptInfo.cpp SYCLJointMatrixTransform.cpp + SYCLLinkedModuleProcessor.cpp SYCLPropagateAspectsUsage.cpp SYCLPropagateJointMatrixUsage.cpp SYCLVirtualFunctionsAnalysis.cpp diff --git a/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp index 6068ce58f414f..6db9aece81569 100644 --- a/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp +++ b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp @@ -117,32 +117,6 @@ bool isKernel(const Function &F) { F.getCallingConv() == CallingConv::AMDGPU_KERNEL; } -bool isEntryPoint(const Function &F, bool EmitOnlyKernelsAsEntryPoints) { - // Skip declarations, if any: they should not be included into a vector of - // entry points groups or otherwise we will end up with incorrectly generated - // list of symbols. - if (F.isDeclaration()) - return false; - - // Kernels are always considered to be entry points - if (isKernel(F)) - return true; - - if (!EmitOnlyKernelsAsEntryPoints) { - // If not disabled, SYCL_EXTERNAL functions with sycl-module-id attribute - // are also considered as entry points (except __spirv_* and __sycl_* - // functions) - return llvm::sycl::utils::isSYCLExternalFunction(&F) && - !isSpirvSyclBuiltin(F.getName()) && !isESIMDBuiltin(F.getName()) && - !isGenericBuiltin(F.getName()); - } - - // Even if we are emitting only kernels as entry points, virtual functions - // should still be treated as entry points, because they are going to be - // outlined into separate device images and linked in later. - return F.hasFnAttribute("indirectly-callable"); -} - // Represents "dependency" or "use" graph of global objects (functions and // global variables) in a module. It is used during device code split to // understand which global variables and functions (other than entry points) @@ -445,6 +419,32 @@ class ModuleSplitter : public ModuleSplitterBase { namespace llvm { namespace module_split { +bool isEntryPoint(const Function &F, bool EmitOnlyKernelsAsEntryPoints) { + // Skip declarations, if any: they should not be included into a vector of + // entry points groups or otherwise we will end up with incorrectly generated + // list of symbols. + if (F.isDeclaration()) + return false; + + // Kernels are always considered to be entry points + if (isKernel(F)) + return true; + + if (!EmitOnlyKernelsAsEntryPoints) { + // If not disabled, SYCL_EXTERNAL functions with sycl-module-id attribute + // are also considered as entry points (except __spirv_* and __sycl_* + // functions) + return llvm::sycl::utils::isSYCLExternalFunction(&F) && + !isSpirvSyclBuiltin(F.getName()) && !isESIMDBuiltin(F.getName()) && + !isGenericBuiltin(F.getName()); + } + + // Even if we are emitting only kernels as entry points, virtual functions + // should still be treated as entry points, because they are going to be + // outlined into separate device images and linked in later. + return F.hasFnAttribute("indirectly-callable"); +} + std::optional convertStringToSplitMode(StringRef S) { static const StringMap Values = {{"kernel", SPLIT_PER_KERNEL}, {"source", SPLIT_PER_TU}, diff --git a/llvm/lib/SYCLLowerIR/SYCLLinkedModuleProcessor.cpp b/llvm/lib/SYCLLowerIR/SYCLLinkedModuleProcessor.cpp new file mode 100644 index 0000000000000..672d49d6ad161 --- /dev/null +++ b/llvm/lib/SYCLLowerIR/SYCLLinkedModuleProcessor.cpp @@ -0,0 +1,45 @@ +//===-- SYCLLinkedModuleProcessor.cpp - finalize a fully linked module ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// See comments in the header. +//===----------------------------------------------------------------------===// + +#include "llvm/SYCLLowerIR/SYCLLinkedModuleProcessor.h" + +#include "llvm/Pass.h" + +#define DEBUG_TYPE "sycl-linked-module-processor" +using namespace llvm; + +namespace { +class SYCLLinkedModuleProcessor : public ModulePass { +public: + static char ID; + SYCLLinkedModuleProcessor(SpecConstantsPass::HandlingMode Mode) + : ModulePass(ID), Mode(Mode) { + initializeSYCLLinkedModuleProcessorPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + // TODO: determine if we need to run other passes + ModuleAnalysisManager MAM; + SpecConstantsPass SCP(Mode); + auto PA = SCP.run(M, MAM); + return !PA.areAllPreserved(); + } + +private: + SpecConstantsPass::HandlingMode Mode; +}; +} // namespace +char SYCLLinkedModuleProcessor::ID = 0; +INITIALIZE_PASS(SYCLLinkedModuleProcessor, "SYCLLinkedModuleProcessor", + "Finalize a fully linked SYCL module", false, false) +ModulePass *llvm::createSYCLLinkedModuleProcessorPass( + SpecConstantsPass::HandlingMode Mode) { + return new SYCLLinkedModuleProcessor(Mode); +} diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md index 1935356d0a3a8..7843b5d3b88e6 100644 --- a/sycl/doc/design/CompilerAndRuntimeDesign.md +++ b/sycl/doc/design/CompilerAndRuntimeDesign.md @@ -550,6 +550,9 @@ unit) - `off` - disables device code split. If `-fno-sycl-rdc` is specified, the behavior is the same as `per_source` +If ThinLTO is enabled, device code splitting is run during the compilation +stage. See [here](ThinLTO.md) for more information. + ##### Symbol table generation TBD diff --git a/sycl/doc/design/ThinLTO.md b/sycl/doc/design/ThinLTO.md new file mode 100644 index 0000000000000..58232f0aff157 --- /dev/null +++ b/sycl/doc/design/ThinLTO.md @@ -0,0 +1,149 @@ +# ThinLTO for SYCL + +This document describes the purpose and design of ThinLTO for SYCL. + +**NOTE**: This is not the final version. The document is still in progress. + +## Background + +With traditional SYCL device code linking, all user code is linked together +along with device libraries into a single huge module and then split and +processed by `sycl-post-link`. This requires sequential processing, has a large +memory footprint, and differs from the linking flow for AMD and NVIDIA devices. + +## Summary + +SYCL ThinLTO will hook into the existing community mechanism to run LTO as part +of device linking inside `clang-linker-wrapper`. We split the device images +early at compilation time, and at link time we use ThinLTO's function importing +feature to bring in the definitions for referenced functions. Only the new +offload model is supported. + +## Device code compilation time changes + +Most of the changes for ThinLTO occur during device link time, however there is +one major change during compilation (-c) time: we now run device code split +during compilation instead of linking. The main reason for doing this is +increased parallelization. Many compilation jobs can be run at the same time, +but linking happens once total for the application. Device code split is +currently a common source of performance issues. + +Splitting early means that the resulting IR after splitting is not complete, it +still may contain calls to functions (user code and/or the SYCL device +libraries) defined in other translation units. + +We rely on the assumption that all function definitions matching a declaration +will be the same and we can let ThinLTO pull in any one. + +For example, let's start with user device code that defines a `SYCL_EXTERNAL` +function `foo` in translation unit `tu_foo`. There is also another translation +unit `tu_bar` that references `foo`. During the early device code splitting run +of `tu_foo`, we may find that more than one of the resultant device images +contain a definition for `foo`. + +We assert that any function definition for `foo` that is deemed a match by the +ThinLTO infrastructure during the processing of `tu_bar` is valid. + +As a result of running early device code split, the fat object file generated as +part of device compilation may contain multiple device code images. + +## Device code link time changes + +Before we go into the link time changes for SYCL, let's understand the device +linking flow for AMD/NVIDIA devices: + +![Community linking flow](images/ThinLTOCommunityFlow.svg) + +SYCL has two differentiating requirements: + +1) The SPIR-V backend is not production ready and the SPIR-V translator is used. +2) The SYCL runtime requires metadata (module properties and module symbol +table) computed from device images that will be stored along the device images +in the fat executable. + +The effect of requirement 1) is that instead of letting ThinLTO call the SPIR-V +backend, we add a callback that runs right before CodeGen would run. In that +callback, we call the SPIR-V translator and store the resultant file path for +use later, and we instruct the ThinLTO framework to not perform CodeGen. + +An interesting additional fact about requirement 2) is that we actually need to +process fully linked module to accurate compute the module properties. One +example where we need the full module is to [compute the required devicelib +mask](https://github.com/intel/llvm/blob/sycl/llvm/lib/SYCLLowerIR/SYCLDeviceLibReqMask.cpp). +If we only process the device code that was included in the original fat object +input to `clang-linker-wrapper`, we will miss devicelib calls in referenced +`SYCL_EXTERNAL` functions. + +The effect of requirement 2) is that we store the fully linked device image for +metadata computation in the SYCL-specific handing code after the ThinLTO +framework has completed. Another option would be to try to compute the metadata +inside the ThinLTO framework callbacks, but this would require SYCL-specific +arguments to many caller functions in the stack and pollute community code. + +Here is the current ThinLTO flow for SYCL: + +![SYCL linking flow](images/ThinLTOSYCLFlow.svg) + +We add a `PreCodeGenModuleHook` function to the `LTOConfig` object so that we +can process the fully linked module without running the backend. + +However, the flow is not ideal for many reasons: + +1) We are relying on the external `llvm-spirv` tool instead of the SPIR-V +backend. We could slightly improve this issue by using a library call to the +SPIR-V translator instead of the tool, however the library API requires setting +up an object to represent the arguments while we only have strings, and it's +non-trivial to parse the strings to figure out how to create the argument +object. Since we plan to use the SPIR-V backend in the long term, this does not +seem to be worth the effort. + +2) We manually run passes inside `PreCodeGenModuleHook`. This is because we +don't run CodeGen, so we can't take advantage of the `PreCodeGenPassesHook` +field of `LTOConfig` to run some custom passes, as those passes are only run +when we actually are going to run CodeGen. + +3) We have to store the fully linked module. This is needed because we need a +fully linked module to accurately compute metadata, see the above explanation of +SYCL requirement 2). We could get around storing the module by computing the +metadata inside the LTO framework and storing it for late use by the SYCL +bundling code, but doing this would require even more SYCL-only customizations +including even more new function arguments and modifications of the +`OffloadFile` class. There are also compilations because the LTO framework is +multithreaded, and not all LLVM data structures are thread safe. + +The proposed long-term SYCL ThinLTO flow is as follows: + +![SYCL SPIR-V backend linking flow](images/ThinLTOSYCLSPIRVBackendFlow.svg) + +The biggest difference here is that we are running CodeGen using the SPIR-V +backend. + +Also, instead of using a lambda function in the `PreCodeGenModuleHook` callback +to run SYCL finalization passes, we can take advantage of the +`PreCodeGenPassesHook` field to add passes to the pass manager that the LTO +framework will run. + +It is possible that the number of device images in the fat executable and which +device image contains which kernel is different with ThinLTO enabled, but we do +expect this to have any impact on correctness or performance, nor we do expect +users to care. + +## Current limitations + +`-O0`: Compiling with `-O0` prevent clang from generating ThinLTO metadata +during the compilation phase. In the current implementation, this is an error. +In the final version, we could either silently fall back to full LTO or generate +ThinLTO metadata even for `-O0`. + +SYCL libdevice: Current all `libdevice` functions are explicitly marked to be +weak symbols. The ThinLTO framework does not consider a definition of function +with weak linkage as it cannot be sure that this definition is the correct one. +Ideally we could remove the weak symbol annotation. + +No binary linkage: The SPIR-V target does not currently have a production +quality binary linker. This means that we must generate a fully linked image as +part of device linkage. At least for AMD devices, this is not a requirement as +`lld` is used for the final link which can resolve any unresolved symbols. +`-fno-gpu-rdc` is default for AMD, so in that case it can call `lld` during +compile, but if `-fno-gpu-rdc` is passed, the lld call happens as part of +`clang-linker-wrapper` to resolve any symbols not resolved by ThinLTO. diff --git a/sycl/doc/design/images/ThinLTOCommunityFlow.svg b/sycl/doc/design/images/ThinLTOCommunityFlow.svg new file mode 100644 index 0000000000000..f2fe257a07af8 --- /dev/null +++ b/sycl/doc/design/images/ThinLTOCommunityFlow.svg @@ -0,0 +1 @@ +Extract device codefrom inputsclang-linker-wrapperProcess symbols andadd device code to LTOframeworkThinLTOframeworkImport functiondefinitionsOptimizationCodegenBundlingCall host linkerwith bundled fatobject \ No newline at end of file diff --git a/sycl/doc/design/images/ThinLTOSYCLFlow.svg b/sycl/doc/design/images/ThinLTOSYCLFlow.svg new file mode 100644 index 0000000000000..622f4d20d158c --- /dev/null +++ b/sycl/doc/design/images/ThinLTOSYCLFlow.svg @@ -0,0 +1 @@ +Extractdevicecodefrominputsclang-linker-wrapperProcesssymbolsandadddevicecodetoLTOframeworkThinLTOframeworkImportfunctiondefinitionsOptimizationBundlingCall host linkerwith bundledfat objectPreCodeGenModuleHookCall SPIR-VtranslatorStore fullylinkedmoduleEarlyexitComputemetadata for fullylinked modulesRun linkedmodulefinalizationpasses \ No newline at end of file diff --git a/sycl/doc/design/images/ThinLTOSYCLSPIRVBackendFlow.svg b/sycl/doc/design/images/ThinLTOSYCLSPIRVBackendFlow.svg new file mode 100644 index 0000000000000..2b048f531b8b7 --- /dev/null +++ b/sycl/doc/design/images/ThinLTOSYCLSPIRVBackendFlow.svg @@ -0,0 +1 @@ +Extractdevicecodefrominputsclang-linker-wrapperProcesssymbolsandadddevicecodetoLTOframeworkThinLTOframeworkImportfunctiondefinitionsOptimizationBundlingCall host linkerwith bundledfat objectPreCodeGenPassesHookStore fullylinkedmoduleCodegenComputemetadata for fullylinked modulesRun linkedmodulefinalizationpasses \ No newline at end of file