Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 51 additions & 2 deletions clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1293,6 +1293,50 @@ class BinaryWrapper {
appendToGlobalDtors(M, Func, /*Priority*/ 1);
}

void createSyclRegisterWithAtexitUnregister(GlobalVariable *BinDesc) {
auto *UnregFuncTy =
FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
auto *UnregFunc =
Function::Create(UnregFuncTy, GlobalValue::InternalLinkage,
"sycl.descriptor_unreg.atexit", &M);
UnregFunc->setSection(".text.startup");

// Declaration for __sycl_unregister_lib(void*).
auto *UnregTargetTy =
FunctionType::get(Type::getVoidTy(C), getPtrTy(), false);
FunctionCallee UnregTargetC =
M.getOrInsertFunction("__sycl_unregister_lib", UnregTargetTy);

IRBuilder<> UnregBuilder(BasicBlock::Create(C, "entry", UnregFunc));
UnregBuilder.CreateCall(UnregTargetC, BinDesc);
UnregBuilder.CreateRetVoid();

auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
auto *RegFunc = Function::Create(RegFuncTy, GlobalValue::InternalLinkage,
"sycl.descriptor_reg", &M);
RegFunc->setSection(".text.startup");

auto *RegTargetTy =
FunctionType::get(Type::getVoidTy(C), getPtrTy(), false);
FunctionCallee RegTargetC =
M.getOrInsertFunction("__sycl_register_lib", RegTargetTy);

// `atexit` takes a `void(*)()` function pointer. In LLVM IR, this is
// typically represented as `i32 (ptr)`.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

im not sure i understand this comment, to me it seems like the abi would be different so we can't use i32 (ptr) to represent void(*)()

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I simplified it. I just wanted to describe atexit.

FunctionType *AtExitTy =
FunctionType::get(Type::getInt32Ty(C), getPtrTy(), false);
FunctionCallee AtExitC = M.getOrInsertFunction("atexit", AtExitTy);

IRBuilder<> RegBuilder(BasicBlock::Create(C, "entry", RegFunc));
RegBuilder.CreateCall(RegTargetC, BinDesc);
RegBuilder.CreateCall(AtExitC, UnregFunc);
RegBuilder.CreateRetVoid();

// Add this function to global destructors.
// Match priority of __tgt_register_lib
appendToGlobalCtors(M, RegFunc, /*Priority*/ 1);
}

public:
BinaryWrapper(StringRef Target, StringRef ToolName,
StringRef SymPropBCFiles = "")
Expand Down Expand Up @@ -1370,8 +1414,13 @@ class BinaryWrapper {

if (EmitRegFuncs) {
GlobalVariable *Desc = *DescOrErr;
createRegisterFunction(Kind, Desc);
createUnregisterFunction(Kind, Desc);
if (Kind == OffloadKind::SYCL &&
Triple(M.getTargetTriple()).isOSWindows()) {
createSyclRegisterWithAtexitUnregister(Desc);
} else {
createRegisterFunction(Kind, Desc);
createUnregisterFunction(Kind, Desc);
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't necessarily have to be part of this patch, but could you please document this approach in a design document?

}
}
return &M;
Expand Down
54 changes: 52 additions & 2 deletions llvm/lib/Frontend/Offloading/SYCLOffloadWrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LineIterator.h"
#include "llvm/Support/PropertySetIO.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <memory>
#include <string>
Expand Down Expand Up @@ -734,6 +735,51 @@ struct Wrapper {
// Add this function to global destructors.
appendToGlobalDtors(M, Func, /*Priority*/ 1);
}

void createSyclRegisterWithAtexitUnregister(GlobalVariable *FatbinDesc) {
auto *UnregFuncTy =
FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
auto *UnregFunc =
Function::Create(UnregFuncTy, GlobalValue::InternalLinkage,
"sycl.descriptor_unreg.atexit", &M);
UnregFunc->setSection(".text.startup");

// Declaration for __sycl_unregister_lib(void*).
auto *UnregTargetTy =
FunctionType::get(Type::getVoidTy(C), PointerType::getUnqual(C), false);
FunctionCallee UnregTargetC =
M.getOrInsertFunction("__sycl_unregister_lib", UnregTargetTy);

// Body of the unregister wrapper.
IRBuilder<> UnregBuilder(BasicBlock::Create(C, "entry", UnregFunc));
UnregBuilder.CreateCall(UnregTargetC, FatbinDesc);
UnregBuilder.CreateRetVoid();

auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
auto *RegFunc = Function::Create(RegFuncTy, GlobalValue::InternalLinkage,
"sycl.descriptor_reg", &M);
RegFunc->setSection(".text.startup");

auto *RegTargetTy =
FunctionType::get(Type::getVoidTy(C), PointerType::getUnqual(C), false);
FunctionCallee RegTargetC =
M.getOrInsertFunction("__sycl_register_lib", RegTargetTy);

// `atexit` takes a `void(*)()` function pointer. In LLVM IR, this is
// typically represented as `i32 (ptr)`.
FunctionType *AtExitTy = FunctionType::get(
Type::getInt32Ty(C), PointerType::getUnqual(C), false);
FunctionCallee AtExitC = M.getOrInsertFunction("atexit", AtExitTy);

IRBuilder<> RegBuilder(BasicBlock::Create(C, "entry", RegFunc));
RegBuilder.CreateCall(RegTargetC, FatbinDesc);
RegBuilder.CreateCall(AtExitC, UnregFunc);
RegBuilder.CreateRetVoid();

// Finally, add to global constructors.
appendToGlobalCtors(M, RegFunc, /*Priority*/ 1);
}

}; // end of Wrapper

} // anonymous namespace
Expand All @@ -747,7 +793,11 @@ Error llvm::offloading::wrapSYCLBinaries(llvm::Module &M,
return createStringError(inconvertibleErrorCode(),
"No binary descriptors created.");

W.createRegisterFatbinFunction(Desc);
W.createUnregisterFunction(Desc);
if (Triple(M.getTargetTriple()).isOSWindows()) {
W.createSyclRegisterWithAtexitUnregister(Desc);
} else {
W.createRegisterFatbinFunction(Desc);
W.createUnregisterFunction(Desc);
}
return Error::success();
}
19 changes: 0 additions & 19 deletions sycl/source/detail/device_global_map.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,25 +73,6 @@ class DeviceGlobalMap {
}
}

void eraseEntries(const RTDeviceBinaryImage *Img) {
const auto &DeviceGlobals = Img->getDeviceGlobals();
std::lock_guard<std::mutex> DeviceGlobalsGuard(MDeviceGlobalsMutex);
for (const sycl_device_binary_property &DeviceGlobal : DeviceGlobals) {
if (auto DevGlobalIt = MDeviceGlobals.find(DeviceGlobal->Name);
DevGlobalIt != MDeviceGlobals.end()) {
auto findDevGlobalByValue = std::find_if(
MPtr2DeviceGlobal.begin(), MPtr2DeviceGlobal.end(),
[&DevGlobalIt](
const std::pair<const void *, DeviceGlobalMapEntry *> &Entry) {
return Entry.second == DevGlobalIt->second.get();
});
if (findDevGlobalByValue != MPtr2DeviceGlobal.end())
MPtr2DeviceGlobal.erase(findDevGlobalByValue);
MDeviceGlobals.erase(DevGlobalIt);
}
}
}

void addOrInitialize(const void *DeviceGlobalPtr, const char *UniqueId) {
std::lock_guard<std::mutex> DeviceGlobalsGuard(MDeviceGlobalsMutex);
auto ExistingDeviceGlobal = MDeviceGlobals.find(UniqueId);
Expand Down
7 changes: 0 additions & 7 deletions sycl/source/detail/program_manager/program_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2180,8 +2180,6 @@ void ProgramManager::removeImages(sycl_device_binaries DeviceBinary) {
m_VFSet2BinImage.erase(SetName);
}

m_DeviceGlobals.eraseEntries(Img);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The device global entries in the map often point to the global variables in host code corresponding to the device_global. Not cleaning these up when we remove these may result in dangling pointers. Is that safe to leave around in the map?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There has been some other change to SYCLOS, so I've downgraded this to draft. I've been rechecking things.

Presently in SYCLOS there are TWO different code paths that lead to the ~DeviceGlboalUSMMem destructor. That destructor checks to see its Ptr and Event have been cleared ( by DeviceGlobalMapEntry::removeAssociatedResources() ) and asserts if they have not. But they are only cleared in one of the two paths.

PATH A:
~context_impl() => DeviceGlobalMapEntry::removeAssociatedResources() => ~DeviceGlobalUSMMem()

PATH B:
__sycl_unregister_lib() => program_mananger.removeImages() => DeviceGlobalMap::eraseEntries => MDeviceGlobals.erase(DevGlobalIT) => ~DeviceGlobalUSMMem()
but that last destruction in the path only happens if the map entry wasn't deleted earlier.

__sycl_unregister_lib() is called every time a user shared library (or app) that contains kernels is unloaded. We do need to clean up these images correctly, but using device globals across different shared libraries is not possible right now.

~context_impl() destructor is called when the context is destroyed, which is usually just when the app shuts down.

On Linux, during app shutdown ~context_impl() fires BEFORE __sycl_unregister_lib() so we go down path A, then path B. Everything is great.

On Windows, the order is reversed. __sycl_unregister_lib() is called first. And, furthermore, it has no way of telling at the time it is called if it's shutting down or just unloading some .so (as in my e2e test). So path B fires first, which means we hit the asserting destructor because removeAssociatedResources() is not yet called.

Given that we don't support using the same device global in different shared libraries anyway, my fix should be correct and safe.

BUT, we shall see. Like I said, there have been some other changes, and I am retreating to Draft to try and see what needs to be done now.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But what happens if I load a shared library with a device global, then unload it and load it again. The addresses would be different. Would it be able to update the maps?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can't "share" a single device global between and app and a shared library right now. Or, at least, I can't. I can do that stunt with regular global variables, but not with device globals. I'd have to consult my notes to remember the exact failure, but I'm pretty sure it was linking.

So, given that limitation, reloading a shared library with a device global should work correctly. But I'll test it to be sure and maybe expand the testing.

Copy link
Contributor Author

@cperkinsintel cperkinsintel Aug 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@steffenlarsen - Reloading a shared library with a device global triggers an assertion in both SYCLOS and the fix to __sycl_unregister_lib that I'm working on.

Assertion `!MDeviceGlobalPtr && "Device global pointer has already been initialized."' failed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@steffenlarsen I might need your advice on this last bit.

When we are reloading a shared lib with a device global, we first unload, which ends up calling eraseEntries(Img) but that doesn't actually erase any entries. The Img->getDeviceGlobals() is empty. So there are no matches.

The reason for that seems to be that the device globals from the shared lib don't originate from any RTBinaryImage instead, they originate from the __sycl_device_global_registration() call in the integration footer.

__sycl_device_global_registration::__sycl_device_global_registration() noexcept {
device_global_map::add((void *)&::DGVarsame, "_Z9DGVarsame");
}

I didn't even know that was a possibility. I see no mechanism for removing device globals added in this way.

We have a couple of options:

  • remove the asserts that stress out about MDeviceGlobals being set already. This works and allows shared libraries with device globals to be reloaded. Would there be a memory leak? Probably. Would there be other consequences? Not likely.
  • figure out a way to remove these integration footer device globals. Might be simple. I haven't thought about it.
  • refactor. We presently have device globals coming in from the kernel compiler, the binary image, the integration footer, and being erased/released during image removal, context destruction, and the ManagedDeviceGlobalRegistry destruction. Maybe that could be unified? It's hard to say - a lot of it seems to be recently changed, so maybe other things are under way.
  • ignore for now.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed offline. Let's change the assert for now.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I changed the assert last night and tested it Win and Lin. Everything was fine. But the CI was experiencing problems. I managed to reproduce and fix on battlemage, but afterwards the CI continues to have problems and I am unable to reproduce the problem locally (Win + battlemage). The CI seems to be able to reproduce it , but not for me. Passes every single time.
But whatever the failure is, it's definitely in this load/reload thing.

So I'm reverting those changes back to where we had the discussion and the tests were passing. I think we should proceed with "ignore for now" option and we can open a ticket about the load/reload issue.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since it isn't a regression, I am okay with making it a follow-up. We will need a tracker for it though.

Copy link
Contributor Author

@cperkinsintel cperkinsintel Aug 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. I'll make a tracker once this is merged so I can use its test as a reproducer. It demonstrates it easily when -DWITH_DEVICE_GLOBALS is added.


{
std::lock_guard<std::mutex> HostPipesGuard(m_HostPipesMutex);
auto HostPipes = Img->getHostPipes();
Expand Down Expand Up @@ -3824,10 +3822,5 @@ extern "C" void __sycl_register_lib(sycl_device_binaries desc) {

// Executed as a part of current module's (.exe, .dll) static initialization
extern "C" void __sycl_unregister_lib(sycl_device_binaries desc) {
// Partial cleanup is not necessary at shutdown
#ifndef _WIN32
if (!sycl::detail::GlobalHandler::instance().isOkToDefer())
return;
sycl::detail::ProgramManager::getInstance().removeImages(desc);
#endif
}
3 changes: 3 additions & 0 deletions sycl/test-e2e/Basic/stream/zero_buffer_size.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
// UNSUPPORTED: hip
// UNSUPPORTED-TRACKER: CMPLRLLVM-69478

// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

Expand Down
25 changes: 25 additions & 0 deletions sycl/test-e2e/IntermediateLib/Inputs/incrementing_lib.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#include <sycl/detail/core.hpp>

#if defined(_WIN32)
#define API_EXPORT __declspec(dllexport)
#else
#define API_EXPORT
#endif

#ifndef INC
#define INC 1
#endif

#ifndef CLASSNAME
#define CLASSNAME same
#endif

extern "C" API_EXPORT void performIncrementation(sycl::queue &q,
sycl::buffer<int, 1> &buf) {
sycl::range<1> r = buf.get_range();
q.submit([&](sycl::handler &cgh) {
auto acc = buf.get_access<sycl::access::mode::write>(cgh);
cgh.parallel_for<class CLASSNAME>(
r, [=](sycl::id<1> idx) { acc[idx] += INC; });
});
}
142 changes: 142 additions & 0 deletions sycl/test-e2e/IntermediateLib/multi_lib_app.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
// UNSUPPORTED: cuda || hip
// UNSUPPORTED-TRACKER: CMPLRLLVM-69415

// REQUIRES: level_zero

// DEFINE: %{fPIC_flag} = %if windows %{%} %else %{-fPIC%}
// DEFINE: %{shared_lib_ext} = %if windows %{dll%} %else %{so%}

// clang-format off
// IMPORTANT -DSO_PATH='R"(%T)"' WTF ??
// We need to capture %T, the build directory, in a string
// and the normal STRINGIFY() macros hack won't work.
// Because on Windows, the path delimiters are \,
// which C++ preprocessor converts to escape sequences,
// which becomes a nightmare.
// SO the hack here is to put heredoc in the definition
// and use single quotes, which Python forgivingly accepts.
// clang-format on

// RUN: %{build} %{fPIC_flag} -DSO_PATH='R"(%T)"' -o %t.out

// RUN: %clangxx -fsycl %{fPIC_flag} -shared -DINC=1 -o %T/lib_a.%{shared_lib_ext} %S/Inputs/incrementing_lib.cpp
// RUN: %clangxx -fsycl %{fPIC_flag} -shared -DINC=2 -o %T/lib_b.%{shared_lib_ext} %S/Inputs/incrementing_lib.cpp
// RUN: %clangxx -fsycl %{fPIC_flag} -shared -DINC=4 -o %T/lib_c.%{shared_lib_ext} %S/Inputs/incrementing_lib.cpp

// RUN: env UR_L0_LEAKS_DEBUG=1 %{run} %t.out

// This test uses a kernel of the same name in three different shared libraries.
// It loads each library, calls the kernel, and checks that the incrementation
// is done correctly, and then unloads the library.
// This test ensures that __sycl_register_lib() and __sycl_unregister_lib()
// are called correctly, and that the device images are cleaned up properly.

#include <sycl/detail/core.hpp>

using namespace sycl::ext::oneapi::experimental;


#ifdef _WIN32
#include <windows.h>

void *loadOsLibrary(const std::string &LibraryPath) {
HMODULE h =
LoadLibraryExA(LibraryPath.c_str(), NULL, LOAD_WITH_ALTERED_SEARCH_PATH);
return (void *)h;
}
int unloadOsLibrary(void *Library) {
return FreeLibrary((HMODULE)Library) ? 0 : 1;
}
void *getOsLibraryFuncAddress(void *Library, const std::string &FunctionName) {
return (void *)GetProcAddress((HMODULE)Library, FunctionName.c_str());
}

#else
#include <dlfcn.h>

void *loadOsLibrary(const std::string &LibraryPath) {
void *so = dlopen(LibraryPath.c_str(), RTLD_NOW);
if (!so) {
char *Error = dlerror();
std::cerr << "dlopen(" << LibraryPath << ") failed with <"
<< (Error ? Error : "unknown error") << ">" << std::endl;
}
return so;
}

int unloadOsLibrary(void *Library) { return dlclose(Library); }

void *getOsLibraryFuncAddress(void *Library, const std::string &FunctionName) {
return dlsym(Library, FunctionName.c_str());
}
#endif

// Define the function pointer type for performIncrementation
using IncFuncT = void(sycl::queue &, sycl::buffer<int, 1> &);

void initializeBuffer(sycl::buffer<int, 1> &buf) {
auto acc = sycl::host_accessor<int, 1>(buf);
for (size_t i = 0; i < buf.size(); ++i)
acc[i] = 0;
}

void checkIncrementation(sycl::buffer<int, 1> &buf, int val) {
auto acc = sycl::host_accessor<int, 1>(buf);
for (size_t i = 0; i < buf.size(); ++i) {
std::cout << acc[i] << " ";
assert(acc[i] == val);
}
std::cout << std::endl;
}

int main() {
sycl::queue q;

sycl::range<1> r(8);
sycl::buffer<int, 1> buf(r);
initializeBuffer(buf);

std::string base_path = SO_PATH;

#ifdef _WIN32
std::string path_to_lib_a = base_path + "\\lib_a.dll";
std::string path_to_lib_b = base_path + "\\lib_b.dll";
std::string path_to_lib_c = base_path + "\\lib_c.dll";
#else
std::string path_to_lib_a = base_path + "/lib_a.so";
std::string path_to_lib_b = base_path + "/lib_b.so";
std::string path_to_lib_c = base_path + "/lib_c.so";
#endif

std::cout << "paths: " << path_to_lib_a << std::endl;
std::cout << "SO_PATH: " << SO_PATH << std::endl;

void *lib_a = loadOsLibrary(path_to_lib_a);
void *f = getOsLibraryFuncAddress(lib_a, "performIncrementation");
auto performIncrementationFuncA = reinterpret_cast<IncFuncT *>(f);
performIncrementationFuncA(q, buf); // call the function from lib_a
q.wait();
checkIncrementation(buf, 1);
unloadOsLibrary(lib_a);
std::cout << "lib_a done" << std::endl;

void *lib_b = loadOsLibrary(path_to_lib_b);
f = getOsLibraryFuncAddress(lib_b, "performIncrementation");
auto performIncrementationFuncB = reinterpret_cast<IncFuncT *>(f);
performIncrementationFuncB(q, buf); // call the function from lib_b
q.wait();
checkIncrementation(buf, 1 + 2);
unloadOsLibrary(lib_b);
std::cout << "lib_b done" << std::endl;

void *lib_c = loadOsLibrary(path_to_lib_c);
f = getOsLibraryFuncAddress(lib_c, "performIncrementation");
auto performIncrementationFuncC = reinterpret_cast<IncFuncT *>(f);
q.wait();
performIncrementationFuncC(q, buf); // call the function from lib_c
checkIncrementation(buf, 1 + 2 + 4);
unloadOsLibrary(lib_c);
std::cout << "lib_c done" << std::endl;

return 0;
}
10 changes: 0 additions & 10 deletions sycl/unittests/program_manager/Cleanup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,16 +303,6 @@ void checkAllInvolvedContainers(ProgramManagerExposed &PM, size_t ExpectedCount,
}
EXPECT_EQ(PM.getKernelImplicitLocalArgPos().size(), ExpectedCount) << Comment;

{
sycl::detail::DeviceGlobalMap &DeviceGlobalMap = PM.getDeviceGlobals();
EXPECT_EQ(DeviceGlobalMap.size(), ExpectedCount) << Comment;
EXPECT_TRUE(DeviceGlobalMap.count(generateRefName("A", "DeviceGlobal")) > 0)
<< Comment;
EXPECT_TRUE(DeviceGlobalMap.count(generateRefName("B", "DeviceGlobal")) > 0)
<< Comment;
EXPECT_EQ(DeviceGlobalMap.getPointerMap().size(), ExpectedCount) << Comment;
}

{
EXPECT_EQ(PM.getHostPipes().size(), ExpectedCount) << Comment;
EXPECT_TRUE(PM.getHostPipes().count(generateRefName("A", "HostPipe")) > 0)
Expand Down