From 599dfaa8b3b155c101e87f8451964fd0aef0887f Mon Sep 17 00:00:00 2001 From: "Paul J. Davis" Date: Tue, 11 Mar 2025 13:00:42 -0500 Subject: [PATCH 1/2] Fix segfault race condition at process exit I discovered a couple race conditions leading to segfaults in a separate repository. After debugging I managed to narrow this down to issues around the lifetimes of our static global state instances. As these instances are all destructed as the main thread exits, anything in separate threads can end up attempting to reuse these instances after they have been destructed. The test included in this PR doesn't actually use threads due to flakiness concerns. Instead we can observe the same phenomenon by attempting to store a `static std::optional` instance which ends up having its destructor run after the destructors for GlobalState and the root Logger. This change updates the GlobalState to be a `static std::shared_ptr` and the `Logger` to be a `static Logger*`. In the case of `GlobalState` we then just rely on shared_ptr reference counting to ensure that the instance is alive for as long as a `Context` needs it. The `Logger*` on the other hand is just never delete'd so that any log messages generated are included rather than silencing any thread after the main thread exits. --- tiledb/common/logger.cc | 8 ++- tiledb/sm/global_state/global_state.cc | 4 +- tiledb/sm/global_state/global_state.h | 9 +-- tiledb/sm/global_state/watchdog.cc | 4 +- tiledb/sm/global_state/watchdog.h | 2 + tiledb/sm/storage_manager/CMakeLists.txt | 2 + tiledb/sm/storage_manager/storage_manager.cc | 11 ++-- .../storage_manager_canonical.h | 4 ++ tiledb/sm/storage_manager/test/CMakeLists.txt | 35 ++++++++++ tiledb/sm/storage_manager/test/main.cc | 34 ++++++++++ .../test/unit_static_context.cc | 65 +++++++++++++++++++ 11 files changed, 163 insertions(+), 15 deletions(-) create mode 100644 tiledb/sm/storage_manager/test/CMakeLists.txt create mode 100644 tiledb/sm/storage_manager/test/main.cc create mode 100644 tiledb/sm/storage_manager/test/unit_static_context.cc diff --git a/tiledb/common/logger.cc b/tiledb/common/logger.cc index 6bbfd0f7cbc..d852022ade7 100644 --- a/tiledb/common/logger.cc +++ b/tiledb/common/logger.cc @@ -328,8 +328,12 @@ std::string global_logger_name(const Logger::Format format) { } Logger& global_logger(Logger::Format format) { - static Logger l(global_logger_name(format), Logger::Level::ERR, format, true); - return l; + // Note: We are *not* deallocating this root `Logger` instance so that + // during process exit, threads other than main will not trigger segfault's + // if they attempt to log after the main thread has exited. + static Logger* l = tiledb_new( + HERE(), global_logger_name(format), Logger::Level::ERR, format, true); + return *l; } /** Logs a trace. */ diff --git a/tiledb/sm/global_state/global_state.cc b/tiledb/sm/global_state/global_state.cc index 4be667881e4..cdf80f63b16 100644 --- a/tiledb/sm/global_state/global_state.cc +++ b/tiledb/sm/global_state/global_state.cc @@ -47,9 +47,9 @@ using namespace tiledb::common; namespace tiledb::sm::global_state { -GlobalState& GlobalState::GetGlobalState() { +shared_ptr GlobalState::GetGlobalState() { // This is thread-safe in C++11. - static GlobalState globalState; + static shared_ptr globalState = make_shared(HERE()); return globalState; } diff --git a/tiledb/sm/global_state/global_state.h b/tiledb/sm/global_state/global_state.h index 224dda0972c..79395843d12 100644 --- a/tiledb/sm/global_state/global_state.h +++ b/tiledb/sm/global_state/global_state.h @@ -37,6 +37,7 @@ #include #include +#include "tiledb/common/common.h" #include "tiledb/sm/config/config.h" #include "tiledb/sm/storage_manager/storage_manager_declaration.h" @@ -59,8 +60,11 @@ class GlobalState { GlobalState& operator=(const GlobalState&) = delete; GlobalState& operator=(const GlobalState&&) = delete; + /** Constructor. */ + GlobalState(); + /** Returns a reference to the singleton GlobalState instance. */ - static GlobalState& GetGlobalState(); + static shared_ptr GetGlobalState(); /** * Initializes all TileDB global state in an idempotent and threadsafe way. @@ -101,9 +105,6 @@ class GlobalState { /** Mutex protecting list of StorageManagers. */ std::mutex storage_managers_mtx_; - - /** Constructor. */ - GlobalState(); }; } // namespace global_state diff --git a/tiledb/sm/global_state/watchdog.cc b/tiledb/sm/global_state/watchdog.cc index a474346797a..17262c70157 100644 --- a/tiledb/sm/global_state/watchdog.cc +++ b/tiledb/sm/global_state/watchdog.cc @@ -73,13 +73,15 @@ void Watchdog::watchdog_thread(Watchdog* watchdog) { return; } + auto global_state = GlobalState::GetGlobalState(); + while (true) { std::unique_lock lck(watchdog->mtx_); watchdog->cv_.wait_for( lck, std::chrono::milliseconds(constants::watchdog_thread_sleep_ms)); if (SignalHandlers::signal_received()) { - for (auto* sm : GlobalState::GetGlobalState().storage_managers()) { + for (auto* sm : global_state->storage_managers()) { throw_if_not_ok(sm->cancel_all_tasks()); } } diff --git a/tiledb/sm/global_state/watchdog.h b/tiledb/sm/global_state/watchdog.h index d954b7ef1ca..0cd27ae758f 100644 --- a/tiledb/sm/global_state/watchdog.h +++ b/tiledb/sm/global_state/watchdog.h @@ -37,6 +37,8 @@ #include #include +#include "tiledb/sm/global_state/global_state.h" + namespace tiledb { namespace sm { namespace global_state { diff --git a/tiledb/sm/storage_manager/CMakeLists.txt b/tiledb/sm/storage_manager/CMakeLists.txt index cbdb8ac0d9d..e8fa0288c65 100644 --- a/tiledb/sm/storage_manager/CMakeLists.txt +++ b/tiledb/sm/storage_manager/CMakeLists.txt @@ -33,3 +33,5 @@ commence(object_library context_resources) this_target_sources(context_resources.cc) this_target_object_libraries(baseline config rest_client stats thread_pool vfs) conclude(object_library) + +add_test_subdirectory() diff --git a/tiledb/sm/storage_manager/storage_manager.cc b/tiledb/sm/storage_manager/storage_manager.cc index dad55991dc5..b1c84d19cef 100644 --- a/tiledb/sm/storage_manager/storage_manager.cc +++ b/tiledb/sm/storage_manager/storage_manager.cc @@ -60,18 +60,17 @@ StorageManagerCanonical::StorageManagerCanonical( ContextResources& resources, const shared_ptr&, // unused const Config& config) - : vfs_(resources.vfs()) + : global_state_(global_state::GlobalState::GetGlobalState()) + , vfs_(resources.vfs()) , cancellation_in_progress_(false) , config_(config) , queries_in_progress_(0) { - auto& global_state = global_state::GlobalState::GetGlobalState(); - global_state.init(config_); - - global_state.register_storage_manager(this); + global_state_->init(config_); + global_state_->register_storage_manager(this); } StorageManagerCanonical::~StorageManagerCanonical() { - global_state::GlobalState::GetGlobalState().unregister_storage_manager(this); + global_state_->unregister_storage_manager(this); throw_if_not_ok(cancel_all_tasks()); diff --git a/tiledb/sm/storage_manager/storage_manager_canonical.h b/tiledb/sm/storage_manager/storage_manager_canonical.h index 11e80b06246..d7e9dd93cf8 100644 --- a/tiledb/sm/storage_manager/storage_manager_canonical.h +++ b/tiledb/sm/storage_manager/storage_manager_canonical.h @@ -52,6 +52,7 @@ #include "tiledb/sm/array/array_directory.h" #include "tiledb/sm/enums/walk_order.h" #include "tiledb/sm/filesystem/uri.h" +#include "tiledb/sm/global_state/global_state.h" #include "tiledb/sm/group/group.h" #include "tiledb/sm/misc/cancelable_tasks.h" #include "tiledb/sm/misc/types.h" @@ -146,6 +147,9 @@ class StorageManagerCanonical { /* PRIVATE ATTRIBUTES */ /* ********************************* */ + /** The GlobalState to use for this StorageManager. */ + shared_ptr global_state_; + /** * VFS instance used in `cancel_all_tasks`. */ diff --git a/tiledb/sm/storage_manager/test/CMakeLists.txt b/tiledb/sm/storage_manager/test/CMakeLists.txt new file mode 100644 index 00000000000..ee55e6a8b62 --- /dev/null +++ b/tiledb/sm/storage_manager/test/CMakeLists.txt @@ -0,0 +1,35 @@ +# +# tiledb/sm/storage_manager/CMakeLists.txt +# +# The MIT License +# +# Copyright (c) 2025 TileDB, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# + +include(unit_test) + +commence(unit_test static_context) + this_target_sources(main.cc unit_static_context.cc) + target_include_directories(unit_static_context PRIVATE + "$") + this_target_link_libraries(TILEDB_CORE_OBJECTS) + this_target_link_libraries(tiledb_test_support_lib) +conclude(unit_test) diff --git a/tiledb/sm/storage_manager/test/main.cc b/tiledb/sm/storage_manager/test/main.cc new file mode 100644 index 00000000000..82a59917a90 --- /dev/null +++ b/tiledb/sm/storage_manager/test/main.cc @@ -0,0 +1,34 @@ +/** + * @file tiledb/sm/storage_manager/test/main.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2021 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file defines a test `main()` + */ + +#define CATCH_CONFIG_MAIN +#include diff --git a/tiledb/sm/storage_manager/test/unit_static_context.cc b/tiledb/sm/storage_manager/test/unit_static_context.cc new file mode 100644 index 00000000000..f627d1d2dd0 --- /dev/null +++ b/tiledb/sm/storage_manager/test/unit_static_context.cc @@ -0,0 +1,65 @@ +/** + * @file tiledb/sm/storage_manager/test/unit_static_context.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2021-2025 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file defines unit tests for the checking StorageManager and GlobalState + * lifetimes. + */ + +#include +#if defined(DELETE) +#undef DELETE +#endif +#include +#include +#include +#include +#include +#include + +#include +#include + +using namespace tiledb; +using namespace tiledb::common; +using namespace tiledb::sm; +using namespace tiledb::type; + +TEST_CASE("Static Context", "[context]") { + // While non-obvious from this implementation, the issue here is that the + // destructor of the `static std::optional` is run after the other + // static instance finalizers for GlobalState and Logger. This ordering ends + // up causing a segfault when the `Context` attempts to use those resources + // after they have been destructed. + // + // Thus, the actual assertion of this test is that we don't segfault which + // is reported as an error by Catch2. + static tiledb::Config cfg; + static std::optional ctx; + ctx = std::make_optional(tiledb::Context(cfg)); +} From ad9d1c7d6710b06be2e3c13a89a8273473c0bb9e Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 3 Jun 2025 13:13:42 -0400 Subject: [PATCH 2/2] Fix unit_static_context build by removing unused includes --- tiledb/sm/cpp_api/config.h | 1 + .../test/unit_static_context.cc | 18 +----------------- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/tiledb/sm/cpp_api/config.h b/tiledb/sm/cpp_api/config.h index 3e0b882ca60..130de42ec2b 100644 --- a/tiledb/sm/cpp_api/config.h +++ b/tiledb/sm/cpp_api/config.h @@ -38,6 +38,7 @@ #include #include +#include #include #include diff --git a/tiledb/sm/storage_manager/test/unit_static_context.cc b/tiledb/sm/storage_manager/test/unit_static_context.cc index f627d1d2dd0..abe78da4952 100644 --- a/tiledb/sm/storage_manager/test/unit_static_context.cc +++ b/tiledb/sm/storage_manager/test/unit_static_context.cc @@ -30,25 +30,9 @@ * This file defines unit tests for the checking StorageManager and GlobalState * lifetimes. */ +#include "tiledb/sm/cpp_api/context.h" #include -#if defined(DELETE) -#undef DELETE -#endif -#include -#include -#include -#include -#include -#include - -#include -#include - -using namespace tiledb; -using namespace tiledb::common; -using namespace tiledb::sm; -using namespace tiledb::type; TEST_CASE("Static Context", "[context]") { // While non-obvious from this implementation, the issue here is that the