diff --git a/src/duckdb_py/CMakeLists.txt b/src/duckdb_py/CMakeLists.txt index 2252ba29..7673d17f 100644 --- a/src/duckdb_py/CMakeLists.txt +++ b/src/duckdb_py/CMakeLists.txt @@ -17,6 +17,7 @@ add_library(python_src OBJECT duckdb_python.cpp importer.cpp map.cpp + module_state.cpp path_like.cpp pyconnection.cpp pyexpression.cpp diff --git a/src/duckdb_py/duckdb_python.cpp b/src/duckdb_py/duckdb_python.cpp index 939fa41a..0e6e6de4 100644 --- a/src/duckdb_py/duckdb_python.cpp +++ b/src/duckdb_py/duckdb_python.cpp @@ -20,6 +20,7 @@ #include "duckdb_python/pybind11/conversions/python_udf_type_enum.hpp" #include "duckdb_python/pybind11/conversions/python_csv_line_terminator_enum.hpp" #include "duckdb/common/enums/statement_type.hpp" +#include "duckdb_python/module_state.hpp" #include "duckdb.hpp" @@ -31,6 +32,16 @@ namespace py = pybind11; namespace duckdb { +// Private function to initialize module state +void InitializeModuleState(py::module_ &m) { + auto state_ptr = new DuckDBPyModuleState(); + SetModuleState(state_ptr); + + // https://pybind11.readthedocs.io/en/stable/advanced/misc.html#module-destructors + auto capsule = py::capsule(state_ptr, [](void *p) { delete static_cast(p); }); + m.attr("__duckdb_state") = capsule; +} + enum PySQLTokenType : uint8_t { PY_SQL_TOKEN_IDENTIFIER = 0, PY_SQL_TOKEN_NUMERIC_CONSTANT, @@ -1007,7 +1018,13 @@ static void RegisterExpectedResultType(py::handle &m) { expected_return_type.export_values(); } -PYBIND11_MODULE(DUCKDB_PYTHON_LIB_NAME, m) { // NOLINT +PYBIND11_MODULE(DUCKDB_PYTHON_LIB_NAME, m, + py::multiple_interpreters::not_supported()) { // NOLINT + // Initialize module state completely during initialization + // PEP 489 wants calls for state to be module local, but currently + // static via g_module_state. + InitializeModuleState(m); + py::enum_(m, "ExplainType") .value("STANDARD", duckdb::ExplainType::EXPLAIN_STANDARD) .value("ANALYZE", duckdb::ExplainType::EXPLAIN_ANALYZE) @@ -1046,9 +1063,10 @@ PYBIND11_MODULE(DUCKDB_PYTHON_LIB_NAME, m) { // NOLINT m.attr("__version__") = std::string(DuckDB::LibraryVersion()).substr(1); m.attr("__standard_vector_size__") = DuckDB::StandardVectorSize(); m.attr("__git_revision__") = DuckDB::SourceID(); - m.attr("__interactive__") = DuckDBPyConnection::DetectAndGetEnvironment(); - m.attr("__jupyter__") = DuckDBPyConnection::IsJupyter(); - m.attr("__formatted_python_version__") = DuckDBPyConnection::FormattedPythonVersion(); + auto &module_state = GetModuleState(); + m.attr("__interactive__") = module_state.environment != PythonEnvironmentType::NORMAL; + m.attr("__jupyter__") = module_state.environment == PythonEnvironmentType::JUPYTER; + m.attr("__formatted_python_version__") = module_state.formatted_python_version; m.def("default_connection", &DuckDBPyConnection::DefaultConnection, "Retrieve the connection currently registered as the default to be used by the module"); m.def("set_default_connection", &DuckDBPyConnection::SetDefaultConnection, diff --git a/src/duckdb_py/include/duckdb_python/module_state.hpp b/src/duckdb_py/include/duckdb_python/module_state.hpp new file mode 100644 index 00000000..2170f629 --- /dev/null +++ b/src/duckdb_py/include/duckdb_python/module_state.hpp @@ -0,0 +1,50 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// duckdb_python/module_state.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "duckdb_python/pybind11/pybind_wrapper.hpp" +#include "duckdb/common/shared_ptr.hpp" +#include "duckdb/main/db_instance_cache.hpp" +#include "duckdb_python/import_cache/python_import_cache.hpp" +#include "duckdb_python/pyconnection/pyconnection.hpp" + +namespace duckdb { + +// Module state structure to hold per-interpreter state +struct DuckDBPyModuleState { + // Core state + DefaultConnectionHolder default_connection; + shared_ptr import_cache; + std::unique_ptr instance_cache; + + // Python environment tracking + PythonEnvironmentType environment = PythonEnvironmentType::NORMAL; + string formatted_python_version; + + DuckDBPyModuleState(); + + // Encapsulated default connection operations for future free threading control + shared_ptr GetDefaultConnection(); + void SetDefaultConnection(shared_ptr connection); + void ClearDefaultConnection(); + + // Encapsulated import cache operations for future free threading control + PythonImportCache* GetImportCache(); + void ResetImportCache(); + +private: + // Non-copyable + DuckDBPyModuleState(const DuckDBPyModuleState &) = delete; + DuckDBPyModuleState &operator=(const DuckDBPyModuleState &) = delete; +}; + +DuckDBPyModuleState &GetModuleState(); +void SetModuleState(DuckDBPyModuleState *state); + +} // namespace duckdb \ No newline at end of file diff --git a/src/duckdb_py/include/duckdb_python/pyconnection/pyconnection.hpp b/src/duckdb_py/include/duckdb_python/pyconnection/pyconnection.hpp index 48ee055e..0821cb22 100644 --- a/src/duckdb_py/include/duckdb_python/pyconnection/pyconnection.hpp +++ b/src/duckdb_py/include/duckdb_python/pyconnection/pyconnection.hpp @@ -28,6 +28,7 @@ namespace duckdb { struct BoundParameterData; +struct DuckDBPyModuleState; enum class PythonEnvironmentType { NORMAL, INTERACTIVE, JUPYTER }; @@ -172,8 +173,7 @@ struct DuckDBPyConnection : public enable_shared_from_this { case_insensitive_set_t registered_objects; public: - explicit DuckDBPyConnection() { - } + DuckDBPyConnection(); ~DuckDBPyConnection(); public: @@ -191,8 +191,15 @@ struct DuckDBPyConnection : public enable_shared_from_this { static shared_ptr DefaultConnection(); static void SetDefaultConnection(shared_ptr conn); static PythonImportCache *ImportCache(); + // Instance method for fast import cache access using cached module state + PythonImportCache *GetImportCache(); static bool IsInteractive(); + // Instance methods for optimized module state access + bool IsJupyterInstance() const; + bool IsInteractiveInstance() const; + std::string FormattedPythonVersionInstance() const; + unique_ptr ReadCSV(const py::object &name, py::kwargs &kwargs); py::list ExtractStatements(const string &query); @@ -337,11 +344,6 @@ struct DuckDBPyConnection : public enable_shared_from_this { py::list ListFilesystems(); bool FileSystemIsRegistered(const string &name); - //! Default connection to an in-memory database - static DefaultConnectionHolder default_connection; - //! Caches and provides an interface to get frequently used modules+subtypes - static shared_ptr import_cache; - static bool IsPandasDataframe(const py::object &object); static PyArrowObjectType GetArrowType(const py::handle &obj); static bool IsAcceptedArrowObject(const py::object &object); @@ -358,9 +360,6 @@ struct DuckDBPyConnection : public enable_shared_from_this { void RegisterArrowObject(const py::object &arrow_object, const string &name); vector> GetStatements(const py::object &query); - static PythonEnvironmentType environment; - static std::string formatted_python_version; - static void DetectEnvironment(); }; template diff --git a/src/duckdb_py/module_state.cpp b/src/duckdb_py/module_state.cpp new file mode 100644 index 00000000..f4ec1fc2 --- /dev/null +++ b/src/duckdb_py/module_state.cpp @@ -0,0 +1,89 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// duckdb_python/module_state.cpp +// +// +//===----------------------------------------------------------------------===// + +#include "duckdb_python/module_state.hpp" +#include + +namespace duckdb { + +// TODO: Make non-static. +// Left static because of scope required to efficiently pass import_cache +// without expensive lookups +static DuckDBPyModuleState* g_module_state; + +// Module state constructor +DuckDBPyModuleState::DuckDBPyModuleState() { + // Create caches + instance_cache = make_uniq(); + import_cache = make_shared_ptr(); + + // Detects Python environment and version during intialization + // Moved from DuckDBPyConnection::DetectEnvironment() + py::module_ sys = py::module_::import("sys"); + py::object version_info = sys.attr("version_info"); + int major = py::cast(version_info.attr("major")); + int minor = py::cast(version_info.attr("minor")); + formatted_python_version = std::to_string(major) + "." + std::to_string(minor); + + // If __main__ does not have a __file__ attribute, we are in interactive mode + auto main_module = py::module_::import("__main__"); + if (!py::hasattr(main_module, "__file__")) { + environment = PythonEnvironmentType::INTERACTIVE; + + if (ModuleIsLoaded()) { + // Check to see if we are in a Jupyter Notebook + auto get_ipython = import_cache->IPython.get_ipython(); + if (get_ipython.ptr() != nullptr) { + auto ipython = get_ipython(); + if (py::hasattr(ipython, "config")) { + py::dict ipython_config = ipython.attr("config"); + if (ipython_config.contains("IPKernelApp")) { + environment = PythonEnvironmentType::JUPYTER; + } + } + } + } + } + +} + +DuckDBPyModuleState &GetModuleState() { + // TODO: Externalize this static cache when adding multi-interpreter support + // For now, single interpreter assumption allows simple static caching + if (!g_module_state) { + throw InternalException("Module state not initialized - call SetModuleState() during module init"); + } + return *g_module_state; +} + +void SetModuleState(DuckDBPyModuleState *state) { + printf("DEBUG: SetModuleState() called - initializing static cache\n"); + g_module_state = state; +} + +shared_ptr DuckDBPyModuleState::GetDefaultConnection() { + return default_connection.Get(); +} + +void DuckDBPyModuleState::SetDefaultConnection(shared_ptr connection) { + default_connection.Set(std::move(connection)); +} + +void DuckDBPyModuleState::ClearDefaultConnection() { + default_connection.Set(nullptr); +} + +PythonImportCache* DuckDBPyModuleState::GetImportCache() { + return import_cache.get(); +} + +void DuckDBPyModuleState::ResetImportCache() { + import_cache.reset(); +} + +} // namespace duckdb \ No newline at end of file diff --git a/src/duckdb_py/native/python_conversion.cpp b/src/duckdb_py/native/python_conversion.cpp index caa409c5..ea891535 100644 --- a/src/duckdb_py/native/python_conversion.cpp +++ b/src/duckdb_py/native/python_conversion.cpp @@ -961,6 +961,9 @@ void TransformPythonObjectInternal(py::handle ele, A &result, const B ¶m, bo break; } if (conversion_target.id() == LogicalTypeId::UBIGINT) { + if (PyErr_Occurred()) { + PyErr_Clear(); + } throw InvalidInputException("Python Conversion Failure: Value out of range for type %s", conversion_target); } diff --git a/src/duckdb_py/pyconnection.cpp b/src/duckdb_py/pyconnection.cpp index b88b88ed..cb609206 100644 --- a/src/duckdb_py/pyconnection.cpp +++ b/src/duckdb_py/pyconnection.cpp @@ -1,4 +1,5 @@ #include "duckdb_python/pyconnection/pyconnection.hpp" +#include "duckdb_python/module_state.hpp" #include "duckdb/catalog/default/default_types.hpp" #include "duckdb/common/arrow/arrow.hpp" @@ -66,11 +67,8 @@ namespace duckdb { -DefaultConnectionHolder DuckDBPyConnection::default_connection; // NOLINT: allow global -DBInstanceCache instance_cache; // NOLINT: allow global -shared_ptr DuckDBPyConnection::import_cache = nullptr; // NOLINT: allow global -PythonEnvironmentType DuckDBPyConnection::environment = PythonEnvironmentType::NORMAL; // NOLINT: allow global -std::string DuckDBPyConnection::formatted_python_version = ""; +DuckDBPyConnection::DuckDBPyConnection() { +} DuckDBPyConnection::~DuckDBPyConnection() { try { @@ -82,53 +80,17 @@ DuckDBPyConnection::~DuckDBPyConnection() { } } -void DuckDBPyConnection::DetectEnvironment() { - // Get the formatted Python version - py::module_ sys = py::module_::import("sys"); - py::object version_info = sys.attr("version_info"); - int major = py::cast(version_info.attr("major")); - int minor = py::cast(version_info.attr("minor")); - DuckDBPyConnection::formatted_python_version = std::to_string(major) + "." + std::to_string(minor); - - // If __main__ does not have a __file__ attribute, we are in interactive mode - auto main_module = py::module_::import("__main__"); - if (py::hasattr(main_module, "__file__")) { - return; - } - DuckDBPyConnection::environment = PythonEnvironmentType::INTERACTIVE; - if (!ModuleIsLoaded()) { - return; - } - - // Check to see if we are in a Jupyter Notebook - auto &import_cache_py = *DuckDBPyConnection::ImportCache(); - auto get_ipython = import_cache_py.IPython.get_ipython(); - if (get_ipython.ptr() == nullptr) { - // Could either not load the IPython module, or it has no 'get_ipython' attribute - return; - } - auto ipython = get_ipython(); - if (!py::hasattr(ipython, "config")) { - return; - } - py::dict ipython_config = ipython.attr("config"); - if (ipython_config.contains("IPKernelApp")) { - DuckDBPyConnection::environment = PythonEnvironmentType::JUPYTER; - } - return; -} - bool DuckDBPyConnection::DetectAndGetEnvironment() { - DuckDBPyConnection::DetectEnvironment(); + // Environment detection now happens during module state construction return DuckDBPyConnection::IsInteractive(); } bool DuckDBPyConnection::IsJupyter() { - return DuckDBPyConnection::environment == PythonEnvironmentType::JUPYTER; + return GetModuleState().environment == PythonEnvironmentType::JUPYTER; } std::string DuckDBPyConnection::FormattedPythonVersion() { - return DuckDBPyConnection::formatted_python_version; + return GetModuleState().formatted_python_version; } // NOTE: this function is generated by tools/pythonpkg/scripts/generate_connection_methods.py. @@ -2113,8 +2075,8 @@ static shared_ptr FetchOrCreateInstance(const string &databa D_ASSERT(py::gil_check()); py::gil_scoped_release release; unique_lock lock(res->py_connection_lock); - auto database = - instance_cache.GetOrCreateInstance(database_path, config, cache_instance, InstantiateNewInstance); + auto database = GetModuleState().instance_cache->GetOrCreateInstance(database_path, config, cache_instance, + InstantiateNewInstance); res->con.SetDatabase(std::move(database)); res->con.SetConnection(make_uniq(res->con.GetDatabase())); } @@ -2162,6 +2124,7 @@ shared_ptr DuckDBPyConnection::Connect(const py::object &dat "python_scan_all_frames", "If set, restores the old behavior of scanning all preceding frames to locate the referenced variable.", LogicalType::BOOLEAN, Value::BOOLEAN(false)); + // Use static methods here since we don't have connection instance yet if (!DuckDBPyConnection::IsJupyter()) { config_dict["duckdb_api"] = Value("python/" + DuckDBPyConnection::FormattedPythonVersion()); } else { @@ -2197,24 +2160,37 @@ case_insensitive_map_t DuckDBPyConnection::TransformPythonPa } shared_ptr DuckDBPyConnection::DefaultConnection() { - return default_connection.Get(); + return GetModuleState().GetDefaultConnection(); } void DuckDBPyConnection::SetDefaultConnection(shared_ptr connection) { - return default_connection.Set(std::move(connection)); + GetModuleState().SetDefaultConnection(std::move(connection)); } PythonImportCache *DuckDBPyConnection::ImportCache() { - if (!import_cache) { - import_cache = make_shared_ptr(); - } - return import_cache.get(); + return GetModuleState().GetImportCache(); +} + +PythonImportCache *DuckDBPyConnection::GetImportCache() { + return GetModuleState().GetImportCache(); +} + +bool DuckDBPyConnection::IsJupyterInstance() const { + return GetModuleState().environment == PythonEnvironmentType::JUPYTER; +} + +bool DuckDBPyConnection::IsInteractiveInstance() const { + return GetModuleState().environment != PythonEnvironmentType::NORMAL; +} + +std::string DuckDBPyConnection::FormattedPythonVersionInstance() const { + return GetModuleState().formatted_python_version; } ModifiedMemoryFileSystem &DuckDBPyConnection::GetObjectFileSystem() { if (!internal_object_filesystem) { D_ASSERT(!FileSystemIsRegistered("DUCKDB_INTERNAL_OBJECTSTORE")); - auto &import_cache_py = *ImportCache(); + auto &import_cache_py = *GetImportCache(); auto modified_memory_fs = import_cache_py.duckdb.filesystem.ModifiedMemoryFileSystem(); if (modified_memory_fs.ptr() == nullptr) { throw InvalidInputException( @@ -2228,7 +2204,7 @@ ModifiedMemoryFileSystem &DuckDBPyConnection::GetObjectFileSystem() { } bool DuckDBPyConnection::IsInteractive() { - return DuckDBPyConnection::environment != PythonEnvironmentType::NORMAL; + return GetModuleState().environment != PythonEnvironmentType::NORMAL; } shared_ptr DuckDBPyConnection::Enter() { @@ -2246,8 +2222,12 @@ void DuckDBPyConnection::Exit(DuckDBPyConnection &self, const py::object &exc_ty } void DuckDBPyConnection::Cleanup() { - default_connection.Set(nullptr); - import_cache.reset(); + try { + GetModuleState().ClearDefaultConnection(); + GetModuleState().ResetImportCache(); + } catch (const pybind11::error_already_set &) { + // Python is shutting down, ignore cleanup failures + } } bool DuckDBPyConnection::IsPandasDataframe(const py::object &object) {