diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 552bdf4..cc997aa 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -50,6 +50,11 @@ jobs: working-directory: build run: cmake --build . --target run_tests_with_junit_report + - name: Run sparrow integration tests + if: matrix.build_shared == 'ON' + working-directory: build + run: cmake --build . --target run_sparrow_tests_direct + - name: Install working-directory: build run: cmake --install . diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 0935925..41dbd32 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -55,6 +55,11 @@ jobs: working-directory: build run: cmake --build . --target run_tests_with_junit_report + # - name: Run Sparrow integration tests + # if: matrix.build_shared == 'ON' + # working-directory: build + # run: cmake --build . --target run_sparrow_tests_direct + - name: Install working-directory: build run: cmake --install . diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 575082a..24a1885 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -55,6 +55,11 @@ jobs: run: | cmake --build . --config ${{ matrix.build_type }} --target run_tests_with_junit_report + - name: Run Sparrow integration tests + if: matrix.build_shared == 'ON' + working-directory: build + run: cmake --build . --config ${{ matrix.build_type }} --target run_sparrow_tests_direct + - name: Install working-directory: build run: cmake --install . 
--config ${{ matrix.build_type }} diff --git a/.gitignore b/.gitignore index 046c078..dc3bf39 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ /build /.vscode +*.pyc diff --git a/CMakeLists.txt b/CMakeLists.txt index 6946c66..32bb7dd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -151,10 +151,12 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE "${BINARY_BUILD_DIR}") set(SPARROW_PYCAPSULE_HEADERS ${SPARROW_PYCAPSULE_INCLUDE_DIR}/sparrow-pycapsule/config/sparrow_pycapsule_version.hpp ${SPARROW_PYCAPSULE_INCLUDE_DIR}/sparrow-pycapsule/pycapsule.hpp + ${SPARROW_PYCAPSULE_INCLUDE_DIR}/sparrow-pycapsule/sparrow_array_python_class.hpp ) set(SPARROW_PYCAPSULE_SOURCES src/pycapsule.cpp + src/sparrow_array_python_class.cpp ) option(SPARROW_PYCAPSULE_BUILD_SHARED "Build sparrow pycapsule as a shared library" ON) diff --git a/README.md b/README.md index a3bf936..fceb869 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,295 @@ # sparrow-pycapsule -The Sparrow PyCapsuleInterface + +The Sparrow PyCapsule Interface - A C++ library for exchanging Apache Arrow data between C++ and Python using the Arrow C Data Interface via PyCapsules. 
+ +## Overview + +`sparrow-pycapsule` provides a clean C++ API for: +- Exporting sparrow arrays to Python as PyCapsules (Arrow C Data Interface) +- Importing Arrow data from Python PyCapsules into sparrow arrays +- Zero-copy data exchange with Python libraries like Polars, PyArrow, and pandas +- A `SparrowArray` Python class that implements the Arrow PyCapsule Interface + +## Features + +- ✅ **Zero-copy data exchange** between C++ and Python +- ✅ **Arrow C Data Interface** compliant +- ✅ **PyCapsule-based** for safe memory management +- ✅ **Compatible with Polars, PyArrow, pandas** and other Arrow-based libraries +- ✅ **Bidirectional** data flow (C++ ↔ Python) +- ✅ **Type-safe** with proper ownership semantics +- ✅ **SparrowArray Python class** implementing `__arrow_c_array__` protocol + +## Building + +### Prerequisites + +```bash +# Using conda (recommended) +conda env create -f environment-dev.yml +conda activate sparrow-pycapsule + +# Or install manually +# - CMake >= 3.28 +# - C++20 compiler +# - Python 3.x with development headers +# - sparrow library +``` + +### Build Instructions + +```bash +mkdir build && cd build +cmake .. -DCMAKE_BUILD_TYPE=Release +cmake --build . +``` + +### Build with Tests + +```bash +mkdir build && cd build +cmake .. -DSPARROW_PYCAPSULE_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Debug +cmake --build . +ctest --output-on-failure +``` + +## Usage Example + +### C++ Side: Creating a SparrowArray for Python + +```cpp +#include +#include +#include + +// Create a sparrow array +sparrow::array my_array = /* ... */; + +// Create a SparrowArray Python object that implements __arrow_c_array__ +PyObject* sparrow_array = sparrow::pycapsule::create_sparrow_array_object(std::move(my_array)); + +// Return to Python - it can be used directly with Polars, PyArrow, etc. 
+``` + +### Python Side: Using SparrowArray + +```python +from test_sparrow_helper import SparrowArray +import polars as pl +import pyarrow as pa + +# Create SparrowArray from any Arrow-compatible object +pa_array = pa.array([1, 2, None, 4, 5], type=pa.int32()) +sparrow_array = SparrowArray(pa_array) + +# SparrowArray implements __arrow_c_array__, so it works with Polars +# Using Polars internal API for primitive arrays: +from polars._plr import PySeries +from polars._utils.wrap import wrap_s + +ps = PySeries.from_arrow_c_array(sparrow_array) +series = wrap_s(ps) +print(series) # shape: (5,), dtype: Int32 + +# Get array size +print(sparrow_array.size()) # 5 +``` + +### Python Side: Exporting to C++ + +```python +import pyarrow as pa + +# Any object implementing __arrow_c_array__ can be imported by sparrow +arrow_array = pa.array([1, 2, None, 4, 5]) + +# The SparrowArray constructor accepts any ArrowArrayExportable +sparrow_array = SparrowArray(arrow_array) +``` + +### C++ Side: Importing from Python + +```cpp +#include <sparrow-pycapsule/pycapsule.hpp> + +// Receive capsules from Python (e.g., from __arrow_c_array__) +PyObject* schema_capsule = /* ... */; +PyObject* array_capsule = /* ... */; + +// Import into sparrow array +sparrow::array imported_array = + sparrow::pycapsule::import_array_from_capsules( + schema_capsule, array_capsule); + +// Use the array +std::cout << "Array size: " << imported_array.size() << std::endl; +``` + +## SparrowArray Python Class + +The `SparrowArray` class is a Python type implemented in C++ that: + +- **Wraps a sparrow array** and exposes it to Python +- **Implements `__arrow_c_array__`** (ArrowArrayExportable protocol) +- **Accepts any ArrowArrayExportable** in its constructor (PyArrow, Polars, etc.) 
+- **Provides a `size()` method** to get the number of elements + +```python +# Constructor accepts any object with __arrow_c_array__ +sparrow_array = SparrowArray(pyarrow_array) +sparrow_array = SparrowArray(another_sparrow_array) + +# Implements ArrowArrayExportable protocol +schema_capsule, array_capsule = sparrow_array.__arrow_c_array__() + +# Get array size +n = sparrow_array.size() +``` + +## Testing + +### C++ Unit Tests + +```bash +cd build +./bin/Debug/test_sparrow_pycapsule_lib +``` + +### Integration Tests + +Test bidirectional data exchange with Polars and PyArrow: + +```bash +# Run integration tests (recommended) +cmake --build . --target run_sparrow_tests_direct + +# Check dependencies first +cmake --build . --target check_polars_deps +``` + +See [test/README_POLARS_TESTS.md](test/README_POLARS_TESTS.md) for detailed documentation. + +## CMake Targets + +The project provides several convenient CMake targets for testing: + +| Target | Description | +|--------|-------------| +| `run_tests` | Run all C++ unit tests | +| `run_tests_with_junit_report` | Run C++ tests with JUnit XML output | +| `run_sparrow_tests_direct` | Run integration tests (recommended) | +| `check_polars_deps` | Check Python dependencies (polars, pyarrow) | +| `test_library_load` | Debug library loading issues | + +**Usage:** +```bash +cd build + +# Run integration tests +cmake --build . --target run_sparrow_tests_direct + +# Check dependencies first +cmake --build . 
--target check_polars_deps +``` + +## API Reference + +### SparrowArray Python Class + +```cpp +// Create a SparrowArray Python object from a sparrow::array +PyObject* create_sparrow_array_object(sparrow::array&& arr); + +// Create a SparrowArray from PyCapsules +PyObject* create_sparrow_array_object_from_capsules( + PyObject* schema_capsule, PyObject* array_capsule); + +// Register SparrowArray type with a Python module +int register_sparrow_array_type(PyObject* module); + +// Get the SparrowArray type object +PyTypeObject* get_sparrow_array_type(); +``` + +### Export Functions + +- `export_arrow_schema_pycapsule(array& arr)` - Export schema to PyCapsule +- `export_arrow_array_pycapsule(array& arr)` - Export array data to PyCapsule +- `export_array_to_capsules(array& arr)` - Export both schema and array (recommended) + +### Import Functions + +- `get_arrow_schema_pycapsule(PyObject* capsule)` - Get ArrowSchema pointer from capsule +- `get_arrow_array_pycapsule(PyObject* capsule)` - Get ArrowArray pointer from capsule +- `import_array_from_capsules(PyObject* schema, PyObject* array)` - Import complete array + +### Memory Management + +All capsules have destructors that properly clean up Arrow structures. + +## Supported Data Types + +The library supports all Arrow data types that sparrow supports: +- Integer types (Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64) +- Floating point (Float32, Float64) +- Boolean +- String (UTF-8) +- And more... + +All types support nullable values via the Arrow null bitmap. 
+ +## Integration with Python Libraries + +### Polars + +```python +from polars._plr import PySeries +from polars._utils.wrap import wrap_s + +# SparrowArray implements __arrow_c_array__, use Polars internal API +sparrow_array = SparrowArray(some_arrow_array) +ps = PySeries.from_arrow_c_array(sparrow_array) +series = wrap_s(ps) +``` + +### PyArrow + +```python +import pyarrow as pa + +# Create SparrowArray from PyArrow +pa_array = pa.array([1, 2, 3]) +sparrow_array = SparrowArray(pa_array) + +# Export back to PyArrow +schema_capsule, array_capsule = sparrow_array.__arrow_c_array__() +``` + +### pandas (via PyArrow) + +```python +import pandas as pd +import pyarrow as pa + +series = pd.Series([1, 2, 3]) +arrow_array = pa.Array.from_pandas(series) +sparrow_array = SparrowArray(arrow_array) +``` + +## License + +See [LICENSE](LICENSE) file for details. + +## Contributing + +Contributions are welcome! Please ensure: +- Code follows the existing style +- All tests pass (`ctest --output-on-failure`) +- New features include tests +- Documentation is updated + +## Related Projects + +- [sparrow](https://github.com/man-group/sparrow) - Modern C++ library for Apache Arrow +- [Apache Arrow](https://arrow.apache.org/) - Cross-language development platform +- [Polars](https://www.pola.rs/) - Fast DataFrame library diff --git a/environment-dev.yml b/environment-dev.yml index 8d38ea5..ea07591 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -10,6 +10,9 @@ dependencies: - python # Tests - doctest + - polars + - pyarrow + - pytest # Documentation - doxygen - graphviz diff --git a/include/sparrow-pycapsule/sparrow_array_python_class.hpp b/include/sparrow-pycapsule/sparrow_array_python_class.hpp new file mode 100644 index 0000000..3418c77 --- /dev/null +++ b/include/sparrow-pycapsule/sparrow_array_python_class.hpp @@ -0,0 +1,97 @@ +#pragma once + +#define PY_SSIZE_T_CLEAN +#include + +#include + +#include "sparrow-pycapsule/config/config.hpp" +#include 
"sparrow-pycapsule/pycapsule.hpp" + +namespace sparrow::pycapsule +{ + /** + * @brief Python object structure for SparrowArray. + * + * This structure holds a pointer to a sparrow::array. The pointer is used + * to avoid issues with C++ objects in C-style Python object structures. + */ + struct SparrowArrayObject + { + PyObject_HEAD sparrow::array* arr; + }; + + /** + * @brief Deallocator for SparrowArray Python objects. + */ + SPARROW_PYCAPSULE_API void SparrowArray_dealloc(SparrowArrayObject* self); + + /** + * @brief Implementation of __arrow_c_array__ method. + * + * This method exports the wrapped sparrow array as Arrow PyCapsules, + * implementing the Arrow PyCapsule Interface (ArrowArrayExportable protocol). + * + * @param self The SparrowArray object. + * @param args Positional arguments (unused). + * @param kwargs Keyword arguments (optional requested_schema). + * @return A tuple of (schema_capsule, array_capsule). + */ + SPARROW_PYCAPSULE_API PyObject* + SparrowArray_arrow_c_array(SparrowArrayObject* self, PyObject* args, PyObject* kwargs); + + /** + * @brief Get the size of the wrapped array. + * + * @param self The SparrowArray object. + * @param args Positional arguments (unused). + * @return The size of the array as a Python integer. + */ + SPARROW_PYCAPSULE_API PyObject* SparrowArray_size(SparrowArrayObject* self, PyObject* args); + + /** + * @brief Get the Python type object for SparrowArray. + * + * This function returns a pointer to the SparrowArrayType. The type is + * initialized on first call if necessary. + * + * @return Pointer to the SparrowArrayType, or nullptr on error. + */ + SPARROW_PYCAPSULE_API PyTypeObject* get_sparrow_array_type(); + + /** + * @brief Create a new SparrowArray Python object from a sparrow::array. + * + * This function creates a new Python object that wraps the given sparrow array. + * The array is moved into the Python object, so the caller should not use it + * after this call. 
+ * + * @param arr The sparrow array to wrap (will be moved). + * @return A new reference to a SparrowArray Python object, or nullptr on error. + */ + SPARROW_PYCAPSULE_API PyObject* create_sparrow_array_object(sparrow::array&& arr); + + /** + * @brief Create a new SparrowArray Python object from PyCapsules. + * + * This function creates a new Python object by importing from existing + * Arrow PyCapsules. + * + * @param schema_capsule The schema PyCapsule. + * @param array_capsule The array PyCapsule. + * @return A new reference to a SparrowArray Python object, or nullptr on error. + */ + SPARROW_PYCAPSULE_API PyObject* + create_sparrow_array_object_from_capsules(PyObject* schema_capsule, PyObject* array_capsule); + + /** + * @brief Register the SparrowArray type with a Python module. + * + * This function adds the SparrowArray type to the given module. + * + * @param module The Python module to add the type to. + * @return 0 on success, -1 on error. + */ + SPARROW_PYCAPSULE_API int register_sparrow_array_type(PyObject* module); + +} // namespace sparrow::pycapsule diff --git a/src/pycapsule.cpp b/src/pycapsule.cpp index 5d14c7b..989540f 100644 --- a/src/pycapsule.cpp +++ b/src/pycapsule.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -38,8 +39,12 @@ namespace sparrow::pycapsule ArrowSchema* arrow_schema_ptr = new ArrowSchema(); *arrow_schema_ptr = extract_arrow_schema(std::move(arr)); - PyObject* capsule_ptr = PyCapsule_New(arrow_schema_ptr, arrow_schema_str.data(), release_arrow_schema_pycapsule); - if(capsule_ptr == nullptr) + PyObject* capsule_ptr = PyCapsule_New( + arrow_schema_ptr, + arrow_schema_str.data(), + release_arrow_schema_pycapsule + ); + if (capsule_ptr == nullptr) { arrow_schema_ptr->release(arrow_schema_ptr); delete arrow_schema_ptr; @@ -77,8 +82,12 @@ namespace sparrow::pycapsule ArrowArray* arrow_array_ptr = new ArrowArray(); *arrow_array_ptr = extract_arrow_array(std::move(arr)); - PyObject* capsule_ptr = PyCapsule_New(arrow_array_ptr, 
arrow_array_str.data(), release_arrow_array_pycapsule); - if(capsule_ptr == nullptr) + PyObject* capsule_ptr = PyCapsule_New( + arrow_array_ptr, + arrow_array_str.data(), + release_arrow_array_pycapsule + ); + if (capsule_ptr == nullptr) { arrow_array_ptr->release(arrow_array_ptr); delete arrow_array_ptr; @@ -130,16 +139,76 @@ namespace sparrow::pycapsule ArrowSchema* schema_ptr = new ArrowSchema(std::move(arrow_schema)); ArrowArray* array_ptr = new ArrowArray(std::move(arrow_array)); - PyObject* schema_capsule = PyCapsule_New(schema_ptr, arrow_schema_str.data(), release_arrow_schema_pycapsule); - if (!schema_capsule) { + // Check if Python is initialized before creating capsules + if (!Py_IsInitialized()) + { delete schema_ptr; delete array_ptr; - return {nullptr, nullptr}; + throw std::runtime_error("Python is not initialized. Cannot create PyCapsules."); } - PyObject* array_capsule = PyCapsule_New(array_ptr, arrow_array_str.data(), release_arrow_array_pycapsule); - if (!array_capsule) { + + PyObject* schema_capsule = PyCapsule_New( + schema_ptr, + arrow_schema_str.data(), + release_arrow_schema_pycapsule + ); + + if (!schema_capsule) + { + // Check for Python error + if (PyErr_Occurred()) + { + PyObject *type, *value, *traceback; + PyErr_Fetch(&type, &value, &traceback); + + PyObject* str_value = PyObject_Str(value); + const char* error_msg = str_value ? 
PyUnicode_AsUTF8(str_value) : "Unknown error"; + + std::string error_str = std::string("PyCapsule_New failed for schema: ") + error_msg; + + Py_XDECREF(str_value); + Py_XDECREF(type); + Py_XDECREF(value); + Py_XDECREF(traceback); + + delete schema_ptr; + delete array_ptr; + throw std::runtime_error(error_str); + } delete schema_ptr; delete array_ptr; + return {nullptr, nullptr}; + } + + PyObject* array_capsule = PyCapsule_New( + array_ptr, + arrow_array_str.data(), + release_arrow_array_pycapsule + ); + + if (!array_capsule) + { + // Check for Python error + if (PyErr_Occurred()) + { + PyObject *type, *value, *traceback; + PyErr_Fetch(&type, &value, &traceback); + + PyObject* str_value = PyObject_Str(value); + const char* error_msg = str_value ? PyUnicode_AsUTF8(str_value) : "Unknown error"; + + std::string error_str = std::string("PyCapsule_New failed for array: ") + error_msg; + + Py_XDECREF(str_value); + Py_XDECREF(type); + Py_XDECREF(value); + Py_XDECREF(traceback); + + delete array_ptr; + Py_DECREF(schema_capsule); + throw std::runtime_error(error_str); + } + delete array_ptr; Py_DECREF(schema_capsule); return {nullptr, nullptr}; } diff --git a/src/sparrow_array_python_class.cpp b/src/sparrow_array_python_class.cpp new file mode 100644 index 0000000..e5d07be --- /dev/null +++ b/src/sparrow_array_python_class.cpp @@ -0,0 +1,271 @@ +#include "sparrow-pycapsule/sparrow_array_python_class.hpp" + +#include +#include + +namespace sparrow::pycapsule +{ + void SparrowArray_dealloc(SparrowArrayObject* self) + { + delete self->arr; + self->arr = nullptr; + Py_TYPE(self)->tp_free(reinterpret_cast(self)); + } + + PyObject* SparrowArray_arrow_c_array(SparrowArrayObject* self, PyObject* args, PyObject* kwargs) + { + static const char* kwlist[] = {"requested_schema", nullptr}; + PyObject* requested_schema = nullptr; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O", const_cast(kwlist), &requested_schema)) + { + return nullptr; + } + + // requested_schema is typically 
ignored for simple cases + // In a full implementation, you might use it to cast to a different type + (void) requested_schema; + + if (self->arr == nullptr) + { + PyErr_SetString(PyExc_ValueError, "SparrowArray contains no data"); + return nullptr; + } + + try + { + auto [schema_capsule, array_capsule] = export_array_to_capsules(*self->arr); + + if (schema_capsule == nullptr || array_capsule == nullptr) + { + Py_XDECREF(schema_capsule); + Py_XDECREF(array_capsule); + PyErr_SetString(PyExc_RuntimeError, "Failed to create Arrow PyCapsules"); + return nullptr; + } + + PyObject* result = PyTuple_Pack(2, schema_capsule, array_capsule); + Py_DECREF(schema_capsule); + Py_DECREF(array_capsule); + return result; + } + catch (const std::exception& e) + { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return nullptr; + } + } + + PyObject* SparrowArray_size(SparrowArrayObject* self, [[maybe_unused]] PyObject* args) + { + if (self->arr == nullptr) + { + PyErr_SetString(PyExc_ValueError, "SparrowArray contains no data"); + return nullptr; + } + + return PyLong_FromSize_t(self->arr->size()); + } + + /** + * @brief Constructor for SparrowArray. + * + * Accepts an object implementing __arrow_c_array__ and imports it. 
+ */ + static PyObject* SparrowArray_new(PyTypeObject* type, PyObject* args, PyObject* kwargs) + { + static const char* kwlist[] = {"arrow_array", nullptr}; + PyObject* arrow_array = nullptr; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", const_cast(kwlist), &arrow_array)) + { + return nullptr; + } + + // Get __arrow_c_array__ method from the input object + PyObject* arrow_c_array_method = PyObject_GetAttrString(arrow_array, "__arrow_c_array__"); + if (arrow_c_array_method == nullptr) + { + PyErr_SetString( + PyExc_TypeError, + "Input object must implement __arrow_c_array__ (ArrowArrayExportable protocol)" + ); + return nullptr; + } + + // Call __arrow_c_array__() to get the capsules + PyObject* capsules = PyObject_CallNoArgs(arrow_c_array_method); + Py_DECREF(arrow_c_array_method); + + if (capsules == nullptr) + { + return nullptr; + } + + // Unpack the tuple (schema_capsule, array_capsule) + if (!PyTuple_Check(capsules) || PyTuple_Size(capsules) != 2) + { + Py_DECREF(capsules); + PyErr_SetString(PyExc_TypeError, "__arrow_c_array__ must return a tuple of 2 elements"); + return nullptr; + } + + PyObject* schema_capsule = PyTuple_GetItem(capsules, 0); + PyObject* array_capsule = PyTuple_GetItem(capsules, 1); + + try + { + sparrow::array arr = import_array_from_capsules(schema_capsule, array_capsule); + Py_DECREF(capsules); + + // Allocate the object + SparrowArrayObject* self = reinterpret_cast(type->tp_alloc(type, 0)); + if (self == nullptr) + { + return nullptr; + } + + self->arr = new sparrow::array(std::move(arr)); + return reinterpret_cast(self); + } + catch (const std::bad_alloc&) + { + Py_DECREF(capsules); + PyErr_NoMemory(); + return nullptr; + } + catch (const std::exception& e) + { + Py_DECREF(capsules); + PyErr_SetString(PyExc_RuntimeError, e.what()); + return nullptr; + } + } + + static PyMethodDef SparrowArray_methods[] = { + {"__arrow_c_array__", + reinterpret_cast(SparrowArray_arrow_c_array), + METH_VARARGS | METH_KEYWORDS, + "Export the 
array via the Arrow PyCapsule interface.\n\n" + "Parameters\n" + "----------\n" + "requested_schema : object, optional\n" + " Requested schema for the output (typically ignored).\n\n" + "Returns\n" + "-------\n" + "tuple[object, object]\n" + " A tuple of (schema_capsule, array_capsule)."}, + {"size", + reinterpret_cast(SparrowArray_size), + METH_NOARGS, + "Get the number of elements in the array.\n\n" + "Returns\n" + "-------\n" + "int\n" + " The size of the array."}, + {nullptr, nullptr, 0, nullptr} // Sentinel + }; + + // The type object - defined as a static variable + static PyTypeObject SparrowArrayType = { + .ob_base = PyVarObject_HEAD_INIT(nullptr, 0).tp_name = "sparrow.SparrowArray", + .tp_basicsize = sizeof(SparrowArrayObject), + .tp_itemsize = 0, + .tp_dealloc = reinterpret_cast(SparrowArray_dealloc), + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_doc = PyDoc_STR( + "SparrowArray(arrow_array) - Arrow array wrapper implementing __arrow_c_array__.\n\n" + "This class wraps a sparrow array and implements the Arrow PyCapsule\n" + "Interface (ArrowArrayExportable protocol), allowing it to be passed\n" + "directly to libraries like Polars via pl.from_arrow().\n\n" + "Parameters\n" + "----------\n" + "arrow_array : ArrowArrayExportable\n" + " An object implementing __arrow_c_array__ (e.g., PyArrow array)." 
+ ), + .tp_methods = SparrowArray_methods, + .tp_new = SparrowArray_new, + }; + + static bool type_initialized = false; + + PyTypeObject* get_sparrow_array_type() + { + if (!type_initialized) + { + if (PyType_Ready(&SparrowArrayType) < 0) + { + return nullptr; + } + type_initialized = true; + } + return &SparrowArrayType; + } + + PyObject* create_sparrow_array_object(sparrow::array&& arr) + { + PyTypeObject* type = get_sparrow_array_type(); + if (type == nullptr) + { + return nullptr; + } + + SparrowArrayObject* obj = PyObject_New(SparrowArrayObject, type); + if (obj == nullptr) + { + return nullptr; + } + + try + { + obj->arr = new sparrow::array(std::move(arr)); + } + catch (const std::bad_alloc&) + { + Py_DECREF(obj); + PyErr_NoMemory(); + return nullptr; + } + catch (const std::exception& e) + { + Py_DECREF(obj); + PyErr_SetString(PyExc_RuntimeError, e.what()); + return nullptr; + } + + return reinterpret_cast(obj); + } + + PyObject* create_sparrow_array_object_from_capsules(PyObject* schema_capsule, PyObject* array_capsule) + { + try + { + sparrow::array arr = import_array_from_capsules(schema_capsule, array_capsule); + return create_sparrow_array_object(std::move(arr)); + } + catch (const std::exception& e) + { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return nullptr; + } + } + + int register_sparrow_array_type(PyObject* module) + { + PyTypeObject* type = get_sparrow_array_type(); + if (type == nullptr) + { + return -1; + } + + Py_INCREF(type); + if (PyModule_AddObject(module, "SparrowArray", reinterpret_cast(type)) < 0) + { + Py_DECREF(type); + return -1; + } + + return 0; + } + +} // namespace sparrow::pycapsule diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 0e7353f..50435b5 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -84,3 +84,81 @@ add_custom_target(run_tests_with_junit_report ) set_target_properties(run_tests_with_junit_report PROPERTIES FOLDER "Tests utilities") + +Python_add_library(test_sparrow_helper MODULE 
test_sparrow_helper_module.cpp) + +target_link_libraries(test_sparrow_helper + PRIVATE + sparrow-pycapsule + sparrow::sparrow +) + +target_compile_features(test_sparrow_helper PRIVATE cxx_std_20) + +if(MSVC) + target_compile_options(test_sparrow_helper PRIVATE /W4) +else() + target_compile_options(test_sparrow_helper PRIVATE -Wall -Wextra -Wpedantic) +endif() + +set_target_properties(test_sparrow_helper PROPERTIES + FOLDER tests + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/${CMAKE_BUILD_TYPE} + # Python modules must not have a debug suffix - Python won't find them + DEBUG_POSTFIX "" +) + +# Python integration test +# ======================= +find_package(Python COMPONENTS Interpreter QUIET) + +if(Python_Interpreter_FOUND) + # Add a test that runs the Python integration script + add_test( + NAME test_sparrow_integration + COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test_sparrow_integration.py + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + + # Set environment variables so Python can find the libraries + # Use generator expressions to get the actual library paths from targets + set_tests_properties(test_sparrow_integration PROPERTIES + ENVIRONMENT "TEST_SPARROW_HELPER_LIB_PATH=$;SPARROW_PYCAPSULE_LIB_PATH=$" + TIMEOUT 300 + DEPENDS test_sparrow_helper + ) + + message(STATUS "Added sparrow integration test (Python ${Python_VERSION})") +else() + message(WARNING "Python interpreter not found, skipping sparrow integration test") +endif() + +if(Python_Interpreter_FOUND) + add_custom_target(run_sparrow_tests_direct + COMMAND ${CMAKE_COMMAND} -E echo "==================================" + COMMAND ${CMAKE_COMMAND} -E echo "Sparrow Integration Test Runner" + COMMAND ${CMAKE_COMMAND} -E echo "==================================" + COMMAND ${CMAKE_COMMAND} -E echo "" + COMMAND ${CMAKE_COMMAND} -E echo "Checking Python dependencies..." 
+ COMMAND ${Python_EXECUTABLE} -c "import polars" || ${CMAKE_COMMAND} -E cmake_echo_color --red "ERROR: polars not installed. Install with: pip install polars" + COMMAND ${Python_EXECUTABLE} -c "import pyarrow" || ${CMAKE_COMMAND} -E cmake_echo_color --red "ERROR: pyarrow not installed. Install with: pip install pyarrow" + COMMAND ${CMAKE_COMMAND} -E echo "" + COMMAND ${CMAKE_COMMAND} -E echo "Library paths:" + COMMAND ${CMAKE_COMMAND} -E echo " TEST_SPARROW_HELPER_LIB_PATH=$" + COMMAND ${CMAKE_COMMAND} -E echo " SPARROW_PYCAPSULE_LIB_PATH=$" + COMMAND ${CMAKE_COMMAND} -E echo "" + COMMAND ${CMAKE_COMMAND} -E echo "Running tests..." + COMMAND ${CMAKE_COMMAND} -E echo "" + COMMAND ${CMAKE_COMMAND} -E env + "TEST_SPARROW_HELPER_LIB_PATH=$" + "SPARROW_PYCAPSULE_LIB_PATH=$" + ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test_sparrow_integration.py + COMMAND ${CMAKE_COMMAND} -E echo "" + DEPENDS test_sparrow_helper sparrow-pycapsule + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMENT "Running Sparrow integration tests directly" + USES_TERMINAL + ) + + set_target_properties(run_sparrow_tests_direct PROPERTIES FOLDER "Tests utilities") +endif() diff --git a/test/sparrow_helpers.py b/test/sparrow_helpers.py new file mode 100644 index 0000000..24d64b6 --- /dev/null +++ b/test/sparrow_helpers.py @@ -0,0 +1,72 @@ +""" +Python helper module for sparrow-pycapsule integration tests. + +This module provides helper functions that wrap the C++ SparrowArray class, +making it easy to create test arrays and perform roundtrip operations. +""" + +from __future__ import annotations + +import sys +import os +from pathlib import Path +from typing import Any, Protocol, Tuple + +class ArrowArrayExportable(Protocol): + """Protocol for objects implementing the Arrow PyCapsule Interface.""" + + def __arrow_c_array__( + self, requested_schema: Any = None + ) -> Tuple[Any, Any]: + """Export the array as Arrow PyCapsules. 
+ + Returns + ------- + Tuple[Any, Any] + A tuple of (schema_capsule, array_capsule). + """ + ... + + +class SparrowArrayType(ArrowArrayExportable, Protocol): + """Type definition for SparrowArray from C++ extension.""" + + def size(self) -> int: + """Get the number of elements in the array.""" + ... + + +def _setup_module_path() -> None: + """Add the build directory to Python path so we can import test_sparrow_helper.""" + # Check for environment variable first + helper_path = os.environ.get('TEST_SPARROW_HELPER_PATH') + if helper_path: + module_dir = Path(helper_path).parent + if module_dir.exists(): + sys.path.insert(0, str(module_dir)) + return + + # Try to find in build directory + test_dir = Path(__file__).parent + build_dirs = [ + test_dir.parent / "build" / "bin" / "Debug", + test_dir.parent / "build" / "bin" / "Release", + test_dir.parent / "build" / "bin", + ] + + for build_dir in build_dirs: + if build_dir.exists(): + sys.path.insert(0, str(build_dir)) + return + + raise ImportError( + "Could not find test_sparrow_helper module. " + "Build the project first or set TEST_SPARROW_HELPER_PATH." + ) + + +# Set up module path and import the C++ module +_setup_module_path() + +# Import the native Python extension module that provides SparrowArray +from test_sparrow_helper import SparrowArray # noqa: E402 diff --git a/test/test_sparrow_helper_module.cpp b/test/test_sparrow_helper_module.cpp new file mode 100644 index 0000000..acb701e --- /dev/null +++ b/test/test_sparrow_helper_module.cpp @@ -0,0 +1,86 @@ +#define PY_SSIZE_T_CLEAN +#include + +#include +#include + +#include +#include +#include + +#include +#include + +/** + * Create a test array and return a SparrowArray object. 
+ * + * Python signature: create_test_array() -> SparrowArray + */ +static PyObject* py_create_test_array(PyObject* self, PyObject* args) +{ + (void)self; + (void)args; + + try + { + // Create a test array with nullable integers + std::vector> values = { + sparrow::make_nullable(10, true), + sparrow::make_nullable(20, true), + sparrow::make_nullable(0, false), // null + sparrow::make_nullable(40, true), + sparrow::make_nullable(50, true) + }; + + sparrow::primitive_array prim_array(std::move(values)); + sparrow::array arr(std::move(prim_array)); + + // Return a SparrowArray object that implements __arrow_c_array__ + return sparrow::pycapsule::create_sparrow_array_object(std::move(arr)); + } + catch (const std::exception& e) + { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return nullptr; + } +} + +// Method definitions +static PyMethodDef TestSparrowHelperMethods[] = { + { + "create_test_array", + py_create_test_array, + METH_NOARGS, + "Create a test array and return a SparrowArray object implementing __arrow_c_array__." 
+ }, + {nullptr, nullptr, 0, nullptr} // Sentinel +}; + +// Module definition +static struct PyModuleDef test_sparrow_helper_module = { + PyModuleDef_HEAD_INIT, + "test_sparrow_helper", // Module name + "Native Python extension providing SparrowArray type for Arrow data exchange.\n" + "Higher-level helpers are available in sparrow_helpers.py.", + -1, // Module state size (-1 = no state) + TestSparrowHelperMethods +}; + +// Module initialization function +PyMODINIT_FUNC PyInit_test_sparrow_helper(void) +{ + PyObject* module = PyModule_Create(&test_sparrow_helper_module); + if (module == nullptr) + { + return nullptr; + } + + // Register the SparrowArray type with this module + if (sparrow::pycapsule::register_sparrow_array_type(module) < 0) + { + Py_DECREF(module); + return nullptr; + } + + return module; +} diff --git a/test/test_sparrow_integration.py b/test/test_sparrow_integration.py new file mode 100644 index 0000000..a249752 --- /dev/null +++ b/test/test_sparrow_integration.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 +""" +Integration test for sparrow-pycapsule with Polars and PyArrow. + +This test demonstrates: +1. Sparrow → Polars: Create array in C++ (sparrow), import to Polars +2. PyArrow → Sparrow: Create array in PyArrow, import to sparrow + +The C++ SparrowArray class implements the Arrow PyCapsule Interface (__arrow_c_array__), +allowing direct integration with Polars without going through PyArrow. +""" + +import sys + +import pytest +import polars as pl +import pyarrow as pa +from polars._plr import PySeries +from polars._utils.wrap import wrap_s + + +# Import helpers from our Python module +from sparrow_helpers import ( + ArrowArrayExportable, + SparrowArray, + SparrowArrayType, +) + +# Import the C++ module for create_test_array +import test_sparrow_helper # noqa: E402 + + +def arrow_array_to_series( + arrow_array: ArrowArrayExportable, name: str = "" +) -> pl.Series: + """ + Convert an object implementing __arrow_c_array__ to a Polars Series. 
+ + This function uses Polars' internal PySeries.from_arrow_c_array to create + a Series directly from an Arrow array. + + Parameters + ---------- + arrow_array : ArrowArrayExportable + An object that implements __arrow_c_array__ method. + name : str, optional + Name for the resulting Series. Default is empty string. + + Returns + ------- + pl.Series + A Polars Series containing the array data. + """ + ps = PySeries.from_arrow_c_array(arrow_array) + series = wrap_s(ps) + if name: + series = series.alias(name) + return series + + +# ============================================================================= +# Test 1: Sparrow → Polars (Create array in C++, import to Polars) +# ============================================================================= + + +class TestSparrowToPolars: + """Test creating an array in C++ (sparrow) and importing to Polars.""" + + def test_create_sparrow_array(self): + """Create a SparrowArray in C++ that implements __arrow_c_array__.""" + sparrow_array = test_sparrow_helper.create_test_array() + + assert sparrow_array is not None, "Received null SparrowArray from C++" + assert hasattr(sparrow_array, "__arrow_c_array__"), ( + "SparrowArray missing __arrow_c_array__ method" + ) + assert sparrow_array.size() == 5, f"Expected size 5, got {sparrow_array.size()}" + + def test_sparrow_array_type(self): + """Verify that created array is a sparrow.SparrowArray instance.""" + sparrow_array = test_sparrow_helper.create_test_array() + + # Check the type name + type_name = type(sparrow_array).__name__ + assert type_name == "SparrowArray", ( + f"Expected type 'SparrowArray', got '{type_name}'" + ) + + # Check the module-qualified name + full_name = f"{type(sparrow_array).__module__}.{type_name}" + assert full_name == "sparrow.SparrowArray", ( + f"Expected 'sparrow.SparrowArray', got '{full_name}'" + ) + + def test_sparrow_to_polars_series(self): + """Convert SparrowArray to Polars Series using the Arrow PyCapsule Interface.""" + sparrow_array = 
test_sparrow_helper.create_test_array() + polars_series = arrow_array_to_series(sparrow_array) + + assert polars_series.dtype == pl.Int32, ( + f"Expected Int32, got {polars_series.dtype}" + ) + expected = [10, 20, None, 40, 50] + actual = polars_series.to_list() + assert expected == actual, ( + f"Data mismatch! Expected: {expected}, Actual: {actual}" + ) + + def test_sparrow_to_polars_preserves_nulls(self): + """Verify that null values from sparrow are preserved in Polars.""" + sparrow_array = test_sparrow_helper.create_test_array() + polars_series = arrow_array_to_series(sparrow_array) + + # The test array has a null at index 2 + values = polars_series.to_list() + assert values[2] is None, "Null value not preserved at index 2" + + +# ============================================================================= +# Test 2: PyArrow → Sparrow (Create array in PyArrow, import to sparrow) +# ============================================================================= + + +class TestPyArrowToSparrow: + """Test creating an array in PyArrow and importing to sparrow.""" + + def test_create_sparrow_array_from_pyarrow(self): + """Create a SparrowArray directly from a PyArrow array using the constructor.""" + # Create a PyArrow array + pa_array = pa.array([100, 200, None, 400, 500], type=pa.int32()) + + # Create SparrowArray directly using the type constructor + sparrow_array = SparrowArray(pa_array) + + # Verify it's a SparrowArray + assert type(sparrow_array).__name__ == "SparrowArray" + assert sparrow_array.size() == 5 + + # Verify we can convert it to Polars + polars_series = arrow_array_to_series(sparrow_array) + expected = [100, 200, None, 400, 500] + assert polars_series.to_list() == expected + + def test_pyarrow_to_sparrow(self): + """Import PyArrow array to sparrow using Arrow PyCapsule Interface.""" + # Create a PyArrow array + pa_array = pa.array([100, 200, None, 400, 500], type=pa.int32()) + + # Verify sparrow can import and read the data via __arrow_c_array__ + + 
sparrow_array: SparrowArrayType = SparrowArray(pa_array) + assert sparrow_array.size() == 5 + + def test_pyarrow_roundtrip_through_sparrow(self): + """Round-trip: PyArrow → sparrow → Polars.""" + # Create a PyArrow array + pa_array = pa.array([1, 2, None, 4, 5], type=pa.int32()) + + # Round-trip through sparrow (import then export as SparrowArray) + sparrow_array = SparrowArray(pa_array) + + # Import the result into Polars + result_series = arrow_array_to_series(sparrow_array) + + # Verify data matches + expected = [1, 2, None, 4, 5] + actual = result_series.to_list() + assert expected == actual, ( + f"Data mismatch! Expected: {expected}, Actual: {actual}" + ) + + def test_pyarrow_nulls_preserved_in_sparrow(self): + """Verify that null values from PyArrow are preserved through sparrow.""" + # Create a PyArrow array with nulls + pa_array = pa.array([None, 1, None, 3, None], type=pa.int32()) + + # Round-trip through sparrow + sparrow_array = SparrowArray(pa_array) + + # Import into Polars + result_series = arrow_array_to_series(sparrow_array) + + # Check null positions + values = result_series.to_list() + assert values[0] is None, "Null not preserved at index 0" + assert values[1] == 1, "Value changed at index 1" + assert values[2] is None, "Null not preserved at index 2" + assert values[3] == 3, "Value changed at index 3" + assert values[4] is None, "Null not preserved at index 4" + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "-v"]))