
Commit d699b11

feat: add Arrow support to PostgreSQL adapters (asyncpg, psycopg, psqlpy) (#157)
Tests & cleanup for Arrow integration
Parent: f17aefe

6 files changed (+699, -47 lines)

sqlspec/adapters/adbc/config.py

Lines changed: 0 additions & 15 deletions

@@ -77,23 +77,12 @@ class AdbcDriverFeatures(TypedDict):
             When True, preserves Arrow extension type metadata when reading data.
             When False, falls back to storage types.
             Default: True
-        enable_arrow_results: Enable native Arrow query results.
-            When True, select_to_arrow() uses cursor.fetch_arrow_table() for
-            zero-copy data transfer (5-10x faster for large datasets).
-            When False, falls back to dict conversion path.
-            Default: True
-        arrow_batch_size: Batch size for Arrow result streaming.
-            Number of rows per batch when streaming Arrow results.
-            Used for future streaming implementation.
-            Default: 1024
     """

     json_serializer: "NotRequired[Callable[[Any], str]]"
     enable_cast_detection: NotRequired[bool]
     strict_type_coercion: NotRequired[bool]
     arrow_extension_types: NotRequired[bool]
-    enable_arrow_results: NotRequired[bool]
-    arrow_batch_size: NotRequired[int]


 __all__ = ("AdbcConfig", "AdbcConnectionParams", "AdbcDriverFeatures")

@@ -158,10 +147,6 @@ def __init__(
             driver_features["strict_type_coercion"] = False
         if "arrow_extension_types" not in driver_features:
             driver_features["arrow_extension_types"] = True
-        if "enable_arrow_results" not in driver_features:
-            driver_features["enable_arrow_results"] = True
-        if "arrow_batch_size" not in driver_features:
-            driver_features["arrow_batch_size"] = 1024

         super().__init__(
             connection_config=self.connection_config,
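
With enable_arrow_results and arrow_batch_size removed, Arrow results no longer need opting in for the ADBC adapter; only the feature keys shown above remain in AdbcDriverFeatures. A minimal configuration sketch under that assumption (the import path mirrors the other adapters and the URI is illustrative; neither is taken from this commit):

# Hypothetical usage sketch; feature keys follow the diff above.
from sqlspec.adapters.adbc import AdbcConfig

config = AdbcConfig(
    connection_config={"uri": "postgresql://localhost:5432/mydb"},
    driver_features={
        "arrow_extension_types": True,  # still defaulted to True in __init__
        "strict_type_coercion": False,  # still defaulted to False in __init__
    },
)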

sqlspec/adapters/bigquery/config.py

Lines changed: 0 additions & 17 deletions

@@ -78,16 +78,6 @@ class BigQueryDriverFeatures(TypedDict):
         enable_uuid_conversion: Enable automatic UUID string conversion.
             When True (default), UUID strings are automatically converted to UUID objects.
             When False, UUID strings are treated as regular strings.
-        enable_arrow_results: Enable native Arrow query results via Storage API.
-            When True (default), select_to_arrow() uses query_job.to_arrow() with
-            Storage API for zero-copy data transfer (5-10x faster for large datasets).
-            Requires google-cloud-bigquery-storage package and API enabled.
-            Falls back to dict conversion if Storage API unavailable.
-            Default: True
-        arrow_batch_size: Batch size for Arrow result streaming.
-            Number of rows per batch when streaming Arrow results.
-            Used for future streaming implementation.
-            Default: 1024
     """

     connection_instance: NotRequired["BigQueryConnection"]

@@ -96,8 +86,6 @@ class BigQueryDriverFeatures(TypedDict):
     on_connection_create: NotRequired["Callable[[Any], None]"]
     json_serializer: NotRequired["Callable[[Any], str]"]
     enable_uuid_conversion: NotRequired[bool]
-    enable_arrow_results: NotRequired[bool]
-    arrow_batch_size: NotRequired[int]


 __all__ = ("BigQueryConfig", "BigQueryConnectionParams", "BigQueryDriverFeatures")

@@ -149,11 +137,6 @@ def __init__(

         self.driver_features["json_serializer"] = to_json

-        if "enable_arrow_results" not in self.driver_features:
-            self.driver_features["enable_arrow_results"] = True
-        if "arrow_batch_size" not in self.driver_features:
-            self.driver_features["arrow_batch_size"] = 1024
-
         self._connection_instance: BigQueryConnection | None = self.driver_features.get("connection_instance")

         if "default_query_job_config" not in self.connection_config:

sqlspec/adapters/duckdb/config.py

Lines changed: 0 additions & 15 deletions

@@ -121,24 +121,13 @@ class DuckDBDriverFeatures(TypedDict):
         enable_uuid_conversion: Enable automatic UUID string conversion.
             When True (default), UUID strings are automatically converted to UUID objects.
             When False, UUID strings are treated as regular strings.
-        enable_arrow_results: Enable native Arrow query results.
-            When True (default), select_to_arrow() uses cursor.arrow() for
-            zero-copy data transfer. DuckDB has the fastest Arrow path due to
-            its columnar architecture.
-            Default: True
-        arrow_batch_size: Batch size for Arrow result streaming.
-            Number of rows per batch when streaming Arrow results.
-            Used for future streaming implementation.
-            Default: 1024
     """

     extensions: NotRequired[Sequence[DuckDBExtensionConfig]]
     secrets: NotRequired[Sequence[DuckDBSecretConfig]]
     on_connection_create: NotRequired["Callable[[DuckDBConnection], DuckDBConnection | None]"]
     json_serializer: NotRequired["Callable[[Any], str]"]
     enable_uuid_conversion: NotRequired[bool]
-    enable_arrow_results: NotRequired[bool]
-    arrow_batch_size: NotRequired[int]


 class DuckDBConfig(SyncDatabaseConfig[DuckDBConnection, DuckDBConnectionPool, DuckDBDriver]):

@@ -223,10 +212,6 @@ def __init__(
         processed_features = dict(driver_features) if driver_features else {}
         if "enable_uuid_conversion" not in processed_features:
             processed_features["enable_uuid_conversion"] = True
-        if "enable_arrow_results" not in processed_features:
-            processed_features["enable_arrow_results"] = True
-        if "arrow_batch_size" not in processed_features:
-            processed_features["arrow_batch_size"] = 1024

         super().__init__(
             bind_key=bind_key,
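
The removed DuckDB docstring referred to a cursor.arrow() fast path. For reference, this is what that native path looks like with the duckdb package used directly (a standalone sketch, not code from this commit):

# Standalone sketch of DuckDB's native Arrow path using the duckdb package.
import duckdb

con = duckdb.connect()  # in-memory database
con.execute("CREATE TABLE t AS SELECT range AS id, range * 10 AS value FROM range(5)")

# .arrow() materializes the result as a pyarrow.Table without a
# row-by-row Python conversion step.
table = con.execute("SELECT * FROM t ORDER BY id").arrow()
print(table.num_rows, table.column_names)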
Lines changed: 234 additions & 0 deletions

@@ -0,0 +1,234 @@ (new file)

"""Integration tests for asyncpg Arrow support."""

import pytest
from pytest_databases.docker.postgres import PostgresService

from sqlspec._typing import PYARROW_INSTALLED
from sqlspec.adapters.asyncpg import AsyncpgConfig

pytestmark = [
    pytest.mark.xdist_group("postgres"),
    pytest.mark.skipif(not PYARROW_INSTALLED, reason="pyarrow not installed"),
]


@pytest.fixture
async def asyncpg_config(postgres_service: PostgresService) -> AsyncpgConfig:
    """Create AsyncPG configuration for testing."""
    return AsyncpgConfig(
        pool_config={
            "dsn": f"postgres://{postgres_service.user}:{postgres_service.password}@{postgres_service.host}:{postgres_service.port}/{postgres_service.database}",
            "min_size": 1,
            "max_size": 2,
        }
    )


async def test_select_to_arrow_basic(asyncpg_config: AsyncpgConfig) -> None:
    """Test basic select_to_arrow functionality."""
    import pyarrow as pa

    try:
        async with asyncpg_config.provide_session() as session:
            # Create test table with unique name
            await session.execute("DROP TABLE IF EXISTS arrow_users CASCADE")
            await session.execute("CREATE TABLE arrow_users (id INTEGER, name TEXT, age INTEGER)")
            await session.execute("INSERT INTO arrow_users VALUES (1, 'Alice', 30), (2, 'Bob', 25)")

            # Test Arrow query
            result = await session.select_to_arrow("SELECT * FROM arrow_users ORDER BY id")

            assert result is not None
            assert isinstance(result.data, (pa.Table, pa.RecordBatch))
            assert result.rows_affected == 2

            # Convert to pandas and verify
            df = result.to_pandas()
            assert len(df) == 2
            assert list(df["name"]) == ["Alice", "Bob"]
            assert list(df["age"]) == [30, 25]
    finally:
        await asyncpg_config.close_pool()


async def test_select_to_arrow_table_format(asyncpg_config: AsyncpgConfig) -> None:
    """Test select_to_arrow with table return format (default)."""
    import pyarrow as pa

    try:
        async with asyncpg_config.provide_session() as session:
            await session.execute("DROP TABLE IF EXISTS arrow_table_test CASCADE")
            await session.execute("CREATE TABLE arrow_table_test (id INTEGER, value TEXT)")
            await session.execute("INSERT INTO arrow_table_test VALUES (1, 'a'), (2, 'b'), (3, 'c')")

            result = await session.select_to_arrow("SELECT * FROM arrow_table_test ORDER BY id", return_format="table")

            assert isinstance(result.data, pa.Table)
            assert result.rows_affected == 3
    finally:
        await asyncpg_config.close_pool()


async def test_select_to_arrow_batch_format(asyncpg_config: AsyncpgConfig) -> None:
    """Test select_to_arrow with batch return format."""
    import pyarrow as pa

    try:
        async with asyncpg_config.provide_session() as session:
            await session.execute("DROP TABLE IF EXISTS arrow_batch_test CASCADE")
            await session.execute("CREATE TABLE arrow_batch_test (id INTEGER, value TEXT)")
            await session.execute("INSERT INTO arrow_batch_test VALUES (1, 'a'), (2, 'b')")

            result = await session.select_to_arrow(
                "SELECT * FROM arrow_batch_test ORDER BY id", return_format="batches"
            )

            assert isinstance(result.data, pa.RecordBatch)
            assert result.rows_affected == 2
    finally:
        await asyncpg_config.close_pool()


async def test_select_to_arrow_with_parameters(asyncpg_config: AsyncpgConfig) -> None:
    """Test select_to_arrow with query parameters."""
    try:
        async with asyncpg_config.provide_session() as session:
            await session.execute("DROP TABLE IF EXISTS arrow_params_test CASCADE")
            await session.execute("CREATE TABLE arrow_params_test (id INTEGER, value INTEGER)")
            await session.execute("INSERT INTO arrow_params_test VALUES (1, 100), (2, 200), (3, 300)")

            # Test with parameterized query
            result = await session.select_to_arrow("SELECT * FROM arrow_params_test WHERE value > $1 ORDER BY id", 150)

            assert result.rows_affected == 2
            df = result.to_pandas()
            assert list(df["value"]) == [200, 300]
    finally:
        await asyncpg_config.close_pool()


async def test_select_to_arrow_empty_result(asyncpg_config: AsyncpgConfig) -> None:
    """Test select_to_arrow with empty result set."""
    try:
        async with asyncpg_config.provide_session() as session:
            await session.execute("DROP TABLE IF EXISTS arrow_empty_test CASCADE")
            await session.execute("CREATE TABLE arrow_empty_test (id INTEGER)")

            result = await session.select_to_arrow("SELECT * FROM arrow_empty_test")

            assert result.rows_affected == 0
            assert len(result.to_pandas()) == 0
    finally:
        await asyncpg_config.close_pool()


async def test_select_to_arrow_null_handling(asyncpg_config: AsyncpgConfig) -> None:
    """Test select_to_arrow with NULL values."""
    try:
        async with asyncpg_config.provide_session() as session:
            await session.execute("DROP TABLE IF EXISTS arrow_null_test CASCADE")
            await session.execute("CREATE TABLE arrow_null_test (id INTEGER, value TEXT)")
            await session.execute("INSERT INTO arrow_null_test VALUES (1, 'a'), (2, NULL), (3, 'c')")

            result = await session.select_to_arrow("SELECT * FROM arrow_null_test ORDER BY id")

            df = result.to_pandas()
            assert len(df) == 3
            assert df.iloc[1]["value"] is None or df.isna().iloc[1]["value"]
    finally:
        await asyncpg_config.close_pool()


async def test_select_to_arrow_to_polars(asyncpg_config: AsyncpgConfig) -> None:
    """Test select_to_arrow conversion to Polars DataFrame."""
    pytest.importorskip("polars")

    try:
        async with asyncpg_config.provide_session() as session:
            await session.execute("DROP TABLE IF EXISTS arrow_polars_test CASCADE")
            await session.execute("CREATE TABLE arrow_polars_test (id INTEGER, value TEXT)")
            await session.execute("INSERT INTO arrow_polars_test VALUES (1, 'a'), (2, 'b')")

            result = await session.select_to_arrow("SELECT * FROM arrow_polars_test ORDER BY id")
            df = result.to_polars()

            assert len(df) == 2
            assert df["value"].to_list() == ["a", "b"]
    finally:
        await asyncpg_config.close_pool()


async def test_select_to_arrow_large_dataset(asyncpg_config: AsyncpgConfig) -> None:
    """Test select_to_arrow with larger dataset."""
    try:
        async with asyncpg_config.provide_session() as session:
            await session.execute("DROP TABLE IF EXISTS arrow_large_test CASCADE")
            await session.execute("CREATE TABLE arrow_large_test (id INTEGER, value INTEGER)")

            # Insert 1000 rows
            values = ", ".join(f"({i}, {i * 10})" for i in range(1, 1001))
            await session.execute(f"INSERT INTO arrow_large_test VALUES {values}")

            result = await session.select_to_arrow("SELECT * FROM arrow_large_test ORDER BY id")

            assert result.rows_affected == 1000
            df = result.to_pandas()
            assert len(df) == 1000
            assert df["value"].sum() == sum(i * 10 for i in range(1, 1001))
    finally:
        await asyncpg_config.close_pool()


async def test_select_to_arrow_type_preservation(asyncpg_config: AsyncpgConfig) -> None:
    """Test that PostgreSQL types are properly converted to Arrow types."""
    try:
        async with asyncpg_config.provide_session() as session:
            await session.execute("DROP TABLE IF EXISTS arrow_types_test CASCADE")
            await session.execute(
                """
                CREATE TABLE arrow_types_test (
                    id INTEGER,
                    name TEXT,
                    price NUMERIC,
                    created_at TIMESTAMP,
                    is_active BOOLEAN
                )
                """
            )
            await session.execute(
                """
                INSERT INTO arrow_types_test VALUES
                (1, 'Item 1', 19.99, '2025-01-01 10:00:00', true),
                (2, 'Item 2', 29.99, '2025-01-02 15:30:00', false)
                """
            )

            result = await session.select_to_arrow("SELECT * FROM arrow_types_test ORDER BY id")

            df = result.to_pandas()
            assert len(df) == 2
            assert df["name"].dtype == object
            assert df["is_active"].dtype == bool
    finally:
        await asyncpg_config.close_pool()


async def test_select_to_arrow_postgres_array(asyncpg_config: AsyncpgConfig) -> None:
    """Test PostgreSQL array type handling in Arrow results."""
    try:
        async with asyncpg_config.provide_session() as session:
            await session.execute("DROP TABLE IF EXISTS arrow_array_test CASCADE")
            await session.execute("CREATE TABLE arrow_array_test (id INTEGER, tags TEXT[])")
            await session.execute(
                "INSERT INTO arrow_array_test VALUES (1, ARRAY['python', 'rust']), (2, ARRAY['js', 'ts'])"
            )

            result = await session.select_to_arrow("SELECT * FROM arrow_array_test ORDER BY id")

            # PostgreSQL arrays are returned as Python lists in dict format,
            # which Arrow converts to list type
            df = result.to_pandas()
            assert len(df) == 2
            assert isinstance(df["tags"].iloc[0], (list, object))
    finally:
        await asyncpg_config.close_pool()
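
Outside a test suite, the pattern the tests exercise boils down to a few lines. A usage sketch distilled from the fixtures and assertions above (the DSN is illustrative):

# Usage sketch; API calls mirror the tests above, the DSN is made up.
import asyncio

from sqlspec.adapters.asyncpg import AsyncpgConfig


async def main() -> None:
    config = AsyncpgConfig(pool_config={"dsn": "postgres://user:secret@localhost:5432/appdb"})
    try:
        async with config.provide_session() as session:
            result = await session.select_to_arrow("SELECT 1 AS id, 'Alice' AS name")
            table = result.data  # a pyarrow.Table with the default return_format
            print(table.num_rows, table.column_names)
    finally:
        await config.close_pool()


asyncio.run(main())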
