Merge changes from "New Arrow C-API #18246"

evertlammerts · web-flow · commit 0f67f6bb1665 · 2025-07-21T10:51:50.000+02:00
diff --git a/external/duckdb b/external/duckdb
@@ -1 +1 @@
-Subproject commit 641e95d140ca7728085445a919c3e8d436aaf0c1
+Subproject commit 61f07c3221e01674d8fe40b4a25364ba2f3159a7
diff --git a/src/duckdb_py/arrow/arrow_array_stream.cpp b/src/duckdb_py/arrow/arrow_array_stream.cpp
@@ -41,10 +41,8 @@ py::object PythonTableArrowArrayStreamFactory::ProduceScanner(DBConfig &config,
 	D_ASSERT(!py::isinstance<py::capsule>(arrow_obj_handle));
 	ArrowSchemaWrapper schema;
 	PythonTableArrowArrayStreamFactory::GetSchemaInternal(arrow_obj_handle, schema);
-	vector<string> unused_names;
-	vector<LogicalType> unused_types;
-	ArrowTableType arrow_table;
-	ArrowTableFunction::PopulateArrowTableType(config, arrow_table, schema, unused_names, unused_types);
+	ArrowTableSchema arrow_table;
+	ArrowTableFunction::PopulateArrowTableSchema(config, arrow_table, schema.arrow_schema);
 
 	auto filters = parameters.filters;
 	auto &column_list = parameters.projected_columns.columns;
@@ -466,7 +464,7 @@ py::object PythonTableArrowArrayStreamFactory::TransformFilter(TableFilterSet &f
                                                                std::unordered_map<idx_t, string> &columns,
                                                                unordered_map<idx_t, idx_t> filter_to_col,
                                                                const ClientProperties &config,
-                                                               const ArrowTableType &arrow_table) {
+                                                               const ArrowTableSchema &arrow_table) {
 	auto &filters_map = filter_collection.filters;
 
 	py::object expression = py::none();
diff --git a/src/duckdb_py/include/duckdb_python/arrow/arrow_array_stream.hpp b/src/duckdb_py/include/duckdb_python/arrow/arrow_array_stream.hpp
@@ -89,7 +89,7 @@ class PythonTableArrowArrayStreamFactory {
 	//! We transform a TableFilterSet to an Arrow Expression Object
 	static py::object TransformFilter(TableFilterSet &filters, std::unordered_map<idx_t, string> &columns,
 	                                  unordered_map<idx_t, idx_t> filter_to_col,
-	                                  const ClientProperties &client_properties, const ArrowTableType &arrow_table);
+	                                  const ClientProperties &client_properties, const ArrowTableSchema &arrow_table);
 
 	static py::object ProduceScanner(DBConfig &config, py::object &arrow_scanner, py::handle &arrow_obj_handle,
 	                                 ArrowStreamParameters &parameters, const ClientProperties &client_properties);
diff --git a/tests/fast/adbc/test_adbc.py b/tests/fast/adbc/test_adbc.py
@@ -3,6 +3,7 @@
 import sys
 import datetime
 import os
+import numpy as np
 
 if sys.version_info < (3, 9):
     pytest.skip(
@@ -224,7 +225,7 @@ def test_insertion(duck_conn):
     with duck_conn.cursor() as cursor:
         with pytest.raises(
             adbc_driver_manager_lib.InternalError,
-            match=r'Failed to create table \'ingest_table\': Table with name "ingest_table" already exists!',
+            match=r'Table with name "ingest_table" already exists!',
         ):
             cursor.adbc_ingest("ingest_table", table, "create")
         cursor.adbc_ingest("ingest_table", table, "append")
@@ -277,6 +278,93 @@ def test_read(duck_conn):
         }
 
 
+def test_large_chunk(tmp_path):
+    num_chunks = 3
+    chunk_size = 10_000
+
+    # Create data for each chunk
+    chunks_col1 = [pyarrow.array(np.random.randint(0, 100, chunk_size)) for _ in range(num_chunks)]
+    chunks_col2 = [pyarrow.array(np.random.rand(chunk_size)) for _ in range(num_chunks)]
+    chunks_col3 = [
+        pyarrow.array([f"str_{i}" for i in range(j * chunk_size, (j + 1) * chunk_size)]) for j in range(num_chunks)
+    ]
+
+    # Create chunked arrays
+    col1 = pyarrow.chunked_array(chunks_col1)
+    col2 = pyarrow.chunked_array(chunks_col2)
+    col3 = pyarrow.chunked_array(chunks_col3)
+
+    # Create the table
+    table = pyarrow.table([col1, col2, col3], names=["ints", "floats", "strings"])
+
+    db = os.path.join(tmp_path, "tmp.db")
+    if os.path.exists(db):
+        os.remove(db)
+    db_kwargs = {"path": f"{db}"}
+    with adbc_driver_manager.connect(
+        driver=driver_path,
+        entrypoint="duckdb_adbc_init",
+        db_kwargs=db_kwargs,
+        autocommit=True,
+    ) as conn:
+        with conn.cursor() as cur:
+            cur.adbc_ingest("ingest", table, "create")
+            cur.execute("SELECT count(*) from ingest")
+            assert cur.fetch_arrow_table().to_pydict() == {'count_star()': [30_000]}
+
+
+def test_dictionary_data(tmp_path):
+    data = ['apple', 'banana', 'apple', 'orange', 'banana', 'banana']
+
+    dict_type = pyarrow.dictionary(index_type=pyarrow.int32(), value_type=pyarrow.string())
+    dict_array = pyarrow.array(data, type=dict_type)
+
+    # Wrap in a table
+    table = pyarrow.table({'fruits': dict_array})
+    db = os.path.join(tmp_path, "tmp.db")
+    if os.path.exists(db):
+        os.remove(db)
+    db_kwargs = {"path": f"{db}"}
+    with adbc_driver_manager.connect(
+        driver=driver_path,
+        entrypoint="duckdb_adbc_init",
+        db_kwargs=db_kwargs,
+        autocommit=True,
+    ) as conn:
+        with conn.cursor() as cur:
+            cur.adbc_ingest("ingest", table, "create")
+            cur.execute("from ingest")
+            assert cur.fetch_arrow_table().to_pydict() == {
+                'fruits': ['apple', 'banana', 'apple', 'orange', 'banana', 'banana']
+            }
+
+
+def test_ree_data(tmp_path):
+    run_ends = pyarrow.array([3, 5, 6], type=pyarrow.int32())  # positions: [0-2], [3-4], [5]
+    values = pyarrow.array(["apple", "banana", "orange"], type=pyarrow.string())
+
+    ree_array = pyarrow.RunEndEncodedArray.from_arrays(run_ends, values)
+
+    table = pyarrow.table({"fruits": ree_array})
+
+    db = os.path.join(tmp_path, "tmp.db")
+    if os.path.exists(db):
+        os.remove(db)
+    db_kwargs = {"path": f"{db}"}
+    with adbc_driver_manager.connect(
+        driver=driver_path,
+        entrypoint="duckdb_adbc_init",
+        db_kwargs=db_kwargs,
+        autocommit=True,
+    ) as conn:
+        with conn.cursor() as cur:
+            cur.adbc_ingest("ingest", table, "create")
+            cur.execute("from ingest")
+            assert cur.fetch_arrow_table().to_pydict() == {
+                'fruits': ['apple', 'apple', 'apple', 'banana', 'banana', 'orange']
+            }
+
+
 def sorted_get_objects(catalogs):
     res = []
     for catalog in sorted(catalogs, key=lambda cat: cat['catalog_name']):