2 changes: 1 addition & 1 deletion external/duckdb
Submodule duckdb updated 43 files
+34 −0 .github/patches/extensions/excel/excel_copy.patch
+35 −0 .github/patches/extensions/spatial/arrow_capi.patch
+2 −2 extension/parquet/parquet_reader.cpp
+5 −2 extension/parquet/parquet_writer.cpp
+188 −99 src/common/adbc/adbc.cpp
+2 −0 src/common/arrow/arrow_converter.cpp
+19 −0 src/common/enum_util.cpp
+20 −10 src/function/table/arrow.cpp
+2 −3 src/function/table/arrow/arrow_array_scan_state.cpp
+24 −2 src/function/table/arrow/arrow_duck_schema.cpp
+166 −176 src/function/table/arrow_conversion.cpp
+107 −1 src/include/duckdb.h
+68 −1 src/include/duckdb/common/adbc/adbc.hpp
+10 −0 src/include/duckdb/common/arrow/arrow_wrapper.hpp
+8 −0 src/include/duckdb/common/enum_util.hpp
+3 −3 src/include/duckdb/common/multi_file/multi_file_function.hpp
+9 −1 src/include/duckdb/function/copy_function.hpp
+25 −10 src/include/duckdb/function/table/arrow.hpp
+8 −2 src/include/duckdb/function/table/arrow/arrow_duck_schema.hpp
+2 −0 src/include/duckdb/function/table/arrow/arrow_type_info.hpp
+5 −0 src/include/duckdb/main/capi/capi_internal.hpp
+25 −0 src/include/duckdb/main/capi/extension_api.hpp
+11 −0 src/include/duckdb/main/capi/header_generation/apis/v1/unstable/new_arrow_functions.json
+3 −1 src/include/duckdb/main/capi/header_generation/apis/v1/unstable/new_open_connect_functions.json
+7 −0 src/include/duckdb/main/capi/header_generation/apis/v1/unstable/new_query_execution_functions.json
+154 −1 src/include/duckdb/main/capi/header_generation/functions/arrow_interface.json
+37 −0 src/include/duckdb/main/capi/header_generation/functions/open_connect.json
+17 −0 src/include/duckdb/main/capi/header_generation/functions/query_execution.json
+24 −1 src/include/duckdb/main/capi/header_generation/header_base.hpp.template
+33 −0 src/include/duckdb_extension.h
+142 −0 src/main/capi/arrow-c.cpp
+19 −0 src/main/capi/duckdb-c.cpp
+12 −0 src/main/capi/result-c.cpp
+2 −2 src/planner/binder/statement/bind_copy.cpp
+51 −44 test/api/adbc/test_adbc.cpp
+156 −0 test/api/capi/test_capi_arrow.cpp
+5 −0 test/helpers/test_config.cpp
+4 −0 test/helpers/test_helpers.cpp
+1 −0 test/include/test_config.hpp
+1 −1 test/sql/copy/parquet/parquet_stats.test
+3 −5 tools/pythonpkg/src/arrow/arrow_array_stream.cpp
+1 −1 tools/pythonpkg/src/include/duckdb_python/arrow/arrow_array_stream.hpp
+89 −1 tools/pythonpkg/tests/fast/adbc/test_adbc.py
8 changes: 3 additions & 5 deletions src/duckdb_py/arrow/arrow_array_stream.cpp
@@ -41,10 +41,8 @@ py::object PythonTableArrowArrayStreamFactory::ProduceScanner(DBConfig &config,
	D_ASSERT(!py::isinstance<py::capsule>(arrow_obj_handle));
	ArrowSchemaWrapper schema;
	PythonTableArrowArrayStreamFactory::GetSchemaInternal(arrow_obj_handle, schema);
-	vector<string> unused_names;
-	vector<LogicalType> unused_types;
-	ArrowTableType arrow_table;
-	ArrowTableFunction::PopulateArrowTableType(config, arrow_table, schema, unused_names, unused_types);
+	ArrowTableSchema arrow_table;
+	ArrowTableFunction::PopulateArrowTableSchema(config, arrow_table, schema.arrow_schema);

	auto filters = parameters.filters;
	auto &column_list = parameters.projected_columns.columns;
@@ -466,7 +464,7 @@ py::object PythonTableArrowArrayStreamFactory::TransformFilter(TableFilterSet &f
                                                               std::unordered_map<idx_t, string> &columns,
                                                               unordered_map<idx_t, idx_t> filter_to_col,
                                                               const ClientProperties &config,
-                                                              const ArrowTableType &arrow_table) {
+                                                              const ArrowTableSchema &arrow_table) {
	auto &filters_map = filter_collection.filters;

	py::object expression = py::none();
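For context: scanning an in-scope pyarrow object from SQL runs through PythonTableArrowArrayStreamFactory, so the rename above sits on the path of every such scan. A minimal sketch of that path, assuming only the documented duckdb Python replacement scan (the internal routing through PopulateArrowTableSchema is inferred from the hunks above):

import duckdb
import pyarrow as pa

tbl = pa.table({"i": [1, 2, 3]})

con = duckdb.connect()
# Referencing the in-scope pyarrow table by name triggers the replacement scan;
# per this diff, the factory now fills an ArrowTableSchema via a single
# PopulateArrowTableSchema call instead of PopulateArrowTableType plus
# throwaway name/type vectors.
print(con.sql("SELECT sum(i) FROM tbl").fetchall())  # [(6,)]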
@@ -89,7 +89,7 @@ class PythonTableArrowArrayStreamFactory {
	//! We transform a TableFilterSet to an Arrow Expression Object
	static py::object TransformFilter(TableFilterSet &filters, std::unordered_map<idx_t, string> &columns,
	                                  unordered_map<idx_t, idx_t> filter_to_col,
-	                                  const ClientProperties &client_properties, const ArrowTableType &arrow_table);
+	                                  const ClientProperties &client_properties, const ArrowTableSchema &arrow_table);

	static py::object ProduceScanner(DBConfig &config, py::object &arrow_scanner, py::handle &arrow_obj_handle,
	                                 ArrowStreamParameters &parameters, const ClientProperties &client_properties);
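The //! doc comment above states the contract: a DuckDB TableFilterSet is rewritten into a pyarrow Expression so predicates run inside the Arrow scanner rather than after rows cross into DuckDB. A hedged sketch of the observable behavior, assuming pyarrow's in-memory Dataset (ds.dataset accepts a Table) and DuckDB's filter pushdown into Arrow scans:

import duckdb
import pyarrow as pa
import pyarrow.dataset as ds

dataset = ds.dataset(pa.table({"x": list(range(100))}))

con = duckdb.connect()
# The WHERE predicate reaches the Arrow scanner as a pyarrow Expression
# (built by TransformFilter), so non-matching rows are filtered at the source.
print(con.sql("SELECT count(*) FROM dataset WHERE x >= 90").fetchone())  # (10,)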
90 changes: 89 additions & 1 deletion tests/fast/adbc/test_adbc.py
@@ -3,6 +3,7 @@
import sys
import datetime
import os
+import numpy as np

if sys.version_info < (3, 9):
    pytest.skip(
@@ -224,7 +225,7 @@ def test_insertion(duck_conn):
    with duck_conn.cursor() as cursor:
        with pytest.raises(
            adbc_driver_manager_lib.InternalError,
-            match=r'Failed to create table \'ingest_table\': Table with name "ingest_table" already exists!',
+            match=r'Table with name "ingest_table" already exists!',
        ):
            cursor.adbc_ingest("ingest_table", table, "create")
        cursor.adbc_ingest("ingest_table", table, "append")
@@ -277,6 +278,93 @@ def test_read(duck_conn):
    }


+def test_large_chunk(tmp_path):
+    num_chunks = 3
+    chunk_size = 10_000

+    # Create data for each chunk
+    chunks_col1 = [pyarrow.array(np.random.randint(0, 100, chunk_size)) for _ in range(num_chunks)]
+    chunks_col2 = [pyarrow.array(np.random.rand(chunk_size)) for _ in range(num_chunks)]
+    chunks_col3 = [
+        pyarrow.array([f"str_{i}" for i in range(j * chunk_size, (j + 1) * chunk_size)]) for j in range(num_chunks)
+    ]

+    # Create chunked arrays
+    col1 = pyarrow.chunked_array(chunks_col1)
+    col2 = pyarrow.chunked_array(chunks_col2)
+    col3 = pyarrow.chunked_array(chunks_col3)

+    # Create the table
+    table = pyarrow.table([col1, col2, col3], names=["ints", "floats", "strings"])

+    db = os.path.join(tmp_path, "tmp.db")
+    if os.path.exists(db):
+        os.remove(db)
+    db_kwargs = {"path": f"{db}"}
+    with adbc_driver_manager.connect(
+        driver=driver_path,
+        entrypoint="duckdb_adbc_init",
+        db_kwargs=db_kwargs,
+        autocommit=True,
+    ) as conn:
+        with conn.cursor() as cur:
+            cur.adbc_ingest("ingest", table, "create")
+            cur.execute("SELECT count(*) from ingest")
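+            # num_chunks * chunk_size = 3 * 10_000 = 30_000 rows must survive ingestion across chunk boundaries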
+            assert cur.fetch_arrow_table().to_pydict() == {'count_star()': [30_000]}


+def test_dictionary_data(tmp_path):
+    data = ['apple', 'banana', 'apple', 'orange', 'banana', 'banana']

+    dict_type = pyarrow.dictionary(index_type=pyarrow.int32(), value_type=pyarrow.string())
+    dict_array = pyarrow.array(data, type=dict_type)

+    # Wrap in a table
+    table = pyarrow.table({'fruits': dict_array})
+    db = os.path.join(tmp_path, "tmp.db")
+    if os.path.exists(db):
+        os.remove(db)
+    db_kwargs = {"path": f"{db}"}
+    with adbc_driver_manager.connect(
+        driver=driver_path,
+        entrypoint="duckdb_adbc_init",
+        db_kwargs=db_kwargs,
+        autocommit=True,
+    ) as conn:
+        with conn.cursor() as cur:
+            cur.adbc_ingest("ingest", table, "create")
+            cur.execute("from ingest")
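+            # the dictionary indices are decoded back to their string values during the scan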
+            assert cur.fetch_arrow_table().to_pydict() == {
+                'fruits': ['apple', 'banana', 'apple', 'orange', 'banana', 'banana']
+            }


+def test_ree_data(tmp_path):
+    run_ends = pyarrow.array([3, 5, 6], type=pyarrow.int32())  # positions: [0-2], [3-4], [5]
+    values = pyarrow.array(["apple", "banana", "orange"], type=pyarrow.string())

+    ree_array = pyarrow.RunEndEncodedArray.from_arrays(run_ends, values)
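+    # decodes to ["apple"] * 3 + ["banana"] * 2 + ["orange"]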

+    table = pyarrow.table({"fruits": ree_array})

+    db = os.path.join(tmp_path, "tmp.db")
+    if os.path.exists(db):
+        os.remove(db)
+    db_kwargs = {"path": f"{db}"}
+    with adbc_driver_manager.connect(
+        driver=driver_path,
+        entrypoint="duckdb_adbc_init",
+        db_kwargs=db_kwargs,
+        autocommit=True,
+    ) as conn:
+        with conn.cursor() as cur:
+            cur.adbc_ingest("ingest", table, "create")
+            cur.execute("from ingest")
+            assert cur.fetch_arrow_table().to_pydict() == {
+                'fruits': ['apple', 'apple', 'apple', 'banana', 'banana', 'orange']
+            }


def sorted_get_objects(catalogs):
    res = []
    for catalog in sorted(catalogs, key=lambda cat: cat['catalog_name']):