diff --git a/external/duckdb b/external/duckdb
index 641e95d1..61f07c32 160000
--- a/external/duckdb
+++ b/external/duckdb
@@ -1 +1 @@
-Subproject commit 641e95d140ca7728085445a919c3e8d436aaf0c1
+Subproject commit 61f07c3221e01674d8fe40b4a25364ba2f3159a7
diff --git a/src/duckdb_py/arrow/arrow_array_stream.cpp b/src/duckdb_py/arrow/arrow_array_stream.cpp
index 6094dcb1..533c31ed 100644
--- a/src/duckdb_py/arrow/arrow_array_stream.cpp
+++ b/src/duckdb_py/arrow/arrow_array_stream.cpp
@@ -41,10 +41,8 @@ py::object PythonTableArrowArrayStreamFactory::ProduceScanner(DBConfig &config,
 	D_ASSERT(!py::isinstance(arrow_obj_handle));
 	ArrowSchemaWrapper schema;
 	PythonTableArrowArrayStreamFactory::GetSchemaInternal(arrow_obj_handle, schema);
-	vector<string> unused_names;
-	vector<LogicalType> unused_types;
-	ArrowTableType arrow_table;
-	ArrowTableFunction::PopulateArrowTableType(config, arrow_table, schema, unused_names, unused_types);
+	ArrowTableSchema arrow_table;
+	ArrowTableFunction::PopulateArrowTableSchema(config, arrow_table, schema.arrow_schema);
 
 	auto filters = parameters.filters;
 	auto &column_list = parameters.projected_columns.columns;
@@ -466,7 +464,7 @@ py::object PythonTableArrowArrayStreamFactory::TransformFilter(TableFilterSet &f
                                                                std::unordered_map<idx_t, string> &columns,
                                                                unordered_map<idx_t, idx_t> filter_to_col,
                                                                const ClientProperties &config,
-                                                               const ArrowTableType &arrow_table) {
+                                                               const ArrowTableSchema &arrow_table) {
 	auto &filters_map = filter_collection.filters;
 
 	py::object expression = py::none();
diff --git a/src/duckdb_py/include/duckdb_python/arrow/arrow_array_stream.hpp b/src/duckdb_py/include/duckdb_python/arrow/arrow_array_stream.hpp
index 494be16a..7eb6d20b 100644
--- a/src/duckdb_py/include/duckdb_python/arrow/arrow_array_stream.hpp
+++ b/src/duckdb_py/include/duckdb_python/arrow/arrow_array_stream.hpp
@@ -89,7 +89,7 @@ class PythonTableArrowArrayStreamFactory {
 	//! We transform a TableFilterSet to an Arrow Expression Object
 	static py::object TransformFilter(TableFilterSet &filters, std::unordered_map<idx_t, string> &columns,
 	                                  unordered_map<idx_t, idx_t> filter_to_col,
-	                                  const ClientProperties &client_properties, const ArrowTableType &arrow_table);
+	                                  const ClientProperties &client_properties, const ArrowTableSchema &arrow_table);
 
 	static py::object ProduceScanner(DBConfig &config, py::object &arrow_scanner, py::handle &arrow_obj_handle,
 	                                 ArrowStreamParameters &parameters, const ClientProperties &client_properties);
diff --git a/tests/fast/adbc/test_adbc.py b/tests/fast/adbc/test_adbc.py
index 3f9111bc..663563cf 100644
--- a/tests/fast/adbc/test_adbc.py
+++ b/tests/fast/adbc/test_adbc.py
@@ -3,6 +3,7 @@
 import sys
 import datetime
 import os
+import numpy as np
 
 if sys.version_info < (3, 9):
     pytest.skip(
@@ -224,7 +225,7 @@ def test_insertion(duck_conn):
     with duck_conn.cursor() as cursor:
         with pytest.raises(
             adbc_driver_manager_lib.InternalError,
-            match=r'Failed to create table \'ingest_table\': Table with name "ingest_table" already exists!',
+            match=r'Table with name "ingest_table" already exists!',
         ):
             cursor.adbc_ingest("ingest_table", table, "create")
         cursor.adbc_ingest("ingest_table", table, "append")
@@ -277,6 +278,93 @@ def test_read(duck_conn):
     }
 
 
+def test_large_chunk(tmp_path):
+    num_chunks = 3
+    chunk_size = 10_000
+
+    # Create data for each chunk
+    chunks_col1 = [pyarrow.array(np.random.randint(0, 100, chunk_size)) for _ in range(num_chunks)]
+    chunks_col2 = [pyarrow.array(np.random.rand(chunk_size)) for _ in range(num_chunks)]
+    chunks_col3 = [
+        pyarrow.array([f"str_{i}" for i in range(j * chunk_size, (j + 1) * chunk_size)]) for j in range(num_chunks)
+    ]
+
+    # Create chunked arrays
+    col1 = pyarrow.chunked_array(chunks_col1)
+    col2 = pyarrow.chunked_array(chunks_col2)
+    col3 = pyarrow.chunked_array(chunks_col3)
+
+    # Create the table
+    table = pyarrow.table([col1, col2, col3], names=["ints", "floats", "strings"])
+
+    db = os.path.join(tmp_path, "tmp.db")
+    if os.path.exists(db):
+        os.remove(db)
+    db_kwargs = {"path": f"{db}"}
+    with adbc_driver_manager.connect(
+        driver=driver_path,
+        entrypoint="duckdb_adbc_init",
+        db_kwargs=db_kwargs,
+        autocommit=True,
+    ) as conn:
+        with conn.cursor() as cur:
+            cur.adbc_ingest("ingest", table, "create")
+            cur.execute("SELECT count(*) from ingest")
+            assert cur.fetch_arrow_table().to_pydict() == {'count_star()': [30_000]}
+
+
+def test_dictionary_data(tmp_path):
+    data = ['apple', 'banana', 'apple', 'orange', 'banana', 'banana']
+
+    dict_type = pyarrow.dictionary(index_type=pyarrow.int32(), value_type=pyarrow.string())
+    dict_array = pyarrow.array(data, type=dict_type)
+
+    # Wrap in a table
+    table = pyarrow.table({'fruits': dict_array})
+    db = os.path.join(tmp_path, "tmp.db")
+    if os.path.exists(db):
+        os.remove(db)
+    db_kwargs = {"path": f"{db}"}
+    with adbc_driver_manager.connect(
+        driver=driver_path,
+        entrypoint="duckdb_adbc_init",
+        db_kwargs=db_kwargs,
+        autocommit=True,
+    ) as conn:
+        with conn.cursor() as cur:
+            cur.adbc_ingest("ingest", table, "create")
+            cur.execute("from ingest")
+            assert cur.fetch_arrow_table().to_pydict() == {
+                'fruits': ['apple', 'banana', 'apple', 'orange', 'banana', 'banana']
+            }
+
+
+def test_ree_data(tmp_path):
+    run_ends = pyarrow.array([3, 5, 6], type=pyarrow.int32())  # positions: [0-2], [3-4], [5]
+    values = pyarrow.array(["apple", "banana", "orange"], type=pyarrow.string())
+
+    ree_array = pyarrow.RunEndEncodedArray.from_arrays(run_ends, values)
+
+    table = pyarrow.table({"fruits": ree_array})
+
+    db = os.path.join(tmp_path, "tmp.db")
+    if os.path.exists(db):
+        os.remove(db)
+    db_kwargs = {"path": f"{db}"}
+    with adbc_driver_manager.connect(
+        driver=driver_path,
+        entrypoint="duckdb_adbc_init",
+        db_kwargs=db_kwargs,
+        autocommit=True,
+    ) as conn:
+        with conn.cursor() as cur:
+            cur.adbc_ingest("ingest", table, "create")
+            cur.execute("from ingest")
+            assert cur.fetch_arrow_table().to_pydict() == {
+                'fruits': ['apple', 'apple', 'apple', 'banana', 'banana', 'orange']
+            }
+
+
 def sorted_get_objects(catalogs):
     res = []
     for catalog in sorted(catalogs, key=lambda cat: cat['catalog_name']):
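
Note on `test_large_chunk`: the point of building the columns from `pyarrow.chunked_array` is that the ingested table crosses the ADBC stream as several record batches rather than one. A minimal sketch of that input shape, using the same sizes as the test (illustrative only, not part of the patch):

```python
import numpy as np
import pyarrow as pa

# Three 10_000-row chunks, as in test_large_chunk.
chunks = [pa.array(np.random.randint(0, 100, 10_000)) for _ in range(3)]
table = pa.table({"ints": pa.chunked_array(chunks)})

# The table keeps one record batch per chunk, and the total row count is
# what the test's count(*) assertion checks after ingestion.
assert len(table.to_batches()) == 3
assert table.num_rows == 30_000
```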
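Note on `test_dictionary_data`: the assertion expects DuckDB to hand back plain strings, i.e. the dictionary encoding is materialized on the way out. The same expectation spelled with PyArrow alone (illustrative sketch, not part of the patch):

```python
import pyarrow as pa

data = ['apple', 'banana', 'apple', 'orange', 'banana', 'banana']
dict_type = pa.dictionary(index_type=pa.int32(), value_type=pa.string())
dict_array = pa.array(data, type=dict_type)

# The encoded form stores each distinct value once plus int32 indices;
# casting back to string materializes the values, which is the shape the
# test expects DuckDB to return after ingesting the dictionary column.
assert dict_array.dictionary.to_pylist() == ['apple', 'banana', 'orange']
assert dict_array.cast(pa.string()).to_pylist() == data
```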
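Note on `test_ree_data`: each run end is exclusive, so `run_ends = [3, 5, 6]` covers positions [0-2], [3-4], [5], which is where the six expected rows come from. A standalone sketch of that decoding, assuming a PyArrow build with run-end-encoding support (roughly 13+; illustrative, not part of the patch):

```python
import pyarrow as pa

# Same REE input as test_ree_data: run i covers logical positions
# [previous run end, run_ends[i]) -> apple [0, 3), banana [3, 5), orange [5, 6).
run_ends = pa.array([3, 5, 6], type=pa.int32())
values = pa.array(["apple", "banana", "orange"], type=pa.string())
ree = pa.RunEndEncodedArray.from_arrays(run_ends, values)

# The logical length is the last run end (6), and decoding expands each run,
# matching the rows the test expects DuckDB to return after ingestion.
assert len(ree) == 6
assert ree.to_pylist() == ["apple", "apple", "apple", "banana", "banana", "orange"]
```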