Skip to content

Commit 0f67f6b

Browse files
Merge changes from "New Arrow C-API #18246"
2 parents 99fe330 + f4bb56b commit 0f67f6b

File tree

4 files changed

+94
-8
lines changed

4 files changed

+94
-8
lines changed

external/duckdb

Submodule duckdb updated 43 files

src/duckdb_py/arrow/arrow_array_stream.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,8 @@ py::object PythonTableArrowArrayStreamFactory::ProduceScanner(DBConfig &config,
4141
D_ASSERT(!py::isinstance<py::capsule>(arrow_obj_handle));
4242
ArrowSchemaWrapper schema;
4343
PythonTableArrowArrayStreamFactory::GetSchemaInternal(arrow_obj_handle, schema);
44-
vector<string> unused_names;
45-
vector<LogicalType> unused_types;
46-
ArrowTableType arrow_table;
47-
ArrowTableFunction::PopulateArrowTableType(config, arrow_table, schema, unused_names, unused_types);
44+
ArrowTableSchema arrow_table;
45+
ArrowTableFunction::PopulateArrowTableSchema(config, arrow_table, schema.arrow_schema);
4846

4947
auto filters = parameters.filters;
5048
auto &column_list = parameters.projected_columns.columns;
@@ -466,7 +464,7 @@ py::object PythonTableArrowArrayStreamFactory::TransformFilter(TableFilterSet &f
466464
std::unordered_map<idx_t, string> &columns,
467465
unordered_map<idx_t, idx_t> filter_to_col,
468466
const ClientProperties &config,
469-
const ArrowTableType &arrow_table) {
467+
const ArrowTableSchema &arrow_table) {
470468
auto &filters_map = filter_collection.filters;
471469

472470
py::object expression = py::none();

src/duckdb_py/include/duckdb_python/arrow/arrow_array_stream.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ class PythonTableArrowArrayStreamFactory {
8989
//! We transform a TableFilterSet to an Arrow Expression Object
9090
static py::object TransformFilter(TableFilterSet &filters, std::unordered_map<idx_t, string> &columns,
9191
unordered_map<idx_t, idx_t> filter_to_col,
92-
const ClientProperties &client_properties, const ArrowTableType &arrow_table);
92+
const ClientProperties &client_properties, const ArrowTableSchema &arrow_table);
9393

9494
static py::object ProduceScanner(DBConfig &config, py::object &arrow_scanner, py::handle &arrow_obj_handle,
9595
ArrowStreamParameters &parameters, const ClientProperties &client_properties);

tests/fast/adbc/test_adbc.py

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import sys
44
import datetime
55
import os
6+
import numpy as np
67

78
if sys.version_info < (3, 9):
89
pytest.skip(
@@ -224,7 +225,7 @@ def test_insertion(duck_conn):
224225
with duck_conn.cursor() as cursor:
225226
with pytest.raises(
226227
adbc_driver_manager_lib.InternalError,
227-
match=r'Failed to create table \'ingest_table\': Table with name "ingest_table" already exists!',
228+
match=r'Table with name "ingest_table" already exists!',
228229
):
229230
cursor.adbc_ingest("ingest_table", table, "create")
230231
cursor.adbc_ingest("ingest_table", table, "append")
@@ -277,6 +278,93 @@ def test_read(duck_conn):
277278
}
278279

279280

281+
def test_large_chunk(tmp_path):
    """Ingest a three-chunk Arrow table through ADBC and verify the total row count."""
    chunk_count = 3
    rows_per_chunk = 10_000

    # Build the per-chunk arrays one column at a time: ints, then floats, then strings.
    int_chunks = []
    for _ in range(chunk_count):
        int_chunks.append(pyarrow.array(np.random.randint(0, 100, rows_per_chunk)))

    float_chunks = []
    for _ in range(chunk_count):
        float_chunks.append(pyarrow.array(np.random.rand(rows_per_chunk)))

    string_chunks = []
    for chunk_idx in range(chunk_count):
        first_row = chunk_idx * rows_per_chunk
        labels = [f"str_{row}" for row in range(first_row, first_row + rows_per_chunk)]
        string_chunks.append(pyarrow.array(labels))

    # Wrap each column's chunks in a chunked array and assemble the table.
    table = pyarrow.table(
        [
            pyarrow.chunked_array(int_chunks),
            pyarrow.chunked_array(float_chunks),
            pyarrow.chunked_array(string_chunks),
        ],
        names=["ints", "floats", "strings"],
    )

    # Start from a database file that does not exist yet.
    db_file = os.path.join(tmp_path, "tmp.db")
    if os.path.exists(db_file):
        os.remove(db_file)

    connection = adbc_driver_manager.connect(
        driver=driver_path,
        entrypoint="duckdb_adbc_init",
        db_kwargs={"path": str(db_file)},
        autocommit=True,
    )
    with connection as conn, conn.cursor() as cur:
        cur.adbc_ingest("ingest", table, "create")
        cur.execute("SELECT count(*) from ingest")
        row_count = cur.fetch_arrow_table().to_pydict()
        assert row_count == {'count_star()': [30_000]}
314+
315+
316+
def test_dictionary_data(tmp_path):
    """Ingest a dictionary-typed Arrow string column through ADBC and compare the values read back."""
    fruits = ['apple', 'banana', 'apple', 'orange', 'banana', 'banana']

    # int32 indices into a string dictionary.
    encoded = pyarrow.array(
        fruits,
        type=pyarrow.dictionary(index_type=pyarrow.int32(), value_type=pyarrow.string()),
    )
    table = pyarrow.table({'fruits': encoded})

    # Start from a database file that does not exist yet.
    db_file = os.path.join(tmp_path, "tmp.db")
    if os.path.exists(db_file):
        os.remove(db_file)

    connection = adbc_driver_manager.connect(
        driver=driver_path,
        entrypoint="duckdb_adbc_init",
        db_kwargs={"path": str(db_file)},
        autocommit=True,
    )
    with connection as conn, conn.cursor() as cur:
        cur.adbc_ingest("ingest", table, "create")
        cur.execute("from ingest")
        fetched = cur.fetch_arrow_table().to_pydict()
        assert fetched == {
            'fruits': ['apple', 'banana', 'apple', 'orange', 'banana', 'banana']
        }
340+
341+
342+
def test_ree_data(tmp_path):
    """Ingest a run-end-encoded Arrow column through ADBC and compare the expanded values read back."""
    # Run ends 3, 5, 6 cover positions [0-2], [3-4] and [5] respectively.
    ends = pyarrow.array([3, 5, 6], type=pyarrow.int32())
    run_values = pyarrow.array(["apple", "banana", "orange"], type=pyarrow.string())
    table = pyarrow.table({"fruits": pyarrow.RunEndEncodedArray.from_arrays(ends, run_values)})

    # Start from a database file that does not exist yet.
    db_file = os.path.join(tmp_path, "tmp.db")
    if os.path.exists(db_file):
        os.remove(db_file)

    connection = adbc_driver_manager.connect(
        driver=driver_path,
        entrypoint="duckdb_adbc_init",
        db_kwargs={"path": str(db_file)},
        autocommit=True,
    )
    with connection as conn, conn.cursor() as cur:
        cur.adbc_ingest("ingest", table, "create")
        cur.execute("from ingest")
        fetched = cur.fetch_arrow_table().to_pydict()
        assert fetched == {
            'fruits': ['apple', 'apple', 'apple', 'banana', 'banana', 'orange']
        }
366+
367+
280368
def sorted_get_objects(catalogs):
281369
res = []
282370
for catalog in sorted(catalogs, key=lambda cat: cat['catalog_name']):

0 commit comments

Comments
 (0)