2 changes: 1 addition & 1 deletion external/duckdb
Submodule duckdb updated 43 files
+34 −0 .github/patches/extensions/excel/excel_copy.patch
+35 −0 .github/patches/extensions/spatial/arrow_capi.patch
+2 −2 extension/parquet/parquet_reader.cpp
+5 −2 extension/parquet/parquet_writer.cpp
+188 −99 src/common/adbc/adbc.cpp
+2 −0 src/common/arrow/arrow_converter.cpp
+19 −0 src/common/enum_util.cpp
+20 −10 src/function/table/arrow.cpp
+2 −3 src/function/table/arrow/arrow_array_scan_state.cpp
+24 −2 src/function/table/arrow/arrow_duck_schema.cpp
+166 −176 src/function/table/arrow_conversion.cpp
+107 −1 src/include/duckdb.h
+68 −1 src/include/duckdb/common/adbc/adbc.hpp
+10 −0 src/include/duckdb/common/arrow/arrow_wrapper.hpp
+8 −0 src/include/duckdb/common/enum_util.hpp
+3 −3 src/include/duckdb/common/multi_file/multi_file_function.hpp
+9 −1 src/include/duckdb/function/copy_function.hpp
+25 −10 src/include/duckdb/function/table/arrow.hpp
+8 −2 src/include/duckdb/function/table/arrow/arrow_duck_schema.hpp
+2 −0 src/include/duckdb/function/table/arrow/arrow_type_info.hpp
+5 −0 src/include/duckdb/main/capi/capi_internal.hpp
+25 −0 src/include/duckdb/main/capi/extension_api.hpp
+11 −0 src/include/duckdb/main/capi/header_generation/apis/v1/unstable/new_arrow_functions.json
+3 −1 src/include/duckdb/main/capi/header_generation/apis/v1/unstable/new_open_connect_functions.json
+7 −0 src/include/duckdb/main/capi/header_generation/apis/v1/unstable/new_query_execution_functions.json
+154 −1 src/include/duckdb/main/capi/header_generation/functions/arrow_interface.json
+37 −0 src/include/duckdb/main/capi/header_generation/functions/open_connect.json
+17 −0 src/include/duckdb/main/capi/header_generation/functions/query_execution.json
+24 −1 src/include/duckdb/main/capi/header_generation/header_base.hpp.template
+33 −0 src/include/duckdb_extension.h
+142 −0 src/main/capi/arrow-c.cpp
+19 −0 src/main/capi/duckdb-c.cpp
+12 −0 src/main/capi/result-c.cpp
+2 −2 src/planner/binder/statement/bind_copy.cpp
+51 −44 test/api/adbc/test_adbc.cpp
+156 −0 test/api/capi/test_capi_arrow.cpp
+5 −0 test/helpers/test_config.cpp
+4 −0 test/helpers/test_helpers.cpp
+1 −0 test/include/test_config.hpp
+1 −1 test/sql/copy/parquet/parquet_stats.test
+3 −5 tools/pythonpkg/src/arrow/arrow_array_stream.cpp
+1 −1 tools/pythonpkg/src/include/duckdb_python/arrow/arrow_array_stream.hpp
+89 −1 tools/pythonpkg/tests/fast/adbc/test_adbc.py
8 changes: 3 additions & 5 deletions src/duckdb_py/arrow/arrow_array_stream.cpp
@@ -41,10 +41,8 @@ py::object PythonTableArrowArrayStreamFactory::ProduceScanner(DBConfig &config,
	D_ASSERT(!py::isinstance<py::capsule>(arrow_obj_handle));
	ArrowSchemaWrapper schema;
	PythonTableArrowArrayStreamFactory::GetSchemaInternal(arrow_obj_handle, schema);
-	vector<string> unused_names;
-	vector<LogicalType> unused_types;
-	ArrowTableType arrow_table;
-	ArrowTableFunction::PopulateArrowTableType(config, arrow_table, schema, unused_names, unused_types);
+	ArrowTableSchema arrow_table;
+	ArrowTableFunction::PopulateArrowTableSchema(config, arrow_table, schema.arrow_schema);

	auto filters = parameters.filters;
	auto &column_list = parameters.projected_columns.columns;
@@ -466,7 +464,7 @@ py::object PythonTableArrowArrayStreamFactory::TransformFilter(TableFilterSet &f
                                                               std::unordered_map<idx_t, string> &columns,
                                                               unordered_map<idx_t, idx_t> filter_to_col,
                                                               const ClientProperties &config,
-                                                              const ArrowTableType &arrow_table) {
+                                                              const ArrowTableSchema &arrow_table) {
	auto &filters_map = filter_collection.filters;

	py::object expression = py::none();
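For context: scanning an in-scope pyarrow object from SQL runs through PythonTableArrowArrayStreamFactory, so the rename above sits on the path of every such scan. A minimal sketch of that path, assuming only the documented duckdb Python replacement scan (the internal routing through PopulateArrowTableSchema is inferred from the hunks above):

import duckdb
import pyarrow as pa

tbl = pa.table({"i": [1, 2, 3]})

con = duckdb.connect()
# Referencing the in-scope pyarrow table by name triggers the replacement scan;
# per this diff, the factory now fills an ArrowTableSchema via a single
# PopulateArrowTableSchema call instead of PopulateArrowTableType plus
# throwaway name/type vectors.
print(con.sql("SELECT sum(i) FROM tbl").fetchall())  # [(6,)]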
@@ -89,7 +89,7 @@ class PythonTableArrowArrayStreamFactory {
	//! We transform a TableFilterSet to an Arrow Expression Object
	static py::object TransformFilter(TableFilterSet &filters, std::unordered_map<idx_t, string> &columns,
	                                  unordered_map<idx_t, idx_t> filter_to_col,
-	                                  const ClientProperties &client_properties, const ArrowTableType &arrow_table);
+	                                  const ClientProperties &client_properties, const ArrowTableSchema &arrow_table);

	static py::object ProduceScanner(DBConfig &config, py::object &arrow_scanner, py::handle &arrow_obj_handle,
	                                 ArrowStreamParameters &parameters, const ClientProperties &client_properties);
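The //! doc comment above states the contract: a DuckDB TableFilterSet is rewritten into a pyarrow Expression so predicates run inside the Arrow scanner rather than after rows cross into DuckDB. A hedged sketch of the observable behavior, assuming pyarrow's in-memory Dataset (ds.dataset accepts a Table) and DuckDB's filter pushdown into Arrow scans:

import duckdb
import pyarrow as pa
import pyarrow.dataset as ds

dataset = ds.dataset(pa.table({"x": list(range(100))}))

con = duckdb.connect()
# The WHERE predicate reaches the Arrow scanner as a pyarrow Expression
# (built by TransformFilter), so non-matching rows are filtered at the source.
print(con.sql("SELECT count(*) FROM dataset WHERE x >= 90").fetchone())  # (10,)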
90 changes: 89 additions & 1 deletion tests/fast/adbc/test_adbc.py
@@ -3,6 +3,7 @@
import sys
import datetime
import os
+import numpy as np

if sys.version_info < (3, 9):
    pytest.skip(
@@ -224,7 +225,7 @@ def test_insertion(duck_conn):
    with duck_conn.cursor() as cursor:
        with pytest.raises(
            adbc_driver_manager_lib.InternalError,
-            match=r'Failed to create table \'ingest_table\': Table with name "ingest_table" already exists!',
+            match=r'Table with name "ingest_table" already exists!',
        ):
            cursor.adbc_ingest("ingest_table", table, "create")
        cursor.adbc_ingest("ingest_table", table, "append")
@@ -277,6 +278,93 @@ def test_read(duck_conn):
    }


+def test_large_chunk(tmp_path):
+    num_chunks = 3
+    chunk_size = 10_000

+    # Create data for each chunk
+    chunks_col1 = [pyarrow.array(np.random.randint(0, 100, chunk_size)) for _ in range(num_chunks)]
+    chunks_col2 = [pyarrow.array(np.random.rand(chunk_size)) for _ in range(num_chunks)]
+    chunks_col3 = [
+        pyarrow.array([f"str_{i}" for i in range(j * chunk_size, (j + 1) * chunk_size)]) for j in range(num_chunks)
+    ]

+    # Create chunked arrays
+    col1 = pyarrow.chunked_array(chunks_col1)
+    col2 = pyarrow.chunked_array(chunks_col2)
+    col3 = pyarrow.chunked_array(chunks_col3)

+    # Create the table
+    table = pyarrow.table([col1, col2, col3], names=["ints", "floats", "strings"])

+    db = os.path.join(tmp_path, "tmp.db")
+    if os.path.exists(db):
+        os.remove(db)
+    db_kwargs = {"path": f"{db}"}
+    with adbc_driver_manager.connect(
+        driver=driver_path,
+        entrypoint="duckdb_adbc_init",
+        db_kwargs=db_kwargs,
+        autocommit=True,
+    ) as conn:
+        with conn.cursor() as cur:
+            cur.adbc_ingest("ingest", table, "create")
+            cur.execute("SELECT count(*) from ingest")
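+            # num_chunks * chunk_size = 3 * 10_000 = 30_000 rows must survive ingestion across chunk boundaries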
+            assert cur.fetch_arrow_table().to_pydict() == {'count_star()': [30_000]}


+def test_dictionary_data(tmp_path):
+    data = ['apple', 'banana', 'apple', 'orange', 'banana', 'banana']

+    dict_type = pyarrow.dictionary(index_type=pyarrow.int32(), value_type=pyarrow.string())
+    dict_array = pyarrow.array(data, type=dict_type)

+    # Wrap in a table
+    table = pyarrow.table({'fruits': dict_array})
+    db = os.path.join(tmp_path, "tmp.db")
+    if os.path.exists(db):
+        os.remove(db)
+    db_kwargs = {"path": f"{db}"}
+    with adbc_driver_manager.connect(
+        driver=driver_path,
+        entrypoint="duckdb_adbc_init",
+        db_kwargs=db_kwargs,
+        autocommit=True,
+    ) as conn:
+        with conn.cursor() as cur:
+            cur.adbc_ingest("ingest", table, "create")
+            cur.execute("from ingest")
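+            # the dictionary indices are decoded back to their string values during the scan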
+            assert cur.fetch_arrow_table().to_pydict() == {
+                'fruits': ['apple', 'banana', 'apple', 'orange', 'banana', 'banana']
+            }


+def test_ree_data(tmp_path):
+    run_ends = pyarrow.array([3, 5, 6], type=pyarrow.int32())  # positions: [0-2], [3-4], [5]
+    values = pyarrow.array(["apple", "banana", "orange"], type=pyarrow.string())

+    ree_array = pyarrow.RunEndEncodedArray.from_arrays(run_ends, values)
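+    # decodes to ["apple"] * 3 + ["banana"] * 2 + ["orange"]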

+    table = pyarrow.table({"fruits": ree_array})

+    db = os.path.join(tmp_path, "tmp.db")
+    if os.path.exists(db):
+        os.remove(db)
+    db_kwargs = {"path": f"{db}"}
+    with adbc_driver_manager.connect(
+        driver=driver_path,
+        entrypoint="duckdb_adbc_init",
+        db_kwargs=db_kwargs,
+        autocommit=True,
+    ) as conn:
+        with conn.cursor() as cur:
+            cur.adbc_ingest("ingest", table, "create")
+            cur.execute("from ingest")
+            assert cur.fetch_arrow_table().to_pydict() == {
+                'fruits': ['apple', 'apple', 'apple', 'banana', 'banana', 'orange']
+            }


def sorted_get_objects(catalogs):
    res = []
    for catalog in sorted(catalogs, key=lambda cat: cat['catalog_name']):