Skip to content

Commit c8a3014

Browse files
committed
Fwd ports of:
PR #18642: Change arrow() to export record batch reader PR #18624: Adjust filter pushdown to latest polars release PR #18547: Rename the Varint type to Bignum
1 parent aa76be5 commit c8a3014

39 files changed

+249
-209
lines changed

duckdb/__init__.pyi

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@ class StatementType:
180180
DETACH: StatementType
181181
MULTI: StatementType
182182
COPY_DATABASE: StatementType
183+
MERGE_INTO: StatementType
183184
def __int__(self) -> int: ...
184185
def __index__(self) -> int: ...
185186
@property
@@ -320,8 +321,8 @@ class DuckDBPyConnection:
320321
def fetch_df_chunk(self, vectors_per_chunk: int = 1, *, date_as_object: bool = False) -> pandas.DataFrame: ...
321322
def pl(self, rows_per_batch: int = 1000000, *, lazy: bool = False) -> polars.DataFrame: ...
322323
def fetch_arrow_table(self, rows_per_batch: int = 1000000) -> pyarrow.lib.Table: ...
323-
def arrow(self, rows_per_batch: int = 1000000) -> pyarrow.lib.Table: ...
324324
def fetch_record_batch(self, rows_per_batch: int = 1000000) -> pyarrow.lib.RecordBatchReader: ...
325+
def arrow(self, rows_per_batch: int = 1000000) -> pyarrow.lib.RecordBatchReader: ...
325326
def torch(self) -> dict: ...
326327
def tf(self) -> dict: ...
327328
def begin(self) -> DuckDBPyConnection: ...
@@ -668,8 +669,8 @@ def df(*, date_as_object: bool = False, connection: DuckDBPyConnection = ...) ->
668669
def fetch_df_chunk(vectors_per_chunk: int = 1, *, date_as_object: bool = False, connection: DuckDBPyConnection = ...) -> pandas.DataFrame: ...
669670
def pl(rows_per_batch: int = 1000000, *, lazy: bool = False, connection: DuckDBPyConnection = ...) -> polars.DataFrame: ...
670671
def fetch_arrow_table(rows_per_batch: int = 1000000, *, connection: DuckDBPyConnection = ...) -> pyarrow.lib.Table: ...
671-
def arrow(rows_per_batch: int = 1000000, *, connection: DuckDBPyConnection = ...) -> pyarrow.lib.Table: ...
672672
def fetch_record_batch(rows_per_batch: int = 1000000, *, connection: DuckDBPyConnection = ...) -> pyarrow.lib.RecordBatchReader: ...
673+
def arrow(rows_per_batch: int = 1000000, *, connection: DuckDBPyConnection = ...) -> pyarrow.lib.RecordBatchReader: ...
673674
def torch(*, connection: DuckDBPyConnection = ...) -> dict: ...
674675
def tf(*, connection: DuckDBPyConnection = ...) -> dict: ...
675676
def begin(*, connection: DuckDBPyConnection = ...) -> DuckDBPyConnection: ...

duckdb/polars_io.py

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -123,47 +123,61 @@ def _pl_tree_to_sql(tree: dict) -> str:
123123
raise NotImplementedError(f"Unsupported function type: {func_dict}")
124124

125125
if node_type == "Scalar":
126-
# Handle scalar values with typed representations
127-
dtype = str(subtree["dtype"])
128-
value = subtree["value"]
126+
# Detect format: old style (dtype/value) or new style (direct type key)
127+
if "dtype" in subtree and "value" in subtree:
128+
dtype = str(subtree["dtype"])
129+
value = subtree["value"]
130+
else:
131+
# New style: dtype is the single key in the dict
132+
dtype = next(iter(subtree.keys()))
133+
value = subtree
129134

130135
# Decimal support
131-
if dtype.startswith("{'Decimal'"):
136+
if dtype.startswith("{'Decimal'") or dtype == "Decimal":
132137
decimal_value = value['Decimal']
133138
decimal_value = Decimal(decimal_value[0]) / Decimal(10 ** decimal_value[1])
134139
return str(decimal_value)
135140

136141
# Datetime with microseconds since epoch
137-
if dtype.startswith("{'Datetime'"):
142+
if dtype.startswith("{'Datetime'") or dtype == "Datetime":
138143
micros = value['Datetime'][0]
139144
dt_timestamp = datetime.datetime.fromtimestamp(micros / 1_000_000, tz=datetime.UTC)
140145
return f"'{str(dt_timestamp)}'::TIMESTAMP"
141146

142-
# Match simple types
143-
if dtype in ("Int8", "Int16", "Int32", "Int64", "UInt8", "UInt16", "UInt32", "UInt64", "Float32", "Float64", "Boolean"):
147+
# Match simple numeric/boolean types
148+
if dtype in ("Int8", "Int16", "Int32", "Int64",
149+
"UInt8", "UInt16", "UInt32", "UInt64",
150+
"Float32", "Float64", "Boolean"):
144151
return str(value[dtype])
145152

153+
# Time type
146154
if dtype == "Time":
147-
# Convert nanoseconds to TIME
148155
nanoseconds = value["Time"]
149156
seconds = nanoseconds // 1_000_000_000
150157
microseconds = (nanoseconds % 1_000_000_000) // 1_000
151-
dt_time = (datetime.datetime.min + datetime.timedelta(seconds=seconds, microseconds=microseconds)).time()
152-
return f"'{str(dt_time)}'::TIME"
158+
dt_time = (datetime.datetime.min + datetime.timedelta(
159+
seconds=seconds, microseconds=microseconds
160+
)).time()
161+
return f"'{dt_time}'::TIME"
153162

163+
# Date type
154164
if dtype == "Date":
155-
# Convert days since Unix epoch to SQL DATE
156165
days_since_epoch = value["Date"]
157166
date = datetime.date(1970, 1, 1) + datetime.timedelta(days=days_since_epoch)
158-
return f"'{str(date)}'::DATE"
167+
return f"'{date}'::DATE"
168+
169+
# Binary type
159170
if dtype == "Binary":
160-
# Convert binary data to hex string for BLOB
161171
binary_data = bytes(value["Binary"])
162172
escaped = ''.join(f'\\x{b:02x}' for b in binary_data)
163173
return f"'{escaped}'::BLOB"
164174

165-
if dtype == "String":
166-
return f"'{value['StringOwned']}'"
175+
# String type
176+
if dtype == "String" or dtype == "StringOwned":
177+
# Some new formats may store directly under StringOwned
178+
string_val = value.get("StringOwned", value.get("String", None))
179+
return f"'{string_val}'"
180+
167181

168182
raise NotImplementedError(f"Unsupported scalar type {str(dtype)}, with value {value}")
169183

scripts/connection_methods.json

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -395,10 +395,7 @@
395395
"return": "polars.DataFrame"
396396
},
397397
{
398-
"name": [
399-
"fetch_arrow_table",
400-
"arrow"
401-
],
398+
"name": "fetch_arrow_table",
402399
"function": "FetchArrow",
403400
"docs": "Fetch a result as Arrow table following execute()",
404401
"args": [
@@ -411,7 +408,11 @@
411408
"return": "pyarrow.lib.Table"
412409
},
413410
{
414-
"name": "fetch_record_batch",
411+
"name": [
412+
"fetch_record_batch",
413+
"arrow"
414+
],
415+
415416
"function": "FetchRecordBatchReader",
416417
"docs": "Fetch an Arrow RecordBatchReader following execute()",
417418
"args": [

src/duckdb_py/duckdb_python.cpp

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -427,17 +427,17 @@ static void InitializeConnectionMethods(py::module_ &m) {
427427
"Fetch a result as Arrow table following execute()", py::arg("rows_per_batch") = 1000000, py::kw_only(),
428428
py::arg("connection") = py::none());
429429
m.def(
430-
"arrow",
431-
[](idx_t rows_per_batch, shared_ptr<DuckDBPyConnection> conn = nullptr) {
430+
"fetch_record_batch",
431+
[](const idx_t rows_per_batch, shared_ptr<DuckDBPyConnection> conn = nullptr) {
432432
if (!conn) {
433433
conn = DuckDBPyConnection::DefaultConnection();
434434
}
435-
return conn->FetchArrow(rows_per_batch);
435+
return conn->FetchRecordBatchReader(rows_per_batch);
436436
},
437-
"Fetch a result as Arrow table following execute()", py::arg("rows_per_batch") = 1000000, py::kw_only(),
437+
"Fetch an Arrow RecordBatchReader following execute()", py::arg("rows_per_batch") = 1000000, py::kw_only(),
438438
py::arg("connection") = py::none());
439439
m.def(
440-
"fetch_record_batch",
440+
"arrow",
441441
[](const idx_t rows_per_batch, shared_ptr<DuckDBPyConnection> conn = nullptr) {
442442
if (!conn) {
443443
conn = DuckDBPyConnection::DefaultConnection();
@@ -971,21 +971,21 @@ static void InitializeConnectionMethods(py::module_ &m) {
971971
static void RegisterStatementType(py::handle &m) {
972972
auto statement_type = py::enum_<duckdb::StatementType>(m, "StatementType");
973973
static const duckdb::StatementType TYPES[] = {
974-
duckdb::StatementType::INVALID_STATEMENT, duckdb::StatementType::SELECT_STATEMENT,
975-
duckdb::StatementType::INSERT_STATEMENT, duckdb::StatementType::UPDATE_STATEMENT,
976-
duckdb::StatementType::CREATE_STATEMENT, duckdb::StatementType::DELETE_STATEMENT,
977-
duckdb::StatementType::PREPARE_STATEMENT, duckdb::StatementType::EXECUTE_STATEMENT,
978-
duckdb::StatementType::ALTER_STATEMENT, duckdb::StatementType::TRANSACTION_STATEMENT,
979-
duckdb::StatementType::COPY_STATEMENT, duckdb::StatementType::ANALYZE_STATEMENT,
980-
duckdb::StatementType::VARIABLE_SET_STATEMENT, duckdb::StatementType::CREATE_FUNC_STATEMENT,
981-
duckdb::StatementType::EXPLAIN_STATEMENT, duckdb::StatementType::DROP_STATEMENT,
982-
duckdb::StatementType::EXPORT_STATEMENT, duckdb::StatementType::PRAGMA_STATEMENT,
983-
duckdb::StatementType::VACUUM_STATEMENT, duckdb::StatementType::CALL_STATEMENT,
984-
duckdb::StatementType::SET_STATEMENT, duckdb::StatementType::LOAD_STATEMENT,
985-
duckdb::StatementType::RELATION_STATEMENT, duckdb::StatementType::EXTENSION_STATEMENT,
986-
duckdb::StatementType::LOGICAL_PLAN_STATEMENT, duckdb::StatementType::ATTACH_STATEMENT,
987-
duckdb::StatementType::DETACH_STATEMENT, duckdb::StatementType::MULTI_STATEMENT,
988-
duckdb::StatementType::COPY_DATABASE_STATEMENT};
974+
duckdb::StatementType::INVALID_STATEMENT, duckdb::StatementType::SELECT_STATEMENT,
975+
duckdb::StatementType::INSERT_STATEMENT, duckdb::StatementType::UPDATE_STATEMENT,
976+
duckdb::StatementType::CREATE_STATEMENT, duckdb::StatementType::DELETE_STATEMENT,
977+
duckdb::StatementType::PREPARE_STATEMENT, duckdb::StatementType::EXECUTE_STATEMENT,
978+
duckdb::StatementType::ALTER_STATEMENT, duckdb::StatementType::TRANSACTION_STATEMENT,
979+
duckdb::StatementType::COPY_STATEMENT, duckdb::StatementType::ANALYZE_STATEMENT,
980+
duckdb::StatementType::VARIABLE_SET_STATEMENT, duckdb::StatementType::CREATE_FUNC_STATEMENT,
981+
duckdb::StatementType::EXPLAIN_STATEMENT, duckdb::StatementType::DROP_STATEMENT,
982+
duckdb::StatementType::EXPORT_STATEMENT, duckdb::StatementType::PRAGMA_STATEMENT,
983+
duckdb::StatementType::VACUUM_STATEMENT, duckdb::StatementType::CALL_STATEMENT,
984+
duckdb::StatementType::SET_STATEMENT, duckdb::StatementType::LOAD_STATEMENT,
985+
duckdb::StatementType::RELATION_STATEMENT, duckdb::StatementType::EXTENSION_STATEMENT,
986+
duckdb::StatementType::LOGICAL_PLAN_STATEMENT, duckdb::StatementType::ATTACH_STATEMENT,
987+
duckdb::StatementType::DETACH_STATEMENT, duckdb::StatementType::MULTI_STATEMENT,
988+
duckdb::StatementType::COPY_DATABASE_STATEMENT, duckdb::StatementType::MERGE_INTO_STATEMENT};
989989
static const idx_t AMOUNT = sizeof(TYPES) / sizeof(duckdb::StatementType);
990990
for (idx_t i = 0; i < AMOUNT; i++) {
991991
auto &type = TYPES[i];

src/duckdb_py/native/python_objects.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
#include "duckdb/common/operator/cast_operators.hpp"
99
#include "duckdb_python/pyconnection/pyconnection.hpp"
1010
#include "duckdb/common/operator/add.hpp"
11-
#include "duckdb/common/types/varint.hpp"
11+
#include "duckdb/common/types/bignum.hpp"
1212
#include "duckdb/function/to_interval.hpp"
1313

1414
#include "datetime.h" // Python datetime initialize #1
@@ -683,9 +683,9 @@ py::object PythonObject::FromValue(const Value &val, const LogicalType &type,
683683
auto uuid_value = val.GetValueUnsafe<hugeint_t>();
684684
return import_cache.uuid.UUID()(UUID::ToString(uuid_value));
685685
}
686-
case LogicalTypeId::VARINT: {
687-
auto varint_value = val.GetValueUnsafe<string_t>();
688-
return py::str(Varint::VarIntToVarchar(varint_value));
686+
case LogicalTypeId::BIGNUM: {
687+
auto bignum_value = val.GetValueUnsafe<bignum_t>();
688+
return py::str(Bignum::BignumToVarchar(bignum_value));
689689
}
690690
case LogicalTypeId::INTERVAL: {
691691
auto interval_value = val.GetValueUnsafe<interval_t>();

src/duckdb_py/pyconnection.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -204,10 +204,10 @@ static void InitializeConnectionMethods(py::class_<DuckDBPyConnection, shared_pt
204204
py::arg("rows_per_batch") = 1000000, py::kw_only(), py::arg("lazy") = false);
205205
m.def("fetch_arrow_table", &DuckDBPyConnection::FetchArrow, "Fetch a result as Arrow table following execute()",
206206
py::arg("rows_per_batch") = 1000000);
207-
m.def("arrow", &DuckDBPyConnection::FetchArrow, "Fetch a result as Arrow table following execute()",
208-
py::arg("rows_per_batch") = 1000000);
209207
m.def("fetch_record_batch", &DuckDBPyConnection::FetchRecordBatchReader,
210208
"Fetch an Arrow RecordBatchReader following execute()", py::arg("rows_per_batch") = 1000000);
209+
m.def("arrow", &DuckDBPyConnection::FetchRecordBatchReader, "Fetch an Arrow RecordBatchReader following execute()",
210+
py::arg("rows_per_batch") = 1000000);
211211
m.def("torch", &DuckDBPyConnection::FetchPyTorch, "Fetch a result as dict of PyTorch Tensors following execute()");
212212
m.def("tf", &DuckDBPyConnection::FetchTF, "Fetch a result as dict of TensorFlow Tensors following execute()");
213213
m.def("begin", &DuckDBPyConnection::Begin, "Start a new transaction");

tests/fast/api/test_dbapi_fetch.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,11 @@ def test_multiple_fetch_arrow(self, duckdb_cursor):
4040
arrow = pytest.importorskip("pyarrow")
4141
con = duckdb.connect()
4242
c = con.execute('SELECT 42::BIGINT AS a')
43-
table = c.arrow()
43+
table = c.fetch_arrow_table()
4444
df = table.to_pandas()
4545
pd.testing.assert_frame_equal(df, pd.DataFrame.from_dict({'a': [42]}))
46-
assert c.arrow() is None
47-
assert c.arrow() is None
46+
assert c.fetch_arrow_table() is None
47+
assert c.fetch_arrow_table() is None
4848

4949
def test_multiple_close(self, duckdb_cursor):
5050
con = duckdb.connect()

tests/fast/api/test_duckdb_connection.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def test_default_connection_from_connect(self):
5252
def test_arrow(self):
5353
pyarrow = pytest.importorskip("pyarrow")
5454
duckdb.execute("select [1,2,3]")
55-
result = duckdb.arrow()
55+
result = duckdb.fetch_arrow_table()
5656

5757
def test_begin_commit(self):
5858
duckdb.begin()
@@ -299,7 +299,7 @@ def test_unregister_problematic_behavior(self, duckdb_cursor):
299299
assert duckdb_cursor.execute("select * from vw").fetchone() == (0,)
300300

301301
# Create a registered object called 'vw'
302-
arrow_result = duckdb_cursor.execute("select 42").arrow()
302+
arrow_result = duckdb_cursor.execute("select 42").fetch_arrow_table()
303303
with pytest.raises(duckdb.CatalogException, match='View with name "vw" already exists'):
304304
duckdb_cursor.register('vw', arrow_result)
305305

tests/fast/api/test_native_tz.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,25 +59,41 @@ def test_pandas_timestamp_time(self, duckdb_cursor):
5959
)
6060
def test_arrow_timestamp_timezone(self, duckdb_cursor):
6161
res = duckdb_cursor.execute("SET timezone='America/Los_Angeles';")
62-
table = duckdb_cursor.execute(f"select TimeRecStart as tz from '{filename}'").arrow()
62+
table = duckdb_cursor.execute(f"select TimeRecStart as tz from '{filename}'").fetch_arrow_table()
6363
res = table.to_pandas()
6464
assert res.dtypes["tz"].tz.zone == 'America/Los_Angeles'
6565
assert res['tz'][0].hour == 14 and res['tz'][0].minute == 52
6666

6767
duckdb_cursor.execute("SET timezone='UTC';")
68-
res = duckdb_cursor.execute(f"select TimeRecStart as tz from '{filename}'").arrow().to_pandas()
68+
res = duckdb_cursor.execute(f"select TimeRecStart as tz from '{filename}'").fetch_arrow_table().to_pandas()
6969
assert res.dtypes["tz"].tz.zone == 'UTC'
7070
assert res['tz'][0].hour == 21 and res['tz'][0].minute == 52
7171

7272
def test_arrow_timestamp_time(self, duckdb_cursor):
7373
duckdb_cursor.execute("SET timezone='America/Los_Angeles';")
74-
res1 = duckdb_cursor.execute(f"select TimeRecStart::TIMETZ as tz from '{filename}'").arrow().to_pandas()
75-
res2 = duckdb_cursor.execute(f"select TimeRecStart::TIMETZ::TIME as tz from '{filename}'").arrow().to_pandas()
74+
res1 = (
75+
duckdb_cursor.execute(f"select TimeRecStart::TIMETZ as tz from '{filename}'")
76+
.fetch_arrow_table()
77+
.to_pandas()
78+
)
79+
res2 = (
80+
duckdb_cursor.execute(f"select TimeRecStart::TIMETZ::TIME as tz from '{filename}'")
81+
.fetch_arrow_table()
82+
.to_pandas()
83+
)
7684
assert res1['tz'][0].hour == 14 and res1['tz'][0].minute == 52
7785
assert res2['tz'][0].hour == res2['tz'][0].hour and res2['tz'][0].minute == res1['tz'][0].minute
7886

7987
duckdb_cursor.execute("SET timezone='UTC';")
80-
res1 = duckdb_cursor.execute(f"select TimeRecStart::TIMETZ as tz from '{filename}'").arrow().to_pandas()
81-
res2 = duckdb_cursor.execute(f"select TimeRecStart::TIMETZ::TIME as tz from '{filename}'").arrow().to_pandas()
88+
res1 = (
89+
duckdb_cursor.execute(f"select TimeRecStart::TIMETZ as tz from '{filename}'")
90+
.fetch_arrow_table()
91+
.to_pandas()
92+
)
93+
res2 = (
94+
duckdb_cursor.execute(f"select TimeRecStart::TIMETZ::TIME as tz from '{filename}'")
95+
.fetch_arrow_table()
96+
.to_pandas()
97+
)
8298
assert res1['tz'][0].hour == 21 and res1['tz'][0].minute == 52
8399
assert res2['tz'][0].hour == res2['tz'][0].hour and res2['tz'][0].minute == res1['tz'][0].minute

tests/fast/arrow/test_6584.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
def f(cur, i, data):
99
cur.execute(f"create table t_{i} as select * from data")
10-
return cur.execute(f"select * from t_{i}").arrow()
10+
return cur.execute(f"select * from t_{i}").fetch_arrow_table()
1111

1212

1313
def test_6584():

0 commit comments

Comments
 (0)