Commit 62f17f4

Author: Evert (committed)

7 PRs between v1.3.1 and v1.3.2
- Add start/end offset percentage options to Python test runner (#18091)
- Switch to Optional for type hints in polars lazy dataframe function (#18078)
- Use `timestamp_t` instead of `time_t` for file last modified time (#18037)
- Fix star expr exclude error (#18063)
- Remove match-case statements from polars_io.py (#18052)
- Add support to produce Polars Lazy Dataframes (#17947; a usage sketch follows the `duckdb/__init__.pyi` diff below)
- Implement consumption and production of Arrow Binary View (#17975)
1 parent add2965 commit 62f17f4

File tree: 20 files changed, +881 -42 lines


duckdb/__init__.pyi

Lines changed: 2 additions & 2 deletions

@@ -318,7 +318,7 @@ class DuckDBPyConnection:
     def fetch_df(self, *, date_as_object: bool = False) -> pandas.DataFrame: ...
     def df(self, *, date_as_object: bool = False) -> pandas.DataFrame: ...
     def fetch_df_chunk(self, vectors_per_chunk: int = 1, *, date_as_object: bool = False) -> pandas.DataFrame: ...
-    def pl(self, rows_per_batch: int = 1000000) -> polars.DataFrame: ...
+    def pl(self, rows_per_batch: int = 1000000, *, lazy: bool = False) -> polars.DataFrame: ...
     def fetch_arrow_table(self, rows_per_batch: int = 1000000) -> pyarrow.lib.Table: ...
     def arrow(self, rows_per_batch: int = 1000000) -> pyarrow.lib.Table: ...
     def fetch_record_batch(self, rows_per_batch: int = 1000000) -> pyarrow.lib.RecordBatchReader: ...

@@ -666,7 +666,7 @@ def fetchdf(*, date_as_object: bool = False, connection: DuckDBPyConnection = ..
 def fetch_df(*, date_as_object: bool = False, connection: DuckDBPyConnection = ...) -> pandas.DataFrame: ...
 def df(*, date_as_object: bool = False, connection: DuckDBPyConnection = ...) -> pandas.DataFrame: ...
 def fetch_df_chunk(vectors_per_chunk: int = 1, *, date_as_object: bool = False, connection: DuckDBPyConnection = ...) -> pandas.DataFrame: ...
-def pl(rows_per_batch: int = 1000000, *, connection: DuckDBPyConnection = ...) -> polars.DataFrame: ...
+def pl(rows_per_batch: int = 1000000, *, lazy: bool = False, connection: DuckDBPyConnection = ...) -> polars.DataFrame: ...
 def fetch_arrow_table(rows_per_batch: int = 1000000, *, connection: DuckDBPyConnection = ...) -> pyarrow.lib.Table: ...
 def arrow(rows_per_batch: int = 1000000, *, connection: DuckDBPyConnection = ...) -> pyarrow.lib.Table: ...
 def fetch_record_batch(rows_per_batch: int = 1000000, *, connection: DuckDBPyConnection = ...) -> pyarrow.lib.RecordBatchReader: ...
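
For context, a minimal usage sketch of the new keyword-only `lazy` argument (the connection, table, and data here are illustrative, not part of the commit):

    import duckdb
    import polars as pl

    con = duckdb.connect()
    con.sql("CREATE TABLE t AS SELECT range AS i FROM range(10)")

    eager = con.sql("SELECT * FROM t").pl()           # polars.DataFrame, as before
    lazy = con.sql("SELECT * FROM t").pl(lazy=True)   # polars.LazyFrame backed by the new IO plugin

    print(lazy.filter(pl.col("i") > 5).collect())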

duckdb/polars_io.py

Lines changed: 211 additions & 0 deletions
@@ -0,0 +1,211 @@
import duckdb
import polars as pl
from typing import Iterator, Optional

from polars.io.plugins import register_io_source
from duckdb import SQLExpression
import json
from decimal import Decimal
import datetime


def _predicate_to_expression(predicate: pl.Expr) -> Optional[SQLExpression]:
    """
    Convert a Polars predicate expression to a DuckDB-compatible SQL expression.

    Parameters:
        predicate (pl.Expr): A Polars expression (e.g., col("foo") > 5)

    Returns:
        SQLExpression: A DuckDB SQL expression string equivalent.
        None: If conversion fails.

    Example:
        >>> _predicate_to_expression(pl.col("foo") > 5)
        SQLExpression("(foo > 5)")
    """
    # Serialize the Polars expression tree to JSON
    tree = json.loads(predicate.meta.serialize(format="json"))

    try:
        # Convert the tree to SQL
        sql_filter = _pl_tree_to_sql(tree)
        return SQLExpression(sql_filter)
    except Exception:
        # If the conversion fails, we return None
        return None


def _pl_operation_to_sql(op: str) -> str:
    """
    Map Polars binary operation strings to SQL equivalents.

    Example:
        >>> _pl_operation_to_sql("Eq")
        '='
    """
    try:
        return {
            "Lt": "<",
            "LtEq": "<=",
            "Gt": ">",
            "GtEq": ">=",
            "Eq": "=",
            "Modulus": "%",
            "And": "AND",
            "Or": "OR",
        }[op]
    except KeyError:
        raise NotImplementedError(op)


def _pl_tree_to_sql(tree: dict) -> str:
    """
    Recursively convert a Polars expression tree (as JSON) to a SQL string.

    Parameters:
        tree (dict): JSON-deserialized expression tree from Polars

    Returns:
        str: SQL expression string

    Example:
        Input tree:
            {
                "BinaryExpr": {
                    "left": { "Column": "foo" },
                    "op": "Gt",
                    "right": { "Literal": { "Int": 5 } }
                }
            }
        Output: "(foo > 5)"
    """
    [node_type] = tree.keys()
    subtree = tree[node_type]

    if node_type == "BinaryExpr":
        # Binary expressions: left OP right
        return (
            "(" +
            " ".join((
                _pl_tree_to_sql(subtree['left']),
                _pl_operation_to_sql(subtree['op']),
                _pl_tree_to_sql(subtree['right'])
            )) +
            ")"
        )

    if node_type == "Column":
        # A reference to a column name
        return subtree

    if node_type in ("Literal", "Dyn"):
        # Recursively process dynamic or literal values
        return _pl_tree_to_sql(subtree)

    if node_type == "Int":
        # Direct integer literals
        return str(subtree)

    if node_type == "Function":
        # Handle boolean functions like IsNull, IsNotNull
        inputs = subtree["input"]
        func_dict = subtree["function"]

        if "Boolean" in func_dict:
            func = func_dict["Boolean"]
            arg_sql = _pl_tree_to_sql(inputs[0])

            if func == "IsNull":
                return f"({arg_sql} IS NULL)"
            if func == "IsNotNull":
                return f"({arg_sql} IS NOT NULL)"
            raise NotImplementedError(f"Boolean function not supported: {func}")

        raise NotImplementedError(f"Unsupported function type: {func_dict}")

    if node_type == "Scalar":
        # Handle scalar values with typed representations
        dtype = str(subtree["dtype"])
        value = subtree["value"]

        # Decimal support
        if dtype.startswith("{'Decimal'"):
            decimal_value = value['Decimal']
            decimal_value = Decimal(decimal_value[0]) / Decimal(10 ** decimal_value[1])
            return str(decimal_value)

        # Datetime with microseconds since epoch
        if dtype.startswith("{'Datetime'"):
            micros = value['Datetime'][0]
            dt_timestamp = datetime.datetime.fromtimestamp(micros / 1_000_000, tz=datetime.UTC)
            return f"'{str(dt_timestamp)}'::TIMESTAMP"

        # Match simple types
        if dtype in ("Int8", "Int16", "Int32", "Int64", "UInt8", "UInt16", "UInt32", "UInt64", "Float32", "Float64", "Boolean"):
            return str(value[dtype])

        if dtype == "Time":
            # Convert nanoseconds to TIME
            nanoseconds = value["Time"]
            seconds = nanoseconds // 1_000_000_000
            microseconds = (nanoseconds % 1_000_000_000) // 1_000
            dt_time = (datetime.datetime.min + datetime.timedelta(seconds=seconds, microseconds=microseconds)).time()
            return f"'{str(dt_time)}'::TIME"

        if dtype == "Date":
            # Convert days since Unix epoch to SQL DATE
            days_since_epoch = value["Date"]
            date = datetime.date(1970, 1, 1) + datetime.timedelta(days=days_since_epoch)
            return f"'{str(date)}'::DATE"

        if dtype == "Binary":
            # Convert binary data to an escaped string for BLOB
            binary_data = bytes(value["Binary"])
            escaped = ''.join(f'\\x{b:02x}' for b in binary_data)
            return f"'{escaped}'::BLOB"

        if dtype == "String":
            return f"'{value['StringOwned']}'"

        raise NotImplementedError(f"Unsupported scalar type {str(dtype)}, with value {value}")

    raise NotImplementedError(f"Node type: {node_type} is not implemented. {subtree}")


def duckdb_source(relation: duckdb.DuckDBPyRelation, schema: pl.schema.Schema) -> pl.LazyFrame:
    """
    A polars IO plugin for DuckDB.
    """
    def source_generator(
        with_columns: Optional[list[str]],
        predicate: Optional[pl.Expr],
        n_rows: Optional[int],
        batch_size: Optional[int],
    ) -> Iterator[pl.DataFrame]:
        duck_predicate = None
        relation_final = relation
        if with_columns is not None:
            cols = ",".join(with_columns)
            relation_final = relation_final.project(cols)
        if n_rows is not None:
            relation_final = relation_final.limit(n_rows)
        if predicate is not None:
            # We have a predicate; if possible, we push it down to DuckDB
            duck_predicate = _predicate_to_expression(predicate)
        # Push the filter down if the conversion succeeded
        if duck_predicate is not None:
            relation_final = relation_final.filter(duck_predicate)
        if batch_size is None:
            results = relation_final.fetch_arrow_reader()
        else:
            results = relation_final.fetch_arrow_reader(batch_size)
        while True:
            try:
                record_batch = results.read_next_batch()
                df = pl.from_arrow(record_batch)
                if predicate is not None and duck_predicate is None:
                    # We have a predicate but did not manage to push it down; fall back to filtering here
                    yield df.filter(predicate)
                else:
                    yield df
            except StopIteration:
                break

    return register_io_source(source_generator, schema=schema)
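
To illustrate the translation path end to end, a hedged sketch that calls the private helper directly (underscore-prefixed names are internal and used here only for illustration; the rendered strings shown in the comments are assumptions based on `_pl_tree_to_sql` above):

    import datetime
    import polars as pl
    from duckdb.polars_io import _predicate_to_expression

    # A compound predicate; when conversion succeeds, duckdb_source pushes
    # it down to DuckDB as a single SQL filter.
    pred = (pl.col("foo") > 5) & pl.col("bar").is_not_null()
    print(_predicate_to_expression(pred))  # expected: ((foo > 5) AND (bar IS NOT NULL))

    # Date literals are rendered as SQL casts (days since the Unix epoch -> DATE).
    pred = pl.col("d") > datetime.date(2024, 1, 1)
    print(_predicate_to_expression(pred))  # expected: (d > '2024-01-01'::DATE)

    # Any node _pl_tree_to_sql cannot handle raises NotImplementedError, which
    # _predicate_to_expression catches, returning None; source_generator then
    # falls back to filtering each record batch in Polars instead.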

external/duckdb

Submodule duckdb updated 477 files

scripts/cache_data.json

Lines changed: 18 additions & 1 deletion
@@ -538,7 +538,8 @@
       "name": "duckdb",
       "children": [
         "duckdb.filesystem",
-        "duckdb.Value"
+        "duckdb.Value",
+        "duckdb.polars_io"
       ]
     },
     "duckdb.filesystem": {

@@ -692,5 +693,21 @@
     "full_path": "pyarrow.ipc.MessageReader",
     "name": "MessageReader",
     "children": []
+  },
+  "duckdb.polars_io": {
+    "type": "module",
+    "full_path": "duckdb.polars_io",
+    "name": "polars_io",
+    "children": [
+      "duckdb.polars_io.duckdb_source"
+    ],
+    "required": false
+  },
+  "duckdb.polars_io.duckdb_source": {
+    "type": "attribute",
+    "full_path": "duckdb.polars_io.duckdb_source",
+    "name": "duckdb_source",
+    "children": [],
+    "required": false
   }
 }

scripts/connection_methods.json

Lines changed: 7 additions & 0 deletions
@@ -385,6 +385,13 @@
         "type": "int"
       }
     ],
+    "kwargs": [
+      {
+        "name": "lazy",
+        "default": "False",
+        "type": "bool"
+      }
+    ],
     "return": "polars.DataFrame"
   },
   {

scripts/imports.py

Lines changed: 4 additions & 0 deletions
@@ -122,3 +122,7 @@
 
 collections.abc.Iterable
 collections.abc.Mapping
+
+import duckdb.polars_io
+
+duckdb.polars_io.duckdb_source
