import datetime
import json
from decimal import Decimal
from typing import Iterator, Optional

import duckdb
import polars as pl
from duckdb import SQLExpression
from polars.io.plugins import register_io_source


def _predicate_to_expression(predicate: pl.Expr) -> Optional[SQLExpression]:
    """
    Convert a Polars predicate expression to a DuckDB-compatible SQL expression.

    Parameters:
        predicate (pl.Expr): A Polars expression (e.g., pl.col("foo") > 5)

    Returns:
        SQLExpression: An equivalent DuckDB SQL expression.
        None: If the conversion fails.

    Example:
        >>> _predicate_to_expression(pl.col("foo") > 5)
        SQLExpression("(foo > 5)")
    """
    # Serialize the Polars expression tree to JSON
    tree = json.loads(predicate.meta.serialize(format="json"))

    try:
        # Convert the tree to SQL
        sql_filter = _pl_tree_to_sql(tree)
        return SQLExpression(sql_filter)
    except Exception:
        # If the conversion fails, return None so the caller can fall back to
        # filtering in Polars
        return None
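
# A rough illustration of what the conversion produces for a compound
# predicate (the serialized tree layout, and therefore the generated SQL,
# may vary slightly between Polars versions):
#
#   _predicate_to_expression((pl.col("foo") > 5) & pl.col("bar").is_not_null())
#   # -> SQLExpression("((foo > 5) AND (bar IS NOT NULL))")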


def _pl_operation_to_sql(op: str) -> str:
    """
    Map Polars binary operation strings to SQL equivalents.

    Example:
        >>> _pl_operation_to_sql("Eq")
        '='
    """
    try:
        return {
            "Lt": "<",
            "LtEq": "<=",
            "Gt": ">",
            "GtEq": ">=",
            "Eq": "=",
            "Modulus": "%",
            "And": "AND",
            "Or": "OR",
        }[op]
    except KeyError:
        raise NotImplementedError(f"Unsupported operation: {op}") from None


def _pl_tree_to_sql(tree: dict) -> str:
    """
    Recursively convert a Polars expression tree (as JSON) to a SQL string.

    Parameters:
        tree (dict): JSON-deserialized expression tree from Polars

    Returns:
        str: SQL expression string

    Example:
        Input tree:
            {
                "BinaryExpr": {
                    "left": { "Column": "foo" },
                    "op": "Gt",
                    "right": { "Literal": { "Int": 5 } }
                }
            }
        Output: "(foo > 5)"
    """
    [node_type] = tree.keys()
    subtree = tree[node_type]

    if node_type == "BinaryExpr":
        # Binary expressions: left OP right
        return (
            "(" +
            " ".join((
                _pl_tree_to_sql(subtree["left"]),
                _pl_operation_to_sql(subtree["op"]),
                _pl_tree_to_sql(subtree["right"])
            )) +
            ")"
        )

    if node_type == "Column":
        # A reference to a column name
        return subtree

    if node_type in ("Literal", "Dyn"):
        # Recursively process dynamic or literal values
        return _pl_tree_to_sql(subtree)

    if node_type == "Int":
        # Direct integer literals
        return str(subtree)

    if node_type == "Function":
        # Handle boolean functions like IsNull, IsNotNull
        inputs = subtree["input"]
        func_dict = subtree["function"]

        if "Boolean" in func_dict:
            func = func_dict["Boolean"]
            arg_sql = _pl_tree_to_sql(inputs[0])

            if func == "IsNull":
                return f"({arg_sql} IS NULL)"
            if func == "IsNotNull":
                return f"({arg_sql} IS NOT NULL)"
            raise NotImplementedError(f"Boolean function not supported: {func}")

        raise NotImplementedError(f"Unsupported function type: {func_dict}")

    if node_type == "Scalar":
        # Handle scalar values with typed representations
        dtype = str(subtree["dtype"])
        value = subtree["value"]

        # Decimal: stored as (unscaled integer, scale)
        if dtype.startswith("{'Decimal'"):
            decimal_value = value["Decimal"]
            decimal_value = Decimal(decimal_value[0]) / Decimal(10 ** decimal_value[1])
            return str(decimal_value)

        # Datetime: microseconds since the Unix epoch
        if dtype.startswith("{'Datetime'"):
            micros = value["Datetime"][0]
            dt_timestamp = datetime.datetime.fromtimestamp(micros / 1_000_000, tz=datetime.UTC)
            return f"'{dt_timestamp}'::TIMESTAMP"

        # Simple numeric and boolean types map directly
        if dtype in ("Int8", "Int16", "Int32", "Int64", "UInt8", "UInt16", "UInt32", "UInt64", "Float32", "Float64", "Boolean"):
            return str(value[dtype])

        if dtype == "Time":
            # Convert nanoseconds since midnight to a SQL TIME literal
            nanoseconds = value["Time"]
            seconds = nanoseconds // 1_000_000_000
            microseconds = (nanoseconds % 1_000_000_000) // 1_000
            dt_time = (datetime.datetime.min + datetime.timedelta(seconds=seconds, microseconds=microseconds)).time()
            return f"'{dt_time}'::TIME"

        if dtype == "Date":
            # Convert days since the Unix epoch to a SQL DATE literal
            days_since_epoch = value["Date"]
            date = datetime.date(1970, 1, 1) + datetime.timedelta(days=days_since_epoch)
            return f"'{date}'::DATE"

        if dtype == "Binary":
            # Convert binary data to a hex-escaped BLOB literal
            binary_data = bytes(value["Binary"])
            escaped = "".join(f"\\x{b:02x}" for b in binary_data)
            return f"'{escaped}'::BLOB"

        if dtype == "String":
            # Escape embedded single quotes so the literal remains valid SQL
            escaped = value["StringOwned"].replace("'", "''")
            return f"'{escaped}'"

        raise NotImplementedError(f"Unsupported scalar type {dtype}, with value {value}")

    raise NotImplementedError(f"Node type: {node_type} is not implemented. {subtree}")
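
# For example, a date literal reaches this function as a Scalar node whose
# value is the number of days since the Unix epoch and is rewritten as a typed
# SQL literal (the exact tree layout can vary between Polars versions):
#
#   pl.col("d") >= pl.lit(datetime.date(2024, 1, 1))
#   # Scalar value {"Date": 19723}  ->  "(d >= '2024-01-01'::DATE)"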


def duckdb_source(relation: duckdb.DuckDBPyRelation, schema: pl.schema.Schema) -> pl.LazyFrame:
    """
    A Polars IO plugin that lazily reads a DuckDB relation, pushing column
    projections, row limits and (where possible) predicates down to DuckDB.
    """
    def source_generator(
        with_columns: Optional[list[str]],
        predicate: Optional[pl.Expr],
        n_rows: Optional[int],
        batch_size: Optional[int],
    ) -> Iterator[pl.DataFrame]:
        duck_predicate = None
        relation_final = relation
        if with_columns is not None:
            cols = ",".join(with_columns)
            relation_final = relation_final.project(cols)
        if n_rows is not None:
            relation_final = relation_final.limit(n_rows)
        if predicate is not None:
            # If possible, translate the predicate so DuckDB can apply it
            duck_predicate = _predicate_to_expression(predicate)
        # Push the filter down to DuckDB if the translation succeeded
        if duck_predicate is not None:
            relation_final = relation_final.filter(duck_predicate)
        if batch_size is None:
            results = relation_final.fetch_arrow_reader()
        else:
            results = relation_final.fetch_arrow_reader(batch_size)
        while True:
            try:
                record_batch = results.read_next_batch()
            except StopIteration:
                break
            df = pl.from_arrow(record_batch)
            if predicate is not None and duck_predicate is None:
                # The predicate could not be pushed down, so filter in Polars
                yield df.filter(predicate)
            else:
                yield df

    return register_io_source(source_generator, schema=schema)
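

# --- Illustrative usage -------------------------------------------------------
# A minimal sketch, not part of the plugin itself: the `events` table and its
# single `id` column are invented for the demo; any DuckDB relation (table,
# view, Parquet scan, ...) can be scanned the same way.
if __name__ == "__main__":
    con = duckdb.connect()
    con.execute("CREATE TABLE events AS SELECT * FROM range(100) t(id)")

    lf = duckdb_source(con.table("events"), schema=pl.Schema({"id": pl.Int64()}))

    # Polars hands the column selection, row limit and predicate to the
    # generator above; the predicate translates to SQL, so DuckDB does the
    # filtering before the batches reach Polars.
    print(lf.filter(pl.col("id") > 42).select("id").head(10).collect())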