Skip to content

Commit f9b3610

Browse files
Merge branch 'main' into pandas_analyzer_doesnt_need_pandas
2 parents 9947be5 + f640841 commit f9b3610

File tree

12 files changed

+93
-38
lines changed

12 files changed

+93
-38
lines changed

.github/workflows/release.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ jobs:
7878
if [[ test == "${{ inputs.pypi-index }}" ]]; then
7979
ci_env=pypi-test
8080
elif [[ prod == "${{ inputs.pypi-index }}" ]]; then
81-
ci_env=pypi-prod${{ inputs.stable-version && '' || '-nightly' }}
81+
ci_env=pypi-prod${{ inputs.stable-version == '' && '-nightly' || '' }}
8282
else
8383
echo "::error::Invalid value for inputs.pypi-index: ${{ inputs.pypi-index }}"
8484
exit 1
@@ -146,6 +146,7 @@ jobs:
146146
publish_pypi:
147147
name: Publish Artifacts to PyPI
148148
runs-on: ubuntu-latest
149+
if: ${{ !always() }}
149150
needs: [workflow_state, build_sdist, build_wheels]
150151
environment:
151152
name: ${{ needs.workflow_state.outputs.ci_env }}

README.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77
</div>
88
<br />
99
<p align="center">
10-
<a href="https://discord.gg/tcvwpjfnZx"><img src="https://shields.io/discord/909674491309850675" alt="discord" /></a>
11-
<a href="https://pypi.org/project/duckdb/"><img src="https://img.shields.io/pypi/v/duckdb.svg" alt="PyPi Latest Release"/></a>
10+
<a href="https://discord.gg/tcvwpjfnZx"><img src="https://shields.io/discord/909674491309850675" alt="Discord" /></a>
11+
<a href="https://pypi.org/project/duckdb/"><img src="https://img.shields.io/pypi/v/duckdb.svg" alt="PyPI Latest Release"/></a>
1212
</p>
1313
<br />
1414
<p align="center">
@@ -30,7 +30,7 @@
3030

3131
## Installation
3232

33-
Install the latest release of DuckDB directly from [PyPi](https://pypi.org/project/duckdb/):
33+
Install the latest release of DuckDB directly from [PyPI](https://pypi.org/project/duckdb/):
3434

3535
```bash
3636
pip install duckdb
@@ -169,11 +169,11 @@ uvx gcovr \
169169
### Typechecking and linting
170170

171171
- We're not running any mypy typechecking tests at the moment
172-
- We're not running any ruff / linting / formatting at the moment
172+
- We're not running any Ruff / linting / formatting at the moment
173173

174174
### Cibuildwheel
175175

176-
You can run cibuildwheel locally for linux. E.g. limited to Python 3.9:
176+
You can run cibuildwheel locally for Linux. E.g. limited to Python 3.9:
177177
```bash
178178
CIBW_BUILD='cp39-*' uvx cibuildwheel --platform linux .
179179
```
@@ -186,7 +186,7 @@ CIBW_BUILD='cp39-*' uvx cibuildwheel --platform linux .
186186
### Tooling
187187

188188
This codebase is developed with the following tools:
189-
- [Astral UV](https://docs.astral.sh/uv/) - for dependency management across all platforms we provide wheels for,
189+
- [Astral uv](https://docs.astral.sh/uv/) - for dependency management across all platforms we provide wheels for,
190190
and for Python environment management. It will be hard to work on this codebase without having uv installed.
191191
- [Scikit-build-core](https://scikit-build-core.readthedocs.io/en/latest/index.html) - the build backend for
192192
building the extension. In the background, scikit-build-core uses cmake and ninja for compilation.

duckdb/__init__.pyi

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -415,7 +415,7 @@ class DuckDBPyRelation:
415415
def variance(self, column: str, groups: str = ..., window_spec: str = ..., projected_columns: str = ...) -> DuckDBPyRelation: ...
416416
def list(self, column: str, groups: str = ..., window_spec: str = ..., projected_columns: str = ...) -> DuckDBPyRelation: ...
417417

418-
def arrow(self, batch_size: int = ...) -> pyarrow.lib.Table: ...
418+
def arrow(self, batch_size: int = ...) -> pyarrow.lib.RecordBatchReader: ...
419419
def __arrow_c_stream__(self, requested_schema: Optional[object] = None) -> object: ...
420420
def create(self, table_name: str) -> None: ...
421421
def create_view(self, view_name: str, replace: bool = ...) -> DuckDBPyRelation: ...
@@ -448,6 +448,7 @@ class DuckDBPyRelation:
448448
def pl(self, rows_per_batch: int = ..., connection: DuckDBPyConnection = ...) -> polars.DataFrame: ...
449449
def query(self, virtual_table_name: str, sql_query: str) -> DuckDBPyRelation: ...
450450
def record_batch(self, batch_size: int = ...) -> pyarrow.lib.RecordBatchReader: ...
451+
def fetch_record_batch(self, rows_per_batch: int = 1000000, *, connection: DuckDBPyConnection = ...) -> pyarrow.lib.RecordBatchReader: ...
451452
def select_types(self, types: List[Union[str, DuckDBPyType]]) -> DuckDBPyRelation: ...
452453
def select_dtypes(self, types: List[Union[str, DuckDBPyType]]) -> DuckDBPyRelation: ...
453454
def set_alias(self, alias: str) -> DuckDBPyRelation: ...

duckdb/experimental/spark/sql/dataframe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def toArrow(self) -> "pa.Table":
7575
age: [[2,5]]
7676
name: [["Alice","Bob"]]
7777
"""
78-
return self.relation.arrow()
78+
return self.relation.to_arrow_table()
7979

8080
def createOrReplaceTempView(self, name: str) -> None:
8181
"""Creates or replaces a local temporary view with this :class:`DataFrame`.

duckdb/polars_io.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,18 @@ def _pl_operation_to_sql(op: str) -> str:
5858
raise NotImplementedError(op)
5959

6060

61+
def _escape_sql_identifier(identifier: str) -> str:
62+
"""
63+
Escape SQL identifiers by doubling any double quotes and wrapping in double quotes.
64+
65+
Example:
66+
>>> _escape_sql_identifier('column"name')
67+
'"column""name"'
68+
"""
69+
escaped = identifier.replace('"', '""')
70+
return f'"{escaped}"'
71+
72+
6173
def _pl_tree_to_sql(tree: dict) -> str:
6274
"""
6375
Recursively convert a Polars expression tree (as JSON) to a SQL string.
@@ -95,7 +107,8 @@ def _pl_tree_to_sql(tree: dict) -> str:
95107
)
96108
if node_type == "Column":
97109
# A reference to a column name
98-
return subtree
110+
# Wrap in quotes to handle special characters
111+
return _escape_sql_identifier(subtree)
99112

100113
if node_type in ("Literal", "Dyn"):
101114
# Recursively process dynamic or literal values
@@ -196,7 +209,7 @@ def source_generator(
196209
duck_predicate = None
197210
relation_final = relation
198211
if with_columns is not None:
199-
cols = ",".join(with_columns)
212+
cols = ",".join(map(_escape_sql_identifier, with_columns))
200213
relation_final = relation_final.project(cols)
201214
if n_rows is not None:
202215
relation_final = relation_final.limit(n_rows)
@@ -213,7 +226,6 @@ def source_generator(
213226
while True:
214227
try:
215228
record_batch = results.read_next_batch()
216-
df = pl.from_arrow(record_batch)
217229
if predicate is not None and duck_predicate is None:
218230
# We have a predicate, but did not manage to push it down, we fallback here
219231
yield pl.from_arrow(record_batch).filter(predicate)

external/duckdb

Submodule duckdb updated 363 files

scripts/connection_methods.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,7 @@
412412
"fetch_record_batch",
413413
"arrow"
414414
],
415-
415+
416416
"function": "FetchRecordBatchReader",
417417
"docs": "Fetch an Arrow RecordBatchReader following execute()",
418418
"args": [
@@ -992,7 +992,7 @@
992992
"args": [
993993
{
994994
"name": "file_globs",
995-
"type": "str"
995+
"type": "List[str]"
996996
},
997997
{
998998
"name": "binary_as_string",

scripts/generate_connection_stubs.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,12 @@ def create_arguments(arguments) -> list:
5151
result.append(argument)
5252
return result
5353

54-
def create_definition(name, method) -> str:
55-
definition = f"def {name}("
54+
def create_definition(name, method, overloaded: bool) -> str:
55+
if overloaded:
56+
definition: str = "@overload\n"
57+
else:
58+
definition: str = ""
59+
definition += f"def {name}("
5660
arguments = ['self']
5761
if 'args' in method:
5862
arguments.extend(create_arguments(method['args']))
@@ -65,20 +69,17 @@ def create_definition(name, method) -> str:
6569
definition += f" -> {method['return']}: ..."
6670
return definition
6771

68-
# We have "duplicate" methods, which are overloaded
69-
# maybe we should add @overload to these instead, but this is easier
70-
written_methods = set()
72+
# We have "duplicate" methods, which are overloaded.
73+
# We keep note of them to add the @overload decorator.
74+
overloaded_methods: set[str] = {m for m in connection_methods if isinstance(m['name'], list)}
7175

7276
for method in connection_methods:
7377
if isinstance(method['name'], list):
7478
names = method['name']
7579
else:
7680
names = [method['name']]
7781
for name in names:
78-
if name in written_methods:
79-
continue
80-
body.append(create_definition(name, method))
81-
written_methods.add(name)
82+
body.append(create_definition(name, method, name in overloaded_methods))
8283

8384
# ---- End of generation code ----
8485

scripts/generate_connection_wrapper_stubs.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,12 @@ def create_arguments(arguments) -> list:
6666
result.append(argument)
6767
return result
6868

69-
def create_definition(name, method) -> str:
70-
definition = f"def {name}("
69+
def create_definition(name, method, overloaded: bool) -> str:
70+
if overloaded:
71+
definition: str = "@overload\n"
72+
else:
73+
definition: str = ""
74+
definition += f"def {name}("
7175
arguments = []
7276
if name in SPECIAL_METHOD_NAMES:
7377
arguments.append('df: pandas.DataFrame')
@@ -82,9 +86,9 @@ def create_definition(name, method) -> str:
8286
definition += f" -> {method['return']}: ..."
8387
return definition
8488

85-
# We have "duplicate" methods, which are overloaded
86-
# maybe we should add @overload to these instead, but this is easier
87-
written_methods = set()
89+
# We have "duplicate" methods, which are overloaded.
90+
# We keep note of them to add the @overload decorator.
91+
overloaded_methods: set[str] = {m for m in connection_methods if isinstance(m['name'], list)}
8892

8993
body = []
9094
for method in methods:
@@ -99,10 +103,7 @@ def create_definition(name, method) -> str:
99103
method['kwargs'].append({'name': 'connection', 'type': 'DuckDBPyConnection', 'default': '...'})
100104

101105
for name in names:
102-
if name in written_methods:
103-
continue
104-
body.append(create_definition(name, method))
105-
written_methods.add(name)
106+
body.append(create_definition(name, method, name in overloaded_methods))
106107

107108
# ---- End of generation code ----
108109

src/duckdb_py/pyrelation/initialize.cpp

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ static void InitializeConsumers(py::class_<DuckDBPyRelation> &m) {
6161
py::arg("date_as_object") = false)
6262
.def("fetch_df_chunk", &DuckDBPyRelation::FetchDFChunk, "Execute and fetch a chunk of the rows",
6363
py::arg("vectors_per_chunk") = 1, py::kw_only(), py::arg("date_as_object") = false)
64-
.def("arrow", &DuckDBPyRelation::ToArrowTable, "Execute and fetch all rows as an Arrow Table",
64+
.def("arrow", &DuckDBPyRelation::ToRecordBatch, "Execute and return an Arrow Record Batch Reader that yields all rows",
6565
py::arg("batch_size") = 1000000)
6666
.def("fetch_arrow_table", &DuckDBPyRelation::ToArrowTable, "Execute and fetch all rows as an Arrow Table",
6767
py::arg("batch_size") = 1000000)
@@ -78,10 +78,18 @@ static void InitializeConsumers(py::class_<DuckDBPyRelation> &m) {
7878
)";
7979
m.def("__arrow_c_stream__", &DuckDBPyRelation::ToArrowCapsule, capsule_docs,
8080
py::arg("requested_schema") = py::none());
81-
m.def("record_batch", &DuckDBPyRelation::ToRecordBatch,
82-
"Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("batch_size") = 1000000)
83-
.def("fetch_arrow_reader", &DuckDBPyRelation::ToRecordBatch,
84-
"Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("batch_size") = 1000000);
81+
m.def("fetch_record_batch", &DuckDBPyRelation::ToRecordBatch,
82+
"Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("rows_per_batch") = 1000000)
83+
.def("fetch_arrow_reader", &DuckDBPyRelation::ToRecordBatch,
84+
"Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("batch_size") = 1000000)
85+
.def("record_batch",
86+
[](pybind11::object &self, idx_t rows_per_batch)
87+
{
88+
PyErr_WarnEx(PyExc_DeprecationWarning,
89+
"record_batch() is deprecated, use fetch_record_batch() instead.",
90+
0);
91+
return self.attr("fetch_record_batch")(rows_per_batch);
92+
}, py::arg("batch_size") = 1000000);
8593
}
8694

8795
static void InitializeAggregates(py::class_<DuckDBPyRelation> &m) {

0 commit comments

Comments
 (0)