Skip to content

Commit f9b3610

Browse files
Merge branch 'main' into pandas_analyzer_doesnt_need_pandas
2 parents 9947be5 + f640841 commit f9b3610

File tree

12 files changed

+93
-38
lines changed

12 files changed

+93
-38
lines changed

.github/workflows/release.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ jobs:
7878
if [[ test == "${{ inputs.pypi-index }}" ]]; then
7979
ci_env=pypi-test
8080
elif [[ prod == "${{ inputs.pypi-index }}" ]]; then
81-
ci_env=pypi-prod${{ inputs.stable-version && '' || '-nightly' }}
81+
ci_env=pypi-prod${{ inputs.stable-version == '' && '-nightly' || '' }}
8282
else
8383
echo "::error::Invalid value for inputs.pypi-index: ${{ inputs.pypi-index }}"
8484
exit 1
@@ -146,6 +146,7 @@ jobs:
146146
publish_pypi:
147147
name: Publish Artifacts to PyPI
148148
runs-on: ubuntu-latest
149+
if: ${{ !always() }}
149150
needs: [workflow_state, build_sdist, build_wheels]
150151
environment:
151152
name: ${{ needs.workflow_state.outputs.ci_env }}

README.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77
</div>
88
<br />
99
<p align="center">
10-
<a href="https://discord.gg/tcvwpjfnZx"><img src="https://shields.io/discord/909674491309850675" alt="discord" /></a>
11-
<a href="https://pypi.org/project/duckdb/"><img src="https://img.shields.io/pypi/v/duckdb.svg" alt="PyPi Latest Release"/></a>
10+
<a href="https://discord.gg/tcvwpjfnZx"><img src="https://shields.io/discord/909674491309850675" alt="Discord" /></a>
11+
<a href="https://pypi.org/project/duckdb/"><img src="https://img.shields.io/pypi/v/duckdb.svg" alt="PyPI Latest Release"/></a>
1212
</p>
1313
<br />
1414
<p align="center">
@@ -30,7 +30,7 @@
3030

3131
## Installation
3232

33-
Install the latest release of DuckDB directly from [PyPi](https://pypi.org/project/duckdb/):
33+
Install the latest release of DuckDB directly from [PyPI](https://pypi.org/project/duckdb/):
3434

3535
```bash
3636
pip install duckdb
@@ -169,11 +169,11 @@ uvx gcovr \
169169
### Typechecking and linting
170170

171171
- We're not running any mypy typechecking tests at the moment
172-
- We're not running any ruff / linting / formatting at the moment
172+
- We're not running any Ruff / linting / formatting at the moment
173173

174174
### Cibuildwheel
175175

176-
You can run cibuildwheel locally for linux. E.g. limited to Python 3.9:
176+
You can run cibuildwheel locally for Linux. E.g. limited to Python 3.9:
177177
```bash
178178
CIBW_BUILD='cp39-*' uvx cibuildwheel --platform linux .
179179
```
@@ -186,7 +186,7 @@ CIBW_BUILD='cp39-*' uvx cibuildwheel --platform linux .
186186
### Tooling
187187

188188
This codebase is developed with the following tools:
189-
- [Astral UV](https://docs.astral.sh/uv/) - for dependency management across all platforms we provide wheels for,
189+
- [Astral uv](https://docs.astral.sh/uv/) - for dependency management across all platforms we provide wheels for,
190190
and for Python environment management. It will be hard to work on this codebase without having uv installed.
191191
- [Scikit-build-core](https://scikit-build-core.readthedocs.io/en/latest/index.html) - the build backend for
192192
building the extension. In the background, scikit-build-core uses cmake and ninja for compilation.

duckdb/__init__.pyi

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -415,7 +415,7 @@ class DuckDBPyRelation:
415415
def variance(self, column: str, groups: str = ..., window_spec: str = ..., projected_columns: str = ...) -> DuckDBPyRelation: ...
416416
def list(self, column: str, groups: str = ..., window_spec: str = ..., projected_columns: str = ...) -> DuckDBPyRelation: ...
417417

418-
def arrow(self, batch_size: int = ...) -> pyarrow.lib.Table: ...
418+
def arrow(self, batch_size: int = ...) -> pyarrow.lib.RecordBatchReader: ...
419419
def __arrow_c_stream__(self, requested_schema: Optional[object] = None) -> object: ...
420420
def create(self, table_name: str) -> None: ...
421421
def create_view(self, view_name: str, replace: bool = ...) -> DuckDBPyRelation: ...
@@ -448,6 +448,7 @@ class DuckDBPyRelation:
448448
def pl(self, rows_per_batch: int = ..., connection: DuckDBPyConnection = ...) -> polars.DataFrame: ...
449449
def query(self, virtual_table_name: str, sql_query: str) -> DuckDBPyRelation: ...
450450
def record_batch(self, batch_size: int = ...) -> pyarrow.lib.RecordBatchReader: ...
451+
def fetch_record_batch(self, rows_per_batch: int = 1000000, *, connection: DuckDBPyConnection = ...) -> pyarrow.lib.RecordBatchReader: ...
451452
def select_types(self, types: List[Union[str, DuckDBPyType]]) -> DuckDBPyRelation: ...
452453
def select_dtypes(self, types: List[Union[str, DuckDBPyType]]) -> DuckDBPyRelation: ...
453454
def set_alias(self, alias: str) -> DuckDBPyRelation: ...

duckdb/experimental/spark/sql/dataframe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def toArrow(self) -> "pa.Table":
7575
age: [[2,5]]
7676
name: [["Alice","Bob"]]
7777
"""
78-
return self.relation.arrow()
78+
return self.relation.to_arrow_table()
7979

8080
def createOrReplaceTempView(self, name: str) -> None:
8181
"""Creates or replaces a local temporary view with this :class:`DataFrame`.

duckdb/polars_io.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,18 @@ def _pl_operation_to_sql(op: str) -> str:
5858
raise NotImplementedError(op)
5959

6060

61+
def _escape_sql_identifier(identifier: str) -> str:
62+
"""
63+
Escape SQL identifiers by doubling any double quotes and wrapping in double quotes.
64+
65+
Example:
66+
>>> _escape_sql_identifier('column"name')
67+
'"column""name"'
68+
"""
69+
escaped = identifier.replace('"', '""')
70+
return f'"{escaped}"'
71+
72+
6173
def _pl_tree_to_sql(tree: dict) -> str:
6274
"""
6375
Recursively convert a Polars expression tree (as JSON) to a SQL string.
@@ -95,7 +107,8 @@ def _pl_tree_to_sql(tree: dict) -> str:
95107
)
96108
if node_type == "Column":
97109
# A reference to a column name
98-
return subtree
110+
# Wrap in quotes to handle special characters
111+
return _escape_sql_identifier(subtree)
99112

100113
if node_type in ("Literal", "Dyn"):
101114
# Recursively process dynamic or literal values
@@ -196,7 +209,7 @@ def source_generator(
196209
duck_predicate = None
197210
relation_final = relation
198211
if with_columns is not None:
199-
cols = ",".join(with_columns)
212+
cols = ",".join(map(_escape_sql_identifier, with_columns))
200213
relation_final = relation_final.project(cols)
201214
if n_rows is not None:
202215
relation_final = relation_final.limit(n_rows)
@@ -213,7 +226,6 @@ def source_generator(
213226
while True:
214227
try:
215228
record_batch = results.read_next_batch()
216-
df = pl.from_arrow(record_batch)
217229
if predicate is not None and duck_predicate is None:
218230
# We have a predicate, but did not manage to push it down, we fallback here
219231
yield pl.from_arrow(record_batch).filter(predicate)

external/duckdb

Submodule duckdb updated 363 files

scripts/connection_methods.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,7 @@
412412
"fetch_record_batch",
413413
"arrow"
414414
],
415-
415+
416416
"function": "FetchRecordBatchReader",
417417
"docs": "Fetch an Arrow RecordBatchReader following execute()",
418418
"args": [
@@ -992,7 +992,7 @@
992992
"args": [
993993
{
994994
"name": "file_globs",
995-
"type": "str"
995+
"type": "List[str]"
996996
},
997997
{
998998
"name": "binary_as_string",

scripts/generate_connection_stubs.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,12 @@ def create_arguments(arguments) -> list:
5151
result.append(argument)
5252
return result
5353

54-
def create_definition(name, method) -> str:
55-
definition = f"def {name}("
54+
def create_definition(name, method, overloaded: bool) -> str:
55+
if overloaded:
56+
definition: str = "@overload\n"
57+
else:
58+
definition: str = ""
59+
definition += f"def {name}("
5660
arguments = ['self']
5761
if 'args' in method:
5862
arguments.extend(create_arguments(method['args']))
@@ -65,20 +69,17 @@ def create_definition(name, method) -> str:
6569
definition += f" -> {method['return']}: ..."
6670
return definition
6771

68-
# We have "duplicate" methods, which are overloaded
69-
# maybe we should add @overload to these instead, but this is easier
70-
written_methods = set()
72+
# We have "duplicate" methods, which are overloaded.
73+
# We keep note of them to add the @overload decorator.
74+
overloaded_methods: set[str] = {m for m in connection_methods if isinstance(m['name'], list)}
7175

7276
for method in connection_methods:
7377
if isinstance(method['name'], list):
7478
names = method['name']
7579
else:
7680
names = [method['name']]
7781
for name in names:
78-
if name in written_methods:
79-
continue
80-
body.append(create_definition(name, method))
81-
written_methods.add(name)
82+
body.append(create_definition(name, method, name in overloaded_methods))
8283

8384
# ---- End of generation code ----
8485

scripts/generate_connection_wrapper_stubs.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,12 @@ def create_arguments(arguments) -> list:
6666
result.append(argument)
6767
return result
6868

69-
def create_definition(name, method) -> str:
70-
definition = f"def {name}("
69+
def create_definition(name, method, overloaded: bool) -> str:
70+
if overloaded:
71+
definition: str = "@overload\n"
72+
else:
73+
definition: str = ""
74+
definition += f"def {name}("
7175
arguments = []
7276
if name in SPECIAL_METHOD_NAMES:
7377
arguments.append('df: pandas.DataFrame')
@@ -82,9 +86,9 @@ def create_definition(name, method) -> str:
8286
definition += f" -> {method['return']}: ..."
8387
return definition
8488

85-
# We have "duplicate" methods, which are overloaded
86-
# maybe we should add @overload to these instead, but this is easier
87-
written_methods = set()
89+
# We have "duplicate" methods, which are overloaded.
90+
# We keep note of them to add the @overload decorator.
91+
overloaded_methods: set[str] = {m for m in connection_methods if isinstance(m['name'], list)}
8892

8993
body = []
9094
for method in methods:
@@ -99,10 +103,7 @@ def create_definition(name, method) -> str:
99103
method['kwargs'].append({'name': 'connection', 'type': 'DuckDBPyConnection', 'default': '...'})
100104

101105
for name in names:
102-
if name in written_methods:
103-
continue
104-
body.append(create_definition(name, method))
105-
written_methods.add(name)
106+
body.append(create_definition(name, method, name in overloaded_methods))
106107

107108
# ---- End of generation code ----
108109

src/duckdb_py/pyrelation/initialize.cpp

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ static void InitializeConsumers(py::class_<DuckDBPyRelation> &m) {
6161
py::arg("date_as_object") = false)
6262
.def("fetch_df_chunk", &DuckDBPyRelation::FetchDFChunk, "Execute and fetch a chunk of the rows",
6363
py::arg("vectors_per_chunk") = 1, py::kw_only(), py::arg("date_as_object") = false)
64-
.def("arrow", &DuckDBPyRelation::ToArrowTable, "Execute and fetch all rows as an Arrow Table",
64+
.def("arrow", &DuckDBPyRelation::ToRecordBatch, "Execute and return an Arrow Record Batch Reader that yields all rows",
6565
py::arg("batch_size") = 1000000)
6666
.def("fetch_arrow_table", &DuckDBPyRelation::ToArrowTable, "Execute and fetch all rows as an Arrow Table",
6767
py::arg("batch_size") = 1000000)
@@ -78,10 +78,18 @@ static void InitializeConsumers(py::class_<DuckDBPyRelation> &m) {
7878
)";
7979
m.def("__arrow_c_stream__", &DuckDBPyRelation::ToArrowCapsule, capsule_docs,
8080
py::arg("requested_schema") = py::none());
81-
m.def("record_batch", &DuckDBPyRelation::ToRecordBatch,
82-
"Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("batch_size") = 1000000)
83-
.def("fetch_arrow_reader", &DuckDBPyRelation::ToRecordBatch,
84-
"Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("batch_size") = 1000000);
81+
m.def("fetch_record_batch", &DuckDBPyRelation::ToRecordBatch,
82+
"Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("rows_per_batch") = 1000000)
83+
.def("fetch_arrow_reader", &DuckDBPyRelation::ToRecordBatch,
84+
"Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("batch_size") = 1000000)
85+
.def("record_batch",
86+
[](pybind11::object &self, idx_t rows_per_batch)
87+
{
88+
PyErr_WarnEx(PyExc_DeprecationWarning,
89+
"record_batch() is deprecated, use fetch_record_batch() instead.",
90+
0);
91+
return self.attr("fetch_record_batch")(rows_per_batch);
92+
}, py::arg("batch_size") = 1000000);
8593
}
8694

8795
static void InitializeAggregates(py::class_<DuckDBPyRelation> &m) {

0 commit comments

Comments
 (0)