Skip to content

Commit fbebbfb

Browse files
authored
Merge branch 'main' into missing_expr
2 parents 66f8a38 + 91b6635 commit fbebbfb

File tree

23 files changed

+2028
-398
lines changed

23 files changed

+2028
-398
lines changed

Cargo.lock

Lines changed: 372 additions & 230 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
[package]
1919
name = "datafusion-python"
20-
version = "45.2.0"
20+
version = "46.0.0"
2121
homepage = "https://datafusion.apache.org/python"
2222
repository = "https://github.com/apache/datafusion-python"
2323
authors = ["Apache DataFusion <dev@datafusion.apache.org>"]
@@ -34,25 +34,25 @@ protoc = [ "datafusion-substrait/protoc" ]
3434
substrait = ["dep:datafusion-substrait"]
3535

3636
[dependencies]
37-
tokio = { version = "1.43", features = ["macros", "rt", "rt-multi-thread", "sync"] }
38-
pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] }
39-
pyo3-async-runtimes = { version = "0.23", features = ["tokio-runtime"]}
40-
arrow = { version = "54.2.1", features = ["pyarrow"] }
41-
datafusion = { version = "46.0.1", features = ["avro", "unicode_expressions"] }
42-
datafusion-substrait = { version = "46.0.1", optional = true }
43-
datafusion-proto = { version = "46.0.1" }
44-
datafusion-ffi = { version = "46.0.1" }
37+
tokio = { version = "1.44", features = ["macros", "rt", "rt-multi-thread", "sync"] }
38+
pyo3 = { version = "0.24", features = ["extension-module", "abi3", "abi3-py39"] }
39+
pyo3-async-runtimes = { version = "0.24", features = ["tokio-runtime"]}
40+
arrow = { version = "55.0.0", features = ["pyarrow"] }
41+
datafusion = { version = "47.0.0", features = ["avro", "unicode_expressions"] }
42+
datafusion-substrait = { version = "47.0.0", optional = true }
43+
datafusion-proto = { version = "47.0.0" }
44+
datafusion-ffi = { version = "47.0.0" }
4545
prost = "0.13.1" # keep in line with `datafusion-substrait`
46-
uuid = { version = "1.12", features = ["v4"] }
46+
uuid = { version = "1.16", features = ["v4"] }
4747
mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] }
48-
async-trait = "0.1.73"
48+
async-trait = "0.1.88"
4949
futures = "0.3"
50-
object_store = { version = "0.11.0", features = ["aws", "gcp", "azure", "http"] }
50+
object_store = { version = "0.12.0", features = ["aws", "gcp", "azure", "http"] }
5151
url = "2"
5252

5353
[build-dependencies]
5454
prost-types = "0.13.1" # keep in line with `datafusion-substrait`
55-
pyo3-build-config = "0.23"
55+
pyo3-build-config = "0.24"
5656

5757
[lib]
5858
name = "datafusion_python"

dev/changelog/46.0.0.md

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
<!--
2+
Licensed to the Apache Software Foundation (ASF) under one
3+
or more contributor license agreements. See the NOTICE file
4+
distributed with this work for additional information
5+
regarding copyright ownership. The ASF licenses this file
6+
to you under the Apache License, Version 2.0 (the
7+
"License"); you may not use this file except in compliance
8+
with the License. You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing,
13+
software distributed under the License is distributed on an
14+
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
KIND, either express or implied. See the License for the
16+
specific language governing permissions and limitations
17+
under the License.
18+
-->
19+
20+
# Apache DataFusion Python 46.0.0 Changelog
21+
22+
This release consists of 21 commits from 11 contributors. See credits at the end of this changelog for more information.
23+
24+
**Implemented enhancements:**
25+
26+
- feat: reads using global ctx [#982](https://github.com/apache/datafusion-python/pull/982) (ion-elgreco)
27+
- feat: Implementation of udf and udaf decorator [#1040](https://github.com/apache/datafusion-python/pull/1040) (CrystalZhou0529)
28+
- feat: expose regex_count function [#1066](https://github.com/apache/datafusion-python/pull/1066) (nirnayroy)
29+
- feat: Update DataFusion dependency to 46 [#1079](https://github.com/apache/datafusion-python/pull/1079) (timsaucer)
30+
31+
**Fixed bugs:**
32+
33+
- fix: add to_timestamp_nanos [#1020](https://github.com/apache/datafusion-python/pull/1020) (chenkovsky)
34+
- fix: type checking [#993](https://github.com/apache/datafusion-python/pull/993) (chenkovsky)
35+
36+
**Other:**
37+
38+
- [infra] Fail Clippy on rust build warnings [#1029](https://github.com/apache/datafusion-python/pull/1029) (kevinjqliu)
39+
- Add user documentation for the FFI approach [#1031](https://github.com/apache/datafusion-python/pull/1031) (timsaucer)
40+
- build(deps): bump arrow from 54.1.0 to 54.2.0 [#1035](https://github.com/apache/datafusion-python/pull/1035) (dependabot[bot])
41+
- Chore: Release datafusion-python 45 [#1024](https://github.com/apache/datafusion-python/pull/1024) (timsaucer)
42+
- Enable Dataframe to be converted into views which can be used in register_table [#1016](https://github.com/apache/datafusion-python/pull/1016) (kosiew)
43+
- Add ruff check for missing futures import [#1052](https://github.com/apache/datafusion-python/pull/1052) (timsaucer)
44+
- Enable take comments to assign issues to users [#1058](https://github.com/apache/datafusion-python/pull/1058) (timsaucer)
45+
- Update python min version to 3.9 [#1043](https://github.com/apache/datafusion-python/pull/1043) (kevinjqliu)
46+
- feat/improve ruff test coverage [#1055](https://github.com/apache/datafusion-python/pull/1055) (timsaucer)
47+
- feat/making global context accessible for users [#1060](https://github.com/apache/datafusion-python/pull/1060) (jsai28)
48+
- Renaming Internal Structs [#1059](https://github.com/apache/datafusion-python/pull/1059) (Spaarsh)
49+
- test: add pytest asyncio tests [#1063](https://github.com/apache/datafusion-python/pull/1063) (jsai28)
50+
- Add decorator for udwf [#1061](https://github.com/apache/datafusion-python/pull/1061) (kosiew)
51+
- Add additional ruff suggestions [#1062](https://github.com/apache/datafusion-python/pull/1062) (Spaarsh)
52+
- Improve collection during repr and repr_html [#1036](https://github.com/apache/datafusion-python/pull/1036) (timsaucer)
53+
54+
## Credits
55+
56+
Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
57+
58+
```
59+
7 Tim Saucer
60+
2 Kevin Liu
61+
2 Spaarsh
62+
2 jsai28
63+
2 kosiew
64+
1 Chen Chongchen
65+
1 Chongchen Chen
66+
1 Crystal Zhou
67+
1 Ion Koutsouris
68+
1 Nirnay Roy
69+
1 dependabot[bot]
70+
```
71+
72+
Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.
73+

docs/source/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ Example
7272
user-guide/introduction
7373
user-guide/basics
7474
user-guide/data-sources
75+
user-guide/dataframe
7576
user-guide/common-operations/index
7677
user-guide/io/index
7778
user-guide/configuration

docs/source/user-guide/basics.rst

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,9 @@
2020
Concepts
2121
========
2222

23-
In this section, we will cover a basic example to introduce a few key concepts. We will use the same
24-
source file as described in the :ref:`Introduction <guide>`, the Pokemon data set.
23+
In this section, we will cover a basic example to introduce a few key concepts. We will use the
24+
2021 Yellow Taxi Trip Records (`download <https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet>`_),
25+
from the `TLC Trip Record Data <https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page>`_.
2526

2627
.. ipython:: python
2728
@@ -72,6 +73,8 @@ DataFrames are typically created by calling a method on :py:class:`~datafusion.c
7273
calling the transformation methods, such as :py:func:`~datafusion.dataframe.DataFrame.filter`, :py:func:`~datafusion.dataframe.DataFrame.select`, :py:func:`~datafusion.dataframe.DataFrame.aggregate`,
7374
and :py:func:`~datafusion.dataframe.DataFrame.limit` to build up a query definition.
7475

76+
For more details on working with DataFrames, including visualization options and conversion to other formats, see :doc:`dataframe`.
77+
7578
Expressions
7679
-----------
7780

docs/source/user-guide/dataframe.rst

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
.. Licensed to the Apache Software Foundation (ASF) under one
2+
.. or more contributor license agreements. See the NOTICE file
3+
.. distributed with this work for additional information
4+
.. regarding copyright ownership. The ASF licenses this file
5+
.. to you under the Apache License, Version 2.0 (the
6+
.. "License"); you may not use this file except in compliance
7+
.. with the License. You may obtain a copy of the License at
8+
9+
.. http://www.apache.org/licenses/LICENSE-2.0
10+
11+
.. Unless required by applicable law or agreed to in writing,
12+
.. software distributed under the License is distributed on an
13+
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
.. KIND, either express or implied. See the License for the
15+
.. specific language governing permissions and limitations
16+
.. under the License.
17+
18+
DataFrames
19+
==========
20+
21+
Overview
22+
--------
23+
24+
DataFusion's DataFrame API provides a powerful interface for building and executing queries against data sources.
25+
It offers a familiar API similar to pandas and other DataFrame libraries, but with the performance benefits of Rust
26+
and Arrow.
27+
28+
A DataFrame represents a logical plan that can be composed through operations like filtering, projection, and aggregation.
29+
The actual execution happens when terminal operations like ``collect()`` or ``show()`` are called.
30+
31+
Basic Usage
32+
-----------
33+
34+
.. code-block:: python
35+
36+
import datafusion
37+
from datafusion import col, lit
38+
39+
# Create a context and register a data source
40+
ctx = datafusion.SessionContext()
41+
ctx.register_csv("my_table", "path/to/data.csv")
42+
43+
# Create and manipulate a DataFrame
44+
df = ctx.sql("SELECT * FROM my_table")
45+
46+
# Or use the DataFrame API directly
47+
df = (ctx.table("my_table")
48+
.filter(col("age") > lit(25))
49+
.select([col("name"), col("age")]))
50+
51+
# Execute and collect results
52+
result = df.collect()
53+
54+
# Display the first few rows
55+
df.show()
56+
57+
HTML Rendering
58+
--------------
59+
60+
When working in Jupyter notebooks or other environments that support HTML rendering, DataFrames will
61+
automatically display as formatted HTML tables, making it easier to visualize your data.
62+
63+
The ``_repr_html_`` method is called automatically by Jupyter to render a DataFrame. This method
64+
controls how DataFrames appear in notebook environments, providing a richer visualization than
65+
plain text output.
66+
67+
Customizing HTML Rendering
68+
--------------------------
69+
70+
You can customize how DataFrames are rendered in HTML by configuring the formatter:
71+
72+
.. code-block:: python
73+
74+
from datafusion.html_formatter import configure_formatter
75+
76+
# Change the default styling
77+
configure_formatter(
78+
max_rows=50, # Maximum number of rows to display
79+
max_width=None, # Maximum width in pixels (None for auto)
80+
theme="light", # Theme: "light" or "dark"
81+
precision=2, # Floating point precision
82+
thousands_separator=",", # Separator for thousands
83+
date_format="%Y-%m-%d", # Date format
84+
truncate_width=20 # Max width for string columns before truncating
85+
)
86+
87+
The formatter settings affect all DataFrames displayed after configuration.
88+
89+
Custom Style Providers
90+
----------------------
91+
92+
For advanced styling needs, you can create a custom style provider:
93+
94+
.. code-block:: python
95+
96+
from datafusion.html_formatter import StyleProvider, configure_formatter
97+
98+
class MyStyleProvider(StyleProvider):
99+
def get_table_styles(self):
100+
return {
101+
"table": "border-collapse: collapse; width: 100%;",
102+
"th": "background-color: #007bff; color: white; padding: 8px; text-align: left;",
103+
"td": "border: 1px solid #ddd; padding: 8px;",
104+
"tr:nth-child(even)": "background-color: #f2f2f2;",
105+
}
106+
107+
def get_value_styles(self, dtype, value):
108+
"""Return custom styles for specific values"""
109+
if dtype == "float" and value < 0:
110+
return "color: red;"
111+
return None
112+
113+
# Apply the custom style provider
114+
configure_formatter(style_provider=MyStyleProvider())
115+
116+
Creating a Custom Formatter
117+
---------------------------
118+
119+
For complete control over rendering, you can implement a custom formatter:
120+
121+
.. code-block:: python
122+
123+
from datafusion.html_formatter import Formatter, configure_formatter, get_formatter
124+
125+
class MyFormatter(Formatter):
126+
def format_html(self, batches, schema, has_more=False, table_uuid=None):
127+
# Create your custom HTML here
128+
html = "<div class='my-custom-table'>"
129+
# ... formatting logic ...
130+
html += "</div>"
131+
return html
132+
133+
# Set as the global formatter
134+
configure_formatter(formatter_class=MyFormatter)
135+
136+
# Or use the formatter just for specific operations
137+
formatter = get_formatter()
138+
custom_html = formatter.format_html(batches, schema)
139+
140+
Managing Formatters
141+
-------------------
142+
143+
Reset to default formatting:
144+
145+
.. code-block:: python
146+
147+
from datafusion.html_formatter import reset_formatter
148+
149+
# Reset to default settings
150+
reset_formatter()
151+
152+
Get the current formatter settings:
153+
154+
.. code-block:: python
155+
156+
from datafusion.html_formatter import get_formatter
157+
158+
formatter = get_formatter()
159+
print(formatter.max_rows)
160+
print(formatter.theme)
161+
162+
Contextual Formatting
163+
---------------------
164+
165+
You can also use a context manager to temporarily change formatting settings:
166+
167+
.. code-block:: python
168+
169+
from datafusion.html_formatter import formatting_context
170+
171+
# Default formatting
172+
df.show()
173+
174+
# Temporarily use different formatting
175+
with formatting_context(max_rows=100, theme="dark"):
176+
df.show() # Will use the temporary settings
177+
178+
# Back to default formatting
179+
df.show()

python/datafusion/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
except ImportError:
2727
import importlib_metadata
2828

29-
from . import functions, object_store, substrait
29+
from . import functions, object_store, substrait, unparser
3030

3131
# The following imports are okay to remain as opaque to the user.
3232
from ._internal import Config
@@ -45,6 +45,7 @@
4545
Expr,
4646
WindowFrame,
4747
)
48+
from .html_formatter import configure_formatter
4849
from .io import read_avro, read_csv, read_json, read_parquet
4950
from .plan import ExecutionPlan, LogicalPlan
5051
from .record_batch import RecordBatch, RecordBatchStream
@@ -76,6 +77,7 @@
7677
"col",
7778
"column",
7879
"common",
80+
"configure_formatter",
7981
"expr",
8082
"functions",
8183
"lit",
@@ -89,6 +91,7 @@
8991
"udaf",
9092
"udf",
9193
"udwf",
94+
"unparser",
9295
]
9396

9497

python/datafusion/expr.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -456,9 +456,17 @@ def column(value: str) -> Expr:
456456
"""Creates a new expression representing a column."""
457457
return Expr(expr_internal.RawExpr.column(value))
458458

459-
def alias(self, name: str) -> Expr:
460-
"""Assign a name to the expression."""
461-
return Expr(self.expr.alias(name))
459+
def alias(self, name: str, metadata: Optional[dict[str, str]] = None) -> Expr:
460+
"""Assign a name to the expression.
461+
462+
Args:
463+
name: The name to assign to the expression.
464+
metadata: Optional metadata to attach to the expression.
465+
466+
Returns:
467+
A new expression with the assigned name.
468+
"""
469+
return Expr(self.expr.alias(name, metadata))
462470

463471
def sort(self, ascending: bool = True, nulls_first: bool = True) -> SortExpr:
464472
"""Creates a sort :py:class:`Expr` from an existing :py:class:`Expr`.

0 commit comments

Comments
 (0)