Refactor and enhance expression handling, test coverage, and documentation

kosiew · kosiew · commit f591617c0a3c · 2025-09-02T16:20:06.000+08:00
- Introduced `ensure_expr_list` to validate and flatten nested
  expressions, treating strings as atomic
- Updated expression utilities to improve consistency across aggregation
  and window functions
- Consolidated and expanded parameterized tests for string equivalence
  in ranking and window functions
- Exposed `EXPR_TYPE_ERROR` for consistent error messaging across
  modules and tests
- Typed `_convert_file_sort_order` as returning `expr_internal.SortExpr`
- Clarified expectations for `join_on` expressions in documentation
- Standardized imports and improved test clarity for maintainability
diff --git a/docs/source/user-guide/dataframe/index.rst b/docs/source/user-guide/dataframe/index.rst
@@ -138,6 +138,8 @@ existing column. These include:
 * :py:meth:`~datafusion.DataFrame.join` (``on`` argument)
 * :py:meth:`~datafusion.DataFrame.aggregate` (grouping columns)
 
+Note that, unlike the methods listed above, :py:meth:`~datafusion.DataFrame.join_on` expects ``col()``/``column()`` expressions rather than plain strings.
+
 For such methods, you can pass column names directly:
 
 .. code-block:: python
diff --git a/python/datafusion/context.py b/python/datafusion/context.py
@@ -20,7 +20,8 @@
 from __future__ import annotations
 
 import warnings
-from typing import TYPE_CHECKING, Any, Protocol, Sequence
+from typing import TYPE_CHECKING, Any, Protocol
+from collections.abc import Sequence
 
 import pyarrow as pa
 
@@ -39,6 +40,7 @@
 from ._internal import SessionConfig as SessionConfigInternal
 from ._internal import SessionContext as SessionContextInternal
 from ._internal import SQLOptions as SQLOptionsInternal
+from ._internal import expr as expr_internal
 
 if TYPE_CHECKING:
     import pathlib
@@ -1177,8 +1179,8 @@ def execute(self, plan: ExecutionPlan, partitions: int) -> RecordBatchStream:
     @staticmethod
     def _convert_file_sort_order(
         file_sort_order: Sequence[Sequence[SortKey]] | None,
-    ) -> list[list[Any]] | None:
-        """Convert nested ``SortKey`` sequences into raw sort representations.
+    ) -> list[list[expr_internal.SortExpr]] | None:
+        """Convert nested ``SortKey`` sequences into raw sort expressions.
 
         Each ``SortKey`` can be a column name string, an ``Expr``, or a
         ``SortExpr`` and will be converted using
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -22,6 +22,7 @@
 from __future__ import annotations
 
 import warnings
+from collections.abc import Sequence
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -44,6 +45,7 @@
     Expr,
     SortKey,
     ensure_expr,
+    ensure_expr_list,
     expr_list_to_raw_expr_list,
     sort_list_to_raw_sort_list,
 )
@@ -52,7 +54,7 @@
 
 if TYPE_CHECKING:
     import pathlib
-    from typing import Callable, Sequence
+    from typing import Callable
 
     import pandas as pd
     import polars as pl
@@ -487,17 +489,7 @@ def with_columns(
         Returns:
             DataFrame with the new columns added.
         """
-
-        def _iter_exprs(items: Iterable[Expr | Iterable[Expr]]) -> Iterable[Expr | str]:
-            for expr in items:
-                if isinstance(expr, str):
-                    yield expr
-                elif isinstance(expr, Iterable) and not isinstance(expr, Expr):
-                    yield from _iter_exprs(expr)
-                else:
-                    yield expr
-
-        expressions = [ensure_expr(e) for e in _iter_exprs(exprs)]
+        expressions = ensure_expr_list(exprs)
         for alias, expr in named_exprs.items():
             ensure_expr(expr)
             expressions.append(expr.alias(alias).expr)
@@ -523,23 +515,31 @@ def with_column_renamed(self, old_name: str, new_name: str) -> DataFrame:
 
     def aggregate(
         self,
-        group_by: list[Expr | str] | Expr | str,
-        aggs: list[Expr] | Expr,
+        group_by: Sequence[Expr | str] | Expr | str,
+        aggs: Sequence[Expr] | Expr,
     ) -> DataFrame:
         """Aggregates the rows of the current DataFrame.
 
         Args:
-            group_by: List of expressions or column names to group by.
-            aggs: List of expressions to aggregate.
+            group_by: Sequence of expressions or column names to group by.
+            aggs: Sequence of expressions to aggregate.
 
         Returns:
             DataFrame after aggregation.
         """
-        group_by_list = group_by if isinstance(group_by, list) else [group_by]
-        aggs_list = aggs if isinstance(aggs, list) else [aggs]
+        group_by_list = (
+            list(group_by)
+            if isinstance(group_by, Sequence) and not isinstance(group_by, (Expr, str))
+            else [group_by]
+        )
+        aggs_list = (
+            list(aggs)
+            if isinstance(aggs, Sequence) and not isinstance(aggs, Expr)
+            else [aggs]
+        )
 
         group_by_exprs = expr_list_to_raw_expr_list(group_by_list)
-        aggs_exprs = [ensure_expr(agg) for agg in aggs_list]
+        aggs_exprs = ensure_expr_list(aggs_list)
         return DataFrame(self.df.aggregate(group_by_exprs, aggs_exprs))
 
     def sort(self, *exprs: SortKey) -> DataFrame:
diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py
@@ -22,7 +22,8 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, ClassVar, Optional, Sequence
+from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Optional
+from collections.abc import Sequence
 
 import pyarrow as pa
 
@@ -131,6 +132,7 @@
 WindowExpr = expr_internal.WindowExpr
 
 __all__ = [
+    "EXPR_TYPE_ERROR",
     "Aggregate",
     "AggregateFunction",
     "Alias",
@@ -219,6 +221,7 @@
     "WindowFrame",
     "WindowFrameBound",
     "ensure_expr",
+    "ensure_expr_list",
 ]
 
 
@@ -243,6 +246,34 @@ def ensure_expr(value: Expr | Any) -> expr_internal.Expr:
     return value.expr
 
 
+def ensure_expr_list(
+    exprs: Iterable[Expr | Iterable[Expr]],
+) -> list[expr_internal.Expr]:
+    """Flatten an iterable of expressions, validating each via ``ensure_expr``.
+
+    Args:
+        exprs: Possibly nested iterable of expressions; str/bytes stay atomic.
+
+    Returns:
+        A flat list of raw expressions.
+
+    Raises:
+        TypeError: If any item is not an instance of :class:`Expr`.
+    """
+
+    def _iter(items: Iterable[Expr | Iterable[Expr]]) -> Iterable[expr_internal.Expr]:
+        for expr in items:
+            # str/bytes are iterable but atomic here; ensure_expr rejects them
+            if isinstance(expr, Iterable) and not isinstance(
+                expr, (Expr, str, bytes, bytearray)
+            ):
+                yield from _iter(expr)
+            else:
+                yield ensure_expr(expr)
+
+    return list(_iter(exprs))
+
+
 def _to_raw_expr(value: Expr | str) -> expr_internal.Expr:
     """Convert a Python expression or column name to its raw variant.
 
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
@@ -303,6 +303,18 @@ def test_aggregate_string_and_expression_equivalent(df):
     assert result_str == result_expr
 
 
+def test_aggregate_tuple_group_by(df):
+    result_list = df.aggregate(["a"], [f.count()]).sort("a").to_pydict()
+    result_tuple = df.aggregate(("a",), [f.count()]).sort("a").to_pydict()
+    assert result_tuple == result_list
+
+
+def test_aggregate_tuple_aggs(df):
+    result_list = df.aggregate("a", [f.count()]).sort("a").to_pydict()
+    result_tuple = df.aggregate("a", (f.count(),)).sort("a").to_pydict()
+    assert result_tuple == result_list
+
+
 def test_filter_string_unsupported(df):
     with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)):
         df.filter("a > 1")
@@ -416,14 +428,14 @@ def test_with_columns(df):
 
 
 def test_with_columns_invalid_expr(df):
-    with pytest.raises(
-        TypeError, match=r"Use col\(\)/column\(\) or lit\(\)/literal\(\)"
-    ):
+    with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)):
         df.with_columns("a")
-    with pytest.raises(
-        TypeError, match=r"Use col\(\)/column\(\) or lit\(\)/literal\(\)"
-    ):
+    with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)):
         df.with_columns(c="a")
+    with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)):
+        df.with_columns(["a"])
+    with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)):
+        df.with_columns(c=["a"])
 
 
 def test_cast(df):
@@ -843,6 +855,27 @@ def test_window_functions(partitioned_df, name, expr, result):
     assert table.sort_by("a").to_pydict() == expected
 
 
+@pytest.mark.parametrize("partition", ["c", df_col("c")])
+def test_rank_partition_by_accepts_string(partitioned_df, partition):
+    """Passing a string to partition_by should match using col()."""
+    df = partitioned_df.select(
+        f.rank(order_by=column("a"), partition_by=partition).alias("r")
+    )
+    table = pa.Table.from_batches(df.sort(column("a")).collect())
+    assert table.column("r").to_pylist() == [1, 2, 3, 4, 1, 2, 3]
+
+
+@pytest.mark.parametrize("partition", ["c", df_col("c")])
+def test_window_partition_by_accepts_string(partitioned_df, partition):
+    """Window.partition_by accepts string identifiers."""
+    expr = f.first_value(column("a")).over(
+        Window(partition_by=partition, order_by=column("b"))
+    )
+    df = partitioned_df.select(expr.alias("fv"))
+    table = pa.Table.from_batches(df.sort(column("a")).collect())
+    assert table.column("fv").to_pylist() == [1, 1, 1, 1, 5, 5, 5]
+
+
 @pytest.mark.parametrize(
     ("units", "start_bound", "end_bound"),
     [
@@ -913,6 +946,69 @@ def test_window_frame_defaults_match_postgres(partitioned_df):
     assert df_2.sort(col_a).to_pydict() == expected
 
 
+def _build_last_value_df(df):
+    return df.select(
+        f.last_value(column("a"))
+        .over(
+            Window(
+                partition_by=[column("c")],
+                order_by=[column("b")],
+                window_frame=WindowFrame("rows", None, None),
+            )
+        )
+        .alias("expr"),
+        f.last_value(column("a"))
+        .over(
+            Window(
+                partition_by=[column("c")],
+                order_by="b",
+                window_frame=WindowFrame("rows", None, None),
+            )
+        )
+        .alias("str"),
+    )
+
+
+def _build_nth_value_df(df):
+    return df.select(
+        f.nth_value(column("b"), 3).over(Window(order_by=[column("a")])).alias("expr"),
+        f.nth_value(column("b"), 3).over(Window(order_by="a")).alias("str"),
+    )
+
+
+def _build_rank_df(df):
+    return df.select(
+        f.rank(order_by=[column("b")]).alias("expr"),
+        f.rank(order_by="b").alias("str"),
+    )
+
+
+def _build_array_agg_df(df):
+    return df.aggregate(
+        [column("c")],
+        [
+            f.array_agg(column("a"), order_by=[column("a")]).alias("expr"),
+            f.array_agg(column("a"), order_by="a").alias("str"),
+        ],
+    ).sort(column("c"))
+
+
+@pytest.mark.parametrize(
+    ("builder", "expected"),
+    [
+        pytest.param(_build_last_value_df, [3, 3, 3, 3, 6, 6, 6], id="last_value"),
+        pytest.param(_build_nth_value_df, [None, None, 7, 7, 7, 7, 7], id="nth_value"),
+        pytest.param(_build_rank_df, [1, 1, 3, 3, 5, 6, 6], id="rank"),
+        pytest.param(_build_array_agg_df, [[0, 1, 2, 3], [4, 5, 6]], id="array_agg"),
+    ],
+)
+def test_order_by_string_equivalence(partitioned_df, builder, expected):
+    df = builder(partitioned_df)
+    table = pa.Table.from_batches(df.collect())
+    assert table.column("expr").to_pylist() == expected
+    assert table.column("expr").to_pylist() == table.column("str").to_pylist()
+
+
 def test_html_formatter_cell_dimension(df, clean_formatter_state):
     """Test configuring the HTML formatter with different options."""
     # Configure with custom settings
diff --git a/python/tests/test_expr.py b/python/tests/test_expr.py
@@ -50,6 +50,7 @@
     TransactionStart,
     Values,
     ensure_expr,
+    ensure_expr_list,
 )
 
 
@@ -890,3 +891,18 @@ def test_ensure_expr():
     assert ensure_expr(e) is e.expr
     with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)):
         ensure_expr("a")
+
+
+def test_ensure_expr_list_string():
+    with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)):
+        ensure_expr_list("a")
+
+
+def test_ensure_expr_list_bytes():
+    with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)):
+        ensure_expr_list(b"a")
+
+
+def test_ensure_expr_list_bytearray():
+    with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)):
+        ensure_expr_list(bytearray(b"a"))