refactor: unify expression and sorting logic; improve docs and error handling

kosiew · kosiew · commit 91167b085c1a · 2025-09-02T16:20:06.000+08:00
- Accept column-name strings in `Window(order_by=...)` and convert
  `order_by` to raw sort expressions when the Window is constructed
- Improve type checking in DataFrame expression handling: iterable
  arguments are materialized once before being checked for bare strings
- Replace `Expr`/`SortExpr` with `SortKey` in file_sort_order and
  related functions
- Simplify file_sort_order handling in SessionContext
- Rename `_EXPR_TYPE_ERROR` → `EXPR_TYPE_ERROR` for consistency
- Clarify usage of `col()` vs `column()` in DataFrame examples
- Enhance documentation for file_sort_order in SessionContext
diff --git a/docs/source/user-guide/dataframe/index.rst b/docs/source/user-guide/dataframe/index.rst
@@ -142,21 +142,21 @@ For such methods, you can pass column names directly:
 
 .. code-block:: python
 
-    from datafusion import col, column, functions as f
+    from datafusion import col, functions as f
 
     df.sort('id')
     df.aggregate('id', [f.count(col('value'))])
 
-The same operation can also be written with an explicit column expression:
+The same operation can also be written with explicit column expressions, using either ``col()`` or ``column()``:
 
 .. code-block:: python
 
     from datafusion import col, column, functions as f
 
     df.sort(col('id'))
-    df.aggregate(col('id'), [f.count(col('value'))])
+    df.aggregate(column('id'), [f.count(col('value'))])
 
-Note that ``column()`` is an alias of ``col()``, so you can use either name.
+Note that ``column()`` is an alias of ``col()``, so you can use either name; the example above shows both in action.
 
 Whenever an argument represents an expression—such as in
 :py:meth:`~datafusion.DataFrame.filter` or
diff --git a/python/datafusion/context.py b/python/datafusion/context.py
@@ -31,7 +31,7 @@
 
 from datafusion.catalog import Catalog, CatalogProvider, Table
 from datafusion.dataframe import DataFrame
-from datafusion.expr import Expr, SortExpr, sort_list_to_raw_sort_list
+from datafusion.expr import SortKey, sort_list_to_raw_sort_list
 from datafusion.record_batch import RecordBatchStream
 from datafusion.user_defined import AggregateUDF, ScalarUDF, TableFunction, WindowUDF
 
@@ -553,7 +553,7 @@ def register_listing_table(
         table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
         file_extension: str = ".parquet",
         schema: pa.Schema | None = None,
-        file_sort_order: list[list[Expr | SortExpr | str]] | None = None,
+        file_sort_order: list[list[SortKey]] | None = None,
     ) -> None:
         """Register multiple files as a single table.
 
@@ -567,23 +567,20 @@ def register_listing_table(
             table_partition_cols: Partition columns.
             file_extension: File extension of the provided table.
             schema: The data source schema.
-            file_sort_order: Sort order for the file.
+            file_sort_order: Sort order for the file. Each sort key can be
+                specified as a column name (``str``), an expression
+                (``Expr``), or a ``SortExpr``.
         """
         if table_partition_cols is None:
             table_partition_cols = []
         table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
-        file_sort_order_raw = (
-            [sort_list_to_raw_sort_list(f) for f in file_sort_order]
-            if file_sort_order is not None
-            else None
-        )
         self.ctx.register_listing_table(
             name,
             str(path),
             table_partition_cols,
             file_extension,
             schema,
-            file_sort_order_raw,
+            self._convert_file_sort_order(file_sort_order),
         )
 
     def sql(self, query: str, options: SQLOptions | None = None) -> DataFrame:
@@ -808,7 +805,7 @@ def register_parquet(
         file_extension: str = ".parquet",
         skip_metadata: bool = True,
         schema: pa.Schema | None = None,
-        file_sort_order: list[list[Expr | SortExpr | str]] | None = None,
+        file_sort_order: list[list[SortKey]] | None = None,
     ) -> None:
         """Register a Parquet file as a table.
 
@@ -827,7 +824,9 @@ def register_parquet(
                 that may be in the file schema. This can help avoid schema
                 conflicts due to metadata.
             schema: The data source schema.
-            file_sort_order: Sort order for the file.
+            file_sort_order: Sort order for the file. Each sort key can be
+                specified as a column name (``str``), an expression
+                (``Expr``), or a ``SortExpr``.
         """
         if table_partition_cols is None:
             table_partition_cols = []
@@ -840,9 +839,7 @@ def register_parquet(
             file_extension,
             skip_metadata,
             schema,
-            [sort_list_to_raw_sort_list(exprs) for exprs in file_sort_order]
-            if file_sort_order is not None
-            else None,
+            self._convert_file_sort_order(file_sort_order),
         )
 
     def register_csv(
@@ -1099,7 +1096,7 @@ def read_parquet(
         file_extension: str = ".parquet",
         skip_metadata: bool = True,
         schema: pa.Schema | None = None,
-        file_sort_order: list[list[Expr | SortExpr | str]] | None = None,
+        file_sort_order: list[list[SortKey]] | None = None,
     ) -> DataFrame:
         """Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`.
 
@@ -1116,19 +1113,17 @@ def read_parquet(
             schema: An optional schema representing the parquet files. If None,
                 the parquet reader will try to infer it based on data in the
                 file.
-            file_sort_order: Sort order for the file.
+            file_sort_order: Sort order for the file. Each sort key can be
+                specified as a column name (``str``), an expression
+                (``Expr``), or a ``SortExpr``.
 
         Returns:
             DataFrame representation of the read Parquet files
         """
         if table_partition_cols is None:
             table_partition_cols = []
         table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
-        file_sort_order = (
-            [sort_list_to_raw_sort_list(f) for f in file_sort_order]
-            if file_sort_order is not None
-            else None
-        )
+        file_sort_order = self._convert_file_sort_order(file_sort_order)
         return DataFrame(
             self.ctx.read_parquet(
                 str(path),
@@ -1179,6 +1174,16 @@ def execute(self, plan: ExecutionPlan, partitions: int) -> RecordBatchStream:
         """Execute the ``plan`` and return the results."""
         return RecordBatchStream(self.ctx.execute(plan._raw_plan, partitions))
 
+    @staticmethod
+    def _convert_file_sort_order(
+        file_sort_order: list[list[SortKey]] | None,
+    ) -> list[list[Any]] | None:
+        """Convert each inner list of sort keys to raw sort expressions."""
+        # ``None`` means no sort order was given; pass it through unchanged.
+        if file_sort_order is None:
+            return None
+        return [sort_list_to_raw_sort_list(keys) for keys in file_sort_order]
+
     @staticmethod
     def _convert_table_partition_cols(
         table_partition_cols: list[tuple[str, str | pa.DataType]],
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -41,9 +41,9 @@
 from datafusion._internal import ParquetColumnOptions as ParquetColumnOptionsInternal
 from datafusion._internal import ParquetWriterOptions as ParquetWriterOptionsInternal
 from datafusion.expr import (
-    _EXPR_TYPE_ERROR,
+    EXPR_TYPE_ERROR,
     Expr,
-    SortExpr,
+    SortKey,
     expr_list_to_raw_expr_list,
     sort_list_to_raw_sort_list,
 )
@@ -431,7 +431,7 @@ def filter(self, *predicates: Expr) -> DataFrame:
         df = self.df
         for p in predicates:
             if not isinstance(p, Expr):
-                raise TypeError(_EXPR_TYPE_ERROR)
+                raise TypeError(EXPR_TYPE_ERROR)
             df = df.filter(p.expr)
         return DataFrame(df)
 
@@ -446,7 +446,7 @@ def with_column(self, name: str, expr: Expr) -> DataFrame:
             DataFrame with the new column.
         """
         if not isinstance(expr, Expr):
-            raise TypeError(_EXPR_TYPE_ERROR)
+            raise TypeError(EXPR_TYPE_ERROR)
         return DataFrame(self.df.with_column(name, expr.expr))
 
     def with_columns(
@@ -478,19 +478,21 @@ def _simplify_expression(
         ) -> list[expr_internal.Expr]:
             expr_list: list[expr_internal.Expr] = []
             for expr in exprs:
-                if isinstance(expr, str) or (
-                    isinstance(expr, Iterable)
-                    and not isinstance(expr, Expr)
-                    and any(isinstance(inner, str) for inner in expr)
-                ):
-                    raise TypeError(_EXPR_TYPE_ERROR)
+                if isinstance(expr, str):
+                    raise TypeError(EXPR_TYPE_ERROR)
+                if isinstance(expr, Iterable) and not isinstance(expr, Expr):
+                    expr_value = list(expr)
+                    if any(isinstance(inner, str) for inner in expr_value):
+                        raise TypeError(EXPR_TYPE_ERROR)
+                else:
+                    expr_value = expr
                 try:
-                    expr_list.extend(expr_list_to_raw_expr_list(expr))
+                    expr_list.extend(expr_list_to_raw_expr_list(expr_value))
                 except TypeError as err:
-                    raise TypeError(_EXPR_TYPE_ERROR) from err
+                    raise TypeError(EXPR_TYPE_ERROR) from err
             for alias, expr in named_exprs.items():
                 if not isinstance(expr, Expr):
-                    raise TypeError(_EXPR_TYPE_ERROR)
+                    raise TypeError(EXPR_TYPE_ERROR)
                 expr_list.append(expr.alias(alias).expr)
             return expr_list
 
@@ -536,11 +538,11 @@ def aggregate(
         aggs_exprs = []
         for agg in aggs_list:
             if not isinstance(agg, Expr):
-                raise TypeError(_EXPR_TYPE_ERROR)
+                raise TypeError(EXPR_TYPE_ERROR)
             aggs_exprs.append(agg.expr)
         return DataFrame(self.df.aggregate(group_by_exprs, aggs_exprs))
 
-    def sort(self, *exprs: Expr | SortExpr | str) -> DataFrame:
+    def sort(self, *exprs: SortKey) -> DataFrame:
         """Sort the DataFrame by the specified sorting expressions or column names.
 
         Note that any expression can be turned into a sort expression by
@@ -779,7 +781,7 @@ def join_on(
         exprs = []
         for expr in on_exprs:
             if not isinstance(expr, Expr):
-                raise TypeError(_EXPR_TYPE_ERROR)
+                raise TypeError(EXPR_TYPE_ERROR)
             exprs.append(expr.expr)
         return DataFrame(self.df.join_on(right.df, exprs, how))
 
diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py
@@ -22,7 +22,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, ClassVar, Optional, Sequence
+from typing import TYPE_CHECKING, Any, ClassVar, Optional, Sequence, Union
 
 import pyarrow as pa
 
@@ -41,7 +41,9 @@
 
 
 # Standard error message for invalid expression types
-_EXPR_TYPE_ERROR = "Use col() or lit() to construct expressions"
+EXPR_TYPE_ERROR = "Use col() or lit() to construct expressions"
+
+SortKey = Union["Expr", "SortExpr", str]
 
 # The following are imported from the internal representation. We may choose to
 # give these all proper wrappers, or to simply leave as is. These were added
@@ -199,6 +201,7 @@
     "SimilarTo",
     "Sort",
     "SortExpr",
+    "SortKey",
     "Subquery",
     "SubqueryAlias",
     "TableScan",
@@ -236,7 +239,7 @@ def expr_list_to_raw_expr_list(
         else:
             error = (
                 "Expected Expr or column name, found:"
-                f" {type(e).__name__}. {_EXPR_TYPE_ERROR}."
+                f" {type(e).__name__}. {EXPR_TYPE_ERROR}."
             )
             raise TypeError(error)
     return raw_exprs
@@ -250,7 +253,7 @@ def sort_or_default(e: Expr | SortExpr) -> expr_internal.SortExpr:
 
 
 def sort_list_to_raw_sort_list(
-    sort_list: Optional[list[Expr | SortExpr | str] | Expr | SortExpr | str],
+    sort_list: Optional[list[SortKey] | SortKey],
 ) -> Optional[list[expr_internal.SortExpr]]:
     """Helper function to return an optional sort list to raw variant."""
     if isinstance(sort_list, (Expr, SortExpr, str)):
@@ -266,7 +269,7 @@ def sort_list_to_raw_sort_list(
         else:
             error = (
                 "Expected Expr or column name, found:"
-                f" {type(item).__name__}. {_EXPR_TYPE_ERROR}."
+                f" {type(item).__name__}. {EXPR_TYPE_ERROR}."
             )
             raise TypeError(error)
         raw_sort_list.append(sort_or_default(expr_obj))
@@ -693,7 +696,7 @@ def over(self, window: Window) -> Expr:
             window: Window definition
         """
         partition_by_raw = expr_list_to_raw_expr_list(window._partition_by)
-        order_by_raw = sort_list_to_raw_sort_list(window._order_by)
+        order_by_raw = window._order_by
         window_frame_raw = (
             window._window_frame.window_frame
             if window._window_frame is not None
@@ -1179,7 +1182,7 @@ def __init__(
         self,
         partition_by: Optional[list[Expr] | Expr] = None,
         window_frame: Optional[WindowFrame] = None,
-        order_by: Optional[list[SortExpr | Expr] | Expr | SortExpr] = None,
+        order_by: Optional[list[SortExpr | Expr | str] | Expr | SortExpr | str] = None,
         null_treatment: Optional[NullTreatment] = None,
     ) -> None:
         """Construct a window definition.
@@ -1192,7 +1195,7 @@ def __init__(
         """
         self._partition_by = partition_by
         self._window_frame = window_frame
-        self._order_by = order_by
+        self._order_by = sort_list_to_raw_sort_list(order_by)
         self._null_treatment = null_treatment
 
 
diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py