Commit 38af2b5

Merge branch 'main' into table-provider-1239

2 parents: 6e46d43 + bf22c1d
File tree: 19 files changed, +1294 / -769 lines

Cargo.lock

Lines changed: 400 additions & 417 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 9 additions & 9 deletions
@@ -34,15 +34,15 @@ protoc = [ "datafusion-substrait/protoc" ]
 substrait = ["dep:datafusion-substrait"]
 
 [dependencies]
-tokio = { version = "1.45", features = ["macros", "rt", "rt-multi-thread", "sync"] }
-pyo3 = { version = "0.24", features = ["extension-module", "abi3", "abi3-py39"] }
-pyo3-async-runtimes = { version = "0.24", features = ["tokio-runtime"]}
+tokio = { version = "1.47", features = ["macros", "rt", "rt-multi-thread", "sync"] }
+pyo3 = { version = "0.25", features = ["extension-module", "abi3", "abi3-py39"] }
+pyo3-async-runtimes = { version = "0.25", features = ["tokio-runtime"]}
 pyo3-log = "0.12.4"
-arrow = { version = "55.1.0", features = ["pyarrow"] }
-datafusion = { version = "49.0.2", features = ["avro", "unicode_expressions"] }
-datafusion-substrait = { version = "49.0.2", optional = true }
-datafusion-proto = { version = "49.0.2" }
-datafusion-ffi = { version = "49.0.2" }
+arrow = { version = "56", features = ["pyarrow"] }
+datafusion = { version = "50", features = ["avro", "unicode_expressions"] }
+datafusion-substrait = { version = "50", optional = true }
+datafusion-proto = { version = "50" }
+datafusion-ffi = { version = "50" }
 prost = "0.13.1" # keep in line with `datafusion-substrait`
 uuid = { version = "1.18", features = ["v4"] }
 mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] }
@@ -54,7 +54,7 @@ log = "0.4.27"
 
 [build-dependencies]
 prost-types = "0.13.1" # keep in line with `datafusion-substrait`
-pyo3-build-config = "0.24"
+pyo3-build-config = "0.25"
 
 [lib]
 name = "datafusion_python"

benchmarks/max_cpu_usage.py

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@
 
 def main(num_rows: int, partitions: int) -> None:
     """Run a simple aggregation after repartitioning.
-
+
     This function demonstrates basic partitioning concepts using synthetic data.
     Real-world performance will depend on your specific data sources, query types,
     and system configuration.

docs/source/user-guide/common-operations/windows.rst

Lines changed: 24 additions & 20 deletions
@@ -31,7 +31,7 @@ We'll use the pokemon dataset (from Ritchie Vink) in the following examples.
 .. ipython:: python
 
     from datafusion import SessionContext
-    from datafusion import col
+    from datafusion import col, lit
    from datafusion import functions as f
 
     ctx = SessionContext()
@@ -120,16 +120,14 @@ two preceding rows.
 
 .. ipython:: python
 
-    from datafusion.expr import WindowFrame
+    from datafusion.expr import Window, WindowFrame
 
     df.select(
         col('"Name"'),
         col('"Speed"'),
-        f.window("avg",
-            [col('"Speed"')],
-            order_by=[col('"Speed"')],
-            window_frame=WindowFrame("rows", 2, 0)
-        ).alias("Previous Speed")
+        f.avg(col('"Speed"'))
+        .over(Window(window_frame=WindowFrame("rows", 2, 0), order_by=[col('"Speed"')]))
+        .alias("Previous Speed"),
     )
 
 Null Treatment
@@ -151,21 +149,27 @@ it's ``Type 2`` column that are null.
 
     from datafusion.common import NullTreatment
 
-    df.filter(col('"Type 1"') == lit("Bug")).select(
+    df.filter(col('"Type 1"') == lit("Bug")).select(
         '"Name"',
         '"Type 2"',
-        f.window("last_value", [col('"Type 2"')])
-        .window_frame(WindowFrame("rows", None, 0))
-        .order_by(col('"Speed"'))
-        .null_treatment(NullTreatment.IGNORE_NULLS)
-        .build()
-        .alias("last_wo_null"),
-        f.window("last_value", [col('"Type 2"')])
-        .window_frame(WindowFrame("rows", None, 0))
-        .order_by(col('"Speed"'))
-        .null_treatment(NullTreatment.RESPECT_NULLS)
-        .build()
-        .alias("last_with_null")
+        f.last_value(col('"Type 2"'))
+        .over(
+            Window(
+                window_frame=WindowFrame("rows", None, 0),
+                order_by=[col('"Speed"')],
+                null_treatment=NullTreatment.IGNORE_NULLS,
+            )
+        )
+        .alias("last_wo_null"),
+        f.last_value(col('"Type 2"'))
+        .over(
+            Window(
+                window_frame=WindowFrame("rows", None, 0),
+                order_by=[col('"Speed"')],
+                null_treatment=NullTreatment.RESPECT_NULLS,
+            )
+        )
+        .alias("last_with_null"),
     )
 
 Aggregate Functions
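
The migrated API above can be exercised outside the docs build. A minimal self-contained sketch, assuming the post-upgrade datafusion package and a local copy of the guide's pokemon dataset at the hypothetical path pokemon.csv; it reproduces the trailing-average window from the hunk above using Expr.over with an explicit Window:

    from datafusion import SessionContext, col
    from datafusion import functions as f
    from datafusion.expr import Window, WindowFrame

    ctx = SessionContext()
    df = ctx.read_csv("pokemon.csv")  # hypothetical local copy of the dataset

    df.select(
        col('"Name"'),
        col('"Speed"'),
        # Average over the current row and the two preceding rows,
        # ordered by Speed: the replacement for f.window("avg", ...).
        f.avg(col('"Speed"'))
        .over(Window(window_frame=WindowFrame("rows", 2, 0), order_by=[col('"Speed"')]))
        .alias("Previous Speed"),
    ).show()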

docs/source/user-guide/data-sources.rst

Lines changed: 34 additions & 3 deletions
@@ -181,10 +181,41 @@ which can lead to a significant performance difference.
     df = ctx.table("my_delta_table")
     df.show()
 
-Iceberg
--------
+Apache Iceberg
+--------------
 
-Coming soon!
+DataFusion 45.0.0 and later support registering Apache Iceberg tables as table providers through the Custom Table Provider interface.
+
+This requires either the `pyiceberg <https://pypi.org/project/pyiceberg/>`__ library (>=0.10.0) or the `pyiceberg-core <https://pypi.org/project/pyiceberg-core/>`__ library (>=0.5.0).
+
+* The ``pyiceberg-core`` library exposes Iceberg Rust's implementation of the Custom Table Provider interface as Python bindings.
+* The ``pyiceberg`` library uses the ``pyiceberg-core`` bindings under the hood and provides a native way for Python users to interact with DataFusion.
+
+.. code-block:: python
+
+    from datafusion import SessionContext
+    from pyiceberg.catalog import load_catalog
+    import pyarrow as pa
+
+    # Load catalog and create/load a table
+    catalog = load_catalog("catalog", type="in-memory")
+    catalog.create_namespace_if_not_exists("default")
+
+    # Create some sample data
+    data = pa.table({"x": [1, 2, 3], "y": [4, 5, 6]})
+    iceberg_table = catalog.create_table("default.test", schema=data.schema)
+    iceberg_table.append(data)
+
+    # Register the table with DataFusion
+    ctx = SessionContext()
+    ctx.register_table_provider("test", iceberg_table)
+
+    # Query the table using DataFusion
+    ctx.table("test").show()
+
+Note that the DataFusion integration relies on features from the `Iceberg Rust <https://github.com/apache/iceberg-rust/>`_ implementation rather than the `PyIceberg <https://github.com/apache/iceberg-python/>`_ implementation.
+Features that are available in PyIceberg but not yet in Iceberg Rust will not be available when using DataFusion.
 
 Custom Table Provider
 ---------------------
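
Since register_table_provider exposes the Iceberg table like any other DataFusion table, the registered name can also be queried through SQL. A short follow-up to the example in the hunk above, assuming the same ctx and the registered name test:

    # Equivalent to ctx.table("test"), but via the SQL interface.
    ctx.sql("SELECT x, y FROM test WHERE x > 1").show()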

docs/source/user-guide/dataframe/index.rst

Lines changed: 50 additions & 0 deletions
@@ -126,6 +126,56 @@ DataFusion's DataFrame API offers a wide range of operations:
     # Drop columns
     df = df.drop("temporary_column")
 
+Column Names as Function Arguments
+----------------------------------
+
+Some ``DataFrame`` methods accept column names when an argument refers to an
+existing column. These include:
+
+* :py:meth:`~datafusion.DataFrame.select`
+* :py:meth:`~datafusion.DataFrame.sort`
+* :py:meth:`~datafusion.DataFrame.drop`
+* :py:meth:`~datafusion.DataFrame.join` (``on`` argument)
+* :py:meth:`~datafusion.DataFrame.aggregate` (grouping columns)
+
+See the full function documentation for details on any specific function.
+
+Note that :py:meth:`~datafusion.DataFrame.join_on` expects ``col()``/``column()`` expressions rather than plain strings.
+
+For such methods, you can pass column names directly:
+
+.. code-block:: python
+
+    from datafusion import col, functions as f
+
+    df.sort('id')
+    df.aggregate('id', [f.count(col('value'))])
+
+The same operation can also be written with explicit column expressions, using either ``col()`` or ``column()``:
+
+.. code-block:: python
+
+    from datafusion import col, column, functions as f
+
+    df.sort(col('id'))
+    df.aggregate(column('id'), [f.count(col('value'))])
+
+Note that ``column()`` is an alias of ``col()``, so you can use either name; the example above shows both in action.
+
+Whenever an argument represents an expression, such as in
+:py:meth:`~datafusion.DataFrame.filter` or
+:py:meth:`~datafusion.DataFrame.with_column`, use ``col()`` to reference
+columns. The comparison and arithmetic operators on ``Expr`` will automatically
+convert any non-``Expr`` value into a literal expression, so writing
+
+.. code-block:: python
+
+    from datafusion import col
+    df.filter(col("age") > 21)
+
+is equivalent to using ``lit(21)`` explicitly. Use ``lit()`` (also available
+as ``literal()``) when you need to construct a literal expression directly.
 
 Terminal Operations
 -------------------
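
The join/join_on contrast called out in the new docs is easiest to see side by side. A minimal sketch, assuming small in-memory tables built with SessionContext.from_arrow; the table and column names here are illustrative, not from the diff:

    import pyarrow as pa
    from datafusion import SessionContext, col

    ctx = SessionContext()
    users = ctx.from_arrow(pa.table({"id": [1, 2], "name": ["ada", "bob"]}))
    profiles = ctx.from_arrow(pa.table({"id": [1, 2], "age": [30, 40]}))
    orders = ctx.from_arrow(pa.table({"user_id": [1, 1, 2], "amount": [5, 7, 9]}))

    # join: the `on` argument accepts plain column names shared by both sides.
    users.join(profiles, on="id", how="inner").show()

    # join_on: takes col()/column() expressions, so the key names may differ.
    users.join_on(orders, col("id") == col("user_id"), how="inner").show()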

python/datafusion/context.py

Lines changed: 39 additions & 24 deletions
@@ -22,16 +22,16 @@
 import warnings
 from typing import TYPE_CHECKING, Any, Protocol
 
-import pyarrow as pa
-
 try:
     from warnings import deprecated  # Python 3.13+
 except ImportError:
     from typing_extensions import deprecated  # Python 3.12
 
+import pyarrow as pa
+
 from datafusion.catalog import Catalog, CatalogProvider, Table
 from datafusion.dataframe import DataFrame
-from datafusion.expr import Expr, SortExpr, sort_list_to_raw_sort_list
+from datafusion.expr import SortKey, sort_list_to_raw_sort_list
 from datafusion.record_batch import RecordBatchStream
 from datafusion.user_defined import AggregateUDF, ScalarUDF, TableFunction, WindowUDF
 from datafusion.utils import _normalize_table_provider
@@ -40,12 +40,14 @@
 from ._internal import SessionConfig as SessionConfigInternal
 from ._internal import SessionContext as SessionContextInternal
 from ._internal import SQLOptions as SQLOptionsInternal
+from ._internal import expr as expr_internal
 
 if TYPE_CHECKING:
     import pathlib
+    from collections.abc import Sequence
 
     import pandas as pd
-    import polars as pl
+    import polars as pl  # type: ignore[import]
 
     from datafusion import TableProvider
     from datafusion.plan import ExecutionPlan, LogicalPlan
@@ -555,7 +557,7 @@ def register_listing_table(
         table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
         file_extension: str = ".parquet",
         schema: pa.Schema | None = None,
-        file_sort_order: list[list[Expr | SortExpr]] | None = None,
+        file_sort_order: Sequence[Sequence[SortKey]] | None = None,
     ) -> None:
         """Register multiple files as a single table.
@@ -569,23 +571,20 @@ def register_listing_table(
             table_partition_cols: Partition columns.
             file_extension: File extension of the provided table.
             schema: The data source schema.
-            file_sort_order: Sort order for the file.
+            file_sort_order: Sort order for the file. Each sort key can be
+                specified as a column name (``str``), an expression
+                (``Expr``), or a ``SortExpr``.
         """
         if table_partition_cols is None:
             table_partition_cols = []
         table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
-        file_sort_order_raw = (
-            [sort_list_to_raw_sort_list(f) for f in file_sort_order]
-            if file_sort_order is not None
-            else None
-        )
         self.ctx.register_listing_table(
             name,
             str(path),
             table_partition_cols,
             file_extension,
             schema,
-            file_sort_order_raw,
+            self._convert_file_sort_order(file_sort_order),
         )
 
     def sql(self, query: str, options: SQLOptions | None = None) -> DataFrame:
@@ -831,7 +830,7 @@ def register_parquet(
         file_extension: str = ".parquet",
         skip_metadata: bool = True,
         schema: pa.Schema | None = None,
-        file_sort_order: list[list[SortExpr]] | None = None,
+        file_sort_order: Sequence[Sequence[SortKey]] | None = None,
     ) -> None:
         """Register a Parquet file as a table.
@@ -850,7 +849,9 @@ def register_parquet(
             that may be in the file schema. This can help avoid schema
             conflicts due to metadata.
             schema: The data source schema.
-            file_sort_order: Sort order for the file.
+            file_sort_order: Sort order for the file. Each sort key can be
+                specified as a column name (``str``), an expression
+                (``Expr``), or a ``SortExpr``.
         """
         if table_partition_cols is None:
             table_partition_cols = []
@@ -863,9 +864,7 @@
             file_extension,
             skip_metadata,
             schema,
-            [sort_list_to_raw_sort_list(exprs) for exprs in file_sort_order]
-            if file_sort_order is not None
-            else None,
+            self._convert_file_sort_order(file_sort_order),
         )
 
     def register_csv(
@@ -1122,7 +1121,7 @@ def read_parquet(
         file_extension: str = ".parquet",
         skip_metadata: bool = True,
         schema: pa.Schema | None = None,
-        file_sort_order: list[list[Expr | SortExpr]] | None = None,
+        file_sort_order: Sequence[Sequence[SortKey]] | None = None,
     ) -> DataFrame:
         """Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`.
@@ -1139,19 +1138,17 @@
             schema: An optional schema representing the parquet files. If None,
                 the parquet reader will try to infer it based on data in the
                 file.
-            file_sort_order: Sort order for the file.
+            file_sort_order: Sort order for the file. Each sort key can be
+                specified as a column name (``str``), an expression
+                (``Expr``), or a ``SortExpr``.
 
         Returns:
             DataFrame representation of the read Parquet files
         """
         if table_partition_cols is None:
             table_partition_cols = []
         table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
-        file_sort_order = (
-            [sort_list_to_raw_sort_list(f) for f in file_sort_order]
-            if file_sort_order is not None
-            else None
-        )
+        file_sort_order = self._convert_file_sort_order(file_sort_order)
         return DataFrame(
             self.ctx.read_parquet(
                 str(path),
@@ -1202,6 +1199,24 @@ def execute(self, plan: ExecutionPlan, partitions: int) -> RecordBatchStream:
         """Execute the ``plan`` and return the results."""
         return RecordBatchStream(self.ctx.execute(plan._raw_plan, partitions))
 
+    @staticmethod
+    def _convert_file_sort_order(
+        file_sort_order: Sequence[Sequence[SortKey]] | None,
+    ) -> list[list[expr_internal.SortExpr]] | None:
+        """Convert nested ``SortKey`` sequences into raw sort expressions.
+
+        Each ``SortKey`` can be a column name string, an ``Expr``, or a
+        ``SortExpr`` and will be converted using
+        :func:`datafusion.expr.sort_list_to_raw_sort_list`.
+        """
+        # Convert each ``SortKey`` in the provided sort order to the low-level
+        # representation expected by the Rust bindings.
+        return (
+            [sort_list_to_raw_sort_list(f) for f in file_sort_order]
+            if file_sort_order is not None
+            else None
+        )
+
     @staticmethod
     def _convert_table_partition_cols(
         table_partition_cols: list[tuple[str, str | pa.DataType]],
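
One consequence of widening file_sort_order to Sequence[Sequence[SortKey]] is that callers can mix plain column names, Exprs, and explicit SortExprs in a single sort order. A minimal sketch, assuming a hypothetical local data.parquet whose rows really are sorted by a ascending, then b descending:

    from datafusion import SessionContext, col

    ctx = SessionContext()
    # Each inner sequence is one lexicographic sort order. "a" is shorthand
    # for an ascending sort on that column, while col("b").sort(...) builds
    # an explicit SortExpr; both are accepted SortKey forms after this change.
    df = ctx.read_parquet(
        "data.parquet",  # hypothetical file; must actually be sorted as declared
        file_sort_order=[["a", col("b").sort(ascending=False)]],
    )
    df.show()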
