
Commit 24a59f2

Upgrade to DF50 release candidate
1 parent daa1e1b commit 24a59f2

9 files changed: +416 -437 lines changed

Cargo.lock

Lines changed: 331 additions & 405 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 15 additions & 9 deletions
@@ -34,15 +34,15 @@ protoc = [ "datafusion-substrait/protoc" ]
 substrait = ["dep:datafusion-substrait"]
 
 [dependencies]
-tokio = { version = "1.45", features = ["macros", "rt", "rt-multi-thread", "sync"] }
-pyo3 = { version = "0.24", features = ["extension-module", "abi3", "abi3-py39"] }
-pyo3-async-runtimes = { version = "0.24", features = ["tokio-runtime"]}
+tokio = { version = "1.47", features = ["macros", "rt", "rt-multi-thread", "sync"] }
+pyo3 = { version = "0.25", features = ["extension-module", "abi3", "abi3-py39"] }
+pyo3-async-runtimes = { version = "0.25", features = ["tokio-runtime"]}
 pyo3-log = "0.12.4"
-arrow = { version = "55.1.0", features = ["pyarrow"] }
-datafusion = { version = "49.0.2", features = ["avro", "unicode_expressions"] }
-datafusion-substrait = { version = "49.0.2", optional = true }
-datafusion-proto = { version = "49.0.2" }
-datafusion-ffi = { version = "49.0.2" }
+arrow = { version = "56", features = ["pyarrow"] }
+datafusion = { version = "50", features = ["avro", "unicode_expressions"] }
+datafusion-substrait = { version = "50", optional = true }
+datafusion-proto = { version = "50" }
+datafusion-ffi = { version = "50" }
 prost = "0.13.1" # keep in line with `datafusion-substrait`
 uuid = { version = "1.18", features = ["v4"] }
 mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] }
@@ -54,7 +54,7 @@ log = "0.4.27"
 
 [build-dependencies]
 prost-types = "0.13.1" # keep in line with `datafusion-substrait`
-pyo3-build-config = "0.24"
+pyo3-build-config = "0.25"
 
 [lib]
 name = "datafusion_python"
@@ -63,3 +63,9 @@ crate-type = ["cdylib", "rlib"]
 [profile.release]
 lto = true
 codegen-units = 1
+
+[patch.crates-io]
+datafusion = { git = "https://github.com/apache/datafusion.git", branch = "branch-50" }
+datafusion-substrait = { git = "https://github.com/apache/datafusion.git", branch = "branch-50" }
+datafusion-proto = { git = "https://github.com/apache/datafusion.git", branch = "branch-50" }
+datafusion-ffi = { git = "https://github.com/apache/datafusion.git", branch = "branch-50" }

python/datafusion/functions.py

Lines changed: 18 additions & 6 deletions
@@ -31,6 +31,7 @@
     WindowFrame,
     expr_list_to_raw_expr_list,
     sort_list_to_raw_sort_list,
+    sort_or_default,
 )
 
 if TYPE_CHECKING:
@@ -1659,7 +1660,7 @@ def approx_median(expression: Expr, filter: Optional[Expr] = None) -> Expr:
 
 
 def approx_percentile_cont(
-    expression: Expr,
+    sort_expression: Expr | SortExpr,
     percentile: float,
     num_centroids: Optional[int] = None,
     filter: Optional[Expr] = None,
@@ -1680,21 +1681,26 @@ def approx_percentile_cont(
     the options ``order_by``, ``null_treatment``, and ``distinct``.
 
     Args:
-        expression: Values for which to find the approximate percentile
+        sort_expression: Values for which to find the approximate percentile
         percentile: This must be between 0.0 and 1.0, inclusive
         num_centroids: Max bin size for the t-digest algorithm
        filter: If provided, only compute against rows for which the filter is True
     """
+    sort_expr_raw = sort_or_default(sort_expression)
     filter_raw = filter.expr if filter is not None else None
     return Expr(
         f.approx_percentile_cont(
-            expression.expr, percentile, num_centroids=num_centroids, filter=filter_raw
+            sort_expr_raw, percentile, num_centroids=num_centroids, filter=filter_raw
         )
     )
 
 
 def approx_percentile_cont_with_weight(
-    expression: Expr, weight: Expr, percentile: float, filter: Optional[Expr] = None
+    sort_expression: Expr | SortExpr,
+    weight: Expr,
+    percentile: float,
+    num_centroids: Optional[int] = None,
+    filter: Optional[Expr] = None,
 ) -> Expr:
     """Returns the value of the weighted approximate percentile.
 
@@ -1705,16 +1711,22 @@ def approx_percentile_cont_with_weight(
     the options ``order_by``, ``null_treatment``, and ``distinct``.
 
     Args:
-        expression: Values for which to find the approximate percentile
+        sort_expression: Values for which to find the approximate percentile
         weight: Relative weight for each of the values in ``expression``
         percentile: This must be between 0.0 and 1.0, inclusive
+        num_centroids: Max bin size for the t-digest algorithm
         filter: If provided, only compute against rows for which the filter is True
 
     """
+    sort_expr_raw = sort_or_default(sort_expression)
     filter_raw = filter.expr if filter is not None else None
     return Expr(
         f.approx_percentile_cont_with_weight(
-            expression.expr, weight.expr, percentile, filter=filter_raw
+            sort_expr_raw,
+            weight.expr,
+            percentile,
+            num_centroids=num_centroids,
+            filter=filter_raw,
         )
     )
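The percentile helpers now accept either a plain Expr or a SortExpr; a bare column falls back to a default ordering via sort_or_default(). A minimal end-to-end sketch of the new call shape (the SessionContext, column names, and data below are illustrative, not part of this commit):

from datafusion import SessionContext, column, lit, functions as f

ctx = SessionContext()
df = ctx.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})

# A SortExpr picks the ordering explicitly; a plain column expression still works.
df.aggregate(
    [],
    [
        f.approx_percentile_cont(
            column("b").sort(ascending=True, nulls_first=False), 0.5, num_centroids=2
        ).alias("p50_b"),
        f.approx_percentile_cont_with_weight(column("b"), lit(0.6), 0.5).alias("w_p50_b"),
    ],
).show()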

python/tests/test_aggregation.py

Lines changed: 16 additions & 0 deletions
@@ -130,11 +130,27 @@ def test_aggregation_stats(df, agg_expr, calc_expected):
         (f.median(column("b"), filter=column("a") != 2), pa.array([5]), False),
         (f.approx_median(column("b"), filter=column("a") != 2), pa.array([5]), False),
         (f.approx_percentile_cont(column("b"), 0.5), pa.array([4]), False),
+        (
+            f.approx_percentile_cont(
+                column("b").sort(ascending=True, nulls_first=False),
+                0.5,
+                num_centroids=2,
+            ),
+            pa.array([4]),
+            False,
+        ),
         (
             f.approx_percentile_cont_with_weight(column("b"), lit(0.6), 0.5),
             pa.array([6], type=pa.float64()),
             False,
         ),
+        (
+            f.approx_percentile_cont_with_weight(
+                column("b").sort(ascending=False, nulls_first=False), lit(0.6), 0.5
+            ),
+            pa.array([6], type=pa.float64()),
+            False,
+        ),
         (
             f.approx_percentile_cont_with_weight(
                 column("b"), lit(0.6), 0.5, filter=column("a") != lit(3)

src/common/data_type.rs

Lines changed: 12 additions & 0 deletions
@@ -224,6 +224,16 @@ impl DataTypeMap {
             DataType::Dictionary(_, _) => Err(py_datafusion_err(DataFusionError::NotImplemented(
                 format!("{arrow_type:?}"),
             ))),
+            DataType::Decimal32(precision, scale) => Ok(DataTypeMap::new(
+                DataType::Decimal32(*precision, *scale),
+                PythonType::Float,
+                SqlType::DECIMAL,
+            )),
+            DataType::Decimal64(precision, scale) => Ok(DataTypeMap::new(
+                DataType::Decimal64(*precision, *scale),
+                PythonType::Float,
+                SqlType::DECIMAL,
+            )),
             DataType::Decimal128(precision, scale) => Ok(DataTypeMap::new(
                 DataType::Decimal128(*precision, *scale),
                 PythonType::Float,
@@ -612,6 +622,8 @@ impl DataTypeMap {
             DataType::Struct(_) => "Struct",
             DataType::Union(_, _) => "Union",
             DataType::Dictionary(_, _) => "Dictionary",
+            DataType::Decimal32(_, _) => "Decimal32",
+            DataType::Decimal64(_, _) => "Decimal64",
             DataType::Decimal128(_, _) => "Decimal128",
             DataType::Decimal256(_, _) => "Decimal256",
             DataType::Map(_, _) => "Map",
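Arrow 56 introduces 32- and 64-bit decimal types, and the mapping above lets them flow through the type machinery the same way Decimal128/256 already do. A rough sketch of a table carrying such columns, assuming a pyarrow build new enough to expose pa.decimal32 and pa.decimal64 (older releases ship only decimal128/256):

import pyarrow as pa
from datafusion import SessionContext

# pa.decimal32 / pa.decimal64 are assumed to exist in the installed pyarrow.
schema = pa.schema([("price", pa.decimal32(9, 2)), ("qty", pa.decimal64(18, 4))])
batch = pa.RecordBatch.from_pydict({"price": [], "qty": []}, schema=schema)

ctx = SessionContext()
df = ctx.create_dataframe([[batch]])
print(df.schema())  # the small-decimal columns are now recognized Arrow types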

src/dataframe.rs

Lines changed: 0 additions & 1 deletion
@@ -276,7 +276,6 @@ impl PyParquetColumnOptions {
                 statistics_enabled,
                 bloom_filter_fpp,
                 bloom_filter_ndv,
-                ..Default::default()
             },
         }
     }

src/expr/sort_expr.rs

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ use std::fmt::{self, Display, Formatter};
 #[pyclass(name = "SortExpr", module = "datafusion.expr", subclass)]
 #[derive(Clone)]
 pub struct PySortExpr {
-    sort: SortExpr,
+    pub(crate) sort: SortExpr,
 }
 
 impl From<PySortExpr> for SortExpr {

src/functions.rs

Lines changed: 19 additions & 13 deletions
@@ -319,21 +319,25 @@ fn find_window_fn(
 }
 
 /// Creates a new Window function expression
+#[allow(clippy::too_many_arguments)]
 #[pyfunction]
-#[pyo3(signature = (name, args, partition_by=None, order_by=None, window_frame=None, ctx=None))]
+#[pyo3(signature = (name, args, partition_by=None, order_by=None, window_frame=None, filter=None, distinct=false, ctx=None))]
 fn window(
     name: &str,
     args: Vec<PyExpr>,
     partition_by: Option<Vec<PyExpr>>,
     order_by: Option<Vec<PySortExpr>>,
     window_frame: Option<PyWindowFrame>,
+    filter: Option<PyExpr>,
+    distinct: bool,
     ctx: Option<PySessionContext>,
 ) -> PyResult<PyExpr> {
     let fun = find_window_fn(name, ctx)?;
 
     let window_frame = window_frame
         .map(|w| w.into())
         .unwrap_or(WindowFrame::new(order_by.as_ref().map(|v| !v.is_empty())));
+    let filter = filter.map(|f| f.expr.into());
 
     Ok(PyExpr {
         expr: datafusion::logical_expr::Expr::WindowFunction(Box::new(WindowFunction {
@@ -351,6 +355,8 @@ fn window(
                 .map(|x| x.into())
                 .collect::<Vec<_>>(),
             window_frame,
+            filter,
+            distinct,
             null_treatment: None,
         },
     })),
@@ -649,36 +655,36 @@ aggregate_function!(approx_median);
 // aggregate_function!(grouping);
 
 #[pyfunction]
-#[pyo3(signature = (expression, percentile, num_centroids=None, filter=None))]
+#[pyo3(signature = (sort_expression, percentile, num_centroids=None, filter=None))]
 pub fn approx_percentile_cont(
-    expression: PyExpr,
+    sort_expression: PySortExpr,
     percentile: f64,
     num_centroids: Option<i64>, // enforces optional arguments at the end, currently
     filter: Option<PyExpr>,
 ) -> PyDataFusionResult<PyExpr> {
-    let args = if let Some(num_centroids) = num_centroids {
-        vec![expression.expr, lit(percentile), lit(num_centroids)]
-    } else {
-        vec![expression.expr, lit(percentile)]
-    };
-    let udaf = functions_aggregate::approx_percentile_cont::approx_percentile_cont_udaf();
-    let agg_fn = udaf.call(args);
+    let agg_fn = functions_aggregate::expr_fn::approx_percentile_cont(
+        sort_expression.sort,
+        lit(percentile),
+        num_centroids.map(lit),
+    );
 
     add_builder_fns_to_aggregate(agg_fn, None, filter, None, None)
 }
 
 #[pyfunction]
-#[pyo3(signature = (expression, weight, percentile, filter=None))]
+#[pyo3(signature = (sort_expression, weight, percentile, num_centroids=None, filter=None))]
 pub fn approx_percentile_cont_with_weight(
-    expression: PyExpr,
+    sort_expression: PySortExpr,
     weight: PyExpr,
     percentile: f64,
+    num_centroids: Option<i64>,
     filter: Option<PyExpr>,
 ) -> PyDataFusionResult<PyExpr> {
     let agg_fn = functions_aggregate::expr_fn::approx_percentile_cont_with_weight(
-        expression.expr,
+        sort_expression.sort,
         weight.expr,
         lit(percentile),
+        num_centroids.map(lit),
     );
 
     add_builder_fns_to_aggregate(agg_fn, None, filter, None, None)
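DataFusion 50 threads filter and distinct through window function expressions, so the raw window binding grows the two extra arguments. The call below sketches the existing Python-level surface of the generic window helper; this commit does not show the Python wrapper forwarding the new parameters, so they are noted only as a comment:

from datafusion import SessionContext, column, functions as f

ctx = SessionContext()
df = ctx.from_pydict({"a": [1, 1, 2], "b": [30, 10, 20]})

# Existing call shape; the underlying Rust binding now additionally accepts
# filter= and distinct= per the signature change above.
rank_expr = f.window("rank", [], order_by=[column("b").sort(ascending=True)])
df.select(column("a"), column("b"), rank_expr.alias("rank_by_b")).show()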

src/udwf.rs

Lines changed: 4 additions & 2 deletions
@@ -33,6 +33,7 @@ use crate::utils::{parse_volatility, validate_pycapsule};
 use datafusion::arrow::datatypes::DataType;
 use datafusion::arrow::pyarrow::{FromPyArrow, PyArrowType, ToPyArrow};
 use datafusion::error::{DataFusionError, Result};
+use datafusion::logical_expr::ptr_eq::PtrEq;
 use datafusion::logical_expr::{
     PartitionEvaluator, PartitionEvaluatorFactory, Signature, Volatility, WindowUDF, WindowUDFImpl,
 };
@@ -271,11 +272,12 @@ impl PyWindowUDF {
     }
 }
 
+#[derive(Hash, Eq, PartialEq)]
 pub struct MultiColumnWindowUDF {
     name: String,
     signature: Signature,
     return_type: DataType,
-    partition_evaluator_factory: PartitionEvaluatorFactory,
+    partition_evaluator_factory: PtrEq<PartitionEvaluatorFactory>,
 }
 
 impl std::fmt::Debug for MultiColumnWindowUDF {
@@ -303,7 +305,7 @@ impl MultiColumnWindowUDF {
             name,
             signature,
             return_type,
-            partition_evaluator_factory,
+            partition_evaluator_factory: partition_evaluator_factory.into(),
         }
     }
 }
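The PtrEq wrapper and the Hash/Eq derive are internal bookkeeping for DF50, which expects window UDF implementations to be hashable and comparable; Python-defined window UDFs are unaffected. A minimal sketch of one, assuming the documented udwf/WindowEvaluator API and import paths:

import pyarrow as pa
import pyarrow.compute as pc
from datafusion import SessionContext, column, udwf
from datafusion.udf import WindowEvaluator

class CumulativeSum(WindowEvaluator):
    """Running total over the whole partition."""

    def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array:
        return pc.cumulative_sum(values[0])

running_total = udwf(CumulativeSum(), pa.int64(), pa.int64(), volatility="immutable")

ctx = SessionContext()
df = ctx.from_pydict({"b": [10, 20, 30]})
df.select(column("b"), running_total(column("b")).alias("running_b")).show()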
