
Commit 1581500

Merge branch 'fea/polars/rank' into branch-25.10
2 parents 5f83c84 + 35c9b5f

File tree

5 files changed: +139 -5 lines changed

python/cudf_polars/cudf_polars/dsl/expressions/unary.py

Lines changed: 77 additions & 4 deletions
@@ -106,12 +106,13 @@ class UnaryFunction(Expr):
             "drop_nulls",
             "fill_null",
             "mask_nans",
+            "null_count",
+            "rank",
             "round",
             "set_sorted",
+            "top_k",
             "unique",
             "value_counts",
-            "null_count",
-            "top_k",
         }
     )
     _supported_cum_aggs = frozenset(
@@ -135,13 +136,14 @@ def __init__(
         self.children = children
         self.is_pointwise = self.name not in (
             "as_struct",
-            "cum_min",
             "cum_max",
+            "cum_min",
             "cum_prod",
             "cum_sum",
             "drop_nulls",
-            "unique",
+            "rank",
             "top_k",
+            "unique",
         )
 
         if self.name not in UnaryFunction._supported_fns:
@@ -152,6 +154,12 @@ def __init__(
                 raise NotImplementedError(
                     "reverse=True is not supported for cumulative aggregations"
                 )
+        if self.name == "rank":
+            method, _, _ = self.options
+            if method not in {"average", "min", "max", "dense", "ordinal"}:
+                raise NotImplementedError(
+                    f"ranking with {method=} is not yet supported"
+                )
 
     def do_evaluate(
         self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
@@ -342,6 +350,71 @@ def do_evaluate(
                 ),
                 dtype=self.dtype,
             )
+        elif self.name == "rank":
+            (column,) = (child.evaluate(df, context=context) for child in self.children)
+            method_str, descending, _ = self.options
+
+            method = {
+                "average": plc.aggregation.RankMethod.AVERAGE,
+                "min": plc.aggregation.RankMethod.MIN,
+                "max": plc.aggregation.RankMethod.MAX,
+                "dense": plc.aggregation.RankMethod.DENSE,
+                "ordinal": plc.aggregation.RankMethod.FIRST,
+            }[method_str]
+
+            order = (
+                plc.types.Order.DESCENDING if descending else plc.types.Order.ASCENDING
+            )
+
+            ranked: plc.Column = plc.sorting.rank(
+                column.obj,
+                method,
+                order,
+                plc.types.NullPolicy.EXCLUDE,
+                plc.types.NullOrder.BEFORE,
+                percentage=False,
+            )
+
+            null_count = column.null_count
+            if null_count and not descending:
+                # libcudf rank is offset when nulls would sort first and are excluded:
+                # - dense: +1 (nulls count as a skipped leading group)
+                # - min/max/ordinal/average: +k (nulls counted before all valid rows)
+                rank_dtype = ranked.type()
+                if method_str == "dense":
+                    one = plc.Scalar.from_py(
+                        1.0
+                        if rank_dtype.id() in {plc.TypeId.FLOAT32, plc.TypeId.FLOAT64}
+                        else 1,
+                        rank_dtype,
+                    )
+                    ranked = plc.binaryop.binary_operation(
+                        ranked, one, plc.binaryop.BinaryOperator.SUB, rank_dtype
+                    )
+                else:
+                    k_scalar = plc.Scalar.from_py(
+                        float(null_count)
+                        if rank_dtype.id() in {plc.TypeId.FLOAT32, plc.TypeId.FLOAT64}
+                        else int(null_count),
+                        rank_dtype,
+                    )
+                    ranked = plc.binaryop.binary_operation(
+                        ranked, k_scalar, plc.binaryop.BinaryOperator.SUB, rank_dtype
+                    )
+
+            # Min/Max/Dense/Ordinal -> IDX_DTYPE
+            # See https://github.com/pola-rs/polars/blob/main/crates/polars-ops/src/series/ops/rank.rs
+            if method_str in {"min", "max", "dense", "ordinal"}:
+                dest = self.dtype.plc.id()
+                src = ranked.type().id()
+                if dest == plc.TypeId.UINT32 and src != plc.TypeId.UINT32:
+                    ranked = plc.unary.cast(ranked, plc.DataType(plc.TypeId.UINT32))
+                elif (
+                    dest == plc.TypeId.UINT64 and src != plc.TypeId.UINT64
+                ):  # pragma: no cover
+                    ranked = plc.unary.cast(ranked, plc.DataType(plc.TypeId.UINT64))
+
+            return Column(ranked, dtype=self.dtype)
         elif self.name == "top_k":
             (column, k) = (
                 child.evaluate(df, context=context) for child in self.children
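
For context, a minimal usage sketch of the behaviour this hunk implements (the column name and data below are illustrative, and running on the GPU assumes a working cudf-polars install):

import polars as pl

# Polars rank semantics that the offset/cast logic above reproduces:
# nulls keep a null rank, valid rows are ranked from 1, and
# min/max/dense/ordinal return UInt32 while average returns Float64.
q = pl.LazyFrame({"a": [None, 10, 20, 20, 5]}).select(
    pl.col("a").rank(method="min").alias("min_rank"),      # [null, 2, 3, 3, 1]
    pl.col("a").rank(method="average").alias("avg_rank"),   # [null, 2.0, 3.5, 3.5, 1.0]
)
print(q.collect(engine="gpu"))  # should match the CPU result of q.collect()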

python/cudf_polars/cudf_polars/dsl/utils/aggregations.py

Lines changed: 5 additions & 0 deletions
@@ -89,6 +89,11 @@ def decompose_single_agg(
     """
     agg = named_expr.value
     name = named_expr.name
+    if isinstance(agg, expr.UnaryFunction) and agg.name in {"rank"}:
+        name = agg.name
+        raise NotImplementedError(
+            f"UnaryFunction {name=} not supported in groupby context"
+        )
     if isinstance(agg, expr.UnaryFunction) and agg.name == "null_count":
         (child,) = agg.children
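
To illustrate the guard above, a hedged sketch (the query is illustrative; the fallback behaviour described is the usual path when IR translation raises):

import polars as pl

q = (
    pl.LazyFrame({"key": [1, 1, 2], "x": [3, 1, 2]})
    .group_by("key")
    .agg(pl.col("x").rank())
)
# Translating rank inside a group_by raises NotImplementedError, so with the
# default pl.GPUEngine(raise_on_fail=False) this query is expected to fall
# back to the CPU engine instead of running the groupby rank on the GPU.
print(q.collect(engine="gpu"))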

python/cudf_polars/tests/expressions/test_numeric_unaryops.py

Lines changed: 44 additions & 1 deletion
@@ -8,7 +8,11 @@
 
 import polars as pl
 
-from cudf_polars.testing.asserts import assert_gpu_result_equal
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
+from cudf_polars.utils.versions import POLARS_VERSION_LT_132
 
 
 @pytest.fixture(
@@ -112,3 +116,42 @@ def test_null_count():
         pl.col("baz").is_null().sum(),
     )
     assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize("method", ["ordinal", "dense", "min", "max", "average"])
+@pytest.mark.parametrize("descending", [False, True])
+def test_rank_supported(request, ldf: pl.LazyFrame, method: str, *, descending: bool):
+    request.applymarker(
+        pytest.mark.xfail(condition=POLARS_VERSION_LT_132, reason="nested loop join")
+    )
+    expr = pl.col("a").rank(method=method, descending=descending)
+    q = ldf.select(expr)
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize("method", ["ordinal", "dense", "min", "max", "average"])
+@pytest.mark.parametrize("descending", [False, True])
+@pytest.mark.parametrize("test", ["with_nulls", "with_ties"])
+def test_rank_methods_with_nulls_or_ties(
+    request, ldf: pl.LazyFrame, method: str, *, descending: bool, test: str
+) -> None:
+    request.applymarker(
+        pytest.mark.xfail(condition=POLARS_VERSION_LT_132, reason="nested loop join")
+    )
+
+    base = pl.col("a")
+    if test == "with_nulls":
+        expr = pl.when((base % 2) == 0).then(None).otherwise(base)
+    else:
+        expr = pl.when((base % 2) == 0).then(pl.lit(-5)).otherwise(base)
+
+    q = ldf.select(expr.rank(method=method, descending=descending))
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize("seed", [42])
+@pytest.mark.parametrize("method", ["random"])
+def test_rank_unsupported(ldf: pl.LazyFrame, method: str, seed: int) -> None:
+    expr = pl.col("a").rank(method=method, seed=seed)
+    q = ldf.select(expr)
+    assert_ir_translation_raises(q, NotImplementedError)

python/cudf_polars/tests/test_groupby.py

Lines changed: 6 additions & 0 deletions
@@ -384,3 +384,9 @@ def test_groupby_aggs_keep_unsupported_as_null(df: pl.LazyFrame, agg_expr) -> None:
 def test_groupby_ternary_supported(df: pl.LazyFrame, expr: pl.Expr) -> None:
     q = df.group_by("key1").agg(expr)
     assert_gpu_result_equal(q, check_row_order=False)
+
+
+def test_groupby_rank_raises(df: pl.LazyFrame) -> None:
+    q = df.group_by("key1").agg(pl.col("int").rank())
+
+    assert_ir_translation_raises(q, NotImplementedError)

python/cudf_polars/tests/test_rolling.py

Lines changed: 7 additions & 0 deletions
@@ -272,3 +272,10 @@ def test_rolling_ternary_supported(df, expr):
 def test_rolling_ternary_unsupported(df, expr):
     q = df.rolling("dt", period="48h", closed="both").agg(expr.alias("out"))
     assert_ir_translation_raises(q, NotImplementedError)
+
+
+def test_rolling_rank_unsupported(df):
+    q = df.rolling("dt", period="48h", closed="both").agg(
+        pl.col("values").rank(method="dense", descending=False)
+    )
+    assert_ir_translation_raises(q, NotImplementedError)
