feat: add cum_count and cum_prod to PySpark and DuckDB (#2286)

raisadz · web-flow · commit 9607e5f1c924 · 2025-03-25T11:50:48.000Z
diff --git a/narwhals/_duckdb/expr.py b/narwhals/_duckdb/expr.py
@@ -80,7 +80,10 @@ def __narwhals_namespace__(self) -> DuckDBNamespace:  # pragma: no cover
         )
 
     def _cum_window_func(
-        self, *, reverse: bool, func_name: Literal["sum", "max", "min"]
+        self,
+        *,
+        reverse: bool,
+        func_name: Literal["sum", "max", "min", "count", "product"],
     ) -> WindowFunction:
         def func(window_inputs: WindowInputs) -> duckdb.Expression:
             order_by_sql = generate_order_by_sql(
@@ -516,6 +519,16 @@ def cum_min(self, *, reverse: bool) -> Self:
             self._cum_window_func(reverse=reverse, func_name="min")
         )
 
+    def cum_count(self, *, reverse: bool) -> Self:  # cumulative window built with func_name="count"
+        return self._with_window_function(
+            self._cum_window_func(reverse=reverse, func_name="count")  # `reverse` is forwarded unchanged
+        )
+
+    def cum_prod(self, *, reverse: bool) -> Self:  # cumulative window built with func_name="product"
+        return self._with_window_function(
+            self._cum_window_func(reverse=reverse, func_name="product")  # `reverse` is forwarded unchanged
+        )
+
     def rolling_sum(self, window_size: int, *, min_samples: int, center: bool) -> Self:
         if center:
             half = (window_size - 1) // 2
@@ -580,5 +593,3 @@ def struct(self: Self) -> DuckDBExprStructNamespace:
     drop_nulls = not_implemented()
     unique = not_implemented()
     is_unique = not_implemented()
-    cum_count = not_implemented()
-    cum_prod = not_implemented()
diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py
@@ -135,7 +135,10 @@ def _with_window_function(
         return result
 
     def _cum_window_func(
-        self: Self, *, reverse: bool, func_name: Literal["sum", "max", "min"]
+        self: Self,
+        *,
+        reverse: bool,
+        func_name: Literal["sum", "max", "min", "count", "product"],
     ) -> WindowFunction:
         def func(window_inputs: WindowInputs) -> Column:
             if reverse:
@@ -594,6 +597,16 @@ def cum_min(self, *, reverse: bool) -> Self:
             self._cum_window_func(reverse=reverse, func_name="min")
         )
 
+    def cum_count(self, *, reverse: bool) -> Self:  # cumulative window built with func_name="count"
+        return self._with_window_function(
+            self._cum_window_func(reverse=reverse, func_name="count")  # `reverse` is forwarded unchanged
+        )
+
+    def cum_prod(self, *, reverse: bool) -> Self:  # cumulative window built with func_name="product"
+        return self._with_window_function(
+            self._cum_window_func(reverse=reverse, func_name="product")  # `reverse` is forwarded unchanged
+        )
+
     def fill_null(
         self,
         value: Any | None,
@@ -657,6 +670,4 @@ def struct(self: Self) -> SparkLikeExprStructNamespace:
 
     drop_nulls = not_implemented()
     unique = not_implemented()
-    cum_count = not_implemented()
-    cum_prod = not_implemented()
     quantile = not_implemented()
diff --git a/tests/expr_and_series/cum_count_test.py b/tests/expr_and_series/cum_count_test.py
@@ -3,6 +3,9 @@
 import pytest
 
 import narwhals.stable.v1 as nw
+from tests.utils import DUCKDB_VERSION
+from tests.utils import POLARS_VERSION
+from tests.utils import Constructor
 from tests.utils import ConstructorEager
 from tests.utils import assert_equal_data
 
@@ -36,3 +39,50 @@ def test_cum_count_series(constructor_eager: ConstructorEager) -> None:
         "reverse_cum_count": [3, 2, 1, 1],
     }
     assert_equal_data(result, expected)
+
+
+@pytest.mark.parametrize(
+    ("reverse", "expected_a"),
+    [
+        (False, [1, 1, 2]),  # counts accumulated in "ban gkock" order (rows 1, 0, 2), listed in "i ran" order
+        (True, [1, 2, 1]),  # same ordering, accumulated from the last ordered row backwards
+    ],
+)
+def test_lazy_cum_count_grouped(  # checks cum_count(reverse=...).over("g", order_by="ban gkock")
+    constructor: Constructor,
+    request: pytest.FixtureRequest,
+    *,
+    reverse: bool,
+    expected_a: list[int],
+) -> None:
+    if "pyarrow_table" in str(constructor):
+        # grouped window functions not yet supported
+        request.applymarker(pytest.mark.xfail)
+    if "modin" in str(constructor):
+        pytest.skip(reason="probably bugged")  # NOTE(review): no issue link is recorded for this skip
+    if "dask" in str(constructor):
+        # https://github.com/dask/dask/issues/11806
+        request.applymarker(pytest.mark.xfail)
+    if ("polars" in str(constructor) and POLARS_VERSION < (1, 9)) or (
+        "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 3)
+    ):
+        pytest.skip(reason="too old version")  # applies to Polars < 1.9 and DuckDB < 1.3
+    if "cudf" in str(constructor):
+        # https://github.com/rapidsai/cudf/issues/18159
+        request.applymarker(pytest.mark.xfail)
+
+    df = nw.from_native(
+        constructor(
+            {
+                "arg entina": [None, 2, 3],  # the null in row 0 is not counted in either expected list
+                "ban gkock": [1, 0, 2],  # ordering key passed to over(order_by=...)
+                "i ran": [0, 1, 2],  # row position; used only for the final sort
+                "g": [1, 1, 1],  # partition key; identical for every row, so there is one group
+            }
+        )
+    )
+    result = df.with_columns(
+        nw.col("arg entina").cum_count(reverse=reverse).over("g", order_by="ban gkock")
+    ).sort("i ran")
+    expected = {"arg entina": expected_a, "ban gkock": [1, 0, 2], "i ran": [0, 1, 2]}  # NOTE(review): "g" is absent; presumably assert_equal_data tolerates the extra column - confirm
+    assert_equal_data(result, expected)
diff --git a/tests/expr_and_series/cum_prod_test.py b/tests/expr_and_series/cum_prod_test.py
@@ -3,8 +3,11 @@
 import pytest
 
 import narwhals.stable.v1 as nw
+from tests.utils import DUCKDB_VERSION
 from tests.utils import PANDAS_VERSION
+from tests.utils import POLARS_VERSION
 from tests.utils import PYARROW_VERSION
+from tests.utils import Constructor
 from tests.utils import ConstructorEager
 from tests.utils import assert_equal_data
 
@@ -54,3 +57,53 @@ def test_cum_prod_series(
         reverse_cum_prod=df["a"].cum_prod(reverse=True),
     )
     assert_equal_data(result, expected)
+
+
+@pytest.mark.parametrize(
+    ("reverse", "expected_a"),
+    [
+        (False, [2, 2, 6]),  # products accumulated in "ban gkock" order (rows 1, 0, 2), listed in "i ran" order
+        (True, [3, 6, 3]),  # same ordering, accumulated from the last ordered row backwards
+    ],
+)
+def test_lazy_cum_prod_grouped(  # checks cum_prod(reverse=...).over("g", order_by="ban gkock")
+    constructor: Constructor,
+    request: pytest.FixtureRequest,
+    *,
+    reverse: bool,
+    expected_a: list[int],
+) -> None:
+    if "pyarrow_table" in str(constructor):
+        # grouped window functions not yet supported
+        request.applymarker(pytest.mark.xfail)
+    if "modin" in str(constructor):
+        pytest.skip(reason="probably bugged")  # NOTE(review): no issue link is recorded for this skip
+    if "dask" in str(constructor):
+        # https://github.com/dask/dask/issues/11806
+        request.applymarker(pytest.mark.xfail)
+    if ("polars" in str(constructor) and POLARS_VERSION < (1, 9)) or (
+        "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 3)
+    ):
+        pytest.skip(reason="too old version")  # applies to Polars < 1.9 and DuckDB < 1.3
+    if "cudf" in str(constructor):
+        # https://github.com/rapidsai/cudf/issues/18159
+        request.applymarker(pytest.mark.xfail)
+    if "sqlframe" in str(constructor):
+        # https://github.com/eakmanrq/sqlframe/issues/348
+        request.applymarker(pytest.mark.xfail)
+
+    df = nw.from_native(
+        constructor(
+            {
+                "arg entina": [1, 2, 3],  # values being multiplied; no nulls in this fixture
+                "ban gkock": [1, 0, 2],  # ordering key passed to over(order_by=...)
+                "i ran": [0, 1, 2],  # row position; used only for the final sort
+                "g": [1, 1, 1],  # partition key; identical for every row, so there is one group
+            }
+        )
+    )
+    result = df.with_columns(
+        nw.col("arg entina").cum_prod(reverse=reverse).over("g", order_by="ban gkock")
+    ).sort("i ran")
+    expected = {"arg entina": expected_a, "ban gkock": [1, 0, 2], "i ran": [0, 1, 2]}  # NOTE(review): "g" is absent; presumably assert_equal_data tolerates the extra column - confirm
+    assert_equal_data(result, expected)