Skip to content

Commit 9b0bc75

Browse files
authored
enh: support unique(keep='none') for pyspark/sqlframe (#2338)
1 parent 189dff4 commit 9b0bc75

File tree

3 files changed

+75
-41
lines changed

3 files changed

+75
-41
lines changed

narwhals/_duckdb/dataframe.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -350,25 +350,26 @@ def collect_schema(self: Self) -> dict[str, DType]:
350350
def unique(
351351
self: Self, subset: Sequence[str] | None, *, keep: Literal["any", "none"]
352352
) -> Self:
353-
if subset is not None:
354-
rel = self.native
353+
subset_ = subset if keep == "any" else (subset or self.columns)
354+
if subset_:
355355
# Sanitise input
356-
if any(x not in rel.columns for x in subset):
357-
msg = f"Columns {set(subset).difference(rel.columns)} not found in {rel.columns}."
356+
if any(x not in self.columns for x in subset_):
357+
msg = f"Columns {set(subset_).difference(self.columns)} not found in {self.columns}."
358358
raise ColumnNotFoundError(msg)
359-
idx_name = generate_temporary_column_name(8, rel.columns)
360-
count_name = generate_temporary_column_name(8, [*rel.columns, idx_name])
361-
if keep == "none":
362-
keep_condition = col(count_name) == lit(1)
363-
else:
364-
keep_condition = col(idx_name) == lit(1)
365-
partition_by_sql = generate_partition_by_sql(*subset)
359+
idx_name = generate_temporary_column_name(8, self.columns)
360+
count_name = generate_temporary_column_name(8, [*self.columns, idx_name])
361+
partition_by_sql = generate_partition_by_sql(*(subset_))
362+
rel = self.native # noqa: F841
366363
query = f"""
367364
select *,
368365
row_number() over ({partition_by_sql}) as "{idx_name}",
369366
count(*) over ({partition_by_sql}) as "{count_name}"
370367
from rel
371368
""" # noqa: S608
369+
if keep == "none":
370+
keep_condition = col(count_name) == lit(1)
371+
else:
372+
keep_condition = col(idx_name) == lit(1)
372373
return self._with_native(
373374
duckdb.sql(query)
374375
.filter(keep_condition)
@@ -394,8 +395,7 @@ def sort(
394395
return self._with_native(self.native.sort(*it))
395396

396397
def drop_nulls(self: Self, subset: Sequence[str] | None) -> Self:
    """Remove rows with null values in the given columns (all columns if None)."""
    target_columns = self.columns if subset is None else subset
    # AND together per-column IS NOT NULL predicates into one filter condition.
    predicates = (col(name).isnotnull() for name in target_columns)
    not_null_condition = reduce(and_, predicates)
    return self._with_native(self.native.filter(not_null_condition))
401401

narwhals/_spark_like/dataframe.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from narwhals.utils import Implementation
2222
from narwhals.utils import check_column_exists
2323
from narwhals.utils import find_stacklevel
24+
from narwhals.utils import generate_temporary_column_name
2425
from narwhals.utils import import_dtypes_module
2526
from narwhals.utils import is_spark_like_dataframe
2627
from narwhals.utils import not_implemented
@@ -335,11 +336,17 @@ def unique(
335336
*,
336337
keep: Literal["any", "none"],
337338
) -> Self:
338-
if keep != "any":
339-
msg = "`LazyFrame.unique` with PySpark backend only supports `keep='any'`."
340-
raise ValueError(msg)
341339
check_column_exists(self.columns, subset)
342340
subset = list(subset) if subset else None
341+
if keep == "none":
342+
tmp = generate_temporary_column_name(8, self.columns)
343+
window = self._Window().partitionBy(subset or self.columns)
344+
df = (
345+
self.native.withColumn(tmp, self._F.count("*").over(window))
346+
.filter(self._F.col(tmp) == 1)
347+
.drop(tmp)
348+
)
349+
return self._with_native(df)
343350
return self._with_native(self.native.dropDuplicates(subset=subset))
344351

345352
def join(

tests/frame/unique_test.py

Lines changed: 52 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from __future__ import annotations
22

3-
from contextlib import nullcontext as does_not_raise
4-
from typing import Any
3+
from typing import Literal
54

65
import pytest
76

@@ -10,6 +9,7 @@
109
import narwhals as nw
1110
from narwhals.exceptions import ColumnNotFoundError
1211
from tests.utils import Constructor
12+
from tests.utils import ConstructorEager
1313
from tests.utils import assert_equal_data
1414

1515
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}
@@ -21,44 +21,71 @@
2121
[
2222
("first", {"a": [1, 2], "b": [4, 6], "z": [7.0, 9.0]}),
2323
("last", {"a": [3, 2], "b": [4, 6], "z": [8.0, 9.0]}),
24+
],
25+
)
26+
def test_unique_eager(
27+
constructor_eager: ConstructorEager,
28+
subset: str | list[str] | None,
29+
keep: Literal["first", "last"],
30+
expected: dict[str, list[float]],
31+
) -> None:
32+
df_raw = constructor_eager(data)
33+
df = nw.from_native(df_raw)
34+
result = df.unique(subset, keep=keep).sort("z")
35+
assert_equal_data(result, expected)
36+
37+
38+
def test_unique_invalid_subset(constructor: Constructor) -> None:
    """unique() on a column that does not exist raises ColumnNotFoundError."""
    frame = nw.from_native(constructor(data))
    with pytest.raises(ColumnNotFoundError):
        frame.lazy().unique(["fdssfad"]).collect()
43+
44+
45+
@pytest.mark.parametrize("subset", ["b", ["b"]])
@pytest.mark.parametrize(
    ("keep", "expected"),
    [
        ("any", {"a": [1, 2], "b": [4, 6], "z": [7.0, 9.0]}),
        ("none", {"a": [2], "b": [6], "z": [9]}),
    ],
)
def test_unique(
    constructor: Constructor,
    subset: str | list[str] | None,
    keep: Literal["any", "none"],
    expected: dict[str, list[float]],
) -> None:
    """unique() with a single-column subset honors the keep strategy on duplicates."""
    frame = nw.from_native(constructor(data))
    # Sort by "z" so row order is deterministic across lazy backends.
    deduplicated = frame.unique(subset, keep=keep).sort("z")
    assert_equal_data(deduplicated, expected)
5563

5664

57-
def test_unique_invalid_subset(constructor: Constructor) -> None:
65+
@pytest.mark.parametrize("subset", [None, ["a", "b"]])
@pytest.mark.parametrize(
    ("keep", "expected"),
    [
        ("any", {"a": [1, 1, 2], "b": [3, 4, 4]}),
        ("none", {"a": [1, 2], "b": [4, 4]}),
    ],
)
def test_unique_full_subset(
    constructor: Constructor,
    subset: list[str] | None,
    keep: Literal["any", "none"],
    expected: dict[str, list[float]],
) -> None:
    """unique() where the subset covers every column, given explicitly or as None."""
    # Local fixture deliberately shadows the module-level `data`.
    data = {"a": [1, 1, 1, 2], "b": [3, 3, 4, 4]}
    frame = nw.from_native(constructor(data))
    deduplicated = frame.unique(subset, keep=keep).sort("a", "b")
    assert_equal_data(deduplicated, expected)
84+
85+
86+
def test_unique_invalid_keep(constructor: Constructor) -> None:
    """An unsupported keep strategy is rejected with a ValueError naming the value."""
    frame = nw.from_native(constructor(data))
    with pytest.raises(ValueError, match=r"(Got|got): cabbage"):
        frame.unique(keep="cabbage")  # type: ignore[arg-type]
6289

6390

6491
@pytest.mark.filterwarnings("ignore:.*backwards-compatibility:UserWarning")

0 commit comments

Comments
 (0)