
Commit e0d1a00

feat(expr-ir): Implement ArrowDataFrame.partition_by
Supports selector input for partitions
1 parent a00dbb7 · commit e0d1a00

2 files changed: +78 −4 lines


narwhals/_plan/arrow/dataframe.py

Lines changed: 23 additions & 4 deletions
@@ -9,10 +9,11 @@
 import pyarrow.compute as pc  # ignore-banned-import

 from narwhals._arrow.utils import native_to_narwhals_dtype
-from narwhals._plan.arrow import acero, functions as fn
+from narwhals._plan.arrow import acero, functions as fn, group_by
 from narwhals._plan.arrow.expr import ArrowExpr as Expr, ArrowScalar as Scalar
 from narwhals._plan.arrow.group_by import ArrowGroupBy as GroupBy
 from narwhals._plan.arrow.series import ArrowSeries as Series
+from narwhals._plan.common import temp
 from narwhals._plan.compliant.dataframe import EagerDataFrame
 from narwhals._plan.compliant.typing import namespace
 from narwhals._plan.expressions import NamedIR
@@ -172,7 +173,25 @@ def filter(self, predicate: NamedIR) -> Self:
         mask = acero.lit(resolved.native)
         return self._with_native(self.native.filter(mask))

+    # TODO @dangotbanned: Clean this up after getting more tests in place
     def partition_by(self, by: Sequence[str], *, include_key: bool = True) -> list[Self]:
-        """Review https://github.com/pola-rs/polars/blob/870f0e01811b8b0cf9b846ded9d97685f143d27c/crates/polars-core/src/frame/mod.rs#L3225-L3284."""
-        msg = "TODO: `ArrowDataFrame.partition_by`"
-        raise NotImplementedError(msg)
+        original_names = self.columns
+        temp_name = temp.column_name(original_names)
+        native = self.native
+        composite_values = group_by.concat_str(acero.select_names_table(native, by))
+        re_keyed = native.add_column(0, temp_name, composite_values)
+        source = acero.table_source(re_keyed)
+        if include_key:
+            keep = original_names
+        else:
+            ignore = {*by, temp_name}
+            keep = [name for name in original_names if name not in ignore]
+        select = acero.select_names(keep)
+        key = acero.col(temp_name)
+        # Need to iterate over the whole thing, so py_list first should be faster
+        partitions = (
+            acero.declare(source, acero.filter(key == v), select)
+            for v in composite_values.unique().to_pylist()
+        )
+        from_native = self._with_native
+        return [from_native(decl.to_table()) for decl in partitions]
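
The new method builds a single composite string key from the `by` columns, prepends it as a temporary column, and then materialises one Acero filter declaration per distinct key value. Below is a minimal standalone sketch of the same composite-key idea written against plain PyArrow rather than narwhals' internal acero/group_by/temp helpers; the helper name partition_by_composite_key and the "\x1f" separator are illustrative assumptions, not part of this commit.

import pyarrow as pa
import pyarrow.compute as pc


def partition_by_composite_key(
    table: pa.Table, by: list[str], *, include_key: bool = True
) -> list[pa.Table]:
    # Cast each key column to string and join them row-wise into one composite key.
    parts = [pc.cast(table[name], pa.string()) for name in by]
    composite = pc.binary_join_element_wise(*parts, "\x1f")
    keep = (
        table.column_names
        if include_key
        else [name for name in table.column_names if name not in set(by)]
    )
    # One filter pass per distinct composite value, mirroring the per-key
    # filter declarations in the implementation above.
    return [
        table.filter(pc.equal(composite, value)).select(keep)
        for value in pc.unique(composite).to_pylist()
    ]


tbl = pa.table({"a": ["a", "b", "a", "b", "c"], "b": [1, 2, 1, 3, 3], "c": [5, 4, 3, 2, 1]})
for part in partition_by_composite_key(tbl, ["a"]):
    print(part.to_pydict())
# {'a': ['a', 'a'], 'b': [1, 1], 'c': [5, 3]}
# {'a': ['b', 'b'], 'b': [2, 3], 'c': [4, 2]}
# {'a': ['c'], 'b': [3], 'c': [1]}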
New test file (path not shown) · Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import pytest
+
+import narwhals as nw
+from narwhals._plan import selectors as ncs
+from narwhals._utils import zip_strict
+from tests.plan.utils import assert_equal_data, dataframe
+
+if TYPE_CHECKING:
+    from narwhals._plan.typing import ColumnNameOrSelector
+    from tests.conftest import Data
+
+
+@pytest.fixture
+def data() -> Data:
+    return {"a": ["a", "b", "a", "b", "c"], "b": [1, 2, 1, 3, 3], "c": [5, 4, 3, 2, 1]}
+
+
+@pytest.mark.parametrize(
+    ("include_key", "expected"),
+    [
+        (
+            True,
+            [
+                {"a": ["a", "a"], "b": [1, 1], "c": [5, 3]},
+                {"a": ["b", "b"], "b": [2, 3], "c": [4, 2]},
+                {"a": ["c"], "b": [3], "c": [1]},
+            ],
+        ),
+        (
+            False,
+            [
+                {"b": [1, 1], "c": [5, 3]},
+                {"b": [2, 3], "c": [4, 2]},
+                {"b": [3], "c": [1]},
+            ],
+        ),
+    ],
+    ids=["include_key", "exclude_key"],
+)
+@pytest.mark.parametrize(
+    "by",
+    ["a", ncs.string(), ncs.matches("a"), ncs.by_name("a"), ncs.by_dtype(nw.String)],
+    ids=["str", "ncs.string", "ncs.matches", "ncs.by_name", "ncs.by_dtype"],
+)
+def test_partition_by_single(
+    data: Data, by: ColumnNameOrSelector, *, include_key: bool, expected: Any
+) -> None:
+    df = dataframe(data)
+    results = df.partition_by(by, include_key=include_key)
+    for df, expect in zip_strict(results, expected):
+        assert_equal_data(df, expect)
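
The parametrized cases above only exercise a single key column (hence test_partition_by_single); the composite key built in partition_by also spans several `by` columns. A quick illustration of the multi-key case, reusing the standalone partition_by_composite_key sketch shown after the dataframe.py diff (plain PyArrow, not narwhals' API) on the same fixture data:

import pyarrow as pa

tbl = pa.table({"a": ["a", "b", "a", "b", "c"], "b": [1, 2, 1, 3, 3], "c": [5, 4, 3, 2, 1]})
# Partition on both "a" and "b": rows sharing the same ("a", "b") pair stay together.
for part in partition_by_composite_key(tbl, ["a", "b"], include_key=False):
    print(part.to_pydict())
# {'c': [5, 3]}
# {'c': [4]}
# {'c': [2]}
# {'c': [1]}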
