
Commit ac779dd

perf: Add an optimized path for single-column partition_by

Avoids the need for a temporary composite-key column by using `dictionary_encode` and generating boolean masks based on index position.

1 parent f17781a
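
The idea can be sketched with plain `pyarrow` (table and column names here are invented for illustration; the commit itself goes through the repo's `fn`/`acero` helpers):

```python
import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({"key": ["a", "b", "a", "c", "b"], "val": [1, 2, 3, 4, 5]})

# Dictionary-encode the partition column once: unique values land in
# `.dictionary`, and every row maps to an integer slot in `.indices`.
encoded = table.column("key").combine_chunks().dictionary_encode()

for idx in range(len(encoded.dictionary)):
    # Boolean mask over index positions, so no composite-key column is needed.
    mask = pc.equal(encoded.indices, pa.scalar(idx))
    print(encoded.dictionary[idx], table.filter(mask).num_rows)
```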

File tree

2 files changed: +75 −31 lines

narwhals/_plan/arrow/dataframe.py

Lines changed: 52 additions & 21 deletions
```diff
@@ -24,7 +24,7 @@
 if TYPE_CHECKING:
     from collections.abc import Iterable, Iterator, Mapping, Sequence
 
-    from typing_extensions import Self
+    from typing_extensions import Self, TypeAlias
 
     from narwhals._arrow.typing import ChunkedArrayAny
     from narwhals._plan.arrow.namespace import ArrowNamespace
@@ -34,6 +34,8 @@
     from narwhals.dtypes import DType
     from narwhals.typing import IntoSchema
 
+    Incomplete: TypeAlias = Any
+
 
 class ArrowDataFrame(EagerDataFrame[Series, "pa.Table", "ChunkedArrayAny"]):
     implementation = Implementation.PYARROW
```
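
The `Incomplete: TypeAlias = Any` alias follows the typeshed convention of naming an `Any` alias for values the stubs cannot yet type precisely (here, the dictionary-encoded array handled in `_partition_by_one` below). A minimal sketch of the pattern, with an assumed function name:

```python
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from typing_extensions import TypeAlias

    # Annotation-only alias: reads as "not fully typed yet",
    # but behaves as `Any` for the type checker.
    Incomplete: TypeAlias = Any

def dict_encoded(column) -> "Incomplete":
    # e.g. a DictionaryArray whose `.indices` / `.dictionary`
    # attributes the pyarrow stubs don't fully model
    return column.combine_chunks().dictionary_encode()
```
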
```diff
@@ -173,25 +175,54 @@ def filter(self, predicate: NamedIR) -> Self:
         mask = acero.lit(resolved.native)
         return self._with_native(self.native.filter(mask))
 
-    # TODO @dangotbanned: Clean this up after getting more tests in place
     def partition_by(self, by: Sequence[str], *, include_key: bool = True) -> list[Self]:
-        original_names = self.columns
-        temp_name = temp.column_name(original_names)
-        native = self.native
-        composite_values = group_by.concat_str(acero.select_names_table(native, by))
-        re_keyed = native.add_column(0, temp_name, composite_values)
-        source = acero.table_source(re_keyed)
-        if include_key:
-            keep = original_names
-        else:
-            ignore = {*by, temp_name}
-            keep = [name for name in original_names if name not in ignore]
-        select = acero.select_names(keep)
-        key = acero.col(temp_name)
-        # Need to iterate over the whole thing, so py_list first should be faster
-        partitions = (
-            acero.declare(source, acero.filter(key == v), select)
-            for v in composite_values.unique().to_pylist()
-        )
         from_native = self._with_native
-        return [from_native(decl.to_table()) for decl in partitions]
+        partitions = partition_by(self.native, by, include_key=include_key)
+        return [from_native(df) for df in partitions]
+
+
+def partition_by(
+    native: pa.Table, by: Sequence[str], *, include_key: bool = True
+) -> Iterator[pa.Table]:
+    if len(by) == 1:
+        yield from _partition_by_one(native, by[0], include_key=include_key)
+    else:
+        yield from _partition_by_many(native, by, include_key=include_key)
+
+
+def _partition_by_one(
+    native: pa.Table, by: str, *, include_key: bool = True
+) -> Iterator[pa.Table]:
+    """Optimized path for single-column partition."""
+    arr_dict: Incomplete = fn.array(native.column(by).dictionary_encode("encode"))
+    indices: pa.Int32Array = arr_dict.indices
+    if not include_key:
+        native = native.remove_column(native.schema.get_field_index(by))
+    for idx in range(len(arr_dict.dictionary)):
+        # NOTE: Acero filter doesn't support `null_selection_behavior="emit_null"`.
+        # Is there any reasonable way to do this in Acero?
+        yield native.filter(pc.equal(pa.scalar(idx), indices))
+
+
+def _partition_by_many(
+    native: pa.Table, by: Sequence[str], *, include_key: bool = True
+) -> Iterator[pa.Table]:
+    original_names = native.column_names
+    temp_name = temp.column_name(original_names)
+    key = acero.col(temp_name)
+    composite_values = group_by.concat_str(acero.select_names_table(native, by))
+    # Need to iterate over the whole thing, so `to_pylist` first should be faster.
+    unique_py = composite_values.unique().to_pylist()
+    re_keyed = native.add_column(0, temp_name, composite_values)
+    source = acero.table_source(re_keyed)
+    if include_key:
+        keep = original_names
+    else:
+        ignore = {*by, temp_name}
+        keep = [name for name in original_names if name not in ignore]
+    select = acero.select_names(keep)
+    for v in unique_py:
+        # NOTE: May want to split the `Declaration` production iterator into its own
+        # function, e.g. to push down column selection to *before* collection.
+        # Not needed for this task though.
+        yield acero.collect(source, acero.filter(key == v), select)
```
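
The NOTE in `_partition_by_one` concerns null keys: with the default `null_encoding="mask"`, null rows get a null index, the equality mask is then null, and a filter's default `"drop"` behavior would exclude them from every partition; Acero's filter node offers no `"emit_null"` equivalent. Passing `"encode"` instead gives nulls their own dictionary slot. A small demonstration (data invented for illustration):

```python
import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({"key": ["a", None, "a"], "val": [1, 2, 3]})
column = table.column("key").combine_chunks()

masked = column.dictionary_encode()           # null_encoding="mask" (default)
print(masked.indices.to_pylist())             # [0, None, 0]
# Mask is [true, null, true]; the null row falls out of every partition.
print(table.filter(pc.equal(masked.indices, pa.scalar(0))).num_rows)   # 2

encoded = column.dictionary_encode("encode")  # nulls get a dictionary slot
print(encoded.indices.to_pylist())            # [0, 1, 0]: null is index 1
print(table.filter(pc.equal(encoded.indices, pa.scalar(1))).num_rows)  # 1
```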

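For the multi-column path, the composite key built by `group_by.concat_str` can be approximated in plain `pyarrow` with `pc.binary_join_element_wise` (separator and column names here are illustrative, not from the commit):

```python
import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({"a": ["x", "x", "y"], "b": ["1", "2", "1"], "val": [10, 20, 30]})

# Join the key columns element-wise into one string key; the last
# argument is the separator, chosen to avoid accidental collisions.
composite = pc.binary_join_element_wise(table["a"], table["b"], "\x1f")

for v in composite.unique().to_pylist():
    # One filter pass per distinct composite value, as in `_partition_by_many`.
    print(v.split("\x1f"), table.filter(pc.equal(composite, pa.scalar(v))).num_rows)
```
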
narwhals/_plan/arrow/functions.py

Lines changed: 23 additions & 10 deletions
```diff
@@ -4,7 +4,7 @@
 
 import typing as t
 from collections.abc import Callable
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, overload
 
 import pyarrow as pa  # ignore-banned-import
 import pyarrow.compute as pc  # ignore-banned-import
@@ -54,7 +54,7 @@
         StringType,
         UnaryFunction,
     )
-    from narwhals.typing import ClosedInterval, IntoArrowSchema
+    from narwhals.typing import ClosedInterval, IntoArrowSchema, PythonLiteral
 
 BACKEND_VERSION = Implementation.PYARROW._backend_version()
 
@@ -348,20 +348,33 @@ def lit(value: Any, dtype: DataType | None = None) -> NativeScalar:
     return pa.scalar(value) if dtype is None else pa.scalar(value, dtype)
 
 
+@overload
+def array(data: ArrowAny, /) -> ArrayAny: ...
+@overload
 def array(
-    value: NativeScalar | Iterable[Any], dtype: DataType | None = None, /
+    data: Iterable[PythonLiteral], dtype: DataType | None = None, /
+) -> ArrayAny: ...
+def array(
+    data: ArrowAny | Iterable[PythonLiteral], dtype: DataType | None = None, /
 ) -> ArrayAny:
-    return (
-        pa.array([value], value.type)
-        if isinstance(value, pa.Scalar)
-        else pa.array(value, dtype)
-    )
+    """Convert `data` into an Array instance.
+
+    Note:
+        `dtype` is not used for existing `pyarrow` data; use `cast` instead.
+    """
+    if isinstance(data, pa.ChunkedArray):
+        return data.combine_chunks()
+    if isinstance(data, pa.Array):
+        return data
+    if isinstance(data, pa.Scalar):
+        return pa.array([data], data.type)
+    return pa.array(data, dtype)
 
 
 def chunked_array(
-    arr: ArrowAny | list[Iterable[Any]], dtype: DataType | None = None, /
+    data: ArrowAny | list[Iterable[Any]], dtype: DataType | None = None, /
 ) -> ChunkedArrayAny:
-    return _chunked_array(array(arr) if isinstance(arr, pa.Scalar) else arr, dtype)
+    return _chunked_array(array(data) if isinstance(data, pa.Scalar) else data, dtype)
 
 
 def concat_vertical_chunked(
```
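
A standalone sketch of the new `array` dispatch (reimplemented here under an assumed name so it runs without the repo; the `@overload`s in the diff only add typing on top of this logic):

```python
import pyarrow as pa
from typing import Any

def to_array(data: Any, dtype: pa.DataType | None = None) -> pa.Array:
    # Same dispatch order as the patched `array` helper.
    if isinstance(data, pa.ChunkedArray):
        return data.combine_chunks()        # flatten chunks into one Array
    if isinstance(data, pa.Array):
        return data                         # already an Array; `dtype` is ignored
    if isinstance(data, pa.Scalar):
        return pa.array([data], data.type)  # wrap a scalar as a 1-element Array
    return pa.array(data, dtype)            # plain Python values

assert to_array(pa.scalar(1)).to_pylist() == [1]
assert to_array(pa.chunked_array([[1], [2]])).to_pylist() == [1, 2]
assert to_array(["a", "b"]).type == pa.string()
```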
