Skip to content

Commit 80185e6

Browse files
authored
perf: Set pandas attributes inplace when it is safe to do so (#2559)
1 parent 776a799 commit 80185e6

File tree

5 files changed

+29
-56
lines changed

5 files changed

+29
-56
lines changed

CONTRIBUTING.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,23 @@ If you add code that should be tested, please add tests.
139139
- To run tests using `cudf.pandas`, run `NARWHALS_DEFAULT_CONSTRUCTORS=pandas python -m cudf.pandas -m pytest`
140140
- To run tests using `polars[gpu]`, run `NARWHALS_POLARS_GPU=1 pytest --constructors=polars[lazy]`
141141
142+
### Backend-specific advice
143+
144+
- pandas:
145+
146+
- Don't use `apply` or `map`. The only place we currently use `apply` is in `group_by` for operations
147+
which the pandas API just doesn't support, and even then, it's accompanied by a big warning.
148+
- Don't use inplace methods, unless you're creating a new object and are sure it's safe to modify
149+
it. In particular, you should never ever modify the user's input data.
150+
151+
- Polars:
152+
153+
- Never use `map_elements`.
154+
155+
- DuckDB / PySpark / anything lazy-only:
156+
157+
- Never assume that your data is ordered in any pre-defined way.
158+
142159
### Test Failure Patterns
143160
144161
We aim to use three standard patterns for handling test failures:

narwhals/_pandas_like/group_by.py

Lines changed: 10 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
from narwhals._compliant import EagerGroupBy
1313
from narwhals._expression_parsing import evaluate_output_names_and_aliases
1414
from narwhals._pandas_like.utils import select_columns_by_name
15-
from narwhals._pandas_like.utils import set_columns
1615
from narwhals.utils import find_stacklevel
1716

1817
if TYPE_CHECKING:
@@ -199,25 +198,17 @@ def agg(self, *exprs: PandasLikeExpr) -> PandasLikeDataFrame: # noqa: C901, PLR
199198
result_aggs.append(result_nunique_aggs)
200199

201200
if std_aggs:
202-
result_aggs.extend(
203-
set_columns(
204-
self._grouped[std_output_names].std(ddof=ddof),
205-
columns=std_aliases,
206-
implementation=implementation,
207-
backend_version=backend_version,
208-
)
209-
for ddof, (std_output_names, std_aliases) in std_aggs.items()
210-
)
201+
for ddof, (std_output_names, std_aliases) in std_aggs.items():
202+
_aggregation = self._grouped[std_output_names].std(ddof=ddof)
203+
# `_aggregation` is a new object so it's OK to operate inplace.
204+
_aggregation.columns = std_aliases
205+
result_aggs.append(_aggregation)
211206
if var_aggs:
212-
result_aggs.extend(
213-
set_columns(
214-
self._grouped[var_output_names].var(ddof=ddof),
215-
columns=var_aliases,
216-
implementation=implementation,
217-
backend_version=backend_version,
218-
)
219-
for ddof, (var_output_names, var_aliases) in var_aggs.items()
220-
)
207+
for ddof, (var_output_names, var_aliases) in var_aggs.items():
208+
_aggregation = self._grouped[var_output_names].var(ddof=ddof)
209+
# `_aggregation` is a new object so it's OK to operate inplace.
210+
_aggregation.columns = var_aliases
211+
result_aggs.append(_aggregation)
221212

222213
if result_aggs:
223214
output_names_counter = collections.Counter(

narwhals/_pandas_like/series_list.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from narwhals._pandas_like.utils import PandasLikeSeriesNamespace
77
from narwhals._pandas_like.utils import get_dtype_backend
88
from narwhals._pandas_like.utils import narwhals_to_native_dtype
9-
from narwhals._pandas_like.utils import set_index
109

1110
if TYPE_CHECKING:
1211
from narwhals._pandas_like.series import PandasLikeSeries
@@ -20,12 +19,8 @@ def len(self) -> PandasLikeSeries:
2019
implementation = self.implementation
2120
backend_version = self.backend_version
2221
if implementation.is_pandas() and backend_version < (3, 0): # pragma: no cover
23-
result = set_index(
24-
result,
25-
self.native.index,
26-
implementation=implementation,
27-
backend_version=backend_version,
28-
)
22+
# `result` is a new object so it's safe to do this inplace.
23+
result.index = self.native.index
2924
dtype = narwhals_to_native_dtype(
3025
self.version.dtypes.UInt32(),
3126
get_dtype_backend(result.dtype, implementation),

narwhals/_pandas_like/utils.py

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -166,36 +166,6 @@ def set_index(
166166
return obj.set_axis(index, axis=0, **kwargs) # type: ignore[attr-defined]
167167

168168

169-
def set_columns(
170-
obj: T,
171-
columns: list[str],
172-
*,
173-
implementation: Implementation,
174-
backend_version: tuple[int, ...],
175-
) -> T:
176-
"""Wrapper around pandas' set_axis to set object columns.
177-
178-
We can set `copy` / `inplace` based on implementation/version.
179-
"""
180-
if implementation is Implementation.CUDF: # pragma: no cover
181-
obj = obj.copy(deep=False) # type: ignore[attr-defined]
182-
obj.columns = columns # type: ignore[attr-defined]
183-
return obj
184-
if implementation is Implementation.PANDAS and (
185-
backend_version < (1,)
186-
): # pragma: no cover
187-
kwargs = {"inplace": False}
188-
else:
189-
kwargs = {}
190-
if implementation is Implementation.PANDAS and (
191-
(1, 5) <= backend_version < (3,)
192-
): # pragma: no cover
193-
kwargs["copy"] = False
194-
else: # pragma: no cover
195-
pass
196-
return obj.set_axis(columns, axis=1, **kwargs) # type: ignore[attr-defined]
197-
198-
199169
def rename(
200170
obj: T,
201171
*args: Any,
File renamed without changes.

0 commit comments

Comments
 (0)