Commit 7979909

Merge remote-tracking branch 'upstream/main' into more-dedup-1
2 parents: 372f1eb + 490d029


57 files changed: +911, -1662 lines

.github/workflows/pytest.yml

Lines changed: 3 additions & 3 deletions

@@ -53,12 +53,12 @@ jobs:
           cache-dependency-glob: "pyproject.toml"
       - name: install-reqs
         # we are not testing pyspark on Windows here because it is very slow
-        run: uv pip install -e ".[tests, core, extra, dask, modin]" --system
+        run: uv pip install -e ".[tests, core, extra, dask, modin, sqlframe]" --system
       - name: show-deps
         run: uv pip freeze
       - name: Run pytest
         run: |
-          pytest tests --cov=narwhals --cov=tests --runslow --cov-fail-under=95 --constructors=pandas,pandas[nullable],pandas[pyarrow],pyarrow,modin[pyarrow],polars[eager],polars[lazy],dask,duckdb
+          pytest tests --cov=narwhals --cov=tests --runslow --cov-fail-under=95 --constructors=pandas,pandas[nullable],pandas[pyarrow],pyarrow,modin[pyarrow],polars[eager],polars[lazy],dask,duckdb,sqlframe

   pytest-full-coverage:
     strategy:
@@ -83,7 +83,7 @@ jobs:
           cache-suffix: ${{ matrix.python-version }}
           cache-dependency-glob: "pyproject.toml"
       - name: install-reqs
-        run: uv pip install -e ".[tests, core, extra, modin, dask]" --system
+        run: uv pip install -e ".[tests, core, extra, modin, dask, sqlframe]" --system
       - name: install pyspark
         run: uv pip install -e ".[pyspark]" --system
         # PySpark is not yet available on Python3.12+

narwhals/_arrow/dataframe.py

Lines changed: 2 additions & 2 deletions

@@ -181,7 +181,7 @@ def get_column(self: Self, name: str) -> ArrowSeries:
             version=self._version,
         )

-    def __array__(self: Self, dtype: Any, copy: bool | None) -> _2DArray:
+    def __array__(self: Self, dtype: Any, *, copy: bool | None) -> _2DArray:
         return self._native_frame.__array__(dtype, copy=copy)

     @overload
@@ -356,7 +356,7 @@ def select(self: ArrowDataFrame, *exprs: ArrowExpr) -> ArrowDataFrame:
         names = [s.name for s in new_series]
         reshaped = align_series_full_broadcast(*new_series)
         df = pa.Table.from_arrays([s._native_series for s in reshaped], names=names)
-        return self._from_native_frame(df, validate_column_names=False)
+        return self._from_native_frame(df, validate_column_names=True)

     def with_columns(self: ArrowDataFrame, *exprs: ArrowExpr) -> ArrowDataFrame:
         native_frame = self._native_frame
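
Note on the `__array__` change: `copy` is now keyword-only, which matches the NumPy array-protocol convention (NumPy 2.x passes `copy` as a keyword when it calls `__array__`). A minimal sketch of what the `*` in the signature enforces; the `Frame` class below is an illustrative stand-in, not narwhals code:

    import numpy as np

    class Frame:
        """Illustrative stand-in for an object exposing a keyword-only `copy`."""

        def __init__(self, data: list[list[float]]) -> None:
            self._data = data

        def __array__(self, dtype=None, *, copy=None):
            return np.asarray(self._data, dtype=dtype)

    frame = Frame([[1.0, 2.0], [3.0, 4.0]])
    frame.__array__(None, copy=None)  # OK: `copy` passed by keyword
    # frame.__array__(None, None)     # TypeError: `copy` can no longer be positional
    np.asarray(frame)                 # the protocol path is unaffected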

narwhals/_arrow/expr.py

Lines changed: 3 additions & 4 deletions

@@ -14,6 +14,7 @@
 from narwhals._arrow.series import ArrowSeries
 from narwhals._expression_parsing import ExprKind
 from narwhals._expression_parsing import evaluate_output_names_and_aliases
+from narwhals._expression_parsing import is_scalar_like
 from narwhals._expression_parsing import reuse_series_implementation
 from narwhals.dependencies import get_numpy
 from narwhals.dependencies import is_numpy_array
@@ -414,10 +415,8 @@ def clip(self: Self, lower_bound: Any | None, upper_bound: Any | None) -> Self:
         )

     def over(self: Self, keys: list[str], kind: ExprKind) -> Self:
-        if kind is ExprKind.TRANSFORM:
-            msg = (
-                "Elementwise operations in `over` context are not supported for PyArrow."
-            )
+        if not is_scalar_like(kind):
+            msg = "Only aggregation or literal operations are supported in `over` context for PyArrow."
             raise NotImplementedError(msg)

         def func(df: ArrowDataFrame) -> list[ArrowSeries]:
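
With this change, the PyArrow backend accepts only scalar-like expressions (aggregations and literals) inside a window, and rejects anything else with the clearer message above. A rough sketch at the narwhals API level, assuming a PyArrow-backed eager DataFrame:

    import pyarrow as pa
    import narwhals as nw

    df = nw.from_native(pa.table({"g": ["a", "a", "b"], "x": [1, 2, 3]}), eager_only=True)

    # an aggregation per partition is scalar-like, so it is accepted
    df.with_columns(nw.col("x").max().over("g").alias("x_max"))

    # an elementwise/transform expression in the same position would now raise
    # NotImplementedError for the PyArrow backend:
    # df.with_columns(nw.col("x").cum_sum().over("g").alias("x_cumsum"))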

narwhals/_arrow/namespace.py

Lines changed: 11 additions & 13 deletions

@@ -33,12 +33,15 @@
 from typing import Callable

 from typing_extensions import Self
+from typing_extensions import TypeAlias

 from narwhals._arrow.typing import Incomplete
 from narwhals._arrow.typing import IntoArrowExpr
 from narwhals.dtypes import DType
 from narwhals.utils import Version

+_Scalar: TypeAlias = Any
+

 class ArrowNamespace(CompliantNamespace[ArrowDataFrame, ArrowSeries]):
     def _create_expr_from_callable(
@@ -385,15 +388,15 @@ def __init__(
         self: Self,
         condition: ArrowExpr,
         backend_version: tuple[int, ...],
-        then_value: Any = None,
-        otherwise_value: Any = None,
+        then_value: ArrowExpr | _Scalar = None,
+        otherwise_value: ArrowExpr | _Scalar = None,
         *,
         version: Version,
     ) -> None:
         self._backend_version = backend_version
         self._condition: ArrowExpr = condition
-        self._then_value: ArrowExpr | Any = then_value
-        self._otherwise_value: ArrowExpr | Any = otherwise_value
+        self._then_value: ArrowExpr | _Scalar = then_value
+        self._otherwise_value: ArrowExpr | _Scalar = otherwise_value
         self._version = version

     def __call__(self: Self, df: ArrowDataFrame) -> Sequence[ArrowSeries]:
@@ -404,7 +407,6 @@ def __call__(self: Self, df: ArrowDataFrame) -> Sequence[ArrowSeries]:
         if isinstance(self._then_value, ArrowExpr):
             value_series = self._then_value(df)[0]
         else:
-            # `self._then_value` is a scalar
             value_series = plx._create_series_from_scalar(
                 self._then_value, reference_series=condition.alias("literal")
             )
@@ -423,7 +425,6 @@ def __call__(self: Self, df: ArrowDataFrame) -> Sequence[ArrowSeries]:
         if isinstance(self._otherwise_value, ArrowExpr):
             otherwise_series = self._otherwise_value(df)[0]
         else:
-            # `self._otherwise_value` is a scalar
             otherwise_series = plx._create_series_from_scalar(
                 self._otherwise_value, reference_series=condition.alias("literal")
             )
@@ -438,7 +439,7 @@ def __call__(self: Self, df: ArrowDataFrame) -> Sequence[ArrowSeries]:
             )
         ]

-    def then(self: Self, value: ArrowExpr | ArrowSeries | Any) -> ArrowThen:
+    def then(self: Self, value: ArrowExpr | ArrowSeries | _Scalar) -> ArrowThen:
         self._then_value = value

         return ArrowThen(
@@ -469,17 +470,14 @@ def __init__(
     ) -> None:
         self._backend_version = backend_version
         self._version = version
-        self._call = call
+        self._call: ArrowWhen = call
         self._depth = depth
         self._function_name = function_name
         self._evaluate_output_names = evaluate_output_names
         self._alias_output_names = alias_output_names
         self._call_kwargs = call_kwargs or {}

-    def otherwise(self: Self, value: ArrowExpr | ArrowSeries | Any) -> ArrowExpr:
-        # type ignore because we are setting the `_call` attribute to a
-        # callable object of type `PandasWhen`, base class has the attribute as
-        # only a `Callable`
-        self._call._otherwise_value = value  # type: ignore[attr-defined]
+    def otherwise(self: Self, value: ArrowExpr | ArrowSeries | _Scalar) -> ArrowExpr:
+        self._call._otherwise_value = value
         self._function_name = "whenotherwise"
         return self
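
The new `_Scalar` alias (currently just `Any`) makes explicit that `then` and `otherwise` accept either an `ArrowExpr` or a plain Python scalar. A minimal sketch of the corresponding narwhals-level chain, assuming a PyArrow-backed eager DataFrame:

    import pyarrow as pa
    import narwhals as nw

    df = nw.from_native(pa.table({"a": [1, 2, 3]}), eager_only=True)

    # `then` receives a scalar, `otherwise` receives an expression
    df.with_columns(nw.when(nw.col("a") > 1).then(10).otherwise(nw.col("a")).alias("b"))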

narwhals/_arrow/series.py

Lines changed: 8 additions & 19 deletions

@@ -419,7 +419,7 @@ def scatter(self: Self, indices: int | Sequence[int], values: Any) -> Self:
     def to_list(self: Self) -> list[Any]:
         return self._native_series.to_pylist()

-    def __array__(self: Self, dtype: Any = None, copy: bool | None = None) -> _1DArray:
+    def __array__(self: Self, dtype: Any = None, *, copy: bool | None = None) -> _1DArray:
         return self._native_series.__array__(dtype=dtype, copy=copy)

     def to_numpy(self: Self) -> _1DArray:
@@ -997,7 +997,7 @@ def rolling_var(
         )

         cum_sum_sq = (
-            padded_series.__pow__(2)
+            pow(padded_series, 2)
             .cum_sum(reverse=False)
             .fill_null(value=None, strategy="forward", limit=None)
         )
@@ -1091,7 +1091,6 @@ def hist( # noqa: PLR0915
         def _hist_from_bin_count(bin_count: int):  # type: ignore[no-untyped-def] # noqa: ANN202
             d = pc.min_max(self._native_series)
             lower, upper = d["min"], d["max"]
-            pad_lowest_bin = False
             pa_float = pa.type_for_alias("float")
             if lower == upper:
                 range_ = lit(1.0)
@@ -1100,7 +1099,6 @@ def _hist_from_bin_count(bin_count: int): # type: ignore[no-untyped-def] # noqa
                 lower = pc.subtract(lower, mid)
                 upper = pc.add(upper, mid)
             else:
-                pad_lowest_bin = True
                 range_ = pc.subtract(upper, lower)
             width = pc.divide(pc.cast(range_, pa_float), lit(float(bin_count)))

@@ -1151,15 +1149,7 @@ def _hist_from_bin_count(bin_count: int): # type: ignore[no-untyped-def] # noqa
             # extract left/right side of the intervals
             bin_left = pc.add(lower, pc.multiply(counts.column("values"), width))
             bin_right = pc.add(bin_left, width)
-            if pad_lowest_bin:
-                # pad lowest bin by 1% of range
-                lowest_padded = [
-                    pc.subtract(
-                        bin_left[0], pc.multiply(pc.cast(range_, pa_float), lit(0.001))
-                    )
-                ]
-                bin_left = chunked_array([lowest_padded, cast("Any", bin_left[1:])])
-            return counts.column("counts"), bin_left, bin_right
+            return counts.column("counts"), bin_right

         def _hist_from_bins(bins: Sequence[int | float]):  # type: ignore[no-untyped-def] # noqa: ANN202
             bin_indices = np.searchsorted(bins, self._native_series, side="left")
@@ -1169,20 +1159,19 @@ def _hist_from_bins(bins: Sequence[int | float]): # type: ignore[no-untyped-def
             counts[np.isin(obj_cats, obs_cats)] = obs_counts[np.isin(obs_cats, obj_cats)]

             bin_right = bins[1:]
-            bin_left = bins[:-1]
-            return counts, bin_left, bin_right
+            return counts, bin_right

         if bins is not None:
             if len(bins) < 2:
-                counts, bin_left, bin_right = [], [], []
+                counts, bin_right = [], []
             else:
-                counts, bin_left, bin_right = _hist_from_bins(bins)
+                counts, bin_right = _hist_from_bins(bins)

         elif bin_count is not None:
             if bin_count == 0:
-                counts, bin_left, bin_right = [], [], []
+                counts, bin_right = [], []
             else:
-                counts, bin_left, bin_right = _hist_from_bin_count(bin_count)
+                counts, bin_right = _hist_from_bin_count(bin_count)

         else: # pragma: no cover
             # caller guarantees that either bins or bin_count is specified
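
After this change the internal histogram helpers return only the counts and the right bin edges; the left edges (and the former lowest-bin padding) are no longer computed. A minimal sketch of the user-facing call they feed, assuming `Series.hist` reports, Polars-style, one breakpoint (right edge) and count per bin:

    import pyarrow as pa
    import narwhals as nw

    s = nw.from_native(pa.chunked_array([[1, 1, 2, 5, 8]]), series_only=True)
    s.hist(bin_count=3)  # one row per bin: the bin's right edge and its count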

narwhals/_arrow/typing.py

Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@
 TieBreaker: TypeAlias = Literal["min", "max", "first", "dense"]
 NullPlacement: TypeAlias = Literal["at_start", "at_end"]

-StringArray: TypeAlias = "pc.StringArray"
+StringArray: TypeAlias = pc.StringArray
 ArrowChunkedArray: TypeAlias = pa.ChunkedArray[Any]
 ArrowArray: TypeAlias = pa.Array[Any]
 _AsPyType = TypeVar("_AsPyType")
