Skip to content

Commit fa30706

Browse files
authored
Merge branch 'main' into refac-pandas-concat
2 parents 1fa9f8f + 939e450 commit fa30706

File tree

7 files changed

+191
-133
lines changed

7 files changed

+191
-133
lines changed

narwhals/_arrow/series.py

Lines changed: 46 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,58 +1065,20 @@ def hist( # noqa: PLR0915
10651065

10661066
def _hist_from_bin_count(bin_count: int): # type: ignore[no-untyped-def] # noqa: ANN202
10671067
d = pc.min_max(self.native)
1068-
lower, upper = d["min"], d["max"]
1069-
pa_float = pa.type_for_alias("float")
1068+
lower, upper = d["min"].as_py(), d["max"].as_py()
10701069
if lower == upper:
1071-
range_: pa.Scalar[Any] = lit(1.0)
1072-
mid = lit(0.5)
1073-
width = pc.divide(range_, lit(bin_count))
1074-
lower = pc.subtract(lower, mid)
1075-
upper = pc.add(upper, mid)
1076-
else:
1077-
range_ = pc.subtract(upper, lower)
1078-
width = pc.divide(pc.cast(range_, pa_float), lit(float(bin_count)))
1079-
1080-
bin_proportions = pc.divide(pc.subtract(self.native, lower), width)
1081-
bin_indices = pc.floor(bin_proportions)
1082-
1083-
# shift bins so they are right-closed
1084-
bin_indices = pc.if_else(
1085-
pc.and_(
1086-
pc.equal(bin_indices, bin_proportions),
1087-
pc.greater(bin_indices, lit(0)),
1088-
),
1089-
pc.subtract(bin_indices, lit(1)),
1090-
bin_indices,
1091-
)
1092-
possible = pa.Table.from_arrays(
1093-
[pa.Array.from_pandas(np.arange(bin_count, dtype="int64"))], ["values"]
1094-
)
1095-
counts = ( # count bin id occurrences
1096-
pa.Table.from_arrays(
1097-
pc.value_counts(bin_indices).flatten(),
1098-
names=["values", "counts"],
1099-
)
1100-
# nan values are implicitly dropped in value_counts
1101-
.filter(~pc.field("values").is_nan())
1102-
.cast(pa.schema([("values", pa.int64()), ("counts", pa.int64())]))
1103-
# align bin ids to all possible bin ids (populate in missing bins)
1104-
.join(possible, keys="values", join_type="right outer")
1105-
.sort_by("values")
1106-
)
1107-
# empty bin intervals should have a 0 count
1108-
counts_coalesce = cast(
1109-
"ArrowArray", pc.coalesce(counts.column("counts"), lit(0))
1110-
)
1111-
counts = counts.set_column(0, "counts", counts_coalesce)
1112-
1113-
# extract left/right side of the intervals
1114-
bin_left = pc.add(lower, pc.multiply(counts.column("values"), width))
1115-
bin_right = pc.add(bin_left, width)
1116-
return counts.column("counts"), bin_right
1070+
lower -= 0.5
1071+
upper += 0.5
1072+
bins = np.linspace(lower, upper, bin_count + 1)
1073+
return _hist_from_bins(bins)
11171074

11181075
def _hist_from_bins(bins: Sequence[int | float]): # type: ignore[no-untyped-def] # noqa: ANN202
11191076
bin_indices = np.searchsorted(bins, self.native, side="left")
1077+
bin_indices = pc.if_else( # lowest bin is inclusive
1078+
pc.equal(self.native, lit(bins[0])), 1, bin_indices
1079+
)
1080+
1081+
# align unique categories and counts appropriately
11201082
obs_cats, obs_counts = np.unique(bin_indices, return_counts=True)
11211083
obj_cats = np.arange(1, len(bins))
11221084
counts = np.zeros_like(obj_cats)
@@ -1125,15 +1087,51 @@ def _hist_from_bins(bins: Sequence[int | float]): # type: ignore[no-untyped-def
11251087
bin_right = bins[1:]
11261088
return counts, bin_right
11271089

1090+
counts: Sequence[int | float] | np.typing.ArrayLike
1091+
bin_right: Sequence[int | float] | np.typing.ArrayLike
1092+
1093+
data_count = pc.sum(
1094+
pc.invert(pc.or_(pc.is_nan(self.native), pc.is_null(self.native))).cast(
1095+
pa.uint8()
1096+
),
1097+
min_count=0,
1098+
)
11281099
if bins is not None:
11291100
if len(bins) < 2:
11301101
counts, bin_right = [], []
1102+
1103+
elif data_count == pa.scalar(0, type=pa.uint64()): # type:ignore[comparison-overlap]
1104+
counts = np.zeros(len(bins) - 1)
1105+
bin_right = bins[1:]
1106+
1107+
elif len(bins) == 2:
1108+
counts = [
1109+
pc.sum(
1110+
pc.and_(
1111+
pc.greater_equal(self.native, lit(float(bins[0]))),
1112+
pc.less_equal(self.native, lit(float(bins[1]))),
1113+
).cast(pa.uint8())
1114+
)
1115+
]
1116+
bin_right = [bins[-1]]
11311117
else:
11321118
counts, bin_right = _hist_from_bins(bins)
11331119

11341120
elif bin_count is not None:
11351121
if bin_count == 0:
11361122
counts, bin_right = [], []
1123+
elif data_count == pa.scalar(0, type=pa.uint64()): # type:ignore[comparison-overlap]
1124+
counts, bin_right = (
1125+
np.zeros(bin_count),
1126+
np.linspace(0, 1, bin_count + 1)[1:],
1127+
)
1128+
elif bin_count == 1:
1129+
d = pc.min_max(self.native)
1130+
lower, upper = d["min"], d["max"]
1131+
if lower == upper:
1132+
counts, bin_right = [data_count], [pc.add(upper, pa.scalar(0.5))]
1133+
else:
1134+
counts, bin_right = [data_count], [upper]
11371135
else:
11381136
counts, bin_right = _hist_from_bin_count(bin_count)
11391137

narwhals/_dask/expr.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -644,9 +644,18 @@ def func(df: DaskLazyFrame) -> Sequence[dx.Series]:
644644
message=".*`meta` is not specified",
645645
category=UserWarning,
646646
)
647-
res_native = df.native.groupby(partition_by)[
648-
list(output_names)
649-
].transform(dask_function_name, **self._call_kwargs)
647+
grouped = df.native.groupby(partition_by)
648+
if dask_function_name == "size":
649+
if len(output_names) != 1: # pragma: no cover
650+
msg = "Safety check failed, please report a bug."
651+
raise AssertionError(msg)
652+
res_native = grouped.transform(
653+
dask_function_name, **self._call_kwargs
654+
).to_frame(output_names[0])
655+
else:
656+
res_native = grouped[list(output_names)].transform(
657+
dask_function_name, **self._call_kwargs
658+
)
650659
result_frame = df._with_native(
651660
res_native.rename(columns=dict(zip(output_names, aliases)))
652661
).native

narwhals/_pandas_like/expr.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def cum_sum(self: Self, *, reverse: bool) -> Self:
198198
def shift(self: Self, n: int) -> Self:
199199
return self._reuse_series("shift", call_kwargs={"n": n})
200200

201-
def over(
201+
def over( # noqa: PLR0915
202202
self: Self,
203203
partition_by: Sequence[str],
204204
order_by: Sequence[str] | None,
@@ -265,21 +265,25 @@ def func(df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]:
265265
elif reverse:
266266
columns = list(set(partition_by).union(output_names))
267267
df = df[columns][::-1]
268+
grouped = df._native_frame.groupby(partition_by)
268269
if function_name.startswith("rolling"):
269-
rolling = df._native_frame.groupby(partition_by)[
270-
list(output_names)
271-
].rolling(**pandas_kwargs)
270+
rolling = grouped[list(output_names)].rolling(**pandas_kwargs)
272271
assert pandas_function_name is not None # help mypy # noqa: S101
273272
if pandas_function_name in {"std", "var"}:
274273
res_native = getattr(rolling, pandas_function_name)(
275274
ddof=self._call_kwargs["ddof"]
276275
)
277276
else:
278277
res_native = getattr(rolling, pandas_function_name)()
278+
elif function_name == "len":
279+
if len(output_names) != 1: # pragma: no cover
280+
msg = "Safety check failed, please report a bug."
281+
raise AssertionError(msg)
282+
res_native = grouped.transform("size").to_frame(aliases[0])
279283
else:
280-
res_native = df._native_frame.groupby(partition_by)[
281-
list(output_names)
282-
].transform(pandas_function_name, **pandas_kwargs)
284+
res_native = grouped[list(output_names)].transform(
285+
pandas_function_name, **pandas_kwargs
286+
)
283287
result_frame = df._with_native(res_native).rename(
284288
dict(zip(output_names, aliases))
285289
)

narwhals/_pandas_like/series.py

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -966,34 +966,50 @@ def hist(
966966
data["breakpoint"] = []
967967
data["count"] = []
968968
return PandasLikeDataFrame.from_native(ns.DataFrame(data), context=self)
969-
elif self.native.count() < 1:
969+
970+
if self.native.count() < 1:
970971
if bins is not None:
971972
data = {"breakpoint": bins[1:], "count": zeros(shape=len(bins) - 1)}
972973
else:
973974
count = cast("int", bin_count)
974-
data = {"breakpoint": linspace(0, 1, count), "count": zeros(shape=count)}
975+
if bin_count == 1:
976+
data = {"breakpoint": [1.0], "count": [0]}
977+
else:
978+
data = {
979+
"breakpoint": linspace(0, 1, count + 1)[1:],
980+
"count": zeros(shape=count),
981+
}
975982
if not include_breakpoint:
976983
del data["breakpoint"]
977984
return PandasLikeDataFrame.from_native(ns.DataFrame(data), context=self)
978985

979-
elif bin_count is not None: # use Polars binning behavior
986+
if bin_count is not None:
987+
# use Polars binning behavior
980988
lower, upper = self.native.min(), self.native.max()
981-
pad_lowest_bin = False
982989
if lower == upper:
983990
lower -= 0.5
984991
upper += 0.5
985-
else:
986-
pad_lowest_bin = True
992+
993+
if bin_count == 1:
994+
data = {
995+
"breakpoint": [upper],
996+
"count": [self.native.count()],
997+
}
998+
if not include_breakpoint:
999+
del data["breakpoint"]
1000+
return PandasLikeDataFrame.from_native(ns.DataFrame(data), context=self)
9871001

9881002
bins = linspace(lower, upper, bin_count + 1)
989-
if pad_lowest_bin and bins is not None:
990-
bins[0] -= 0.001 * abs(bins[0]) if bins[0] != 0 else 0.001
9911003
bin_count = None
9921004

9931005
# pandas (2.2.*) .value_counts(bins=int) adjusts the lowest bin twice, result in improper counts.
9941006
# pandas (2.2.*) .value_counts(bins=[...]) adjusts the lowest bin which should not happen since
9951007
# the bins were explicitly passed in.
996-
categories = ns.cut(self.native, bins=bins if bin_count is None else bin_count)
1008+
categories = ns.cut(
1009+
self.native,
1010+
bins=bins if bin_count is None else bin_count,
1011+
include_lowest=True, # Polars 1.27.0 always includes the lowest bin
1012+
)
9971013
# modin (0.32.0) .value_counts(...) silently drops bins with empty observations, .reindex
9981014
# is necessary to restore these bins.
9991015
result = categories.value_counts(dropna=True, sort=False).reindex(

narwhals/_polars/series.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -499,18 +499,26 @@ def hist(
499499
data.append(pl.Series("breakpoint", [], dtype=pl.Float64))
500500
data.append(pl.Series("count", [], dtype=pl.UInt32))
501501
return PolarsDataFrame.from_native(pl.DataFrame(data), context=self)
502-
elif (self._backend_version < (1, 15)) and self.native.count() < 1:
502+
503+
if self.native.count() < 1:
503504
data_dict: dict[str, Sequence[Any] | pl.Series]
504505
if bins is not None:
505506
data_dict = {
506507
"breakpoint": bins[1:],
507508
"count": pl.zeros(n=len(bins) - 1, dtype=pl.Int64, eager=True),
508509
}
509-
elif bin_count is not None:
510+
elif (bin_count is not None) and bin_count == 1:
511+
data_dict = {"breakpoint": [1.0], "count": [0]}
512+
elif (bin_count is not None) and bin_count > 1:
510513
data_dict = {
511-
"breakpoint": pl.int_range(0, bin_count, eager=True) / bin_count,
514+
"breakpoint": pl.int_range(1, bin_count + 1, eager=True) / bin_count,
512515
"count": pl.zeros(n=bin_count, dtype=pl.Int64, eager=True),
513516
}
517+
else: # pragma: no cover
518+
msg = (
519+
"congratulations, you entered unreachable code - please report a bug"
520+
)
521+
raise AssertionError(msg)
514522
if not include_breakpoint:
515523
del data_dict["breakpoint"]
516524
return PolarsDataFrame.from_native(pl.DataFrame(data_dict), context=self)
@@ -519,25 +527,19 @@ def hist(
519527
# polars <1.5 with bin_count=...
520528
# returns bins that range from -inf to +inf and has bin_count + 1 bins.
521529
# for compat: convert `bin_count=` call to `bins=`
522-
if (
523-
(self._backend_version < (1, 15))
524-
and (bin_count is not None)
525-
and (self.native.count() > 0)
530+
if (self._backend_version < (1, 15)) and (
531+
bin_count is not None
526532
): # pragma: no cover
527533
lower = cast("float", self.native.min())
528534
upper = cast("float", self.native.max())
529-
pad_lowest_bin = False
530535
if lower == upper:
531536
width = 1 / bin_count
532537
lower -= 0.5
533538
upper += 0.5
534539
else:
535-
pad_lowest_bin = True
536540
width = (upper - lower) / bin_count
537541

538542
bins = (pl.int_range(0, bin_count + 1, eager=True) * width + lower).to_list()
539-
if pad_lowest_bin:
540-
bins[0] -= 0.001 * abs(bins[0]) if bins[0] != 0 else 0.001
541543
bin_count = None
542544

543545
# Polars inconsistently handles NaN values when computing histograms
@@ -552,16 +554,22 @@ def hist(
552554
include_category=False,
553555
include_breakpoint=include_breakpoint,
554556
)
557+
555558
if not include_breakpoint:
556559
df.columns = ["count"]
557560

561+
if self._backend_version < (1, 0) and include_breakpoint:
562+
df = df.rename({"break_point": "breakpoint"})
563+
558564
# polars<1.15 implicitly adds -inf and inf to either end of bins
559565
if self._backend_version < (1, 15) and bins is not None: # pragma: no cover
560566
r = pl.int_range(0, len(df))
561567
df = df.filter((r > 0) & (r < len(df) - 1))
562568

563-
if self._backend_version < (1, 0) and include_breakpoint:
564-
df = df.rename({"break_point": "breakpoint"})
569+
# polars<1.27 makes the lowest bin a left/right closed interval.
570+
if self._backend_version < (1, 27) and bins is not None:
571+
df[0, "count"] += (series == bins[0]).sum()
572+
565573
return PolarsDataFrame.from_native(df, context=self)
566574

567575
def to_polars(self: Self) -> pl.Series:

tests/expr_and_series/over_test.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,3 +428,17 @@ def test_over_without_partition_by(
428428
)
429429
expected = {"a": [1, 2, -1], "b": [1, 3, 4], "i": [0, 1, 2]}
430430
assert_equal_data(result, expected)
431+
432+
433+
def test_len_over_2369(constructor: Constructor, request: pytest.FixtureRequest) -> None:
434+
if "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 3):
435+
pytest.skip()
436+
if "pandas" in str(constructor) and PANDAS_VERSION < (1, 5):
437+
pytest.skip()
438+
if any(x in str(constructor) for x in ("modin",)):
439+
# https://github.com/modin-project/modin/issues/7508
440+
request.applymarker(pytest.mark.xfail)
441+
df = nw.from_native(constructor({"a": [1, 2, 4], "b": ["x", "x", "y"]}))
442+
result = df.with_columns(a_len_per_group=nw.len().over("b")).sort("a")
443+
expected = {"a": [1, 2, 4], "b": ["x", "x", "y"], "a_len_per_group": [2, 2, 1]}
444+
assert_equal_data(result, expected)

0 commit comments

Comments (0)