feat: add support for the 'right' parameter in 'pandas.cut' (#1496)

chelsea-lin · web-flow · commit 8aff1285b267 · 2025-03-17T10:18:17.000-07:00
diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py
@@ -364,8 +364,12 @@ def _(
 
         if op.labels is False:
             for this_bin in range(op.bins - 1):
+                if op.right:
+                    case_expr = x <= (col_min + (this_bin + 1) * bin_width)
+                else:
+                    case_expr = x < (col_min + (this_bin + 1) * bin_width)
                 out = out.when(
-                    x <= (col_min + (this_bin + 1) * bin_width),
+                    case_expr,
                     compile_ibis_types.literal_to_ibis_scalar(
                         this_bin, force_dtype=pd.Int64Dtype()
                     ),
@@ -375,32 +379,49 @@ def _(
             interval_struct = None
             adj = (col_max - col_min) * 0.001
             for this_bin in range(op.bins):
-                left_edge = (
-                    col_min + this_bin * bin_width - (0 if this_bin > 0 else adj)
-                )
-                right_edge = col_min + (this_bin + 1) * bin_width
-                interval_struct = ibis_types.struct(
-                    {
-                        "left_exclusive": left_edge,
-                        "right_inclusive": right_edge,
-                    }
-                )
+                left_edge_adj = adj if this_bin == 0 and op.right else 0
+                right_edge_adj = adj if this_bin == op.bins - 1 and not op.right else 0
+
+                left_edge = col_min + this_bin * bin_width - left_edge_adj
+                right_edge = col_min + (this_bin + 1) * bin_width + right_edge_adj
+
+                if op.right:
+                    interval_struct = ibis_types.struct(
+                        {
+                            "left_exclusive": left_edge,
+                            "right_inclusive": right_edge,
+                        }
+                    )
+                else:
+                    interval_struct = ibis_types.struct(
+                        {
+                            "left_inclusive": left_edge,
+                            "right_exclusive": right_edge,
+                        }
+                    )
 
                 if this_bin < op.bins - 1:
-                    out = out.when(
-                        x <= (col_min + (this_bin + 1) * bin_width),
-                        interval_struct,
-                    )
+                    if op.right:
+                        case_expr = x <= (col_min + (this_bin + 1) * bin_width)
+                    else:
+                        case_expr = x < (col_min + (this_bin + 1) * bin_width)
+                    out = out.when(case_expr, interval_struct)
                 else:
                     out = out.when(x.notnull(), interval_struct)
     else:  # Interpret as intervals
         for interval in op.bins:
             left = compile_ibis_types.literal_to_ibis_scalar(interval[0])
             right = compile_ibis_types.literal_to_ibis_scalar(interval[1])
-            condition = (x > left) & (x <= right)
-            interval_struct = ibis_types.struct(
-                {"left_exclusive": left, "right_inclusive": right}
-            )
+            if op.right:
+                condition = (x > left) & (x <= right)
+                interval_struct = ibis_types.struct(
+                    {"left_exclusive": left, "right_inclusive": right}
+                )
+            else:
+                condition = (x >= left) & (x < right)
+                interval_struct = ibis_types.struct(
+                    {"left_inclusive": left, "right_exclusive": right}
+                )
             out = out.when(condition, interval_struct)
     return out.end()
 
diff --git a/bigframes/core/reshape/tile.py b/bigframes/core/reshape/tile.py
@@ -15,7 +15,6 @@
 from __future__ import annotations
 
 import typing
-from typing import Iterable, Optional, Union
 
 import bigframes_vendored.constants as constants
 import bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile
@@ -33,26 +32,34 @@
 
 def cut(
     x: bigframes.series.Series,
-    bins: Union[
+    bins: typing.Union[
         int,
         pd.IntervalIndex,
-        Iterable,
+        typing.Iterable,
     ],
     *,
-    labels: Union[Iterable[str], bool, None] = None,
+    right: typing.Optional[bool] = True,
+    labels: typing.Union[typing.Iterable[str], bool, None] = None,
 ) -> bigframes.series.Series:
     if isinstance(bins, int) and bins <= 0:
         raise ValueError("`bins` should be a positive integer.")
 
-    if isinstance(bins, Iterable):
+    # TODO: Verify that `right` is ignored when `bins` is an IntervalIndex.
+
+    if isinstance(bins, typing.Iterable):
         if isinstance(bins, pd.IntervalIndex):
+            # TODO: test an empty interval index
             as_index: pd.IntervalIndex = bins
             bins = tuple((bin.left.item(), bin.right.item()) for bin in bins)
+            # To maintain consistency with pandas' behavior
+            right = True
         elif len(list(bins)) == 0:
             raise ValueError("`bins` iterable should have at least one item")
         elif isinstance(list(bins)[0], tuple):
             as_index = pd.IntervalIndex.from_tuples(list(bins))
             bins = tuple(bins)
+            # To maintain consistency with pandas' behavior
+            right = True
         elif pd.api.types.is_number(list(bins)[0]):
             bins_list = list(bins)
             if len(bins_list) < 2:
@@ -82,7 +89,8 @@ def cut(
         )
 
     return x._apply_window_op(
-        agg_ops.CutOp(bins, labels=labels), window_spec=window_specs.unbound()
+        agg_ops.CutOp(bins, right=right, labels=labels),
+        window_spec=window_specs.unbound(),
     )
 
 
@@ -93,7 +101,7 @@ def qcut(
     x: bigframes.series.Series,
     q: typing.Union[int, typing.Sequence[float]],
     *,
-    labels: Optional[bool] = None,
+    labels: typing.Optional[bool] = None,
     duplicates: typing.Literal["drop", "error"] = "error",
 ) -> bigframes.series.Series:
     if isinstance(q, int) and q <= 0:
diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py
@@ -339,6 +339,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
 class CutOp(UnaryWindowOp):
     # TODO: Unintuitive, refactor into multiple ops?
     bins: typing.Union[int, Iterable]
+    right: Optional[bool]
     labels: Optional[bool]
 
     @property
@@ -357,10 +358,19 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
             )
             pa_type = pa.struct(
                 [
-                    pa.field("left_exclusive", interval_dtype, nullable=True),
-                    pa.field("right_inclusive", interval_dtype, nullable=True),
+                    pa.field(
+                        "left_exclusive" if self.right else "left_inclusive",
+                        interval_dtype,
+                        nullable=True,
+                    ),
+                    pa.field(
+                        "right_inclusive" if self.right else "right_exclusive",
+                        interval_dtype,
+                        nullable=True,
+                    ),
                 ]
             )
+
             return pd.ArrowDtype(pa_type)
 
     @property
diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py
@@ -387,33 +387,52 @@ def test_merge_series(scalars_dfs, merge_how):
     assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
 
 
-def test_cut(scalars_dfs):
+@pytest.mark.parametrize(
+    ("right"),
+    [
+        pytest.param(True),
+        pytest.param(False),
+    ],
+)
+def test_cut(scalars_dfs, right):
     scalars_df, scalars_pandas_df = scalars_dfs
 
-    pd_result = pd.cut(scalars_pandas_df["float64_col"], 5, labels=False)
-    bf_result = bpd.cut(scalars_df["float64_col"], 5, labels=False)
+    pd_result = pd.cut(scalars_pandas_df["float64_col"], 5, labels=False, right=right)
+    bf_result = bpd.cut(scalars_df["float64_col"], 5, labels=False, right=right)
 
     # make sure the result is a supported dtype
     assert bf_result.dtype == bpd.Int64Dtype()
     pd_result = pd_result.astype("Int64")
     pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result)
 
 
-def test_cut_default_labels(scalars_dfs):
+@pytest.mark.parametrize(
+    ("right"),
+    [
+        pytest.param(True),
+        pytest.param(False),
+    ],
+)
+def test_cut_default_labels(scalars_dfs, right):
     scalars_df, scalars_pandas_df = scalars_dfs
 
-    pd_result = pd.cut(scalars_pandas_df["float64_col"], 5)
-    bf_result = bpd.cut(scalars_df["float64_col"], 5).to_pandas()
+    pd_result = pd.cut(scalars_pandas_df["float64_col"], 5, right=right)
+    bf_result = bpd.cut(scalars_df["float64_col"], 5, right=right).to_pandas()
 
     # Convert to match data format
+    pd_interval = pd_result.cat.categories[pd_result.cat.codes]
+    if pd_interval.closed == "left":
+        left_key = "left_inclusive"
+        right_key = "right_exclusive"
+    else:
+        left_key = "left_exclusive"
+        right_key = "right_inclusive"
     pd_result_converted = pd.Series(
         [
-            {"left_exclusive": interval.left, "right_inclusive": interval.right}
+            {left_key: interval.left, right_key: interval.right}
             if pd.notna(val)
             else pd.NA
-            for val, interval in zip(
-                pd_result, pd_result.cat.categories[pd_result.cat.codes]
-            )
+            for val, interval in zip(pd_result, pd_interval)
         ],
         name=pd_result.name,
     )
@@ -424,28 +443,35 @@ def test_cut_default_labels(scalars_dfs):
 
 
 @pytest.mark.parametrize(
-    ("breaks",),
+    ("breaks", "right"),
     [
-        ([0, 5, 10, 15, 20, 100, 1000],),  # ints
-        ([0.5, 10.5, 15.5, 20.5, 100.5, 1000.5],),  # floats
-        ([0, 5, 10.5, 15.5, 20, 100, 1000.5],),  # mixed
+        pytest.param([0, 5, 10, 15, 20, 100, 1000], True, id="int_right"),
+        pytest.param([0, 5, 10, 15, 20, 100, 1000], False, id="int_left"),
+        pytest.param([0.5, 10.5, 15.5, 20.5, 100.5, 1000.5], False, id="float_left"),
+        pytest.param([0, 5, 10.5, 15.5, 20, 100, 1000.5], True, id="mixed_right"),
     ],
 )
-def test_cut_numeric_breaks(scalars_dfs, breaks):
+def test_cut_numeric_breaks(scalars_dfs, breaks, right):
     scalars_df, scalars_pandas_df = scalars_dfs
 
-    pd_result = pd.cut(scalars_pandas_df["float64_col"], breaks)
-    bf_result = bpd.cut(scalars_df["float64_col"], breaks).to_pandas()
+    pd_result = pd.cut(scalars_pandas_df["float64_col"], breaks, right=right)
+    bf_result = bpd.cut(scalars_df["float64_col"], breaks, right=right).to_pandas()
 
     # Convert to match data format
+    pd_interval = pd_result.cat.categories[pd_result.cat.codes]
+    if pd_interval.closed == "left":
+        left_key = "left_inclusive"
+        right_key = "right_exclusive"
+    else:
+        left_key = "left_exclusive"
+        right_key = "right_inclusive"
+
     pd_result_converted = pd.Series(
         [
-            {"left_exclusive": interval.left, "right_inclusive": interval.right}
+            {left_key: interval.left, right_key: interval.right}
             if pd.notna(val)
             else pd.NA
-            for val, interval in zip(
-                pd_result, pd_result.cat.categories[pd_result.cat.codes]
-            )
+            for val, interval in zip(pd_result, pd_interval)
         ],
         name=pd_result.name,
     )
@@ -476,29 +502,47 @@ def test_cut_errors(scalars_dfs, bins):
 
 
 @pytest.mark.parametrize(
-    ("bins",),
+    ("bins", "right"),
     [
-        ([(-5, 2), (2, 3), (-3000, -10)],),
-        (pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (4, 5)]),),
+        pytest.param([(-5, 2), (2, 3), (-3000, -10)], True, id="tuple_right"),
+        pytest.param([(-5, 2), (2, 3), (-3000, -10)], False, id="tuple_left"),
+        pytest.param(
+            pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (4, 5)]),
+            True,
+            id="interval_right",
+        ),
+        pytest.param(
+            pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (4, 5)]),
+            False,
+            id="interval_left",
+        ),
     ],
 )
-def test_cut_with_interval(scalars_dfs, bins):
+def test_cut_with_interval(scalars_dfs, bins, right):
     scalars_df, scalars_pandas_df = scalars_dfs
-    bf_result = bpd.cut(scalars_df["int64_too"], bins, labels=False).to_pandas()
+    bf_result = bpd.cut(
+        scalars_df["int64_too"], bins, labels=False, right=right
+    ).to_pandas()
 
     if isinstance(bins, list):
         bins = pd.IntervalIndex.from_tuples(bins)
-    pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=False)
+    pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=False, right=right)
 
     # Convert to match data format
+    pd_interval = pd_result.cat.categories[pd_result.cat.codes]
+    if pd_interval.closed == "left":
+        left_key = "left_inclusive"
+        right_key = "right_exclusive"
+    else:
+        left_key = "left_exclusive"
+        right_key = "right_inclusive"
+
     pd_result_converted = pd.Series(
         [
-            {"left_exclusive": interval.left, "right_inclusive": interval.right}
+            {left_key: interval.left, right_key: interval.right}
             if pd.notna(val)
             else pd.NA
-            for val, interval in zip(
-                pd_result, pd_result.cat.categories[pd_result.cat.codes]
-            )
+            for val, interval in zip(pd_result, pd_interval)
         ],
         name=pd_result.name,
     )
diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py