fix: fix pandas.cut errors with empty bins (#1499)

chelsea-lin · web-flow · commit 434fb5dd60d1 · 2025-03-18T11:38:23.000-07:00
* fix: fix pandas.cut errors with empty bins

* nit

* refactor if branches for better readability
diff --git a/bigframes/core/reshape/tile.py b/bigframes/core/reshape/tile.py
@@ -41,32 +41,33 @@ def cut(
     right: typing.Optional[bool] = True,
     labels: typing.Union[typing.Iterable[str], bool, None] = None,
 ) -> bigframes.series.Series:
-    if isinstance(bins, int) and bins <= 0:
-        raise ValueError("`bins` should be a positive integer.")
-
-    # TODO: Check `right` does not apply for IntervalIndex.
+    if labels is not None and labels is not False:
+        raise NotImplementedError(
+            "The 'labels' parameter must be either False or None. "
+            "Please provide a valid value for 'labels'."
+        )
 
-    if isinstance(bins, typing.Iterable):
+    if isinstance(bins, int):
+        if bins <= 0:
+            raise ValueError("`bins` should be a positive integer.")
+        op = agg_ops.CutOp(bins, right=right, labels=labels)
+        return x._apply_window_op(op, window_spec=window_specs.unbound())
+    elif isinstance(bins, typing.Iterable):
         if isinstance(bins, pd.IntervalIndex):
-            # TODO: test an empty internval index
             as_index: pd.IntervalIndex = bins
             bins = tuple((bin.left.item(), bin.right.item()) for bin in bins)
             # To maintain consistency with pandas' behavior
             right = True
         elif len(list(bins)) == 0:
-            raise ValueError("`bins` iterable should have at least one item")
+            as_index = pd.IntervalIndex.from_tuples(list(bins))
+            bins = tuple()
         elif isinstance(list(bins)[0], tuple):
             as_index = pd.IntervalIndex.from_tuples(list(bins))
             bins = tuple(bins)
             # To maintain consistency with pandas' behavior
             right = True
         elif pd.api.types.is_number(list(bins)[0]):
             bins_list = list(bins)
-            if len(bins_list) < 2:
-                raise ValueError(
-                    "`bins` iterable of numeric breaks should have"
-                    " at least two items"
-                )
             as_index = pd.IntervalIndex.from_breaks(bins_list)
             single_type = all([isinstance(n, type(bins_list[0])) for n in bins_list])
             numeric_type = type(bins_list[0]) if single_type else float
@@ -77,21 +78,20 @@ def cut(
                 ]
             )
         else:
-            raise ValueError("`bins` iterable should contain tuples or numerics")
+            raise ValueError("`bins` iterable should contain tuples or numerics.")
 
         if as_index.is_overlapping:
             raise ValueError("Overlapping IntervalIndex is not accepted.")
-
-    if labels is not None and labels is not False:
-        raise NotImplementedError(
-            "The 'labels' parameter must be either False or None. "
-            "Please provide a valid value for 'labels'."
-        )
-
-    return x._apply_window_op(
-        agg_ops.CutOp(bins, right=right, labels=labels),
-        window_spec=window_specs.unbound(),
-    )
+        elif len(as_index) == 0:
+            op = agg_ops.CutOp(bins, right=right, labels=labels)
+            return bigframes.series.Series(
+                [pd.NA] * len(x), dtype=op.output_type(), name=x.name
+            )
+        else:
+            op = agg_ops.CutOp(bins, right=right, labels=labels)
+            return x._apply_window_op(op, window_spec=window_specs.unbound())
+    else:
+        raise ValueError("`bins` must be an integer or interable.")
 
 
 cut.__doc__ = vendored_pandas_tile.cut.__doc__
diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py
@@ -351,11 +351,12 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
             return dtypes.INT_DTYPE
         else:
             # Assumption: buckets use same numeric type
-            interval_dtype = (
-                pa.float64()
-                if isinstance(self.bins, int)
-                else dtypes.infer_literal_arrow_type(list(self.bins)[0][0])
-            )
+            if isinstance(self.bins, int):
+                interval_dtype = pa.float64()
+            elif len(list(self.bins)) == 0:
+                interval_dtype = pa.int64()
+            else:
+                interval_dtype = dtypes.infer_literal_arrow_type(list(self.bins)[0][0])
             pa_type = pa.struct(
                 [
                     pa.field(
diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py
@@ -387,6 +387,31 @@ def test_merge_series(scalars_dfs, merge_how):
     assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
 
 
+def _convert_pandas_category(pd_s: pd.Series):
+    if not isinstance(pd_s.dtype, pd.CategoricalDtype):
+        raise ValueError("Input must be a pandas Series with categorical data.")
+
+    if len(pd_s.dtype.categories) == 0:
+        return pd.Series([pd.NA] * len(pd_s), name=pd_s.name)
+
+    pd_interval: pd.IntervalIndex = pd_s.cat.categories[pd_s.cat.codes]  # type: ignore
+    if pd_interval.closed == "left":
+        left_key = "left_inclusive"
+        right_key = "right_exclusive"
+    else:
+        left_key = "left_exclusive"
+        right_key = "right_inclusive"
+    return pd.Series(
+        [
+            {left_key: interval.left, right_key: interval.right}
+            if pd.notna(val)
+            else pd.NA
+            for val, interval in zip(pd_s, pd_interval)
+        ],
+        name=pd_s.name,
+    )
+
+
 @pytest.mark.parametrize(
     ("right"),
     [
@@ -420,23 +445,7 @@ def test_cut_default_labels(scalars_dfs, right):
     bf_result = bpd.cut(scalars_df["float64_col"], 5, right=right).to_pandas()
 
     # Convert to match data format
-    pd_interval = pd_result.cat.categories[pd_result.cat.codes]
-    if pd_interval.closed == "left":
-        left_key = "left_inclusive"
-        right_key = "right_exclusive"
-    else:
-        left_key = "left_exclusive"
-        right_key = "right_inclusive"
-    pd_result_converted = pd.Series(
-        [
-            {left_key: interval.left, right_key: interval.right}
-            if pd.notna(val)
-            else pd.NA
-            for val, interval in zip(pd_result, pd_interval)
-        ],
-        name=pd_result.name,
-    )
-
+    pd_result_converted = _convert_pandas_category(pd_result)
     pd.testing.assert_series_equal(
         bf_result, pd_result_converted, check_index=False, check_dtype=False
     )
@@ -458,47 +467,36 @@ def test_cut_numeric_breaks(scalars_dfs, breaks, right):
     bf_result = bpd.cut(scalars_df["float64_col"], breaks, right=right).to_pandas()
 
     # Convert to match data format
-    pd_interval = pd_result.cat.categories[pd_result.cat.codes]
-    if pd_interval.closed == "left":
-        left_key = "left_inclusive"
-        right_key = "right_exclusive"
-    else:
-        left_key = "left_exclusive"
-        right_key = "right_inclusive"
-
-    pd_result_converted = pd.Series(
-        [
-            {left_key: interval.left, right_key: interval.right}
-            if pd.notna(val)
-            else pd.NA
-            for val, interval in zip(pd_result, pd_interval)
-        ],
-        name=pd_result.name,
-    )
+    pd_result_converted = _convert_pandas_category(pd_result)
 
     pd.testing.assert_series_equal(
         bf_result, pd_result_converted, check_index=False, check_dtype=False
     )
 
 
 @pytest.mark.parametrize(
-    ("bins",),
+    "bins",
     [
-        (-1,),  # negative integer bins argument
-        ([],),  # empty iterable of bins
-        (["notabreak"],),  # iterable of wrong type
-        ([1],),  # numeric breaks with only one numeric
-        # this is supported by pandas but not by
-        # the bigquery operation and a bigframes workaround
-        # is not yet available. Should return column
-        # of structs with all NaN values.
+        pytest.param([], id="empty_list"),
+        pytest.param(
+            [1], id="single_int_list", marks=pytest.mark.skip(reason="b/404338651")
+        ),
+        pytest.param(pd.IntervalIndex.from_tuples([]), id="empty_interval_index"),
     ],
 )
-def test_cut_errors(scalars_dfs, bins):
-    scalars_df, _ = scalars_dfs
+def test_cut_w_edge_cases(scalars_dfs, bins):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = bpd.cut(scalars_df["int64_too"], bins, labels=False).to_pandas()
+    if isinstance(bins, list):
+        bins = pd.IntervalIndex.from_tuples(bins)
+    pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=False)
+
+    # Convert to match data format
+    pd_result_converted = _convert_pandas_category(pd_result)
 
-    with pytest.raises(ValueError):
-        bpd.cut(scalars_df["float64_col"], bins)
+    pd.testing.assert_series_equal(
+        bf_result, pd_result_converted, check_index=False, check_dtype=False
+    )
 
 
 @pytest.mark.parametrize(
@@ -529,23 +527,7 @@ def test_cut_with_interval(scalars_dfs, bins, right):
     pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=False, right=right)
 
     # Convert to match data format
-    pd_interval = pd_result.cat.categories[pd_result.cat.codes]
-    if pd_interval.closed == "left":
-        left_key = "left_inclusive"
-        right_key = "right_exclusive"
-    else:
-        left_key = "left_exclusive"
-        right_key = "right_inclusive"
-
-    pd_result_converted = pd.Series(
-        [
-            {left_key: interval.left, right_key: interval.right}
-            if pd.notna(val)
-            else pd.NA
-            for val, interval in zip(pd_result, pd_interval)
-        ],
-        name=pd_result.name,
-    )
+    pd_result_converted = _convert_pandas_category(pd_result)
 
     pd.testing.assert_series_equal(
         bf_result, pd_result_converted, check_index=False, check_dtype=False
diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py
@@ -101,14 +101,20 @@ def test_cut_raises_with_labels():
 
 
 @pytest.mark.parametrize(
-    ("bins",),
-    (
-        (0,),
-        (-1,),
-    ),
+    ("bins", "error_message"),
+    [
+        pytest.param(1.5, "`bins` must be an integer or interable.", id="float"),
+        pytest.param(0, "`bins` should be a positive integer.", id="zero_int"),
+        pytest.param(-1, "`bins` should be a positive integer.", id="neg_int"),
+        pytest.param(
+            ["notabreak"],
+            "`bins` iterable should contain tuples or numerics",
+            id="iterable_w_wrong_type",
+        ),
+    ],
 )
-def test_cut_raises_with_invalid_bins(bins: int):
-    with pytest.raises(ValueError, match="`bins` should be a positive integer."):
+def test_cut_raises_with_invalid_bins(bins: int, error_message: str):
+    with pytest.raises(ValueError, match=error_message):
         mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True)
         bigframes.pandas.cut(mock_series, bins, labels=False)