feat: implement Index.get_loc (#1921)

shuoweil · web-flow · commit bbbcaf35df11 · 2025-07-24T11:25:34.000-07:00
* feat: add index get_loc API

* update docstring

* code update

* final polish of the helper function

* fix mypy

* reset index of result

* change docstring

* fix docstring

* change a function call
diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py
@@ -27,16 +27,21 @@
 import pandas
 
 from bigframes import dtypes
+from bigframes.core.array_value import ArrayValue
 import bigframes.core.block_transforms as block_ops
 import bigframes.core.blocks as blocks
 import bigframes.core.expression as ex
+import bigframes.core.identifiers as ids
+import bigframes.core.nodes as nodes
 import bigframes.core.ordering as order
 import bigframes.core.utils as utils
 import bigframes.core.validations as validations
+import bigframes.core.window_spec as window_spec
 import bigframes.dtypes
 import bigframes.formatting_helpers as formatter
 import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
+import bigframes.series
 
 if typing.TYPE_CHECKING:
     import bigframes.dataframe
@@ -247,6 +252,134 @@ def query_job(self) -> bigquery.QueryJob:
             self._query_job = query_job
         return self._query_job
 
+    def get_loc(self, key) -> typing.Union[int, slice, "bigframes.series.Series"]:
+        """Get integer location, slice or boolean mask for requested label.
+
+        Every row is tagged with its row number, the rows whose label equals
+        ``key`` are kept, and one aggregation query reports how many rows
+        matched together with the first and last matching position.
+
+        Args:
+            key:
+                The label to search for in the index.
+
+        Returns:
+            An integer position when exactly one row matches. When several
+            rows match, a slice from the first match through the last match
+            if the index is monotonic (increasing or decreasing); otherwise a
+            boolean Series, indexed by position, marking the matching rows.
+
+        Raises:
+            NotImplementedError: If the index has more than one level.
+            KeyError: If the key is not found in the index.
+        """
+        if self.nlevels != 1:
+            raise NotImplementedError("get_loc only supports single-level indexes")
+
+        # Get the index column from the block
+        index_column = self._block.index_columns[0]
+
+        # Apply row numbering to the original data
+        row_number_column_id = ids.ColumnId.unique()
+        window_node = nodes.WindowOpNode(
+            child=self._block._expr.node,
+            expression=ex.NullaryAggregation(agg_ops.RowNumberOp()),
+            window_spec=window_spec.unbound(),
+            output_name=row_number_column_id,
+            never_skip_nulls=True,
+        )
+
+        windowed_array = ArrayValue(window_node)
+        windowed_block = blocks.Block(
+            windowed_array,
+            index_columns=self._block.index_columns,
+            # One extra unnamed label at the end for the row-number column.
+            column_labels=self._block.column_labels.insert(
+                len(self._block.column_labels), None
+            ),
+            index_labels=self._block._index_labels,
+        )
+
+        # Create expression to find matching positions
+        match_expr = ops.eq_op.as_expr(ex.deref(index_column), ex.const(key))
+        windowed_block, match_col_id = windowed_block.project_expr(match_expr)
+
+        # Filter to only rows where the key matches
+        filtered_block = windowed_block.filter_by_id(match_col_id)
+
+        # A single query returns the match count and the first/last positions,
+        # so the integer and slice results need no further round trips.
+        summary = self._get_match_summary(filtered_block, row_number_column_id)
+        if summary is None:
+            raise KeyError(f"'{key}' is not in index")
+        match_count, first_pos, last_pos = summary
+
+        # If only one match, return integer position
+        if match_count == 1:
+            return first_pos
+
+        # Handle multiple matches based on index monotonicity
+        if self.is_monotonic_increasing or self.is_monotonic_decreasing:
+            # Create slice (stop is exclusive)
+            return slice(first_pos, last_pos + 1)
+
+        # Return boolean mask for non-monotonic duplicates
+        mask_block = windowed_block.select_columns([match_col_id])
+        # Reset the index to use positional integers instead of original index values
+        mask_block = mask_block.reset_index(drop=True)
+        # Ensure correct dtype and name to match pandas behavior
+        result_series = bigframes.series.Series(mask_block)
+        return result_series.astype("boolean")
+
+    def _get_match_summary(
+        self, filtered_block, row_number_column_id: "ids.ColumnId"
+    ) -> typing.Optional[typing.Tuple[int, int, int]]:
+        """Summarize the matching rows with one aggregation query.
+
+        Args:
+            filtered_block:
+                Block holding only the rows whose index label matched the key.
+            row_number_column_id:
+                Identifier of the column carrying each row's position.
+
+        Returns:
+            ``None`` when no row matched; otherwise a tuple of the number of
+            matches, the smallest matching position and the largest one.
+        """
+        position = ex.deref(row_number_column_id.name)
+        summary_aggs = [
+            (ex.UnaryAggregation(agg_ops.count_op, position), "match_count"),
+            (ex.UnaryAggregation(agg_ops.min_op, position), "min_pos"),
+            (ex.UnaryAggregation(agg_ops.max_op, position), "max_pos"),
+        ]
+        summary_expr = filtered_block._expr.aggregate(summary_aggs)
+
+        # Execute query and extract the single summary row
+        summary_df = self._block.session._executor.execute(summary_expr).to_pandas()
+        match_count = int(summary_df["match_count"].iloc[0])
+        if match_count == 0:
+            # min/max have nothing to aggregate here, so skip converting them.
+            return None
+        min_pos = int(summary_df["min_pos"].iloc[0])
+        max_pos = int(summary_df["max_pos"].iloc[0])
+        return match_count, min_pos, max_pos
+
+    def _get_monotonic_slice(
+        self, filtered_block, row_number_column_id: "ids.ColumnId"
+    ) -> slice:
+        """Build the slice spanning the matching rows of a monotonic index.
+
+        Raises:
+            ValueError: If ``filtered_block`` contains no rows.
+        """
+        summary = self._get_match_summary(filtered_block, row_number_column_id)
+        if summary is None:
+            raise ValueError("filtered_block has no rows to build a slice from")
+        _, min_pos, max_pos = summary
+
+        # Create slice (stop is exclusive)
+        return slice(min_pos, max_pos + 1)
+
     def __repr__(self) -> str:
         # Protect against errors with uninitialized Series. See:
         # https://github.com/googleapis/python-bigquery-dataframes/issues/728
diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py
@@ -32,6 +32,106 @@ def test_index_construct_from_list():
     pd.testing.assert_index_equal(bf_result, pd_result)
 
 
+@pytest.mark.parametrize("key, expected_loc", [("a", 0), ("b", 1), ("c", 2)])
+def test_get_loc_should_return_int_for_unique_index(key, expected_loc):
+    """Behavior: get_loc on a unique index returns an integer position."""
+    # The pandas result is used as the known-correct value.
+    # We assert our implementation matches it and the expected type.
+    bf_index = bpd.Index(["a", "b", "c"])
+    pd_index = pd.Index(["a", "b", "c"])
+
+    result = bf_index.get_loc(key)
+
+    assert result == expected_loc
+    assert result == pd_index.get_loc(key)
+    assert isinstance(result, int)
+
+
+def test_get_loc_should_return_slice_for_monotonic_duplicates():
+    """Behavior: get_loc on a monotonic string index with duplicates returns a slice."""
+    bf_index = bpd.Index(["a", "b", "b", "c"])
+    pd_index = pd.Index(["a", "b", "b", "c"])
+
+    bf_result = bf_index.get_loc("b")
+    pd_result = pd_index.get_loc("b")
+
+    assert isinstance(bf_result, slice)
+    assert bf_result == pd_result  # Should be slice(1, 3, None)
+
+
+def test_get_loc_should_return_slice_for_monotonic_numeric_duplicates():
+    """Behavior: get_loc on a monotonic numeric index with duplicates returns a slice."""
+    bf_index = bpd.Index([1, 2, 2, 3])
+    pd_index = pd.Index([1, 2, 2, 3])
+
+    bf_result = bf_index.get_loc(2)
+    pd_result = pd_index.get_loc(2)
+
+    assert isinstance(bf_result, slice)
+    assert bf_result == pd_result  # Should be slice(1, 3, None)
+
+
+def test_get_loc_should_return_mask_for_non_monotonic_duplicates():
+    """Behavior: get_loc on a non-monotonic string index returns a boolean mask."""
+    bf_index = bpd.Index(["a", "b", "c", "b"])
+    pd_index = pd.Index(["a", "b", "c", "b"])
+
+    pd_result = pd_index.get_loc("b")
+    bf_result = bf_index.get_loc("b")
+
+    assert not isinstance(bf_result, (int, slice))
+
+    # get_loc returns a Series here, so convert it directly for comparison.
+    numpy.testing.assert_array_equal(bf_result.to_numpy(), pd_result)
+
+
+def test_get_loc_should_return_mask_for_non_monotonic_numeric_duplicates():
+    """Behavior: get_loc on a non-monotonic numeric index returns a boolean mask."""
+    bf_index = bpd.Index([1, 2, 3, 2])
+    pd_index = pd.Index([1, 2, 3, 2])
+
+    pd_result = pd_index.get_loc(2)
+    bf_result = bf_index.get_loc(2)
+
+    assert not isinstance(bf_result, (int, slice))
+
+    # get_loc returns a Series here, so convert it directly for comparison.
+    numpy.testing.assert_array_equal(bf_result.to_numpy(), pd_result)
+
+
+def test_get_loc_should_raise_error_for_missing_key():
+    """Behavior: get_loc raises KeyError when a string key is not found."""
+    bf_index = bpd.Index(["a", "b", "c"])
+
+    with pytest.raises(KeyError):
+        bf_index.get_loc("d")
+
+
+def test_get_loc_should_raise_error_for_missing_numeric_key():
+    """Behavior: get_loc raises KeyError when a numeric key is not found."""
+    bf_index = bpd.Index([1, 2, 3])
+
+    with pytest.raises(KeyError):
+        bf_index.get_loc(4)
+
+
+def test_get_loc_should_work_for_single_element_index():
+    """Behavior: get_loc on a single-element index returns 0."""
+    assert bpd.Index(["a"]).get_loc("a") == pd.Index(["a"]).get_loc("a")
+
+
+def test_get_loc_should_return_slice_when_all_elements_are_duplicates():
+    """Behavior: get_loc returns a full slice if all elements match the key."""
+    bf_index = bpd.Index(["a", "a", "a"])
+    pd_index = pd.Index(["a", "a", "a"])
+
+    bf_result = bf_index.get_loc("a")
+    pd_result = pd_index.get_loc("a")
+
+    assert isinstance(bf_result, slice)
+    assert bf_result == pd_result  # Should be slice(0, 3, None)
+
+
 def test_index_construct_from_series():
     bf_result = bpd.Index(
         bpd.Series([3, 14, 159], dtype=pd.Float64Dtype(), name="series_name"),
diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py
@@ -4,6 +4,7 @@
 from collections.abc import Hashable
 import typing
 
+import bigframes
 from bigframes import constants
 
 
@@ -741,6 +742,48 @@ def argmin(self) -> int:
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def get_loc(
+        self, key: typing.Any
+    ) -> typing.Union[int, slice, "bigframes.series.Series"]:
+        """
+        Get integer location, slice or boolean mask for requested label.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> unique_index = bpd.Index(list('abc'))
+            >>> unique_index.get_loc('b')
+            1
+
+            >>> monotonic_index = bpd.Index(list('abbc'))
+            >>> monotonic_index.get_loc('b')
+            slice(1, 3, None)
+
+            >>> non_monotonic_index = bpd.Index(list('abcb'))
+            >>> non_monotonic_index.get_loc('b')
+            0    False
+            1     True
+            2    False
+            3     True
+            Name: nan, dtype: boolean
+
+        Args:
+            key: Label to get the location for.
+
+        Returns:
+            Union[int, slice, bigframes.pandas.Series]:
+                Integer position of the label for unique indexes.
+                Slice object for monotonic indexes with duplicates.
+                Boolean Series mask for non-monotonic indexes with duplicates.
+
+        Raises:
+            NotImplementedError: If the index has more than one level.
+            KeyError: If the key is not found in the index.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def argmax(self) -> int:
         """
         Return int position of the largest value in the Series.