From 19167c2189463cd2529009fa6eb23636b0a6a3fe Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 23 Feb 2024 12:07:39 -0800 Subject: [PATCH 1/5] PERF: Return a RangeIndex from __getitem__ with a bool mask when possible --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/indexes/range.py | 10 ++++++ pandas/tests/indexes/ranges/test_range.py | 38 +++++++++++++++++++++++ 3 files changed, 49 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1f9f5e85a6c4b..35fdbbd43813a 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -175,6 +175,7 @@ Performance improvements - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`) +- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`?`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`?``) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 8a2e3fbf500a4..2122bc5214842 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -42,6 +42,7 @@ from pandas.core import ops import pandas.core.common as com from pandas.core.construction import extract_array +from pandas.core.indexers import check_array_indexer import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( Index, @@ -1048,6 +1049,15 @@ def __getitem__(self, key): "and integer or boolean " "arrays are valid indices" ) + elif com.is_bool_indexer(key): + np_key = np.asarray(key, dtype=bool) + check_array_indexer(self._range, np_key) + # Short circuit potential _shallow_copy check + if np_key.all(): + return self._simple_new(self._range, name=self.name) + elif not np_key.any(): + return self._simple_new(_empty_range, name=self.name) + return self.take(np.flatnonzero(np_key)) return super().__getitem__(key) def _getitem_slice(self, slobj: slice) -> Self: diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index d500687763a1e..8b4b7a5d70ee4 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -623,3 +623,41 @@ def test_append_one_nonempty_preserve_step(): expected = RangeIndex(0, -1, -1) result = RangeIndex(0).append([expected]) tm.assert_index_equal(result, expected, exact=True) + + +def test_getitem_boolmask_all_true(): + ri = RangeIndex(3, name="foo") + expected = ri.copy() + result = ri[[True] * 3] + tm.assert_index_equal(result, expected, exact=True) + + +def test_getitem_boolmask_all_false(): + ri = RangeIndex(3, name="foo") + result = ri[[False] * 3] + expected = RangeIndex(0, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + +def test_getitem_boolmask_returns_rangeindex(): + ri = RangeIndex(3, name="foo") + result = ri[[False, True, True]] + expected = RangeIndex(1, 3, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + result = ri[[True, False, True]] + expected = RangeIndex(0, 3, 2, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + +def test_getitem_boolmask_returns_index(): + ri = RangeIndex(4, name="foo") + result = ri[[True, True, False, True]] + expected = Index([0, 1, 3], name="foo") + tm.assert_index_equal(result, expected) + + +def test_getitem_boolmask_wrong_length(): + ri = RangeIndex(4, name="foo") + with pytest.raises(IndexError, match="Boolean index has wrong length"): + ri[[True]] From 01823a036698dded434daa5d0eb3710638d44d0a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 23 Feb 2024 12:09:18 -0800 Subject: [PATCH 2/5] Add whatsnew --- doc/source/whatsnew/v3.0.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 35fdbbd43813a..6a71479def1d4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -166,6 +166,7 @@ Removal of prior version deprecations/changes Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`) - Performance improvement in :meth:`DataFrame.join` for sorted but non-unique indexes (:issue:`56941`) - Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`) - Performance improvement in :meth:`DataFrame.join` with ``how="left"`` or ``how="right"`` and ``sort=True`` (:issue:`56919`) @@ -173,11 +174,10 @@ Performance improvements - Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`) - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`) - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) +- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`) - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`) -- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`?`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) -- :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`?``) .. --------------------------------------------------------------------------- .. _whatsnew_300.bug_fixes: From f1e2d4ea108a1b1b76d80c5ce15e2580fd390ace Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 23 Feb 2024 12:53:05 -0800 Subject: [PATCH 3/5] Handle EA types --- pandas/core/indexes/range.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 2122bc5214842..e307da4b8f12e 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -29,6 +29,7 @@ doc, ) +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( ensure_platform_int, ensure_python_int, @@ -1050,7 +1051,10 @@ def __getitem__(self, key): "arrays are valid indices" ) elif com.is_bool_indexer(key): - np_key = np.asarray(key, dtype=bool) + if isinstance(getattr(key, "dtype", None), ExtensionDtype): + np_key = key.to_numpy(dtype=bool, na_value=False) + else: + np_key = np.asarray(key, dtype=bool) check_array_indexer(self._range, np_key) # Short circuit potential _shallow_copy check if np_key.all(): From 61d1e3b70be48f5b3fa6286ab20b04cc60afd727 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 23 Feb 2024 14:12:41 -0800 Subject: [PATCH 4/5] force to Index --- pandas/tests/io/pytables/test_append.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index aff08e5ad3882..529d6d789596f 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -132,6 +132,8 @@ def test_append_series(setup_path): # select on the index and values expected = ns[(ns > 70) & (ns.index < 90)] + # Reading/writing RangeIndex info is not supported yet + expected.index = Index(expected.index._data) result = store.select("ns", "foo>70 and index<90") tm.assert_series_equal(result, expected, check_index_type=True) From 62a862d5388db7ee9a80b9a6d75d3f061062d149 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 23 Feb 2024 16:12:21 -0800 Subject: [PATCH 5/5] Type ignore --- pandas/core/indexes/range.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index e307da4b8f12e..c5036a2b32967 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1055,7 +1055,7 @@ def __getitem__(self, key): np_key = key.to_numpy(dtype=bool, na_value=False) else: np_key = np.asarray(key, dtype=bool) - check_array_indexer(self._range, np_key) + check_array_indexer(self._range, np_key) # type: ignore[arg-type] # Short circuit potential _shallow_copy check if np_key.all(): return self._simple_new(self._range, name=self.name)