From 3be332ca695990ec3fd436c95b524e53c64ac914 Mon Sep 17 00:00:00 2001
From: Nitish Satyavolu
Date: Thu, 20 Feb 2025 23:40:46 -0800
Subject: [PATCH 01/27] BUG: Revert SQLAlchemy minimum version back to 1.4.36
---
ci/deps/actions-310-minimum_versions.yaml | 2 +-
ci/deps/actions-310.yaml | 2 +-
ci/deps/actions-311-downstream_compat.yaml | 2 +-
ci/deps/actions-311.yaml | 2 +-
ci/deps/actions-312.yaml | 2 +-
doc/source/getting_started/install.rst | 2 +-
environment.yml | 2 +-
pandas/compat/_optional.py | 2 +-
pyproject.toml | 2 +-
requirements-dev.txt | 2 +-
10 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml
index c7c72828db481..2aadf42a510eb 100644
--- a/ci/deps/actions-310-minimum_versions.yaml
+++ b/ci/deps/actions-310-minimum_versions.yaml
@@ -52,7 +52,7 @@ dependencies:
- pyxlsb=1.0.10
- s3fs=2022.11.0
- scipy=1.10.0
- - sqlalchemy=2.0.0
+ - sqlalchemy=1.4.36
- tabulate=0.9.0
- xarray=2022.12.0
- xlrd=2.0.1
diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml
index 74cab4e0970dc..5688d3143e621 100644
--- a/ci/deps/actions-310.yaml
+++ b/ci/deps/actions-310.yaml
@@ -50,7 +50,7 @@ dependencies:
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
- - sqlalchemy>=2.0.0
+ - sqlalchemy>=1.4.36
- tabulate>=0.9.0
- xarray>=2022.12.0, <=2024.9.0
- xlrd>=2.0.1
diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml
index 092ca18d61259..7713ae0232623 100644
--- a/ci/deps/actions-311-downstream_compat.yaml
+++ b/ci/deps/actions-311-downstream_compat.yaml
@@ -51,7 +51,7 @@ dependencies:
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
- - sqlalchemy>=2.0.0
+ - sqlalchemy>=1.4.36
- tabulate>=0.9.0
- xarray>=2022.12.0, <=2024.9.0
- xlrd>=2.0.1
diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml
index b6f515dceaea9..c160eae364ba2 100644
--- a/ci/deps/actions-311.yaml
+++ b/ci/deps/actions-311.yaml
@@ -50,7 +50,7 @@ dependencies:
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
- - sqlalchemy>=2.0.0
+ - sqlalchemy>=1.4.36
- tabulate>=0.9.0
- xarray>=2022.12.0, <=2024.9.0
- xlrd>=2.0.1
diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml
index bc66f8a5382c9..034653d207c0b 100644
--- a/ci/deps/actions-312.yaml
+++ b/ci/deps/actions-312.yaml
@@ -50,7 +50,7 @@ dependencies:
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
- - sqlalchemy>=2.0.0
+ - sqlalchemy>=1.4.36
- tabulate>=0.9.0
- xarray>=2022.12.0, <=2024.9.0
- xlrd>=2.0.1
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index bda959f380e8a..5d11e9574091e 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -287,7 +287,7 @@ Traditional drivers are installable with ``pip install "pandas[postgresql, mysql
================================================================== ================== =============== ============================================
Dependency Minimum Version pip extra Notes
================================================================== ================== =============== ============================================
-`SQLAlchemy `__ 2.0.0 postgresql, SQL support for databases other than sqlite
+`SQLAlchemy `__ 1.4.36 postgresql, SQL support for databases other than sqlite
mysql,
sql-other
`psycopg2 `__ 2.9.6 postgresql PostgreSQL engine for sqlalchemy
diff --git a/environment.yml b/environment.yml
index ca8f1996c61cf..8faf27e1dad82 100644
--- a/environment.yml
+++ b/environment.yml
@@ -54,7 +54,7 @@ dependencies:
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
- - sqlalchemy>=2.0.0
+ - sqlalchemy>=1.4.36
- tabulate>=0.9.0
- xarray>=2022.12.0, <=2024.9.0
- xlrd>=2.0.1
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index 6b90389a62056..1d5a68c8f5d8a 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -47,7 +47,7 @@
"pyxlsb": "1.0.10",
"s3fs": "2022.11.0",
"scipy": "1.10.0",
- "sqlalchemy": "2.0.0",
+ "sqlalchemy": "1.4.36",
"tables": "3.8.0",
"tabulate": "0.9.0",
"xarray": "2022.12.0",
diff --git a/pyproject.toml b/pyproject.toml
index b7d53b0d8934a..b106431daf081 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -113,7 +113,7 @@ all = ['adbc-driver-postgresql>=0.10.0',
'qtpy>=2.3.0',
'scipy>=1.10.0',
's3fs>=2022.11.0',
- 'SQLAlchemy>=2.0.0',
+ 'SQLAlchemy>=1.4.36',
'tables>=3.8.0',
'tabulate>=0.9.0',
'xarray>=2022.12.0',
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 20fc21be75a06..7d2870de27cfa 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -43,7 +43,7 @@ pytz>=2023.4
pyxlsb>=1.0.10
s3fs>=2022.11.0
scipy>=1.10.0
-SQLAlchemy>=2.0.0
+SQLAlchemy>=1.4.36
tabulate>=0.9.0
xarray>=2022.12.0, <=2024.9.0
xlrd>=2.0.1
From 25f2fb5127bd9c61331952fb22544614f6bbd492 Mon Sep 17 00:00:00 2001
From: Nitish Satyavolu
Date: Fri, 21 Feb 2025 11:58:15 -0800
Subject: [PATCH 02/27] Update pyproject.toml
---
pyproject.toml | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index b106431daf081..0b4d85c0a62f1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -72,9 +72,9 @@ hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/i
#'blosc>=1.20.1',
'tables>=3.8.0']
spss = ['pyreadstat>=1.2.0']
-postgresql = ['SQLAlchemy>=2.0.0', 'psycopg2>=2.9.6', 'adbc-driver-postgresql>=0.10.0']
-mysql = ['SQLAlchemy>=2.0.0', 'pymysql>=1.0.2']
-sql-other = ['SQLAlchemy>=2.0.0', 'adbc-driver-postgresql>=0.10.0', 'adbc-driver-sqlite>=0.8.0']
+postgresql = ['SQLAlchemy>=1.4.36', 'psycopg2>=2.9.6', 'adbc-driver-postgresql>=0.10.0']
+mysql = ['SQLAlchemy>=1.4.36', 'pymysql>=1.0.2']
+sql-other = ['SQLAlchemy>=1.4.36', 'adbc-driver-postgresql>=0.10.0', 'adbc-driver-sqlite>=0.8.0']
html = ['beautifulsoup4>=4.11.2', 'html5lib>=1.1', 'lxml>=4.9.2']
xml = ['lxml>=4.9.2']
plot = ['matplotlib>=3.6.3']
From 1e50cc39cef6b2a40055185a59c448cf93db7cc8 Mon Sep 17 00:00:00 2001
From: "snitish.iitk@gmail.com"
Date: Wed, 26 Feb 2025 13:20:00 -0600
Subject: [PATCH 03/27] Merge against master
---
doc/source/whatsnew/v2.3.0.rst | 1 +
1 file changed, 1 insertion(+)
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
index 230332319e0ac..d830dd8277ea9 100644
--- a/doc/source/whatsnew/v2.3.0.rst
+++ b/doc/source/whatsnew/v2.3.0.rst
@@ -38,6 +38,7 @@ Other enhancements
- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`)
- Improved ``repr`` of :class:`.NumpyExtensionArray` to account for NEP51 (:issue:`61085`)
+- Reverted the minimum version for the ``sqlalchemy`` optional dependency back to ``1.4.36`` (:issue:`57049`)
- The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`)
- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`)
- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
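Note: the revert in patches 01-03 only loosens the dependency floor; pandas' SQL I/O path itself is unchanged. As a quick sanity check (a minimal sketch, not part of the patch, assuming a SQLAlchemy 1.4-series install), the usual round trip works the same way:

    import pandas as pd
    from sqlalchemy import create_engine

    engine = create_engine("sqlite://")  # in-memory database, illustration only
    df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})
    df.to_sql("demo", engine, index=False)
    print(pd.read_sql("SELECT * FROM demo", engine))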
From 993f6a9302158d64c6a7627603dc4c54bf5fb3c9 Mon Sep 17 00:00:00 2001
From: "Christine P. Chai"
Date: Mon, 7 Apr 2025 09:35:30 -0700
Subject: [PATCH 04/27] DOC Update the awkward-pandas GitHub link (#61241)
---
web/pandas/community/ecosystem.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
index c6dddd5c2ef9f..4863093387f37 100644
--- a/web/pandas/community/ecosystem.md
+++ b/web/pandas/community/ecosystem.md
@@ -655,7 +655,7 @@ Pandas provides an interface for defining
The following libraries implement that interface to provide types not found in NumPy or pandas,
which work well with pandas' data containers.
-### [awkward-pandas](https://awkward-pandas.readthedocs.io/)
+### [awkward-pandas](https://github.com/scikit-hep/awkward)
Awkward-pandas provides an extension type for storing [Awkward
Arrays](https://awkward-array.org/) inside pandas' Series and
From f6760921a00c43251612afa6b7ce1ea686f8ffba Mon Sep 17 00:00:00 2001
From: Matthieu Thiboust <14574229+mthiboust@users.noreply.github.com>
Date: Mon, 7 Apr 2025 18:55:09 +0200
Subject: [PATCH 05/27] BUG: Fix #61222: Keep index name when resampling with
pyarrow dtype (#61229)
* BUG: Fix pandas-dev#61222: Keep index name when resampling with pyarrow dtype
* Update doc/source/whatsnew/v3.0.0.rst
---------
Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
doc/source/whatsnew/v3.0.0.rst | 1 +
pandas/core/resample.py | 1 +
pandas/tests/resample/test_datetime_index.py | 10 ++++++++++
3 files changed, 12 insertions(+)
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index e6fafc8b1b14c..29be9a7341f00 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -773,6 +773,7 @@ Groupby/resample/rolling
- Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
- Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
- Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
+- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` were not keeping the index name when the index had :class:`ArrowDtype` timestamp dtype (:issue:`61222`)
- Bug in :meth:`DataFrame.resample` changing index type to :class:`MultiIndex` when the dataframe is empty and using an upsample method (:issue:`55572`)
- Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`)
- Bug in :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` for empty data frame with ``group_keys=False`` still creating output index using group keys. (:issue:`60471`)
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 753f7fb6cea1a..08e3beef99e60 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -518,6 +518,7 @@ def _wrap_result(self, result):
if self._timegrouper._arrow_dtype is not None:
result.index = result.index.astype(self._timegrouper._arrow_dtype)
+ result.index.name = self.obj.index.name
return result
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
index 3a7fd548ca961..f871c0bf0218c 100644
--- a/pandas/tests/resample/test_datetime_index.py
+++ b/pandas/tests/resample/test_datetime_index.py
@@ -2155,6 +2155,16 @@ def test_arrow_timestamp_resample(tz):
tm.assert_series_equal(result, expected)
+@td.skip_if_no("pyarrow")
+def test_arrow_timestamp_resample_keep_index_name():
+ # https://github.com/pandas-dev/pandas/issues/61222
+ idx = Series(date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]")
+ expected = Series(np.arange(5, dtype=np.float64), index=idx)
+ expected.index.name = "index_name"
+ result = expected.resample("1D").mean()
+ tm.assert_series_equal(result, expected)
+
+
@pytest.mark.parametrize("freq", ["1A", "2A-MAR"])
def test_resample_A_raises(freq):
msg = f"Invalid frequency: {freq[1:]}"
From 150be39092f7557cb3c68119dde4fe3f8bb89049 Mon Sep 17 00:00:00 2001
From: John Hendricks
Date: Mon, 7 Apr 2025 12:57:53 -0400
Subject: [PATCH 06/27] DOC: Added docstrings to min, max, and reso (#61238)
Added docstrings to min, max, and reso
Co-authored-by: John Hendricks
---
ci/code_checks.sh | 3 --
pandas/_libs/tslibs/timestamps.pyx | 84 +++++++++++++++++++++++++++---
2 files changed, 77 insertions(+), 10 deletions(-)
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 2c32eb4f0c584..a0d23aa0478d2 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -72,9 +72,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
-i "pandas.Period.freq GL08" \
-i "pandas.Period.ordinal GL08" \
- -i "pandas.Timestamp.max PR02" \
- -i "pandas.Timestamp.min PR02" \
- -i "pandas.Timestamp.resolution PR02" \
-i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
-i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
-i "pandas.core.resample.Resampler.quantile PR01,PR07" \
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index 23197b9a55afc..390267db8267f 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -200,8 +200,9 @@ class MinMaxReso:
See also: timedeltas.MinMaxReso
"""
- def __init__(self, name):
+ def __init__(self, name, docstring):
self._name = name
+ self.__doc__ = docstring
def __get__(self, obj, type=None):
cls = Timestamp
@@ -216,11 +217,15 @@ class MinMaxReso:
if obj is None:
# i.e. this is on the class, default to nanos
- return cls(val)
+ result = cls(val)
elif self._name == "resolution":
- return Timedelta._from_value_and_reso(val, obj._creso)
+ result = Timedelta._from_value_and_reso(val, obj._creso)
else:
- return Timestamp._from_value_and_reso(val, obj._creso, tz=None)
+ result = Timestamp._from_value_and_reso(val, obj._creso, tz=None)
+
+ result.__doc__ = self.__doc__
+
+ return result
def __set__(self, obj, value):
raise AttributeError(f"{self._name} is not settable.")
@@ -235,9 +240,74 @@ cdef class _Timestamp(ABCTimestamp):
dayofweek = _Timestamp.day_of_week
dayofyear = _Timestamp.day_of_year
- min = MinMaxReso("min")
- max = MinMaxReso("max")
- resolution = MinMaxReso("resolution") # GH#21336, GH#21365
+ _docstring_min = """
+ Returns the minimum bound possible for Timestamp.
+
+ This property provides access to the smallest possible value that
+ can be represented by a Timestamp object.
+
+ Returns
+ -------
+ Timestamp
+
+ See Also
+ --------
+ Timestamp.max: Returns the maximum bound possible for Timestamp.
+ Timestamp.resolution: Returns the smallest possible difference between
+ non-equal Timestamp objects.
+
+ Examples
+ --------
+ >>> pd.Timestamp.min
+ Timestamp('1677-09-21 00:12:43.145224193')
+ """
+
+ _docstring_max = """
+ Returns the maximum bound possible for Timestamp.
+
+ This property provides access to the largest possible value that
+ can be represented by a Timestamp object.
+
+ Returns
+ -------
+ Timestamp
+
+ See Also
+ --------
+ Timestamp.min: Returns the minimum bound possible for Timestamp.
+ Timestamp.resolution: Returns the smallest possible difference between
+ non-equal Timestamp objects.
+
+ Examples
+ --------
+ >>> pd.Timestamp.max
+ Timestamp('2262-04-11 23:47:16.854775807')
+ """
+
+ _docstring_reso = """
+ Returns the smallest possible difference between non-equal Timestamp objects.
+
+ The resolution value is determined by the underlying representation of time
+ units and is equivalent to Timedelta(nanoseconds=1).
+
+ Returns
+ -------
+ Timedelta
+
+ See Also
+ --------
+ Timestamp.max: Returns the maximum bound possible for Timestamp.
+ Timestamp.min: Returns the minimum bound possible for Timestamp.
+
+ Examples
+ --------
+ >>> pd.Timestamp.resolution
+ Timedelta('0 days 00:00:00.000000001')
+ """
+
+ min = MinMaxReso("min", _docstring_min)
+ max = MinMaxReso("max", _docstring_max)
+ resolution = MinMaxReso("resolution", _docstring_reso) # GH#21336, GH#21365
@property
def value(self) -> int:
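The mechanism behind this change is a descriptor that carries its own docstring, so class-level access (pd.Timestamp.min) and the documentation build both see the text supplied at definition time. A stripped-down sketch of that pattern (names here are illustrative, not pandas API):

    class _DocumentedConstant:
        """Descriptor returning a fixed value while exposing a custom __doc__."""

        def __init__(self, name, docstring):
            self._name = name
            self.__doc__ = docstring

        def __get__(self, obj, owner=None):
            # Both Example.low and Example().low resolve through the descriptor.
            return {"low": -1, "high": 1}[self._name]

    class Example:
        low = _DocumentedConstant("low", "Smallest supported value.")
        high = _DocumentedConstant("high", "Largest supported value.")

    print(Example.low, Example().high)      # -1 1
    print(Example.__dict__["low"].__doc__)  # Smallest supported value.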
From ea353ae01f21df32c19735e49064dee228de706e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 7 Apr 2025 14:21:32 -0700
Subject: [PATCH 07/27] STY: Bump pre-commit checks (#61246)
* Bump pre-commit version, bump clang-format and meson
* Fix type checking abbreviation
* Bump to 0.11.4
* Put minimum version at 4
* Change misc to arg-type
---
.pre-commit-config.yaml | 12 ++++++------
asv_bench/benchmarks/frame_methods.py | 2 +-
environment.yml | 2 +-
pandas/_libs/tslibs/timedeltas.pyi | 6 ++----
pandas/core/apply.py | 2 +-
pandas/core/arrays/string_arrow.py | 2 +-
pandas/core/generic.py | 2 +-
pandas/core/groupby/generic.py | 2 +-
pandas/core/groupby/groupby.py | 4 ++--
pandas/core/internals/blocks.py | 2 +-
pandas/core/internals/construction.py | 2 +-
pandas/core/internals/managers.py | 2 +-
pandas/io/formats/format.py | 2 +-
pandas/io/formats/style_render.py | 6 +++---
pandas/io/pytables.py | 6 +++---
pandas/io/sql.py | 6 +++---
pandas/tests/apply/test_frame_apply.py | 4 ++--
pandas/tests/dtypes/cast/test_maybe_box_native.py | 2 +-
pandas/tests/io/parser/test_na_values.py | 2 +-
pandas/tests/series/accessors/test_cat_accessor.py | 2 +-
pandas/tests/series/test_constructors.py | 2 +-
pyproject.toml | 8 +++++---
requirements-dev.txt | 2 +-
setup.py | 2 +-
24 files changed, 42 insertions(+), 42 deletions(-)
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 09bfda1755e03..5308c98e96937 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,4 +1,4 @@
-minimum_pre_commit_version: 2.15.0
+minimum_pre_commit_version: 4.0.0
exclude: ^LICENSES/|\.(html|csv|svg)$
# reserve "manual" for relatively slow hooks which we still want to run in CI
default_stages: [
@@ -19,13 +19,13 @@ ci:
skip: [pyright, mypy]
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.9.9
+ rev: v0.11.4
hooks:
- id: ruff
args: [--exit-non-zero-on-fix]
exclude: ^pandas/tests/frame/test_query_eval.py
- id: ruff
- # TODO: remove autofixe-only rules when they are checked by ruff
+ # TODO: remove autofix only rules when they are checked by ruff
name: ruff-selected-autofixes
alias: ruff-selected-autofixes
files: ^pandas
@@ -34,7 +34,7 @@ repos:
- id: ruff-format
exclude: ^scripts|^pandas/tests/frame/test_query_eval.py
- repo: https://github.com/jendrikseipp/vulture
- rev: 'v2.14'
+ rev: v2.14
hooks:
- id: vulture
entry: python scripts/run_vulture.py
@@ -95,14 +95,14 @@ repos:
- id: sphinx-lint
args: ["--enable", "all", "--disable", "line-too-long"]
- repo: https://github.com/pre-commit/mirrors-clang-format
- rev: v19.1.7
+ rev: v20.1.0
hooks:
- id: clang-format
files: ^pandas/_libs/src|^pandas/_libs/include
args: [-i]
types_or: [c, c++]
- repo: https://github.com/trim21/pre-commit-mirror-meson
- rev: v1.7.0
+ rev: v1.7.2
hooks:
- id: meson-fmt
args: ['--inplace']
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index 6a2ab24df26fe..cd7851acae3f2 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -517,7 +517,7 @@ def setup(self):
self.df = DataFrame(np.random.randn(1000, 100))
self.s = Series(np.arange(1028.0))
- self.df2 = DataFrame({i: self.s for i in range(1028)})
+ self.df2 = DataFrame(dict.fromkeys(range(1028), self.s))
self.df3 = DataFrame(np.random.randn(1000, 3), columns=list("ABC"))
def time_apply_user_func(self):
diff --git a/environment.yml b/environment.yml
index 8faf27e1dad82..704bf5d767b86 100644
--- a/environment.yml
+++ b/environment.yml
@@ -80,7 +80,7 @@ dependencies:
- flake8=7.1.0 # run in subprocess over docstring examples
- mypy=1.13.0 # pre-commit uses locally installed mypy
- tokenize-rt # scripts/check_for_inconsistent_pandas_namespace.py
- - pre-commit>=4.0.1
+ - pre-commit>=4.2.0
# documentation
- gitpython # obtain contributors from git for whatsnew
diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi
index 979a5666661b2..c885543b2fc6d 100644
--- a/pandas/_libs/tslibs/timedeltas.pyi
+++ b/pandas/_libs/tslibs/timedeltas.pyi
@@ -3,7 +3,6 @@ from typing import (
ClassVar,
Literal,
TypeAlias,
- TypeVar,
overload,
)
@@ -60,7 +59,6 @@ UnitChoices: TypeAlias = Literal[
"nanos",
"nanosecond",
]
-_S = TypeVar("_S", bound=timedelta)
def get_unit_for_round(freq, creso: int) -> int: ...
def disallow_ambiguous_unit(unit: str | None) -> None: ...
@@ -95,11 +93,11 @@ class Timedelta(timedelta):
_value: int # np.int64
# error: "__new__" must return a class instance (got "Union[Timestamp, NaTType]")
def __new__( # type: ignore[misc]
- cls: type[_S],
+ cls: type[Self],
value=...,
unit: str | None = ...,
**kwargs: float | np.integer | np.floating,
- ) -> _S | NaTType: ...
+ ) -> Self | NaTType: ...
@classmethod
def _from_value_and_reso(cls, value: np.int64, reso: int) -> Timedelta: ...
@property
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index da6124307e3f1..2c96f1ef020ac 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -327,7 +327,7 @@ def transform(self) -> DataFrame | Series:
if is_series:
func = {com.get_callable_name(v) or v: v for v in func}
else:
- func = {col: func for col in obj}
+ func = dict.fromkeys(obj, func)
if is_dict_like(func):
func = cast(AggFuncTypeDict, func)
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index d35083fd892a8..a39d64429d162 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -281,7 +281,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
]
# short-circuit to return all False array.
- if not len(value_set):
+ if not value_set:
return np.zeros(len(self), dtype=bool)
result = pc.is_in(
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 6a45ef9325bec..884107d4bc6af 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -9705,7 +9705,7 @@ def _where(
# CoW: Make sure reference is not kept alive
if cond.ndim == 1 and self.ndim == 2:
cond = cond._constructor_expanddim(
- {i: cond for i in range(len(self.columns))},
+ dict.fromkeys(range(len(self.columns)), cond),
copy=False,
)
cond.columns = self.columns
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 1251403db6ff3..b520ad69aae96 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -2505,7 +2505,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame:
)
results = [func(sgb) for sgb in sgbs]
- if not len(results):
+ if not results:
# concat would raise
res_df = DataFrame([], columns=columns, index=self._grouper.result_index)
else:
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index f9438b348c140..d31e50bbd311b 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -5175,8 +5175,8 @@ def diff(
shifted = shifted.astype("float32")
else:
to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32]
- if len(to_coerce):
- shifted = shifted.astype({c: "float32" for c in to_coerce})
+ if to_coerce:
+ shifted = shifted.astype(dict.fromkeys(to_coerce, "float32"))
return obj - shifted
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index dc64da35e9725..b846af1c83736 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -805,7 +805,7 @@ def replace_list(
for x, y in zip(src_list, dest_list)
if (self._can_hold_element(x) or (self.dtype == "string" and is_re(x)))
]
- if not len(pairs):
+ if not pairs:
return [self.copy(deep=False)]
src_len = len(pairs) - 1
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 69da2be0306f6..d098f8d42d3db 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -864,7 +864,7 @@ def _finalize_columns_and_data(
# GH#26429 do not raise user-facing AssertionError
raise ValueError(err) from err
- if len(contents) and contents[0].dtype == np.object_:
+ if contents and contents[0].dtype == np.object_:
contents = convert_object_array(contents, dtype=dtype)
return contents, columns
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index a3738bb25f56c..e238bb78bbdfa 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -1298,7 +1298,7 @@ def value_getitem(placement):
# Defer setting the new values to enable consolidation
self._iset_split_block(blkno_l, blk_locs, refs=refs)
- if len(removed_blknos):
+ if removed_blknos:
# Remove blocks & update blknos accordingly
is_deleted = np.zeros(self.nblocks, dtype=np.bool_)
is_deleted[removed_blknos] = True
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index fb799361fea67..f1be0b41ad7f7 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -566,7 +566,7 @@ def _initialize_colspace(self, col_space: ColspaceArgType | None) -> ColspaceTyp
result = {}
elif isinstance(col_space, (int, str)):
result = {"": col_space}
- result.update({column: col_space for column in self.frame.columns})
+ result.update(dict.fromkeys(self.frame.columns, col_space))
elif isinstance(col_space, Mapping):
for column in col_space.keys():
if column not in self.frame.columns and column != "":
diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py
index 482ed316c7ce4..6752c83d5169b 100644
--- a/pandas/io/formats/style_render.py
+++ b/pandas/io/formats/style_render.py
@@ -1225,7 +1225,7 @@ def format(
data = self.data.loc[subset]
if not isinstance(formatter, dict):
- formatter = {col: formatter for col in data.columns}
+ formatter = dict.fromkeys(data.columns, formatter)
cis = self.columns.get_indexer_for(data.columns)
ris = self.index.get_indexer_for(data.index)
@@ -1411,7 +1411,7 @@ def format_index(
return self # clear the formatter / revert to default and avoid looping
if not isinstance(formatter, dict):
- formatter = {level: formatter for level in levels_}
+ formatter = dict.fromkeys(levels_, formatter)
else:
formatter = {
obj._get_level_number(level): formatter_
@@ -1708,7 +1708,7 @@ def format_index_names(
return self # clear the formatter / revert to default and avoid looping
if not isinstance(formatter, dict):
- formatter = {level: formatter for level in levels_}
+ formatter = dict.fromkeys(levels_, formatter)
else:
formatter = {
obj._get_level_number(level): formatter_
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index a689cfbcb1418..b83b5aba3cf13 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -1760,7 +1760,7 @@ def info(self) -> str:
if self.is_open:
lkeys = sorted(self.keys())
- if len(lkeys):
+ if lkeys:
keys = []
values = []
@@ -4540,7 +4540,7 @@ def write_data(self, chunksize: int | None, dropna: bool = False) -> None:
masks.append(mask.astype("u1", copy=False))
# consolidate masks
- if len(masks):
+ if masks:
mask = masks[0]
for m in masks[1:]:
mask = mask & m
@@ -4660,7 +4660,7 @@ def delete(
groups = list(diff[diff > 1].index)
# 1 group
- if not len(groups):
+ if not groups:
groups = [0]
# final element
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 0e0f07c0f8ff3..7376843f7e8ff 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -1901,7 +1901,7 @@ def prep_table(
# Type[str], Type[float], Type[int], Type[complex], Type[bool],
# Type[object]]]]"; expected type "Union[ExtensionDtype, str,
# dtype[Any], Type[object]]"
- dtype = {col_name: dtype for col_name in frame} # type: ignore[misc]
+ dtype = dict.fromkeys(frame, dtype) # type: ignore[arg-type]
else:
dtype = cast(dict, dtype)
@@ -2615,7 +2615,7 @@ def _create_table_setup(self):
]
ix_cols = [cname for cname, _, is_index in column_names_and_types if is_index]
- if len(ix_cols):
+ if ix_cols:
cnames = "_".join(ix_cols)
cnames_br = ",".join([escape(c) for c in ix_cols])
create_stmts.append(
@@ -2859,7 +2859,7 @@ def to_sql(
# Type[str], Type[float], Type[int], Type[complex], Type[bool],
# Type[object]]]]"; expected type "Union[ExtensionDtype, str,
# dtype[Any], Type[object]]"
- dtype = {col_name: dtype for col_name in frame} # type: ignore[misc]
+ dtype = dict.fromkeys(frame, dtype) # type: ignore[arg-type]
else:
dtype = cast(dict, dtype)
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index 2d47cd851ad10..dde1158dc7951 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -334,7 +334,7 @@ def test_apply_broadcast_scalars(float_frame):
def test_apply_broadcast_scalars_axis1(float_frame):
result = float_frame.apply(np.mean, axis=1, result_type="broadcast")
m = float_frame.mean(axis=1)
- expected = DataFrame({c: m for c in float_frame.columns})
+ expected = DataFrame(dict.fromkeys(float_frame.columns, m))
tm.assert_frame_equal(result, expected)
@@ -361,7 +361,7 @@ def test_apply_broadcast_lists_index(float_frame):
)
m = list(range(len(float_frame.index)))
expected = DataFrame(
- {c: m for c in float_frame.columns},
+ dict.fromkeys(float_frame.columns, m),
dtype="float64",
index=float_frame.index,
)
diff --git a/pandas/tests/dtypes/cast/test_maybe_box_native.py b/pandas/tests/dtypes/cast/test_maybe_box_native.py
index 3f62f31dac219..151586962d517 100644
--- a/pandas/tests/dtypes/cast/test_maybe_box_native.py
+++ b/pandas/tests/dtypes/cast/test_maybe_box_native.py
@@ -17,7 +17,7 @@
"obj,expected_dtype",
[
(b"\x00\x10", bytes),
- (int(4), int),
+ ((4), int),
(np.uint(4), int),
(np.int32(-4), int),
(np.uint8(4), int),
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
index 3a68d38cc0bde..213fa2c01cef4 100644
--- a/pandas/tests/io/parser/test_na_values.py
+++ b/pandas/tests/io/parser/test_na_values.py
@@ -546,7 +546,7 @@ def test_na_values_dict_null_column_name(all_parsers):
parser = all_parsers
data = ",x,y\n\nMA,1,2\nNA,2,1\nOA,,3"
names = [None, "x", "y"]
- na_values = {name: STR_NA_VALUES for name in names}
+ na_values = dict.fromkeys(names, STR_NA_VALUES)
dtype = {None: "object", "x": "float64", "y": "float64"}
if parser.engine == "pyarrow":
diff --git a/pandas/tests/series/accessors/test_cat_accessor.py b/pandas/tests/series/accessors/test_cat_accessor.py
index ce8ea27ea1fa2..f017ccd963972 100644
--- a/pandas/tests/series/accessors/test_cat_accessor.py
+++ b/pandas/tests/series/accessors/test_cat_accessor.py
@@ -40,7 +40,7 @@ def test_getname_categorical_accessor(self, method):
def test_cat_accessor(self):
ser = Series(Categorical(["a", "b", np.nan, "a"]))
tm.assert_index_equal(ser.cat.categories, Index(["a", "b"]))
- assert not ser.cat.ordered, False
+ assert not ser.cat.ordered
exp = Categorical(["a", "b", np.nan, "a"], categories=["b", "a"])
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 5f4a100e7ccc7..f82451a2be84d 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -632,7 +632,7 @@ def test_constructor_maskedarray_hardened(self):
def test_series_ctor_plus_datetimeindex(self):
rng = date_range("20090415", "20090519", freq="B")
- data = {k: 1 for k in rng}
+ data = dict.fromkeys(rng, 1)
result = Series(data, index=rng)
assert result.index.is_(rng)
diff --git a/pyproject.toml b/pyproject.toml
index 0b4d85c0a62f1..8093a1939883b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -234,8 +234,8 @@ select = [
"TID",
# implicit string concatenation
"ISC",
- # type-checking imports
- "TCH",
+ # flake8-type-checking
+ "TC",
# comprehensions
"C4",
# pygrep-hooks
@@ -390,6 +390,8 @@ ignore = [
"PLW0108",
# global-statement
"PLW0603",
+ # runtime-cast-value
+ "TC006",
]
exclude = [
@@ -429,7 +431,7 @@ exclude = [
"pandas/tests/*" = ["B028", "FLY"]
"scripts/*" = ["B028"]
# Keep this one enabled
-"pandas/_typing.py" = ["TCH"]
+"pandas/_typing.py" = ["TC"]
[tool.ruff.lint.flake8-pytest-style]
fixture-parentheses = false
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 7d2870de27cfa..5607f2fe97fd9 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -57,7 +57,7 @@ asv>=0.6.1
flake8==7.1.0
mypy==1.13.0
tokenize-rt
-pre-commit>=4.0.1
+pre-commit>=4.2.0
gitpython
gitdb
google-auth
diff --git a/setup.py b/setup.py
index 737ebd270d1e4..db1852b43cfa9 100755
--- a/setup.py
+++ b/setup.py
@@ -364,7 +364,7 @@ def run(self) -> None:
# enable coverage by building cython files by setting the environment variable
# "PANDAS_CYTHON_COVERAGE" (with a Truthy value) or by running build_ext
# with `--with-cython-coverage`enabled
-linetrace = os.environ.get("PANDAS_CYTHON_COVERAGE", False)
+linetrace = os.environ.get("PANDAS_CYTHON_COVERAGE", False) # noqa: PLW1508
if "--with-cython-coverage" in sys.argv:
linetrace = True
sys.argv.remove("--with-cython-coverage")
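Besides the tooling bumps, this patch repeatedly rewrites comprehensions of the form {k: v for k in keys} as dict.fromkeys(keys, v). A small sketch of the equivalence and its one caveat (every key shares the same value object, which is fine at the call sites touched here because the shared value is a scalar, a callable, or an intentionally shared Series):

    cols = ["a", "b", "c"]

    old = {c: "float32" for c in cols}
    new = dict.fromkeys(cols, "float32")
    assert old == new

    # Caveat: the value object is reused, not copied.
    shared = dict.fromkeys(cols, [])
    shared["a"].append(1)
    assert shared["b"] == [1]  # all keys point at the same list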
From aebdd970c7955b71c4155a464abe9de775882e43 Mon Sep 17 00:00:00 2001
From: Arthur Laureus Wigo <126365160+arthurlw@users.noreply.github.com>
Date: Wed, 9 Apr 2025 09:27:04 -0700
Subject: [PATCH 08/27] ENH: Support `Series[bool]` as indexer for
`iloc.__getitem__` (#61162)
* updated indexing.py to allow iloc.__getitem__
* Updated test_iloc_mask test
* bugfix test_iloc_mask test
* bugfix test_iloc_mask
* whatsnew
* added test to test_iloc_mask
* formatting
* precommit
* added tests for series bool mask
* precommit
* reformatted tests
---
doc/source/whatsnew/v3.0.0.rst | 1 +
pandas/core/indexing.py | 6 +--
pandas/tests/indexing/test_iloc.py | 70 +++++++++++++++++++++---------
3 files changed, 52 insertions(+), 25 deletions(-)
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 29be9a7341f00..a945c7183a059 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -68,6 +68,7 @@ Other enhancements
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
- :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`)
+- :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index bbbcc4da9fb39..34a437ba40bd8 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -1582,11 +1582,7 @@ def _validate_key(self, key, axis: AxisInt) -> None:
if com.is_bool_indexer(key):
if hasattr(key, "index") and isinstance(key.index, Index):
if key.index.inferred_type == "integer":
- raise NotImplementedError(
- "iLocation based boolean "
- "indexing on an integer type "
- "is not available"
- )
+ return
raise ValueError(
"iLocation based boolean indexing cannot use an indexable as a mask"
)
diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
index 2f6998a85c80b..3be69617cad43 100644
--- a/pandas/tests/indexing/test_iloc.py
+++ b/pandas/tests/indexing/test_iloc.py
@@ -726,15 +726,16 @@ def test_iloc_setitem_with_scalar_index(self, indexer, value):
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_iloc_mask(self):
- # GH 3631, iloc with a mask (of a series) should raise
+ # GH 60994, iloc with a mask (of a series) should return accordingly
df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"])
mask = df.a % 2 == 0
msg = "iLocation based boolean indexing cannot use an indexable as a mask"
with pytest.raises(ValueError, match=msg):
df.iloc[mask]
+
mask.index = range(len(mask))
- msg = "iLocation based boolean indexing on an integer type is not available"
- with pytest.raises(NotImplementedError, match=msg):
+ msg = "Unalignable boolean Series provided as indexer"
+ with pytest.raises(IndexingError, match=msg):
df.iloc[mask]
# ndarray ok
@@ -753,18 +754,13 @@ def test_iloc_mask(self):
(None, ".iloc"): "0b1100",
("index", ""): "0b11",
("index", ".loc"): "0b11",
- ("index", ".iloc"): (
- "iLocation based boolean indexing cannot use an indexable as a mask"
- ),
- ("locs", ""): "Unalignable boolean Series provided as indexer "
- "(index of the boolean Series and of the indexed "
- "object do not match).",
- ("locs", ".loc"): "Unalignable boolean Series provided as indexer "
- "(index of the boolean Series and of the "
- "indexed object do not match).",
- ("locs", ".iloc"): (
- "iLocation based boolean indexing on an integer type is not available"
- ),
+ (
+ "index",
+ ".iloc",
+ ): "iLocation based boolean indexing cannot use an indexable as a mask",
+ ("locs", ""): "Unalignable boolean Series provided as indexer",
+ ("locs", ".loc"): "Unalignable boolean Series provided as indexer",
+ ("locs", ".iloc"): "Unalignable boolean Series provided as indexer",
}
# UserWarnings from reindex of a boolean mask
@@ -780,18 +776,52 @@ def test_iloc_mask(self):
else:
accessor = df
answer = str(bin(accessor[mask]["nums"].sum()))
- except (ValueError, IndexingError, NotImplementedError) as err:
+ except (ValueError, IndexingError) as err:
answer = str(err)
key = (
idx,
method,
)
- r = expected.get(key)
- if r != answer:
- raise AssertionError(
- f"[{key}] does not match [{answer}], received [{r}]"
+ expected_result = expected.get(key)
+
+ # Fix the assertion to check for substring match
+ if (
+ idx is None or (idx == "index" and method != ".iloc")
+ ) and "0b" in expected_result:
+ # For successful numeric results, exact match is needed
+ assert expected_result == answer, (
+ f"[{key}] does not match [{answer}]"
)
+ else:
+ # For error messages, substring match is sufficient
+ assert expected_result in answer, f"[{key}] not found in [{answer}]"
+
+ def test_iloc_with_numpy_bool_array(self):
+ df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"])
+ result = df.iloc[np.array([True, False, True, False, True], dtype=bool)]
+ expected = DataFrame({"a": [0, 2, 4]}, index=["A", "C", "E"])
+ tm.assert_frame_equal(result, expected)
+
+ def test_iloc_series_mask_with_index_mismatch_raises(self):
+ df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"])
+ mask = df.a % 2 == 0
+ msg = "Unalignable boolean Series provided as indexer"
+ with pytest.raises(IndexingError, match=msg):
+ df.iloc[Series([True] * len(mask), dtype=bool)]
+
+ def test_iloc_series_mask_all_true(self):
+ df = DataFrame(list(range(5)), columns=["a"])
+ mask = Series([True] * len(df), dtype=bool)
+ result = df.iloc[mask]
+ tm.assert_frame_equal(result, df)
+
+ def test_iloc_series_mask_alternate_true(self):
+ df = DataFrame(list(range(5)), columns=["a"])
+ mask = Series([True, False, True, False, True], dtype=bool)
+ result = df.iloc[mask]
+ expected = DataFrame({"a": [0, 2, 4]}, index=[0, 2, 4])
+ tm.assert_frame_equal(result, expected)
def test_iloc_non_unique_indexing(self):
# GH 4017, non-unique indexing (on the axis)
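The user-visible effect (a minimal sketch echoing the new tests): a boolean Series with a positional integer index is now accepted by .iloc, where it previously raised NotImplementedError; a boolean Series whose index cannot be aligned still raises pandas.errors.IndexingError ("Unalignable boolean Series provided as indexer").

    import pandas as pd

    df = pd.DataFrame({"a": range(5)})
    mask = pd.Series([True, False, True, False, True])
    print(df.iloc[mask])  # rows 0, 2 and 4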
From 3009202ec0d6dcafb1ecf747fd5a9b74b72a242f Mon Sep 17 00:00:00 2001
From: Martin Braquet
Date: Wed, 9 Apr 2025 23:28:51 +0700
Subject: [PATCH 09/27] BUG: Handle overlapping line and scatter on the same
plot (#61244)
* Refactor time series plotting logic for improved clarity
Extract and streamline time series preparation steps into `prepare_ts_data`, replacing redundant logic across methods. Simplifies axis frequency handling and improves code readability while maintaining functionality.
* Add test to validate xtick alignment for scatter and line plots
This test ensures that the x-axis ticks are consistent between scatter and line plots when sharing the same axis. It addresses a potential issue related to GH#61005, verifying proper rendering of datetime x-axis labels.
* Fix bug in Series.plot misalignment for line and scatter plots
This resolves an issue where line and scatter plots were not aligned when using Series.plot. The fix ensures proper alignment and improves plot consistency. Refer to issue #61005 for further details.
* Update scatter plot test to support datetime.time data
Datetime.time is now supported in scatter plots due to added converter implementation in ScatterPlot. Removed the test expecting a TypeError and updated it to validate the new functionality.
* Refactor handling of x_data in matplotlib plotting.
Simplify and streamline the code by directly assigning x_data from the data variable and replacing the intermediate Series object with a clearer `s` variable. This improves readability and maintains the existing functionality.
* Move test_scatter_line_xticks from Series to DataFrame tests
Relocated the `test_scatter_line_xticks` test from `test_series.py` to `test_frame.py` for better alignment with DataFrame-specific functionality. This refactor ensures the test resides in the appropriate context based on its usage and focus.
* Refactor `prepare_ts_data` to improve type annotations.
Added precise type annotations to the function signature for better clarity and type checking. Replaced `data` with `series` and `kwds` with `kwargs` to enhance readability and consistency.
* Refactor test_scatter_line_xticks to simplify DataFrame creation
The DataFrame creation in the test has been streamlined for clarity and conciseness by replacing the loop with a list comprehension. This improves code readability and maintains the same functionality.
* Refactor Series import to optimize scope and maintain consistency
Moved the `Series` import inside relevant function scopes to minimize unnecessary top-level imports and align with existing import patterns. This helps improve code readability and ensures imports are only loaded where needed.
* `Reorder import statement in _make_plot method`
Moved the import of `Series` within the `_make_plot` method to comply with styling or runtime considerations. This ensures consistency and avoids potential import-related issues.
---
doc/source/whatsnew/v3.0.0.rst | 1 +
pandas/plotting/_matplotlib/core.py | 55 ++++++++++++-----------
pandas/plotting/_matplotlib/timeseries.py | 30 +++++++++----
pandas/tests/plotting/frame/test_frame.py | 22 ++++++---
4 files changed, 69 insertions(+), 39 deletions(-)
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index a945c7183a059..184ca581902ee 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -763,6 +763,7 @@ Plotting
- Bug in :meth:`DataFrame.plot.bar` with ``stacked=True`` where labels on stacked bars with zero-height segments were incorrectly positioned at the base instead of the label position of the previous segment (:issue:`59429`)
- Bug in :meth:`DataFrame.plot.line` raising ``ValueError`` when set both color and a ``dict`` style (:issue:`59461`)
- Bug in :meth:`DataFrame.plot` that causes a shift to the right when the frequency multiplier is greater than one. (:issue:`57587`)
+- Bug in :meth:`Series.plot` preventing a line and scatter plot from being aligned (:issue:`61005`)
- Bug in :meth:`Series.plot` with ``kind="pie"`` with :class:`ArrowDtype` (:issue:`59192`)
Groupby/resample/rolling
diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
index 1035150302d2c..24aa848de1b4c 100644
--- a/pandas/plotting/_matplotlib/core.py
+++ b/pandas/plotting/_matplotlib/core.py
@@ -64,10 +64,9 @@
from pandas.plotting._matplotlib.misc import unpack_single_str_list
from pandas.plotting._matplotlib.style import get_standard_colors
from pandas.plotting._matplotlib.timeseries import (
- decorate_axes,
format_dateaxis,
maybe_convert_index,
- maybe_resample,
+ prepare_ts_data,
use_dynamic_x,
)
from pandas.plotting._matplotlib.tools import (
@@ -288,6 +287,21 @@ def __init__(
self.data = self._ensure_frame(self.data)
+ from pandas.plotting import plot_params
+
+ self.x_compat = plot_params["x_compat"]
+ if "x_compat" in self.kwds:
+ self.x_compat = bool(self.kwds.pop("x_compat"))
+
+ @final
+ def _is_ts_plot(self) -> bool:
+ # this is slightly deceptive
+ return not self.x_compat and self.use_index and self._use_dynamic_x()
+
+ @final
+ def _use_dynamic_x(self) -> bool:
+ return use_dynamic_x(self._get_ax(0), self.data.index)
+
@final
@staticmethod
def _validate_sharex(sharex: bool | None, ax, by) -> bool:
@@ -1324,10 +1338,20 @@ def __init__(
c = self.data.columns[c]
self.c = c
+ @register_pandas_matplotlib_converters
def _make_plot(self, fig: Figure) -> None:
x, y, c, data = self.x, self.y, self.c, self.data
ax = self.axes[0]
+ from pandas import Series
+
+ x_data = data[x]
+ s = Series(index=x_data)
+ if use_dynamic_x(ax, s.index):
+ s = maybe_convert_index(ax, s)
+ freq, s = prepare_ts_data(s, ax, self.kwds)
+ x_data = s.index
+
c_is_column = is_hashable(c) and c in self.data.columns
color_by_categorical = c_is_column and isinstance(
@@ -1344,7 +1368,7 @@ def _make_plot(self, fig: Figure) -> None:
else:
label = None
- # if a list of non color strings is passed in as c, color points
+ # if a list of non-color strings is passed in as c, color points
# by uniqueness of the strings, such same strings get same color
create_colors = not self._are_valid_colors(c_values)
if create_colors:
@@ -1360,7 +1384,7 @@ def _make_plot(self, fig: Figure) -> None:
)
scatter = ax.scatter(
- data[x].values,
+ x_data.values,
data[y].values,
c=c_values,
label=label,
@@ -1520,23 +1544,9 @@ def _kind(self) -> Literal["line", "area", "hist", "kde", "box"]:
return "line"
def __init__(self, data, **kwargs) -> None:
- from pandas.plotting import plot_params
-
MPLPlot.__init__(self, data, **kwargs)
if self.stacked:
self.data = self.data.fillna(value=0)
- self.x_compat = plot_params["x_compat"]
- if "x_compat" in self.kwds:
- self.x_compat = bool(self.kwds.pop("x_compat"))
-
- @final
- def _is_ts_plot(self) -> bool:
- # this is slightly deceptive
- return not self.x_compat and self.use_index and self._use_dynamic_x()
-
- @final
- def _use_dynamic_x(self) -> bool:
- return use_dynamic_x(self._get_ax(0), self.data)
def _make_plot(self, fig: Figure) -> None:
if self._is_ts_plot():
@@ -1626,15 +1636,8 @@ def _ts_plot(self, ax: Axes, x, data: Series, style=None, **kwds):
# accept x to be consistent with normal plot func,
# x is not passed to tsplot as it uses data.index as x coordinate
# column_num must be in kwds for stacking purpose
- freq, data = maybe_resample(data, ax, kwds)
+ freq, data = prepare_ts_data(data, ax, kwds)
- # Set ax with freq info
- decorate_axes(ax, freq)
- # digging deeper
- if hasattr(ax, "left_ax"):
- decorate_axes(ax.left_ax, freq)
- if hasattr(ax, "right_ax"):
- decorate_axes(ax.right_ax, freq)
# TODO #54485
ax._plot_data.append((data, self._kind, kwds)) # type: ignore[attr-defined]
diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py
index d95ccad2da565..beaf5b6259ef3 100644
--- a/pandas/plotting/_matplotlib/timeseries.py
+++ b/pandas/plotting/_matplotlib/timeseries.py
@@ -48,7 +48,6 @@
from pandas._typing import NDFrameT
from pandas import (
- DataFrame,
DatetimeIndex,
Index,
PeriodIndex,
@@ -231,8 +230,8 @@ def _get_freq(ax: Axes, series: Series):
return freq, ax_freq
-def use_dynamic_x(ax: Axes, data: DataFrame | Series) -> bool:
- freq = _get_index_freq(data.index)
+def use_dynamic_x(ax: Axes, index: Index) -> bool:
+ freq = _get_index_freq(index)
ax_freq = _get_ax_freq(ax)
if freq is None: # convert irregular if axes has freq info
@@ -250,16 +249,15 @@ def use_dynamic_x(ax: Axes, data: DataFrame | Series) -> bool:
return False
# FIXME: hack this for 0.10.1, creating more technical debt...sigh
- if isinstance(data.index, ABCDatetimeIndex):
+ if isinstance(index, ABCDatetimeIndex):
# error: "BaseOffset" has no attribute "_period_dtype_code"
freq_str = OFFSET_TO_PERIOD_FREQSTR.get(freq_str, freq_str)
base = to_offset(freq_str, is_period=True)._period_dtype_code # type: ignore[attr-defined]
- x = data.index
if base <= FreqGroup.FR_DAY.value:
- return x[:1].is_normalized
- period = Period(x[0], freq_str)
+ return index[:1].is_normalized
+ period = Period(index[0], freq_str)
assert isinstance(period, Period)
- return period.to_timestamp().tz_localize(x.tz) == x[0]
+ return period.to_timestamp().tz_localize(index.tz) == index[0]
return True
@@ -366,3 +364,19 @@ def format_dateaxis(
raise TypeError("index type not supported")
plt.draw_if_interactive()
+
+
+def prepare_ts_data(
+ series: Series, ax: Axes, kwargs: dict[str, Any]
+) -> tuple[BaseOffset | str, Series]:
+ freq, data = maybe_resample(series, ax, kwargs)
+
+ # Set ax with freq info
+ decorate_axes(ax, freq)
+ # digging deeper
+ if hasattr(ax, "left_ax"):
+ decorate_axes(ax.left_ax, freq)
+ if hasattr(ax, "right_ax"):
+ decorate_axes(ax.right_ax, freq)
+
+ return freq, data
diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py
index d18f098267599..3f274a336ad44 100644
--- a/pandas/tests/plotting/frame/test_frame.py
+++ b/pandas/tests/plotting/frame/test_frame.py
@@ -840,14 +840,26 @@ def test_plot_scatter_shape(self):
axes = df.plot(x="x", y="y", kind="scatter", subplots=True)
_check_axes_shape(axes, axes_num=1, layout=(1, 1))
- def test_raise_error_on_datetime_time_data(self):
- # GH 8113, datetime.time type is not supported by matplotlib in scatter
+ def test_scatter_on_datetime_time_data(self):
+ # datetime.time type is now supported in scatter, since a converter
+ # is implemented in ScatterPlot
df = DataFrame(np.random.default_rng(2).standard_normal(10), columns=["a"])
df["dtime"] = date_range(start="2014-01-01", freq="h", periods=10).time
- msg = "must be a string or a (real )?number, not 'datetime.time'"
+ df.plot(kind="scatter", x="dtime", y="a")
- with pytest.raises(TypeError, match=msg):
- df.plot(kind="scatter", x="dtime", y="a")
+ def test_scatter_line_xticks(self):
+ # GH#61005
+ df = DataFrame(
+ [(datetime(year=2025, month=1, day=1, hour=n), n) for n in range(3)],
+ columns=["datetime", "y"],
+ )
+ fig, ax = plt.subplots(2, sharex=True)
+ df.plot.scatter(x="datetime", y="y", ax=ax[0])
+ scatter_xticks = ax[0].get_xticks()
+ df.plot(x="datetime", y="y", ax=ax[1])
+ line_xticks = ax[1].get_xticks()
+ assert scatter_xticks[0] == line_xticks[0]
+ assert scatter_xticks[-1] == line_xticks[-1]
@pytest.mark.parametrize("x, y", [("dates", "vals"), (0, 1)])
def test_scatterplot_datetime_data(self, x, y):
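The user-facing symptom being fixed (a minimal sketch mirroring the new test, assuming matplotlib is installed): a scatter plot and a line plot built from the same datetime column now produce the same x-axis ticks when they share an axis.

    from datetime import datetime

    import matplotlib.pyplot as plt
    import pandas as pd

    df = pd.DataFrame(
        [(datetime(2025, 1, 1, hour=n), n) for n in range(3)], columns=["datetime", "y"]
    )
    fig, axes = plt.subplots(2, sharex=True)
    df.plot.scatter(x="datetime", y="y", ax=axes[0])
    df.plot(x="datetime", y="y", ax=axes[1])
    assert axes[0].get_xticks()[0] == axes[1].get_xticks()[0]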
From ca97f1afc371643b3e9ba8fdccc5af5b4ab33659 Mon Sep 17 00:00:00 2001
From: "Christine P. Chai"
Date: Wed, 9 Apr 2025 09:45:48 -0700
Subject: [PATCH 10/27] DOC: Update the last ArcticDB link in ecosystem.md
(#61258)
DOC: Update ArcticDB link in ecosystem.md
---
web/pandas/community/ecosystem.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
index 4863093387f37..3555d67c70620 100644
--- a/web/pandas/community/ecosystem.md
+++ b/web/pandas/community/ecosystem.md
@@ -469,7 +469,7 @@ read_record.data
df.dtypes
```
-ArcticDB also supports appending, updating, and querying data from storage to a pandas DataFrame. Please find more information [here](https://docs.arcticdb.io/latest/api/query_builder/).
+ArcticDB also supports appending, updating, and querying data from storage to a pandas DataFrame. Please find more information [here](https://docs.arcticdb.io/latest/api/processing/#arcticdb.QueryBuilder).
### [Hugging Face](https://huggingface.co/datasets)
From 88d73e178c4e6c61b3c8773701e01d43307c4dda Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 9 Apr 2025 10:56:29 -0700
Subject: [PATCH 11/27] BLD: Try installing older Cython for windows free
threading build (#61249)
* BLD: Try pinning ninja<1.11.1.4 for windows free threading build
* quote
* change to triple and double quotes
* Add in script instead
* Add in script instead
* Try numpy 2.2.3
* Double quotes
* Try 2.2.2
* Install older Cython
* Try commit from March 17
* Try March 19 commit 0b866bf7d43ced968dba4e9726316f963aae8f3c
* Try March 18 commit b4917f731da50062f8ba53737ade7b82b4c8fcf2
* One commit after c compiler warnings PR
* Use March 20 commit 93a7d09d47d8aae0dfcea41d06f4b140a1161499
* Use cb156c48d94b7e13363ab791b16bdeeb3392f21e before vector call
* One more divmod commit
* Use divmod commit
* Use commit before divmod, undo ninja and numpy changes
---
pyproject.toml | 3 ++-
scripts/cibw_before_build_windows.sh | 7 ++++---
2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index 8093a1939883b..8f8a8120802b6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -148,7 +148,7 @@ setup = ['--vsenv'] # For Windows
[tool.cibuildwheel]
skip = "cp36-* cp37-* cp38-* cp39-* pp* *_i686 *_ppc64le *_s390x"
-build-verbosity = "3"
+build-verbosity = 3
environment = {LDFLAGS="-Wl,--strip-all"}
test-requires = "hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0"
test-command = """
@@ -160,6 +160,7 @@ free-threaded-support = true
before-build = "PACKAGE_DIR={package} bash {package}/scripts/cibw_before_build.sh"
[tool.cibuildwheel.windows]
+environment = {}
before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build_windows.sh"
before-test = "bash {package}/scripts/cibw_before_test_windows.sh"
test-command = """
diff --git a/scripts/cibw_before_build_windows.sh b/scripts/cibw_before_build_windows.sh
index f9e1e68d8efba..dbf1d95d911bf 100644
--- a/scripts/cibw_before_build_windows.sh
+++ b/scripts/cibw_before_build_windows.sh
@@ -5,10 +5,11 @@ for file in $PACKAGE_DIR/LICENSES/*; do
done
# TODO: Delete when there's a PyPI Cython release that supports free-threaded Python 3.13
-# and a NumPy Windows wheel for the free-threaded build on PyPI.
FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")"
if [[ $FREE_THREADED_BUILD == "True" ]]; then
python -m pip install -U pip
- python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy cython
- python -m pip install ninja meson-python versioneer[toml]
+ # python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython
+ # TODO: Remove below and uncomment above once https://github.com/cython/cython/pull/6717 no longer breaks tests
+ python -m pip install git+https://github.com/cython/cython.git@3276b588720a053c78488e5de788605950f4b136
+ python -m pip install ninja meson-python versioneer[toml] numpy
fi
From bdf846d706c643865f6b301b4769b9c36c2523ee Mon Sep 17 00:00:00 2001
From: Sathvik Mulukutla <72032381+DarthKitten2130@users.noreply.github.com>
Date: Thu, 10 Apr 2025 21:34:02 +0530
Subject: [PATCH 12/27] Changed term non-null to NA (#61257)
* Changed term non-null to NA
* Update pandas/io/formats/info.py
Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---------
Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
pandas/io/formats/info.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py
index c9a6e94a0c7c1..eb579f7149d44 100644
--- a/pandas/io/formats/info.py
+++ b/pandas/io/formats/info.py
@@ -249,7 +249,7 @@
Print a concise summary of a {klass}.
This method prints information about a {klass} including
- the index dtype{type_sub}, non-null values and memory usage.
+ the index dtype{type_sub}, non-NA values and memory usage.
{version_added_sub}\
Parameters
From e7ae09d62b1671695bb61dfea93a6ac354774978 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?=
Date: Thu, 10 Apr 2025 18:46:26 +0200
Subject: [PATCH 13/27] CI Use released numpy for Windows wheels testing
(#61248)
CI Maybe fix Windows free-threaded
Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
pyproject.toml | 1 -
scripts/cibw_before_test_windows.sh | 6 ------
2 files changed, 7 deletions(-)
delete mode 100644 scripts/cibw_before_test_windows.sh
diff --git a/pyproject.toml b/pyproject.toml
index 8f8a8120802b6..7db85f0037d33 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -162,7 +162,6 @@ before-build = "PACKAGE_DIR={package} bash {package}/scripts/cibw_before_build.s
[tool.cibuildwheel.windows]
environment = {}
before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build_windows.sh"
-before-test = "bash {package}/scripts/cibw_before_test_windows.sh"
test-command = """
set PANDAS_CI='1' && \
python -c "import pandas as pd; \
diff --git a/scripts/cibw_before_test_windows.sh b/scripts/cibw_before_test_windows.sh
deleted file mode 100644
index 8878e3950452f..0000000000000
--- a/scripts/cibw_before_test_windows.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-# TODO: Delete when there's a NumPy Windows wheel for the free-threaded build on PyPI.
-FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")"
-if [[ $FREE_THREADED_BUILD == "True" ]]; then
- python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy
-fi
From f8247f1e67b917078ed486cd2b6a9a64a91ba1ad Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 10 Apr 2025 13:17:42 -0700
Subject: [PATCH 14/27] TYP: Add ignores for numpy 2.2 updates (#61265)
* TYP: Add ignores for numpy 2.2 updates
* fix tests and plotting
* ignore pyright error
---
pandas/core/algorithms.py | 2 +-
pandas/core/array_algos/quantile.py | 4 ++--
pandas/core/arrays/_mixins.py | 8 +-------
pandas/core/arrays/arrow/_arrow_utils.py | 2 +-
pandas/core/arrays/arrow/array.py | 2 +-
pandas/core/arrays/base.py | 2 +-
pandas/core/arrays/categorical.py | 2 +-
pandas/core/arrays/datetimelike.py | 2 +-
pandas/core/arrays/datetimes.py | 9 +++------
pandas/core/arrays/masked.py | 6 +++---
pandas/core/arrays/sparse/scipy_sparse.py | 2 +-
pandas/core/arrays/timedeltas.py | 2 +-
pandas/core/base.py | 2 +-
pandas/core/groupby/generic.py | 2 +-
pandas/core/groupby/groupby.py | 8 ++++----
pandas/core/groupby/ops.py | 4 ++--
pandas/core/indexers/objects.py | 8 ++++----
pandas/core/indexes/interval.py | 9 +--------
pandas/core/internals/blocks.py | 2 +-
pandas/core/internals/construction.py | 2 +-
pandas/core/missing.py | 8 +++-----
pandas/core/reshape/encoding.py | 2 +-
pandas/core/reshape/merge.py | 4 +---
pandas/core/sorting.py | 2 +-
pandas/io/formats/format.py | 2 +-
pandas/io/parsers/python_parser.py | 2 +-
pandas/plotting/_matplotlib/style.py | 2 +-
pandas/tests/dtypes/test_missing.py | 4 ++--
28 files changed, 43 insertions(+), 63 deletions(-)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 76f2fdad591ff..e6847b380a7e8 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -215,7 +215,7 @@ def _reconstruct_data(
values = cls._from_sequence(values, dtype=dtype) # type: ignore[assignment]
else:
- values = values.astype(dtype, copy=False)
+ values = values.astype(dtype, copy=False) # type: ignore[assignment]
return values
diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py
index 8a920d1849bb3..eb5026454552c 100644
--- a/pandas/core/array_algos/quantile.py
+++ b/pandas/core/array_algos/quantile.py
@@ -102,7 +102,7 @@ def quantile_with_mask(
interpolation=interpolation,
)
- result = np.asarray(result)
+ result = np.asarray(result) # type: ignore[assignment]
result = result.T
return result
@@ -196,7 +196,7 @@ def _nanquantile(
# Caller is responsible for ensuring mask shape match
assert mask.shape == values.shape
result = [
- _nanquantile_1d(val, m, qs, na_value, interpolation=interpolation)
+ _nanquantile_1d(val, m, qs, na_value, interpolation=interpolation) # type: ignore[arg-type]
for (val, m) in zip(list(values), list(mask))
]
if values.dtype.kind == "f":
diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
index 4e6f20e6ad3dd..26585e7bab8e3 100644
--- a/pandas/core/arrays/_mixins.py
+++ b/pandas/core/arrays/_mixins.py
@@ -142,18 +142,12 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike:
dt64_values = arr.view(dtype)
return DatetimeArray._simple_new(dt64_values, dtype=dtype)
-
elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype):
from pandas.core.arrays import TimedeltaArray
td64_values = arr.view(dtype)
return TimedeltaArray._simple_new(td64_values, dtype=dtype)
-
- # error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible
- # type "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None,
- # type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int,
- # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
- return arr.view(dtype=dtype) # type: ignore[arg-type]
+ return arr.view(dtype=dtype)
def take(
self,
diff --git a/pandas/core/arrays/arrow/_arrow_utils.py b/pandas/core/arrays/arrow/_arrow_utils.py
index 285c3fd465ffc..7da83e2257e30 100644
--- a/pandas/core/arrays/arrow/_arrow_utils.py
+++ b/pandas/core/arrays/arrow/_arrow_utils.py
@@ -44,7 +44,7 @@ def pyarrow_array_to_numpy_and_mask(
mask = pyarrow.BooleanArray.from_buffers(
pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset
)
- mask = np.asarray(mask)
+ mask = np.asarray(mask) # type: ignore[assignment]
else:
mask = np.ones(len(arr), dtype=bool)
return data, mask
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 9295cf7873d98..d7187b57a69e4 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2540,7 +2540,7 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
dummies_dtype = np.bool_
dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype)
dummies[indices] = True
- dummies = dummies.reshape((n_rows, n_cols))
+ dummies = dummies.reshape((n_rows, n_cols)) # type: ignore[assignment]
result = type(self)(pa.array(list(dummies)))
return result, uniques_sorted.to_pylist()
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 42be07e03bad8..d0048e122051a 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -596,7 +596,7 @@ def to_numpy(
if copy or na_value is not lib.no_default:
result = result.copy()
if na_value is not lib.no_default:
- result[self.isna()] = na_value
+ result[self.isna()] = na_value # type: ignore[index]
return result
# ------------------------------------------------------------------------
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 647530151d5f6..e5c5716165e2f 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1853,7 +1853,7 @@ def value_counts(self, dropna: bool = True) -> Series:
count = np.bincount(obs, minlength=ncat or 0)
else:
count = np.bincount(np.where(mask, code, ncat))
- ix = np.append(ix, -1)
+ ix = np.append(ix, -1) # type: ignore[assignment]
ix = coerce_indexer_dtype(ix, self.dtype.categories)
ix_categorical = self._from_backing_data(ix)
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index b27bf19f2f593..994d7b1d0081c 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -2394,7 +2394,7 @@ def take(
)
indices = np.asarray(indices, dtype=np.intp)
- maybe_slice = lib.maybe_indices_to_slice(indices, len(self))
+ maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) # type: ignore[arg-type]
if isinstance(maybe_slice, slice):
freq = self._get_getitem_freq(maybe_slice)
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index df40c9c11b117..b31c543188282 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -331,7 +331,7 @@ def _simple_new( # type: ignore[override]
else:
# DatetimeTZDtype. If we have e.g. DatetimeTZDtype[us, UTC],
# then values.dtype should be M8[us].
- assert dtype._creso == get_unit_from_dtype(values.dtype)
+ assert dtype._creso == get_unit_from_dtype(values.dtype) # type: ignore[union-attr]
result = super()._simple_new(values, dtype)
result._freq = freq
@@ -542,7 +542,7 @@ def _unbox_scalar(self, value) -> np.datetime64:
raise ValueError("'value' should be a Timestamp.")
self._check_compatible_with(value)
if value is NaT:
- return np.datetime64(value._value, self.unit)
+ return np.datetime64(value._value, self.unit) # type: ignore[call-overload]
else:
return value.as_unit(self.unit, round_ok=False).asm8
@@ -813,10 +813,7 @@ def _add_offset(self, offset: BaseOffset) -> Self:
try:
res_values = offset._apply_array(values._ndarray)
if res_values.dtype.kind == "i":
- # error: Argument 1 to "view" of "ndarray" has incompatible type
- # "dtype[datetime64] | DatetimeTZDtype"; expected
- # "dtype[Any] | type[Any] | _SupportsDType[dtype[Any]]"
- res_values = res_values.view(values.dtype) # type: ignore[arg-type]
+ res_values = res_values.view(values.dtype)
except NotImplementedError:
if get_option("performance_warnings"):
warnings.warn(
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 07c875337e4f6..62e6119204bd5 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -515,7 +515,7 @@ def tolist(self) -> list:
if self.ndim > 1:
return [x.tolist() for x in self]
dtype = None if self._hasna else self._data.dtype
- return self.to_numpy(dtype=dtype, na_value=libmissing.NA).tolist()
+ return self.to_numpy(dtype=dtype, na_value=libmissing.NA).tolist() # type: ignore[return-value]
@overload
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: ...
@@ -1497,10 +1497,10 @@ def all(
result = values.all(axis=axis)
if skipna:
- return result
+ return result # type: ignore[return-value]
else:
if not result or len(self) == 0 or not self._mask.any():
- return result
+ return result # type: ignore[return-value]
else:
return self.dtype.na_value
diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py
index cc9fd2d5fb8b0..d4ef3003583c3 100644
--- a/pandas/core/arrays/sparse/scipy_sparse.py
+++ b/pandas/core/arrays/sparse/scipy_sparse.py
@@ -79,7 +79,7 @@ def _levels_to_axis(
ax_coords = codes[valid_ilocs]
ax_labels = ax_labels.tolist()
- return ax_coords, ax_labels
+ return ax_coords, ax_labels # pyright: ignore[reportReturnType]
def _to_ijv(
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
index c5b3129c506c8..9012b9f36348a 100644
--- a/pandas/core/arrays/timedeltas.py
+++ b/pandas/core/arrays/timedeltas.py
@@ -325,7 +325,7 @@ def _unbox_scalar(self, value) -> np.timedelta64:
raise ValueError("'value' should be a Timedelta.")
self._check_compatible_with(value)
if value is NaT:
- return np.timedelta64(value._value, self.unit)
+ return np.timedelta64(value._value, self.unit) # type: ignore[call-overload]
else:
return value.as_unit(self.unit, round_ok=False).asm8
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 6cc28d4e46634..8304af48c39ac 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -875,7 +875,7 @@ def tolist(self) -> list:
>>> idx.to_list()
[1, 2, 3]
"""
- return self._values.tolist()
+ return self._values.tolist() # type: ignore[return-value]
to_list = tolist
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index b520ad69aae96..a1c1163435611 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -2142,7 +2142,7 @@ def _wrap_applied_output_series(
if stacked_values.dtype == object:
# We'll have the DataFrame constructor do inference
- stacked_values = stacked_values.tolist()
+ stacked_values = stacked_values.tolist() # type: ignore[assignment]
result = self.obj._constructor(stacked_values, index=index, columns=columns)
if not self.as_index:
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index d31e50bbd311b..9cfeb53821fbc 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1878,7 +1878,7 @@ def _apply_filter(self, indices, dropna):
mask.fill(False)
mask[indices.astype(int)] = True
# mask fails to broadcast when passed to where; broadcast manually.
- mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
+ mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T # type: ignore[assignment]
filtered = self._selected_obj.where(mask) # Fill with NaNs.
return filtered
@@ -4441,11 +4441,11 @@ def blk_func(values: ArrayLike) -> ArrayLike:
)
if vals.ndim == 1:
- out = out.ravel("K")
+ out = out.ravel("K") # type: ignore[assignment]
if result_mask is not None:
- result_mask = result_mask.ravel("K")
+ result_mask = result_mask.ravel("K") # type: ignore[assignment]
else:
- out = out.reshape(ncols, ngroups * nqs)
+ out = out.reshape(ncols, ngroups * nqs) # type: ignore[assignment]
return post_processor(out, inference, result_mask, orig_vals)
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index c4c7f73ee166c..75f3495041917 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -1131,7 +1131,7 @@ def get_iterator(self, data: NDFrame):
"""
slicer = lambda start, edge: data.iloc[start:edge]
- start = 0
+ start: np.int64 | int = 0
for edge, label in zip(self.bins, self.binlabels):
if label is not NaT:
yield label, slicer(start, edge)
@@ -1144,7 +1144,7 @@ def get_iterator(self, data: NDFrame):
def indices(self):
indices = collections.defaultdict(list)
- i = 0
+ i: np.int64 | int = 0
for label, bin in zip(self.binlabels, self.bins):
if i < bin:
if label is not NaT:
diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py
index 88379164534f2..6fc638e85bc5e 100644
--- a/pandas/core/indexers/objects.py
+++ b/pandas/core/indexers/objects.py
@@ -131,8 +131,8 @@ def get_window_bounds(
if closed in ["left", "neither"]:
end -= 1
- end = np.clip(end, 0, num_values)
- start = np.clip(start, 0, num_values)
+ end = np.clip(end, 0, num_values) # type: ignore[assignment]
+ start = np.clip(start, 0, num_values) # type: ignore[assignment]
return start, end
@@ -402,7 +402,7 @@ def get_window_bounds(
start = np.arange(0, num_values, step, dtype="int64")
end = start + self.window_size
if self.window_size:
- end = np.clip(end, 0, num_values)
+ end = np.clip(end, 0, num_values) # type: ignore[assignment]
return start, end
@@ -488,7 +488,7 @@ def get_window_bounds(
)
window_indices_start += len(indices)
# Extend as we'll be slicing window like [start, end)
- window_indices = np.append(window_indices, [window_indices[-1] + 1]).astype(
+ window_indices = np.append(window_indices, [window_indices[-1] + 1]).astype( # type: ignore[assignment]
np.int64, copy=False
)
start_arrays.append(window_indices.take(ensure_platform_int(start)))
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index 13811c28e6c1e..8c40b630e8cfd 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -1279,14 +1279,7 @@ def interval_range(
breaks = np.linspace(start, end, periods)
if all(is_integer(x) for x in com.not_none(start, end, freq)):
# np.linspace always produces float output
-
- # error: Argument 1 to "maybe_downcast_numeric" has incompatible type
- # "Union[ndarray[Any, Any], TimedeltaIndex, DatetimeIndex]";
- # expected "ndarray[Any, Any]" [
- breaks = maybe_downcast_numeric(
- breaks, # type: ignore[arg-type]
- dtype,
- )
+ breaks = maybe_downcast_numeric(breaks, dtype)
else:
# delegate to the appropriate range function
if isinstance(endpoint, Timestamp):
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index b846af1c83736..98520bf82098e 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -2094,7 +2094,7 @@ def _unstack(
self.values.take(
indices, allow_fill=needs_masking[i], fill_value=fill_value
),
- BlockPlacement(place),
+ BlockPlacement(place), # type: ignore[arg-type]
ndim=2,
)
for i, (indices, place) in enumerate(zip(new_values, new_placement))
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index d098f8d42d3db..35de97d570bd3 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -634,7 +634,7 @@ def reorder_arrays(
arr = np.empty(length, dtype=object)
arr.fill(np.nan)
else:
- arr = arrays[k]
+ arr = arrays[k] # type: ignore[assignment]
new_arrays.append(arr)
arrays = new_arrays
diff --git a/pandas/core/missing.py b/pandas/core/missing.py
index e2fb3b9a6fc0b..66609fa870f14 100644
--- a/pandas/core/missing.py
+++ b/pandas/core/missing.py
@@ -241,7 +241,8 @@ def find_valid_index(how: str, is_valid: npt.NDArray[np.bool_]) -> int | None:
return None
if is_valid.ndim == 2:
- is_valid = is_valid.any(axis=1) # reduce axis 1
+ # reduce axis 1
+ is_valid = is_valid.any(axis=1) # type: ignore[assignment]
if how == "first":
idxpos = is_valid[::].argmax()
@@ -404,10 +405,7 @@ def func(yvalues: np.ndarray) -> None:
**kwargs,
)
- # error: No overload variant of "apply_along_axis" matches
- # argument types "Callable[[ndarray[Any, Any]], None]",
- # "int", "ndarray[Any, Any]"
- np.apply_along_axis(func, axis, data) # type: ignore[call-overload]
+ np.apply_along_axis(func, axis, data)
def _index_to_interp_indices(index: Index, method: str) -> np.ndarray:
diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py
index 6a590ee5b227e..54704b274b74f 100644
--- a/pandas/core/reshape/encoding.py
+++ b/pandas/core/reshape/encoding.py
@@ -357,7 +357,7 @@ def get_empty_frame(data) -> DataFrame:
if drop_first:
# remove first GH12042
- dummy_mat = dummy_mat[:, 1:]
+ dummy_mat = dummy_mat[:, 1:] # type: ignore[assignment]
dummy_cols = dummy_cols[1:]
return DataFrame(dummy_mat, index=index, columns=dummy_cols, dtype=_dtype)
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 09be82c59a5c6..68d61da0cf7dd 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -2921,9 +2921,7 @@ def _convert_arrays_and_get_rizer_klass(
lk = lk.astype(dtype, copy=False)
rk = rk.astype(dtype, copy=False)
if isinstance(lk, BaseMaskedArray):
- # Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]";
- # expected type "Type[object]"
- klass = _factorizers[lk.dtype.type] # type: ignore[index]
+ klass = _factorizers[lk.dtype.type]
elif isinstance(lk.dtype, ArrowDtype):
klass = _factorizers[lk.dtype.numpy_dtype.type]
else:
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 0d8f42694ccb4..18983af12976c 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -476,7 +476,7 @@ def nargminmax(values: ExtensionArray, method: str, axis: AxisInt = 0):
zipped = zip(arr_values, mask)
else:
zipped = zip(arr_values.T, mask.T)
- return np.array([_nanargminmax(v, m, func) for v, m in zipped])
+ return np.array([_nanargminmax(v, m, func) for v, m in zipped]) # type: ignore[arg-type]
return func(arr_values, axis=axis)
return _nanargminmax(arr_values, mask, func)
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index f1be0b41ad7f7..189dfc1dde6aa 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -1495,7 +1495,7 @@ def _format_strings(self) -> list[str]:
fmt_values = values._format_native_types(
na_rep=self.nat_rep, date_format=self.date_format
)
- return fmt_values.tolist()
+ return fmt_values.tolist() # type: ignore[return-value]
class _ExtensionArrayFormatter(_GenericArrayFormatter):
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index e7b5c7f06a79a..547d8c1fe3d19 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -1468,7 +1468,7 @@ def detect_colspecs(
shifted[0] = 0
edges = np.where((mask ^ shifted) == 1)[0]
edge_pairs = list(zip(edges[::2], edges[1::2]))
- return edge_pairs
+ return edge_pairs # type: ignore[return-value]
def __next__(self) -> list[str]:
# Argument 1 to "next" has incompatible type "Union[IO[str],
diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py
index 962f9711d9916..6e343b176b5eb 100644
--- a/pandas/plotting/_matplotlib/style.py
+++ b/pandas/plotting/_matplotlib/style.py
@@ -273,7 +273,7 @@ def _random_color(column: int) -> list[float]:
"""Get a random color represented as a list of length 3"""
# GH17525 use common._random_state to avoid resetting the seed
rs = com.random_state(column)
- return rs.rand(3).tolist()
+ return rs.rand(3).tolist() # type: ignore[return-value]
def _is_single_string_color(color: Color) -> bool:
diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py
index c61cda83cf6e0..a5b22ac30d820 100644
--- a/pandas/tests/dtypes/test_missing.py
+++ b/pandas/tests/dtypes/test_missing.py
@@ -769,8 +769,8 @@ def test_empty_like(self):
np.datetime64("NaT"),
np.timedelta64("NaT"),
]
- + [np.datetime64("NaT", unit) for unit in m8_units]
- + [np.timedelta64("NaT", unit) for unit in m8_units]
+ + [np.datetime64("NaT", unit) for unit in m8_units] # type: ignore[call-overload]
+ + [np.timedelta64("NaT", unit) for unit in m8_units] # type: ignore[call-overload]
)
inf_vals = [
From fa98b691c86e9636539e6f4fc530bb442a446bf6 Mon Sep 17 00:00:00 2001
From: Marc Garcia
Date: Sat, 12 Apr 2025 20:51:17 +0200
Subject: [PATCH 15/27] WEB: Add pandas cookbook 3 to home page (#61279)
---
web/pandas/index.html | 5 +++++
.../static/img/books/pandas_cookbook_3.gif | Bin 0 -> 8897 bytes
2 files changed, 5 insertions(+)
create mode 100644 web/pandas/static/img/books/pandas_cookbook_3.gif
diff --git a/web/pandas/index.html b/web/pandas/index.html
index bbd8632e06840..c520a16b8160f 100644
--- a/web/pandas/index.html
+++ b/web/pandas/index.html
@@ -96,6 +96,11 @@ Recommended books
+
+
+
+
+
diff --git a/web/pandas/static/img/books/pandas_cookbook_3.gif b/web/pandas/static/img/books/pandas_cookbook_3.gif
new file mode 100644
index 0000000000000000000000000000000000000000..aa9d351d489e0b450addda721fb4791a04a1b220
GIT binary patch
literal 8897
[base85-encoded binary literal for pandas_cookbook_3.gif (8897 bytes) omitted]
literal 0
HcmV?d00001
From d45095bfb2bdee32e30491e7f3f20367d63bcec5 Mon Sep 17 00:00:00 2001
From: Marc Garcia
Date: Sun, 13 Apr 2025 17:37:19 +0200
Subject: [PATCH 16/27] WEB: Removed Coiled as a sponsor, and update past
sponsors list (#61278)
---
web/pandas/config.yml | 20 +-
web/pandas/static/img/partners/coiled.svg | 234 ----------------------
2 files changed, 15 insertions(+), 239 deletions(-)
delete mode 100644 web/pandas/static/img/partners/coiled.svg
diff --git a/web/pandas/config.yml b/web/pandas/config.yml
index 679778330b68d..cb5447591dab6 100644
--- a/web/pandas/config.yml
+++ b/web/pandas/config.yml
@@ -146,11 +146,6 @@ sponsors:
url: https://numfocus.org/
logo: static/img/partners/numfocus.svg
kind: numfocus
- - name: "Coiled"
- url: https://www.coiled.io
- logo: static/img/partners/coiled.svg
- kind: partner
- description: "Patrick Hoefler"
- name: "Nvidia"
url: https://www.nvidia.com
logo: static/img/partners/nvidia.svg
@@ -192,5 +187,20 @@ sponsors:
- name: "d-fine GmbH"
url: https://www.d-fine.com/en/
kind: partner
+ - name: "Two Sigma"
+ url: https://www.twosigma.com/
+ kind: partner
+ - name: "Voltron Data"
+ url: https://voltrondata.com/
+ kind: partner
+ - name: "Intel"
+ url: https://www.intel.com/
+ kind: partner
+ - name: "Chan Zuckerberg Initiative"
+ url: https://chanzuckerberg.com/
+ kind: regular
+ - name: "Coiled"
+ url: https://www.coiled.io
+ kind: partner
roadmap:
pdeps_path: pdeps
diff --git a/web/pandas/static/img/partners/coiled.svg b/web/pandas/static/img/partners/coiled.svg
deleted file mode 100644
index 2d76ce150084b..0000000000000
--- a/web/pandas/static/img/partners/coiled.svg
+++ /dev/null
@@ -1,234 +0,0 @@
-
-
-
-
From 9ea412a043c08f3044369494550c05fceca6a47c Mon Sep 17 00:00:00 2001
From: Jack
Date: Sun, 13 Apr 2025 11:58:40 -0600
Subject: [PATCH 17/27] DOC: Updated link for OVH server benchmark
visualization (#61108)
---
web/pandas/community/benchmarks.md | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/web/pandas/community/benchmarks.md b/web/pandas/community/benchmarks.md
index 1e63832a5a2ba..5a8198a979d90 100644
--- a/web/pandas/community/benchmarks.md
+++ b/web/pandas/community/benchmarks.md
@@ -36,9 +36,8 @@ available at the [pandas sponsors]({{ base_url }}about/sponsors.html) page.
Results of the benchmarks are available at:
-- Original server: [asv](https://asv-runner.github.io/asv-collection/pandas/)
-- OVH server: [asv](https://pandas.pydata.org/benchmarks/asv/) (benchmarks results can
- also be visualized in this [Conbench PoC](http://57.128.112.95:5000/)
+- GitHub Actions results: [asv](https://pandas-dev.github.io/asv-runner/)
+- OVH server: [asv](https://pandas.pydata.org/benchmarks/asv/)
### Original server configuration
From a7b9718922135bb473ea6fdf082f4cf0f743a66d Mon Sep 17 00:00:00 2001
From: Marc Garcia
Date: Mon, 14 Apr 2025 15:14:28 +0200
Subject: [PATCH 18/27] API: Rename `arg` to `func` in `Series.map` (#61264)
---
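A brief usage sketch of the rename (sample values and lambdas are illustrative only):

    import pandas as pd

    s = pd.Series([1, 2, 3])

    s.map(lambda x: x * 2)        # positional use is unchanged
    s.map(func=lambda x: x * 2)   # new keyword spelling

    # The old keyword still works for now, but emits a FutureWarning
    # pointing users at `func`.
    s.map(arg=lambda x: x * 2)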
doc/source/whatsnew/v3.0.0.rst | 1 +
pandas/core/series.py | 28 +++++++++++++++++++------
pandas/tests/series/methods/test_map.py | 24 +++++++++++++++++++++
3 files changed, 47 insertions(+), 6 deletions(-)
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 184ca581902ee..8873f7c1a8fe8 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -421,6 +421,7 @@ Other Deprecations
- Deprecated lowercase strings ``w``, ``w-mon``, ``w-tue``, etc. denoting frequencies in :class:`Week` in favour of ``W``, ``W-MON``, ``W-TUE``, etc. (:issue:`58998`)
- Deprecated parameter ``method`` in :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like` (:issue:`58667`)
- Deprecated strings ``w``, ``d``, ``MIN``, ``MS``, ``US`` and ``NS`` denoting units in :class:`Timedelta` in favour of ``W``, ``D``, ``min``, ``ms``, ``us`` and ``ns`` (:issue:`59051`)
+- Deprecated the ``arg`` parameter of ``Series.map``; pass the added ``func`` argument instead. (:issue:`61260`)
- Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead. (:issue:`57063`)
.. ---------------------------------------------------------------------------
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 03a2ce85a08c9..d6a982c65e9fd 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -52,6 +52,9 @@
doc,
set_module,
)
+from pandas.util._exceptions import (
+ find_stack_level,
+)
from pandas.util._validators import (
validate_ascending,
validate_bool_kwarg,
@@ -4320,7 +4323,7 @@ def unstack(
def map(
self,
- arg: Callable | Mapping | Series,
+ func: Callable | Mapping | Series | None = None,
na_action: Literal["ignore"] | None = None,
**kwargs,
) -> Series:
@@ -4333,8 +4336,8 @@ def map(
Parameters
----------
- arg : function, collections.abc.Mapping subclass or Series
- Mapping correspondence.
+ func : function, collections.abc.Mapping subclass or Series
+ Function or mapping correspondence.
na_action : {None, 'ignore'}, default None
If 'ignore', propagate NaN values, without passing them to the
mapping correspondence.
@@ -4404,9 +4407,22 @@ def map(
3 I am a rabbit
dtype: object
"""
- if callable(arg):
- arg = functools.partial(arg, **kwargs)
- new_values = self._map_values(arg, na_action=na_action)
+ if func is None:
+ if "arg" in kwargs:
+ # `.map(arg=my_func)`
+ func = kwargs.pop("arg")
+ warnings.warn(
+ "The parameter `arg` has been renamed to `func`, and it "
+ "will stop being supported in a future version of pandas.",
+ FutureWarning,
+ stacklevel=find_stack_level(),
+ )
+ else:
+ raise ValueError("The `func` parameter is required")
+
+ if callable(func):
+ func = functools.partial(func, **kwargs)
+ new_values = self._map_values(func, na_action=na_action)
return self._constructor(new_values, index=self.index, copy=False).__finalize__(
self, method="map"
)
diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py
index 84b60a2afe6eb..384b7ce3dc985 100644
--- a/pandas/tests/series/methods/test_map.py
+++ b/pandas/tests/series/methods/test_map.py
@@ -604,3 +604,27 @@ def test_map_kwargs():
result = Series([2, 4, 5]).map(lambda x, y: x + y, y=2)
expected = Series([4, 6, 7])
tm.assert_series_equal(result, expected)
+
+
+def test_map_arg_as_kwarg():
+ with tm.assert_produces_warning(
+ FutureWarning, match="`arg` has been renamed to `func`"
+ ):
+ Series([1, 2]).map(arg={})
+
+
+def test_map_func_and_arg():
+    # `arg` is considered a normal kwarg that should be passed to the function
+ result = Series([1, 2]).map(lambda _, arg: arg, arg=3)
+ expected = Series([3, 3])
+ tm.assert_series_equal(result, expected)
+
+
+def test_map_no_func_or_arg():
+ with pytest.raises(ValueError, match="The `func` parameter is required"):
+ Series([1, 2]).map()
+
+
+def test_map_func_is_none():
+ with pytest.raises(ValueError, match="The `func` parameter is required"):
+ Series([1, 2]).map(func=None)
From e176c64b6f28323dbb404ca1b0f38618cf3a57c8 Mon Sep 17 00:00:00 2001
From: ChiLin Chiu
Date: Tue, 15 Apr 2025 00:39:18 +0800
Subject: [PATCH 19/27] BUG: Improve ImportError message to suggest importing
dependencies directly for full error details (#61084)
* Add hint to display full message for missing dependencies in pandas/__init__.py
* ENH: Improve import error handling to preserve original traceback
* TST: refactor testing for hard dependency package
* Update pandas/__init__.py
Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
* Refactor prevent statement too long
* ENH: change unittest to verify ImportError is raised when required dependencies are missing
* TST: Use pytest.raises match parameter in test_missing_required_dependency
---------
Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
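A hedged sketch of the new failure mode, mirroring the reworked test; the mocked import and the chosen dependency name are illustrative:

    import builtins
    import importlib

    real_import = builtins.__import__

    def broken_import(name, *args, **kwargs):
        # Simulate a missing hard dependency.
        if name == "dateutil":
            raise ImportError("Mock error for dateutil")
        return real_import(name, *args, **kwargs)

    builtins.__import__ = broken_import
    try:
        importlib.reload(importlib.import_module("pandas"))
    except ImportError as err:
        # "Unable to import required dependency dateutil. Please see the
        # traceback for details."
        print(err)
    finally:
        builtins.__import__ = real_import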
pandas/__init__.py | 12 ++++-----
pandas/tests/test_downstream.py | 47 ++++++++++-----------------------
2 files changed, 19 insertions(+), 40 deletions(-)
diff --git a/pandas/__init__.py b/pandas/__init__.py
index c570fb8d70204..5dc6a8c3bc50c 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -4,19 +4,17 @@
# Let users know if they're missing any of our hard dependencies
_hard_dependencies = ("numpy", "dateutil")
-_missing_dependencies = []
for _dependency in _hard_dependencies:
try:
__import__(_dependency)
except ImportError as _e: # pragma: no cover
- _missing_dependencies.append(f"{_dependency}: {_e}")
+ raise ImportError(
+ f"Unable to import required dependency {_dependency}. "
+ "Please see the traceback for details."
+ ) from _e
-if _missing_dependencies: # pragma: no cover
- raise ImportError(
- "Unable to import required dependencies:\n" + "\n".join(_missing_dependencies)
- )
-del _hard_dependencies, _dependency, _missing_dependencies
+del _hard_dependencies, _dependency
try:
# numpy compat
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
index 76fad35304fe6..6282aecdfe977 100644
--- a/pandas/tests/test_downstream.py
+++ b/pandas/tests/test_downstream.py
@@ -4,6 +4,7 @@
import array
from functools import partial
+import importlib
import subprocess
import sys
@@ -186,41 +187,21 @@ def test_yaml_dump(df):
tm.assert_frame_equal(df, loaded2)
-@pytest.mark.single_cpu
-def test_missing_required_dependency():
- # GH 23868
- # To ensure proper isolation, we pass these flags
- # -S : disable site-packages
- # -s : disable user site-packages
- # -E : disable PYTHON* env vars, especially PYTHONPATH
- # https://github.com/MacPython/pandas-wheels/pull/50
-
- pyexe = sys.executable.replace("\\", "/")
-
- # We skip this test if pandas is installed as a site package. We first
- # import the package normally and check the path to the module before
- # executing the test which imports pandas with site packages disabled.
- call = [pyexe, "-c", "import pandas;print(pandas.__file__)"]
- output = subprocess.check_output(call).decode()
- if "site-packages" in output:
- pytest.skip("pandas installed as site package")
-
- # This test will fail if pandas is installed as a site package. The flags
- # prevent pandas being imported and the test will report Failed: DID NOT
- # RAISE
- call = [pyexe, "-sSE", "-c", "import pandas"]
-
- msg = (
- rf"Command '\['{pyexe}', '-sSE', '-c', 'import pandas'\]' "
- "returned non-zero exit status 1."
- )
+@pytest.mark.parametrize("dependency", ["numpy", "dateutil"])
+def test_missing_required_dependency(monkeypatch, dependency):
+ # GH#61030
+ original_import = __import__
+ mock_error = ImportError(f"Mock error for {dependency}")
+
+ def mock_import(name, *args, **kwargs):
+ if name == dependency:
+ raise mock_error
+ return original_import(name, *args, **kwargs)
- with pytest.raises(subprocess.CalledProcessError, match=msg) as exc:
- subprocess.check_output(call, stderr=subprocess.STDOUT)
+ monkeypatch.setattr("builtins.__import__", mock_import)
- output = exc.value.stdout.decode()
- for name in ["numpy", "dateutil"]:
- assert name in output
+ with pytest.raises(ImportError, match=dependency):
+ importlib.reload(importlib.import_module("pandas"))
def test_frame_setitem_dask_array_into_new_col(request):
From 2183651b61259f5aea841a3ccc2b48ec8cb43d82 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com>
Date: Mon, 14 Apr 2025 12:44:07 -0400
Subject: [PATCH 20/27] PERF: future_stack=True with non-MultiIndex columns
(#58817)
* PERF: stack on non-MultiIndex columns
* WIP
* Use reshape instead of ravel
* arrays -> blocks
* Update test
* whatsnew
---
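A small illustration of when the fast path applies, a homogeneous frame with a plain (non-MultiIndex) column Index; the data is made up:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, dtype="int64")

    # Columns are a plain Index and the frame is homogeneous, so the values
    # are flattened directly (Fortran order) and re-labelled, skipping the
    # per-column .loc path used for MultiIndex columns.
    stacked = df.stack(future_stack=True)
    # 0  a    1
    #    b    3
    # 1  a    2
    #    b    4
    # dtype: int64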
doc/source/whatsnew/v3.0.0.rst | 1 +
pandas/core/reshape/reshape.py | 37 +++++++++++++++---------
pandas/tests/extension/base/reshaping.py | 10 ++++++-
3 files changed, 34 insertions(+), 14 deletions(-)
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 8873f7c1a8fe8..46af55cedf01e 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -623,6 +623,7 @@ Performance improvements
- Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`)
- Performance improvement in :meth:`DataFrame.__getitem__` when ``key`` is a :class:`DataFrame` with many columns (:issue:`61010`)
- Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`)
+- Performance improvement in :meth:`DataFrame.stack` when using ``future_stack=True`` and the DataFrame does not have a :class:`MultiIndex` (:issue:`58391`)
- Performance improvement in :meth:`DataFrame.where` when ``cond`` is a :class:`DataFrame` with many columns (:issue:`61010`)
- Performance improvement in :meth:`to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . (:issue:`58248`)
- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index c60fe71a7ff28..d2a838b616426 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -936,7 +936,20 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
[k for k in range(frame.columns.nlevels - 1, -1, -1) if k not in set_levels]
)
- result = stack_reshape(frame, level, set_levels, stack_cols)
+ result: Series | DataFrame
+ if not isinstance(frame.columns, MultiIndex):
+ # GH#58817 Fast path when we're stacking the columns of a non-MultiIndex.
+ # When columns are homogeneous EAs, we pass through object
+ # dtype but this is still slightly faster than the normal path.
+ if len(frame.columns) > 0 and frame._is_homogeneous_type:
+ dtype = frame._mgr.blocks[0].dtype
+ else:
+ dtype = None
+ result = frame._constructor_sliced(
+ frame._values.reshape(-1, order="F"), dtype=dtype
+ )
+ else:
+ result = stack_reshape(frame, level, set_levels, stack_cols)
# Construct the correct MultiIndex by combining the frame's index and
# stacked columns.
@@ -1018,6 +1031,8 @@ def stack_reshape(
-------
The data of behind the stacked DataFrame.
"""
+    # non-MultiIndex takes a fast path.
+ assert isinstance(frame.columns, MultiIndex)
# If we need to drop `level` from columns, it needs to be in descending order
drop_levnums = sorted(level, reverse=True)
@@ -1027,18 +1042,14 @@ def stack_reshape(
if len(frame.columns) == 1:
data = frame.copy(deep=False)
else:
- if not isinstance(frame.columns, MultiIndex) and not isinstance(idx, tuple):
- # GH#57750 - if the frame is an Index with tuples, .loc below will fail
- column_indexer = idx
- else:
- # Take the data from frame corresponding to this idx value
- if len(level) == 1:
- idx = (idx,)
- gen = iter(idx)
- column_indexer = tuple(
- next(gen) if k in set_levels else slice(None)
- for k in range(frame.columns.nlevels)
- )
+ # Take the data from frame corresponding to this idx value
+ if len(level) == 1:
+ idx = (idx,)
+ gen = iter(idx)
+ column_indexer = tuple(
+ next(gen) if k in set_levels else slice(None)
+ for k in range(frame.columns.nlevels)
+ )
data = frame.loc[:, column_indexer]
if len(level) < frame.columns.nlevels:
diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py
index 2915c0585f373..a760cbc3995b3 100644
--- a/pandas/tests/extension/base/reshaping.py
+++ b/pandas/tests/extension/base/reshaping.py
@@ -3,6 +3,8 @@
import numpy as np
import pytest
+from pandas.core.dtypes.dtypes import NumpyEADtype
+
import pandas as pd
import pandas._testing as tm
from pandas.api.extensions import ExtensionArray
@@ -266,7 +268,13 @@ def test_stack(self, data, columns, future_stack):
expected = expected.astype(object)
if isinstance(expected, pd.Series):
- assert result.dtype == df.iloc[:, 0].dtype
+ if future_stack and isinstance(data.dtype, NumpyEADtype):
+ # GH#58817 future_stack=True constructs the result specifying the dtype
+ # using the dtype of the input; we thus get the underlying
+ # NumPy dtype as the result instead of the NumpyExtensionArray
+ assert result.dtype == df.iloc[:, 0].to_numpy().dtype
+ else:
+ assert result.dtype == df.iloc[:, 0].dtype
else:
assert all(result.dtypes == df.iloc[:, 0].dtype)
From 1944d787e27da343f947239362de7c2fd295c625 Mon Sep 17 00:00:00 2001
From: Pedro Marques
Date: Mon, 14 Apr 2025 17:58:58 +0100
Subject: [PATCH 21/27] BUG: OverflowError when fillna on DataFrame with a
pd.Timestamp (#61208) (#61216)
* Fix #61208: OverflowError when fillna on DataFrame with a pd.Timestamp
- Now correctly raises OutOfBoundsDatetime
- Added test_fillna_out_of_bounds_datetime()
* Comply with pre-commit and added an entry in v3.0.0.rst
* Removed flag 'inplace=True' from test and fixed the bug for this case.
---
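The behavior change in a nutshell (values mirror the new test):

    import pandas as pd
    from pandas.errors import OutOfBoundsDatetime

    df = pd.DataFrame(
        {"datetime": pd.date_range("1/1/2011", periods=3, freq="h"), "value": [1, 2, 3]}
    )
    df.iloc[0, 0] = None

    try:
        # Filling a datetime64[ns] column with a timestamp outside its range
        # now surfaces as OutOfBoundsDatetime instead of an AssertionError.
        df.fillna(pd.Timestamp("0001-01-01"))
    except OutOfBoundsDatetime as err:
        print(err)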
doc/source/whatsnew/v3.0.0.rst | 1 +
pandas/core/internals/blocks.py | 4 ++++
pandas/tests/frame/methods/test_fillna.py | 14 ++++++++++++++
3 files changed, 19 insertions(+)
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 46af55cedf01e..2f07105a62edd 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -651,6 +651,7 @@ Datetimelike
- Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`)
- Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
- Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`)
+- Bug in :meth:`DataFrame.fillna` raising an ``AssertionError`` instead of ``OutOfBoundsDatetime`` when filling a ``datetime64[ns]`` column with an out-of-bounds timestamp. Now correctly raises ``OutOfBoundsDatetime``. (:issue:`61208`)
- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` casting ``datetime64`` and ``timedelta64`` columns to ``float64`` and losing precision (:issue:`60850`)
- Bug in :meth:`Dataframe.agg` with df with missing values resulting in IndexError (:issue:`58810`)
- Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on Custom business days frequencies bigger then "1C" (:issue:`58664`)
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 98520bf82098e..6aa5062b8ed86 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -1679,6 +1679,8 @@ def where(self, other, cond) -> list[Block]:
try:
res_values = arr._where(cond, other).T
+ except OutOfBoundsDatetime:
+ raise
except (ValueError, TypeError):
if self.ndim == 1 or self.shape[0] == 1:
if isinstance(self.dtype, (IntervalDtype, StringDtype)):
@@ -1746,6 +1748,8 @@ def putmask(self, mask, new) -> list[Block]:
try:
# Caller is responsible for ensuring matching lengths
values._putmask(mask, new)
+ except OutOfBoundsDatetime:
+ raise
except (TypeError, ValueError):
if self.ndim == 1 or self.shape[0] == 1:
if isinstance(self.dtype, IntervalDtype):
diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py
index 67d1d45af1cb3..8915d6f205d65 100644
--- a/pandas/tests/frame/methods/test_fillna.py
+++ b/pandas/tests/frame/methods/test_fillna.py
@@ -1,6 +1,8 @@
import numpy as np
import pytest
+from pandas.errors import OutOfBoundsDatetime
+
from pandas import (
Categorical,
DataFrame,
@@ -781,3 +783,15 @@ def test_fillna_with_none_object(test_frame, dtype):
if test_frame:
expected = expected.to_frame()
tm.assert_equal(result, expected)
+
+
+def test_fillna_out_of_bounds_datetime():
+ # GH#61208
+ df = DataFrame(
+ {"datetime": date_range("1/1/2011", periods=3, freq="h"), "value": [1, 2, 3]}
+ )
+ df.iloc[0, 0] = None
+
+ msg = "Cannot cast 0001-01-01 00:00:00 to unit='ns' without overflow"
+ with pytest.raises(OutOfBoundsDatetime, match=msg):
+ df.fillna(Timestamp("0001-01-01"))
From bf8f1cd4e5d32340ccd2315e61994ad8e00aaca8 Mon Sep 17 00:00:00 2001
From: Robin Mader <128372203+robin-mader-bis@users.noreply.github.com>
Date: Mon, 14 Apr 2025 19:00:04 +0200
Subject: [PATCH 22/27] BUG: Fix pyarrow categoricals not working for pivot and
multiindex (#61193)
* BUG: Fix bug with DataFrame.pivot and .set_index not compatible with pyarrow dictionary categoricals
Relates to #53051
Code for fix taken and adapted from #59099
* TST: Add tests for faulty behavior relating to pyarrow categoricals
* CLN: Fix issues reported by pre-commit hooks
* TST: Fix failing tests for minimum version by ignoring obsolete deprecation warning
* DOC: Add entry for bugfix to whatsnew v3.0.0
* CLN: Refactor code and clean up according to PR feedback
* CLN: Refactor code and clean up according to PR feedback
* CLN: Refactor tests to address PR feedback
* CLN: Refactor tests to address PR feedback
---
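A minimal reproduction of the two operations the fix unblocks, adapted from the new tests (requires pyarrow):

    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame(
        {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]}
    ).astype(
        {
            "string_column": pd.ArrowDtype(pa.dictionary(pa.int32(), pa.string())),
            "number_column": "float[pyarrow]",
        }
    )

    # Both previously raised ArrowNotImplementedError for the dictionary column.
    pivoted = df.pivot(columns=["string_column"], values=["number_column"])
    indexed = df.set_index(["string_column", "number_column"])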
doc/source/whatsnew/v3.0.0.rst | 1 +
pandas/core/arrays/categorical.py | 2 +-
pandas/tests/reshape/test_pivot.py | 29 +++++++++++++++++++++++++++++
pandas/tests/test_multilevel.py | 29 +++++++++++++++++++++++++++++
4 files changed, 60 insertions(+), 1 deletion(-)
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 2f07105a62edd..3294af742fae0 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -639,6 +639,7 @@ Bug fixes
Categorical
^^^^^^^^^^^
- Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`)
+- Bug in :meth:`DataFrame.pivot` and :meth:`DataFrame.set_index` raising an ``ArrowNotImplementedError`` for columns with pyarrow dictionary dtype (:issue:`53051`)
- Bug in :meth:`Series.convert_dtypes` with ``dtype_backend="pyarrow"`` where empty :class:`CategoricalDtype` :class:`Series` raised an error or got converted to ``null[pyarrow]`` (:issue:`59934`)
-
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index e5c5716165e2f..df1aa21e9203c 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -452,7 +452,7 @@ def __init__(
if isinstance(values, Index):
arr = values._data._pa_array.combine_chunks()
else:
- arr = values._pa_array.combine_chunks()
+ arr = extract_array(values)._pa_array.combine_chunks()
categories = arr.dictionary.to_pandas(types_mapper=ArrowDtype)
codes = arr.indices.to_numpy()
dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered)
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 46eee13755b2d..614200ae5b7c2 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -15,6 +15,7 @@
import pandas as pd
from pandas import (
+ ArrowDtype,
Categorical,
DataFrame,
Grouper,
@@ -2851,3 +2852,31 @@ def test_pivot_margins_with_none_index(self):
),
)
tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
+ def test_pivot_with_pyarrow_categorical(self):
+ # GH#53051
+ pa = pytest.importorskip("pyarrow")
+
+ df = DataFrame(
+ {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]}
+ ).astype(
+ {
+ "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())),
+ "number_column": "float[pyarrow]",
+ }
+ )
+
+ df = df.pivot(columns=["string_column"], values=["number_column"])
+
+ multi_index = MultiIndex.from_arrays(
+ [["number_column", "number_column", "number_column"], ["A", "B", "C"]],
+ names=(None, "string_column"),
+ )
+ df_expected = DataFrame(
+ [[1.0, np.nan, np.nan], [np.nan, 2.0, np.nan], [np.nan, np.nan, 3.0]],
+ columns=multi_index,
+ )
+ tm.assert_frame_equal(
+ df, df_expected, check_dtype=False, check_column_type=False
+ )
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index a23e6d9b3973a..ff7ab22c197d8 100644
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -5,6 +5,7 @@
import pandas as pd
from pandas import (
+ ArrowDtype,
DataFrame,
MultiIndex,
Series,
@@ -318,6 +319,34 @@ def test_multiindex_dt_with_nan(self):
expected = Series(["a", "b", "c", "d"], name=("sub", np.nan))
tm.assert_series_equal(result, expected)
+ @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
+ def test_multiindex_with_pyarrow_categorical(self):
+ # GH#53051
+ pa = pytest.importorskip("pyarrow")
+
+ df = DataFrame(
+ {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]}
+ ).astype(
+ {
+ "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())),
+ "number_column": "float[pyarrow]",
+ }
+ )
+
+ df = df.set_index(["string_column", "number_column"])
+
+ df_expected = DataFrame(
+ index=MultiIndex.from_arrays(
+ [["A", "B", "C"], [1, 2, 3]], names=["string_column", "number_column"]
+ )
+ )
+ tm.assert_frame_equal(
+ df,
+ df_expected,
+ check_index_type=False,
+ check_column_type=False,
+ )
+
class TestSorted:
"""everything you wanted to test about sorting"""
From 4b14508f86b1ba3959094c4dd5f8698b642c26f2 Mon Sep 17 00:00:00 2001
From: Arthur Laureus Wigo <126365160+arthurlw@users.noreply.github.com>
Date: Mon, 14 Apr 2025 13:05:34 -0700
Subject: [PATCH 23/27] DOC: Add documentation for `groupby.ewm()` (#61283)
* updated doc and references
* precommit
* shortened summary
* updated according to reviewer suggestions and removed test.py
* Fixed docstring error
---
doc/source/reference/groupby.rst | 2 +
pandas/core/groupby/groupby.py | 68 +++++++++++++++++++++++++++++++-
2 files changed, 68 insertions(+), 2 deletions(-)
diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst
index fc180c8161a7e..ce9aeeb358c19 100644
--- a/doc/source/reference/groupby.rst
+++ b/doc/source/reference/groupby.rst
@@ -79,6 +79,7 @@ Function application
DataFrameGroupBy.cumsum
DataFrameGroupBy.describe
DataFrameGroupBy.diff
+ DataFrameGroupBy.ewm
DataFrameGroupBy.ffill
DataFrameGroupBy.first
DataFrameGroupBy.head
@@ -130,6 +131,7 @@ Function application
SeriesGroupBy.cumsum
SeriesGroupBy.describe
SeriesGroupBy.diff
+ SeriesGroupBy.ewm
SeriesGroupBy.ffill
SeriesGroupBy.first
SeriesGroupBy.head
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 9cfeb53821fbc..2cb523d2f2f55 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -3824,15 +3824,79 @@ def expanding(self, *args, **kwargs) -> ExpandingGroupby:
)
@final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby:
"""
Return an ewm grouper, providing ewm functionality per group.
+ Parameters
+ ----------
+ *args : tuple
+ Positional arguments passed to the EWM window constructor.
+ **kwargs : dict
+ Keyword arguments passed to the EWM window constructor, such as:
+
+ com : float, optional
+ Specify decay in terms of center of mass.
+ ``span``, ``halflife``, and ``alpha`` are alternative ways to specify
+ decay.
+ span : float, optional
+ Specify decay in terms of span.
+ halflife : float, optional
+ Specify decay in terms of half-life.
+ alpha : float, optional
+ Specify smoothing factor directly.
+ min_periods : int, default 0
+ Minimum number of observations in the window required to have a value;
+ otherwise, result is ``np.nan``.
+ adjust : bool, default True
+ Divide by decaying adjustment factor to account for imbalance in
+ relative weights.
+ ignore_na : bool, default False
+ Ignore missing values when calculating weights.
+ times : str or array-like of datetime64, optional
+ Times corresponding to the observations.
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ Axis along which the EWM function is applied.
+
Returns
-------
pandas.api.typing.ExponentialMovingWindowGroupby
+ An object that supports exponentially weighted moving transformations over
+ each group.
+
+ See Also
+ --------
+ Series.ewm : EWM transformations for Series.
+ DataFrame.ewm : EWM transformations for DataFrames.
+ Series.groupby : Apply a function groupby to a Series.
+ DataFrame.groupby : Apply a function groupby.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(
+ ... {
+ ... "Class": ["A", "A", "A", "B", "B", "B"],
+ ... "Value": [10, 20, 30, 40, 50, 60],
+ ... }
+ ... )
+ >>> df
+ Class Value
+ 0 A 10
+ 1 A 20
+ 2 A 30
+ 3 B 40
+ 4 B 50
+ 5 B 60
+
+ >>> df.groupby("Class").ewm(com=0.5).mean()
+ Value
+ Class
+ A 0 10.000000
+ 1 17.500000
+ 2 26.153846
+ B 3 40.000000
+ 4 47.500000
+ 5 56.153846
"""
from pandas.core.window import ExponentialMovingWindowGroupby
From d1b0c6d4dc064711680bfeb278353cbaf354cd7a Mon Sep 17 00:00:00 2001
From: Arthur Laureus Wigo <126365160+arthurlw@users.noreply.github.com>
Date: Mon, 14 Apr 2025 15:41:29 -0700
Subject: [PATCH 24/27] DOC: Add documentation for `groupby.expanding()`
(#61274)
---
doc/source/reference/groupby.rst | 2 ++
pandas/core/groupby/groupby.py | 50 +++++++++++++++++++++++++++++---
2 files changed, 48 insertions(+), 4 deletions(-)
diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst
index ce9aeeb358c19..004651ac0074f 100644
--- a/doc/source/reference/groupby.rst
+++ b/doc/source/reference/groupby.rst
@@ -80,6 +80,7 @@ Function application
DataFrameGroupBy.describe
DataFrameGroupBy.diff
DataFrameGroupBy.ewm
+ DataFrameGroupBy.expanding
DataFrameGroupBy.ffill
DataFrameGroupBy.first
DataFrameGroupBy.head
@@ -132,6 +133,7 @@ Function application
SeriesGroupBy.describe
SeriesGroupBy.diff
SeriesGroupBy.ewm
+ SeriesGroupBy.expanding
SeriesGroupBy.ffill
SeriesGroupBy.first
SeriesGroupBy.head
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 2cb523d2f2f55..7d58d8f867c12 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -3803,16 +3803,58 @@ def rolling(
)
@final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
def expanding(self, *args, **kwargs) -> ExpandingGroupby:
"""
- Return an expanding grouper, providing expanding
- functionality per group.
+ Return an expanding grouper, providing expanding functionality per group.
+
+ Arguments are the same as :meth:`DataFrame.rolling` except that ``step`` cannot
+ be specified.
+
+ Parameters
+ ----------
+ *args : tuple
+ Positional arguments passed to the expanding window constructor.
+ **kwargs : dict
+ Keyword arguments passed to the expanding window constructor.
Returns
-------
pandas.api.typing.ExpandingGroupby
+ An object that supports expanding transformations over each group.
+
+ See Also
+ --------
+ Series.expanding : Expanding transformations for Series.
+ DataFrame.expanding : Expanding transformations for DataFrames.
+ Series.groupby : Apply a function groupby to a Series.
+ DataFrame.groupby : Apply a function groupby to each row or column of a DataFrame.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(
+ ... {
+ ... "Class": ["A", "A", "A", "B", "B", "B"],
+ ... "Value": [10, 20, 30, 40, 50, 60],
+ ... }
+ ... )
+ >>> df
+ Class Value
+ 0 A 10
+ 1 A 20
+ 2 A 30
+ 3 B 40
+ 4 B 50
+ 5 B 60
+
+ >>> df.groupby("Class").expanding().mean()
+ Value
+ Class
+ A 0 10.0
+ 1 15.0
+ 2 20.0
+ B 3 40.0
+ 4 45.0
+ 5 50.0
"""
from pandas.core.window import ExpandingGroupby
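As context for the new doctest, an expanding mean within each group is simply the running (cumulative) mean of that group; a rough cross-check, not part of the patch, reproduces the same numbers without the ExpandingGroupby machinery:

import pandas as pd

df = pd.DataFrame(
    {"Class": ["A", "A", "A", "B", "B", "B"], "Value": [10, 20, 30, 40, 50, 60]}
)
g = df.groupby("Class")["Value"]
manual = g.cumsum() / (g.cumcount() + 1)    # running mean per group
print(manual.tolist())                      # [10.0, 15.0, 20.0, 40.0, 45.0, 50.0]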
From 85aebde3b4e27c2f1f8dd32768f79dcc89ed263c Mon Sep 17 00:00:00 2001
From: Jeff Harrison
Date: Mon, 14 Apr 2025 18:59:23 -0600
Subject: [PATCH 25/27] Bug: Save original index and remap after function
completes (#61116)
* Save original index and remap after function completes.
* precommit passes
* use stable sorting 'mergesort' in tests
* Change sorts to `stable` instead of mergesort
* modify 'keep' to use a Literal instead of string
* address comments
* update doc to include stable sort change
---
doc/source/whatsnew/v3.0.0.rst | 2 +
pandas/core/methods/selectn.py | 43 +++++++++++++++------
pandas/tests/frame/methods/test_nlargest.py | 4 +-
3 files changed, 36 insertions(+), 13 deletions(-)
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 3294af742fae0..2d74be6f503a2 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -61,6 +61,7 @@ Other enhancements
- :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
+- :meth:`Series.nlargest` now uses a 'stable' sort internally, preserving the original ordering of tied values.
- :class:`ArrowDtype` now supports ``pyarrow.JsonType`` (:issue:`60958`)
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
- :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
@@ -593,6 +594,7 @@ Performance improvements
- :func:`concat` returns a :class:`RangeIndex` column when possible when ``objs`` contains :class:`Series` and :class:`DataFrame` and ``axis=0`` (:issue:`58119`)
- :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`)
- :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`)
+- :meth:`Series.nlargest` has improved performance when there are duplicate values in the index (:issue:`55767`)
- :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`)
- :meth:`Series.str.partition` with :class:`ArrowDtype` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57768`)
- Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`)
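An illustration of the 'stable' ordering note above (illustrative only, not from the patch): tied values come back in their original row order, the same order a stable sort would produce.

import pandas as pd

s = pd.Series([3, 1, 3, 2], index=["a", "b", "c", "d"])
print(s.nlargest(3).index.tolist())
# expected: ['a', 'c', 'd'] -- the tie at 3 keeps 'a' before 'c', matching
# s.sort_values(ascending=False, kind="stable").head(3)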
diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py
index 02e7445f1d275..59516b16905dc 100644
--- a/pandas/core/methods/selectn.py
+++ b/pandas/core/methods/selectn.py
@@ -11,6 +11,7 @@
from typing import (
TYPE_CHECKING,
Generic,
+ Literal,
cast,
final,
)
@@ -54,7 +55,9 @@
class SelectN(Generic[NDFrameT]):
- def __init__(self, obj: NDFrameT, n: int, keep: str) -> None:
+ def __init__(
+ self, obj: NDFrameT, n: int, keep: Literal["first", "last", "all"]
+ ) -> None:
self.obj = obj
self.n = n
self.keep = keep
@@ -111,15 +114,25 @@ def compute(self, method: str) -> Series:
if n <= 0:
return self.obj[[]]
- dropped = self.obj.dropna()
- nan_index = self.obj.drop(dropped.index)
+ # Save the original index and reset to a default index to avoid the
+ # performance impact of an index that contains duplicates
+ original_index: Index = self.obj.index
+ default_index = self.obj.reset_index(drop=True)
- # slow method
- if n >= len(self.obj):
+ # Slower method used when taking the full length of the series
+ # In this case, it is equivalent to a sort.
+ if n >= len(default_index):
ascending = method == "nsmallest"
- return self.obj.sort_values(ascending=ascending).head(n)
+ result = default_index.sort_values(ascending=ascending, kind="stable").head(
+ n
+ )
+ result.index = original_index.take(result.index)
+ return result
+
+ # Fast method used in the general case
+ dropped = default_index.dropna()
+ nan_index = default_index.drop(dropped.index)
- # fast method
new_dtype = dropped.dtype
# Similar to algorithms._ensure_data
@@ -158,7 +171,7 @@ def compute(self, method: str) -> Series:
else:
kth_val = np.nan
(ns,) = np.nonzero(arr <= kth_val)
- inds = ns[arr[ns].argsort(kind="mergesort")]
+ inds = ns[arr[ns].argsort(kind="stable")]
if self.keep != "all":
inds = inds[:n]
@@ -173,7 +186,9 @@ def compute(self, method: str) -> Series:
# reverse indices
inds = narr - 1 - inds
- return concat([dropped.iloc[inds], nan_index]).iloc[:findex]
+ result = concat([dropped.iloc[inds], nan_index]).iloc[:findex]
+ result.index = original_index.take(result.index)
+ return result
class SelectNFrame(SelectN[DataFrame]):
@@ -192,7 +207,13 @@ class SelectNFrame(SelectN[DataFrame]):
nordered : DataFrame
"""
- def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None:
+ def __init__(
+ self,
+ obj: DataFrame,
+ n: int,
+ keep: Literal["first", "last", "all"],
+ columns: IndexLabel,
+ ) -> None:
super().__init__(obj, n, keep)
if not is_list_like(columns) or isinstance(columns, tuple):
columns = [columns]
@@ -277,4 +298,4 @@ def get_indexer(current_indexer: Index, other_indexer: Index) -> Index:
ascending = method == "nsmallest"
- return frame.sort_values(columns, ascending=ascending, kind="mergesort")
+ return frame.sort_values(columns, ascending=ascending, kind="stable")
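The core of the change above is an index round-trip: drop the (possibly duplicated) labels, do the positional work on a default RangeIndex, then map positions back to the original labels with ``Index.take``. A minimal sketch of that pattern, assuming a small Series with duplicate labels:

import pandas as pd

obj = pd.Series([30, 10, 20], index=["x", "x", "y"])     # duplicate labels
original_index = obj.index
default = obj.reset_index(drop=True)                     # positions 0, 1, 2
picked = default.sort_values(kind="stable").head(2)      # work positionally
picked.index = original_index.take(picked.index)         # restore labels
print(picked)                                            # x 10, y 20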
diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py
index c6e5304ae3cb4..08b7128e6ec11 100644
--- a/pandas/tests/frame/methods/test_nlargest.py
+++ b/pandas/tests/frame/methods/test_nlargest.py
@@ -153,11 +153,11 @@ def test_nlargest_n_duplicate_index(self, n, order, request):
index=[0, 0, 1, 1, 1],
)
result = df.nsmallest(n, order)
- expected = df.sort_values(order).head(n)
+ expected = df.sort_values(order, kind="stable").head(n)
tm.assert_frame_equal(result, expected)
result = df.nlargest(n, order)
- expected = df.sort_values(order, ascending=False).head(n)
+ expected = df.sort_values(order, ascending=False, kind="stable").head(n)
if Version(np.__version__) >= Version("1.25") and (
(order == ["a"] and n in (1, 2, 3, 4)) or ((order == ["a", "b"]) and n == 5)
):
From 91b7a439da38675726959f01ed298d5aaa0d1a8d Mon Sep 17 00:00:00 2001
From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com>
Date: Wed, 16 Apr 2025 10:25:08 -0400
Subject: [PATCH 26/27] ENH(string dtype): fallback for HDF5 with UTF-8
surrogates (#60993)
---
pandas/io/pytables.py | 114 ++++++++++++++++++-------
pandas/tests/io/pytables/test_store.py | 21 ++---
2 files changed, 96 insertions(+), 39 deletions(-)
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index b83b5aba3cf13..c58b4a4be6df1 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -39,6 +39,7 @@
)
from pandas._libs.lib import is_string_array
from pandas._libs.tslibs import timezones
+from pandas.compat import HAS_PYARROW
from pandas.compat._optional import import_optional_dependency
from pandas.compat.pickle_compat import patch_pickle
from pandas.errors import (
@@ -381,6 +382,13 @@ def read_hdf(
DataFrame.to_hdf : Write a HDF file from a DataFrame.
HDFStore : Low-level access to HDF files.
+ Notes
+ -----
+ When ``errors="surrogatepass"`` is specified, ``pd.options.future.infer_string`` is true,
+ and PyArrow is installed, if a UTF-16 surrogate is encountered when decoding
+ to UTF-8, the resulting dtype will be
+ ``pd.StringDtype(storage="python", na_value=np.nan)``.
+
Examples
--------
>>> df = pd.DataFrame([[1, 1.0, "a"]], columns=["x", "y", "z"]) # doctest: +SKIP
@@ -2257,6 +2265,20 @@ def convert(
# making an Index instance could throw a number of different errors
try:
new_pd_index = factory(values, **kwargs)
+ except UnicodeEncodeError as err:
+ if (
+ errors == "surrogatepass"
+ and get_option("future.infer_string")
+ and str(err).endswith("surrogates not allowed")
+ and HAS_PYARROW
+ ):
+ new_pd_index = factory(
+ values,
+ dtype=StringDtype(storage="python", na_value=np.nan),
+ **kwargs,
+ )
+ else:
+ raise
except ValueError:
# if the output freq is different that what we recorded,
# it should be None (see also 'doc example part 2')
@@ -3170,12 +3192,29 @@ def read_index_node(
**kwargs,
)
else:
- index = factory(
- _unconvert_index(
- data, kind, encoding=self.encoding, errors=self.errors
- ),
- **kwargs,
- )
+ try:
+ index = factory(
+ _unconvert_index(
+ data, kind, encoding=self.encoding, errors=self.errors
+ ),
+ **kwargs,
+ )
+ except UnicodeEncodeError as err:
+ if (
+ self.errors == "surrogatepass"
+ and get_option("future.infer_string")
+ and str(err).endswith("surrogates not allowed")
+ and HAS_PYARROW
+ ):
+ index = factory(
+ _unconvert_index(
+ data, kind, encoding=self.encoding, errors=self.errors
+ ),
+ dtype=StringDtype(storage="python", na_value=np.nan),
+ **kwargs,
+ )
+ else:
+ raise
index.name = name
@@ -3311,13 +3350,24 @@ def read(
self.validate_read(columns, where)
index = self.read_index("index", start=start, stop=stop)
values = self.read_array("values", start=start, stop=stop)
- result = Series(values, index=index, name=self.name, copy=False)
- if (
- using_string_dtype()
- and isinstance(values, np.ndarray)
- and is_string_array(values, skipna=True)
- ):
- result = result.astype(StringDtype(na_value=np.nan))
+ try:
+ result = Series(values, index=index, name=self.name, copy=False)
+ except UnicodeEncodeError as err:
+ if (
+ self.errors == "surrogatepass"
+ and get_option("future.infer_string")
+ and str(err).endswith("surrogates not allowed")
+ and HAS_PYARROW
+ ):
+ result = Series(
+ values,
+ index=index,
+ name=self.name,
+ copy=False,
+ dtype=StringDtype(storage="python", na_value=np.nan),
+ )
+ else:
+ raise
return result
def write(self, obj, **kwargs) -> None:
@@ -4764,7 +4814,24 @@ def read(
values = values.reshape((1, values.shape[0]))
if isinstance(values, (np.ndarray, DatetimeArray)):
- df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
+ try:
+ df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
+ except UnicodeEncodeError as err:
+ if (
+ self.errors == "surrogatepass"
+ and get_option("future.infer_string")
+ and str(err).endswith("surrogates not allowed")
+ and HAS_PYARROW
+ ):
+ df = DataFrame(
+ values.T,
+ columns=cols_,
+ index=index_,
+ copy=False,
+ dtype=StringDtype(storage="python", na_value=np.nan),
+ )
+ else:
+ raise
elif isinstance(values, Index):
df = DataFrame(values, columns=cols_, index=index_)
else:
@@ -4774,23 +4841,10 @@ def read(
assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
# If str / string dtype is stored in meta, use that.
- converted = False
for column in cols_:
dtype = getattr(self.table.attrs, f"{column}_meta", None)
if dtype in ["str", "string"]:
df[column] = df[column].astype(dtype)
- converted = True
- # Otherwise try inference.
- if (
- not converted
- and using_string_dtype()
- and isinstance(values, np.ndarray)
- and is_string_array(
- values,
- skipna=True,
- )
- ):
- df = df.astype(StringDtype(na_value=np.nan))
frames.append(df)
if len(frames) == 1:
@@ -5224,7 +5278,7 @@ def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.nd
# encode if needed
if len(data):
data = (
- Series(data.ravel(), copy=False)
+ Series(data.ravel(), copy=False, dtype="object")
.str.encode(encoding, errors)
._values.reshape(data.shape)
)
@@ -5264,7 +5318,9 @@ def _unconvert_string_array(
dtype = f"U{itemsize}"
if isinstance(data[0], bytes):
- ser = Series(data, copy=False).str.decode(encoding, errors=errors)
+ ser = Series(data, copy=False).str.decode(
+ encoding, errors=errors, dtype="object"
+ )
data = ser.to_numpy()
data.flags.writeable = True
else:
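A usage sketch of the fallback this patch adds (it mirrors the test below; the file path and option state are illustrative): a lone surrogate survives the HDF5 round-trip with ``errors="surrogatepass"``, and when ``pd.options.future.infer_string`` is enabled with PyArrow installed the result falls back to python-backed ``StringDtype`` instead of raising ``UnicodeEncodeError``.

import pandas as pd

data = ["\ud800foo"]                                     # lone UTF-16 surrogate
ser = pd.Series(data, index=pd.Index(data, dtype="object"), dtype="object")
ser.to_hdf("store.h5", key="table", format="fixed", errors="surrogatepass")
result = pd.read_hdf("store.h5", "table", errors="surrogatepass")
# with future.infer_string enabled: pd.StringDtype(storage="python", na_value=np.nan)
# otherwise: object dtype, equal to `ser`
print(result.dtype)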
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
index bb2058c050f2a..b3ab6b48508e1 100644
--- a/pandas/tests/io/pytables/test_store.py
+++ b/pandas/tests/io/pytables/test_store.py
@@ -7,8 +7,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas.compat import PY312
import pandas as pd
@@ -25,7 +23,6 @@
timedelta_range,
)
import pandas._testing as tm
-from pandas.conftest import has_pyarrow
from pandas.tests.io.pytables.common import (
_maybe_remove,
ensure_clean_store,
@@ -385,20 +382,24 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path):
tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]]))
-@pytest.mark.xfail(
- using_string_dtype() and has_pyarrow,
- reason="TODO(infer_string): can't encode '\ud800': surrogates not allowed",
-)
@pytest.mark.parametrize("format", ["fixed", "table"])
-def test_to_hdf_errors(tmp_path, format, setup_path):
+def test_to_hdf_errors(tmp_path, format, setup_path, using_infer_string):
data = ["\ud800foo"]
- ser = Series(data, index=Index(data))
+ ser = Series(data, index=Index(data, dtype="object"), dtype="object")
path = tmp_path / setup_path
# GH 20835
ser.to_hdf(path, key="table", format=format, errors="surrogatepass")
result = read_hdf(path, "table", errors="surrogatepass")
- tm.assert_series_equal(result, ser)
+
+ if using_infer_string:
+ # https://github.com/pandas-dev/pandas/pull/60993
+ # Surrogates fall back to python storage.
+ dtype = pd.StringDtype(storage="python", na_value=np.nan)
+ else:
+ dtype = "object"
+ expected = Series(data, index=Index(data, dtype=dtype), dtype=dtype)
+ tm.assert_series_equal(result, expected)
def test_create_table_index(setup_path):
From a6dae8aa66872658cd5481baeaaf8545c15fecc1 Mon Sep 17 00:00:00 2001
From: William Andrea <22385371+wjandrea@users.noreply.github.com>
Date: Wed, 16 Apr 2025 17:41:28 -0300
Subject: [PATCH 27/27] DOC: copyedit _base.py (#61299)
No need to restate the library name
---
pandas/io/excel/_base.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index ebcafce8f4de2..1dc6c1f08b49a 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -89,7 +89,7 @@
)
_read_excel_doc = (
"""
-Read an Excel file into a ``pandas`` ``DataFrame``.
+Read an Excel file into a ``DataFrame``.
Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions
read from a local filesystem or URL. Supports an option to read