Merge remote-tracking branch 'upstream/2.3.x' into backport-61446

jorisvandenbossche · jorisvandenbossche · commit 5e59c7833906 · 2025-07-03T09:26:31.000+02:00
diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml
@@ -53,7 +53,7 @@ jobs:
     runs-on: ubuntu-22.04
     strategy:
       matrix:
-        python-version: ['3.10', '3.11']
+        python-version: ['3.9', '3.10', '3.11']
       fail-fast: false
     name: Test Conda Forge Recipe - Python ${{ matrix.python-version }}
     concurrency:
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
@@ -27,7 +27,7 @@ jobs:
     strategy:
       matrix:
         platform: [ubuntu-22.04, ubuntu-24.04-arm]
-        env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml]
+        env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml]
         # Prevent the include jobs from overriding other jobs
         pattern: [""]
         pandas_future_infer_string: ["0"]
@@ -38,7 +38,7 @@ jobs:
             pytest_target: "pandas/tests/test_downstream.py"
             platform: ubuntu-22.04
           - name: "Minimum Versions"
-            env_file: actions-310-minimum_versions.yaml
+            env_file: actions-39-minimum_versions.yaml
             pattern: "not slow and not network and not single_cpu"
             platform: ubuntu-22.04
           - name: "Locale: it_IT"
@@ -63,6 +63,11 @@ jobs:
             # It will be temporarily activated during tests with locale.setlocale
             extra_loc: "zh_CN"
             platform: ubuntu-22.04
+          - name: "Copy-on-Write 3.9"
+            env_file: actions-39.yaml
+            pattern: "not slow and not network and not single_cpu"
+            pandas_copy_on_write: "1"
+            platform: ubuntu-22.04
           - name: "Copy-on-Write 3.10"
             env_file: actions-310.yaml
             pattern: "not slow and not network and not single_cpu"
@@ -88,6 +93,11 @@ jobs:
             pattern: "not slow and not network and not single_cpu"
             pandas_copy_on_write: "warn"
             platform: ubuntu-22.04
+          - name: "Copy-on-Write 3.9 (warnings)"
+            env_file: actions-39.yaml
+            pattern: "not slow and not network and not single_cpu"
+            pandas_copy_on_write: "warn"
+            platform: ubuntu-22.04
           - name: "Future infer strings"
             env_file: actions-312.yaml
             pandas_future_infer_string: "1"
@@ -218,7 +228,7 @@ jobs:
       matrix:
         # Note: Don't use macOS latest since macos 14 appears to be arm64 only
         os: [macos-13, macos-14, windows-latest]
-        env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml]
+        env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml]
       fail-fast: false
     runs-on: ${{ matrix.os }}
     name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }}
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
@@ -95,12 +95,13 @@ jobs:
         - [ubuntu-22.04, manylinux_x86_64]
         - [ubuntu-22.04, musllinux_x86_64]
         - [ubuntu-24.04-arm, manylinux_aarch64]
+        - [ubuntu-24.04-arm, musllinux_aarch64]
         - [macos-13, macosx_x86_64]
         # Note: M1 images on Github Actions start from macOS 14
         - [macos-14, macosx_arm64]
         - [windows-2022, win_amd64]
         # TODO: support PyPy?
-        python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"], ["cp313", "3.13"], ["cp313t", "3.13"]]
+        python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"], ["cp313", "3.13"], ["cp313t", "3.13"]]
         # TODO: Build free-threaded wheels for Windows
         exclude:
         - buildplat: [windows-2022, win_amd64]
diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml
@@ -51,7 +51,8 @@ dependencies:
   - python-calamine>=0.1.7
   - pyxlsb>=1.0.10
   - s3fs>=2022.11.0
-  - scipy>=1.10.0
+  # TEMP upper pin for scipy (https://github.com/statsmodels/statsmodels/issues/9584)
+  - scipy>=1.10.0,<1.16
   - sqlalchemy>=2.0.0
   - tabulate>=0.9.0
   - xarray>=2022.12.0, <=2024.9.0
diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml
@@ -4,7 +4,7 @@ name: pandas-dev
 channels:
   - conda-forge
 dependencies:
-  - python=3.10
+  - python=3.9
 
   # build dependencies
   - versioneer
diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml
@@ -0,0 +1,64 @@
+name: pandas-dev
+channels:
+  - conda-forge
+dependencies:
+  - python=3.9
+
+  # build dependencies
+  - versioneer[toml]
+  - cython>=0.29.33
+  - meson[ninja]=1.2.1
+  - meson-python=0.13.1
+
+  # test dependencies
+  - pytest>=7.3.2
+  - pytest-cov
+  - pytest-xdist>=2.2.0
+  - pytest-qt>=4.2.0
+  - boto3
+
+  # required dependencies
+  - python-dateutil
+  - numpy
+  # pytz 2024.2 timezones cause wrong results
+  - pytz<2024.2
+
+  # optional dependencies
+  - beautifulsoup4>=4.11.2
+  - blosc>=1.21.3
+  - bottleneck>=1.3.6
+  - fastparquet>=2022.12.0
+  - fsspec>=2022.11.0
+  - html5lib>=1.1
+  - hypothesis>=6.46.1
+  - gcsfs>=2022.11.0
+  - jinja2>=3.1.2
+  - lxml>=4.9.2
+  - matplotlib>=3.6.3
+  - numba>=0.56.4
+  - numexpr>=2.8.4
+  - odfpy>=1.4.1
+  - qtpy>=2.3.0
+  - openpyxl>=3.1.0
+  - psycopg2>=2.9.6
+  - pyarrow>=10.0.1
+  - pymysql>=1.0.2
+  - pyqt>=5.15.9
+  - pyreadstat>=1.2.0
+  - pytables>=3.8.0
+  - python-calamine>=0.1.7
+  - pyxlsb>=1.0.10
+  - s3fs>=2022.11.0
+  - scipy>=1.10.0
+  - sqlalchemy>=2.0.0
+  - tabulate>=0.9.0
+  - xarray>=2022.12.0
+  - xlrd>=2.0.1
+  - xlsxwriter>=3.0.5
+  - zstandard>=0.19.0
+
+  - pip:
+    - adbc-driver-postgresql>=0.8.0
+    - adbc-driver-sqlite>=0.8.0
+    - tzdata>=2022.7
+    - pytest-localserver>=0.7.1
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
@@ -30,50 +30,6 @@ Other enhancements
 - The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for :class:`StringDtype` columns (:issue:`60633`)
 - The :meth:`~Series.sum` reduction is now implemented for :class:`StringDtype` columns (:issue:`59853`)
 
-.. ---------------------------------------------------------------------------
-.. _whatsnew_230.notable_bug_fixes:
-
-Notable bug fixes
-~~~~~~~~~~~~~~~~~
-
-These are bug fixes that might have notable behavior changes.
-
-.. _whatsnew_230.notable_bug_fixes.string_comparisons:
-
-Comparisons between different string dtypes
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-In previous versions, comparing Series of different string dtypes (e.g. ``pd.StringDtype("pyarrow", na_value=pd.NA)`` against ``pd.StringDtype("python", na_value=np.nan)``) would result in inconsistent resulting dtype or incorrectly raise. pandas will now use the hierarchy
-
-    object < (python, NaN) < (pyarrow, NaN) < (python, NA) < (pyarrow, NA)
-
-in determining the result dtype when there are different string dtypes compared. Some examples:
-
-- When ``pd.StringDtype("pyarrow", na_value=pd.NA)`` is compared against any other string dtype, the result will always be ``boolean[pyarrow]``.
-- When ``pd.StringDtype("python", na_value=pd.NA)`` is compared against ``pd.StringDtype("pyarrow", na_value=np.nan)``, the result will be ``boolean``, the NumPy-backed nullable extension array.
-- When ``pd.StringDtype("python", na_value=pd.NA)`` is compared against ``pd.StringDtype("python", na_value=np.nan)``, the result will be ``boolean``, the NumPy-backed nullable extension array.
-
-In previous versions, comparing :class:`Series` of different string dtypes (e.g. ``pd.StringDtype("pyarrow", na_value=pd.NA)`` against ``pd.StringDtype("python", na_value=np.nan)``) would result in inconsistent resulting dtype or incorrectly raise. pandas will now use the hierarchy
-
-Increased minimum version for Python
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-in determining the result dtype when there are different string dtypes compared. Some examples:
-
-- When ``pd.StringDtype("pyarrow", na_value=pd.NA)`` is compared against any other string dtype, the result will always be ``boolean[pyarrow]``.
-- When ``pd.StringDtype("python", na_value=pd.NA)`` is compared against ``pd.StringDtype("pyarrow", na_value=np.nan)``, the result will be ``boolean``, the NumPy-backed nullable extension array.
-- When ``pd.StringDtype("python", na_value=pd.NA)`` is compared against ``pd.StringDtype("python", na_value=np.nan)``, the result will be ``boolean``, the NumPy-backed nullable extension array.
-
-.. _whatsnew_230.api_changes:
-
-API changes
-~~~~~~~~~~~
-
-- When enabling the ``future.infer_string`` option, :class:`Index` set operations (like
-  union or intersection) will now ignore the dtype of an empty :class:`RangeIndex` or
-  empty :class:`Index` with ``object`` dtype when determining the dtype of the resulting
-  Index (:issue:`60797`)
-
 .. ---------------------------------------------------------------------------
 .. _whatsnew_230.deprecations:
 
@@ -96,8 +52,6 @@ Numeric
 
 Strings
 ^^^^^^^
-- Bug in :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.Resampler.min`, :meth:`.Resampler.max` where all NA values of string dtype would return float instead of string dtype (:issue:`60810`)
-- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` with all NA values of :class:`StringDtype` resulted in ``0`` instead of the empty string ``""`` (:issue:`60229`)
 - Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` where an ``Exception`` was not raised for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`)
 - Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` that incorrectly returned integer results with ``method="average"`` and raised an error if it would truncate results (:issue:`59768`)
 - Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)
diff --git a/doc/source/whatsnew/v2.3.1.rst b/doc/source/whatsnew/v2.3.1.rst
@@ -9,11 +9,57 @@ including other versions of pandas.
 {{ header }}
 
 .. ---------------------------------------------------------------------------
-.. _whatsnew_231.enhancements:
+.. _whatsnew_231.string_fixes:
+
+Improvements and fixes for the StringDtype
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. _whatsnew_231.string_fixes.string_comparisons:
+
+Comparisons between different string dtypes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous versions, comparing :class:`Series` of different string dtypes (e.g. ``pd.StringDtype("pyarrow", na_value=pd.NA)`` against ``pd.StringDtype("python", na_value=np.nan)``) would result in inconsistent resulting dtype or incorrectly raise. pandas will now use the hierarchy
+
+    object < (python, NaN) < (pyarrow, NaN) < (python, NA) < (pyarrow, NA)
+
+in determining the result dtype when there are different string dtypes compared. Some examples:
+
+- When ``pd.StringDtype("pyarrow", na_value=pd.NA)`` is compared against any other string dtype, the result will always be ``boolean[pyarrow]``.
+- When ``pd.StringDtype("python", na_value=pd.NA)`` is compared against ``pd.StringDtype("pyarrow", na_value=np.nan)``, the result will be ``boolean``, the NumPy-backed nullable extension array.
+- When ``pd.StringDtype("python", na_value=pd.NA)`` is compared against ``pd.StringDtype("python", na_value=np.nan)``, the result will be ``boolean``, the NumPy-backed nullable extension array.
+
+.. _whatsnew_231.string_fixes.ignore_empty:
+
+Index set operations ignore empty RangeIndex and object dtype Index
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When enabling the ``future.infer_string`` option, :class:`Index` set operations (like
+union or intersection) will now ignore the dtype of an empty :class:`RangeIndex` or
+empty :class:`Index` with ``object`` dtype when determining the dtype of the resulting
+Index (:issue:`60797`).
+
+This ensures that combining such an empty Index with strings will infer the string
+dtype correctly, rather than defaulting to ``object`` dtype. For example:
+
+.. code-block:: python
+
+    >>> pd.options.future.infer_string = True
+    >>> df = pd.DataFrame()
+    >>> df.columns.dtype
+    dtype('int64')               # default RangeIndex for empty columns
+    >>> df["a"] = [1, 2, 3]
+    >>> df.columns.dtype
+    <StringDtype(na_value=nan)>  # new columns use string dtype instead of object dtype
+
+.. _whatsnew_231.string_fixes.bugs:
+
+Bug fixes
+^^^^^^^^^
+- Bug in :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.Resampler.min`, :meth:`.Resampler.max` where all NA values of string dtype would return float instead of string dtype (:issue:`60810`)
+- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` with all NA values of :class:`StringDtype` resulted in ``0`` instead of the empty string ``""`` (:issue:`60229`)
+- Fixed bug in :meth:`DataFrame.explode` and :meth:`Series.explode` where methods would fail with ``dtype="str"`` (:issue:`61623`)
 
-Enhancements
-~~~~~~~~~~~~
--
 
 .. _whatsnew_231.regressions:
 
@@ -26,7 +72,7 @@ Fixed regressions
 
 Bug fixes
 ~~~~~~~~~
-- Fixed bug in :meth:`DataFrame.explode` and :meth:`Series.explode` where methods would fail with ``dtype="str"`` (:issue:`61623`)
+-
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_231.other:
diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py
@@ -269,8 +269,15 @@ def pandas_validate(func_name: str):
     # Some objects are instances, e.g. IndexSlice, which numpydoc can't validate
     doc_obj = get_doc_object(func_obj, doc=func_obj.__doc__)
     doc = PandasDocstring(func_name, doc_obj)
-    result = validate(doc_obj)
-
+    if func_obj.__doc__ is not None:
+        result = validate(doc_obj)
+    else:
+        result = {
+            "docstring": "",
+            "file": None,
+            "file_line": None,
+            "errors": [("GL08", "The object does not have a docstring")],
+        }
     mentioned_errs = doc.mentioned_private_classes
     if mentioned_errs:
         result["errors"].append(