
Remove xfail in test_usecols_basic #59801


Closed
wants to merge 44 commits

Commits (44):
3223070
Revert "CI: Pin blosc to fix pytables" (#58218)
lithomas1 Apr 11, 2024
d842753
Remove deprecated plot_date calls (#58484)
QuLogic Apr 30, 2024
acb9e97
ENH: Fix Python 3.13 test failures & enable CI (#59065)
lysnikolaou Jun 25, 2024
e480752
remove ops div class to solve #21374 (#59144)
WillAyd Aug 27, 2024
e496893
PDEP-14: Dedicated string data type for pandas 3.0 (#58551)
jorisvandenbossche Jul 24, 2024
b9614e7
TST / string dtype: add env variable to enable future_string and add …
jorisvandenbossche Jul 26, 2024
9ca7483
REF (string dtype): rename using_pyarrow_string_dtype to using_string…
jorisvandenbossche Jul 26, 2024
ab9d1db
TST (string dtype): clean-up xpasssing tests with future string dtype…
jorisvandenbossche Jul 27, 2024
1bf735b
String dtype: rename the storage options and add `na_value` keyword i…
jorisvandenbossche Jul 29, 2024
aba9fef
TST (string dtype): xfail all currently failing tests with future.inf…
WillAyd Aug 14, 2024
474241a
TST (string dtype): follow-up on GH-59329 fixing new xfails (#59352)
jorisvandenbossche Jul 30, 2024
2268c2b
TST (string dtype): change any_string_dtype fixture to use actual dty…
jorisvandenbossche Jul 31, 2024
8fbab63
TST (string dtype): remove usage of arrow_string_storage fixture (#59…
jorisvandenbossche Jul 31, 2024
6838faf
TST (string dtype): replace string_storage fixture with explicit stor…
jorisvandenbossche Jul 31, 2024
2e3f225
String dtype: restrict options.mode.string_storage to python|pyarrow …
jorisvandenbossche Aug 1, 2024
337ef04
API/TST: expand tests for string any/all reduction + fix pyarrow-base…
jorisvandenbossche Aug 6, 2024
c760c00
String dtype: implement object-dtype based StringArray variant with N…
WillAyd Aug 14, 2024
adbc4ed
REF (string dtype): de-duplicate _str_map methods (#59443)
WillAyd Aug 14, 2024
2d1174d
String dtype: use 'str' string alias and representation for NaN-varia…
WillAyd Aug 27, 2024
b7928e2
String dtype: fix alignment sorting in case of python storage (#59448)
jorisvandenbossche Aug 8, 2024
f1879d8
TST (string dtype): add test build with future strings enabled withou…
WillAyd Aug 14, 2024
2bb5ce1
REF (string dtype): de-duplicate _str_map (2) (#59451)
jbrockmendel Aug 9, 2024
54afab2
REF (string): de-duplicate str_map_nan_semantics (#59464)
jbrockmendel Aug 9, 2024
c424458
BUG (string dtype): convert dictionary input to materialized string a…
jorisvandenbossche Aug 12, 2024
9ade95d
String dtype: fix convert_dtypes() to convert NaN-string to NA-string…
jorisvandenbossche Aug 12, 2024
4eba41b
String dtype: honor mode.string_storage option (and change default to…
jorisvandenbossche Aug 12, 2024
837b132
BUG (string): ArrowEA comparisons with mismatched types (#59505)
jbrockmendel Aug 13, 2024
ee701c2
TST (string dtype): clean up construction of expected string arrays (…
jorisvandenbossche Aug 14, 2024
07dc9a2
TST (string dtype): clean up construction of expected string arrays (…
WillAyd Aug 22, 2024
0b98307
TST (string dtype): fix IO dtype_backend tests for storage of str dty…
WillAyd Aug 22, 2024
182842d
REF (string): Move StringArrayNumpySemantics methods to base class (#…
jbrockmendel Aug 14, 2024
e5dfcfa
REF (string): remove _str_na_value (#59515)
jbrockmendel Aug 15, 2024
8cdac15
REF (string): move ArrowStringArrayNumpySemantics methods to base cla…
jbrockmendel Aug 15, 2024
08bc377
API (string): return str dtype for .dt methods, DatetimeIndex methods…
jbrockmendel Aug 16, 2024
eb2f178
Backport fixes
WillAyd Aug 15, 2024
33ee82c
Pick required fix from 2542674ee9 #56709
WillAyd Aug 27, 2024
c1f17f4
Pick required fix from f4232e7 #58006
WillAyd Aug 22, 2024
56d02de
Pick required fix from #55901 and #59581
WillAyd Aug 22, 2024
65378c3
Remove .pre-commit check for pytest ref #56671
WillAyd Aug 22, 2024
fb04e31
Skip niche issue
WillAyd Aug 22, 2024
92c960a
Add required skip from #58467
WillAyd Aug 27, 2024
1c4a199
Remove tests that will fail without backport of #58437
WillAyd Aug 27, 2024
de5ad26
Remove xfail in test_usecols_basic
rhshadrach Sep 14, 2024
1128332
Remove test_array xfail
rhshadrach Sep 15, 2024

6 changes: 6 additions & 0 deletions .github/actions/setup-conda/action.yml
@@ -14,3 +14,9 @@ runs:
condarc-file: ci/.condarc
cache-environment: true
cache-downloads: true

- name: Uninstall pyarrow
if: ${{ env.REMOVE_PYARROW == '1' }}
run: |
micromamba remove -y pyarrow
shell: bash -el {0}
21 changes: 15 additions & 6 deletions .github/workflows/unit-tests.yml
@@ -4,11 +4,11 @@ on:
push:
branches:
- main
- 2.2.x
- 2.3.x
pull_request:
branches:
- main
- 2.2.x
- 2.3.x
paths-ignore:
- "doc/**"
- "web/**"
@@ -29,6 +29,7 @@ jobs:
env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml]
# Prevent the include jobs from overriding other jobs
pattern: [""]
pandas_future_infer_string: ["0"]
include:
- name: "Downstream Compat"
env_file: actions-311-downstream_compat.yaml
@@ -85,6 +86,12 @@ jobs:
env_file: actions-39.yaml
pattern: "not slow and not network and not single_cpu"
pandas_copy_on_write: "warn"
- name: "Future infer strings"
env_file: actions-312.yaml
pandas_future_infer_string: "1"
- name: "Future infer strings (without pyarrow)"
env_file: actions-311.yaml
pandas_future_infer_string: "1"
- name: "Pypy"
env_file: actions-pypy-39.yaml
pattern: "not slow and not network and not single_cpu"
@@ -103,16 +110,18 @@ jobs:
LANG: ${{ matrix.lang || 'C.UTF-8' }}
LC_ALL: ${{ matrix.lc_all || '' }}
PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }}
PANDAS_CI: ${{ matrix.pandas_ci || '1' }}
PANDAS_CI: '1'
PANDAS_FUTURE_INFER_STRING: ${{ matrix.pandas_future_infer_string || '0' }}
TEST_ARGS: ${{ matrix.test_args || '' }}
PYTEST_WORKERS: ${{ matrix.pytest_workers || 'auto' }}
PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}
NPY_PROMOTION_STATE: ${{ matrix.env_file == 'actions-311-numpydev.yaml' && 'weak' || 'legacy' }}
# Clipboard tests
QT_QPA_PLATFORM: offscreen
REMOVE_PYARROW: ${{ matrix.name == 'Future infer strings (without pyarrow)' && '1' || '0' }}
concurrency:
# https://github.community/t/concurrecy-not-work-for-push/183068/7
group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_copy_on_write || '' }}
group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_copy_on_write || '' }}-${{ matrix.pandas_future_infer_string }}
cancel-in-progress: true

services:
@@ -329,7 +338,7 @@ jobs:
# To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs
# to the corresponding posix/windows-macos/sdist etc. workflows.
# Feel free to modify this comment as necessary.
if: false # Uncomment this to freeze the workflow, comment it to unfreeze
# if: false # Uncomment this to freeze the workflow, comment it to unfreeze
defaults:
run:
shell: bash -eou pipefail {0}
@@ -361,7 +370,7 @@ jobs:
- name: Set up Python Dev Version
uses: actions/setup-python@v5
with:
python-version: '3.12-dev'
python-version: '3.13-dev'

- name: Build Environment
run: |
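For orientation, here is a minimal sketch of how the new PANDAS_FUTURE_INFER_STRING toggle maps onto the pandas option it controls. The option name future.infer_string is real; reading the variable this way is an assumption about the CI wiring, not code from this PR.

```python
# Hedged sketch: mirror the CI toggle locally. Reading PANDAS_FUTURE_INFER_STRING
# like this is an assumption; only the variable and option names come from the diff.
import os

import pandas as pd

enabled = os.environ.get("PANDAS_FUTURE_INFER_STRING", "0") == "1"
pd.set_option("future.infer_string", enabled)

print(pd.options.future.infer_string)
print(pd.Series(["a", "b"]).dtype)  # the future string dtype when the option is on
```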
7 changes: 0 additions & 7 deletions .pre-commit-config.yaml
@@ -274,13 +274,6 @@ repos:
language: python
types: [rst]
files: ^doc/source/(development|reference)/
- id: unwanted-patterns-bare-pytest-raises
name: Check for use of bare pytest raises
language: python
entry: python scripts/validate_unwanted_patterns.py --validation-type="bare_pytest_raises"
types: [python]
files: ^pandas/tests/
exclude: ^pandas/tests/extension/
- id: unwanted-patterns-private-function-across-module
name: Check for use of private functions across modules
language: python
2 changes: 0 additions & 2 deletions ci/deps/actions-310.yaml
@@ -24,8 +24,6 @@ dependencies:

# optional dependencies
- beautifulsoup4>=4.11.2
# https://github.com/conda-forge/pytables-feedstock/issues/97
- c-blosc2=2.13.2
- blosc>=1.21.3
- bottleneck>=1.3.6
- fastparquet>=2022.12.0
2 changes: 0 additions & 2 deletions ci/deps/actions-311-downstream_compat.yaml
@@ -26,8 +26,6 @@ dependencies:

# optional dependencies
- beautifulsoup4>=4.11.2
# https://github.com/conda-forge/pytables-feedstock/issues/97
- c-blosc2=2.13.2
- blosc>=1.21.3
- bottleneck>=1.3.6
- fastparquet>=2022.12.0
2 changes: 0 additions & 2 deletions ci/deps/actions-311.yaml
@@ -24,8 +24,6 @@ dependencies:

# optional dependencies
- beautifulsoup4>=4.11.2
# https://github.com/conda-forge/pytables-feedstock/issues/97
- c-blosc2=2.13.2
- blosc>=1.21.3
- bottleneck>=1.3.6
- fastparquet>=2022.12.0
2 changes: 0 additions & 2 deletions ci/deps/actions-312.yaml
@@ -24,8 +24,6 @@ dependencies:

# optional dependencies
- beautifulsoup4>=4.11.2
# https://github.com/conda-forge/pytables-feedstock/issues/97
- c-blosc2=2.13.2
- blosc>=1.21.3
- bottleneck>=1.3.6
- fastparquet>=2022.12.0
2 changes: 0 additions & 2 deletions ci/deps/actions-39-minimum_versions.yaml
@@ -27,8 +27,6 @@ dependencies:

# optional dependencies
- beautifulsoup4=4.11.2
# https://github.com/conda-forge/pytables-feedstock/issues/97
- c-blosc2=2.13.2
- blosc=1.21.3
- bottleneck=1.3.6
- fastparquet=2022.12.0
2 changes: 0 additions & 2 deletions ci/deps/actions-39.yaml
@@ -24,8 +24,6 @@ dependencies:

# optional dependencies
- beautifulsoup4>=4.11.2
# https://github.com/conda-forge/pytables-feedstock/issues/97
- c-blosc2=2.13.2
- blosc>=1.21.3
- bottleneck>=1.3.6
- fastparquet>=2022.12.0
2 changes: 0 additions & 2 deletions ci/deps/circle-310-arm64.yaml
@@ -25,8 +25,6 @@ dependencies:

# optional dependencies
- beautifulsoup4>=4.11.2
# https://github.com/conda-forge/pytables-feedstock/issues/97
- c-blosc2=2.13.2
- blosc>=1.21.3
- bottleneck>=1.3.6
- fastparquet>=2022.12.0
2 changes: 0 additions & 2 deletions environment.yml
@@ -27,8 +27,6 @@ dependencies:

# optional dependencies
- beautifulsoup4>=4.11.2
# https://github.com/conda-forge/pytables-feedstock/issues/97
- c-blosc2=2.13.2
- blosc
- bottleneck>=1.3.6
- fastparquet>=2022.12.0
2 changes: 1 addition & 1 deletion pandas/_config/__init__.py
@@ -52,6 +52,6 @@ def using_nullable_dtypes() -> bool:
return _mode_options["nullable_dtypes"]


def using_pyarrow_string_dtype() -> bool:
def using_string_dtype() -> bool:
_mode_options = _global_config["future"]
return _mode_options["infer_string"]
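A quick sketch of the renamed accessor in use after this backport; the import path comes from the diff, the rest is the standard options API.

```python
# Sketch: using_string_dtype() simply reports the future.infer_string option.
import pandas as pd
from pandas._config import using_string_dtype

pd.set_option("future.infer_string", True)
assert using_string_dtype() is True

pd.set_option("future.infer_string", False)
assert using_string_dtype() is False
```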
6 changes: 3 additions & 3 deletions pandas/_libs/lib.pyx
@@ -37,7 +37,7 @@ from cython cimport (
floating,
)

from pandas._config import using_pyarrow_string_dtype
from pandas._config import using_string_dtype

from pandas._libs.missing import check_na_tuples_nonequal

@@ -2725,10 +2725,10 @@ def maybe_convert_objects(ndarray[object] objects,
seen.object_ = True

elif seen.str_:
if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True):
if using_string_dtype() and is_string_array(objects, skipna=True):
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype(storage="pyarrow_numpy")
dtype = StringDtype(na_value=np.nan)
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)

seen.object_ = True
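The hunk above replaces the retired storage="pyarrow_numpy" spelling with the NaN-variant StringDtype. A small illustration, assuming the backported na_value keyword; it mirrors the construct_array_type()._from_sequence call used in the diff.

```python
# Sketch (assumes the backported StringDtype with the na_value keyword): build the
# NaN-variant array the same way maybe_convert_objects does in the hunk above.
import numpy as np
import pandas as pd

dtype = pd.StringDtype(na_value=np.nan)
arr = dtype.construct_array_type()._from_sequence(["a", None, "c"], dtype=dtype)
print(arr.dtype)  # the NaN-variant string dtype (repr'd with the "str" alias)
print(arr[1])     # nan rather than pd.NA
```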
12 changes: 6 additions & 6 deletions pandas/_libs/src/vendored/ujson/python/objToJSON.c
@@ -410,8 +410,8 @@ static void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
npyarr->type_num = PyArray_DESCR(obj)->type_num;

if (GET_TC(tc)->transpose) {
npyarr->dim = PyArray_DIM(obj, npyarr->ndim);
npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim);
npyarr->dim = PyArray_DIM(obj, (int)npyarr->ndim);
npyarr->stride = PyArray_STRIDE(obj, (int)npyarr->ndim);
npyarr->stridedim = npyarr->ndim;
npyarr->index[npyarr->ndim] = 0;
npyarr->inc = -1;
@@ -452,8 +452,8 @@ static void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
return;
}
const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array;
npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim);
npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim);
npyarr->dim = PyArray_DIM(arrayobj, (int)npyarr->stridedim);
npyarr->stride = PyArray_STRIDE(arrayobj, (int)npyarr->stridedim);
npyarr->dataptr += npyarr->stride;

NpyArr_freeItemValue(obj, tc);
@@ -524,8 +524,8 @@ static int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) {
}
const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array;

npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim);
npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim);
npyarr->dim = PyArray_DIM(arrayobj, (int)npyarr->stridedim);
npyarr->stride = PyArray_STRIDE(arrayobj, (int)npyarr->stridedim);
npyarr->index[npyarr->stridedim] = 0;

((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr;
7 changes: 6 additions & 1 deletion pandas/_libs/tslibs/offsets.pyx
@@ -4960,7 +4960,12 @@ cpdef to_offset(freq, bint is_period=False):
if result is None:
raise ValueError(INVALID_FREQ_ERR_MSG.format(freq))

if is_period and not hasattr(result, "_period_dtype_code"):
try:
has_period_dtype_code = hasattr(result, "_period_dtype_code")
except ValueError:
has_period_dtype_code = False

if is_period and not has_period_dtype_code:
if isinstance(freq, str):
raise ValueError(f"{result.name} is not supported as period frequency")
else:
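The change above tolerates offsets whose attribute lookup itself raises: hasattr only swallows AttributeError, so a ValueError from a custom __getattr__ would previously escape to_offset. A standalone sketch of the guard pattern (the helper name is made up for illustration):

```python
# Illustration only: the helper name is hypothetical; the pattern matches the diff.
def _has_period_dtype_code(offset) -> bool:
    try:
        # hasattr() only catches AttributeError, so any ValueError raised by the
        # offset's attribute lookup must be handled explicitly.
        return hasattr(offset, "_period_dtype_code")
    except ValueError:
        return False
```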
11 changes: 8 additions & 3 deletions pandas/_testing/__init__.py
@@ -14,6 +14,7 @@

import numpy as np

from pandas._config import using_string_dtype
from pandas._config.localization import (
can_set_locale,
get_locales,
@@ -110,7 +111,11 @@
ALL_FLOAT_DTYPES: list[Dtype] = [*FLOAT_NUMPY_DTYPES, *FLOAT_EA_DTYPES]

COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"]
STRING_DTYPES: list[Dtype] = [str, "str", "U"]
if using_string_dtype():
STRING_DTYPES: list[Dtype] = [str, "U"]
else:
STRING_DTYPES: list[Dtype] = [str, "str", "U"] # type: ignore[no-redef]
COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES]

DATETIME64_DTYPES: list[Dtype] = ["datetime64[ns]", "M8[ns]"]
TIMEDELTA64_DTYPES: list[Dtype] = ["timedelta64[ns]", "m8[ns]"]
@@ -526,14 +531,14 @@ def shares_memory(left, right) -> bool:
if (
isinstance(left, ExtensionArray)
and is_string_dtype(left.dtype)
and left.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined]
and left.dtype.storage == "pyarrow" # type: ignore[attr-defined]
):
# https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
left = cast("ArrowExtensionArray", left)
if (
isinstance(right, ExtensionArray)
and is_string_dtype(right.dtype)
and right.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined]
and right.dtype.storage == "pyarrow" # type: ignore[attr-defined]
):
right = cast("ArrowExtensionArray", right)
left_pa_data = left._pa_array
28 changes: 26 additions & 2 deletions pandas/_testing/asserters.py
@@ -593,13 +593,19 @@ def raise_assert_detail(

if isinstance(left, np.ndarray):
left = pprint_thing(left)
elif isinstance(left, (CategoricalDtype, NumpyEADtype, StringDtype)):
elif isinstance(left, (CategoricalDtype, NumpyEADtype)):
left = repr(left)
elif isinstance(left, StringDtype):
# TODO(infer_string) this special case could be avoided if we have
# a more informative repr https://github.com/pandas-dev/pandas/issues/59342
left = f"StringDtype(storage={left.storage}, na_value={left.na_value})"

if isinstance(right, np.ndarray):
right = pprint_thing(right)
elif isinstance(right, (CategoricalDtype, NumpyEADtype, StringDtype)):
elif isinstance(right, (CategoricalDtype, NumpyEADtype)):
right = repr(right)
elif isinstance(right, StringDtype):
right = f"StringDtype(storage={right.storage}, na_value={right.na_value})"

msg += f"""
[left]: {left}
@@ -805,6 +811,24 @@ def assert_extension_array_equal(
left_na, right_na, obj=f"{obj} NA mask", index_values=index_values
)

# Specifically for StringArrayNumpySemantics, validate here we have a valid array
if (
isinstance(left.dtype, StringDtype)
and left.dtype.storage == "python"
and left.dtype.na_value is np.nan
):
assert np.all(
[np.isnan(val) for val in left._ndarray[left_na]] # type: ignore[attr-defined]
), "wrong missing value sentinels"
if (
isinstance(right.dtype, StringDtype)
and right.dtype.storage == "python"
and right.dtype.na_value is np.nan
):
assert np.all(
[np.isnan(val) for val in right._ndarray[right_na]] # type: ignore[attr-defined]
), "wrong missing value sentinels"

left_valid = left[~left_na].to_numpy(dtype=object)
right_valid = right[~right_na].to_numpy(dtype=object)
if check_exact:
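A tiny reproduction of the more informative dtype description that raise_assert_detail now builds (assumes the backported na_value keyword on StringDtype):

```python
# Sketch: the assertion-message description built in raise_assert_detail above.
import numpy as np
import pandas as pd

left = pd.StringDtype(storage="python", na_value=np.nan)
print(f"StringDtype(storage={left.storage}, na_value={left.na_value})")
# StringDtype(storage=python, na_value=nan)
```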
2 changes: 2 additions & 0 deletions pandas/compat/__init__.py
@@ -25,6 +25,7 @@
import pandas.compat.compressors
from pandas.compat.numpy import is_numpy_dev
from pandas.compat.pyarrow import (
HAS_PYARROW,
pa_version_under10p1,
pa_version_under11p0,
pa_version_under13p0,
@@ -190,6 +191,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]:
"pa_version_under14p1",
"pa_version_under16p0",
"pa_version_under17p0",
"HAS_PYARROW",
"IS64",
"ISMUSL",
"PY310",
2 changes: 2 additions & 0 deletions pandas/compat/pyarrow.py
@@ -17,6 +17,7 @@
pa_version_under15p0 = _palv < Version("15.0.0")
pa_version_under16p0 = _palv < Version("16.0.0")
pa_version_under17p0 = _palv < Version("17.0.0")
HAS_PYARROW = True
except ImportError:
pa_version_under10p1 = True
pa_version_under11p0 = True
@@ -27,3 +28,4 @@
pa_version_under15p0 = True
pa_version_under16p0 = True
pa_version_under17p0 = True
HAS_PYARROW = False
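The new HAS_PYARROW flag pairs with the pyarrow-less CI build added earlier in this diff. A hedged example of how a test might consume it; the import path is from the diff, the test body is illustrative only:

```python
# Illustrative test gated on the new flag; pandas.compat.HAS_PYARROW comes from the
# diff above, everything else here is a made-up example.
import pytest

from pandas.compat import HAS_PYARROW


@pytest.mark.skipif(not HAS_PYARROW, reason="pyarrow not installed")
def test_parquet_roundtrip(tmp_path):
    import pandas as pd

    path = tmp_path / "data.parquet"
    df = pd.DataFrame({"a": ["x", "y"]})
    df.to_parquet(path, engine="pyarrow")
    pd.testing.assert_frame_equal(pd.read_parquet(path, engine="pyarrow"), df)
```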