Skip to content

Commit 27ce106

Browse files
authored
Merge branch 'main' into tz_localize-single-single-quote
2 parents 3385182 + d093fae commit 27ce106

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+473
-234
lines changed

.devcontainer.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
// Use 'settings' to set *default* container specific settings.json values on container create.
99
// You can edit these settings after create using File > Preferences > Settings > Remote.
1010
"settings": {
11-
"terminal.integrated.shell.linux": "/bin/bash",
1211
"python.pythonPath": "/usr/local/bin/python",
1312
"python.formatting.provider": "black",
1413
"python.linting.enabled": true,

.github/actions/setup-conda/action.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,9 @@ runs:
1414
condarc-file: ci/.condarc
1515
cache-environment: true
1616
cache-downloads: true
17+
18+
- name: Uninstall pyarrow
19+
if: ${{ env.REMOVE_PYARROW == '1' }}
20+
run: |
21+
micromamba remove -y pyarrow
22+
shell: bash -el {0}

.github/workflows/unit-tests.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ jobs:
2929
env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml]
3030
# Prevent the include jobs from overriding other jobs
3131
pattern: [""]
32+
pandas_future_infer_string: ["0"]
3233
include:
3334
- name: "Downstream Compat"
3435
env_file: actions-311-downstream_compat.yaml
@@ -58,6 +59,9 @@ jobs:
5859
# It will be temporarily activated during tests with locale.setlocale
5960
extra_loc: "zh_CN"
6061
- name: "Future infer strings"
62+
env_file: actions-312.yaml
63+
pandas_future_infer_string: "1"
64+
- name: "Future infer strings (without pyarrow)"
6165
env_file: actions-311.yaml
6266
pandas_future_infer_string: "1"
6367
- name: "Pypy"
@@ -85,9 +89,10 @@ jobs:
8589
NPY_PROMOTION_STATE: ${{ matrix.env_file == 'actions-311-numpydev.yaml' && 'weak' || 'legacy' }}
8690
# Clipboard tests
8791
QT_QPA_PLATFORM: offscreen
92+
REMOVE_PYARROW: ${{ matrix.name == 'Future infer strings (without pyarrow)' && '1' || '0' }}
8893
concurrency:
8994
# https://github.community/t/concurrecy-not-work-for-push/183068/7
90-
group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}}
95+
group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_future_infer_string }}
9196
cancel-in-progress: true
9297

9398
services:

.gitpod.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ tasks:
1414
cp gitpod/settings.json .vscode/settings.json
1515
git fetch --tags
1616
python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true
17-
pre-commit install
17+
pre-commit install --install-hooks
1818
command: |
1919
python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true
2020
echo "✨ Pre-build complete! You can close this terminal ✨ "

Dockerfile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ FROM python:3.10.8
22
WORKDIR /home/pandas
33

44
RUN apt-get update && apt-get -y upgrade
5-
RUN apt-get install -y build-essential
5+
RUN apt-get install -y build-essential bash-completion
66

77
# hdf5 needed for pytables installation
88
# libgles2-mesa needed for pytest-qt
@@ -12,4 +12,6 @@ RUN python -m pip install --upgrade pip
1212
COPY requirements-dev.txt /tmp
1313
RUN python -m pip install -r /tmp/requirements-dev.txt
1414
RUN git config --global --add safe.directory /home/pandas
15+
16+
ENV SHELL "/bin/bash"
1517
CMD ["/bin/bash"]

ci/code_checks.sh

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
163163
-i "pandas.Series.str.center RT03,SA01" \
164164
-i "pandas.Series.str.decode PR07,RT03,SA01" \
165165
-i "pandas.Series.str.encode PR07,RT03,SA01" \
166-
-i "pandas.Series.str.index RT03" \
167166
-i "pandas.Series.str.ljust RT03,SA01" \
168167
-i "pandas.Series.str.lower RT03" \
169168
-i "pandas.Series.str.lstrip RT03" \
@@ -172,7 +171,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
172171
-i "pandas.Series.str.partition RT03" \
173172
-i "pandas.Series.str.repeat SA01" \
174173
-i "pandas.Series.str.replace SA01" \
175-
-i "pandas.Series.str.rindex RT03" \
176174
-i "pandas.Series.str.rjust RT03,SA01" \
177175
-i "pandas.Series.str.rpartition RT03" \
178176
-i "pandas.Series.str.rstrip RT03" \

doc/source/whatsnew/v3.0.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ Other enhancements
5050
- :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`)
5151
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
5252
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
53+
- :func:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
5354
- Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
5455
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
5556
- Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`)
@@ -618,6 +619,7 @@ Groupby/resample/rolling
618619

619620
Reshaping
620621
^^^^^^^^^
622+
- Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`)
621623
- Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
622624
- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
623625
- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating an empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`)

pandas/core/array_algos/quantile.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -94,9 +94,9 @@ def quantile_with_mask(
9494
flat = np.array([fill_value] * len(qs))
9595
result = np.repeat(flat, len(values)).reshape(len(values), len(qs))
9696
else:
97-
result = _nanpercentile(
97+
result = _nanquantile(
9898
values,
99-
qs * 100.0,
99+
qs,
100100
na_value=fill_value,
101101
mask=mask,
102102
interpolation=interpolation,
@@ -108,15 +108,15 @@ def quantile_with_mask(
108108
return result
109109

110110

111-
def _nanpercentile_1d(
111+
def _nanquantile_1d(
112112
values: np.ndarray,
113113
mask: npt.NDArray[np.bool_],
114114
qs: npt.NDArray[np.float64],
115115
na_value: Scalar,
116116
interpolation: str,
117117
) -> Scalar | np.ndarray:
118118
"""
119-
Wrapper for np.percentile that skips missing values, specialized to
119+
Wrapper for np.quantile that skips missing values, specialized to
120120
1-dimensional case.
121121
122122
Parameters
@@ -142,7 +142,7 @@ def _nanpercentile_1d(
142142
# equiv: 'np.array([na_value] * len(qs))' but much faster
143143
return np.full(len(qs), na_value)
144144

145-
return np.percentile(
145+
return np.quantile(
146146
values,
147147
qs,
148148
# error: No overload variant of "quantile" matches argument
@@ -152,7 +152,7 @@ def _nanpercentile_1d(
152152
)
153153

154154

155-
def _nanpercentile(
155+
def _nanquantile(
156156
values: np.ndarray,
157157
qs: npt.NDArray[np.float64],
158158
*,
@@ -161,7 +161,7 @@ def _nanpercentile(
161161
interpolation: str,
162162
):
163163
"""
164-
Wrapper for np.percentile that skips missing values.
164+
Wrapper for np.quantile that skips missing values.
165165
166166
Parameters
167167
----------
@@ -180,7 +180,7 @@ def _nanpercentile(
180180

181181
if values.dtype.kind in "mM":
182182
# need to cast to integer to avoid rounding errors in numpy
183-
result = _nanpercentile(
183+
result = _nanquantile(
184184
values.view("i8"),
185185
qs=qs,
186186
na_value=na_value.view("i8"),
@@ -196,7 +196,7 @@ def _nanpercentile(
196196
# Caller is responsible for ensuring mask shape match
197197
assert mask.shape == values.shape
198198
result = [
199-
_nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation)
199+
_nanquantile_1d(val, m, qs, na_value, interpolation=interpolation)
200200
for (val, m) in zip(list(values), list(mask))
201201
]
202202
if values.dtype.kind == "f":
@@ -215,7 +215,7 @@ def _nanpercentile(
215215
result = result.astype(values.dtype, copy=False)
216216
return result
217217
else:
218-
return np.percentile(
218+
return np.quantile(
219219
values,
220220
qs,
221221
axis=1,

pandas/core/arrays/string_.py

Lines changed: 92 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -346,14 +346,64 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self:
346346
raise ValueError
347347
return cls._from_sequence(scalars, dtype=dtype)
348348

349+
def _str_map(
350+
self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
351+
):
352+
if self.dtype.na_value is np.nan:
353+
return self._str_map_nan_semantics(
354+
f, na_value=na_value, dtype=dtype, convert=convert
355+
)
356+
357+
from pandas.arrays import BooleanArray
358+
359+
if dtype is None:
360+
dtype = self.dtype
361+
if na_value is None:
362+
na_value = self.dtype.na_value
363+
364+
mask = isna(self)
365+
arr = np.asarray(self)
366+
367+
if is_integer_dtype(dtype) or is_bool_dtype(dtype):
368+
constructor: type[IntegerArray | BooleanArray]
369+
if is_integer_dtype(dtype):
370+
constructor = IntegerArray
371+
else:
372+
constructor = BooleanArray
373+
374+
na_value_is_na = isna(na_value)
375+
if na_value_is_na:
376+
na_value = 1
377+
elif dtype == np.dtype("bool"):
378+
# GH#55736
379+
na_value = bool(na_value)
380+
result = lib.map_infer_mask(
381+
arr,
382+
f,
383+
mask.view("uint8"),
384+
convert=False,
385+
na_value=na_value,
386+
# error: Argument 1 to "dtype" has incompatible type
387+
# "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
388+
# "Type[object]"
389+
dtype=np.dtype(cast(type, dtype)),
390+
)
391+
392+
if not na_value_is_na:
393+
mask[:] = False
394+
395+
return constructor(result, mask)
396+
397+
else:
398+
return self._str_map_str_or_object(dtype, na_value, arr, f, mask)
399+
349400
def _str_map_str_or_object(
350401
self,
351402
dtype,
352403
na_value,
353404
arr: np.ndarray,
354405
f,
355406
mask: npt.NDArray[np.bool_],
356-
convert: bool,
357407
):
358408
# _str_map helper for case where dtype is either string dtype or object
359409
if is_string_dtype(dtype) and not is_object_dtype(dtype):
@@ -377,6 +427,47 @@ def _str_map_str_or_object(
377427
# -> We don't know the result type. E.g. `.get` can return anything.
378428
return lib.map_infer_mask(arr, f, mask.view("uint8"))
379429

430+
def _str_map_nan_semantics(
431+
self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
432+
):
433+
if dtype is None:
434+
dtype = self.dtype
435+
if na_value is None:
436+
na_value = self.dtype.na_value
437+
438+
mask = isna(self)
439+
arr = np.asarray(self)
440+
441+
if is_integer_dtype(dtype) or is_bool_dtype(dtype):
442+
na_value_is_na = isna(na_value)
443+
if na_value_is_na:
444+
if is_integer_dtype(dtype):
445+
na_value = 0
446+
else:
447+
na_value = True
448+
449+
result = lib.map_infer_mask(
450+
arr,
451+
f,
452+
mask.view("uint8"),
453+
convert=False,
454+
na_value=na_value,
455+
dtype=np.dtype(cast(type, dtype)),
456+
)
457+
if na_value_is_na and mask.any():
458+
# TODO: we could alternatively do this check before map_infer_mask
459+
# and adjust the dtype/na_value we pass there. Which is more
460+
# performant?
461+
if is_integer_dtype(dtype):
462+
result = result.astype("float64")
463+
else:
464+
result = result.astype("object")
465+
result[mask] = np.nan
466+
return result
467+
468+
else:
469+
return self._str_map_str_or_object(dtype, na_value, arr, f, mask)
470+
380471

381472
# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
382473
# incompatible with definition in base class "ExtensionArray"
@@ -742,95 +833,6 @@ def _cmp_method(self, other, op):
742833
# base class "NumpyExtensionArray" defined the type as "float")
743834
_str_na_value = libmissing.NA # type: ignore[assignment]
744835

745-
def _str_map_nan_semantics(
746-
self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
747-
):
748-
if dtype is None:
749-
dtype = self.dtype
750-
if na_value is None:
751-
na_value = self.dtype.na_value
752-
753-
mask = isna(self)
754-
arr = np.asarray(self)
755-
convert = convert and not np.all(mask)
756-
757-
if is_integer_dtype(dtype) or is_bool_dtype(dtype):
758-
na_value_is_na = isna(na_value)
759-
if na_value_is_na:
760-
if is_integer_dtype(dtype):
761-
na_value = 0
762-
else:
763-
na_value = True
764-
765-
result = lib.map_infer_mask(
766-
arr,
767-
f,
768-
mask.view("uint8"),
769-
convert=False,
770-
na_value=na_value,
771-
dtype=np.dtype(cast(type, dtype)),
772-
)
773-
if na_value_is_na and mask.any():
774-
if is_integer_dtype(dtype):
775-
result = result.astype("float64")
776-
else:
777-
result = result.astype("object")
778-
result[mask] = np.nan
779-
return result
780-
781-
else:
782-
return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert)
783-
784-
def _str_map(
785-
self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
786-
):
787-
if self.dtype.na_value is np.nan:
788-
return self._str_map_nan_semantics(
789-
f, na_value=na_value, dtype=dtype, convert=convert
790-
)
791-
792-
from pandas.arrays import BooleanArray
793-
794-
if dtype is None:
795-
dtype = StringDtype(storage="python")
796-
if na_value is None:
797-
na_value = self.dtype.na_value
798-
799-
mask = isna(self)
800-
arr = np.asarray(self)
801-
802-
if is_integer_dtype(dtype) or is_bool_dtype(dtype):
803-
constructor: type[IntegerArray | BooleanArray]
804-
if is_integer_dtype(dtype):
805-
constructor = IntegerArray
806-
else:
807-
constructor = BooleanArray
808-
809-
na_value_is_na = isna(na_value)
810-
if na_value_is_na:
811-
na_value = 1
812-
elif dtype == np.dtype("bool"):
813-
na_value = bool(na_value)
814-
result = lib.map_infer_mask(
815-
arr,
816-
f,
817-
mask.view("uint8"),
818-
convert=False,
819-
na_value=na_value,
820-
# error: Argument 1 to "dtype" has incompatible type
821-
# "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
822-
# "Type[object]"
823-
dtype=np.dtype(cast(type, dtype)),
824-
)
825-
826-
if not na_value_is_na:
827-
mask[:] = False
828-
829-
return constructor(result, mask)
830-
831-
else:
832-
return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert)
833-
834836

835837
class StringArrayNumpySemantics(StringArray):
836838
_storage = "python"

0 commit comments

Comments
 (0)