Skip to content

Commit 55f86c2

Browse files
authored
Merge branch 'main' into example-correction-groupby
2 parents f879523 + 9c8c685 commit 55f86c2

File tree

159 files changed

+796
-146
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

159 files changed

+796
-146
lines changed

.github/workflows/unit-tests.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ jobs:
5959
extra_loc: "zh_CN"
6060
- name: "Future infer strings"
6161
env_file: actions-311.yaml
62-
pattern: "not slow and not network and not single_cpu"
6362
pandas_future_infer_string: "1"
6463
- name: "Pypy"
6564
env_file: actions-pypy-39.yaml

ci/code_checks.sh

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
223223
-i "pandas.Timestamp.fromordinal SA01" \
224224
-i "pandas.Timestamp.fromtimestamp PR01,SA01" \
225225
-i "pandas.Timestamp.hour GL08" \
226-
-i "pandas.Timestamp.isoweekday SA01" \
227226
-i "pandas.Timestamp.max PR02" \
228227
-i "pandas.Timestamp.microsecond GL08" \
229228
-i "pandas.Timestamp.min PR02" \
@@ -328,7 +327,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
328327
-i "pandas.core.groupby.DataFrameGroupBy.hist RT03" \
329328
-i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \
330329
-i "pandas.core.groupby.DataFrameGroupBy.max SA01" \
331-
-i "pandas.core.groupby.DataFrameGroupBy.median SA01" \
332330
-i "pandas.core.groupby.DataFrameGroupBy.min SA01" \
333331
-i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \
334332
-i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \
@@ -347,7 +345,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
347345
-i "pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing SA01" \
348346
-i "pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing SA01" \
349347
-i "pandas.core.groupby.SeriesGroupBy.max SA01" \
350-
-i "pandas.core.groupby.SeriesGroupBy.median SA01" \
351348
-i "pandas.core.groupby.SeriesGroupBy.min SA01" \
352349
-i "pandas.core.groupby.SeriesGroupBy.nth PR02" \
353350
-i "pandas.core.groupby.SeriesGroupBy.ohlc SA01" \
@@ -362,7 +359,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
362359
-i "pandas.core.resample.Resampler.indices SA01" \
363360
-i "pandas.core.resample.Resampler.max PR01,RT03,SA01" \
364361
-i "pandas.core.resample.Resampler.mean SA01" \
365-
-i "pandas.core.resample.Resampler.median SA01" \
366362
-i "pandas.core.resample.Resampler.min PR01,RT03,SA01" \
367363
-i "pandas.core.resample.Resampler.ohlc SA01" \
368364
-i "pandas.core.resample.Resampler.prod SA01" \

ci/run_tests.sh

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,5 @@ if [[ "$PATTERN" ]]; then
1616
PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\""
1717
fi
1818

19-
# temporarily let pytest always succeed (many tests are not yet passing in the
20-
# build enabling the future string dtype)
21-
if [[ "$PANDAS_FUTURE_INFER_STRING" == "1" ]]; then
22-
PYTEST_CMD="$PYTEST_CMD || true"
23-
fi
24-
2519
echo $PYTEST_CMD
2620
sh -c "$PYTEST_CMD"

pandas/_libs/lib.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2702,7 +2702,7 @@ def maybe_convert_objects(ndarray[object] objects,
27022702
if using_string_dtype() and is_string_array(objects, skipna=True):
27032703
from pandas.core.arrays.string_ import StringDtype
27042704

2705-
dtype = StringDtype(storage="pyarrow_numpy")
2705+
dtype = StringDtype(storage="pyarrow", na_value=np.nan)
27062706
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
27072707

27082708
elif convert_to_nullable_dtype and is_string_array(objects, skipna=True):

pandas/_libs/tslibs/nattype.pyx

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,13 @@ class NaTType(_NaT):
441441
442442
Monday == 1 ... Sunday == 7.
443443
444+
See Also
445+
--------
446+
Timestamp.weekday : Return the day of the week with Monday=0, Sunday=6.
447+
Timestamp.isocalendar : Return a tuple containing ISO year, week number
448+
and weekday.
449+
datetime.date.isoweekday : Equivalent method in datetime module.
450+
444451
Examples
445452
--------
446453
>>> ts = pd.Timestamp('2023-01-01 10:00:00')

pandas/_libs/tslibs/timestamps.pyx

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2775,6 +2775,13 @@ default 'raise'
27752775
27762776
Monday == 1 ... Sunday == 7.
27772777
2778+
See Also
2779+
--------
2780+
Timestamp.weekday : Return the day of the week with Monday=0, Sunday=6.
2781+
Timestamp.isocalendar : Return a tuple containing ISO year, week number
2782+
and weekday.
2783+
datetime.date.isoweekday : Equivalent method in datetime module.
2784+
27782785
Examples
27792786
--------
27802787
>>> ts = pd.Timestamp('2023-01-01 10:00:00')

pandas/_testing/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -509,14 +509,14 @@ def shares_memory(left, right) -> bool:
509509
if (
510510
isinstance(left, ExtensionArray)
511511
and is_string_dtype(left.dtype)
512-
and left.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined]
512+
and left.dtype.storage == "pyarrow" # type: ignore[attr-defined]
513513
):
514514
# https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
515515
left = cast("ArrowExtensionArray", left)
516516
if (
517517
isinstance(right, ExtensionArray)
518518
and is_string_dtype(right.dtype)
519-
and right.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined]
519+
and right.dtype.storage == "pyarrow" # type: ignore[attr-defined]
520520
):
521521
right = cast("ArrowExtensionArray", right)
522522
left_pa_data = left._pa_array

pandas/core/arrays/arrow/array.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -575,10 +575,8 @@ def __getitem__(self, item: PositionalIndexer):
575575
if isinstance(item, np.ndarray):
576576
if not len(item):
577577
# Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
578-
if self._dtype.name == "string" and self._dtype.storage in (
579-
"pyarrow",
580-
"pyarrow_numpy",
581-
):
578+
if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
579+
# TODO(infer_string) should this be large_string?
582580
pa_dtype = pa.string()
583581
else:
584582
pa_dtype = self._dtype.pyarrow_dtype

pandas/core/arrays/string_.py

Lines changed: 68 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,10 @@
99

1010
import numpy as np
1111

12-
from pandas._config import get_option
12+
from pandas._config import (
13+
get_option,
14+
using_string_dtype,
15+
)
1316

1417
from pandas._libs import (
1518
lib,
@@ -81,8 +84,10 @@ class StringDtype(StorageExtensionDtype):
8184
8285
Parameters
8386
----------
84-
storage : {"python", "pyarrow", "pyarrow_numpy"}, optional
87+
storage : {"python", "pyarrow"}, optional
8588
If not given, the value of ``pd.options.mode.string_storage``.
89+
na_value : {np.nan, pd.NA}, default pd.NA
90+
Whether the dtype follows NaN or NA missing value semantics.
8691
8792
Attributes
8893
----------
@@ -113,30 +118,67 @@ class StringDtype(StorageExtensionDtype):
113118
# follows NumPy semantics, which uses nan.
114119
@property
115120
def na_value(self) -> libmissing.NAType | float: # type: ignore[override]
116-
if self.storage == "pyarrow_numpy":
117-
return np.nan
118-
else:
119-
return libmissing.NA
121+
return self._na_value
120122

121-
_metadata = ("storage",)
123+
_metadata = ("storage", "_na_value") # type: ignore[assignment]
122124

123-
def __init__(self, storage=None) -> None:
125+
def __init__(
126+
self,
127+
storage: str | None = None,
128+
na_value: libmissing.NAType | float = libmissing.NA,
129+
) -> None:
130+
# infer defaults
124131
if storage is None:
125-
infer_string = get_option("future.infer_string")
126-
if infer_string:
127-
storage = "pyarrow_numpy"
132+
if using_string_dtype():
133+
storage = "pyarrow"
128134
else:
129135
storage = get_option("mode.string_storage")
130-
if storage not in {"python", "pyarrow", "pyarrow_numpy"}:
136+
137+
if storage == "pyarrow_numpy":
138+
# TODO raise a deprecation warning
139+
storage = "pyarrow"
140+
na_value = np.nan
141+
142+
# validate options
143+
if storage not in {"python", "pyarrow"}:
131144
raise ValueError(
132-
f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. "
133-
f"Got {storage} instead."
145+
f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
134146
)
135-
if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1:
147+
if storage == "pyarrow" and pa_version_under10p1:
136148
raise ImportError(
137149
"pyarrow>=10.0.1 is required for PyArrow backed StringArray."
138150
)
151+
152+
if isinstance(na_value, float) and np.isnan(na_value):
153+
# when passed a NaN value, always set to np.nan to ensure we use
154+
# a consistent NaN value (and we can use `dtype.na_value is np.nan`)
155+
na_value = np.nan
156+
elif na_value is not libmissing.NA:
157+
raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}")
158+
139159
self.storage = storage
160+
self._na_value = na_value
161+
162+
def __eq__(self, other: object) -> bool:
163+
# we need to override the base class __eq__ because na_value (NA or NaN)
164+
# cannot be checked with normal `==`
165+
if isinstance(other, str):
166+
if other == self.name:
167+
return True
168+
try:
169+
other = self.construct_from_string(other)
170+
except TypeError:
171+
return False
172+
if isinstance(other, type(self)):
173+
return self.storage == other.storage and self.na_value is other.na_value
174+
return False
175+
176+
def __hash__(self) -> int:
177+
# need to override __hash__ as well because of overriding __eq__
178+
return super().__hash__()
179+
180+
def __reduce__(self):
181+
return StringDtype, (self.storage, self.na_value)
140182

141183
@property
142184
def type(self) -> type[str]:
@@ -181,6 +223,7 @@ def construct_from_string(cls, string) -> Self:
181223
elif string == "string[pyarrow]":
182224
return cls(storage="pyarrow")
183225
elif string == "string[pyarrow_numpy]":
226+
# TODO deprecate
184227
return cls(storage="pyarrow_numpy")
185228
else:
186229
raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
@@ -205,7 +248,7 @@ def construct_array_type( # type: ignore[override]
205248

206249
if self.storage == "python":
207250
return StringArray
208-
elif self.storage == "pyarrow":
251+
elif self.storage == "pyarrow" and self._na_value is libmissing.NA:
209252
return ArrowStringArray
210253
else:
211254
return ArrowStringArrayNumpySemantics
@@ -217,13 +260,17 @@ def __from_arrow__(
217260
Construct StringArray from pyarrow Array/ChunkedArray.
218261
"""
219262
if self.storage == "pyarrow":
220-
from pandas.core.arrays.string_arrow import ArrowStringArray
263+
if self._na_value is libmissing.NA:
264+
from pandas.core.arrays.string_arrow import ArrowStringArray
265+
266+
return ArrowStringArray(array)
267+
else:
268+
from pandas.core.arrays.string_arrow import (
269+
ArrowStringArrayNumpySemantics,
270+
)
221271

222-
return ArrowStringArray(array)
223-
elif self.storage == "pyarrow_numpy":
224-
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
272+
return ArrowStringArrayNumpySemantics(array)
225273

226-
return ArrowStringArrayNumpySemantics(array)
227274
else:
228275
import pyarrow
229276

pandas/core/arrays/string_arrow.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
131131
# base class "ArrowExtensionArray" defined the type as "ArrowDtype")
132132
_dtype: StringDtype # type: ignore[assignment]
133133
_storage = "pyarrow"
134+
_na_value: libmissing.NAType | float = libmissing.NA
134135

135136
def __init__(self, values) -> None:
136137
_chk_pyarrow_available()
@@ -140,7 +141,7 @@ def __init__(self, values) -> None:
140141
values = pc.cast(values, pa.large_string())
141142

142143
super().__init__(values)
143-
self._dtype = StringDtype(storage=self._storage)
144+
self._dtype = StringDtype(storage=self._storage, na_value=self._na_value)
144145

145146
if not pa.types.is_large_string(self._pa_array.type) and not (
146147
pa.types.is_dictionary(self._pa_array.type)
@@ -187,10 +188,7 @@ def _from_sequence(
187188

188189
if dtype and not (isinstance(dtype, str) and dtype == "string"):
189190
dtype = pandas_dtype(dtype)
190-
assert isinstance(dtype, StringDtype) and dtype.storage in (
191-
"pyarrow",
192-
"pyarrow_numpy",
193-
)
191+
assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow"
194192

195193
if isinstance(scalars, BaseMaskedArray):
196194
# avoid costly conversion to object dtype in ensure_string_array and
@@ -597,7 +595,8 @@ def _rank(
597595

598596

599597
class ArrowStringArrayNumpySemantics(ArrowStringArray):
600-
_storage = "pyarrow_numpy"
598+
_storage = "pyarrow"
599+
_na_value = np.nan
601600

602601
@classmethod
603602
def _result_converter(cls, values, na=None):

0 commit comments

Comments
 (0)