Skip to content

Commit 82cecd1

Browse files
committed
adding tm.skip_if_no decorator to tests per feedback
2 parents c1ed1f0 + 4f328f0 commit 82cecd1

File tree

21 files changed

+438
-165
lines changed

21 files changed

+438
-165
lines changed

ci/code_checks.sh

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,9 +73,30 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
7373
-i "pandas.Period.freq GL08" \
7474
-i "pandas.Period.ordinal GL08" \
7575
-i "pandas.RangeIndex.from_range PR01,SA01" \
76+
-i "pandas.Series.cat.add_categories PR01,PR02" \
77+
-i "pandas.Series.cat.as_ordered PR01" \
78+
-i "pandas.Series.cat.as_unordered PR01" \
79+
-i "pandas.Series.cat.remove_categories PR01,PR02" \
80+
-i "pandas.Series.cat.remove_unused_categories PR01" \
81+
-i "pandas.Series.cat.rename_categories PR01,PR02" \
82+
-i "pandas.Series.cat.reorder_categories PR01,PR02" \
83+
-i "pandas.Series.cat.set_categories PR01,PR02" \
84+
-i "pandas.Series.dt.as_unit PR01,PR02" \
85+
-i "pandas.Series.dt.ceil PR01,PR02" \
86+
-i "pandas.Series.dt.day_name PR01,PR02" \
87+
-i "pandas.Series.dt.floor PR01,PR02" \
7688
-i "pandas.Series.dt.freq GL08" \
89+
-i "pandas.Series.dt.month_name PR01,PR02" \
90+
-i "pandas.Series.dt.normalize PR01" \
91+
-i "pandas.Series.dt.round PR01,PR02" \
92+
-i "pandas.Series.dt.strftime PR01,PR02" \
93+
-i "pandas.Series.dt.to_period PR01,PR02" \
94+
-i "pandas.Series.dt.total_seconds PR01" \
95+
-i "pandas.Series.dt.tz_convert PR01,PR02" \
96+
-i "pandas.Series.dt.tz_localize PR01,PR02" \
7797
-i "pandas.Series.dt.unit GL08" \
7898
-i "pandas.Series.pad PR01,SA01" \
99+
-i "pandas.Series.sparse.from_coo PR07,SA01" \
79100
-i "pandas.Timedelta.max PR02" \
80101
-i "pandas.Timedelta.min PR02" \
81102
-i "pandas.Timedelta.resolution PR02" \
@@ -85,11 +106,13 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
85106
-i "pandas.Timestamp.resolution PR02" \
86107
-i "pandas.Timestamp.tzinfo GL08" \
87108
-i "pandas.Timestamp.year GL08" \
109+
-i "pandas.api.types.is_float PR01,SA01" \
88110
-i "pandas.api.types.is_integer PR01,SA01" \
89111
-i "pandas.api.types.is_iterator PR07,SA01" \
90112
-i "pandas.api.types.is_re_compilable PR07,SA01" \
91113
-i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \
92114
-i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
115+
-i "pandas.arrays.DatetimeArray SA01" \
93116
-i "pandas.arrays.IntegerArray SA01" \
94117
-i "pandas.arrays.IntervalArray.left SA01" \
95118
-i "pandas.arrays.IntervalArray.length SA01" \
@@ -140,6 +163,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
140163
-i "pandas.errors.DuplicateLabelError SA01" \
141164
-i "pandas.errors.IntCastingNaNError SA01" \
142165
-i "pandas.errors.InvalidIndexError SA01" \
166+
-i "pandas.errors.InvalidVersion SA01" \
143167
-i "pandas.errors.NullFrequencyError SA01" \
144168
-i "pandas.errors.NumExprClobberingError SA01" \
145169
-i "pandas.errors.NumbaUtilError SA01" \
@@ -148,18 +172,24 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
148172
-i "pandas.errors.PerformanceWarning SA01" \
149173
-i "pandas.errors.PossibleDataLossError SA01" \
150174
-i "pandas.errors.PossiblePrecisionLoss SA01" \
175+
-i "pandas.errors.SpecificationError SA01" \
151176
-i "pandas.errors.UndefinedVariableError PR01,SA01" \
152177
-i "pandas.errors.UnsortedIndexError SA01" \
153178
-i "pandas.errors.UnsupportedFunctionCall SA01" \
154179
-i "pandas.errors.ValueLabelTypeMismatch SA01" \
155180
-i "pandas.infer_freq SA01" \
156181
-i "pandas.io.json.build_table_schema PR07,RT03,SA01" \
182+
-i "pandas.io.stata.StataReader.data_label SA01" \
183+
-i "pandas.io.stata.StataReader.value_labels RT03,SA01" \
157184
-i "pandas.io.stata.StataReader.variable_labels RT03,SA01" \
158185
-i "pandas.io.stata.StataWriter.write_file SA01" \
159186
-i "pandas.json_normalize RT03,SA01" \
187+
-i "pandas.period_range RT03,SA01" \
160188
-i "pandas.plotting.andrews_curves RT03,SA01" \
189+
-i "pandas.plotting.lag_plot RT03,SA01" \
161190
-i "pandas.plotting.scatter_matrix PR07,SA01" \
162191
-i "pandas.set_eng_float_format RT03,SA01" \
192+
-i "pandas.testing.assert_extension_array_equal SA01" \
163193
-i "pandas.tseries.offsets.BDay PR02,SA01" \
164194
-i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \
165195
-i "pandas.tseries.offsets.BQuarterBegin.n GL08" \

doc/source/whatsnew/v3.0.0.rst

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,67 @@ In cases with mixed-resolution inputs, the highest resolution is used:
203203
In [2]: pd.to_datetime([pd.Timestamp("2024-03-22 11:43:01"), "2024-03-22 11:43:01.002"]).dtype
204204
Out[2]: dtype('<M8[ns]')
205205
206+
.. _whatsnew_300.api_breaking.value_counts_sorting:
207+
208+
Changed behavior in :meth:`DataFrame.value_counts` and :meth:`.DataFrameGroupBy.value_counts` when ``sort=False``
209+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
210+
211+
In previous versions of pandas, :meth:`DataFrame.value_counts` with ``sort=False`` would sort the result by row labels (as was documented). This was unintuitive and inconsistent with :meth:`Series.value_counts`, which would maintain the order of the input. Now :meth:`DataFrame.value_counts` will maintain the order of the input.
212+
213+
.. ipython:: python
214+
215+
df = pd.DataFrame(
216+
{
217+
"a": [2, 2, 2, 2, 1, 1, 1, 1],
218+
"b": [2, 1, 3, 1, 2, 3, 1, 1],
219+
}
220+
)
221+
df
222+
223+
*Old behavior*
224+
225+
.. code-block:: ipython
226+
227+
In [3]: df.value_counts(sort=False)
228+
Out[3]:
229+
a b
230+
1 1 2
231+
2 1
232+
3 1
233+
2 1 2
234+
2 1
235+
3 1
236+
Name: count, dtype: int64
237+
238+
*New behavior*
239+
240+
.. ipython:: python
241+
242+
df.value_counts(sort=False)
243+
244+
This change also applies to :meth:`.DataFrameGroupBy.value_counts`. Here, there are two options for sorting: one ``sort`` passed to :meth:`DataFrame.groupby` and one passed directly to :meth:`.DataFrameGroupBy.value_counts`. The former determines whether to sort the groups; the latter determines whether to sort the counts. All non-grouping columns will maintain the order of the input *within groups*.
245+
246+
*Old behavior*
247+
248+
.. code-block:: ipython
249+
250+
In [5]: df.groupby("a", sort=True).value_counts(sort=False)
251+
Out[5]:
252+
a b
253+
1 1 2
254+
2 1
255+
3 1
256+
2 1 2
257+
2 1
258+
3 1
259+
dtype: int64
260+
261+
*New behavior*
262+
263+
.. ipython:: python
264+
265+
df.groupby("a", sort=True).value_counts(sort=False)
266+
206267
.. _whatsnew_300.api_breaking.deps:
207268

208269
Increased minimum version for Python
@@ -544,7 +605,7 @@ Bug fixes
544605

545606
Categorical
546607
^^^^^^^^^^^
547-
-
608+
- Bug in :meth:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`)
548609
-
549610

550611
Datetimelike
@@ -682,6 +743,7 @@ Sparse
682743
^^^^^^
683744
- Bug in :class:`SparseDtype` equality comparison when the fill value is NA. (:issue:`54770`)
684745
- Bug in :meth:`DataFrame.sparse.from_spmatrix` which hard-coded an invalid ``fill_value`` for certain subtypes. (:issue:`59063`)
746+
- Bug in :meth:`DataFrame.sparse.to_dense` which ignored subclassing and always returned an instance of :class:`DataFrame` (:issue:`59913`)
685747

686748
ExtensionArray
687749
^^^^^^^^^^^^^^
@@ -700,6 +762,7 @@ Other
700762
- Bug in :func:`eval` where expressions on :class:`ExtensionArray` that include division ``/`` failed with a ``TypeError``. (:issue:`58748`)
701763
- Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`)
702764
- Bug in :func:`eval` with ``engine="numexpr"`` returning unexpected result for float division. (:issue:`59736`)
765+
- Bug in :func:`to_numeric` raising ``TypeError`` when ``arg`` is a :class:`Timedelta` or :class:`Timestamp` scalar. (:issue:`59944`)
703766
- Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`)
704767
- Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`)
705768
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`)

pandas/core/arrays/_arrow_string_mixins.py

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,14 @@
1010

1111
import numpy as np
1212

13+
from pandas._libs import lib
1314
from pandas.compat import (
1415
pa_version_under10p1,
1516
pa_version_under11p0,
1617
pa_version_under13p0,
1718
pa_version_under17p0,
1819
)
1920

20-
from pandas.core.dtypes.missing import isna
21-
2221
if not pa_version_under10p1:
2322
import pyarrow as pa
2423
import pyarrow.compute as pc
@@ -38,7 +37,7 @@ class ArrowStringArrayMixin:
3837
def __init__(self, *args, **kwargs) -> None:
3938
raise NotImplementedError
4039

41-
def _convert_bool_result(self, result):
40+
def _convert_bool_result(self, result, na=lib.no_default, method_name=None):
4241
# Convert a bool-dtype result to the appropriate result type
4342
raise NotImplementedError
4443

@@ -212,7 +211,9 @@ def _str_removesuffix(self, suffix: str):
212211
result = pc.if_else(ends_with, removed, self._pa_array)
213212
return type(self)(result)
214213

215-
def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
214+
def _str_startswith(
215+
self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default
216+
):
216217
if isinstance(pat, str):
217218
result = pc.starts_with(self._pa_array, pattern=pat)
218219
else:
@@ -225,11 +226,11 @@ def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
225226

226227
for p in pat[1:]:
227228
result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
228-
if not isna(na): # pyright: ignore [reportGeneralTypeIssues]
229-
result = result.fill_null(na)
230-
return self._convert_bool_result(result)
229+
return self._convert_bool_result(result, na=na, method_name="startswith")
231230

232-
def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
231+
def _str_endswith(
232+
self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default
233+
):
233234
if isinstance(pat, str):
234235
result = pc.ends_with(self._pa_array, pattern=pat)
235236
else:
@@ -242,9 +243,7 @@ def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
242243

243244
for p in pat[1:]:
244245
result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
245-
if not isna(na): # pyright: ignore [reportGeneralTypeIssues]
246-
result = result.fill_null(na)
247-
return self._convert_bool_result(result)
246+
return self._convert_bool_result(result, na=na, method_name="endswith")
248247

249248
def _str_isalnum(self):
250249
result = pc.utf8_is_alnum(self._pa_array)
@@ -283,7 +282,12 @@ def _str_isupper(self):
283282
return self._convert_bool_result(result)
284283

285284
def _str_contains(
286-
self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
285+
self,
286+
pat,
287+
case: bool = True,
288+
flags: int = 0,
289+
na: Scalar | lib.NoDefault = lib.no_default,
290+
regex: bool = True,
287291
):
288292
if flags:
289293
raise NotImplementedError(f"contains not implemented with {flags=}")
@@ -293,19 +297,25 @@ def _str_contains(
293297
else:
294298
pa_contains = pc.match_substring
295299
result = pa_contains(self._pa_array, pat, ignore_case=not case)
296-
if not isna(na): # pyright: ignore [reportGeneralTypeIssues]
297-
result = result.fill_null(na)
298-
return self._convert_bool_result(result)
300+
return self._convert_bool_result(result, na=na, method_name="contains")
299301

300302
def _str_match(
301-
self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
303+
self,
304+
pat: str,
305+
case: bool = True,
306+
flags: int = 0,
307+
na: Scalar | lib.NoDefault = lib.no_default,
302308
):
303309
if not pat.startswith("^"):
304310
pat = f"^{pat}"
305311
return self._str_contains(pat, case, flags, na, regex=True)
306312

307313
def _str_fullmatch(
308-
self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
314+
self,
315+
pat,
316+
case: bool = True,
317+
flags: int = 0,
318+
na: Scalar | lib.NoDefault = lib.no_default,
309319
):
310320
if not pat.endswith("$") or pat.endswith("\\$"):
311321
pat = f"{pat}$"

pandas/core/arrays/arrow/array.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2318,7 +2318,9 @@ def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
23182318
for chunk in self._pa_array.iterchunks()
23192319
]
23202320

2321-
def _convert_bool_result(self, result):
2321+
def _convert_bool_result(self, result, na=lib.no_default, method_name=None):
2322+
if na is not lib.no_default and not isna(na): # pyright: ignore [reportGeneralTypeIssues]
2323+
result = result.fill_null(na)
23222324
return type(self)(result)
23232325

23242326
def _convert_int_result(self, result):

pandas/core/arrays/categorical.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2679,16 +2679,28 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
26792679
# ------------------------------------------------------------------------
26802680
# String methods interface
26812681
def _str_map(
2682-
self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True
2682+
self, f, na_value=lib.no_default, dtype=np.dtype("object"), convert: bool = True
26832683
):
26842684
# Optimization to apply the callable `f` to the categories once
26852685
# and rebuild the result by `take`ing from the result with the codes.
26862686
# Returns the same type as the object-dtype implementation though.
2687-
from pandas.core.arrays import NumpyExtensionArray
2688-
26892687
categories = self.categories
26902688
codes = self.codes
2691-
result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype)
2689+
if categories.dtype == "string":
2690+
result = categories.array._str_map(f, na_value, dtype) # type: ignore[attr-defined]
2691+
if (
2692+
categories.dtype.na_value is np.nan # type: ignore[union-attr]
2693+
and is_bool_dtype(dtype)
2694+
and (na_value is lib.no_default or isna(na_value))
2695+
):
2696+
# NaN propagates as False for functions with boolean return type
2697+
na_value = False
2698+
else:
2699+
from pandas.core.arrays import NumpyExtensionArray
2700+
2701+
result = NumpyExtensionArray(categories.to_numpy())._str_map(
2702+
f, na_value, dtype
2703+
)
26922704
return take_nd(result, codes, fill_value=na_value)
26932705

26942706
def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):

pandas/core/arrays/string_.py

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,11 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self:
381381
return cls._from_sequence(scalars, dtype=dtype)
382382

383383
def _str_map(
384-
self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
384+
self,
385+
f,
386+
na_value=lib.no_default,
387+
dtype: Dtype | None = None,
388+
convert: bool = True,
385389
):
386390
if self.dtype.na_value is np.nan:
387391
return self._str_map_nan_semantics(f, na_value=na_value, dtype=dtype)
@@ -390,7 +394,7 @@ def _str_map(
390394

391395
if dtype is None:
392396
dtype = self.dtype
393-
if na_value is None:
397+
if na_value is lib.no_default:
394398
na_value = self.dtype.na_value
395399

396400
mask = isna(self)
@@ -459,11 +463,17 @@ def _str_map_str_or_object(
459463
# -> We don't know the result type. E.g. `.get` can return anything.
460464
return lib.map_infer_mask(arr, f, mask.view("uint8"))
461465

462-
def _str_map_nan_semantics(self, f, na_value=None, dtype: Dtype | None = None):
466+
def _str_map_nan_semantics(
467+
self, f, na_value=lib.no_default, dtype: Dtype | None = None
468+
):
463469
if dtype is None:
464470
dtype = self.dtype
465-
if na_value is None:
466-
na_value = self.dtype.na_value
471+
if na_value is lib.no_default:
472+
if is_bool_dtype(dtype):
473+
# NaN propagates as False
474+
na_value = False
475+
else:
476+
na_value = self.dtype.na_value
467477

468478
mask = isna(self)
469479
arr = np.asarray(self)
@@ -474,7 +484,8 @@ def _str_map_nan_semantics(self, f, na_value=None, dtype: Dtype | None = None):
474484
if is_integer_dtype(dtype):
475485
na_value = 0
476486
else:
477-
na_value = True
487+
# NaN propagates as False
488+
na_value = False
478489

479490
result = lib.map_infer_mask(
480491
arr,
@@ -484,15 +495,13 @@ def _str_map_nan_semantics(self, f, na_value=None, dtype: Dtype | None = None):
484495
na_value=na_value,
485496
dtype=np.dtype(cast(type, dtype)),
486497
)
487-
if na_value_is_na and mask.any():
498+
if na_value_is_na and is_integer_dtype(dtype) and mask.any():
488499
# TODO: we could alternatively do this check before map_infer_mask
489500
# and adjust the dtype/na_value we pass there. Which is more
490501
# performant?
491-
if is_integer_dtype(dtype):
492-
result = result.astype("float64")
493-
else:
494-
result = result.astype("object")
502+
result = result.astype("float64")
495503
result[mask] = np.nan
504+
496505
return result
497506

498507
else:

0 commit comments

Comments
 (0)