Skip to content

Commit 2e0f9a2

Browse files
authored
Merge branch 'main' into to_timestamp
2 parents c7cc6db + 3c2c5f4 commit 2e0f9a2

File tree

13 files changed

+206
-58
lines changed

13 files changed

+206
-58
lines changed

ci/code_checks.sh

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
7676
-i "pandas.Series.dt.freq GL08" \
7777
-i "pandas.Series.dt.unit GL08" \
7878
-i "pandas.Series.pad PR01,SA01" \
79-
-i "pandas.Series.sparse.from_coo PR07,SA01" \
8079
-i "pandas.Timedelta.max PR02" \
8180
-i "pandas.Timedelta.min PR02" \
8281
-i "pandas.Timedelta.resolution PR02" \
@@ -86,13 +85,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
8685
-i "pandas.Timestamp.resolution PR02" \
8786
-i "pandas.Timestamp.tzinfo GL08" \
8887
-i "pandas.Timestamp.year GL08" \
89-
-i "pandas.api.types.is_float PR01,SA01" \
9088
-i "pandas.api.types.is_integer PR01,SA01" \
9189
-i "pandas.api.types.is_iterator PR07,SA01" \
9290
-i "pandas.api.types.is_re_compilable PR07,SA01" \
9391
-i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \
9492
-i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
95-
-i "pandas.arrays.DatetimeArray SA01" \
9693
-i "pandas.arrays.IntegerArray SA01" \
9794
-i "pandas.arrays.IntervalArray.left SA01" \
9895
-i "pandas.arrays.IntervalArray.length SA01" \
@@ -143,7 +140,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
143140
-i "pandas.errors.DuplicateLabelError SA01" \
144141
-i "pandas.errors.IntCastingNaNError SA01" \
145142
-i "pandas.errors.InvalidIndexError SA01" \
146-
-i "pandas.errors.InvalidVersion SA01" \
147143
-i "pandas.errors.NullFrequencyError SA01" \
148144
-i "pandas.errors.NumExprClobberingError SA01" \
149145
-i "pandas.errors.NumbaUtilError SA01" \
@@ -152,7 +148,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
152148
-i "pandas.errors.PerformanceWarning SA01" \
153149
-i "pandas.errors.PossibleDataLossError SA01" \
154150
-i "pandas.errors.PossiblePrecisionLoss SA01" \
155-
-i "pandas.errors.SpecificationError SA01" \
156151
-i "pandas.errors.UndefinedVariableError PR01,SA01" \
157152
-i "pandas.errors.UnsortedIndexError SA01" \
158153
-i "pandas.errors.UnsupportedFunctionCall SA01" \

doc/source/whatsnew/v3.0.0.rst

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,67 @@ In cases with mixed-resolution inputs, the highest resolution is used:
203203
In [2]: pd.to_datetime([pd.Timestamp("2024-03-22 11:43:01"), "2024-03-22 11:43:01.002"]).dtype
204204
Out[2]: dtype('<M8[ns]')
205205
206+
.. _whatsnew_300.api_breaking.value_counts_sorting:
207+
208+
Changed behavior in :meth:`DataFrame.value_counts` and :meth:`DataFrameGroupBy.value_counts` when ``sort=False``
209+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
210+
211+
In previous versions of pandas, :meth:`DataFrame.value_counts` with ``sort=False`` would sort the result by row labels (as was documented). This was nonintuitive and inconsistent with :meth:`Series.value_counts`, which maintains the order of the input. Now :meth:`DataFrame.value_counts` will maintain the order of the input.
212+
213+
.. ipython:: python
214+
215+
df = pd.DataFrame(
216+
{
217+
"a": [2, 2, 2, 2, 1, 1, 1, 1],
218+
"b": [2, 1, 3, 1, 2, 3, 1, 1],
219+
}
220+
)
221+
df
222+
223+
*Old behavior*
224+
225+
.. code-block:: ipython
226+
227+
In [3]: df.value_counts(sort=False)
228+
Out[3]:
229+
a b
230+
1 1 2
231+
2 1
232+
3 1
233+
2 1 2
234+
2 1
235+
3 1
236+
Name: count, dtype: int64
237+
238+
*New behavior*
239+
240+
.. ipython:: python
241+
242+
df.value_counts(sort=False)
243+
244+
This change also applies to :meth:`.DataFrameGroupBy.value_counts`. Here, there are two options for sorting: one ``sort`` passed to :meth:`DataFrame.groupby` and one passed directly to :meth:`.DataFrameGroupBy.value_counts`. The former will determine whether to sort the groups, the latter whether to sort the counts. All non-grouping columns will maintain the order of the input *within groups*.
245+
246+
*Old behavior*
247+
248+
.. code-block:: ipython
249+
250+
In [5]: df.groupby("a", sort=True).value_counts(sort=False)
251+
Out[5]:
252+
a b
253+
1 1 2
254+
2 1
255+
3 1
256+
2 1 2
257+
2 1
258+
3 1
259+
dtype: int64
260+
261+
*New behavior*
262+
263+
.. ipython:: python
264+
265+
df.groupby("a", sort=True).value_counts(sort=False)
266+
206267
.. _whatsnew_300.api_breaking.deps:
207268

208269
Increased minimum version for Python

pandas/_libs/lib.pyx

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1089,9 +1089,23 @@ def is_float(obj: object) -> bool:
10891089
"""
10901090
Return True if given object is float.
10911091

1092+
This method checks whether the passed object is a float type. It
1093+
returns `True` if the object is a float, and `False` otherwise.
1094+
1095+
Parameters
1096+
----------
1097+
obj : object
1098+
The object to check for float type.
1099+
10921100
Returns
10931101
-------
10941102
bool
1103+
`True` if the object is of float type, otherwise `False`.
1104+
1105+
See Also
1106+
--------
1107+
api.types.is_integer : Check if an object is of integer type.
1108+
api.types.is_numeric_dtype : Check if an array or dtype is of a numeric dtype.
10951109

10961110
Examples
10971111
--------

pandas/core/arrays/datetimes.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,14 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): # type: ignore[misc]
205205
-------
206206
None
207207
208+
See Also
209+
--------
210+
DatetimeIndex : Immutable Index for datetime-like data.
211+
Series : One-dimensional labeled array capable of holding datetime-like data.
212+
Timestamp : Pandas replacement for Python datetime.datetime object.
213+
to_datetime : Convert argument to datetime.
214+
period_range : Return a fixed frequency PeriodIndex.
215+
208216
Examples
209217
--------
210218
>>> pd.arrays.DatetimeArray._from_sequence(

pandas/core/arrays/sparse/accessor.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,17 @@ def from_coo(cls, A, dense_index: bool = False) -> Series:
8888
"""
8989
Create a Series with sparse values from a scipy.sparse.coo_matrix.
9090
91+
This method takes a ``scipy.sparse.coo_matrix`` (coordinate format) as input and
92+
returns a pandas ``Series`` where the non-zero elements are represented as
93+
sparse values. The index of the Series can either include only the coordinates
94+
of non-zero elements (default behavior) or the full sorted set of coordinates
95+
from the matrix if ``dense_index`` is set to `True`.
96+
9197
Parameters
9298
----------
9399
A : scipy.sparse.coo_matrix
100+
The sparse matrix in coordinate format from which the sparse Series
101+
will be created.
94102
dense_index : bool, default False
95103
If False (default), the index consists of only the
96104
coords of the non-null entries of the original coo_matrix.
@@ -102,6 +110,12 @@ def from_coo(cls, A, dense_index: bool = False) -> Series:
102110
s : Series
103111
A Series with sparse values.
104112
113+
See Also
114+
--------
115+
DataFrame.sparse.from_spmatrix : Create a new DataFrame from a scipy sparse
116+
matrix.
117+
scipy.sparse.coo_matrix : A sparse matrix in COOrdinate format.
118+
105119
Examples
106120
--------
107121
>>> from scipy import sparse

pandas/core/frame.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7266,7 +7266,11 @@ def value_counts(
72667266
normalize : bool, default False
72677267
Return proportions rather than frequencies.
72687268
sort : bool, default True
7269-
Sort by frequencies when True. Sort by DataFrame column values when False.
7269+
Sort by frequencies when True. Preserve the order of the data when False.
7270+
7271+
.. versionchanged:: 3.0.0
7272+
7273+
Prior to 3.0.0, ``sort=False`` would sort by the column values.
72707274
ascending : bool, default False
72717275
Sort in ascending order.
72727276
dropna : bool, default True
@@ -7372,7 +7376,9 @@ def value_counts(
73727376
subset = self.columns.tolist()
73737377

73747378
name = "proportion" if normalize else "count"
7375-
counts = self.groupby(subset, dropna=dropna, observed=False)._grouper.size()
7379+
counts = self.groupby(
7380+
subset, sort=False, dropna=dropna, observed=False
7381+
)._grouper.size()
73767382
counts.name = name
73777383

73787384
if sort:

pandas/core/groupby/generic.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2621,7 +2621,13 @@ def value_counts(
26212621
normalize : bool, default False
26222622
Return proportions rather than frequencies.
26232623
sort : bool, default True
2624-
Sort by frequencies.
2624+
Sort by frequencies when True. When False, non-grouping columns will appear
2625+
in the order they occur within groups.
2626+
2627+
.. versionchanged:: 3.0.0
2628+
2629+
In prior versions, ``sort=False`` would sort the non-grouping columns
2630+
by label.
26252631
ascending : bool, default False
26262632
Sort in ascending order.
26272633
dropna : bool, default True
@@ -2673,43 +2679,43 @@ def value_counts(
26732679
26742680
>>> df.groupby("gender").value_counts()
26752681
gender education country
2676-
female high FR 1
2677-
US 1
2682+
female high US 1
2683+
FR 1
26782684
male low FR 2
26792685
US 1
26802686
medium FR 1
26812687
Name: count, dtype: int64
26822688
26832689
>>> df.groupby("gender").value_counts(ascending=True)
26842690
gender education country
2685-
female high FR 1
2686-
US 1
2691+
female high US 1
2692+
FR 1
26872693
male low US 1
26882694
medium FR 1
26892695
low FR 2
26902696
Name: count, dtype: int64
26912697
26922698
>>> df.groupby("gender").value_counts(normalize=True)
26932699
gender education country
2694-
female high FR 0.50
2695-
US 0.50
2700+
female high US 0.50
2701+
FR 0.50
26962702
male low FR 0.50
26972703
US 0.25
26982704
medium FR 0.25
26992705
Name: proportion, dtype: float64
27002706
27012707
>>> df.groupby("gender", as_index=False).value_counts()
27022708
gender education country count
2703-
0 female high FR 1
2704-
1 female high US 1
2709+
0 female high US 1
2710+
1 female high FR 1
27052711
2 male low FR 2
27062712
3 male low US 1
27072713
4 male medium FR 1
27082714
27092715
>>> df.groupby("gender", as_index=False).value_counts(normalize=True)
27102716
gender education country proportion
2711-
0 female high FR 0.50
2712-
1 female high US 0.50
2717+
0 female high US 0.50
2718+
1 female high FR 0.50
27132719
2 male low FR 0.50
27142720
3 male low US 0.25
27152721
4 male medium FR 0.25

pandas/core/groupby/groupby.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2519,7 +2519,7 @@ def _value_counts(
25192519
grouper, _, _ = get_grouper(
25202520
df,
25212521
key=key,
2522-
sort=self.sort,
2522+
sort=False,
25232523
observed=False,
25242524
dropna=dropna,
25252525
)
@@ -2528,7 +2528,7 @@ def _value_counts(
25282528
# Take the size of the overall columns
25292529
gb = df.groupby(
25302530
groupings,
2531-
sort=self.sort,
2531+
sort=False,
25322532
observed=self.observed,
25332533
dropna=self.dropna,
25342534
)

pandas/core/groupby/ops.py

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -755,6 +755,7 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
755755
obs = [
756756
ping._observed or not ping._passed_categorical for ping in self.groupings
757757
]
758+
sorts = [ping._sort for ping in self.groupings]
758759
# When passed a categorical grouping, keep all categories
759760
for k, (ping, level) in enumerate(zip(self.groupings, levels)):
760761
if ping._passed_categorical:
@@ -765,7 +766,9 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
765766
result_index.name = self.names[0]
766767
ids = ensure_platform_int(self.codes[0])
767768
elif all(obs):
768-
result_index, ids = self._ob_index_and_ids(levels, self.codes, self.names)
769+
result_index, ids = self._ob_index_and_ids(
770+
levels, self.codes, self.names, sorts
771+
)
769772
elif not any(obs):
770773
result_index, ids = self._unob_index_and_ids(levels, self.codes, self.names)
771774
else:
@@ -778,6 +781,7 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
778781
levels=[levels[idx] for idx in ob_indices],
779782
codes=[codes[idx] for idx in ob_indices],
780783
names=[names[idx] for idx in ob_indices],
784+
sorts=[sorts[idx] for idx in ob_indices],
781785
)
782786
unob_index, unob_ids = self._unob_index_and_ids(
783787
levels=[levels[idx] for idx in unob_indices],
@@ -800,9 +804,18 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
800804
).reorder_levels(index)
801805
ids = len(unob_index) * ob_ids + unob_ids
802806

803-
if self._sort:
807+
if any(sorts):
804808
# Sort result_index and recode ids using the new order
805-
sorter = result_index.argsort()
809+
n_levels = len(sorts)
810+
drop_levels = [
811+
n_levels - idx
812+
for idx, sort in enumerate(reversed(sorts), 1)
813+
if not sort
814+
]
815+
if len(drop_levels) > 0:
816+
sorter = result_index._drop_level_numbers(drop_levels).argsort()
817+
else:
818+
sorter = result_index.argsort()
806819
result_index = result_index.take(sorter)
807820
_, index = np.unique(sorter, return_index=True)
808821
ids = ensure_platform_int(ids)
@@ -837,10 +850,13 @@ def _ob_index_and_ids(
837850
levels: list[Index],
838851
codes: list[npt.NDArray[np.intp]],
839852
names: list[Hashable],
853+
sorts: list[bool],
840854
) -> tuple[MultiIndex, npt.NDArray[np.intp]]:
855+
consistent_sorting = all(sorts[0] == sort for sort in sorts[1:])
856+
sort_in_compress = sorts[0] if consistent_sorting else False
841857
shape = tuple(len(level) for level in levels)
842858
group_index = get_group_index(codes, shape, sort=True, xnull=True)
843-
ob_ids, obs_group_ids = compress_group_index(group_index, sort=self._sort)
859+
ob_ids, obs_group_ids = compress_group_index(group_index, sort=sort_in_compress)
844860
ob_ids = ensure_platform_int(ob_ids)
845861
ob_index_codes = decons_obs_group_ids(
846862
ob_ids, obs_group_ids, shape, codes, xnull=True
@@ -851,6 +867,21 @@ def _ob_index_and_ids(
851867
names=names,
852868
verify_integrity=False,
853869
)
870+
if not consistent_sorting:
871+
# Sort by the levels where the corresponding sort argument is True
872+
n_levels = len(sorts)
873+
drop_levels = [
874+
n_levels - idx
875+
for idx, sort in enumerate(reversed(sorts), 1)
876+
if not sort
877+
]
878+
if len(drop_levels) > 0:
879+
sorter = ob_index._drop_level_numbers(drop_levels).argsort()
880+
else:
881+
sorter = ob_index.argsort()
882+
ob_index = ob_index.take(sorter)
883+
_, index = np.unique(sorter, return_index=True)
884+
ob_ids = np.where(ob_ids == -1, -1, index.take(ob_ids))
854885
ob_ids = ensure_platform_int(ob_ids)
855886
return ob_index, ob_ids
856887

pandas/errors/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -444,6 +444,11 @@ class SpecificationError(Exception):
444444
The second way is calling ``agg`` on a DataFrame with duplicated function
445445
names without assigning column name.
446446
447+
See Also
448+
--------
449+
DataFrame.agg : Aggregate using one or more operations over the specified axis.
450+
Series.agg : Aggregate using one or more operations over the specified axis.
451+
447452
Examples
448453
--------
449454
>>> df = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})

0 commit comments

Comments
 (0)