Skip to content

Commit 1290366

Browse files
committed
API: value_counts to consistently maintain order of input
1 parent 80b6850 commit 1290366

File tree

5 files changed: 69 additions and 15 deletions.

pandas/core/frame.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7370,7 +7370,9 @@ def value_counts(
73707370
subset = self.columns.tolist()
73717371

73727372
name = "proportion" if normalize else "count"
7373-
counts = self.groupby(subset, dropna=dropna, observed=False)._grouper.size()
7373+
counts = self.groupby(
7374+
subset, sort=False, dropna=dropna, observed=False
7375+
)._grouper.size()
73747376
counts.name = name
73757377

73767378
if sort:

pandas/core/groupby/groupby.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2669,7 +2669,7 @@ def _value_counts(
26692669
grouper, _, _ = get_grouper(
26702670
df,
26712671
key=key,
2672-
sort=self.sort,
2672+
sort=False,
26732673
observed=False,
26742674
dropna=dropna,
26752675
)
@@ -2678,7 +2678,7 @@ def _value_counts(
26782678
# Take the size of the overall columns
26792679
gb = df.groupby(
26802680
groupings,
2681-
sort=self.sort,
2681+
sort=False,
26822682
observed=self.observed,
26832683
dropna=self.dropna,
26842684
)

pandas/core/groupby/ops.py

Lines changed: 37 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -747,12 +747,14 @@ def result_index(self) -> Index:
747747
def ids(self) -> npt.NDArray[np.intp]:
748748
return self.result_index_and_ids[1]
749749

750-
@cache_readonly
750+
# @cache_readonly
751+
@property
751752
def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
752753
levels = [Index._with_infer(ping.uniques) for ping in self.groupings]
753754
obs = [
754755
ping._observed or not ping._passed_categorical for ping in self.groupings
755756
]
757+
sorts = [ping._sort for ping in self.groupings]
756758
# When passed a categorical grouping, keep all categories
757759
for k, (ping, level) in enumerate(zip(self.groupings, levels)):
758760
if ping._passed_categorical:
@@ -763,7 +765,9 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
763765
result_index.name = self.names[0]
764766
ids = ensure_platform_int(self.codes[0])
765767
elif all(obs):
766-
result_index, ids = self._ob_index_and_ids(levels, self.codes, self.names)
768+
result_index, ids = self._ob_index_and_ids(
769+
levels, self.codes, self.names, sorts
770+
)
767771
elif not any(obs):
768772
result_index, ids = self._unob_index_and_ids(levels, self.codes, self.names)
769773
else:
@@ -776,6 +780,7 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
776780
levels=[levels[idx] for idx in ob_indices],
777781
codes=[codes[idx] for idx in ob_indices],
778782
names=[names[idx] for idx in ob_indices],
783+
sorts=[sorts[idx] for idx in ob_indices],
779784
)
780785
unob_index, unob_ids = self._unob_index_and_ids(
781786
levels=[levels[idx] for idx in unob_indices],
@@ -798,9 +803,18 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
798803
).reorder_levels(index)
799804
ids = len(unob_index) * ob_ids + unob_ids
800805

801-
if self._sort:
806+
if any(sorts):
802807
# Sort result_index and recode ids using the new order
803-
sorter = result_index.argsort()
808+
n_levels = len(sorts)
809+
drop_levels = [
810+
n_levels - idx
811+
for idx, sort in enumerate(reversed(sorts), 1)
812+
if not sort
813+
]
814+
if len(drop_levels) > 0:
815+
sorter = result_index._drop_level_numbers(drop_levels).argsort()
816+
else:
817+
sorter = result_index.argsort()
804818
result_index = result_index.take(sorter)
805819
_, index = np.unique(sorter, return_index=True)
806820
ids = ensure_platform_int(ids)
@@ -835,10 +849,13 @@ def _ob_index_and_ids(
835849
levels: list[Index],
836850
codes: list[npt.NDArray[np.intp]],
837851
names: list[Hashable],
852+
sorts: list[bool],
838853
) -> tuple[MultiIndex, npt.NDArray[np.intp]]:
854+
consistent_sorting = all(sorts[0] == sort for sort in sorts[1:])
855+
sort_in_compress = sorts[0] if consistent_sorting else False
839856
shape = tuple(len(level) for level in levels)
840857
group_index = get_group_index(codes, shape, sort=True, xnull=True)
841-
ob_ids, obs_group_ids = compress_group_index(group_index, sort=self._sort)
858+
ob_ids, obs_group_ids = compress_group_index(group_index, sort=sort_in_compress)
842859
ob_ids = ensure_platform_int(ob_ids)
843860
ob_index_codes = decons_obs_group_ids(
844861
ob_ids, obs_group_ids, shape, codes, xnull=True
@@ -849,6 +866,21 @@ def _ob_index_and_ids(
849866
names=names,
850867
verify_integrity=False,
851868
)
869+
if not consistent_sorting:
870+
# Sort by the levels where the corresponding sort argument is True
871+
n_levels = len(sorts)
872+
drop_levels = [
873+
n_levels - idx
874+
for idx, sort in enumerate(reversed(sorts), 1)
875+
if not sort
876+
]
877+
if len(drop_levels) > 0:
878+
sorter = ob_index._drop_level_numbers(drop_levels).argsort()
879+
else:
880+
sorter = ob_index.argsort()
881+
ob_index = ob_index.take(sorter)
882+
_, index = np.unique(sorter, return_index=True)
883+
ob_ids = np.where(ob_ids == -1, -1, index.take(ob_ids))
852884
ob_ids = ensure_platform_int(ob_ids)
853885
return ob_index, ob_ids
854886

pandas/tests/frame/methods/test_value_counts.py

Lines changed: 12 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -128,7 +128,17 @@ def test_data_frame_value_counts_dropna_true(nulls_fixture):
128128
expected = pd.Series(
129129
data=[1, 1],
130130
index=pd.MultiIndex.from_arrays(
131-
[("Beth", "John"), ("Louise", "Smith")], names=["first_name", "middle_name"]
131+
[
132+
(
133+
"John",
134+
"Beth",
135+
),
136+
(
137+
"Smith",
138+
"Louise",
139+
),
140+
],
141+
names=["first_name", "middle_name"],
132142
),
133143
name="count",
134144
)
@@ -156,7 +166,7 @@ def test_data_frame_value_counts_dropna_false(nulls_fixture):
156166
pd.Index(["Anne", "Beth", "John"]),
157167
pd.Index(["Louise", "Smith", np.nan]),
158168
],
159-
codes=[[0, 1, 2, 2], [2, 0, 1, 2]],
169+
codes=[[2, 0, 2, 1], [1, 2, 2, 0]],
160170
names=["first_name", "middle_name"],
161171
),
162172
name="count",

pandas/tests/groupby/methods/test_value_counts.py

Lines changed: 15 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -484,7 +484,7 @@ def test_data_frame_value_counts(
484484
[0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0],
485485
),
486486
(False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]),
487-
(True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]),
487+
(True, False, [0, 1, 5, 6, 7, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]),
488488
(True, True, [0, 1, 5], [0.5, 0.5, 1.0]),
489489
],
490490
)
@@ -526,7 +526,17 @@ def test_dropna_combinations(
526526
True,
527527
[1, 1],
528528
MultiIndex.from_arrays(
529-
[(1, 1), ("Beth", "John"), ("Louise", "Smith")],
529+
[
530+
(1, 1),
531+
(
532+
"John",
533+
"Beth",
534+
),
535+
(
536+
"Smith",
537+
"Louise",
538+
),
539+
],
530540
names=["key", "first_name", "middle_name"],
531541
),
532542
),
@@ -539,7 +549,7 @@ def test_dropna_combinations(
539549
Index(["Anne", "Beth", "John"]),
540550
Index(["Louise", "Smith", np.nan]),
541551
],
542-
codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]],
552+
codes=[[0, 0, 0, 0], [2, 0, 2, 1], [1, 2, 2, 0]],
543553
names=["key", "first_name", "middle_name"],
544554
),
545555
),
@@ -845,8 +855,8 @@ def test_categorical_single_grouper_observed_false(
845855
("US", "high", "male"),
846856
("US", "low", "male"),
847857
("US", "low", "female"),
848-
("US", "medium", "female"),
849858
("US", "medium", "male"),
859+
("US", "medium", "female"),
850860
],
851861
),
852862
(
@@ -1186,7 +1196,7 @@ def test_value_counts_sort(sort, vc_sort, normalize):
11861196
if sort and vc_sort:
11871197
taker = [0, 1, 2]
11881198
elif sort and not vc_sort:
1189-
taker = [0, 1, 2]
1199+
taker = [1, 0, 2]
11901200
elif not sort and vc_sort:
11911201
taker = [0, 2, 1]
11921202
else:

0 commit comments

Comments
 (0)