Skip to content

Commit 92d53da

Browse files
committed
Add nsorted method for dataframe and series
1 parent e476e18 commit 92d53da

File tree

4 files changed

+238
-51
lines changed

4 files changed

+238
-51
lines changed

pandas/_typing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -429,7 +429,7 @@ def closed(self) -> bool:
429429
SortKind = Literal["quicksort", "mergesort", "heapsort", "stable"]
430430
NaPosition = Literal["first", "last"]
431431

432-
# Arguments for nsmallest and nlargest
432+
# Arguments for nsorted, nsmallest and nlargest
433433
NsmallestNlargestKeep = Literal["first", "last", "all"]
434434

435435
# quantile interpolation

pandas/core/frame.py

Lines changed: 167 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7447,6 +7447,160 @@ def value_counts(
74477447

74487448
return counts
74497449

7450+
def nsorted(
7451+
self,
7452+
n: int,
7453+
columns: IndexLabel,
7454+
ascending: bool | Sequence[bool],
7455+
keep: NsmallestNlargestKeep = "first",
7456+
) -> DataFrame:
7457+
"""
7458+
Return the first `n` rows ordered by `columns` in the order defined by
7459+
`ascending`.
7460+
7461+
The columns that are not specified are returned as
7462+
well, but not used for ordering.
7463+
7464+
This method is equivalent to
7465+
``df.sort_values(columns, ascending=ascending).head(n)``, but more
7466+
performant.
7467+
7468+
Parameters
7469+
----------
7470+
n : int
7471+
Number of rows to return.
7472+
columns : label or list of labels
7473+
Column label(s) to order by.
7474+
ascending : bool or list of bools
7475+
Whether to sort in ascending or descending order.
7476+
If a list, must be the same length as `columns`.
7477+
keep : {'first', 'last', 'all'}, default 'first'
7478+
Where there are duplicate values:
7479+
7480+
- ``first`` : prioritize the first occurrence(s)
7481+
- ``last`` : prioritize the last occurrence(s)
7482+
- ``all`` : keep all the ties of the last selected item even if it
7483+
means selecting more than ``n`` items.
7484+
7485+
Returns
7486+
-------
7487+
DataFrame
7488+
The first `n` rows ordered by the given columns in the order given
7489+
in `ascending`.
7490+
7491+
See Also
7492+
--------
7493+
DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
7494+
descending order.
7495+
DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
7496+
ascending order.
7497+
DataFrame.sort_values : Sort DataFrame by the values.
7498+
DataFrame.head : Return the first `n` rows without re-ordering.
7499+
7500+
Notes
7501+
-----
7502+
This function cannot be used with all column types. For example, when
7503+
specifying columns with `object` or `category` dtypes, ``TypeError`` is
7504+
raised.
7505+
7506+
Examples
7507+
--------
7508+
>>> df = pd.DataFrame(
7509+
... {
7510+
... "population": [
7511+
... 59000000,
7512+
... 65000000,
7513+
... 434000,
7514+
... 434000,
7515+
... 434000,
7516+
... 337000,
7517+
... 11300,
7518+
... 11300,
7519+
... 11300,
7520+
... ],
7521+
... "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311],
7522+
... "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"],
7523+
... },
7524+
... index=[
7525+
... "Italy",
7526+
... "France",
7527+
... "Malta",
7528+
... "Maldives",
7529+
... "Brunei",
7530+
... "Iceland",
7531+
... "Nauru",
7532+
... "Tuvalu",
7533+
... "Anguilla",
7534+
... ],
7535+
... )
7536+
>>> df
7537+
population GDP alpha-2
7538+
Italy 59000000 1937894 IT
7539+
France 65000000 2583560 FR
7540+
Malta 434000 12011 MT
7541+
Maldives 434000 4520 MV
7542+
Brunei 434000 12128 BN
7543+
Iceland 337000 17036 IS
7544+
Nauru 11300 182 NR
7545+
Tuvalu 11300 38 TV
7546+
Anguilla 11300 311 AI
7547+
7548+
In the following example, we will use ``nsorted`` to select the three
7549+
rows having the largest values in column "population".
7550+
7551+
>>> df.nsorted(3, "population", ascending=False)
7552+
population GDP alpha-2
7553+
France 65000000 2583560 FR
7554+
Italy 59000000 1937894 IT
7555+
Malta 434000 12011 MT
7556+
7557+
When using ``keep='last'``, ties are resolved in reverse order:
7558+
7559+
>>> df.nsorted(3, "population", ascending=False, keep="last")
7560+
population GDP alpha-2
7561+
France 65000000 2583560 FR
7562+
Italy 59000000 1937894 IT
7563+
Brunei 434000 12128 BN
7564+
7565+
When using ``keep='all'``, the number of elements kept can go beyond ``n``
7566+
if there are duplicate values for the last selected element. All the
7567+
ties are kept:
7568+
7569+
>>> df.nsorted(3, "population", ascending=False, keep="all")
7570+
population GDP alpha-2
7571+
France 65000000 2583560 FR
7572+
Italy 59000000 1937894 IT
7573+
Malta 434000 12011 MT
7574+
Maldives 434000 4520 MV
7575+
Brunei 434000 12128 BN
7576+
7577+
However, ``nsorted`` does not keep ``n`` distinct largest elements:
7578+
7579+
>>> df.nsorted(5, "population", ascending=False, keep="all")
7580+
population GDP alpha-2
7581+
France 65000000 2583560 FR
7582+
Italy 59000000 1937894 IT
7583+
Malta 434000 12011 MT
7584+
Maldives 434000 4520 MV
7585+
Brunei 434000 12128 BN
7586+
7587+
To order by the largest values in column "population" and break ties
7588+
according to the smallest values in column "GDP", we can specify
7589+
multiple columns and ascending orders like in the next example.
7590+
7591+
>>> df.nsorted(3, ["population", "GDP"], ascending=[False, True])
7592+
population GDP alpha-2
7593+
France 65000000 2583560 FR
7594+
Italy 59000000 1937894 IT
7595+
Maldives 434000 4520 MV
7596+
"""
7597+
return selectn.SelectNFrame(
7598+
self,
7599+
n=n,
7600+
keep=keep,
7601+
columns=columns,
7602+
).nsorted(ascending=ascending)
7603+
74507604
def nlargest(
74517605
self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first"
74527606
) -> DataFrame:
@@ -7457,6 +7611,9 @@ def nlargest(
74577611
descending order. The columns that are not specified are returned as
74587612
well, but not used for ordering.
74597613
7614+
This method is shorthand for
7615+
``df.nsorted(n, columns, ascending=False, keep=keep)``.
7616+
74607617
This method is equivalent to
74617618
``df.sort_values(columns, ascending=False).head(n)``, but more
74627619
performant.
@@ -7485,6 +7642,8 @@ def nlargest(
74857642
--------
74867643
DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
74877644
ascending order.
7645+
DataFrame.nsorted : Return the first `n` rows ordered by `columns` in
7646+
the order given in `ascending`.
74887647
DataFrame.sort_values : Sort DataFrame by the values.
74897648
DataFrame.head : Return the first `n` rows without re-ordering.
74907649
@@ -7553,7 +7712,7 @@ def nlargest(
75537712
Italy 59000000 1937894 IT
75547713
Brunei 434000 12128 BN
75557714
7556-
When using ``keep='all'``, the number of element kept can go beyond ``n``
7715+
When using ``keep='all'``, the number of elements kept can go beyond ``n``
75577716
if there are duplicate values for the smallest element. All the
75587717
ties are kept:
75597718
@@ -7584,7 +7743,7 @@ def nlargest(
75847743
Italy 59000000 1937894 IT
75857744
Brunei 434000 12128 BN
75867745
"""
7587-
return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
7746+
return self.nsorted(n=n, columns=columns, ascending=False, keep=keep)
75887747

75897748
def nsmallest(
75907749
self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first"
@@ -7596,6 +7755,9 @@ def nsmallest(
75967755
ascending order. The columns that are not specified are returned as
75977756
well, but not used for ordering.
75987757
7758+
This method is shorthand for
7759+
``df.nsorted(n, columns, ascending=True, keep=keep)``.
7760+
75997761
This method is equivalent to
76007762
``df.sort_values(columns, ascending=True).head(n)``, but more
76017763
performant.
@@ -7623,6 +7785,8 @@ def nsmallest(
76237785
--------
76247786
DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
76257787
descending order.
7788+
DataFrame.nsorted : Return the first `n` rows ordered by `columns` in
7789+
the order given in `ascending`.
76267790
DataFrame.sort_values : Sort DataFrame by the values.
76277791
DataFrame.head : Return the first `n` rows without re-ordering.
76287792
@@ -7715,7 +7879,7 @@ def nsmallest(
77157879
Anguilla 11300 311 AI
77167880
Nauru 337000 182 NR
77177881
"""
7718-
return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest()
7882+
return self.nsorted(n=n, columns=columns, ascending=True, keep=keep)
77197883

77207884
def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame:
77217885
"""

0 commit comments

Comments
 (0)