Skip to content

Commit 31f1c33

Browse files
Merge remote-tracking branch 'upstream/main' into string-dtype-object
2 parents bae8d65 + 4b4c86e commit 31f1c33

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+420
-124
lines changed

ci/code_checks.sh

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,16 +70,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
7070
--format=actions \
7171
-i ES01 `# For now it is ok if docstrings are missing the extended summary` \
7272
-i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
73-
-i "pandas.MultiIndex.append PR07,SA01" \
74-
-i "pandas.MultiIndex.copy PR07,RT03,SA01" \
7573
-i "pandas.MultiIndex.get_level_values SA01" \
76-
-i "pandas.MultiIndex.get_loc PR07" \
7774
-i "pandas.MultiIndex.get_loc_level PR07" \
78-
-i "pandas.MultiIndex.levshape SA01" \
7975
-i "pandas.MultiIndex.names SA01" \
80-
-i "pandas.MultiIndex.remove_unused_levels RT03,SA01" \
8176
-i "pandas.MultiIndex.reorder_levels RT03,SA01" \
82-
-i "pandas.MultiIndex.set_levels RT03,SA01" \
8377
-i "pandas.MultiIndex.sortlevel PR07,SA01" \
8478
-i "pandas.MultiIndex.to_frame RT03" \
8579
-i "pandas.NA SA01" \

doc/source/user_guide/style.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,7 @@
351351
"\n",
352352
"- Using [.set_table_styles()][table] to control broader areas of the table with specified internal CSS. Although table styles allow the flexibility to add CSS selectors and properties controlling all individual parts of the table, they are unwieldy for individual cell specifications. Also, note that table styles cannot be exported to Excel. \n",
353353
"- Using [.set_td_classes()][td_class] to directly link either external CSS classes to your data cells or link the internal CSS classes created by [.set_table_styles()][table]. See [here](#Setting-Classes-and-Linking-to-External-CSS). These cannot be used on column header rows or indexes, and also won't export to Excel. \n",
354-
"- Using the [.apply()][apply] and [.map()][map] functions to add direct internal CSS to specific data cells. See [here](#Styler-Functions). As of v1.4.0 there are also methods that work directly on column header rows or indexes; [.apply_index()][applyindex] and [.map_index()][mapindex]. Note that only these methods add styles that will export to Excel. These methods work in a similar way to [DataFrame.apply()][dfapply] and [DataFrame.map()][dfmap].\n",
354+
"- Using the [.apply()][apply] and [.map()][map] functions to add direct internal CSS to specific data cells. See [here](#Styler-Functions). As of v1.4.0 there are also methods that work directly on column header rows or indexes: [.apply_index()][applyindex] and [.map_index()][mapindex]. Note that only these methods add styles that will export to Excel. These methods work in a similar way to [DataFrame.apply()][dfapply] and [DataFrame.map()][dfmap].\n",
355355
"\n",
356356
"[table]: ../reference/api/pandas.io.formats.style.Styler.set_table_styles.rst\n",
357357
"[styler]: ../reference/api/pandas.io.formats.style.Styler.rst\n",

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ Other enhancements
3535
- :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`).
3636
- :meth:`Index.get_loc` now also accepts subclasses of ``tuple`` as keys (:issue:`57922`)
3737
- :meth:`Styler.set_tooltips` provides an alternative method of storing tooltips by using the title attribute of td elements. (:issue:`56981`)
38+
- Added missing parameter ``weights`` in :meth:`DataFrame.plot.kde` for the estimation of the PDF (:issue:`59337`)
3839
- Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`)
3940
- Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`)
4041
- Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`)

pandas/_testing/asserters.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -581,6 +581,8 @@ def raise_assert_detail(
581581
elif isinstance(left, (CategoricalDtype, NumpyEADtype)):
582582
left = repr(left)
583583
elif isinstance(left, StringDtype):
584+
# TODO(infer_string) this special case could be avoided if we have
585+
# a more informative repr https://github.com/pandas-dev/pandas/issues/59342
584586
left = f"StringDtype(storage={left.storage}, na_value={left.na_value})"
585587

586588
if isinstance(right, np.ndarray):

pandas/conftest.py

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1355,21 +1355,36 @@ def object_dtype(request):
13551355

13561356
@pytest.fixture(
13571357
params=[
1358-
"object",
1359-
"string[python]",
1360-
"string[python_numpy]",
1361-
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
1362-
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
1363-
]
1358+
np.dtype("object"),
1359+
("python", pd.NA),
1360+
pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
1361+
pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
1362+
("python", np.nan),
1363+
],
1364+
ids=[
1365+
"string=object",
1366+
"string=string[python]",
1367+
"string=string[pyarrow]",
1368+
"string=str[pyarrow]",
1369+
"string=str[python]",
1370+
],
13641371
)
13651372
def any_string_dtype(request):
13661373
"""
13671374
Parametrized fixture for string dtypes.
13681375
* 'object'
1369-
* 'string[python]'
1370-
* 'string[pyarrow]'
1376+
* 'string[python]' (NA variant)
1377+
* 'string[pyarrow]' (NA variant)
1378+
* 'str' (NaN variant, with pyarrow)
1379+
* 'str' (NaN variant, without pyarrow)
13711380
"""
1372-
return request.param
1381+
if isinstance(request.param, np.dtype):
1382+
return request.param
1383+
else:
1384+
# need to instantiate the StringDtype here instead of in the params
1385+
# to avoid importing pyarrow during test collection
1386+
storage, na_value = request.param
1387+
return pd.StringDtype(storage, na_value)
13731388

13741389

13751390
@pytest.fixture(params=tm.DATETIME64_DTYPES)
@@ -2028,14 +2043,6 @@ def warsaw(request) -> str:
20282043
return request.param
20292044

20302045

2031-
@pytest.fixture
2032-
def arrow_string_storage():
2033-
"""
2034-
Fixture that lists possible PyArrow values for StringDtype storage field.
2035-
"""
2036-
return ("pyarrow", "pyarrow_numpy")
2037-
2038-
20392046
@pytest.fixture
20402047
def temp_file(tmp_path):
20412048
"""

pandas/core/arrays/string_.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ def __init__(
134134
) -> None:
135135
# infer defaults
136136
if storage is None:
137-
if using_string_dtype():
137+
if using_string_dtype() and na_value is not libmissing.NA:
138138
if HAS_PYARROW:
139139
storage = "pyarrow"
140140
else:
@@ -179,7 +179,9 @@ def __eq__(self, other: object) -> bool:
179179
return True
180180
try:
181181
other = self.construct_from_string(other)
182-
except TypeError:
182+
except (TypeError, ImportError):
183+
# TypeError if `other` is not a valid string for StringDtype
184+
# ImportError if pyarrow is not installed for "string[pyarrow]"
183185
return False
184186
if isinstance(other, type(self)):
185187
return self.storage == other.storage and self.na_value is other.na_value

pandas/core/base.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1049,6 +1049,34 @@ def value_counts(
10491049
4.0 1
10501050
NaN 1
10511051
Name: count, dtype: int64
1052+
1053+
**Categorical Dtypes**
1054+
1055+
Rows with categorical type will be counted as one group
1056+
if they have the same categories and order.
1057+
In the example below, even though ``a``, ``c``, and ``d``
1058+
all have the same data type, ``category``,
1059+
only ``c`` and ``d`` will be counted as one group
1060+
since ``a`` doesn't have the same categories.
1061+
1062+
>>> df = pd.DataFrame({"a": [1], "b": ["2"], "c": [3], "d": [3]})
1063+
>>> df = df.astype({"a": "category", "c": "category", "d": "category"})
1064+
>>> df
1065+
a b c d
1066+
0 1 2 3 3
1067+
1068+
>>> df.dtypes
1069+
a category
1070+
b object
1071+
c category
1072+
d category
1073+
dtype: object
1074+
1075+
>>> df.dtypes.value_counts()
1076+
category 2
1077+
category 1
1078+
object 1
1079+
Name: count, dtype: int64
10521080
"""
10531081
return algorithms.value_counts_internal(
10541082
self,

pandas/core/indexes/multi.py

Lines changed: 87 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -933,6 +933,19 @@ def set_levels(
933933
"""
934934
Set new levels on MultiIndex. Defaults to returning new index.
935935
936+
The `set_levels` method provides a flexible way to change the levels of a
937+
`MultiIndex`. This is particularly useful when you need to update the
938+
index structure of your DataFrame without altering the data. The method
939+
returns a new `MultiIndex` unless the operation is performed in-place,
940+
ensuring that the original index remains unchanged unless explicitly
941+
modified.
942+
943+
The method checks the integrity of the new levels against the existing
944+
codes by default, but this can be disabled if you are confident that
945+
your levels are consistent with the underlying data. This can be useful
946+
when you want to perform optimizations or make specific adjustments to
947+
the index levels that do not strictly adhere to the original structure.
948+
936949
Parameters
937950
----------
938951
levels : sequence or list of sequence
@@ -945,6 +958,14 @@ def set_levels(
945958
Returns
946959
-------
947960
MultiIndex
961+
A new `MultiIndex` with the updated levels.
962+
963+
See Also
964+
--------
965+
MultiIndex.set_codes : Set new codes on the existing `MultiIndex`.
966+
MultiIndex.remove_unused_levels : Create new MultiIndex from current that
967+
removes unused levels.
968+
Index.set_names : Set Index or MultiIndex name.
948969
949970
Examples
950971
--------
@@ -1051,7 +1072,19 @@ def nlevels(self) -> int:
10511072
@property
10521073
def levshape(self) -> Shape:
10531074
"""
1054-
A tuple with the length of each level.
1075+
A tuple representing the length of each level in the MultiIndex.
1076+
1077+
In a `MultiIndex`, each level can contain multiple unique values. The
1078+
`levshape` property provides a quick way to assess the size of each
1079+
level by returning a tuple where each entry represents the number of
1080+
unique values in that specific level. This is particularly useful in
1081+
scenarios where you need to understand the structure and distribution
1082+
of your index levels, such as when working with multidimensional data.
1083+
1084+
See Also
1085+
--------
1086+
MultiIndex.shape : Return a tuple of the shape of the MultiIndex.
1087+
MultiIndex.levels : Returns the levels of the MultiIndex.
10551088
10561089
Examples
10571090
--------
@@ -1282,20 +1315,37 @@ def copy( # type: ignore[override]
12821315
name=None,
12831316
) -> Self:
12841317
"""
1285-
Make a copy of this object.
1318+
Make a copy of this object. Names, dtype, levels and codes can be passed and \
1319+
will be set on new copy.
12861320
1287-
Names, dtype, levels and codes can be passed and will be set on new copy.
1321+
The `copy` method provides a mechanism to create a duplicate of an
1322+
existing MultiIndex object. This is particularly useful in scenarios where
1323+
modifications are required on an index, but the original MultiIndex should
1324+
remain unchanged. By specifying the `deep` parameter, users can control
1325+
whether the copy should be a deep or shallow copy, providing flexibility
1326+
depending on the size and complexity of the MultiIndex.
12881327
12891328
Parameters
12901329
----------
12911330
names : sequence, optional
1331+
Names to set on the new MultiIndex object.
12921332
deep : bool, default False
1333+
If False, the new object will be a shallow copy. If True, a deep copy
1334+
will be attempted. Deep copying can be potentially expensive for large
1335+
MultiIndex objects.
12931336
name : Label
12941337
Kept for compatibility with 1-dimensional Index. Should not be used.
12951338
12961339
Returns
12971340
-------
12981341
MultiIndex
1342+
A new MultiIndex object with the specified modifications.
1343+
1344+
See Also
1345+
--------
1346+
MultiIndex.from_arrays : Convert arrays to MultiIndex.
1347+
MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
1348+
MultiIndex.from_frame : Convert DataFrame to MultiIndex.
12991349
13001350
Notes
13011351
-----
@@ -2041,9 +2091,22 @@ def remove_unused_levels(self) -> MultiIndex:
20412091
appearance, meaning the same .values and ordering. It will
20422092
also be .equals() to the original.
20432093
2094+
The `remove_unused_levels` method is useful in cases where you have a
2095+
MultiIndex with hierarchical levels, but some of these levels are no
2096+
longer needed due to filtering or subsetting operations. By removing
2097+
the unused levels, the resulting MultiIndex becomes more compact and
2098+
efficient, which can improve performance in subsequent operations.
2099+
20442100
Returns
20452101
-------
20462102
MultiIndex
2103+
A new MultiIndex with unused levels removed.
2104+
2105+
See Also
2106+
--------
2107+
MultiIndex.droplevel : Remove specified levels from a MultiIndex.
2108+
MultiIndex.reorder_levels : Rearrange levels of a MultiIndex.
2109+
MultiIndex.set_levels : Set new levels on a MultiIndex.
20472110
20482111
Examples
20492112
--------
@@ -2223,15 +2286,28 @@ def append(self, other):
22232286
"""
22242287
Append a collection of Index objects together.
22252288
2289+
The `append` method is used to combine multiple `Index` objects into a single
2290+
`Index`. This is particularly useful when dealing with multi-level indexing
2291+
(MultiIndex) where you might need to concatenate different levels of indices.
2292+
The method handles the alignment of the levels and codes of the indices being
2293+
appended to ensure consistency in the resulting `MultiIndex`.
2294+
22262295
Parameters
22272296
----------
22282297
other : Index or list/tuple of indices
2298+
Index or list/tuple of Index objects to be appended.
22292299
22302300
Returns
22312301
-------
22322302
Index
22332303
The combined index.
22342304
2305+
See Also
2306+
--------
2307+
MultiIndex : A multi-level, or hierarchical, index object for pandas objects.
2308+
Index.append : Append a collection of Index objects together.
2309+
concat : Concatenate pandas objects along a particular axis.
2310+
22352311
Examples
22362312
--------
22372313
>>> mi = pd.MultiIndex.from_arrays([["a"], ["b"]])
@@ -2955,14 +3031,19 @@ def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int:
29553031

29563032
def get_loc(self, key):
29573033
"""
2958-
Get location for a label or a tuple of labels.
3034+
Get location for a label or a tuple of labels. The location is returned \
3035+
as an integer/slice or boolean mask.
29593036
2960-
The location is returned as an integer/slice or boolean
2961-
mask.
3037+
This method returns the integer location, slice object, or boolean mask
3038+
corresponding to the specified key, which can be a single label or a tuple
3039+
of labels. The key represents a position in the MultiIndex, and the location
3040+
indicates where the key is found within the index.
29623041
29633042
Parameters
29643043
----------
29653044
key : label or tuple of labels (one for each level)
3045+
A label or tuple of labels that correspond to the levels of the MultiIndex.
3046+
The key must match the structure of the MultiIndex.
29663047
29673048
Returns
29683049
-------

pandas/core/series.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1815,14 +1815,30 @@ def _set_name(
18151815
Parrot 30.0
18161816
Parrot 20.0
18171817
Name: Max Speed, dtype: float64
1818+
1819+
We can pass a list of values to group the Series data by custom labels:
1820+
18181821
>>> ser.groupby(["a", "b", "a", "b"]).mean()
18191822
a 210.0
18201823
b 185.0
18211824
Name: Max Speed, dtype: float64
1825+
1826+
Grouping by numeric labels yields similar results:
1827+
1828+
>>> ser.groupby([0, 1, 0, 1]).mean()
1829+
0 210.0
1830+
1 185.0
1831+
Name: Max Speed, dtype: float64
1832+
1833+
We can group by a level of the index:
1834+
18221835
>>> ser.groupby(level=0).mean()
18231836
Falcon 370.0
18241837
Parrot 25.0
18251838
Name: Max Speed, dtype: float64
1839+
1840+
We can group by a condition applied to the Series values:
1841+
18261842
>>> ser.groupby(ser > 100).mean()
18271843
Max Speed
18281844
False 25.0
@@ -1845,11 +1861,16 @@ def _set_name(
18451861
Parrot Captive 30.0
18461862
Wild 20.0
18471863
Name: Max Speed, dtype: float64
1864+
18481865
>>> ser.groupby(level=0).mean()
18491866
Animal
18501867
Falcon 370.0
18511868
Parrot 25.0
18521869
Name: Max Speed, dtype: float64
1870+
1871+
We can also group by the 'Type' level of the hierarchical index
1872+
to get the mean speed for each type:
1873+
18531874
>>> ser.groupby(level="Type").mean()
18541875
Type
18551876
Captive 210.0
@@ -1865,12 +1886,17 @@ def _set_name(
18651886
b 3
18661887
dtype: int64
18671888
1889+
To include `NA` values in the group keys, set `dropna=False`:
1890+
18681891
>>> ser.groupby(level=0, dropna=False).sum()
18691892
a 3
18701893
b 3
18711894
NaN 3
18721895
dtype: int64
18731896
1897+
We can also group by a custom list with NaN values to handle
1898+
missing group labels:
1899+
18741900
>>> arrays = ['Falcon', 'Falcon', 'Parrot', 'Parrot']
18751901
>>> ser = pd.Series([390., 350., 30., 20.], index=arrays, name="Max Speed")
18761902
>>> ser.groupby(["a", "b", "a", np.nan]).mean()

0 commit comments

Comments
 (0)