Skip to content

Commit b4f9202

Browse files
Merge remote-tracking branch 'upstream/main' into string-dtype-tests-fixtures-arrow_string_storage
2 parents d7184ee + 9c08431 commit b4f9202

File tree

18 files changed

+213
-63
lines changed

18 files changed

+213
-63
lines changed

ci/code_checks.sh

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -70,15 +70,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
7070
--format=actions \
7171
-i ES01 `# For now it is ok if docstrings are missing the extended summary` \
7272
-i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
73-
-i "pandas.MultiIndex.copy PR07,RT03,SA01" \
7473
-i "pandas.MultiIndex.get_level_values SA01" \
75-
-i "pandas.MultiIndex.get_loc PR07" \
7674
-i "pandas.MultiIndex.get_loc_level PR07" \
77-
-i "pandas.MultiIndex.levshape SA01" \
7875
-i "pandas.MultiIndex.names SA01" \
79-
-i "pandas.MultiIndex.remove_unused_levels RT03,SA01" \
8076
-i "pandas.MultiIndex.reorder_levels RT03,SA01" \
81-
-i "pandas.MultiIndex.set_levels RT03,SA01" \
8277
-i "pandas.MultiIndex.sortlevel PR07,SA01" \
8378
-i "pandas.MultiIndex.to_frame RT03" \
8479
-i "pandas.NA SA01" \

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ Other enhancements
3535
- :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`).
3636
- :meth:`Index.get_loc` now also accepts subclasses of ``tuple`` as keys (:issue:`57922`)
3737
- :meth:`Styler.set_tooltips` provides an alternative method of storing tooltips by using the title attribute of td elements. (:issue:`56981`)
38+
- Added missing parameter ``weights`` in :meth:`DataFrame.plot.kde` for the estimation of the PDF (:issue:`59337`)
3839
- Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`)
3940
- Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`)
4041
- Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`)

pandas/conftest.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1354,20 +1354,33 @@ def object_dtype(request):
13541354

13551355
@pytest.fixture(
13561356
params=[
1357-
"object",
1358-
"string[python]",
1359-
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
1360-
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
1361-
]
1357+
np.dtype("object"),
1358+
("python", pd.NA),
1359+
pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
1360+
pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
1361+
],
1362+
ids=[
1363+
"string=object",
1364+
"string=string[python]",
1365+
"string=string[pyarrow]",
1366+
"string=str[pyarrow]",
1367+
],
13621368
)
13631369
def any_string_dtype(request):
13641370
"""
13651371
Parametrized fixture for string dtypes.
13661372
* 'object'
1367-
* 'string[python]'
1368-
* 'string[pyarrow]'
1373+
* 'string[python]' (NA variant)
1374+
* 'string[pyarrow]' (NA variant)
1375+
* 'str' (NaN variant, with pyarrow)
13691376
"""
1370-
return request.param
1377+
if isinstance(request.param, np.dtype):
1378+
return request.param
1379+
else:
1380+
# need to instantiate the StringDtype here instead of in the params
1381+
# to avoid importing pyarrow during test collection
1382+
storage, na_value = request.param
1383+
return pd.StringDtype(storage, na_value)
13711384

13721385

13731386
@pytest.fixture(params=tm.DATETIME64_DTYPES)

pandas/core/arrays/string_.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def __init__(
129129
) -> None:
130130
# infer defaults
131131
if storage is None:
132-
if using_string_dtype():
132+
if using_string_dtype() and na_value is not libmissing.NA:
133133
storage = "pyarrow"
134134
else:
135135
storage = get_option("mode.string_storage")
@@ -167,7 +167,9 @@ def __eq__(self, other: object) -> bool:
167167
return True
168168
try:
169169
other = self.construct_from_string(other)
170-
except TypeError:
170+
except (TypeError, ImportError):
171+
# TypeError if `other` is not a valid string for StringDtype
172+
# ImportError if pyarrow is not installed for "string[pyarrow]"
171173
return False
172174
if isinstance(other, type(self)):
173175
return self.storage == other.storage and self.na_value is other.na_value

pandas/core/indexes/multi.py

Lines changed: 74 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -933,6 +933,19 @@ def set_levels(
933933
"""
934934
Set new levels on MultiIndex. Defaults to returning new index.
935935
936+
The `set_levels` method provides a flexible way to change the levels of a
937+
`MultiIndex`. This is particularly useful when you need to update the
938+
index structure of your DataFrame without altering the data. The method
939+
returns a new `MultiIndex` unless the operation is performed in-place,
940+
ensuring that the original index remains unchanged unless explicitly
941+
modified.
942+
943+
The method checks the integrity of the new levels against the existing
944+
codes by default, but this can be disabled if you are confident that
945+
your levels are consistent with the underlying data. This can be useful
946+
when you want to perform optimizations or make specific adjustments to
947+
the index levels that do not strictly adhere to the original structure.
948+
936949
Parameters
937950
----------
938951
levels : sequence or list of sequence
@@ -945,6 +958,14 @@ def set_levels(
945958
Returns
946959
-------
947960
MultiIndex
961+
A new `MultiIndex` with the updated levels.
962+
963+
See Also
964+
--------
965+
MultiIndex.set_codes : Set new codes on the existing `MultiIndex`.
966+
MultiIndex.remove_unused_levels : Create new MultiIndex from current that
967+
removes unused levels.
968+
Index.set_names : Set Index or MultiIndex name.
948969
949970
Examples
950971
--------
@@ -1051,7 +1072,19 @@ def nlevels(self) -> int:
10511072
@property
10521073
def levshape(self) -> Shape:
10531074
"""
1054-
A tuple with the length of each level.
1075+
A tuple representing the length of each level in the MultiIndex.
1076+
1077+
In a `MultiIndex`, each level can contain multiple unique values. The
1078+
`levshape` property provides a quick way to assess the size of each
1079+
level by returning a tuple where each entry represents the number of
1080+
unique values in that specific level. This is particularly useful in
1081+
scenarios where you need to understand the structure and distribution
1082+
of your index levels, such as when working with multidimensional data.
1083+
1084+
See Also
1085+
--------
1086+
MultiIndex.shape : Return a tuple of the shape of the MultiIndex.
1087+
MultiIndex.levels : Return the levels of the MultiIndex.
10551088
10561089
Examples
10571090
--------
@@ -1282,20 +1315,37 @@ def copy( # type: ignore[override]
12821315
name=None,
12831316
) -> Self:
12841317
"""
1285-
Make a copy of this object.
1318+
Make a copy of this object. Names, dtype, levels and codes can be passed and \
1319+
will be set on new copy.
12861320
1287-
Names, dtype, levels and codes can be passed and will be set on new copy.
1321+
The `copy` method provides a mechanism to create a duplicate of an
1322+
existing MultiIndex object. This is particularly useful in scenarios where
1323+
modifications are required on an index, but the original MultiIndex should
1324+
remain unchanged. By specifying the `deep` parameter, users can control
1325+
whether the copy should be a deep or shallow copy, providing flexibility
1326+
depending on the size and complexity of the MultiIndex.
12881327
12891328
Parameters
12901329
----------
12911330
names : sequence, optional
1331+
Names to set on the new MultiIndex object.
12921332
deep : bool, default False
1333+
If False, the new object will be a shallow copy. If True, a deep copy
1334+
will be attempted. Deep copying can be potentially expensive for large
1335+
MultiIndex objects.
12931336
name : Label
12941337
Kept for compatibility with 1-dimensional Index. Should not be used.
12951338
12961339
Returns
12971340
-------
12981341
MultiIndex
1342+
A new MultiIndex object with the specified modifications.
1343+
1344+
See Also
1345+
--------
1346+
MultiIndex.from_arrays : Convert arrays to MultiIndex.
1347+
MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
1348+
MultiIndex.from_frame : Convert DataFrame to MultiIndex.
12991349
13001350
Notes
13011351
-----
@@ -2041,9 +2091,22 @@ def remove_unused_levels(self) -> MultiIndex:
20412091
appearance, meaning the same .values and ordering. It will
20422092
also be .equals() to the original.
20432093
2094+
The `remove_unused_levels` method is useful in cases where you have a
2095+
MultiIndex with hierarchical levels, but some of these levels are no
2096+
longer needed due to filtering or subsetting operations. By removing
2097+
the unused levels, the resulting MultiIndex becomes more compact and
2098+
efficient, which can improve performance in subsequent operations.
2099+
20442100
Returns
20452101
-------
20462102
MultiIndex
2103+
A new MultiIndex with unused levels removed.
2104+
2105+
See Also
2106+
--------
2107+
MultiIndex.droplevel : Remove specified levels from a MultiIndex.
2108+
MultiIndex.reorder_levels : Rearrange levels of a MultiIndex.
2109+
MultiIndex.set_levels : Set new levels on a MultiIndex.
20472110
20482111
Examples
20492112
--------
@@ -2968,14 +3031,19 @@ def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int:
29683031

29693032
def get_loc(self, key):
29703033
"""
2971-
Get location for a label or a tuple of labels.
3034+
Get location for a label or a tuple of labels. The location is returned \
3035+
as an integer/slice or boolean mask.
29723036
2973-
The location is returned as an integer/slice or boolean
2974-
mask.
3037+
This method returns the integer location, slice object, or boolean mask
3038+
corresponding to the specified key, which can be a single label or a tuple
3039+
of labels. The key represents a position in the MultiIndex, and the location
3040+
indicates where the key is found within the index.
29753041
29763042
Parameters
29773043
----------
29783044
key : label or tuple of labels (one for each level)
3045+
A label or tuple of labels that correspond to the levels of the MultiIndex.
3046+
The key must match the structure of the MultiIndex.
29793047
29803048
Returns
29813049
-------

pandas/plotting/_core.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1450,6 +1450,7 @@ def kde(
14501450
self,
14511451
bw_method: Literal["scott", "silverman"] | float | Callable | None = None,
14521452
ind: np.ndarray | int | None = None,
1453+
weights: np.ndarray | None = None,
14531454
**kwargs,
14541455
) -> PlotAccessor:
14551456
"""
@@ -1475,6 +1476,9 @@ def kde(
14751476
1000 equally spaced points are used. If `ind` is a NumPy array, the
14761477
KDE is evaluated at the points passed. If `ind` is an integer,
14771478
`ind` number of equally spaced points are used.
1479+
weights : NumPy array, optional
1480+
Weights of datapoints. This must be the same shape as datapoints.
1481+
If None (default), the samples are assumed to be equally weighted.
14781482
**kwargs
14791483
Additional keyword arguments are documented in
14801484
:meth:`DataFrame.plot`.
@@ -1560,7 +1564,7 @@ def kde(
15601564
15611565
>>> ax = df.plot.kde(ind=[1, 2, 3, 4, 5, 6])
15621566
"""
1563-
return self(kind="kde", bw_method=bw_method, ind=ind, **kwargs)
1567+
return self(kind="kde", bw_method=bw_method, ind=ind, weights=weights, **kwargs)
15641568

15651569
density = kde
15661570

pandas/plotting/_matplotlib/hist.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,7 @@ def _plot( # type: ignore[override]
269269
y: np.ndarray,
270270
style=None,
271271
bw_method=None,
272+
weights=None,
272273
ind=None,
273274
column_num=None,
274275
stacking_id: int | None = None,
@@ -277,7 +278,7 @@ def _plot( # type: ignore[override]
277278
from scipy.stats import gaussian_kde
278279

279280
y = remove_na_arraylike(y)
280-
gkde = gaussian_kde(y, bw_method=bw_method)
281+
gkde = gaussian_kde(y, bw_method=bw_method, weights=weights)
281282

282283
y = gkde.evaluate(ind)
283284
lines = MPLPlot._plot(ax, ind, y, style=style, **kwds)

pandas/tests/arrays/categorical/test_constructors.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -735,7 +735,6 @@ def test_interval(self):
735735
tm.assert_numpy_array_equal(cat.codes, expected_codes)
736736
tm.assert_index_equal(cat.categories, idx)
737737

738-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
739738
def test_categorical_extension_array_nullable(self, nulls_fixture):
740739
# GH:
741740
arr = pd.arrays.StringArray._from_sequence(

pandas/tests/copy_view/test_array.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
from pandas._config import using_string_dtype
5-
64
from pandas import (
75
DataFrame,
86
Series,
@@ -119,7 +117,6 @@ def test_dataframe_array_ea_dtypes():
119117
assert arr.flags.writeable is False
120118

121119

122-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
123120
def test_dataframe_array_string_dtype():
124121
df = DataFrame({"a": ["a", "b"]}, dtype="string")
125122
arr = np.asarray(df)

pandas/tests/copy_view/test_astype.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@ def test_astype_numpy_to_ea():
8484
assert np.shares_memory(get_array(ser), get_array(result))
8585

8686

87-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
8887
@pytest.mark.parametrize(
8988
"dtype, new_dtype", [("object", "string"), ("string", "object")]
9089
)
@@ -98,7 +97,6 @@ def test_astype_string_and_object(dtype, new_dtype):
9897
tm.assert_frame_equal(df, df_orig)
9998

10099

101-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
102100
@pytest.mark.parametrize(
103101
"dtype, new_dtype", [("object", "string"), ("string", "object")]
104102
)

0 commit comments

Comments
 (0)