Merge remote-tracking branch 'upstream/main' into string-dtype-tests-fixtures-arrow_string_storage

jorisvandenbossche · jorisvandenbossche · commit b4f92026b3c4 · 2024-07-31T19:39:30.000+02:00
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -70,15 +70,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         --format=actions \
         -i ES01 `# For now it is ok if docstrings are missing the extended summary` \
         -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
-        -i "pandas.MultiIndex.copy PR07,RT03,SA01" \
         -i "pandas.MultiIndex.get_level_values SA01" \
-        -i "pandas.MultiIndex.get_loc PR07" \
         -i "pandas.MultiIndex.get_loc_level PR07" \
-        -i "pandas.MultiIndex.levshape SA01" \
         -i "pandas.MultiIndex.names SA01" \
-        -i "pandas.MultiIndex.remove_unused_levels RT03,SA01" \
         -i "pandas.MultiIndex.reorder_levels RT03,SA01" \
-        -i "pandas.MultiIndex.set_levels RT03,SA01" \
         -i "pandas.MultiIndex.sortlevel PR07,SA01" \
         -i "pandas.MultiIndex.to_frame RT03" \
         -i "pandas.NA SA01" \
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -35,6 +35,7 @@ Other enhancements
 - :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`).
 - :meth:`Index.get_loc` now also accepts subclasses of ``tuple`` as keys (:issue:`57922`)
 - :meth:`Styler.set_tooltips` provides an alternative method of storing tooltips by using the title attribute of td elements. (:issue:`56981`)
+- Added missing parameter ``weights`` in :meth:`DataFrame.plot.kde` for the estimation of the PDF (:issue:`59337`)
 - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`)
 - Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`)
 - Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`)
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -1354,20 +1354,33 @@ def object_dtype(request):
 
 @pytest.fixture(
     params=[
-        "object",
-        "string[python]",
-        pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
-        pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
-    ]
+        np.dtype("object"),
+        ("python", pd.NA),
+        pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
+        pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
+    ],
+    ids=[
+        "string=object",
+        "string=string[python]",
+        "string=string[pyarrow]",
+        "string=str[pyarrow]",
+    ],
 )
 def any_string_dtype(request):
     """
     Parametrized fixture for string dtypes.
     * 'object'
-    * 'string[python]'
-    * 'string[pyarrow]'
+    * 'string[python]' (NA variant)
+    * 'string[pyarrow]' (NA variant)
+    * 'str' (NaN variant, with pyarrow)
     """
-    return request.param
+    if isinstance(request.param, np.dtype):
+        return request.param
+    else:
+        # need to instantiate the StringDtype here instead of in the params
+        # to avoid importing pyarrow during test collection
+        storage, na_value = request.param
+        return pd.StringDtype(storage, na_value)
 
 
 @pytest.fixture(params=tm.DATETIME64_DTYPES)
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -129,7 +129,7 @@ def __init__(
     ) -> None:
         # infer defaults
         if storage is None:
-            if using_string_dtype():
+            if using_string_dtype() and na_value is not libmissing.NA:
                 storage = "pyarrow"
             else:
                 storage = get_option("mode.string_storage")
@@ -167,7 +167,9 @@ def __eq__(self, other: object) -> bool:
                 return True
             try:
                 other = self.construct_from_string(other)
-            except TypeError:
+            except (TypeError, ImportError):
+                # TypeError if `other` is not a valid string for StringDtype
+                # ImportError if pyarrow is not installed for "string[pyarrow]"
                 return False
         if isinstance(other, type(self)):
             return self.storage == other.storage and self.na_value is other.na_value
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -933,6 +933,19 @@ def set_levels(
         """
         Set new levels on MultiIndex. Defaults to returning new index.
 
+        The `set_levels` method provides a flexible way to change the levels of a
+        `MultiIndex`. This is particularly useful when you need to update the
+        index structure of your DataFrame without altering the data. The method
+        returns a new `MultiIndex` unless the operation is performed in-place,
+        ensuring that the original index remains unchanged unless explicitly
+        modified.
+
+        The method checks the integrity of the new levels against the existing
+        codes by default, but this can be disabled if you are confident that
+        your levels are consistent with the underlying data. This can be useful
+        when you want to perform optimizations or make specific adjustments to
+        the index levels that do not strictly adhere to the original structure.
+
         Parameters
         ----------
         levels : sequence or list of sequence
@@ -945,6 +958,14 @@ def set_levels(
         Returns
         -------
         MultiIndex
+            A new `MultiIndex` with the updated levels.
+
+        See Also
+        --------
+        MultiIndex.set_codes : Set new codes on the existing `MultiIndex`.
+        MultiIndex.remove_unused_levels : Create new MultiIndex from current that
+            removes unused levels.
+        Index.set_names : Set Index or MultiIndex name.
 
         Examples
         --------
@@ -1051,7 +1072,19 @@ def nlevels(self) -> int:
     @property
     def levshape(self) -> Shape:
         """
-        A tuple with the length of each level.
+        A tuple representing the length of each level in the MultiIndex.
+
+        In a `MultiIndex`, each level can contain multiple unique values. The
+        `levshape` property provides a quick way to assess the size of each
+        level by returning a tuple where each entry represents the number of
+        unique values in that specific level. This is particularly useful in
+        scenarios where you need to understand the structure and distribution
+        of your index levels, such as when working with multidimensional data.
+
+        See Also
+        --------
+        MultiIndex.shape : Return a tuple of the shape of the MultiIndex.
+        MultiIndex.levels : Return the levels of the MultiIndex.
 
         Examples
         --------
@@ -1282,20 +1315,37 @@ def copy(  # type: ignore[override]
         name=None,
     ) -> Self:
         """
-        Make a copy of this object.
+        Make a copy of this object. Names, dtype, levels and codes can be passed and \
+        will be set on new copy.
 
-        Names, dtype, levels and codes can be passed and will be set on new copy.
+        The `copy` method provides a mechanism to create a duplicate of an
+        existing MultiIndex object. This is particularly useful in scenarios where
+        modifications are required on an index, but the original MultiIndex should
+        remain unchanged. By specifying the `deep` parameter, users can control
+        whether the copy should be a deep or shallow copy, providing flexibility
+        depending on the size and complexity of the MultiIndex.
 
         Parameters
         ----------
         names : sequence, optional
+            Names to set on the new MultiIndex object.
         deep : bool, default False
+            If False, the new object will be a shallow copy. If True, a deep copy
+            will be attempted. Deep copying can be potentially expensive for large
+            MultiIndex objects.
         name : Label
             Kept for compatibility with 1-dimensional Index. Should not be used.
 
         Returns
         -------
         MultiIndex
+            A new MultiIndex object with the specified modifications.
+
+        See Also
+        --------
+        MultiIndex.from_arrays : Convert arrays to MultiIndex.
+        MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
+        MultiIndex.from_frame : Convert DataFrame to MultiIndex.
 
         Notes
         -----
@@ -2041,9 +2091,22 @@ def remove_unused_levels(self) -> MultiIndex:
         appearance, meaning the same .values and ordering. It will
         also be .equals() to the original.
 
+        The `remove_unused_levels` method is useful in cases where you have a
+        MultiIndex with hierarchical levels, but some of these levels are no
+        longer needed due to filtering or subsetting operations. By removing
+        the unused levels, the resulting MultiIndex becomes more compact and
+        efficient, which can improve performance in subsequent operations.
+
         Returns
         -------
         MultiIndex
+            A new MultiIndex with unused levels removed.
+
+        See Also
+        --------
+        MultiIndex.droplevel : Remove specified levels from a MultiIndex.
+        MultiIndex.reorder_levels : Rearrange levels of a MultiIndex.
+        MultiIndex.set_levels : Set new levels on a MultiIndex.
 
         Examples
         --------
@@ -2968,14 +3031,19 @@ def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int:
 
     def get_loc(self, key):
         """
-        Get location for a label or a tuple of labels.
+        Get location for a label or a tuple of labels. The location is returned \
+        as an integer/slice or boolean mask.
 
-        The location is returned as an integer/slice or boolean
-        mask.
+        This method returns the integer location, slice object, or boolean mask
+        corresponding to the specified key, which can be a single label or a tuple
+        of labels. The key represents a position in the MultiIndex, and the location
+        indicates where the key is found within the index.
 
         Parameters
         ----------
         key : label or tuple of labels (one for each level)
+            A label or tuple of labels that correspond to the levels of the MultiIndex.
+            The key must match the structure of the MultiIndex.
 
         Returns
         -------
diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
@@ -1450,6 +1450,7 @@ def kde(
         self,
         bw_method: Literal["scott", "silverman"] | float | Callable | None = None,
         ind: np.ndarray | int | None = None,
+        weights: np.ndarray | None = None,
         **kwargs,
     ) -> PlotAccessor:
         """
@@ -1475,6 +1476,9 @@ def kde(
             1000 equally spaced points are used. If `ind` is a NumPy array, the
             KDE is evaluated at the points passed. If `ind` is an integer,
             `ind` number of equally spaced points are used.
+        weights : NumPy array, optional
+            Weights of datapoints. This must be the same shape as datapoints.
+            If None (default), the samples are assumed to be equally weighted.
         **kwargs
             Additional keyword arguments are documented in
             :meth:`DataFrame.plot`.
@@ -1560,7 +1564,7 @@ def kde(
 
             >>> ax = df.plot.kde(ind=[1, 2, 3, 4, 5, 6])
         """
-        return self(kind="kde", bw_method=bw_method, ind=ind, **kwargs)
+        return self(kind="kde", bw_method=bw_method, ind=ind, weights=weights, **kwargs)
 
     density = kde
 
diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py
@@ -269,6 +269,7 @@ def _plot(  #  type: ignore[override]
         y: np.ndarray,
         style=None,
         bw_method=None,
+        weights=None,
         ind=None,
         column_num=None,
         stacking_id: int | None = None,
@@ -277,7 +278,7 @@ def _plot(  #  type: ignore[override]
         from scipy.stats import gaussian_kde
 
         y = remove_na_arraylike(y)
-        gkde = gaussian_kde(y, bw_method=bw_method)
+        gkde = gaussian_kde(y, bw_method=bw_method, weights=weights)
 
         y = gkde.evaluate(ind)
         lines = MPLPlot._plot(ax, ind, y, style=style, **kwds)
diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
@@ -735,7 +735,6 @@ def test_interval(self):
         tm.assert_numpy_array_equal(cat.codes, expected_codes)
         tm.assert_index_equal(cat.categories, idx)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_categorical_extension_array_nullable(self, nulls_fixture):
         # GH:
         arr = pd.arrays.StringArray._from_sequence(
diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     DataFrame,
     Series,
@@ -119,7 +117,6 @@ def test_dataframe_array_ea_dtypes():
     assert arr.flags.writeable is False
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_dataframe_array_string_dtype():
     df = DataFrame({"a": ["a", "b"]}, dtype="string")
     arr = np.asarray(df)
diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
@@ -84,7 +84,6 @@ def test_astype_numpy_to_ea():
     assert np.shares_memory(get_array(ser), get_array(result))
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize(
     "dtype, new_dtype", [("object", "string"), ("string", "object")]
 )
@@ -98,7 +97,6 @@ def test_astype_string_and_object(dtype, new_dtype):
     tm.assert_frame_equal(df, df_orig)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize(
     "dtype, new_dtype", [("object", "string"), ("string", "object")]
 )
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas.util._test_decorators as td
 
 from pandas.core.dtypes.astype import astype_array
@@ -130,7 +128,6 @@ def test_dtype_equal(name1, dtype1, name2, dtype2):
         assert not com.is_dtype_equal(dtype1, dtype2)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("name,dtype", list(dtypes.items()), ids=lambda x: str(x))
 def test_pyarrow_string_import_error(name, dtype):
     # GH-44276
diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py
@@ -9,6 +9,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas import (
     DataFrame,
     Index,
@@ -343,6 +345,7 @@ def test_infer_types_boolean_sum(all_parsers):
     tm.assert_frame_equal(result, expected, check_index_type=False)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)])
 def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
     # GH#9435
diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py
@@ -538,6 +538,22 @@ def test_kde_kwargs(self, ts, bw_method, ind):
         pytest.importorskip("scipy")
         _check_plot_works(ts.plot.kde, bw_method=bw_method, ind=ind)
 
+    @pytest.mark.parametrize(
+        "bw_method, ind, weights",
+        [
+            ["scott", 20, None],
+            [None, 20, None],
+            [None, np.int_(20), None],
+            [0.5, np.linspace(-100, 100, 20), None],
+            ["scott", 40, np.linspace(0.0, 2.0, 50)],
+        ],
+    )
+    def test_kde_kwargs_weights(self, bw_method, ind, weights):
+        # GH59337
+        pytest.importorskip("scipy")
+        s = Series(np.random.default_rng(2).uniform(size=50))
+        _check_plot_works(s.plot.kde, bw_method=bw_method, ind=ind, weights=weights)
+
     def test_density_kwargs(self, ts):
         pytest.importorskip("scipy")
         sample_points = np.linspace(-100, 100, 20)
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -2118,7 +2118,7 @@ def test_series_string_inference_storage_definition(self):
         # returning the NA string dtype, so expected is changed from
         # "string[pyarrow_numpy]" to "string[python]"
         pytest.importorskip("pyarrow")
-        expected = Series(["a", "b"], dtype="string[pyarrow]")
+        expected = Series(["a", "b"], dtype="string[python]")
         with pd.option_context("future.infer_string", True):
             result = Series(["a", "b"], dtype="string")
         tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py
@@ -2,7 +2,15 @@
 
 import pandas as pd
 
-object_pyarrow_numpy = ("object", "string[pyarrow_numpy]")
+
+def is_object_or_nan_string_dtype(dtype):
+    """
+    Check if string-like dtype is following NaN semantics, i.e. is object
+    dtype or a NaN-variant of the StringDtype.
+    """
+    return (isinstance(dtype, np.dtype) and dtype == "object") or (
+        dtype.na_value is np.nan
+    )
 
 
 def _convert_na_value(ser, expected):
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py
diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py

Original file line number	Diff line number	Diff line change
`@@ -84,7 +84,6 @@ def test_astype_numpy_to_ea():`
`84`	`84`	`assert np.shares_memory(get_array(ser), get_array(result))`
`85`	`85`
`86`	`86`
`87`		`-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")`
`88`	`87`	`@pytest.mark.parametrize(`
`89`	`88`	`"dtype, new_dtype", [("object", "string"), ("string", "object")]`
`90`	`89`	`)`
`@@ -98,7 +97,6 @@ def test_astype_string_and_object(dtype, new_dtype):`
`98`	`97`	`tm.assert_frame_equal(df, df_orig)`
`99`	`98`
`100`	`99`
`101`		`-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")`
`102`	`100`	`@pytest.mark.parametrize(`
`103`	`101`	`"dtype, new_dtype", [("object", "string"), ("string", "object")]`
`104`	`102`	`)`