Merge branch 'pandas-dev:main' into main

uditbaliyan · web-flow · commit 24dcc63c1527 · 2024-09-04T21:16:54.000+05:30
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -131,10 +131,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Timestamp.nanosecond GL08" \
         -i "pandas.Timestamp.resolution PR02" \
         -i "pandas.Timestamp.tzinfo GL08" \
-        -i "pandas.Timestamp.value GL08" \
         -i "pandas.Timestamp.year GL08" \
         -i "pandas.api.extensions.ExtensionArray.interpolate PR01,SA01" \
-        -i "pandas.api.interchange.from_dataframe RT03,SA01" \
         -i "pandas.api.types.is_bool PR01,SA01" \
         -i "pandas.api.types.is_categorical_dtype SA01" \
         -i "pandas.api.types.is_complex PR01,SA01" \
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -53,6 +53,7 @@ Other enhancements
 - :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`)
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
+- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
 - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
 - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
 - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
@@ -503,6 +504,7 @@ Performance improvements
 - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`)
 - :meth:`Series.str.partition` with :class:`ArrowDtype` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57768`)
 - Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`)
+- Performance improvement in :class:`MultiIndex`: setting :attr:`MultiIndex.names` no longer invalidates all cached operations (:issue:`59578`)
 - Performance improvement in :meth:`DataFrame.join` for sorted but non-unique indexes (:issue:`56941`)
 - Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`)
 - Performance improvement in :meth:`DataFrame.join` with ``how="left"`` or ``how="right"`` and ``sort=True`` (:issue:`56919`)
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
@@ -240,6 +240,27 @@ cdef class _Timestamp(ABCTimestamp):
 
     @property
     def value(self) -> int:
+        """
+        Return the value of the Timestamp.
+
+        Returns
+        -------
+        int
+            The integer representation of the Timestamp object in nanoseconds
+            since the Unix epoch (1970-01-01 00:00:00 UTC).
+
+        See Also
+        --------
+        Timestamp.second : Return the second of the Timestamp.
+        Timestamp.minute : Return the minute of the Timestamp.
+
+        Examples
+        --------
+        >>> ts = pd.Timestamp("2024-08-31 16:16:30")
+        >>> ts.value
+        1725120990000000000
+        """
+
         try:
             return convert_reso(self._value, self._creso, NPY_FR_ns, False)
         except OverflowError:
@@ -1020,8 +1041,8 @@ cdef class _Timestamp(ABCTimestamp):
 
         See Also
         --------
-        Timestamp.day : Return the day of the year.
-        Timestamp.year : Return the year of the week.
+        Timestamp.day : Return the day of the Timestamp.
+        Timestamp.year : Return the year of the Timestamp.
 
         Examples
         --------
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
@@ -188,7 +188,7 @@ def assert_index_equal(
     check_order: bool = True,
     rtol: float = 1.0e-5,
     atol: float = 1.0e-8,
-    obj: str = "Index",
+    obj: str | None = None,
 ) -> None:
     """
     Check that left and right Index are equal.
@@ -217,7 +217,7 @@ def assert_index_equal(
         Relative tolerance. Only used when check_exact is False.
     atol : float, default 1e-8
         Absolute tolerance. Only used when check_exact is False.
-    obj : str, default 'Index'
+    obj : str, default 'Index' or 'MultiIndex'
         Specify object name being compared, internally used to show appropriate
         assertion message.
 
@@ -235,6 +235,9 @@ def assert_index_equal(
     """
     __tracebackhide__ = True
 
+    if obj is None:
+        obj = "MultiIndex" if isinstance(left, MultiIndex) else "Index"
+
     def _check_types(left, right, obj: str = "Index") -> None:
         if not exact:
             return
@@ -283,7 +286,7 @@ def _check_types(left, right, obj: str = "Index") -> None:
         right = cast(MultiIndex, right)
 
         for level in range(left.nlevels):
-            lobj = f"MultiIndex level [{level}]"
+            lobj = f"{obj} level [{level}]"
             try:
                 # try comparison on levels/codes to avoid densifying MultiIndex
                 assert_index_equal(
@@ -314,7 +317,7 @@ def _check_types(left, right, obj: str = "Index") -> None:
                     obj=lobj,
                 )
             # get_level_values may change dtype
-            _check_types(left.levels[level], right.levels[level], obj=obj)
+            _check_types(left.levels[level], right.levels[level], obj=lobj)
 
     # skip exact index checking when `check_categorical` is False
     elif check_exact and check_categorical:
@@ -527,7 +530,7 @@ def assert_interval_array_equal(
         kwargs["check_freq"] = False
 
     assert_equal(left._left, right._left, obj=f"{obj}.left", **kwargs)
-    assert_equal(left._right, right._right, obj=f"{obj}.left", **kwargs)
+    assert_equal(left._right, right._right, obj=f"{obj}.right", **kwargs)
 
     assert_attr_equal("closed", left, right, obj=obj)
 
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -799,7 +799,7 @@ def dtypes(self) -> Series:
         """
         from pandas import Series
 
-        names = com.fill_missing_names([level.name for level in self.levels])
+        names = com.fill_missing_names(self.names)
         return Series([level.dtype for level in self.levels], index=Index(names))
 
     def __len__(self) -> int:
@@ -1572,7 +1572,7 @@ def _format_multi(
     def _get_names(self) -> FrozenList:
         return FrozenList(self._names)
 
-    def _set_names(self, names, *, level=None, validate: bool = True) -> None:
+    def _set_names(self, names, *, level=None) -> None:
         """
         Set new names on index. Each name has to be a hashable type.
 
@@ -1583,8 +1583,6 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None:
         level : int, level name, or sequence of int/level names (default None)
             If the index is a MultiIndex (hierarchical), level(s) to set (None
             for all levels).  Otherwise level must be None
-        validate : bool, default True
-            validate that the names match level lengths
 
         Raises
         ------
@@ -1603,13 +1601,12 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None:
             raise ValueError("Names should be list-like for a MultiIndex")
         names = list(names)
 
-        if validate:
-            if level is not None and len(names) != len(level):
-                raise ValueError("Length of names must match length of level.")
-            if level is None and len(names) != self.nlevels:
-                raise ValueError(
-                    "Length of names must match number of levels in MultiIndex."
-                )
+        if level is not None and len(names) != len(level):
+            raise ValueError("Length of names must match length of level.")
+        if level is None and len(names) != self.nlevels:
+            raise ValueError(
+                "Length of names must match number of levels in MultiIndex."
+            )
 
         if level is None:
             level = range(self.nlevels)
@@ -1627,8 +1624,9 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None:
                     )
             self._names[lev] = name
 
-        # If .levels has been accessed, the names in our cache will be stale.
-        self._reset_cache()
+        # If .levels has been accessed, the .name of each level in our cache
+        # will be stale.
+        self._reset_cache("levels")
 
     names = property(
         fset=_set_names,
diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
@@ -60,6 +60,13 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame:
     Returns
     -------
     pd.DataFrame
+        A pandas DataFrame built from the provided interchange
+        protocol object.
+
+    See Also
+    --------
+    pd.DataFrame : DataFrame class which can be created from various input data
+        formats, including objects that support the interchange protocol.
 
     Examples
     --------
diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
@@ -1343,6 +1343,22 @@ def _make_plot(self, fig: Figure) -> None:
             label = self.label
         else:
             label = None
+
+        # if a list of non-color strings is passed in as c, color points
+        # by unique string value, so that equal strings get the same color
+        create_colors = not self._are_valid_colors(c_values)
+        if create_colors:
+            color_mapping = self._get_color_mapping(c_values)
+            c_values = [color_mapping[s] for s in c_values]
+
+            # build legend for labeling custom colors
+            ax.legend(
+                handles=[
+                    mpl.patches.Circle((0, 0), facecolor=c, label=s)
+                    for s, c in color_mapping.items()
+                ]
+            )
+
         scatter = ax.scatter(
             data[x].values,
             data[y].values,
@@ -1353,6 +1369,7 @@ def _make_plot(self, fig: Figure) -> None:
             s=self.s,
             **self.kwds,
         )
+
         if cb:
             cbar_label = c if c_is_column else ""
             cbar = self._plot_colorbar(ax, fig=fig, label=cbar_label)
@@ -1392,6 +1409,30 @@ def _get_c_values(self, color, color_by_categorical: bool, c_is_column: bool):
             c_values = c
         return c_values
 
+    def _are_valid_colors(self, c_values: Series) -> bool:
+        # check if c_values contains strings and if these strings are valid mpl colors.
+        # no need to check numerics as these (and mpl colors) will be validated for us
+        # in .Axes.scatter._parse_scatter_color_args(...)
+        unique = np.unique(c_values)
+        try:
+            if len(c_values) and all(isinstance(c, str) for c in unique):
+                mpl.colors.to_rgba_array(unique)
+
+            return True
+
+        except (TypeError, ValueError) as _:
+            return False
+
+    def _get_color_mapping(self, c_values: Series) -> dict[str, np.ndarray]:
+        unique = np.unique(c_values)
+        n_colors = len(unique)
+
+        # passing `None` here will default to :rc:`image.cmap`
+        cmap = mpl.colormaps.get_cmap(self.colormap)
+        colors = cmap(np.linspace(0, 1, n_colors))  # one RGBA row per value
+
+        return dict(zip(unique, colors))
+
     def _get_norm_and_cmap(self, c_values, color_by_categorical: bool):
         c = self.c
         if self.colormap is not None:
diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py
@@ -5,6 +5,7 @@
 from pandas import (
     DataFrame,
     MultiIndex,
+    RangeIndex,
     Series,
 )
 import pandas._testing as tm
@@ -68,3 +69,19 @@ def test_indexer_caching(monkeypatch):
         s[s == 0] = 1
     expected = Series(np.ones(size_cutoff), index=index)
     tm.assert_series_equal(s, expected)
+
+
+def test_set_names_only_clears_level_cache():
+    mi = MultiIndex.from_arrays([range(4), range(4)], names=["a", "b"])
+    mi.dtypes
+    mi.is_monotonic_increasing
+    mi._engine
+    mi.levels
+    old_cache_keys = sorted(mi._cache.keys())
+    assert old_cache_keys == ["_engine", "dtypes", "is_monotonic_increasing", "levels"]
+    mi.names = ["A", "B"]
+    new_cache_keys = sorted(mi._cache.keys())
+    assert new_cache_keys == ["_engine", "dtypes", "is_monotonic_increasing"]
+    new_levels = mi.levels
+    tm.assert_index_equal(new_levels[0], RangeIndex(4, name="A"))
+    tm.assert_index_equal(new_levels[1], RangeIndex(4, name="B"))
diff --git a/pandas/tests/plotting/frame/test_frame_color.py b/pandas/tests/plotting/frame/test_frame_color.py
@@ -217,8 +217,53 @@ def test_scatter_with_c_column_name_with_colors(self, cmap):
                 ax = df.plot.scatter(x=0, y=1, cmap=cmap, c="species")
         else:
             ax = df.plot.scatter(x=0, y=1, c="species", cmap=cmap)
+
+        assert len(np.unique(ax.collections[0].get_facecolor(), axis=0)) == 3  # r/g/b
+        assert (
+            np.unique(ax.collections[0].get_facecolor(), axis=0)
+            == np.array(
+                [[0.0, 0.0, 1.0, 1.0], [0.0, 0.5, 0.0, 1.0], [1.0, 0.0, 0.0, 1.0]]
+            )  # r/g/b
+        ).all()
         assert ax.collections[0].colorbar is None
 
+    def test_scatter_with_c_column_name_without_colors(self):
+        # Given
+        colors = ["NY", "MD", "MA", "CA"]
+        color_count = 4  # 4 unique colors
+
+        # When
+        df = DataFrame(
+            {
+                "dataX": range(100),
+                "dataY": range(100),
+                "color": (colors[i % len(colors)] for i in range(100)),
+            }
+        )
+
+        # Then
+        ax = df.plot.scatter("dataX", "dataY", c="color")
+        assert len(np.unique(ax.collections[0].get_facecolor(), axis=0)) == color_count
+
+        # Given
+        colors = ["r", "g", "not-a-color"]
+        color_count = 3
+        # Also, since not all are mpl-colors, points matching 'r' or 'g'
+        # are not necessarily red or green
+
+        # When
+        df = DataFrame(
+            {
+                "dataX": range(100),
+                "dataY": range(100),
+                "color": (colors[i % len(colors)] for i in range(100)),
+            }
+        )
+
+        # Then
+        ax = df.plot.scatter("dataX", "dataY", c="color")
+        assert len(np.unique(ax.collections[0].get_facecolor(), axis=0)) == color_count
+
     def test_scatter_colors(self):
         df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
         with pytest.raises(TypeError, match="Specify exactly one of `c` and `color`"):
@@ -229,7 +274,14 @@ def test_scatter_colors_not_raising_warnings(self):
         # provided via 'c'. Parameters 'cmap' will be ignored
         df = DataFrame({"x": [1, 2, 3], "y": [1, 2, 3]})
         with tm.assert_produces_warning(None):
-            df.plot.scatter(x="x", y="y", c="b")
+            ax = df.plot.scatter(x="x", y="y", c="b")
+            assert (
+                len(np.unique(ax.collections[0].get_facecolor(), axis=0)) == 1
+            )  # blue
+            assert (
+                np.unique(ax.collections[0].get_facecolor(), axis=0)
+                == np.array([[0.0, 0.0, 1.0, 1.0]])
+            ).all()  # blue
 
     def test_scatter_colors_default(self):
         df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py
@@ -79,7 +79,7 @@ def test_frame_equal_shape_mismatch(df1, df2, frame_or_series):
             DataFrame.from_records(
                 {"a": [1.0, 2.0], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"]
             ),
-            "MultiIndex level \\[0\\] are different",
+            "DataFrame\\.index level \\[0\\] are different",
         ),
     ],
 )
diff --git a/pandas/tests/util/test_assert_interval_array_equal.py b/pandas/tests/util/test_assert_interval_array_equal.py
@@ -1,7 +1,11 @@
 import pytest
 
-from pandas import interval_range
+from pandas import (
+    Interval,
+    interval_range,
+)
 import pandas._testing as tm
+from pandas.arrays import IntervalArray
 
 
 @pytest.mark.parametrize(
@@ -79,3 +83,18 @@ def test_interval_array_equal_start_mismatch():
 
     with pytest.raises(AssertionError, match=msg):
         tm.assert_interval_array_equal(arr1, arr2)
+
+
+def test_interval_array_equal_end_mismatch_only():
+    arr1 = IntervalArray([Interval(0, 1), Interval(0, 5)])
+    arr2 = IntervalArray([Interval(0, 1), Interval(0, 6)])
+
+    msg = """\
+IntervalArray.right are different
+
+IntervalArray.right values are different \\(50.0 %\\)
+\\[left\\]:  \\[1, 5\\]
+\\[right\\]: \\[1, 6\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_interval_array_equal(arr1, arr2)
diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py

Original file line number	Diff line number	Diff line change
`@@ -79,7 +79,7 @@ def test_frame_equal_shape_mismatch(df1, df2, frame_or_series):`
`79`	`79`	`DataFrame.from_records(`
`80`	`80`	`{"a": [1.0, 2.0], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"]`
`81`	`81`	`),`
`82`		`- "MultiIndex level \\[0\\] are different",`
	`82`	`+ "DataFrame\\.index level \\[0\\] are different",`
`83`	`83`	`),`
`84`	`84`	`],`
`85`	`85`	`)`