Skip to content

Commit 24dcc63

Browse files
authored
Merge branch 'pandas-dev:main' into main
2 parents 779e43a + 197e8db commit 24dcc63

File tree

12 files changed

+184
-26
lines changed

12 files changed

+184
-26
lines changed

ci/code_checks.sh

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,10 +131,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
131131
-i "pandas.Timestamp.nanosecond GL08" \
132132
-i "pandas.Timestamp.resolution PR02" \
133133
-i "pandas.Timestamp.tzinfo GL08" \
134-
-i "pandas.Timestamp.value GL08" \
135134
-i "pandas.Timestamp.year GL08" \
136135
-i "pandas.api.extensions.ExtensionArray.interpolate PR01,SA01" \
137-
-i "pandas.api.interchange.from_dataframe RT03,SA01" \
138136
-i "pandas.api.types.is_bool PR01,SA01" \
139137
-i "pandas.api.types.is_categorical_dtype SA01" \
140138
-i "pandas.api.types.is_complex PR01,SA01" \

doc/source/whatsnew/v3.0.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ Other enhancements
5353
- :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`)
5454
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
5555
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
56+
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
5657
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
5758
- Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
5859
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
@@ -503,6 +504,7 @@ Performance improvements
503504
- :meth:`Series.str.extract` returns :class:`RangeIndex` columns instead of :class:`Index` columns when possible (:issue:`57542`)
504505
- :meth:`Series.str.partition` with :class:`ArrowDtype` returns :class:`RangeIndex` columns instead of :class:`Index` columns when possible (:issue:`57768`)
505506
- Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`)
507+
- Performance improvement in :class:`MultiIndex` when setting :attr:`MultiIndex.names`, which no longer invalidates all cached operations (:issue:`59578`)
506508
- Performance improvement in :meth:`DataFrame.join` for sorted but non-unique indexes (:issue:`56941`)
507509
- Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`)
508510
- Performance improvement in :meth:`DataFrame.join` with ``how="left"`` or ``how="right"`` and ``sort=True`` (:issue:`56919`)

pandas/_libs/tslibs/timestamps.pyx

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,27 @@ cdef class _Timestamp(ABCTimestamp):
240240

241241
@property
242242
def value(self) -> int:
243+
"""
244+
Return the value of the Timestamp.
245+
246+
Returns
247+
-------
248+
int
249+
The integer representation of the Timestamp object in nanoseconds
250+
since the Unix epoch (1970-01-01 00:00:00 UTC).
251+
252+
See Also
253+
--------
254+
Timestamp.second : Return the second of the Timestamp.
255+
Timestamp.minute : Return the minute of the Timestamp.
256+
257+
Examples
258+
--------
259+
>>> ts = pd.Timestamp("2024-08-31 16:16:30")
260+
>>> ts.value
261+
1725120990000000000
262+
"""
263+
243264
try:
244265
return convert_reso(self._value, self._creso, NPY_FR_ns, False)
245266
except OverflowError:
@@ -1020,8 +1041,8 @@ cdef class _Timestamp(ABCTimestamp):
10201041

10211042
See Also
10221043
--------
1023-
Timestamp.day : Return the day of the year.
1024-
Timestamp.year : Return the year of the week.
1044+
Timestamp.day : Return the day of the Timestamp.
1045+
Timestamp.year : Return the year of the Timestamp.
10251046

10261047
Examples
10271048
--------

pandas/_testing/asserters.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ def assert_index_equal(
188188
check_order: bool = True,
189189
rtol: float = 1.0e-5,
190190
atol: float = 1.0e-8,
191-
obj: str = "Index",
191+
obj: str | None = None,
192192
) -> None:
193193
"""
194194
Check that left and right Index are equal.
@@ -217,7 +217,7 @@ def assert_index_equal(
217217
Relative tolerance. Only used when check_exact is False.
218218
atol : float, default 1e-8
219219
Absolute tolerance. Only used when check_exact is False.
220-
obj : str, default 'Index'
220+
obj : str, default 'Index' or 'MultiIndex'
221221
Specify object name being compared, internally used to show appropriate
222222
assertion message.
223223
@@ -235,6 +235,9 @@ def assert_index_equal(
235235
"""
236236
__tracebackhide__ = True
237237

238+
if obj is None:
239+
obj = "MultiIndex" if isinstance(left, MultiIndex) else "Index"
240+
238241
def _check_types(left, right, obj: str = "Index") -> None:
239242
if not exact:
240243
return
@@ -283,7 +286,7 @@ def _check_types(left, right, obj: str = "Index") -> None:
283286
right = cast(MultiIndex, right)
284287

285288
for level in range(left.nlevels):
286-
lobj = f"MultiIndex level [{level}]"
289+
lobj = f"{obj} level [{level}]"
287290
try:
288291
# try comparison on levels/codes to avoid densifying MultiIndex
289292
assert_index_equal(
@@ -314,7 +317,7 @@ def _check_types(left, right, obj: str = "Index") -> None:
314317
obj=lobj,
315318
)
316319
# get_level_values may change dtype
317-
_check_types(left.levels[level], right.levels[level], obj=obj)
320+
_check_types(left.levels[level], right.levels[level], obj=lobj)
318321

319322
# skip exact index checking when `check_categorical` is False
320323
elif check_exact and check_categorical:
@@ -527,7 +530,7 @@ def assert_interval_array_equal(
527530
kwargs["check_freq"] = False
528531

529532
assert_equal(left._left, right._left, obj=f"{obj}.left", **kwargs)
530-
assert_equal(left._right, right._right, obj=f"{obj}.left", **kwargs)
533+
assert_equal(left._right, right._right, obj=f"{obj}.right", **kwargs)
531534

532535
assert_attr_equal("closed", left, right, obj=obj)
533536

pandas/core/indexes/multi.py

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -799,7 +799,7 @@ def dtypes(self) -> Series:
799799
"""
800800
from pandas import Series
801801

802-
names = com.fill_missing_names([level.name for level in self.levels])
802+
names = com.fill_missing_names(self.names)
803803
return Series([level.dtype for level in self.levels], index=Index(names))
804804

805805
def __len__(self) -> int:
@@ -1572,7 +1572,7 @@ def _format_multi(
15721572
def _get_names(self) -> FrozenList:
15731573
return FrozenList(self._names)
15741574

1575-
def _set_names(self, names, *, level=None, validate: bool = True) -> None:
1575+
def _set_names(self, names, *, level=None) -> None:
15761576
"""
15771577
Set new names on index. Each name has to be a hashable type.
15781578
@@ -1583,8 +1583,6 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None:
15831583
level : int, level name, or sequence of int/level names (default None)
15841584
If the index is a MultiIndex (hierarchical), level(s) to set (None
15851585
for all levels). Otherwise level must be None
1586-
validate : bool, default True
1587-
validate that the names match level lengths
15881586
15891587
Raises
15901588
------
@@ -1603,13 +1601,12 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None:
16031601
raise ValueError("Names should be list-like for a MultiIndex")
16041602
names = list(names)
16051603

1606-
if validate:
1607-
if level is not None and len(names) != len(level):
1608-
raise ValueError("Length of names must match length of level.")
1609-
if level is None and len(names) != self.nlevels:
1610-
raise ValueError(
1611-
"Length of names must match number of levels in MultiIndex."
1612-
)
1604+
if level is not None and len(names) != len(level):
1605+
raise ValueError("Length of names must match length of level.")
1606+
if level is None and len(names) != self.nlevels:
1607+
raise ValueError(
1608+
"Length of names must match number of levels in MultiIndex."
1609+
)
16131610

16141611
if level is None:
16151612
level = range(self.nlevels)
@@ -1627,8 +1624,9 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None:
16271624
)
16281625
self._names[lev] = name
16291626

1630-
# If .levels has been accessed, the names in our cache will be stale.
1631-
self._reset_cache()
1627+
# If .levels has been accessed, the .name of each level in our cache
1628+
# will be stale.
1629+
self._reset_cache("levels")
16321630

16331631
names = property(
16341632
fset=_set_names,

pandas/core/interchange/from_dataframe.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,13 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame:
6060
Returns
6161
-------
6262
pd.DataFrame
63+
A pandas DataFrame built from the provided interchange
64+
protocol object.
65+
66+
See Also
67+
--------
68+
pd.DataFrame : DataFrame class which can be created from various input data
69+
formats, including objects that support the interchange protocol.
6370
6471
Examples
6572
--------

pandas/plotting/_matplotlib/core.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1343,6 +1343,22 @@ def _make_plot(self, fig: Figure) -> None:
13431343
label = self.label
13441344
else:
13451345
label = None
1346+
1347+
# if a list of non-color strings is passed in as c, color points
1348+
# by uniqueness of the strings, such that identical strings get the same color
1349+
create_colors = not self._are_valid_colors(c_values)
1350+
if create_colors:
1351+
color_mapping = self._get_color_mapping(c_values)
1352+
c_values = [color_mapping[s] for s in c_values]
1353+
1354+
# build legend for labeling custom colors
1355+
ax.legend(
1356+
handles=[
1357+
mpl.patches.Circle((0, 0), facecolor=c, label=s)
1358+
for s, c in color_mapping.items()
1359+
]
1360+
)
1361+
13461362
scatter = ax.scatter(
13471363
data[x].values,
13481364
data[y].values,
@@ -1353,6 +1369,7 @@ def _make_plot(self, fig: Figure) -> None:
13531369
s=self.s,
13541370
**self.kwds,
13551371
)
1372+
13561373
if cb:
13571374
cbar_label = c if c_is_column else ""
13581375
cbar = self._plot_colorbar(ax, fig=fig, label=cbar_label)
@@ -1392,6 +1409,30 @@ def _get_c_values(self, color, color_by_categorical: bool, c_is_column: bool):
13921409
c_values = c
13931410
return c_values
13941411

1412+
def _are_valid_colors(self, c_values: Series) -> bool:
1413+
# check if c_values contains strings and if these strings are valid mpl colors.
1414+
# no need to check numerics as these (and mpl colors) will be validated for us
1415+
# in .Axes.scatter._parse_scatter_color_args(...)
1416+
unique = np.unique(c_values)
1417+
try:
1418+
if len(c_values) and all(isinstance(c, str) for c in unique):
1419+
mpl.colors.to_rgba_array(unique)
1420+
1421+
return True
1422+
1423+
except (TypeError, ValueError) as _:
1424+
return False
1425+
1426+
def _get_color_mapping(self, c_values: Series) -> dict[str, np.ndarray]:
1427+
unique = np.unique(c_values)
1428+
n_colors = len(unique)
1429+
1430+
# passing `None` here will default to :rc:`image.cmap`
1431+
cmap = mpl.colormaps.get_cmap(self.colormap)
1432+
colors = cmap(np.linspace(0, 1, n_colors)) # RGB tuples
1433+
1434+
return dict(zip(unique, colors))
1435+
13951436
def _get_norm_and_cmap(self, c_values, color_by_categorical: bool):
13961437
c = self.c
13971438
if self.colormap is not None:

pandas/tests/indexing/multiindex/test_chaining_and_caching.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from pandas import (
66
DataFrame,
77
MultiIndex,
8+
RangeIndex,
89
Series,
910
)
1011
import pandas._testing as tm
@@ -68,3 +69,19 @@ def test_indexer_caching(monkeypatch):
6869
s[s == 0] = 1
6970
expected = Series(np.ones(size_cutoff), index=index)
7071
tm.assert_series_equal(s, expected)
72+
73+
74+
def test_set_names_only_clears_level_cache():
75+
mi = MultiIndex.from_arrays([range(4), range(4)], names=["a", "b"])
76+
mi.dtypes
77+
mi.is_monotonic_increasing
78+
mi._engine
79+
mi.levels
80+
old_cache_keys = sorted(mi._cache.keys())
81+
assert old_cache_keys == ["_engine", "dtypes", "is_monotonic_increasing", "levels"]
82+
mi.names = ["A", "B"]
83+
new_cache_keys = sorted(mi._cache.keys())
84+
assert new_cache_keys == ["_engine", "dtypes", "is_monotonic_increasing"]
85+
new_levels = mi.levels
86+
tm.assert_index_equal(new_levels[0], RangeIndex(4, name="A"))
87+
tm.assert_index_equal(new_levels[1], RangeIndex(4, name="B"))

pandas/tests/plotting/frame/test_frame_color.py

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,8 +217,53 @@ def test_scatter_with_c_column_name_with_colors(self, cmap):
217217
ax = df.plot.scatter(x=0, y=1, cmap=cmap, c="species")
218218
else:
219219
ax = df.plot.scatter(x=0, y=1, c="species", cmap=cmap)
220+
221+
assert len(np.unique(ax.collections[0].get_facecolor(), axis=0)) == 3 # r/g/b
222+
assert (
223+
np.unique(ax.collections[0].get_facecolor(), axis=0)
224+
== np.array(
225+
[[0.0, 0.0, 1.0, 1.0], [0.0, 0.5, 0.0, 1.0], [1.0, 0.0, 0.0, 1.0]]
226+
) # r/g/b
227+
).all()
220228
assert ax.collections[0].colorbar is None
221229

230+
def test_scatter_with_c_column_name_without_colors(self):
231+
# Given
232+
colors = ["NY", "MD", "MA", "CA"]
233+
color_count = 4 # 4 unique colors
234+
235+
# When
236+
df = DataFrame(
237+
{
238+
"dataX": range(100),
239+
"dataY": range(100),
240+
"color": (colors[i % len(colors)] for i in range(100)),
241+
}
242+
)
243+
244+
# Then
245+
ax = df.plot.scatter("dataX", "dataY", c="color")
246+
assert len(np.unique(ax.collections[0].get_facecolor(), axis=0)) == color_count
247+
248+
# Given
249+
colors = ["r", "g", "not-a-color"]
250+
color_count = 3
251+
# Also, since not all are mpl-colors, points matching 'r' or 'g'
252+
# are not necessarily red or green
253+
254+
# When
255+
df = DataFrame(
256+
{
257+
"dataX": range(100),
258+
"dataY": range(100),
259+
"color": (colors[i % len(colors)] for i in range(100)),
260+
}
261+
)
262+
263+
# Then
264+
ax = df.plot.scatter("dataX", "dataY", c="color")
265+
assert len(np.unique(ax.collections[0].get_facecolor(), axis=0)) == color_count
266+
222267
def test_scatter_colors(self):
223268
df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
224269
with pytest.raises(TypeError, match="Specify exactly one of `c` and `color`"):
@@ -229,7 +274,14 @@ def test_scatter_colors_not_raising_warnings(self):
229274
# provided via 'c'. Parameters 'cmap' will be ignored
230275
df = DataFrame({"x": [1, 2, 3], "y": [1, 2, 3]})
231276
with tm.assert_produces_warning(None):
232-
df.plot.scatter(x="x", y="y", c="b")
277+
ax = df.plot.scatter(x="x", y="y", c="b")
278+
assert (
279+
len(np.unique(ax.collections[0].get_facecolor(), axis=0)) == 1
280+
) # blue
281+
assert (
282+
np.unique(ax.collections[0].get_facecolor(), axis=0)
283+
== np.array([[0.0, 0.0, 1.0, 1.0]])
284+
).all() # blue
233285

234286
def test_scatter_colors_default(self):
235287
df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})

pandas/tests/util/test_assert_frame_equal.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def test_frame_equal_shape_mismatch(df1, df2, frame_or_series):
7979
DataFrame.from_records(
8080
{"a": [1.0, 2.0], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"]
8181
),
82-
"MultiIndex level \\[0\\] are different",
82+
"DataFrame\\.index level \\[0\\] are different",
8383
),
8484
],
8585
)

0 commit comments

Comments
 (0)