Skip to content

Commit 8be7259

Browse files
committed
BUG: Fixed slow plotting with DatetimeIndex
1 parent cdc5b74 commit 8be7259

File tree

4 files changed

+358
-27
lines changed

4 files changed

+358
-27
lines changed

pandas/plotting/_matplotlib/converter.py

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
is_integer_dtype,
4141
is_nested_list_like,
4242
)
43+
from pandas.core.dtypes.generic import ABCDatetimeIndex
4344

4445
from pandas import (
4546
Index,
@@ -301,17 +302,37 @@ def try_parse(values):
301302
except Exception:
302303
return values
303304

305+
# Fast path for single values
304306
if isinstance(values, (datetime, pydt.date, np.datetime64, pydt.time)):
305307
return mdates.date2num(values)
306308
elif is_integer(values) or is_float(values):
307309
return values
308310
elif isinstance(values, str):
309311
return try_parse(values)
310312
elif isinstance(values, (list, tuple, np.ndarray, Index, Series)):
313+
# Check for cache to avoid redundant conversions
314+
# This is especially important for DataFrames with the same DatetimeIndex
315+
# for all columns
316+
if isinstance(values, Index) and hasattr(axis, "_converter_cache"):
317+
cache_key = id(values)
318+
if cache_key in axis._converter_cache:
319+
return axis._converter_cache[cache_key]
320+
311321
if isinstance(values, Series):
312322
# https://github.com/matplotlib/matplotlib/issues/11391
313323
# Series was skipped. Convert to DatetimeIndex to get asi8
314324
values = Index(values)
325+
326+
# For DatetimeIndex objects, directly use _mpl_repr() for better efficiency
327+
if isinstance(values, ABCDatetimeIndex):
328+
result = values._mpl_repr()
329+
# Cache result for reuse with subsequent columns
330+
if hasattr(axis, "_converter_cache"):
331+
axis._converter_cache[id(values)] = result
332+
elif axis is not None:
333+
axis._converter_cache = {id(values): result}
334+
return result
335+
315336
if isinstance(values, Index):
316337
values = values.values
317338
if not isinstance(values, np.ndarray):
@@ -325,7 +346,15 @@ def try_parse(values):
325346
except Exception:
326347
pass
327348

328-
values = mdates.date2num(values)
349+
result = mdates.date2num(values)
350+
351+
# Cache result if possible
352+
if hasattr(axis, "_converter_cache"):
353+
axis._converter_cache[id(values)] = result
354+
elif axis is not None:
355+
axis._converter_cache = {id(values): result}
356+
357+
return result
329358

330359
return values
331360

@@ -426,10 +455,29 @@ def __call__(self):
426455
)
427456

428457
interval = self._get_interval()
429-
freq = f"{interval}ms"
458+
459+
# Use seconds instead of milliseconds for large intervals to improve performance
460+
if interval >= 1000:
461+
# Use seconds instead of ms for better performance
462+
sec_interval = interval / 1000
463+
freq = f"{sec_interval}s"
464+
else:
465+
freq = f"{interval}ms"
466+
430467
tz = self.tz.tzname(None)
431468
st = dmin.replace(tzinfo=None)
432469
ed = dmax.replace(tzinfo=None)
470+
471+
# Limit ticks for large date ranges to improve performance
472+
date_diff = (ed - st).total_seconds()
473+
if (
474+
date_diff > 86400 * 365 and interval < 1000
475+
): # Year+ of data with small interval
476+
# Generate limited ticks for large datasets instead of a full date range
477+
num_ticks = max_millis_ticks
478+
tick_locs = np.linspace(mdates.date2num(st), mdates.date2num(ed), num_ticks)
479+
return tick_locs
480+
433481
all_dates = date_range(start=st, end=ed, freq=freq, tz=tz).astype(object)
434482

435483
try:

pandas/plotting/_matplotlib/core.py

Lines changed: 176 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1549,9 +1549,157 @@ def __init__(self, data, **kwargs) -> None:
15491549
self.data = self.data.fillna(value=0)
15501550

15511551
def _make_plot(self, fig: Figure) -> None:
1552+
"""Create the plot.
1553+
1554+
This method contains a fast path optimization for DataFrames with DatetimeIndex
1555+
and multiple columns. For large DataFrames with DatetimeIndex, plotting can be
1556+
very slow due to the overhead of date conversions for each column.
1557+
1558+
The optimization follows this strategy:
1559+
1. For the first column only: Use standard DatetimeIndex plotting to get ticks
1560+
2. For remaining columns: Plot with a simpler numeric index (much faster)
1561+
3. Apply the datetime tick labels from the first plot to all other plots
1562+
1563+
This avoids redundant DatetimeIndex -> PeriodIndex conversions and tick
1564+
calculations when plotting many columns with the same index.
1565+
1566+
The optimization can yield a 10x+ speedup on large DataFrames with many columns.
1567+
"""
1568+
# Fast path for DatetimeIndex with many columns
1569+
# Implement the same strategy as the user's workaround that showed 11x speedup
1570+
if (
1571+
self._is_ts_plot()
1572+
and isinstance(self.data.index, ABCDatetimeIndex)
1573+
and len(self.data.columns) >= 2
1574+
): # Need at least 2 columns for this optimization
1575+
# Get the first axis for the plot
1576+
ax = self._get_ax(0)
1577+
1578+
# STEP 1: Plot only the first column to get datetime ticks
1579+
first_column = self.data.iloc[:, 0]
1580+
first_series = first_column.copy()
1581+
first_style = None
1582+
1583+
# Apply colors and style just for first column
1584+
colors = self._get_colors()
1585+
first_col_label = self.data.columns[0]
1586+
kwds = self.kwds.copy()
1587+
if self.color is not None:
1588+
kwds["color"] = self.color
1589+
1590+
# Set up style for first column
1591+
first_style, kwds = self._apply_style_colors(
1592+
colors,
1593+
kwds,
1594+
0,
1595+
first_col_label, # type: ignore[arg-type]
1596+
)
1597+
1598+
# Add label to kwds for the first column
1599+
first_label = pprint_thing(first_col_label)
1600+
first_label = self._mark_right_label(first_label, index=0)
1601+
kwds["label"] = first_label
1602+
1603+
# Plot the first column with DatetimeIndex to set up ticks
1604+
first_ax = self._get_ax(0)
1605+
# We need to specifically add column_num for stacking
1606+
kwds["column_num"] = 0
1607+
lines = self._ts_plot(
1608+
first_ax, None, first_series, style=first_style, **kwds
1609+
)
1610+
1611+
# Get the x-ticks and labels from the first plot
1612+
xticks = first_ax.get_xticks()
1613+
xticklabels = [label.get_text() for label in first_ax.get_xticklabels()]
1614+
1615+
# Keep reference to the first line for the legend
1616+
first_line = lines[0]
1617+
self._append_legend_handles_labels(first_line, first_label)
1618+
1619+
# STEP 2: Plot all columns with a numeric index (much faster)
1620+
# Reset axes for faster plotting
1621+
data_without_index = self.data.reset_index(drop=True)
1622+
1623+
# Plot remaining columns
1624+
stacking_id = self._get_stacking_id()
1625+
is_errorbar = com.any_not_none(*self.errors.values())
1626+
1627+
# Skip the first column and process the remaining ones
1628+
for i, (col_idx, (label, y)) in enumerate(
1629+
zip(
1630+
range(1, len(data_without_index.columns)),
1631+
list(data_without_index.items())[1:],
1632+
)
1633+
):
1634+
# Get the actual axis for this column - use the right column index
1635+
# Note: i is 0-based for the remaining columns after skipping the first
1636+
ax = self._get_ax(col_idx) # Use col_idx which starts from 1
1637+
1638+
# Reset kwds for each column
1639+
kwds = self.kwds.copy()
1640+
if self.color is not None:
1641+
kwds["color"] = self.color
1642+
1643+
# Apply style and colors
1644+
style, kwds = self._apply_style_colors(
1645+
colors,
1646+
kwds,
1647+
col_idx, # Use 1-based index to match column
1648+
label, # type: ignore[arg-type]
1649+
)
1650+
1651+
# Handle any error bars
1652+
errors = self._get_errorbars(label=label, index=col_idx)
1653+
kwds = dict(kwds, **errors)
1654+
1655+
# Format the label
1656+
label_str = pprint_thing(label)
1657+
label_str = self._mark_right_label(label_str, index=col_idx)
1658+
kwds["label"] = label_str
1659+
1660+
# Add column number for stacking
1661+
kwds["column_num"] = col_idx
1662+
1663+
try:
1664+
# Use regular plot (not ts_plot) for better performance
1665+
newlines = self._plot(
1666+
ax,
1667+
data_without_index.index, # Use numeric index for speed
1668+
np.asarray(y.values),
1669+
style=style,
1670+
stacking_id=stacking_id,
1671+
is_errorbar=is_errorbar,
1672+
**kwds,
1673+
)
1674+
self._append_legend_handles_labels(newlines[0], label_str)
1675+
1676+
# STEP 3: Apply the datetime x-axis formatting to each plot
1677+
# Use ticks from first plot for all subsequent plots
1678+
num_ticks = len(xticks)
1679+
new_xticks = np.linspace(0, len(self.data.index) - 1, num_ticks)
1680+
ax.set_xlim(0, len(self.data.index) - 1)
1681+
ax.set_xticks(new_xticks)
1682+
ax.set_xticklabels(xticklabels)
1683+
except Exception as e:
1684+
# If anything goes wrong with the plotting, log it but don't crash
1685+
# This ensures the fix doesn't introduce new issues
1686+
import warnings
1687+
1688+
warnings.warn(
1689+
f"Fast path plotting failed for column {col_idx}: {e!s}. "
1690+
"Falling back to regular plotting method for remaining columns",
1691+
stacklevel=2,
1692+
)
1693+
# Return without 'return' to fall back to the normal plotting path
1694+
break
1695+
else:
1696+
# If we've successfully plotted all columns, return from the method
1697+
# We've already plotted everything with the fast path
1698+
return
1699+
1700+
# Regular path for other cases
15521701
if self._is_ts_plot():
15531702
data = maybe_convert_index(self._get_ax(0), self.data)
1554-
15551703
x = data.index # dummy, not used
15561704
plotf = self._ts_plot
15571705
it = data.items()
@@ -1570,6 +1718,7 @@ def _make_plot(self, fig: Figure) -> None:
15701718
is_errorbar = com.any_not_none(*self.errors.values())
15711719

15721720
colors = self._get_colors()
1721+
15731722
for i, (label, y) in enumerate(it):
15741723
ax = self._get_ax(i)
15751724
kwds = self.kwds.copy()
@@ -1636,15 +1785,34 @@ def _ts_plot(self, ax: Axes, x, data: Series, style=None, **kwds):
16361785
# accept x to be consistent with normal plot func,
16371786
# x is not passed to tsplot as it uses data.index as x coordinate
16381787
# column_num must be in kwds for stacking purpose
1639-
freq, data = prepare_ts_data(data, ax, kwds)
16401788

1641-
# TODO #54485
1642-
ax._plot_data.append((data, self._kind, kwds)) # type: ignore[attr-defined]
1789+
# Optimization for multi-column DatetimeIndex plots
1790+
if hasattr(ax, "_datetime_ticks_setup_done") and kwds.get("column_num", 0) > 0:
1791+
# Skip the expensive date axis setup for columns after the first one
1792+
# We'll just copy the ticks from the first plot
1793+
freq = getattr(ax, "freq", None)
1794+
lines = self._plot(
1795+
ax, data.index, np.asarray(data.values), style=style, **kwds
1796+
)
1797+
1798+
if hasattr(ax, "_xticks") and hasattr(ax, "_xticklabels"):
1799+
# Use the stored ticks and labels from the first column plot
1800+
ax.set_xticks(ax._xticks)
1801+
ax.set_xticklabels(ax._xticklabels)
1802+
else:
1803+
# Regular path for first column or non-optimized plots
1804+
freq, data = prepare_ts_data(data, ax, kwds)
1805+
1806+
# TODO #54485
1807+
ax._plot_data.append((data, self._kind, kwds)) # type: ignore[attr-defined]
1808+
1809+
lines = self._plot(
1810+
ax, data.index, np.asarray(data.values), style=style, **kwds
1811+
)
1812+
# set date formatter, locators and rescale limits
1813+
# TODO #54485
1814+
format_dateaxis(ax, ax.freq, data.index) # type: ignore[arg-type, attr-defined]
16431815

1644-
lines = self._plot(ax, data.index, np.asarray(data.values), style=style, **kwds)
1645-
# set date formatter, locators and rescale limits
1646-
# TODO #54485
1647-
format_dateaxis(ax, ax.freq, data.index) # type: ignore[arg-type, attr-defined]
16481816
return lines
16491817

16501818
@final

0 commit comments

Comments
 (0)