Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions lumibot/backtesting/polygon_backtesting.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ def __init__(
allow_option_quote_fallback=True, **kwargs
)

# Polygon's philosophy: always cache the finest granularity available and resample
# up on demand. When a strategy calls get_historical_prices(..., timestep="1 day")
# after a prior "15 minute" call has warmed the cache, the cached minute data must
# be allowed to satisfy the day request so Data.get_bars() can resample minute → day.
# This is the *opposite* of ThetaData, which requires exact timestep matches.
self.allow_day_resampling = True

# Memory limit, off by default
self.MAX_STORAGE_BYTES = max_memory

Expand Down
8 changes: 8 additions & 0 deletions lumibot/backtesting/thetadata_backtesting_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,14 @@ def __init__(
super().__init__(datetime_start=datetime_start, datetime_end=datetime_end, pandas_data=pandas_data,
allow_option_quote_fallback=False, **kwargs)

# ThetaData stores minute and day data under SEPARATE canonical keys
# (asset, quote, "minute") vs (asset, quote, "day") and applies provider-specific
# split-spike repair / split-adjustment only to day bars. Allowing a cached minute
# dataset to silently satisfy a day-bar lookup would bypass that normalisation.
# Set allow_day_resampling=False so find_asset_in_data_store always requires an
# exact timestep match and forces an explicit day fetch when day bars are needed.
self.allow_day_resampling = False

# Default to minute; broker can flip to day for daily strategies.
self._timestep = self.MIN_TIMESTEP
# PERF: Avoid scanning the entire pandas_data store on every quote/snapshot call to infer day-mode.
Expand Down
37 changes: 29 additions & 8 deletions lumibot/data_sources/pandas_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ class PandasData(DataSourceBacktesting):
{"timestep": "minute", "representations": ["1M", "minute"]},
]

def __init__(self, *args, pandas_data=None, auto_adjust=True, allow_option_quote_fallback: bool = False, **kwargs):
def __init__(self, *args, pandas_data=None, auto_adjust=True, allow_option_quote_fallback: bool = False,
allow_day_resampling: bool = True, **kwargs):
super().__init__(*args, **kwargs)
self.option_quote_fallback_allowed = allow_option_quote_fallback
self.name = "pandas"
Expand All @@ -35,6 +36,16 @@ def __init__(self, *args, pandas_data=None, auto_adjust=True, allow_option_quote
self._date_index = None
self._date_supply = None
self._timestep = "minute"
# Controls whether minute-level cached data may satisfy day-bar lookup requests.
#
# True (default) — Polygon and base PandasData: minute data can be resampled to day
# bars by Data.get_bars(), so find_asset_in_data_store accepts a
# minute dataset for a day request.
# False — ThetaData: each timestep has its own normalised/adjusted dataset;
# minute data must never silently proxy for day data. This prevents
# bypassing ThetaData's split-spike repair and forces an explicit day
# fetch when day bars are needed.
self.allow_day_resampling: bool = allow_day_resampling
# PERF: `find_asset_in_data_store()` is called in tight loops (quotes + history). Cache the
# resolved key for repeated `(asset, quote, timestep)` lookups within a backtest run.
self._find_asset_in_data_store_cache = {}
Expand Down Expand Up @@ -414,14 +425,24 @@ def _accepts_timestep(data_obj) -> bool:
# Hour requests can be satisfied by either hour data or minute data (resample).
return data_ts in {"hour", "minute"}
if requested_unit == "day":
# IMPORTANT:
# Keep explicit day requests pinned to native day datasets.
# IMPORTANT — two conflicting philosophies exist across data sources:
#
# allow_day_resampling=False (ThetaData):
# Keep explicit day requests pinned to native day datasets.
# ThetaData stores minute and day data under separate canonical keys
# (asset, quote, "minute") vs (asset, quote, "day"). Allowing minute
# data to satisfy day requests would silently bypass ThetaData's
# split-spike repair / split-adjustment normalisation and could trigger
# expensive re-fetch churn in daily-cadence backtests.
#
# Allowing minute datasets to satisfy day requests can silently bypass provider-
# specific day-bar normalization (for example split-spike repair/timestamp
# alignment in IBKR helpers), and can trigger expensive minute fetch churn in
# daily-cadence backtests.
if requested_asset_type in {"stock", "index"}:
# allow_day_resampling=True (Polygon, base PandasData — the default):
# Polygon's _update_pandas_data always tries to obtain the finest
# granularity available and relies on Data.get_bars() to resample
# minute → day on demand. If only minute data is cached for a stock,
# the day request must be allowed to reach Data.get_bars() so the
# resampling path fires. The same applies to user-provided minute
# CSV data in the plain PandasData source.
if not self.allow_day_resampling:
return data_ts == "day"
return data_ts in {"day", "minute"}
Comment on lines 427 to 447
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Day lookups still reject warmed hourly caches.

With allow_day_resampling=True, this branch now re-enables minute→day reuse, but it still excludes hour. A prior 1 hour fetch followed by 1 day will still miss here, even though lumibot/backtesting/polygon_backtesting.py already preserves hourly caches for that path. Please include hour here and add a matching regression alongside the new minute→day cases.

♻️ Suggested fix
-                return data_ts in {"day", "minute"}
+                return data_ts in {"day", "hour", "minute"}
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
if requested_unit == "day":
# IMPORTANT:
# Keep explicit day requests pinned to native day datasets.
# IMPORTANT — two conflicting philosophies exist across data sources:
#
# allow_day_resampling=False (ThetaData):
# Keep explicit day requests pinned to native day datasets.
# ThetaData stores minute and day data under separate canonical keys
# (asset, quote, "minute") vs (asset, quote, "day"). Allowing minute
# data to satisfy day requests would silently bypass ThetaData's
# split-spike repair / split-adjustment normalisation and could trigger
# expensive re-fetch churn in daily-cadence backtests.
#
# Allowing minute datasets to satisfy day requests can silently bypass provider-
# specific day-bar normalization (for example split-spike repair/timestamp
# alignment in IBKR helpers), and can trigger expensive minute fetch churn in
# daily-cadence backtests.
if requested_asset_type in {"stock", "index"}:
# allow_day_resampling=True (Polygon, base PandasData — the default):
# Polygon's _update_pandas_data always tries to obtain the finest
# granularity available and relies on Data.get_bars() to resample
# minute → day on demand. If only minute data is cached for a stock,
# the day request must be allowed to reach Data.get_bars() so the
# resampling path fires. The same applies to user-provided minute
# CSV data in the plain PandasData source.
if not self.allow_day_resampling:
return data_ts == "day"
return data_ts in {"day", "minute"}
if requested_unit == "day":
# IMPORTANT — two conflicting philosophies exist across data sources:
#
# allow_day_resampling=False (ThetaData):
# Keep explicit day requests pinned to native day datasets.
# ThetaData stores minute and day data under separate canonical keys
# (asset, quote, "minute") vs (asset, quote, "day"). Allowing minute
# data to satisfy day requests would silently bypass ThetaData's
# split-spike repair / split-adjustment normalisation and could trigger
# expensive re-fetch churn in daily-cadence backtests.
#
# allow_day_resampling=True (Polygon, base PandasData — the default):
# Polygon's _update_pandas_data always tries to obtain the finest
# granularity available and relies on Data.get_bars() to resample
# minute → day on demand. If only minute data is cached for a stock,
# the day request must be allowed to reach Data.get_bars() so the
# resampling path fires. The same applies to user-provided minute
# CSV data in the plain PandasData source.
if not self.allow_day_resampling:
return data_ts == "day"
return data_ts in {"day", "hour", "minute"}
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@lumibot/data_sources/pandas_data.py` around lines 427 - 447, The
day-resolution lookup logic currently treats allowed resampling as only
minute→day by checking data_ts in {"day", "minute"}, which ignores warmed hourly
caches; update the condition in the pandas_data resolution branch (the block
using self.allow_day_resampling, requested_unit and data_ts) to also accept
"hour" when requested_unit == "day" so hourly cached data can be resampled to
daily, and add a regression test alongside the new minute→day tests that primes
an "hour" cache then requests "day" to assert the hour data is
accepted/resampled.

# Fallback: require exact match for other timesteps.
Expand Down
153 changes: 153 additions & 0 deletions tests/test_pandas_data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from datetime import datetime, timedelta

import pandas as pd
import pytz

from lumibot.data_sources import PandasData
from lumibot.entities import Asset
Expand Down Expand Up @@ -117,3 +118,155 @@ def test_get_price_snapshot_returns_metadata(self):
assert snapshot["last_trade_time"] == idx[-1].to_pydatetime()
assert snapshot["last_bid_time"] == idx[-1].to_pydatetime()
assert snapshot["last_ask_time"] == idx[-1].to_pydatetime()


class TestGetHistoricalPricesMinuteToDayRegression:
"""
Integration-level regression tests for the stock/index guard in _accepts_timestep
(pandas_data.py lines ~416-426) that silently blocks minute-to-day resampling.

The guard reads:
if requested_unit == "day":
if requested_asset_type in {"stock", "index"}:
return data_ts == "day" # minute data REJECTED for stocks

This causes get_historical_prices(asset, timestep="1 day") to return None whenever
only minute-level data is present for a stock/index asset, even though data.py's
get_bars() (lines 1307-1312) supports minute→day resampling.

Observed failure scenario (live backtest):
1. get_historical_prices(asset, 5, timestep="15 minute") – works fine
2. get_historical_prices(asset, 3, timestep="1 day") – returns None (BUG)
"""

@staticmethod
def _make_minute_data(asset: Asset, quote: Asset, n_bars: int = 120) -> Data:
"""
Build n_bars of 1-minute bars starting at NYSE open on 2025-01-02.
Uses UTC so no tz-localize issues arise inside Data.__init__.
"""
idx = pd.date_range("2025-01-02 14:30:00", periods=n_bars, freq="1min", tz="UTC")
price = [100.0 + i * 0.01 for i in range(n_bars)]
df = pd.DataFrame(
{
"open": price,
"high": [p + 0.5 for p in price],
"low": [p - 0.5 for p in price],
"close": price,
"volume": [1000] * n_bars,
},
index=idx,
)
data = Data(asset=asset, df=df, timestep="minute", quote=quote)
# Prime datalines so data.get_bars() is callable if the lookup ever succeeds.
data.repair_times_and_fill(data.df.index)
return data

@staticmethod
def _make_ds(minute_data: Data, asset: Asset, quote: Asset, current_dt) -> PandasData:
"""
Build a PandasData instance using the real constructor with the supplied
minute-level Data object in its store and _datetime set to current_dt.

Uses the real constructor (not PandasData.__new__) so that all instance
attributes — including allow_day_resampling — are properly initialised.
Overrides _datetime after construction to position the backtest clock.
"""
tz = pytz.timezone("America/New_York")
start = datetime(2025, 1, 2, 9, 30, tzinfo=tz)
end = datetime(2025, 1, 3, 16, 0, tzinfo=tz)
ds = PandasData(
datetime_start=start,
datetime_end=end,
pandas_data=[minute_data],
)
ds._datetime = current_dt
return ds

def test_15m_request_finds_minute_data_store(self):
"""
Baseline: find_asset_in_data_store must find the minute store for a
15-minute intraday request. This is the call that works before the day
request fails.
"""
spy = Asset("SPY", asset_type=Asset.AssetType.STOCK)
quote = Asset("USD", asset_type=Asset.AssetType.FOREX)
minute_data = self._make_minute_data(spy, quote)
current_dt = minute_data.df.index[60].to_pydatetime()
ds = self._make_ds(minute_data, spy, quote, current_dt)

found = ds.find_asset_in_data_store(spy, quote=quote, timestep="15 minute")
assert found == (spy, quote), (
"15-minute request must locate the minute data store for SPY."
)

def test_1day_request_returns_none_for_stock_with_only_minute_data(self):
"""
FIX VERIFIED: get_historical_prices with timestep="1 day" no longer returns None
when a STOCK asset has only minute data in the store.

With allow_day_resampling=True (the PandasData default), find_asset_in_data_store
accepts the minute dataset for a day request and Data.get_bars() resamples
minute → day bars, so the result is a non-None Bars object.
"""
spy = Asset("SPY", asset_type=Asset.AssetType.STOCK)
quote = Asset("USD", asset_type=Asset.AssetType.FOREX)
minute_data = self._make_minute_data(spy, quote, n_bars=120)
current_dt = minute_data.df.index[100].to_pydatetime()
ds = self._make_ds(minute_data, spy, quote, current_dt)

result_day = ds.get_historical_prices(spy, length=1, timestep="1 day", quote=quote)

assert result_day is not None, (
"FIX VERIFIED: get_historical_prices must return resampled day bars for a "
"stock asset when only minute data is in the store and allow_day_resampling=True."
)

def test_1day_request_after_15m_request_same_asset(self):
"""
FIX VERIFIED: mirrors the exact live-backtest failure sequence.

Step 1: call get_historical_prices with "15 minute" → succeeds (not None).
Step 2: call get_historical_prices with "1 day" → now also succeeds (not None).

With allow_day_resampling=True, the second call finds the cached minute data and
Data.get_bars() resamples it to daily bars instead of returning None.
"""
spy = Asset("SPY", asset_type=Asset.AssetType.STOCK)
quote = Asset("USD", asset_type=Asset.AssetType.FOREX)
minute_data = self._make_minute_data(spy, quote, n_bars=120)
current_dt = minute_data.df.index[100].to_pydatetime()
ds = self._make_ds(minute_data, spy, quote, current_dt)

# Step 1 – 15-minute bars: find_asset_in_data_store succeeds
ds.get_historical_prices(spy, length=5, timestep="15 minute", quote=quote)
assert (spy, quote) in ds._find_asset_in_data_store_cache.values(), (
"After the 15-minute call the lookup cache must contain the (spy, quote) key."
)

# Step 2 – 1-day bars: now returns resampled bars instead of None (FIX)
result_day = ds.get_historical_prices(spy, length=1, timestep="1 day", quote=quote)
assert result_day is not None, (
"FIX VERIFIED: after a successful 15-minute call, a subsequent 1-day call on "
"the same stock must return resampled day bars (allow_day_resampling=True)."
)

def test_crypto_1day_request_with_only_minute_data_not_blocked(self):
"""
Symmetry test: crypto assets behave the same as stocks with allow_day_resampling=True —
day requests on a crypto asset with only minute data are satisfied via resampling.

Both stock and crypto assets use the same allow_day_resampling flag path, so this
test verifies the uniform behavior post-fix.
"""
btc = Asset("BTC", asset_type=Asset.AssetType.CRYPTO)
usd = Asset("USD", asset_type=Asset.AssetType.FOREX)
minute_data = self._make_minute_data(btc, usd, n_bars=120)
current_dt = minute_data.df.index[100].to_pydatetime()
ds = self._make_ds(minute_data, btc, usd, current_dt)

# For crypto, find_asset_in_data_store must find the minute data for day requests.
found = ds.find_asset_in_data_store((btc, usd), quote=None, timestep="day")
assert found == (btc, usd), (
"Crypto with minute data must satisfy day requests (allow_day_resampling=True)."
)
Loading
Loading