Lumiwealth · davidlatte · Mar 13, 2026 · Mar 13, 2026 · Mar 13, 2026 · coderabbitai
@@ -37,6 +37,13 @@ def __init__(
             allow_option_quote_fallback=True, **kwargs
         )
 
+        # Polygon's philosophy: always cache the finest granularity available and resample
+        # up on demand.  When a strategy calls get_historical_prices(..., timestep="1 day")
+        # after a prior "15 minute" call has warmed the cache, the cached minute data must
+        # be allowed to satisfy the day request so Data.get_bars() can resample minute → day.
+        # This is the *opposite* of ThetaData, which requires exact timestep matches.
+        self.allow_day_resampling = True
+
         # Memory limit, off by default
         self.MAX_STORAGE_BYTES = max_memory
 

@@ -122,6 +122,14 @@ def __init__(
         super().__init__(datetime_start=datetime_start, datetime_end=datetime_end, pandas_data=pandas_data,
                          allow_option_quote_fallback=False, **kwargs)
 
+        # ThetaData stores minute and day data under SEPARATE canonical keys
+        # (asset, quote, "minute") vs (asset, quote, "day") and applies provider-specific
+        # split-spike repair / split-adjustment only to day bars.  Allowing a cached minute
+        # dataset to silently satisfy a day-bar lookup would bypass that normalisation.
+        # Set allow_day_resampling=False so find_asset_in_data_store always requires an
+        # exact timestep match and forces an explicit day fetch when day bars are needed.
+        self.allow_day_resampling = False
+
         # Default to minute; broker can flip to day for daily strategies.
         self._timestep = self.MIN_TIMESTEP
         # PERF: Avoid scanning the entire pandas_data store on every quote/snapshot call to infer day-mode.

@@ -25,7 +25,8 @@ class PandasData(DataSourceBacktesting):
         {"timestep": "minute", "representations": ["1M", "minute"]},
     ]
 
-    def __init__(self, *args, pandas_data=None, auto_adjust=True, allow_option_quote_fallback: bool = False, **kwargs):
+    def __init__(self, *args, pandas_data=None, auto_adjust=True, allow_option_quote_fallback: bool = False,
+                 allow_day_resampling: bool = True, **kwargs):
         super().__init__(*args, **kwargs)
         self.option_quote_fallback_allowed = allow_option_quote_fallback
         self.name = "pandas"
@@ -35,6 +36,16 @@ def __init__(self, *args, pandas_data=None, auto_adjust=True, allow_option_quote
         self._date_index = None
         self._date_supply = None
         self._timestep = "minute"
+        # Controls whether minute-level cached data may satisfy day-bar lookup requests.
+        #
+        # True  (default) — Polygon and base PandasData: minute data can be resampled to day
+        #                    bars by Data.get_bars(), so find_asset_in_data_store accepts a
+        #                    minute dataset for a day request.
+        # False           — ThetaData: each timestep has its own normalised/adjusted dataset;
+        #                    minute data must never silently proxy for day data.  This prevents
+        #                    bypassing ThetaData's split-spike repair and forces an explicit day
+        #                    fetch when day bars are needed.
+        self.allow_day_resampling: bool = allow_day_resampling
         # PERF: `find_asset_in_data_store()` is called in tight loops (quotes + history). Cache the
         # resolved key for repeated `(asset, quote, timestep)` lookups within a backtest run.
         self._find_asset_in_data_store_cache = {}
@@ -414,14 +425,24 @@ def _accepts_timestep(data_obj) -> bool:
                 # Hour requests can be satisfied by either hour data or minute data (resample).
                 return data_ts in {"hour", "minute"}
             if requested_unit == "day":
-                # IMPORTANT:
-                # Keep explicit day requests pinned to native day datasets.
+                # IMPORTANT — two conflicting philosophies exist across data sources:
+                #
+                # allow_day_resampling=False (ThetaData):
+                #   Keep explicit day requests pinned to native day datasets.
+                #   ThetaData stores minute and day data under separate canonical keys
+                #   (asset, quote, "minute") vs (asset, quote, "day").  Allowing minute
+                #   data to satisfy day requests would silently bypass ThetaData's
+                #   split-spike repair / split-adjustment normalisation and could trigger
+                #   expensive re-fetch churn in daily-cadence backtests.
                 #
-                # Allowing minute datasets to satisfy day requests can silently bypass provider-
-                # specific day-bar normalization (for example split-spike repair/timestamp
-                # alignment in IBKR helpers), and can trigger expensive minute fetch churn in
-                # daily-cadence backtests.
-                if requested_asset_type in {"stock", "index"}:
+                # allow_day_resampling=True (Polygon, base PandasData — the default):
+                #   Polygon's _update_pandas_data always tries to obtain the finest
+                #   granularity available and relies on Data.get_bars() to resample
+                #   minute → day on demand.  If only minute data is cached for a stock,
+                #   the day request must be allowed to reach Data.get_bars() so the
+                #   resampling path fires.  The same applies to user-provided minute
+                #   CSV data in the plain PandasData source.
+                if not self.allow_day_resampling:
                     return data_ts == "day"
                 return data_ts in {"day", "minute"}
-            if requested_unit == "day":
-                # IMPORTANT:
-                # Keep explicit day requests pinned to native day datasets.
-                # IMPORTANT — two conflicting philosophies exist across data sources:
-                #
-                # allow_day_resampling=False (ThetaData):
-                #   Keep explicit day requests pinned to native day datasets.
-                #   ThetaData stores minute and day data under separate canonical keys
-                #   (asset, quote, "minute") vs (asset, quote, "day").  Allowing minute
-                #   data to satisfy day requests would silently bypass ThetaData's
-                #   split-spike repair / split-adjustment normalisation and could trigger
-                #   expensive re-fetch churn in daily-cadence backtests.
-                #
-                # Allowing minute datasets to satisfy day requests can silently bypass provider-
-                # specific day-bar normalization (for example split-spike repair/timestamp
-                # alignment in IBKR helpers), and can trigger expensive minute fetch churn in
-                # daily-cadence backtests.
-                if requested_asset_type in {"stock", "index"}:
-                # allow_day_resampling=True (Polygon, base PandasData — the default):
-                #   Polygon's _update_pandas_data always tries to obtain the finest
-                #   granularity available and relies on Data.get_bars() to resample
-                #   minute → day on demand.  If only minute data is cached for a stock,
-                #   the day request must be allowed to reach Data.get_bars() so the
-                #   resampling path fires.  The same applies to user-provided minute
-                #   CSV data in the plain PandasData source.
-                if not self.allow_day_resampling:
-                    return data_ts == "day"
-                return data_ts in {"day", "minute"}
+            if requested_unit == "day":
+                # IMPORTANT — two conflicting philosophies exist across data sources:
+                #
+                # allow_day_resampling=False (ThetaData):
+                #   Keep explicit day requests pinned to native day datasets.
+                #   ThetaData stores minute and day data under separate canonical keys
+                #   (asset, quote, "minute") vs (asset, quote, "day").  Allowing minute
+                #   data to satisfy day requests would silently bypass ThetaData's
+                #   split-spike repair / split-adjustment normalisation and could trigger
+                #   expensive re-fetch churn in daily-cadence backtests.
+                #
+                # allow_day_resampling=True (Polygon, base PandasData — the default):
+                #   Polygon's _update_pandas_data always tries to obtain the finest
+                #   granularity available and relies on Data.get_bars() to resample
+                #   minute → day on demand.  If only minute data is cached for a stock,
+                #   the day request must be allowed to reach Data.get_bars() so the
+                #   resampling path fires.  The same applies to user-provided minute
+                #   CSV data in the plain PandasData source.
+                if not self.allow_day_resampling:
+                    return data_ts == "day"
+                return data_ts in {"day", "hour", "minute"}
-            if requested_unit == "day":
-                # IMPORTANT:
-                # Keep explicit day requests pinned to native day datasets.
-                # IMPORTANT — two conflicting philosophies exist across data sources:
-                #
-                # allow_day_resampling=False (ThetaData):
-                #   Keep explicit day requests pinned to native day datasets.
-                #   ThetaData stores minute and day data under separate canonical keys
-                #   (asset, quote, "minute") vs (asset, quote, "day").  Allowing minute
-                #   data to satisfy day requests would silently bypass ThetaData's
-                #   split-spike repair / split-adjustment normalisation and could trigger
-                #   expensive re-fetch churn in daily-cadence backtests.
-                #
-                # Allowing minute datasets to satisfy day requests can silently bypass provider-
-                # specific day-bar normalization (for example split-spike repair/timestamp
-                # alignment in IBKR helpers), and can trigger expensive minute fetch churn in
-                # daily-cadence backtests.
-                if requested_asset_type in {"stock", "index"}:
-                # allow_day_resampling=True (Polygon, base PandasData — the default):
-                #   Polygon's _update_pandas_data always tries to obtain the finest
-                #   granularity available and relies on Data.get_bars() to resample
-                #   minute → day on demand.  If only minute data is cached for a stock,
-                #   the day request must be allowed to reach Data.get_bars() so the
-                #   resampling path fires.  The same applies to user-provided minute
-                #   CSV data in the plain PandasData source.
-                if not self.allow_day_resampling:
-                    return data_ts == "day"
-                return data_ts in {"day", "minute"}
+            if requested_unit == "day":
+                # IMPORTANT — two conflicting philosophies exist across data sources:
+                #
+                # allow_day_resampling=False (ThetaData):
+                #   Keep explicit day requests pinned to native day datasets.
+                #   ThetaData stores minute and day data under separate canonical keys
+                #   (asset, quote, "minute") vs (asset, quote, "day").  Allowing minute
+                #   data to satisfy day requests would silently bypass ThetaData's
+                #   split-spike repair / split-adjustment normalisation and could trigger
+                #   expensive re-fetch churn in daily-cadence backtests.
+                #
+                # allow_day_resampling=True (Polygon, base PandasData — the default):
+                #   Polygon's _update_pandas_data always tries to obtain the finest
+                #   granularity available and relies on Data.get_bars() to resample
+                #   minute → day on demand.  If only minute data is cached for a stock,
+                #   the day request must be allowed to reach Data.get_bars() so the
+                #   resampling path fires.  The same applies to user-provided minute
+                #   CSV data in the plain PandasData source.
+                if not self.allow_day_resampling:
+                    return data_ts == "day"
+                return data_ts in {"day", "hour", "minute"}
             # Fallback: require exact match for other timesteps.

@@ -1,6 +1,7 @@
 from datetime import datetime, timedelta
 
 import pandas as pd
+import pytz
 
 from lumibot.data_sources import PandasData
 from lumibot.entities import Asset
@@ -117,3 +118,155 @@ def test_get_price_snapshot_returns_metadata(self):
         assert snapshot["last_trade_time"] == idx[-1].to_pydatetime()
         assert snapshot["last_bid_time"] == idx[-1].to_pydatetime()
         assert snapshot["last_ask_time"] == idx[-1].to_pydatetime()
+
+
+class TestGetHistoricalPricesMinuteToDayRegression:
+    """
+    Integration-level regression tests for the stock/index guard in _accepts_timestep
+    (pandas_data.py lines ~416-426) that silently blocks minute-to-day resampling.
+
+    The guard reads:
+        if requested_unit == "day":
+            if requested_asset_type in {"stock", "index"}:
+                return data_ts == "day"   # minute data REJECTED for stocks
+
+    This causes get_historical_prices(asset, timestep="1 day") to return None whenever
+    only minute-level data is present for a stock/index asset, even though data.py's
+    get_bars() (lines 1307-1312) supports minute→day resampling.
+
+    Observed failure scenario (live backtest):
+      1. get_historical_prices(asset, 5, timestep="15 minute") – works fine
+      2. get_historical_prices(asset, 3, timestep="1 day")     – returns None (BUG)
+    """
+
+    @staticmethod
+    def _make_minute_data(asset: Asset, quote: Asset, n_bars: int = 120) -> Data:
+        """
+        Build n_bars of 1-minute bars starting at NYSE open on 2025-01-02.
+        Uses UTC so no tz-localize issues arise inside Data.__init__.
+        """
+        idx = pd.date_range("2025-01-02 14:30:00", periods=n_bars, freq="1min", tz="UTC")
+        price = [100.0 + i * 0.01 for i in range(n_bars)]
+        df = pd.DataFrame(
+            {
+                "open": price,
+                "high": [p + 0.5 for p in price],
+                "low": [p - 0.5 for p in price],
+                "close": price,
+                "volume": [1000] * n_bars,
+            },
+            index=idx,
+        )
+        data = Data(asset=asset, df=df, timestep="minute", quote=quote)
+        # Prime datalines so data.get_bars() is callable if the lookup ever succeeds.
+        data.repair_times_and_fill(data.df.index)
+        return data
+
+    @staticmethod
+    def _make_ds(minute_data: Data, asset: Asset, quote: Asset, current_dt) -> PandasData:
+        """
+        Build a PandasData instance using the real constructor with the supplied
+        minute-level Data object in its store and _datetime set to current_dt.
+
+        Uses the real constructor (not PandasData.__new__) so that all instance
+        attributes — including allow_day_resampling — are properly initialised.
+        Overrides _datetime after construction to position the backtest clock.
+        """
+        tz = pytz.timezone("America/New_York")
+        start = datetime(2025, 1, 2, 9, 30, tzinfo=tz)
+        end = datetime(2025, 1, 3, 16, 0, tzinfo=tz)
+        ds = PandasData(
+            datetime_start=start,
+            datetime_end=end,
+            pandas_data=[minute_data],
+        )
+        ds._datetime = current_dt
+        return ds
+
+    def test_15m_request_finds_minute_data_store(self):
+        """
+        Baseline: find_asset_in_data_store must find the minute store for a
+        15-minute intraday request.  This is the call that works before the day
+        request fails.
+        """
+        spy = Asset("SPY", asset_type=Asset.AssetType.STOCK)
+        quote = Asset("USD", asset_type=Asset.AssetType.FOREX)
+        minute_data = self._make_minute_data(spy, quote)
+        current_dt = minute_data.df.index[60].to_pydatetime()
+        ds = self._make_ds(minute_data, spy, quote, current_dt)
+
+        found = ds.find_asset_in_data_store(spy, quote=quote, timestep="15 minute")
+        assert found == (spy, quote), (
+            "15-minute request must locate the minute data store for SPY."
+        )
+
+    def test_1day_request_returns_none_for_stock_with_only_minute_data(self):
+        """
+        FIX VERIFIED: get_historical_prices with timestep="1 day" no longer returns None
+        when a STOCK asset has only minute data in the store.
+
+        With allow_day_resampling=True (the PandasData default), find_asset_in_data_store
+        accepts the minute dataset for a day request and Data.get_bars() resamples
+        minute → day bars, so the result is a non-None Bars object.
+        """
+        spy = Asset("SPY", asset_type=Asset.AssetType.STOCK)
+        quote = Asset("USD", asset_type=Asset.AssetType.FOREX)
+        minute_data = self._make_minute_data(spy, quote, n_bars=120)
+        current_dt = minute_data.df.index[100].to_pydatetime()
+        ds = self._make_ds(minute_data, spy, quote, current_dt)
+
+        result_day = ds.get_historical_prices(spy, length=1, timestep="1 day", quote=quote)
+
+        assert result_day is not None, (
+            "FIX VERIFIED: get_historical_prices must return resampled day bars for a "
+            "stock asset when only minute data is in the store and allow_day_resampling=True."
+        )
+
+    def test_1day_request_after_15m_request_same_asset(self):
+        """
+        FIX VERIFIED: mirrors the exact live-backtest failure sequence.
+
+        Step 1: call get_historical_prices with "15 minute" → succeeds (not None).
+        Step 2: call get_historical_prices with "1 day"     → now also succeeds (not None).
+
+        With allow_day_resampling=True, the second call finds the cached minute data and
+        Data.get_bars() resamples it to daily bars instead of returning None.
+        """
+        spy = Asset("SPY", asset_type=Asset.AssetType.STOCK)
+        quote = Asset("USD", asset_type=Asset.AssetType.FOREX)
+        minute_data = self._make_minute_data(spy, quote, n_bars=120)
+        current_dt = minute_data.df.index[100].to_pydatetime()
+        ds = self._make_ds(minute_data, spy, quote, current_dt)
+
+        # Step 1 – 15-minute bars: find_asset_in_data_store succeeds
+        ds.get_historical_prices(spy, length=5, timestep="15 minute", quote=quote)
+        assert (spy, quote) in ds._find_asset_in_data_store_cache.values(), (
+            "After the 15-minute call the lookup cache must contain the (spy, quote) key."
+        )
+
+        # Step 2 – 1-day bars: now returns resampled bars instead of None (FIX)
+        result_day = ds.get_historical_prices(spy, length=1, timestep="1 day", quote=quote)
+        assert result_day is not None, (
+            "FIX VERIFIED: after a successful 15-minute call, a subsequent 1-day call on "
+            "the same stock must return resampled day bars (allow_day_resampling=True)."
+        )
+
+    def test_crypto_1day_request_with_only_minute_data_not_blocked(self):
+        """
+        Symmetry test: crypto assets behave the same as stocks with allow_day_resampling=True —
+        day requests on a crypto asset with only minute data are satisfied via resampling.
+
+        Both stock and crypto assets use the same allow_day_resampling flag path, so this
+        test verifies the uniform behavior post-fix.
+        """
+        btc = Asset("BTC", asset_type=Asset.AssetType.CRYPTO)
+        usd = Asset("USD", asset_type=Asset.AssetType.FOREX)
+        minute_data = self._make_minute_data(btc, usd, n_bars=120)
+        current_dt = minute_data.df.index[100].to_pydatetime()
+        ds = self._make_ds(minute_data, btc, usd, current_dt)
+
+        # For crypto, find_asset_in_data_store must find the minute data for day requests.
+        found = ds.find_asset_in_data_store((btc, usd), quote=None, timestep="day")
+        assert found == (btc, usd), (
+            "Crypto with minute data must satisfy day requests (allow_day_resampling=True)."
+        )