Skip to content

Commit a148789

Browse files
Authored — Merge pull request #49 from AKKI0511/analyze-quanttradeai-for-new-feature
feat: support secondary timeframe ingestion
2 parents 271f5ad + c724f9c commit a148789

File tree

7 files changed

+416
-98
lines changed

7 files changed

+416
-98
lines changed

config/model_config.yaml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,11 @@ data:
55
symbols: ['AAPL', 'META', 'TSLA', 'JPM', 'AMZN']
66
start_date: '2015-01-01'
77
end_date: '2024-12-31'
8-
timeframe: '1d'
9-
cache_dir: 'data/raw'
8+
timeframe: '1d'
9+
secondary_timeframes:
10+
- '1h'
11+
- '30m'
12+
cache_dir: 'data/raw'
1013
cache_path: 'data/raw' # Directory to load/store cached OHLCV files
1114
cache_expiration_days: 7 # Refresh cache after this many days
1215
use_cache: true

docs/configuration.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ data:
2424
end_date: '2024-12-31'
2525
cache_dir: 'data/raw'
2626
cache_path: 'data/raw'
27+
secondary_timeframes:
28+
- '1h'
29+
- '30m'
2730
cache_expiration_days: 7
2831
use_cache: true
2932
refresh: false
@@ -37,6 +40,7 @@ data:
3740
- `symbols`: List of stock symbols to process
3841
- `start_date`/`end_date`: Data date range
3942
- `cache_dir`: Directory for cached data
43+
- `secondary_timeframes`: Optional list of higher-frequency bars to resample into the primary `timeframe` using OHLCV aggregations (`open→first`, `high→max`, `low→min`, `close→last`, `volume→sum`)
4044
- `use_cache`: Enable/disable caching
4145
- `refresh`: Force fresh data download
4246
- `max_workers`: Parallel processing workers

quanttradeai/data/loader.py

Lines changed: 133 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ def __init__(
5252
self.symbols = data_cfg.symbols
5353
self.start_date = data_cfg.start_date
5454
self.end_date = data_cfg.end_date
55-
self.timeframe = data_cfg.timeframe or "1d"
55+
self.timeframe = data_cfg.timeframe or "1d"
56+
self.secondary_timeframes = data_cfg.secondary_timeframes or []
5657
# allow both legacy 'cache_dir' and new 'cache_path' keys
5758
self.cache_dir = data_cfg.cache_path or data_cfg.cache_dir or "data/raw"
5859
self.cache_expiration_days = data_cfg.cache_expiration_days
@@ -72,37 +73,66 @@ def _is_cache_valid(self, cache_file: str) -> bool:
7273

7374
def _fetch_single(self, symbol: str, refresh: bool) -> Optional[pd.DataFrame]:
7475
"""Fetch data for a single symbol and handle caching."""
75-
cache_file = os.path.join(
76-
self.cache_dir, f"{symbol}_{self.timeframe}_data.parquet"
77-
)
78-
try:
79-
if self.use_cache and not refresh and self._is_cache_valid(cache_file):
80-
logger.info(f"Loading cached data for {symbol} from {cache_file}")
81-
df = pd.read_parquet(cache_file)
82-
else:
83-
logger.info(f"Fetching data for {symbol}")
84-
df = self.data_source.fetch(
85-
symbol, self.start_date, self.end_date, self.timeframe
86-
)
87-
88-
if df is None or df.empty:
89-
logger.error(f"No data found for {symbol}")
90-
return None
91-
92-
if self.use_cache:
93-
os.makedirs(self.cache_dir, exist_ok=True)
94-
df.to_parquet(cache_file)
95-
logger.info(f"Cached data for {symbol} at {cache_file}")
96-
97-
missing_dates = self._check_missing_dates(df)
98-
if missing_dates:
99-
logger.warning(f"Missing dates for {symbol}: {len(missing_dates)} days")
100-
101-
logger.info(f"Successfully retrieved {len(df)} records for {symbol}")
102-
return df
103-
except Exception as e:
104-
logger.error(f"Error fetching data for {symbol}: {str(e)}")
105-
return None
76+
cache_file = os.path.join(
77+
self.cache_dir, f"{symbol}_{self.timeframe}_data.parquet"
78+
)
79+
try:
80+
df = self._load_timeframe_data(
81+
symbol=symbol,
82+
timeframe=self.timeframe,
83+
cache_file=cache_file,
84+
refresh=refresh,
85+
)
86+
87+
if df is None or df.empty:
88+
logger.error(f"No data found for {symbol}")
89+
return None
90+
91+
df = df.sort_index()
92+
93+
for secondary_tf in self.secondary_timeframes:
94+
secondary_cache = os.path.join(
95+
self.cache_dir, f"{symbol}_{secondary_tf}_data.parquet"
96+
)
97+
secondary_df = self._load_timeframe_data(
98+
symbol=symbol,
99+
timeframe=secondary_tf,
100+
cache_file=secondary_cache,
101+
refresh=refresh,
102+
)
103+
104+
if secondary_df is None or secondary_df.empty:
105+
logger.warning(
106+
"No data found for %s at secondary timeframe %s",
107+
symbol,
108+
secondary_tf,
109+
)
110+
continue
111+
112+
try:
113+
resampled = self._resample_secondary(
114+
secondary_df, df.index, secondary_tf
115+
)
116+
except ValueError as exc:
117+
logger.warning(
118+
"Skipping secondary timeframe %s for %s: %s",
119+
secondary_tf,
120+
symbol,
121+
exc,
122+
)
123+
continue
124+
125+
df = df.join(resampled, how="left")
126+
127+
missing_dates = self._check_missing_dates(df)
128+
if missing_dates:
129+
logger.warning(f"Missing dates for {symbol}: {len(missing_dates)} days")
130+
131+
logger.info(f"Successfully retrieved {len(df)} records for {symbol}")
132+
return df
133+
except Exception as e:
134+
logger.error(f"Error fetching data for {symbol}: {str(e)}")
135+
return None
106136

107137
def fetch_data(
108138
self, symbols: Optional[List[str]] = None, refresh: Optional[bool] = None
@@ -139,11 +169,77 @@ def fetch_data(
139169

140170
return data_dict
141171

142-
def _check_missing_dates(self, df: pd.DataFrame) -> List[datetime]:
143-
"""Check for missing trading days in the data."""
144-
all_dates = pd.date_range(start=df.index.min(), end=df.index.max(), freq="B")
145-
missing_dates = all_dates.difference(df.index)
146-
return list(missing_dates)
172+
def _check_missing_dates(self, df: pd.DataFrame) -> List[datetime]:
173+
"""Check for missing trading days in the data."""
174+
all_dates = pd.date_range(start=df.index.min(), end=df.index.max(), freq="B")
175+
missing_dates = all_dates.difference(df.index)
176+
return list(missing_dates)
177+
178+
def _load_timeframe_data(
179+
self, symbol: str, timeframe: str, cache_file: str, refresh: bool
180+
) -> Optional[pd.DataFrame]:
181+
"""Load data for a given timeframe from cache or datasource."""
182+
183+
if self.use_cache and not refresh and self._is_cache_valid(cache_file):
184+
logger.info(
185+
"Loading cached data for %s (%s) from %s", symbol, timeframe, cache_file
186+
)
187+
return pd.read_parquet(cache_file)
188+
189+
logger.info(f"Fetching data for {symbol} ({timeframe})")
190+
df = self.data_source.fetch(
191+
symbol, self.start_date, self.end_date, timeframe
192+
)
193+
194+
if df is None or df.empty:
195+
return None
196+
197+
if self.use_cache:
198+
os.makedirs(self.cache_dir, exist_ok=True)
199+
df.to_parquet(cache_file)
200+
logger.info(
201+
"Cached data for %s (%s) at %s", symbol, timeframe, cache_file
202+
)
203+
204+
return df
205+
206+
def _resample_secondary(
207+
self,
208+
df: pd.DataFrame,
209+
target_index: pd.Index,
210+
source_timeframe: str,
211+
) -> pd.DataFrame:
212+
"""Resample a secondary timeframe to the loader's primary timeframe."""
213+
214+
if not isinstance(df.index, pd.DatetimeIndex):
215+
df = df.copy()
216+
df.index = pd.to_datetime(df.index)
217+
218+
df = df.sort_index()
219+
220+
required_columns = {
221+
"Open": "first",
222+
"High": "max",
223+
"Low": "min",
224+
"Close": "last",
225+
"Volume": "sum",
226+
}
227+
228+
missing = [col for col in required_columns if col not in df.columns]
229+
if missing:
230+
raise ValueError(
231+
f"Secondary timeframe data missing required columns: {missing}"
232+
)
233+
234+
resampled = df.resample(self.timeframe).agg(required_columns)
235+
236+
renamed = {
237+
col: f"{col.lower()}_{source_timeframe}_{required_columns[col]}"
238+
for col in required_columns
239+
}
240+
resampled = resampled.rename(columns=renamed)
241+
resampled = resampled.reindex(target_index)
242+
return resampled
147243

148244
def validate_data(self, data_dict: Dict[str, pd.DataFrame]) -> bool:
149245
"""

quanttradeai/utils/config_schemas.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,14 @@
1313
from pydantic import BaseModel, Field
1414

1515

16-
class DataSection(BaseModel):
17-
symbols: List[str]
18-
start_date: str
19-
end_date: str
20-
timeframe: Optional[str] = "1d"
21-
cache_path: Optional[str] = None
22-
cache_dir: Optional[str] = None
16+
class DataSection(BaseModel):
17+
symbols: List[str]
18+
start_date: str
19+
end_date: str
20+
timeframe: Optional[str] = "1d"
21+
secondary_timeframes: Optional[List[str]] = None
22+
cache_path: Optional[str] = None
23+
cache_dir: Optional[str] = None
2324
cache_expiration_days: Optional[int] = None
2425
use_cache: Optional[bool] = True
2526
refresh: Optional[bool] = False

tests/data/test_loader.py

Lines changed: 100 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import unittest
2-
from unittest.mock import patch
2+
from unittest.mock import patch, call
33
import pandas as pd
44
import os
55
import yaml
@@ -30,21 +30,23 @@ def setUp(self):
3030
)
3131
self.df.to_parquet(os.path.join(self.cache_dir, "TEST_1d_data.parquet"))
3232

33-
def _write_config(self, expiration):
34-
config = {
35-
"data": {
36-
"symbols": ["TEST"],
37-
"start_date": "2020-01-01",
38-
"end_date": "2020-01-10",
39-
"cache_path": self.cache_dir,
40-
"timeframe": "1d",
41-
"cache_expiration_days": expiration,
42-
"use_cache": True,
43-
"refresh": False,
44-
}
45-
}
46-
with open(self.config_path, "w") as f:
47-
yaml.dump(config, f)
33+
def _write_config(self, expiration, secondary_timeframes=None):
34+
config = {
35+
"data": {
36+
"symbols": ["TEST"],
37+
"start_date": "2020-01-01",
38+
"end_date": "2020-01-10",
39+
"cache_path": self.cache_dir,
40+
"timeframe": "1d",
41+
"cache_expiration_days": expiration,
42+
"use_cache": True,
43+
"refresh": False,
44+
}
45+
}
46+
if secondary_timeframes is not None:
47+
config["data"]["secondary_timeframes"] = secondary_timeframes
48+
with open(self.config_path, "w") as f:
49+
yaml.dump(config, f)
4850

4951
def tearDown(self):
5052
shutil.rmtree(self.tmpdir)
@@ -78,20 +80,88 @@ def test_fetch_data_refreshes_cache(self, mock_fetch):
7880
pd.testing.assert_frame_equal(data_dict["TEST"], mock_history)
7981

8082
@patch("quanttradeai.data.datasource.YFinanceDataSource.fetch")
81-
def test_fetch_data_expired_cache(self, mock_fetch):
82-
self._write_config(expiration=0)
83-
84-
mock_history = pd.DataFrame(
85-
{"Open": [1], "High": [1], "Low": [1], "Close": [1], "Volume": [1]},
86-
index=pd.date_range("2020-01-01", periods=1),
87-
)
88-
mock_fetch.return_value = mock_history
89-
90-
loader = DataLoader(self.config_path)
91-
data_dict = loader.fetch_data()
92-
93-
mock_fetch.assert_called_once()
94-
pd.testing.assert_frame_equal(data_dict["TEST"], mock_history)
83+
def test_fetch_data_expired_cache(self, mock_fetch):
84+
self._write_config(expiration=0)
85+
86+
mock_history = pd.DataFrame(
87+
{"Open": [1], "High": [1], "Low": [1], "Close": [1], "Volume": [1]},
88+
index=pd.date_range("2020-01-01", periods=1),
89+
)
90+
mock_fetch.return_value = mock_history
91+
92+
loader = DataLoader(self.config_path)
93+
data_dict = loader.fetch_data()
94+
95+
mock_fetch.assert_called_once()
96+
pd.testing.assert_frame_equal(data_dict["TEST"], mock_history)
97+
98+
@patch("quanttradeai.data.datasource.YFinanceDataSource.fetch")
99+
def test_fetch_data_with_secondary_timeframes(self, mock_fetch):
100+
self._write_config(expiration=10, secondary_timeframes=["1h"])
101+
102+
primary_index = pd.date_range("2020-01-01", periods=2, freq="D")
103+
primary_df = pd.DataFrame(
104+
{
105+
"Open": [100.0, 110.0],
106+
"High": [101.0, 111.0],
107+
"Low": [99.0, 109.0],
108+
"Close": [100.5, 110.5],
109+
"Volume": [1000, 1100],
110+
},
111+
index=primary_index,
112+
)
113+
114+
hourly_index = pd.date_range("2020-01-01", periods=48, freq="h")
115+
hourly_df = pd.DataFrame(
116+
{
117+
"Open": range(48),
118+
"High": [value + 1 for value in range(48)],
119+
"Low": range(48),
120+
"Close": [value + 0.5 for value in range(48)],
121+
"Volume": [10] * 48,
122+
},
123+
index=hourly_index,
124+
)
125+
126+
mock_fetch.side_effect = [primary_df, hourly_df]
127+
128+
loader = DataLoader(self.config_path)
129+
data_dict = loader.fetch_data(refresh=True)
130+
131+
df = data_dict["TEST"]
132+
133+
expected_secondary = (
134+
hourly_df.resample("1D")
135+
.agg({
136+
"Open": "first",
137+
"High": "max",
138+
"Low": "min",
139+
"Close": "last",
140+
"Volume": "sum",
141+
})
142+
.rename(
143+
columns={
144+
"Open": "open_1h_first",
145+
"High": "high_1h_max",
146+
"Low": "low_1h_min",
147+
"Close": "close_1h_last",
148+
"Volume": "volume_1h_sum",
149+
}
150+
)
151+
.reindex(primary_index)
152+
)
153+
154+
for column in expected_secondary.columns:
155+
assert column in df.columns
156+
pd.testing.assert_series_equal(
157+
df[column], expected_secondary[column], check_names=True
158+
)
159+
160+
expected_calls = [
161+
call("TEST", "2020-01-01", "2020-01-10", "1d"),
162+
call("TEST", "2020-01-01", "2020-01-10", "1h"),
163+
]
164+
assert mock_fetch.call_args_list == expected_calls
95165

96166

97167
if __name__ == "__main__":

0 commit comments

Comments (0)