Skip to content

Commit b205de8

Browse files
authored
Merge pull request #17 from AKKI0511/codex/refactor-fetch_data-for-parallel-download
Enable parallel data loading
2 parents 2173546 + 7a31e4a commit b205de8

File tree

4 files changed

+114
-36
lines changed

4 files changed

+114
-36
lines changed

config/model_config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ data:
1010
cache_expiration_days: 7 # Refresh cache after this many days
1111
use_cache: true
1212
refresh: false
13+
max_workers: 1
1314
test_start: '2025-01-01'
1415
test_end: '2025-01-31'
1516

src/data/loader.py

Lines changed: 50 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from datetime import datetime, timedelta
66
import yaml
77
from pydantic import ValidationError
8+
from concurrent.futures import ThreadPoolExecutor, as_completed
89

910
from utils.config_schemas import ModelConfigSchema
1011
import os
@@ -36,6 +37,7 @@ def __init__(self, config_path: str = "config/model_config.yaml"):
3637
self.cache_expiration_days = data_cfg.cache_expiration_days
3738
self.use_cache = data_cfg.use_cache
3839
self.default_refresh = data_cfg.refresh
40+
self.max_workers = data_cfg.max_workers or 1
3941

4042
def _is_cache_valid(self, cache_file: str) -> bool:
4143
"""Return True if the cache file exists and is not expired."""
@@ -46,6 +48,37 @@ def _is_cache_valid(self, cache_file: str) -> bool:
4648
file_time = datetime.fromtimestamp(os.path.getmtime(cache_file))
4749
return datetime.now() - file_time < timedelta(days=self.cache_expiration_days)
4850

51+
def _fetch_single(self, symbol: str, refresh: bool) -> Optional[pd.DataFrame]:
    """Fetch data for a single symbol and handle caching.

    Loads from the parquet cache when allowed and fresh; otherwise pulls
    history from yfinance and (optionally) writes it back to the cache.
    Returns None when no data is available or any step fails.
    """
    cache_file = os.path.join(self.cache_dir, f"{symbol}_data.parquet")
    try:
        # Decide once whether the cached copy may be served.
        serve_from_cache = (
            self.use_cache and not refresh and self._is_cache_valid(cache_file)
        )
        if serve_from_cache:
            logger.info(f"Loading cached data for {symbol} from {cache_file}")
            frame = pd.read_parquet(cache_file)
        else:
            logger.info(f"Fetching data for {symbol}")
            frame = yf.Ticker(symbol).history(
                start=self.start_date, end=self.end_date
            )

            if frame.empty:
                logger.error(f"No data found for {symbol}")
                return None

            # Persist the freshly downloaded frame only; a cache hit must
            # not rewrite the file (that would reset its expiration clock).
            if self.use_cache:
                os.makedirs(self.cache_dir, exist_ok=True)
                frame.to_parquet(cache_file)
                logger.info(f"Cached data for {symbol} at {cache_file}")

        # Completeness check applies to cached and fresh data alike.
        missing_dates = self._check_missing_dates(frame)
        if missing_dates:
            logger.warning(f"Missing dates for {symbol}: {len(missing_dates)} days")

        logger.info(f"Successfully retrieved {len(frame)} records for {symbol}")
        return frame
    except Exception as e:
        # Best-effort per-symbol fetch: a failure here must not abort the
        # batch, so log and signal "no data" to the caller.
        logger.error(f"Error fetching data for {symbol}: {str(e)}")
        return None
81+
4982
def fetch_data(
    self, symbols: Optional[List[str]] = None, refresh: Optional[bool] = None
) -> Dict[str, pd.DataFrame]:
    """Fetch price history for *symbols*, in parallel when configured.

    Args:
        symbols: Tickers to load; defaults to the configured symbol list.
        refresh: Force a re-download, bypassing the cache; defaults to the
            configured ``refresh`` flag when None.

    Returns:
        Mapping of symbol -> DataFrame, in the order the symbols were
        requested. Symbols that yielded no data are omitted.
    """
    symbols = symbols or self.symbols
    refresh = self.default_refresh if refresh is None else refresh

    # Collect per-symbol results here first; the final dict is built in
    # request order below so parallel and sequential runs agree.
    results: Dict[str, pd.DataFrame] = {}

    if self.max_workers and self.max_workers > 1:
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = {
                executor.submit(self._fetch_single, s, refresh): s for s in symbols
            }
            for future in as_completed(futures):
                symbol = futures[future]
                # _fetch_single swallows its own exceptions and returns
                # None on failure, so result() does not raise here.
                df = future.result()
                if df is not None:
                    results[symbol] = df
    else:
        for symbol in symbols:
            df = self._fetch_single(symbol, refresh)
            if df is not None:
                results[symbol] = df

    # Fix: as_completed yields in completion order, which made the dict's
    # iteration order nondeterministic in parallel mode. Re-emit results
    # in the caller's symbol order for stable, mode-independent output.
    data_dict: Dict[str, pd.DataFrame] = {
        s: results[s] for s in symbols if s in results
    }

    return data_dict
102116

src/utils/config_schemas.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ class DataSection(BaseModel):
1313
refresh: Optional[bool] = False
1414
test_start: Optional[str] = None
1515
test_end: Optional[str] = None
16+
max_workers: Optional[int] = 1
1617

1718

1819
class ModelConfigSchema(BaseModel):

tests/data/test_loader_extra.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import unittest
2+
from unittest.mock import patch
23
import pandas as pd
34
import os
45
import yaml
@@ -80,5 +81,66 @@ def test_is_cache_valid_expired(self):
8081
self.assertFalse(self.loader._is_cache_valid(self.cache_file))
8182

8283

84+
class TestFetchDataParallel(unittest.TestCase):
    """Verify fetch_data uses a thread pool when max_workers > 1."""

    def setUp(self):
        # Isolated temp workspace: a cache dir plus a config file that
        # enables parallel loading (max_workers: 2) with caching disabled.
        self.tmpdir = tempfile.mkdtemp()
        self.cache_dir = os.path.join(self.tmpdir, "cache")
        os.makedirs(self.cache_dir, exist_ok=True)
        self.config_path = os.path.join(self.tmpdir, "config.yaml")
        config = {
            "data": {
                "symbols": ["AAA", "BBB"],
                "start_date": "2020-01-01",
                "end_date": "2020-01-02",
                "cache_path": self.cache_dir,
                "use_cache": False,
                "max_workers": 2,
            }
        }
        with open(self.config_path, "w") as f:
            yaml.dump(config, f)

    def tearDown(self):
        shutil.rmtree(self.tmpdir)

    # as_completed is patched to the identity function: iterating the
    # futures dict yields the (already-resolved) dummy futures directly.
    @patch("data.loader.as_completed", side_effect=lambda fs: fs)
    @patch("data.loader.ThreadPoolExecutor")
    @patch("yfinance.Ticker")
    def test_parallel_execution(self, mock_ticker, mock_executor, _mock_ac):
        class DummyFuture:
            # Minimal Future stand-in: holds a precomputed result.
            def __init__(self, result):
                self._result = result

            def result(self):
                return self._result

        class DummyExecutor:
            # Synchronous executor: submit() runs the callable immediately
            # and wraps the return value in a resolved DummyFuture.
            def __init__(self, max_workers=None):
                self.max_workers = max_workers

            def __enter__(self):
                return self

            def __exit__(self, exc_type, exc, tb):
                pass

            def submit(self, fn, *args):
                return DummyFuture(fn(*args))

        mock_executor.return_value = DummyExecutor(max_workers=2)
        # One-row OHLCV frame returned for every ticker history call.
        mock_history = pd.DataFrame(
            {"Open": [1], "High": [1], "Low": [1], "Close": [1], "Volume": [1]},
            index=pd.date_range("2020-01-01", periods=1),
        )
        mock_ticker.return_value.history.return_value = mock_history

        loader = DataLoader(self.config_path)
        data = loader.fetch_data()

        # The configured worker count must reach the executor, and both
        # symbols must come back with data.
        mock_executor.assert_called_once_with(max_workers=2)
        self.assertIn("AAA", data)
        self.assertIn("BBB", data)
143+
144+
83145
if __name__ == "__main__":
84146
unittest.main()

Comments (0)