@@ -75,6 +75,23 @@ def _write_validation_report(report_path: Path, report: dict) -> None:
7575 logger .warning ("Failed to write CSV validation report: %s" , exc )
7676
7777
def _write_coverage_report(report_path: Path, coverage: dict) -> None:
    """Persist test-window coverage results to JSON and CSV formats.

    Args:
        report_path: Destination of the JSON report. Missing parent
            directories are created. The CSV companion is written next to it,
            using the same name with the suffix replaced by ``.csv``.
        coverage: Mapping of symbol to a dict of coverage details for that
            symbol. Each entry becomes one CSV row with a leading ``symbol``
            column.

    Raises:
        OSError: If the JSON report cannot be written.
        TypeError: If ``coverage`` contains values ``json`` cannot serialize.

    The JSON file is the primary artifact, so errors writing it propagate. The
    CSV is a best-effort convenience copy: any failure there is logged as a
    warning and swallowed.
    """
    report_path.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding so the report does not depend on the platform locale.
    with open(report_path, "w", encoding="utf-8") as fh:
        json.dump(coverage, fh, indent=2)

    try:
        rows = [{"symbol": symbol, **details} for symbol, details in coverage.items()]
        # With no rows pandas would emit a header-less file that cannot be
        # read back; keep at least the ``symbol`` header in that case.
        frame = pd.DataFrame(rows) if rows else pd.DataFrame(columns=["symbol"])
        frame.to_csv(report_path.with_suffix(".csv"), index=False)
    except Exception as exc:  # pragma: no cover - defensive
        logger.warning("Failed to write CSV coverage report: %s", exc)
93+
94+
7895def _ensure_datetime_index (df : pd .DataFrame ) -> pd .DataFrame :
7996 """Ensure the DataFrame index is a DatetimeIndex.
8097
@@ -103,7 +120,7 @@ def _ensure_datetime_index(df: pd.DataFrame) -> pd.DataFrame:
103120def time_aware_split (
104121 df_labeled : pd .DataFrame ,
105122 cfg : dict ,
106- ) -> Tuple [pd .DataFrame , pd .DataFrame ]:
123+ ) -> Tuple [pd .DataFrame , pd .DataFrame , dict ]:
107124 """Return chronological train/test splits using config windows.
108125
109126 Rules
@@ -150,6 +167,10 @@ def _window_has_full_coverage(
150167 return True
151168
152169 fallback_used = False
170+ coverage_ok : bool | None = True if test_start else None
171+ data_start = df .index .min ()
172+ data_end = df .index .max ()
173+ split_strategy = "window" if test_start else "fraction"
153174 if test_start :
154175 start_dt = pd .to_datetime (test_start )
155176 end_dt = pd .to_datetime (test_end ) if test_end else None
@@ -184,7 +205,22 @@ def _window_has_full_coverage(
184205 raise ValueError (
185206 f"Invalid train/test window produced empty split when using { window_msg } . Adjust data.test_* or training.test_size."
186207 )
187- return train_df , test_df
208+ coverage : dict = {
209+ "data_start" : data_start .isoformat (),
210+ "data_end" : data_end .isoformat (),
211+ "test_start" : pd .to_datetime (test_start ).isoformat () if test_start else None ,
212+ "test_end" : pd .to_datetime (test_end ).isoformat () if test_end else None ,
213+ "train_start" : train_df .index .min ().isoformat (),
214+ "train_end" : train_df .index .max ().isoformat (),
215+ "test_start_actual" : test_df .index .min ().isoformat (),
216+ "test_end_actual" : test_df .index .max ().isoformat (),
217+ "train_size" : len (train_df ),
218+ "test_size" : len (test_df ),
219+ "coverage_ok" : coverage_ok ,
220+ "fallback_used" : fallback_used ,
221+ "split_strategy" : split_strategy if not fallback_used else "fraction_fallback" ,
222+ }
223+ return train_df , test_df , coverage
188224
189225
190226def _validate_or_raise (
@@ -225,7 +261,10 @@ def _validate_or_raise(
225261
226262
227263def run_pipeline (
228- config_path : str = "config/model_config.yaml" , * , skip_validation : bool = False
264+ config_path : str = "config/model_config.yaml" ,
265+ * ,
266+ skip_validation : bool = False ,
267+ include_coverage : bool = False ,
229268):
230269 """Run the end-to-end training pipeline.
231270
@@ -238,6 +277,12 @@ def run_pipeline(
238277 >>> results = run_pipeline("config/model_config.yaml")
239278 >>> sorted(results.keys()) # doctest: +ELLIPSIS
240279 ...
280+
281+ Set ``include_coverage=True`` to also receive coverage metadata:
282+
283+ >>> results, coverage = run_pipeline("config/model_config.yaml", include_coverage=True)
284+ >>> sorted(coverage.keys())
285+ ['fallback_symbols', 'path']
241286 """
242287
243288 # Load configuration
@@ -274,6 +319,7 @@ def run_pipeline(
274319
275320 # Process each stock
276321 results = {}
322+ coverage_report : dict [str , dict ] = {}
277323 for symbol , df in data_dict .items ():
278324 logger .info (f"\n Processing { symbol } ..." )
279325
@@ -284,7 +330,8 @@ def run_pipeline(
284330 df_labeled = data_processor .generate_labels (df_processed )
285331
286332 # 4. Time-aware Split
287- train_df , test_df = time_aware_split (df_labeled , config )
333+ train_df , test_df , coverage = time_aware_split (df_labeled , config )
334+ coverage_report [symbol ] = coverage
288335 X_train , y_train = model .prepare_data (train_df )
289336 X_test , y_test = model .prepare_data (test_df )
290337 # Log split summary
@@ -327,11 +374,35 @@ def run_pipeline(
327374 logger .info (f"Train Metrics: { train_metrics } " )
328375 logger .info (f"Test Metrics: { test_metrics } " )
329376
377+ coverage_path = Path (experiment_dir ) / "test_window_coverage.json"
378+ _write_coverage_report (coverage_path , coverage_report )
379+ fallback_symbols = [
380+ symbol
381+ for symbol , details in coverage_report .items ()
382+ if details .get ("fallback_used" )
383+ ]
384+ if fallback_symbols :
385+ logger .warning (
386+ "Fallback chronological split applied for symbols: %s. Coverage report: %s" ,
387+ ", " .join (fallback_symbols ),
388+ coverage_path ,
389+ )
390+ logger .info ("Coverage report saved to %s" , coverage_path )
391+
330392 # Save experiment results
331393 with open (f"{ experiment_dir } /results.json" , "w" ) as f :
332394 json .dump (results , f , indent = 4 )
333395
334396 logger .info ("\n Pipeline completed successfully!" )
397+ if include_coverage :
398+ return (
399+ results ,
400+ {
401+ "path" : coverage_path .as_posix (),
402+ "fallback_symbols" : fallback_symbols ,
403+ },
404+ )
405+
335406 return results
336407
337408 except Exception as e :
@@ -539,6 +610,7 @@ def run_model_backtest(
539610 summary : dict = {}
540611 prepared_data : dict [str , pd .DataFrame ] = {}
541612 artifact_dirs : dict [str , Path ] = {}
613+ coverage_report : dict [str , dict ] = {}
542614
543615 trading_cfg = (cfg or {}).get ("trading" , {})
544616 stop_loss = trading_cfg .get ("stop_loss" )
@@ -595,7 +667,8 @@ def _execution_for(symbol: str) -> dict:
595667 try :
596668 df_proc = processor .process_data (df )
597669 df_lbl = processor .generate_labels (df_proc )
598- train_df , test_df = time_aware_split (df_lbl , cfg )
670+ train_df , test_df , coverage = time_aware_split (df_lbl , cfg )
671+ coverage_report [symbol ] = coverage
599672 # Build features from saved order
600673 missing = [
601674 c for c in (clf .feature_columns or []) if c not in test_df .columns
@@ -676,6 +749,25 @@ def _execution_for(symbol: str) -> dict:
676749 "output_dir" : out_dir .as_posix (),
677750 }
678751
752+ coverage_path = base_dir / "test_window_coverage.json"
753+ _write_coverage_report (coverage_path , coverage_report )
754+ fallback_symbols = [
755+ symbol
756+ for symbol , details in coverage_report .items ()
757+ if details .get ("fallback_used" )
758+ ]
759+ if fallback_symbols :
760+ logger .warning (
761+ "Fallback chronological split applied for symbols: %s. Coverage report: %s" ,
762+ ", " .join (fallback_symbols ),
763+ coverage_path ,
764+ )
765+ logger .info ("Coverage report saved to %s" , coverage_path )
766+ summary ["coverage_report" ] = {
767+ "path" : coverage_path .as_posix (),
768+ "fallback_symbols" : fallback_symbols ,
769+ }
770+
679771 return summary
680772
681773
0 commit comments