diff --git a/.gitignore b/.gitignore index d98fba6..44d8028 100644 --- a/.gitignore +++ b/.gitignore @@ -72,3 +72,4 @@ TEST/ # predictions historical_predictions/ forecasts/ +dynamic_eval_data/ diff --git a/app/app/evaluation/page.tsx b/app/app/evaluation/page.tsx index ad180ec..a5d8c88 100644 --- a/app/app/evaluation/page.tsx +++ b/app/app/evaluation/page.tsx @@ -6,6 +6,8 @@ import { TrendingUp, AlertCircle, Calendar, Activity } from "lucide-react"; import { LineChart, Line, + BarChart, + Bar, XAxis, YAxis, CartesianGrid, @@ -56,6 +58,19 @@ interface ChartDataPoint { MAE: number; } +interface MonthlyMetrics { + overall_rmse: number; + overall_mae: number; + samples: number; + by_horizon: Record; +} + +interface MonthlyAnalysis { + evaluation_period: { start: string; end: string }; + monthly_metrics: Record; + computed_at: string; +} + // ============================================================================ // Components // ============================================================================ @@ -202,10 +217,13 @@ const WarningMessage = ({ message }: { message: string }) => ( export default function EvaluationPage() { const [staticEval, setStaticEval] = useState(null); const [dynamicEval, setDynamicEval] = useState(null); + const [monthlyAnalysis, setMonthlyAnalysis] = useState(null); const [loadingStatic, setLoadingStatic] = useState(true); const [loadingDynamic, setLoadingDynamic] = useState(true); + const [loadingMonthly, setLoadingMonthly] = useState(true); const [errorStatic, setErrorStatic] = useState(null); const [errorDynamic, setErrorDynamic] = useState(null); + const [errorMonthly, setErrorMonthly] = useState(null); const hasFetched = useRef(false); useEffect(() => { @@ -240,8 +258,23 @@ export default function EvaluationPage() { } }; + const fetchMonthlyAnalysis = async () => { + try { + const response = await fetch(`${API_BASE_URL}/evaluation/static/monthly`); + if (!response.ok) throw new Error("Failed to fetch monthly analysis"); + const data = await response.json(); + setMonthlyAnalysis(data); + setErrorMonthly(null); + } catch (error) { + setErrorMonthly(error instanceof Error ? error.message : "Unknown error"); + } finally { + setLoadingMonthly(false); + } + }; + fetchStaticEvaluation(); fetchDynamicEvaluation(); + fetchMonthlyAnalysis(); }, []); const formatDate = (dateStr: string) => { @@ -337,11 +370,199 @@ export default function EvaluationPage() { data={prepareChartData(staticEval.metrics.by_horizon)} title="Error vs Forecast Horizon" /> + + {/* Monthly Error Analysis */} + {renderMonthlyAnalysis()} ); }; + const renderMonthlyAnalysis = () => { + if (loadingMonthly) { + return ( +
+
+
+ {[1, 2, 3].map((i) => ( +
+ ))} +
+
+ ); + } + + if (errorMonthly) { + return ( +
+ +
+      );
+    }
+
+    if (!monthlyAnalysis) return null;
+
+    // Sort months by overall RMSE to identify best and worst
+    const sortedMonths = Object.entries(monthlyAnalysis.monthly_metrics)
+      .map(([month, metrics]) => ({
+        month,
+        rmse: metrics.overall_rmse,
+        mae: metrics.overall_mae,
+        samples: metrics.samples,
+      }))
+      .sort((a, b) => b.rmse - a.rmse); // Highest error first
+
+    const worstMonth = sortedMonths[0];
+    const bestMonth = sortedMonths[sortedMonths.length - 1];
+
+    // Format month for display
+    const formatMonth = (monthStr: string) => {
+      // monthStr is in format "YYYY-MM"
+      // Parse directly to avoid timezone issues
+      const [year, month] = monthStr.split("-");
+      const date = new Date(parseInt(year), parseInt(month) - 1, 1);
+      return date.toLocaleDateString("en-US", { year: "numeric", month: "long" });
+    };
+
+    // Prepare chart data (sorted chronologically by month key)
+    const chartData = Object.entries(monthlyAnalysis.monthly_metrics)
+      .sort(([a], [b]) => a.localeCompare(b)) // Sort by YYYY-MM string
+      .map(([month, metrics]) => ({
+        month: formatMonth(month).split(" ")[0], // Short month name
+        fullMonth: formatMonth(month),
+        RMSE: parseFloat(metrics.overall_rmse.toFixed(2)),
+        MAE: parseFloat(metrics.overall_mae.toFixed(2)),
+      }));
+
+    // Calculate improvement percentage
+    const improvementPercent = (
+      ((worstMonth.rmse - bestMonth.rmse) / worstMonth.rmse) *
+      100
+    ).toFixed(1);
+
+    // Determine season text based on actual months
+    const getSeasonText = () => {
+      const months = Object.keys(monthlyAnalysis.monthly_metrics).sort();
+      const firstMonth = parseInt(months[0].split("-")[1]); // Get month number
+
+      if (firstMonth <= 2) return "Winter → Summer";
+      if (firstMonth <= 5) return "Spring → Summer";
+      return "Seasonal Variation";
+    };
+
+    return (
+
+

+ + Monthly Error Analysis +

+ + {/* Best and Worst Months Highlight */} +
+
+
+ Highest Error +
+
+ {formatMonth(worstMonth.month)} +
+
+ {worstMonth.rmse.toFixed(2)}°C RMSE +
+
+ +
+
+ Lowest Error +
+
+ {formatMonth(bestMonth.month)} +
+
+ {bestMonth.rmse.toFixed(2)}°C RMSE +
+
+ +
+
+ Trend +
+
+ ↓ {improvementPercent}% +
+
{getSeasonText()}
+
+
+ + {/* Simplified Line Chart */} + + + + + Math.ceil(dataMax * 1.15)]} + /> + [`${value.toFixed(2)}°C`, ""]} + labelFormatter={(label) => { + const data = chartData.find((d) => d.month === label); + return data?.fullMonth || label; + }} + /> + + + + + + +
+            Error metrics decrease from late winter through summer, reflecting more stable
+            atmospheric patterns and improved predictability in warmer months
+
+ ); + }; + const renderDynamicSection = () => { if (loadingDynamic) { return ( diff --git a/backend/app/main.py b/backend/app/main.py index ad14434..6d919e5 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -569,6 +569,64 @@ async def get_forecast_logs(limit: int = 100) -> dict[str, Any]: ) from e +@app.get("/evaluation/static/monthly") +async def get_static_monthly_analysis() -> dict[str, Any]: + """Get monthly error analysis for static evaluation period. + + Provides breakdown of RMSE/MAE by month to identify seasonal patterns + and months with highest/lowest errors. + + Returns + ------- + dict[str, Any] + Monthly metrics with per-horizon breakdowns + """ + logger = get_logger() + logger.info("=" * 80) + logger.info("GET /evaluation/static/monthly endpoint called") + + if not hasattr(app.state, "eval_storage"): + raise HTTPException( + status_code=503, + detail="Evaluation storage not available. BigQuery may not be configured.", + ) + + try: + # Use same static evaluation period as main endpoint + start_date = datetime(2024, 2, 6, 12, 0, 0) + end_date = datetime(2024, 7, 19, 17, 0, 0) + + logger.info(f"Computing monthly metrics for: {start_date} to {end_date}") + + # Compute monthly metrics using SQL aggregation in BigQuery + monthly_data = await asyncio.to_thread( + app.state.eval_storage.compute_monthly_metrics, start_date, end_date + ) + + logger.info(f"Monthly data computed for {len(monthly_data['by_month'])} months") + + response = { + "evaluation_period": { + "start": start_date.isoformat(), + "end": end_date.isoformat(), + }, + "monthly_metrics": monthly_data["by_month"], + "computed_at": datetime.now().isoformat(), + } + + logger.info("Monthly analysis endpoint completed successfully") + logger.info("=" * 80) + return response + + except Exception as e: + logger.error(f"Failed to compute monthly analysis: {str(e)}") + logger.info("=" * 80) + raise HTTPException( + status_code=500, + detail=f"Failed to compute monthly analysis: {str(e)}", + ) from e + + @app.get("/evaluation/dynamic") async def get_dynamic_evaluation() -> dict[str, Any]: """Get dynamic evaluation metrics (rolling 1-month window). diff --git a/scripts/clean_dynamic_eval.py b/scripts/clean_dynamic_eval.py new file mode 100755 index 0000000..1e6e8c3 --- /dev/null +++ b/scripts/clean_dynamic_eval.py @@ -0,0 +1,344 @@ +#!/usr/bin/env python +"""Clean dynamic evaluation data before repopulating. + +This script deletes predictions and ground truth data from the rolling 30-day +window to prepare for fresh data population. Use this when switching from +biased (24h interval) to unbiased (1h interval) predictions. + +Usage: + # Interactive mode (asks for confirmation) + GCP_PROJECT_ID=your-project python scripts/clean_dynamic_eval.py + + # Force mode (no confirmation) + python scripts/clean_dynamic_eval.py --force + + # Custom day range + python scripts/clean_dynamic_eval.py --days 30 --force +""" + +import argparse +import os +import sys +from datetime import datetime, timedelta + +from google.cloud import bigquery +from rich.console import Console +from rich.panel import Panel +from rich.prompt import Confirm +from rich.table import Table + + +console = Console() + + +def get_data_counts( + client: bigquery.Client, + project_id: str, + dataset_id: str, + start_date: datetime, + end_date: datetime, +) -> tuple[int, int]: + """Get counts of predictions and ground truth to be deleted. 
+ + Parameters + ---------- + client : bigquery.Client + BigQuery client + project_id : str + GCP project ID + dataset_id : str + BigQuery dataset ID + start_date : datetime + Start of deletion range + end_date : datetime + End of deletion range + + Returns + ------- + tuple[int, int] + (prediction_count, ground_truth_count) + """ + # Count predictions + pred_query = f""" + SELECT COUNT(*) as count + FROM `{project_id}.{dataset_id}.predictions` + WHERE run_timestamp >= @start_date + AND run_timestamp <= @end_date + """ + + job_config = bigquery.QueryJobConfig( + query_parameters=[ + bigquery.ScalarQueryParameter("start_date", "TIMESTAMP", start_date), + bigquery.ScalarQueryParameter("end_date", "TIMESTAMP", end_date), + ] + ) + + pred_job = client.query(pred_query, job_config=job_config) + pred_results = list(pred_job.result()) + pred_count = pred_results[0]["count"] if pred_results else 0 + + # Count ground truth (extended by 48h for forecast horizon) + gt_end = end_date + timedelta(hours=48) + gt_query = f""" + SELECT COUNT(*) as count + FROM `{project_id}.{dataset_id}.ground_truth` + WHERE timestamp >= @start_date + AND timestamp <= @end_date + """ + + job_config_gt = bigquery.QueryJobConfig( + query_parameters=[ + bigquery.ScalarQueryParameter("start_date", "TIMESTAMP", start_date), + bigquery.ScalarQueryParameter("end_date", "TIMESTAMP", gt_end), + ] + ) + + gt_job = client.query(gt_query, job_config=job_config_gt) + gt_results = list(gt_job.result()) + gt_count = gt_results[0]["count"] if gt_results else 0 + + return pred_count, gt_count + + +def delete_predictions( + client: bigquery.Client, + project_id: str, + dataset_id: str, + start_date: datetime, + end_date: datetime, +) -> int: + """Delete predictions in the specified date range. + + Parameters + ---------- + client : bigquery.Client + BigQuery client + project_id : str + GCP project ID + dataset_id : str + BigQuery dataset ID + start_date : datetime + Start of deletion range + end_date : datetime + End of deletion range + + Returns + ------- + int + Number of rows deleted + """ + query = f""" + DELETE FROM `{project_id}.{dataset_id}.predictions` + WHERE run_timestamp >= @start_date + AND run_timestamp <= @end_date + """ + + job_config = bigquery.QueryJobConfig( + query_parameters=[ + bigquery.ScalarQueryParameter("start_date", "TIMESTAMP", start_date), + bigquery.ScalarQueryParameter("end_date", "TIMESTAMP", end_date), + ] + ) + + console.print("[cyan]Deleting predictions...[/cyan]") + job = client.query(query, job_config=job_config) + job.result() + + console.print("[green]✓[/green] Deleted predictions") + return job.num_dml_affected_rows or 0 + + +def delete_ground_truth( + client: bigquery.Client, + project_id: str, + dataset_id: str, + start_date: datetime, + end_date: datetime, +) -> int: + """Delete ground truth in the specified date range. + + Extends end_date by 48h to include ground truth for forecast horizons. 
+ + Parameters + ---------- + client : bigquery.Client + BigQuery client + project_id : str + GCP project ID + dataset_id : str + BigQuery dataset ID + start_date : datetime + Start of deletion range + end_date : datetime + End of deletion range + + Returns + ------- + int + Number of rows deleted + """ + # Extend by 48h to cover forecast ground truth + gt_end = end_date + timedelta(hours=48) + + query = f""" + DELETE FROM `{project_id}.{dataset_id}.ground_truth` + WHERE timestamp >= @start_date + AND timestamp <= @end_date + """ + + job_config = bigquery.QueryJobConfig( + query_parameters=[ + bigquery.ScalarQueryParameter("start_date", "TIMESTAMP", start_date), + bigquery.ScalarQueryParameter("end_date", "TIMESTAMP", gt_end), + ] + ) + + console.print("[cyan]Deleting ground truth...[/cyan]") + job = client.query(query, job_config=job_config) + job.result() + + console.print("[green]✓[/green] Deleted ground truth") + return job.num_dml_affected_rows or 0 + + +def main() -> None: + """Clean dynamic evaluation data.""" + parser = argparse.ArgumentParser( + description="Clean dynamic evaluation data before repopulating" + ) + parser.add_argument( + "--days", + type=int, + default=30, + help="Number of days to clean (default: 30)", + ) + parser.add_argument( + "--force", + "-f", + action="store_true", + help="Skip confirmation prompt", + ) + parser.add_argument( + "--predictions-only", + action="store_true", + help="Delete only predictions, keep ground truth", + ) + + args = parser.parse_args() + + # Get project ID + project_id = os.getenv("GCP_PROJECT_ID") + if not project_id: + console.print("[red]✗ GCP_PROJECT_ID environment variable not set[/red]") + console.print("Set it with: export GCP_PROJECT_ID=your-project-id") + sys.exit(1) + + dataset_id = os.getenv("BIGQUERY_DATASET", "gaca_evaluation") + + # Calculate date range + end_date = datetime.utcnow() + start_date = end_date - timedelta(days=args.days) + + console.print() + console.print( + Panel.fit( + "[bold red]Clean Dynamic Evaluation Data[/bold red]\n" + "[dim]This will permanently delete data from BigQuery[/dim]", + border_style="red", + ) + ) + console.print() + + # Display deletion info + info_table = Table(show_header=False, box=None, padding=(0, 2)) + info_table.add_column(style="cyan") + info_table.add_column() + info_table.add_row("Project", project_id) + info_table.add_row("Dataset", dataset_id) + info_table.add_row("Start Date", start_date.strftime("%Y-%m-%d %H:%M UTC")) + info_table.add_row("End Date", end_date.strftime("%Y-%m-%d %H:%M UTC")) + info_table.add_row("Days", str(args.days)) + console.print(info_table) + console.print() + + try: + # Initialize BigQuery client + client = bigquery.Client(project=project_id) + + # Get counts before deletion + console.print("[cyan]Checking existing data...[/cyan]") + pred_count, gt_count = get_data_counts( + client, project_id, dataset_id, start_date, end_date + ) + + if pred_count == 0 and gt_count == 0: + console.print("[yellow]⚠ No data found in the specified range[/yellow]") + console.print( + f"Date range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}" + ) + sys.exit(0) + + # Display what will be deleted + console.print() + delete_table = Table(title="Data to be Deleted", box=None) + delete_table.add_column("Table", style="cyan") + delete_table.add_column("Rows", justify="right", style="yellow") + delete_table.add_row("Predictions", f"{pred_count:,}") + if not args.predictions_only: + delete_table.add_row( + "Ground Truth", f"{gt_count:,} (includes +48h 
forecast data)" + ) + console.print(delete_table) + console.print() + + # Confirmation + if not args.force: + confirmed = Confirm.ask( + "[bold red]⚠ This will permanently delete the data above. Continue?[/bold red]" + ) + if not confirmed: + console.print("[yellow]⊘ Cancelled by user[/yellow]") + sys.exit(0) + + console.print() + console.print("=" * 70) + console.print("[bold]Deleting data...[/bold]") + console.print("=" * 70) + console.print() + + # Delete predictions + deleted_pred = delete_predictions( + client, project_id, dataset_id, start_date, end_date + ) + + # Delete ground truth (unless predictions-only) + deleted_gt = 0 + if not args.predictions_only: + deleted_gt = delete_ground_truth( + client, project_id, dataset_id, start_date, end_date + ) + + # Summary + console.print() + console.print("=" * 70) + console.print("[bold green]✓ Deletion Complete![/bold green]") + console.print(f" Predictions deleted: {deleted_pred:,}") + if not args.predictions_only: + console.print(f" Ground truth deleted: {deleted_gt:,}") + console.print("=" * 70) + console.print() + + console.print("[green]✓ Ready to repopulate with unbiased data![/green]\n") + console.print("[dim]Next steps:[/dim]") + console.print( + f" python scripts/populate_dynamic_evaluation.py --days {args.days} --interval 1 --verbose\n" + ) + + except Exception as e: + console.print(f"\n[red]✗ Error: {e}[/red]") + console.print_exception() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/populate_dynamic_evaluation.py b/scripts/populate_dynamic_evaluation.py index e25f67e..86aff04 100644 --- a/scripts/populate_dynamic_evaluation.py +++ b/scripts/populate_dynamic_evaluation.py @@ -241,8 +241,8 @@ def main() -> None: parser.add_argument( "--interval", type=int, - default=24, - help="Interval between predictions in hours (default: 24)", + default=1, + help="Interval between predictions in hours (default: 1 for unbiased evaluation)", ) parser.add_argument( "--output", @@ -285,6 +285,21 @@ def main() -> None: "(+48h)[/dim]\n" ) + # Warn about temporal bias + if args.interval > 1: + console.print( + f"[bold yellow]⚠ WARNING: Using interval={args.interval}h may introduce temporal bias![/bold yellow]" + ) + console.print( + "[yellow] Each forecast horizon will be evaluated at only specific times of day,[/yellow]" + ) + console.print( + "[yellow] causing diurnal effects to bias error metrics (e.g., 48h < 36h errors).[/yellow]" + ) + console.print( + "[yellow] For unbiased evaluation, use --interval 1 (hourly predictions).[/yellow]\n" + ) + # Step 1: Run batch predictions (unless skipped) if not args.skip_inference: console.print("=" * 70) diff --git a/src/gaca_ews/evaluation/storage.py b/src/gaca_ews/evaluation/storage.py index 69a05b5..5c851bb 100644 --- a/src/gaca_ews/evaluation/storage.py +++ b/src/gaca_ews/evaluation/storage.py @@ -615,6 +615,120 @@ def get_last_forecast_timestamp(self) -> str | None: return results[0]["last_run"].isoformat() + def compute_monthly_metrics( + self, start_date: datetime, end_date: datetime + ) -> dict[str, Any]: + """Compute error metrics grouped by month for error analysis. 
+ + Parameters + ---------- + start_date : datetime + Start of the range (inclusive) + end_date : datetime + End of the range (inclusive) + + Returns + ------- + dict[str, Any] + Monthly metrics with overall and per-horizon breakdowns + """ + query = f""" + WITH matched AS ( + SELECT + p.forecast_time, + p.horizon_hours, + p.predicted_temp, + g.actual_temp + FROM `{self.project_id}.{self.dataset_id}.predictions` p + JOIN `{self.project_id}.{self.dataset_id}.ground_truth` g + ON p.forecast_time = g.timestamp + AND ROUND(p.lat, 2) = ROUND(g.lat, 2) + AND ROUND(p.lon, 2) = ROUND(g.lon, 2) + WHERE p.run_timestamp BETWEEN @start_date AND @end_date + ), + monthly AS ( + SELECT + FORMAT_TIMESTAMP('%Y-%m', forecast_time) AS month, + horizon_hours, + predicted_temp, + actual_temp + FROM matched + WHERE forecast_time >= @start_date + AND forecast_time <= @end_date + ) + SELECT + month, + horizon_hours, + SQRT(AVG(POW(predicted_temp - actual_temp, 2))) AS rmse, + AVG(ABS(predicted_temp - actual_temp)) AS mae, + COUNT(*) AS sample_count + FROM monthly + GROUP BY month, horizon_hours + ORDER BY month, horizon_hours + """ + + job_config = bigquery.QueryJobConfig( + query_parameters=[ + bigquery.ScalarQueryParameter("start_date", "TIMESTAMP", start_date), + bigquery.ScalarQueryParameter("end_date", "TIMESTAMP", end_date), + ] + ) + + console.print( + f"[cyan]Computing monthly metrics for {start_date} to {end_date}...[/cyan]" + ) + + query_job = self.client.query(query, job_config=job_config) + results = query_job.result() + + # Parse results into nested structure + by_month: dict[str, dict[str, Any]] = {} + + for row in results: + month = str(row["month"]) + horizon = int(row["horizon_hours"]) + rmse = float(row["rmse"]) + mae = float(row["mae"]) + count = int(row["sample_count"]) + + if month not in by_month: + by_month[month] = {"by_horizon": {}, "samples": 0} + + by_month[month]["by_horizon"][horizon] = { + "rmse": rmse, + "mae": mae, + "sample_count": count, + } + by_month[month]["samples"] += count + + # Compute overall metrics per month (across all horizons) + for _month, month_data in by_month.items(): + total_samples = 0 + sum_squared_errors = 0.0 + sum_abs_errors = 0.0 + + for horizon_data in month_data["by_horizon"].values(): + count = horizon_data["sample_count"] + rmse = horizon_data["rmse"] + mae = horizon_data["mae"] + + total_samples += count + sum_squared_errors += (rmse**2) * count + sum_abs_errors += mae * count + + if total_samples > 0: + month_data["overall_rmse"] = (sum_squared_errors / total_samples) ** 0.5 + month_data["overall_mae"] = sum_abs_errors / total_samples + else: + month_data["overall_rmse"] = 0.0 + month_data["overall_mae"] = 0.0 + + console.print( + f"[green]✓[/green] Computed monthly metrics for {len(by_month)} months" + ) + + return {"by_month": by_month} + def get_last_forecast_run_info(self) -> dict[str, Any] | None: """Get information about the last forecast run.
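
A note on the aggregation above: compute_monthly_metrics pools the per-horizon rows into a single figure per month by sample-weighting the mean squared errors before taking the root, so the monthly overall_rmse is not a plain average of horizon RMSEs. Below is a minimal standalone sketch of that pooling; the horizon numbers are made up for illustration and are not real evaluation output.

import math

# Hypothetical per-horizon metrics for one month, shaped like the rows the
# BigQuery query returns: {horizon_hours: {"rmse", "mae", "sample_count"}}.
by_horizon = {
    6: {"rmse": 1.1, "mae": 0.8, "sample_count": 1200},
    24: {"rmse": 1.6, "mae": 1.2, "sample_count": 1200},
    48: {"rmse": 2.3, "mae": 1.7, "sample_count": 1150},
}

# Weight each horizon's mean squared error (rmse**2) and mean absolute error
# by its sample count, then normalise; this mirrors the loop at the end of
# compute_monthly_metrics.
total_samples = sum(h["sample_count"] for h in by_horizon.values())
sum_squared_errors = sum(h["rmse"] ** 2 * h["sample_count"] for h in by_horizon.values())
sum_abs_errors = sum(h["mae"] * h["sample_count"] for h in by_horizon.values())

overall_rmse = math.sqrt(sum_squared_errors / total_samples) if total_samples else 0.0
overall_mae = sum_abs_errors / total_samples if total_samples else 0.0

print(f"overall_rmse={overall_rmse:.3f}  overall_mae={overall_mae:.3f}  samples={total_samples}")

The frontend reads overall_rmse and overall_mae per month straight from the /evaluation/static/monthly response, so the chart and the highest/lowest-error cards never re-derive them from by_horizon.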
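
For a quick smoke test of the new endpoint, something like the following works against a locally running backend. The base URL and the use of the requests library are assumptions for the example; the web app itself fetches through its own API_BASE_URL setting.

import requests

resp = requests.get("http://localhost:8000/evaluation/static/monthly", timeout=60)
resp.raise_for_status()
payload = resp.json()

# Shape mirrors the MonthlyAnalysis interface added to page.tsx:
# evaluation_period {start, end}, monthly_metrics keyed by "YYYY-MM", computed_at.
for month, metrics in sorted(payload["monthly_metrics"].items()):
    print(month, round(metrics["overall_rmse"], 2), round(metrics["overall_mae"], 2), metrics["samples"])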
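
On the populate script's default interval dropping from 24 to 1 hour: with one prediction run per day, every forecast horizon is verified at the same clock time, so diurnal temperature swings fold straight into the per-horizon metrics (which is how 48h errors can come out lower than 36h). A small illustration, assuming runs start at 12:00 UTC; the start hour and horizon list are made up for the example.

from datetime import datetime, timedelta

def verification_hours(run_interval_h: int, horizons=(12, 24, 36, 48), days: int = 30):
    """Return the set of UTC hours-of-day at which each horizon gets verified."""
    start = datetime(2024, 1, 1, 12)  # assumed first run time, illustration only
    hours = {h: set() for h in horizons}
    run, end = start, start + timedelta(days=days)
    while run < end:
        for h in horizons:
            hours[h].add((run + timedelta(hours=h)).hour)
        run += timedelta(hours=run_interval_h)
    return hours

print(verification_hours(24))                                 # each horizon hits a single hour of day
print({h: len(v) for h, v in verification_hours(1).items()})  # hourly runs cover all 24 hours

With hourly runs each horizon is averaged over the full diurnal cycle, which is the condition the new warning in populate_dynamic_evaluation.py points users toward.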