diff --git a/docs/development/evaluations/history/.nav.yml b/docs/development/evaluations/history/.nav.yml index 8c83753ad..d71105aeb 100644 --- a/docs/development/evaluations/history/.nav.yml +++ b/docs/development/evaluations/history/.nav.yml @@ -1,4 +1,5 @@ +sort: + direction: desc nav: - index.md - - Weekly: weekly/ - - Special: special/ + - "*" diff --git a/docs/development/evaluations/history/special/results_20250930_153753.md b/docs/development/evaluations/history/custom_claude_results_20250930_153753.md similarity index 100% rename from docs/development/evaluations/history/special/results_20250930_153753.md rename to docs/development/evaluations/history/custom_claude_results_20250930_153753.md diff --git a/docs/development/evaluations/history/special/results_20251008_053744.md b/docs/development/evaluations/history/custom_self_hosted_results_20251008_053744.md similarity index 100% rename from docs/development/evaluations/history/special/results_20251008_053744.md rename to docs/development/evaluations/history/custom_self_hosted_results_20251008_053744.md diff --git a/docs/development/evaluations/history/index.md b/docs/development/evaluations/history/index.md index 22fb6f63c..52ad48e0f 100644 --- a/docs/development/evaluations/history/index.md +++ b/docs/development/evaluations/history/index.md @@ -1,18 +1,9 @@ # Historical Evaluation Results -## Weekly Runs +Browse through our past benchmark runs to track performance trends over time. -Weekly benchmark runs with a standard set of models. +## Weekly Results +Regular weekly benchmark runs that track model performance over time. -See the **Weekly** section in the navigation sidebar for all weekly benchmark results. - -## Special Benchmark Runs - -One-off benchmark runs for specific purposes such as: - -- Comparing self-hosted models -- Testing new model versions -- Performance analysis for specific scenarios -- Custom model comparisons - -See the **Special** section in the navigation sidebar for all special benchmark runs. +## Extended Comparisons +Special benchmark runs comparing multiple models and configurations. diff --git a/docs/development/evaluations/history/weekly/results_20250928_001434.md b/docs/development/evaluations/history/results_20250928_001434.md similarity index 100% rename from docs/development/evaluations/history/weekly/results_20250928_001434.md rename to docs/development/evaluations/history/results_20250928_001434.md diff --git a/docs/development/evaluations/history/weekly/results_20250930_085923.md b/docs/development/evaluations/history/results_20250930_085923.md similarity index 100% rename from docs/development/evaluations/history/weekly/results_20250930_085923.md rename to docs/development/evaluations/history/results_20250930_085923.md diff --git a/docs/development/evaluations/history/weekly/results_20251012_170303.md b/docs/development/evaluations/history/results_20251012_170303.md similarity index 100% rename from docs/development/evaluations/history/weekly/results_20251012_170303.md rename to docs/development/evaluations/history/results_20251012_170303.md diff --git a/docs/development/evaluations/history/special/.nav.yml b/docs/development/evaluations/history/special/.nav.yml deleted file mode 100644 index d71105aeb..000000000 --- a/docs/development/evaluations/history/special/.nav.yml +++ /dev/null @@ -1,5 +0,0 @@ -sort: - direction: desc -nav: - - index.md - - "*" diff --git a/docs/development/evaluations/history/special/index.md b/docs/development/evaluations/history/special/index.md deleted file mode 100644 index c55826838..000000000 --- a/docs/development/evaluations/history/special/index.md +++ /dev/null @@ -1,7 +0,0 @@ -# Special Benchmark Runs - -One-off benchmark runs for specific purposes such as comparing self-hosted models, testing new model versions, or custom performance analysis. - -## Available Results - -All special benchmark results are listed in the navigation sidebar. diff --git a/docs/development/evaluations/history/weekly/.nav.yml b/docs/development/evaluations/history/weekly/.nav.yml deleted file mode 100644 index d71105aeb..000000000 --- a/docs/development/evaluations/history/weekly/.nav.yml +++ /dev/null @@ -1,5 +0,0 @@ -sort: - direction: desc -nav: - - index.md - - "*" diff --git a/docs/development/evaluations/history/weekly/index.md b/docs/development/evaluations/history/weekly/index.md deleted file mode 100644 index 7b87e1b61..000000000 --- a/docs/development/evaluations/history/weekly/index.md +++ /dev/null @@ -1,7 +0,0 @@ -# Weekly Benchmark Runs - -Weekly benchmark runs with a standard set of models. - -## Available Results - -All weekly benchmark results are listed in the navigation sidebar. diff --git a/run_benchmarks_local.sh b/run_benchmarks_local.sh index 19e86c0a9..09ae7a167 100755 --- a/run_benchmarks_local.sh +++ b/run_benchmarks_local.sh @@ -154,10 +154,10 @@ if [ -f "scripts/generate_eval_report.py" ]; then --models "$MODELS" echo "✅ Report generated: docs/development/evaluations/latest-results.md" - # Also generate timestamped version for history (always in weekly/) - mkdir -p docs/development/evaluations/history/weekly + # Also generate timestamped version for history + mkdir -p docs/development/evaluations/history TIMESTAMP=$(date +%Y%m%d_%H%M%S) - HISTORY_FILE="docs/development/evaluations/history/weekly/results_${TIMESTAMP}.md" + HISTORY_FILE="docs/development/evaluations/history/results_${TIMESTAMP}.md" poetry run python scripts/generate_eval_report.py \ --json-file eval_results.json \ --output-file "$HISTORY_FILE" \