Skip to content

Commit 0dc0ed6

Browse files
committed
Update eval_optillmbench.py
1 parent 6baa4bc commit 0dc0ed6

File tree

1 file changed

+5
-3
lines changed

1 file changed

+5
-3
lines changed

scripts/eval_optillmbench.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -659,17 +659,19 @@ def generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str, i
659 659
logger.info(f"Report saved to {report_path}")
660 660

661 661
def main():
662-
parser = argparse.ArgumentParser(description="Evaluate a model on OptiLLM Bench")
662+
parser = argparse.ArgumentParser(
663+
description="Evaluate a model on OptiLLM Bench. By default, runs test-time compute evaluation with pass@1, maj@64, and genselect@64."
664+
)
663 665
parser.add_argument("--model", required=True, help="Model identifier")
664 666
parser.add_argument("--base-url", default="http://localhost:8000/v1",
665 667
help="Base URL for API endpoint")
666 668
parser.add_argument("--max-samples", type=int, help="Maximum number of samples to evaluate")
667 669
parser.add_argument("--output-dir", default="results",
668 670
help="Directory to save results")
669 671
parser.add_argument("--approaches", nargs="+",
670-
help="Specific approaches to evaluate (default: all)")
672+
help="Specific approaches to evaluate (overrides default test-time compute)")
671 673
parser.add_argument("--test-time-compute", action="store_true",
672-
help="Evaluate test-time compute approaches (sequential and parallel scaling)")
674+
help="Evaluate full test-time compute scaling approaches (ThinkDeeper and various k values)")
673 675
parser.add_argument("--debug", action="store_true", help="Enable debug logging")
674 676
args = parser.parse_args()
675 677

0 commit comments

Comments
 (0)