@@ -659,17 +659,19 @@ def generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str, i
659659 logger .info (f"Report saved to { report_path } " )
660660
661661def main ():
662- parser = argparse .ArgumentParser (description = "Evaluate a model on OptiLLM Bench" )
662+ parser = argparse .ArgumentParser (
663+ description = "Evaluate a model on OptiLLM Bench. By default, runs test-time compute evaluation with pass@1, maj@64, and genselect@64."
664+ )
663665 parser .add_argument ("--model" , required = True , help = "Model identifier" )
664666 parser .add_argument ("--base-url" , default = "http://localhost:8000/v1" ,
665667 help = "Base URL for API endpoint" )
666668 parser .add_argument ("--max-samples" , type = int , help = "Maximum number of samples to evaluate" )
667669 parser .add_argument ("--output-dir" , default = "results" ,
668670 help = "Directory to save results" )
669671 parser .add_argument ("--approaches" , nargs = "+" ,
670- help = "Specific approaches to evaluate (default: all )" )
672+ help = "Specific approaches to evaluate (overrides default test-time compute )" )
671673 parser .add_argument ("--test-time-compute" , action = "store_true" ,
672- help = "Evaluate test-time compute approaches (sequential and parallel scaling )" )
674+ help = "Evaluate full test-time compute scaling approaches (ThinkDeeper and various k values )" )
673675 parser .add_argument ("--debug" , action = "store_true" , help = "Enable debug logging" )
674676 args = parser .parse_args ()
675677
0 commit comments