 ]

 def load_optillm_bench() -> datasets.Dataset:
-    """Load the OptILLM Bench dataset."""
+    """Load the OptiLLM Bench dataset."""
     try:
         dataset = load_dataset("codelion/optillmbench")
         return dataset["test"]  # We use the test split for evaluation
@@ -161,10 +161,6 @@ def evaluate_model(
             ],
             temperature=0.2,
             max_tokens=4096,
-            reasoning_effort="low",
-            extra_body = {
-                "decoding" : "thinkdeeper",
-            }
         )

         # Calculate time taken
@@ -255,7 +251,7 @@ def generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str):
     report = []

     # Header
-    report.append("# OptILLM Bench Evaluation Report")
+    report.append("# OptiLLM Bench Evaluation Report")
     report.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

     # Overall Results Table
@@ -303,7 +299,7 @@ def generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str):
     logger.info(f"Report saved to {report_path}")

 def main():
-    parser = argparse.ArgumentParser(description="Evaluate a model on OptILLM Bench")
+    parser = argparse.ArgumentParser(description="Evaluate a model on OptiLLM Bench")
     parser.add_argument("--model", required=True, help="Model identifier")
     parser.add_argument("--base-url", default="http://localhost:8000/v1",
                         help="Base URL for API endpoint")
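For context on the removed lines: `extra_body` is the OpenAI Python SDK's mechanism for forwarding extra, server-specific fields in the request JSON, and this commit drops that (along with `reasoning_effort="low"`) so the benchmark issues a plain completion request. Below is a minimal sketch of the call shape around the diffed lines; the model identifier, prompt, and API key are placeholders, not values from the actual evaluation script.

```python
# Sketch of the completion call surrounding the diffed lines; anything
# marked as a placeholder is illustrative, not taken from the script.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="placeholder")

response = client.chat.completions.create(
    model="your-model",  # placeholder model identifier
    messages=[{"role": "user", "content": "What is 2 + 2?"}],  # placeholder prompt
    temperature=0.2,
    max_tokens=4096,
    # The two arguments removed in this commit looked like this; extra_body
    # merges additional fields into the request body sent to the server.
    # reasoning_effort="low",
    # extra_body={"decoding": "thinkdeeper"},
)
print(response.choices[0].message.content)
```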