
Commit 7755946

fix scripts
1 parent 3a7d0f0 commit 7755946

File tree

2 files changed: +11 −15 lines


scripts/eval_aime_benchmark.py

Lines changed: 8 additions & 8 deletions
```diff
@@ -19,9 +19,9 @@
 logger = logging.getLogger(__name__)
 
 # Initialize OpenAI client
-# client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"), base_url="https://openrouter.ai/api/v1")
+client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"), base_url="https://openrouter.ai/api/v1")
 
-client = OpenAI(api_key="optillm", base_url="http://localhost:8001/v1")
+# client = OpenAI(api_key="optillm", base_url="http://localhost:8001/v1")
 
 SYSTEM_PROMPT = '''You are solving AIME (American Invitational Mathematics Examination) problems.
 
@@ -282,12 +282,12 @@ def get_llm_response(problem: str, model: str, analyze_logits: bool = False) ->
             {"role": "user", "content": SYSTEM_PROMPT + problem}
         ],
         max_tokens=8192,
-        extra_body={
-            "decoding": "thinkdeeper",
-            "min_thinking_tokens" : 0,
-            "max_thinking_tokens" : 8000,
-            "max_thoughts": 100,
-        },
+        # extra_body={
+        #     "decoding": "thinkdeeper",
+        #     "min_thinking_tokens" : 0,
+        #     "max_thinking_tokens" : 8000,
+        #     "max_thoughts": 100,
+        # },
         **kwargs
     )
```
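Net effect of this file's changes: the AIME eval now talks to OpenRouter directly instead of a local optillm proxy, and the `thinkdeeper` decoding overrides are commented out. A minimal sketch of selecting between the two endpoints with a hypothetical `OPTILLM_LOCAL` environment variable instead of commenting lines in and out (not part of this commit):

```python
import os

from openai import OpenAI

# Hypothetical toggle, not in the commit: set OPTILLM_LOCAL=1 to hit the
# local optillm proxy; otherwise fall back to OpenRouter.
if os.environ.get("OPTILLM_LOCAL"):
    # The configuration this commit comments out.
    client = OpenAI(api_key="optillm", base_url="http://localhost:8001/v1")
else:
    # The configuration this commit enables.
    client = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY"),
        base_url="https://openrouter.ai/api/v1",
    )
```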

scripts/eval_optillmbench.py

Lines changed: 3 additions & 7 deletions
```diff
@@ -36,7 +36,7 @@
 ]
 
 def load_optillm_bench() -> datasets.Dataset:
-    """Load the OptILLM Bench dataset."""
+    """Load the OptiLLM Bench dataset."""
     try:
         dataset = load_dataset("codelion/optillmbench")
         return dataset["test"]  # We use the test split for evaluation
@@ -161,10 +161,6 @@ def evaluate_model(
             ],
             temperature=0.2,
             max_tokens=4096,
-            reasoning_effort="low",
-            extra_body = {
-                "decoding" : "thinkdeeper",
-            }
         )
 
         # Calculate time taken
@@ -255,7 +251,7 @@ def generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str):
     report = []
 
     # Header
-    report.append("# OptILLM Bench Evaluation Report")
+    report.append("# OptiLLM Bench Evaluation Report")
     report.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
 
     # Overall Results Table
@@ -303,7 +299,7 @@ def generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str):
     logger.info(f"Report saved to {report_path}")
 
 def main():
-    parser = argparse.ArgumentParser(description="Evaluate a model on OptILLM Bench")
+    parser = argparse.ArgumentParser(description="Evaluate a model on OptiLLM Bench")
     parser.add_argument("--model", required=True, help="Model identifier")
     parser.add_argument("--base-url", default="http://localhost:8000/v1",
                         help="Base URL for API endpoint")
```
