1515logger = logging .getLogger (__name__ )
1616
1717# Initialize OpenAI client
18- client = OpenAI (api_key = os .environ .get ("OPENAI_API_KEY" ), base_url = "http://localhost:8000/v1" )
18+ client = OpenAI (api_key = os .environ .get ("OPENAI_API_KEY" ), base_url = "https://ot7nh9nqf4l7b43s.us-east-1.aws.endpoints.huggingface.cloud/v1/" )
1919
2020SYSTEM_PROMPT = '''You are solving AIME (American Invitational Mathematics Examination) problems.
2121
@@ -104,10 +104,11 @@ def get_llm_response(problem: str, model: str) -> Union[str, List[Dict]]:
104104 try :
105105 response = client .with_options (timeout = 1000.0 ).chat .completions .create (
106106 model = model ,
107+ temperature = 0.2 ,
107108 messages = [
108109 {"role" : "user" , "content" : SYSTEM_PROMPT + problem }
109110 ],
110- max_tokens = 8192 ,
111+ max_tokens = 40000 ,
111112 )
112113
113114 # If there's more than one choice, format as attempts
@@ -241,18 +242,21 @@ def analyze_results(results: List[Dict], n: int):
241242 print ("---" )
242243
243244def main (model : str , n_attempts : int ):
244- """Main evaluation function."""
245+ """Main evaluation function that handles gaps in processed indexes."""
245246 os .makedirs ("results" , exist_ok = True )
246247
247- # Include n_attempts in filename to keep separate results for different n values
248248 results_file = f"evaluation_results_{ model .replace ('/' , '_' )} _pass_at_{ n_attempts } .json"
249249
250250 dataset = load_2024_dataset ()
251251 existing_results = load_existing_results (results_file )
252- last_processed_index = get_last_processed_index (existing_results )
253252
254- for idx , item in enumerate (tqdm (dataset , desc = "Evaluating problems" )):
255- if idx <= last_processed_index :
253+ # Create a set of already processed indexes for efficient lookup
254+ processed_indexes = {result ['index' ] for result in existing_results }
255+
256+ for _ , item in enumerate (tqdm (dataset , desc = "Evaluating problems" )):
257+ id = int (item ['id' ])
258+ # Skip if this index has already been processed
259+ if id in processed_indexes :
256260 continue
257261
258262 problem_text = item ['problem' ]
@@ -263,7 +267,7 @@ def main(model: str, n_attempts: int):
263267 is_correct , first_correct = evaluate_pass_at_n (attempts , correct_answer )
264268
265269 result = {
266- "index" : idx ,
270+ "index" : id ,
267271 "problem" : problem_text ,
268272 "attempts" : attempts ,
269273 "correct_answer" : correct_answer ,
@@ -275,6 +279,7 @@ def main(model: str, n_attempts: int):
275279 final_results = load_existing_results (results_file )
276280 analyze_results (final_results , n_attempts )
277281
282+
278283if __name__ == "__main__" :
279284 parser = argparse .ArgumentParser (description = "Evaluate LLM performance on AIME 2024 problems" )
280285 parser .add_argument ("--model" , type = str , required = True , help = "OpenAI model to use (e.g., gpt-4, gpt-3.5-turbo)" )
0 commit comments