import os
import json
import argparse
import asyncio
from tqdm import tqdm
from datasets import load_dataset
from openai import AsyncOpenAI
from typing import List, Dict, Any, Tuple
import random

# OptILM approaches remain the same as in the original script
APPROACHES = ["none", "mcts", "bon", "moa", "rto", "z3", "self_consistency", "pvg", "rstar", "cot_reflection", "plansearch", "leap", "re2"]

# Dataset configurations
DATASET_CONFIGS = [
    ("MixEval", "free_form"),
    ("MixEval", "multiple_choice"),
    ("MixEval_Hard", "free_form"),
    ("MixEval_Hard", "multiple_choice")
]

def construct_prompt(sample: Dict[str, Any], split_type: str) -> str:
    """Construct prompt based on split type."""
    context = sample.get("context", "")
    prompt = sample["prompt"]

    if split_type == "multiple_choice":
        options = sample["options"]
        options_text = "\nOptions:\n" + "\n".join([f"{i + 1}. {opt}" for i, opt in enumerate(options)])
        return f"Context: {context}\n\nQuestion: {prompt}{options_text}\n\nProvide the correct answer from the options above."
    else:
        return f"Context: {context}\n\nQuestion: {prompt}\n\nProvide your answer."

def is_correct_response(response: str, targets: List[str]) -> bool:
    """Check if response matches any of the target answers."""
    response = response.strip().lower()
    return any(target.strip().lower() == response for target in targets)

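# Note: the "none" approach calls the OpenAI API directly, while every other
# approach assumes an optillm proxy is already listening on
# http://localhost:8000/v1 and selects the approach from the model-name prefix
# (e.g. "moa-gpt-4o-mini"), as reflected in the client configuration below.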
async def generate_response(prompt: str, approach: str) -> Dict[str, Any]:
    """Generate a response using the specified approach."""
    if approach == "none":
        client = AsyncOpenAI()
        response = await client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
        )
        return {
            "content": response.choices[0].message.content,
            "tokens": response.usage.completion_tokens,
        }
    else:
        client = AsyncOpenAI(api_key="none", base_url="http://localhost:8000/v1")
        response = await client.chat.completions.create(
            model=f"{approach}-gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
        )
        return {
            "content": response.choices[0].message.content,
            "tokens": response.usage.completion_tokens,
        }

def rank_responses(responses: List[Dict[str, Any]], targets: List[str]) -> List[int]:
    """Rank responses based on correctness and token efficiency."""
    # Create tuples of (index, is_correct, tokens) for sorting
    ranked_data = []
    for i, response in enumerate(responses):
        is_correct = is_correct_response(response["content"], targets)
        ranked_data.append((i, is_correct, response["tokens"]))

    # Sort by correctness (True first) and then by tokens (ascending)
    ranked_data.sort(key=lambda x: (-int(x[1]), x[2]))

    # Extract indices for the final ranking (rank 0 = correct and cheapest)
    return [idx for idx, _, _ in ranked_data]

async def process_sample(sample: Dict[str, Any], split_type: str) -> Dict[str, Any]:
    """Process a single sample from the dataset."""
    prompt = construct_prompt(sample, split_type)
    results = []

    # Generate responses for each approach
    for approach in APPROACHES:
        response = await generate_response(prompt, approach)
        results.append({"approach": approach, **response})

    # Rank the responses based on correctness and token efficiency
    rankings = rank_responses(results, sample["target"])

    # Add rankings to results
    for rank, idx in enumerate(rankings):
        results[idx]["rank"] = rank

    return {
        "prompt": prompt,
        "results": results,
    }

async def generate_dataset(num_samples: int, output_file: str):
    """Generate the dataset and save it to a JSONL file."""
    with open(output_file, "w") as f:
        for config, split_type in DATASET_CONFIGS:
            print(f"Processing {config} - {split_type}")
            dataset = load_dataset("MixEval/MixEval", config, split=split_type)

            # Calculate samples per configuration, clamped to the split size
            samples_per_config = max(1, num_samples // len(DATASET_CONFIGS))
            samples_per_config = min(samples_per_config, len(dataset))

            for sample in tqdm(dataset.select(range(samples_per_config)),
                               total=samples_per_config,
                               desc=f"{config}-{split_type}"):
                try:
                    result = await process_sample(sample, split_type)
                    f.write(json.dumps(result) + "\n")
                except Exception as e:
                    print(f"Error processing sample: {str(e)}")

def main():
    parser = argparse.ArgumentParser(description="Generate OptILM Ground Truth dataset")
    parser.add_argument("--num_samples", type=int, default=100,
                        help="Total number of samples to process (divided among configurations)")
    parser.add_argument("--output_file", type=str,
                        default="optillm_ground_truth_dataset.jsonl",
                        help="Output file path")
    args = parser.parse_args()

    asyncio.run(generate_dataset(args.num_samples, args.output_file))
    print(f"Dataset generated and saved to {args.output_file}")

if __name__ == "__main__":
    main()
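
# Example invocation (the script filename is illustrative; the optillm proxy
# must already be running locally and OPENAI_API_KEY set for the "none" baseline):
#   python gen_optillm_dataset.py --num_samples 100 --output_file optillm_ground_truth_dataset.jsonl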