3838
3939# Constants
4040SIMPLEQA_CSV_URL = "https://openaipublic.blob.core.windows.net/simple-evals/simple_qa_test_set.csv"
41+ SIMPLEQA_VERIFIED_CSV_URL = "https://huggingface.co/datasets/codelion/SimpleQA-Verified/raw/main/simpleqa_verified.csv"
4142DEFAULT_TIMEOUT = 600 # 10 minutes for potentially long research operations
4243DEFAULT_GRADER_MODEL = "gpt-4o"
4344DEFAULT_BASE_URL = "http://localhost:8000/v1"
@@ -90,12 +91,14 @@ def __init__(self,
9091 grader_model : str = DEFAULT_GRADER_MODEL ,
9192 timeout : int = DEFAULT_TIMEOUT ,
9293 cache_dir : str = "cache" ,
93- output_dir : str = "results" ):
94+ output_dir : str = "results" ,
95+ use_verified : bool = False ):
9496 self .model = model
9597 self .approach = approach
9698 self .base_url = base_url
9799 self .grader_model = grader_model
98100 self .timeout = timeout
101+ self .use_verified = use_verified
99102 self .cache_dir = Path (cache_dir )
100103 self .output_dir = Path (output_dir )
101104
@@ -137,16 +140,23 @@ def __init__(self,
137140
138141 def download_dataset (self ) -> str :
139142 """Download SimpleQA dataset if not cached"""
140- cache_file = self .cache_dir / "simple_qa_test_set.csv"
143+ if self .use_verified :
144+ cache_file = self .cache_dir / "simpleqa_verified.csv"
145+ url = SIMPLEQA_VERIFIED_CSV_URL
146+ dataset_name = "SimpleQA-Verified"
147+ else :
148+ cache_file = self .cache_dir / "simple_qa_test_set.csv"
149+ url = SIMPLEQA_CSV_URL
150+ dataset_name = "SimpleQA"
141151
142152 if cache_file .exists ():
143- logger .info (f"Using cached dataset: { cache_file } " )
153+ logger .info (f"Using cached { dataset_name } dataset: { cache_file } " )
144154 return str (cache_file )
145155
146- logger .info (f"Downloading SimpleQA dataset from { SIMPLEQA_CSV_URL } " )
156+ logger .info (f"Downloading { dataset_name } dataset from { url } " )
147157
148158 try :
149- response = requests .get (SIMPLEQA_CSV_URL , timeout = 30 )
159+ response = requests .get (url , timeout = 30 )
150160 response .raise_for_status ()
151161
152162 with open (cache_file , 'wb' ) as f :
@@ -176,21 +186,35 @@ def load_dataset(self, num_samples: Optional[int] = None, start_index: int = 0)
176186 if num_samples and len (questions ) >= num_samples :
177187 break
178188
179- # Parse metadata if it's JSON string
180- try :
181- metadata = json .loads (row ['metadata' ]) if row ['metadata' ] else {}
182- except :
183- metadata = {}
189+ if self .use_verified :
 190+ # SimpleQA-Verified: metadata is assembled from individual row columns rather than parsed from a JSON 'metadata' string
191+ metadata = {
192+ 'original_index' : row .get ('original_index' , i ),
193+ 'topic' : row .get ('topic' , '' ),
194+ 'answer_type' : row .get ('answer_type' , '' ),
195+ 'multi_step' : row .get ('multi_step' , '' ),
196+ 'requires_reasoning' : row .get ('requires_reasoning' , '' ),
197+ 'urls' : row .get ('urls' , '' )
198+ }
199+ question_id = row .get ('original_index' , i )
200+ else :
 201+ # Original SimpleQA: 'metadata' is parsed as a JSON string; fall back to {} when it is empty or unparsable
202+ try :
203+ metadata = json .loads (row ['metadata' ]) if row .get ('metadata' ) else {}
204+ except :
205+ metadata = {}
206+ question_id = i
184207
185208 question_data = {
186- 'id' : i ,
209+ 'id' : question_id ,
187210 'metadata' : metadata ,
188211 'question' : row ['problem' ],
189212 'gold_answer' : row ['answer' ]
190213 }
191214 questions .append (question_data )
192215
193- logger .info (f"Loaded { len (questions )} questions from dataset" )
216+ dataset_type = "SimpleQA-Verified" if self .use_verified else "SimpleQA"
217+ logger .info (f"Loaded { len (questions )} questions from { dataset_type } dataset" )
194218 return questions
195219
196220 except Exception as e :
@@ -377,7 +401,8 @@ def calculate_metrics(self) -> Dict:
377401 def save_results (self , timestamp : str ) -> Tuple [str , str , str ]:
378402 """Save evaluation results to files"""
379403 # Create output directory for this run
380- run_dir = self .output_dir / f"simpleqa_{ self .model } _{ self .approach } "
404+ dataset_suffix = "_verified" if self .use_verified else ""
405+ run_dir = self .output_dir / f"simpleqa{ dataset_suffix } _{ self .model } _{ self .approach } "
381406 run_dir .mkdir (parents = True , exist_ok = True )
382407
383408 # File paths
@@ -416,9 +441,11 @@ def run_evaluation(self,
416441 """Run the complete evaluation"""
417442 timestamp = datetime .now ().strftime ("%Y%m%d_%H%M%S" )
418443
419- logger .info (f"Starting SimpleQA evaluation" )
444+ dataset_type = "SimpleQA-Verified" if self .use_verified else "SimpleQA"
445+ logger .info (f"Starting { dataset_type } evaluation" )
420446 logger .info (f"Model: { self .model } " )
421447 logger .info (f"Approach: { self .approach } " )
448+ logger .info (f"Dataset: { dataset_type } ({ '1k verified questions' if self .use_verified else '4.3k questions' } )" )
422449 logger .info (f"Base URL: { self .base_url } " )
423450 logger .info (f"Timeout: { self .timeout } s" )
424451
@@ -502,6 +529,10 @@ def parse_args():
502529 parser .add_argument ("--output-dir" , type = str , default = "results" ,
503530 help = "Directory for saving results (default: results)" )
504531
532+ # Dataset selection
533+ parser .add_argument ("--verified" , action = "store_true" ,
534+ help = "Use SimpleQA-Verified dataset (1k verified questions) instead of original SimpleQA" )
535+
505536 # Debugging
506537 parser .add_argument ("--verbose" , action = "store_true" ,
507538 help = "Enable verbose logging" )
@@ -524,7 +555,8 @@ def main():
524555 grader_model = args .grader_model ,
525556 timeout = args .timeout ,
526557 cache_dir = args .cache_dir ,
527- output_dir = args .output_dir
558+ output_dir = args .output_dir ,
559+ use_verified = args .verified
528560 )
529561
530562 try :
0 commit comments