
Commit ac891ab

1 parent 492b853 commit ac891ab

2 files changed: +68 -20 lines


optillm/server.py

Lines changed: 21 additions & 5 deletions
@@ -887,14 +887,30 @@ def proxy_models():
     try:
         if server_config['base_url']:
             client = OpenAI(api_key=API_KEY, base_url=server_config['base_url'])
+            # For external API, fetch models using the OpenAI client
+            models_response = client.models.list()
+            # Convert to dict format
+            models_data = {
+                "object": "list",
+                "data": [model.dict() for model in models_response.data]
+            }
         else:
-            client = default_client
-
-        # Fetch models using the OpenAI client and return the raw response
-        models_response = client.models.list().json()
+            # For local inference, create a models response manually
+            current_model = server_config.get('model', 'gpt-3.5-turbo')
+            models_data = {
+                "object": "list",
+                "data": [
+                    {
+                        "id": current_model,
+                        "object": "model",
+                        "created": 1677610602,
+                        "owned_by": "optillm"
+                    }
+                ]
+            }
 
         logger.debug('Models retrieved successfully')
-        return models_response, 200
+        return jsonify(models_data), 200
     except Exception as e:
         logger.error(f"Error fetching models: {str(e)}")
         return jsonify({"error": f"Error fetching models: {str(e)}"}), 500
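For reference, a minimal client-side sketch (not part of the commit) of what this change looks like from the outside. It assumes optillm is serving the OpenAI-compatible API on the default http://localhost:8000/v1 base URL and that proxy_models() is wired to the standard /v1/models route; the API key value is a placeholder.

# Minimal sketch; base URL, route, and API key value are assumptions for illustration.
from openai import OpenAI

client = OpenAI(api_key="optillm", base_url="http://localhost:8000/v1")
models = client.models.list()
for model in models.data:
    # In local-inference mode this prints the single configured model
    # ("owned_by": "optillm"); with an external base_url it mirrors the upstream list.
    print(model.id, model.owned_by)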

scripts/eval_simpleqa_benchmark.py

Lines changed: 47 additions & 15 deletions
@@ -38,6 +38,7 @@
 
 # Constants
 SIMPLEQA_CSV_URL = "https://openaipublic.blob.core.windows.net/simple-evals/simple_qa_test_set.csv"
+SIMPLEQA_VERIFIED_CSV_URL = "https://huggingface.co/datasets/codelion/SimpleQA-Verified/raw/main/simpleqa_verified.csv"
 DEFAULT_TIMEOUT = 600  # 10 minutes for potentially long research operations
 DEFAULT_GRADER_MODEL = "gpt-4o"
 DEFAULT_BASE_URL = "http://localhost:8000/v1"
@@ -90,12 +91,14 @@ def __init__(self,
                  grader_model: str = DEFAULT_GRADER_MODEL,
                  timeout: int = DEFAULT_TIMEOUT,
                  cache_dir: str = "cache",
-                 output_dir: str = "results"):
+                 output_dir: str = "results",
+                 use_verified: bool = False):
         self.model = model
         self.approach = approach
         self.base_url = base_url
         self.grader_model = grader_model
         self.timeout = timeout
+        self.use_verified = use_verified
         self.cache_dir = Path(cache_dir)
         self.output_dir = Path(output_dir)
 
@@ -137,16 +140,23 @@ def __init__(self,
 
     def download_dataset(self) -> str:
         """Download SimpleQA dataset if not cached"""
-        cache_file = self.cache_dir / "simple_qa_test_set.csv"
+        if self.use_verified:
+            cache_file = self.cache_dir / "simpleqa_verified.csv"
+            url = SIMPLEQA_VERIFIED_CSV_URL
+            dataset_name = "SimpleQA-Verified"
+        else:
+            cache_file = self.cache_dir / "simple_qa_test_set.csv"
+            url = SIMPLEQA_CSV_URL
+            dataset_name = "SimpleQA"
 
         if cache_file.exists():
-            logger.info(f"Using cached dataset: {cache_file}")
+            logger.info(f"Using cached {dataset_name} dataset: {cache_file}")
             return str(cache_file)
 
-        logger.info(f"Downloading SimpleQA dataset from {SIMPLEQA_CSV_URL}")
+        logger.info(f"Downloading {dataset_name} dataset from {url}")
 
         try:
-            response = requests.get(SIMPLEQA_CSV_URL, timeout=30)
+            response = requests.get(url, timeout=30)
             response.raise_for_status()
 
             with open(cache_file, 'wb') as f:
@@ -176,21 +186,35 @@ def load_dataset(self, num_samples: Optional[int] = None, start_index: int = 0)
                 if num_samples and len(questions) >= num_samples:
                     break
 
-                # Parse metadata if it's JSON string
-                try:
-                    metadata = json.loads(row['metadata']) if row['metadata'] else {}
-                except:
-                    metadata = {}
+                if self.use_verified:
+                    # SimpleQA-Verified dataset has different fields
+                    metadata = {
+                        'original_index': row.get('original_index', i),
+                        'topic': row.get('topic', ''),
+                        'answer_type': row.get('answer_type', ''),
+                        'multi_step': row.get('multi_step', ''),
+                        'requires_reasoning': row.get('requires_reasoning', ''),
+                        'urls': row.get('urls', '')
+                    }
+                    question_id = row.get('original_index', i)
+                else:
+                    # Original SimpleQA dataset
+                    try:
+                        metadata = json.loads(row['metadata']) if row.get('metadata') else {}
+                    except:
+                        metadata = {}
+                    question_id = i
 
                 question_data = {
-                    'id': i,
+                    'id': question_id,
                     'metadata': metadata,
                     'question': row['problem'],
                     'gold_answer': row['answer']
                 }
                 questions.append(question_data)
 
-            logger.info(f"Loaded {len(questions)} questions from dataset")
+            dataset_type = "SimpleQA-Verified" if self.use_verified else "SimpleQA"
+            logger.info(f"Loaded {len(questions)} questions from {dataset_type} dataset")
             return questions
 
         except Exception as e:
@@ -377,7 +401,8 @@ def calculate_metrics(self) -> Dict:
     def save_results(self, timestamp: str) -> Tuple[str, str, str]:
         """Save evaluation results to files"""
         # Create output directory for this run
-        run_dir = self.output_dir / f"simpleqa_{self.model}_{self.approach}"
+        dataset_suffix = "_verified" if self.use_verified else ""
+        run_dir = self.output_dir / f"simpleqa{dataset_suffix}_{self.model}_{self.approach}"
         run_dir.mkdir(parents=True, exist_ok=True)
 
         # File paths
@@ -416,9 +441,11 @@ def run_evaluation(self,
         """Run the complete evaluation"""
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 
-        logger.info(f"Starting SimpleQA evaluation")
+        dataset_type = "SimpleQA-Verified" if self.use_verified else "SimpleQA"
+        logger.info(f"Starting {dataset_type} evaluation")
         logger.info(f"Model: {self.model}")
         logger.info(f"Approach: {self.approach}")
+        logger.info(f"Dataset: {dataset_type} ({'1k verified questions' if self.use_verified else '4.3k questions'})")
         logger.info(f"Base URL: {self.base_url}")
         logger.info(f"Timeout: {self.timeout}s")
 
@@ -502,6 +529,10 @@ def parse_args():
     parser.add_argument("--output-dir", type=str, default="results",
                         help="Directory for saving results (default: results)")
 
+    # Dataset selection
+    parser.add_argument("--verified", action="store_true",
+                        help="Use SimpleQA-Verified dataset (1k verified questions) instead of original SimpleQA")
+
     # Debugging
     parser.add_argument("--verbose", action="store_true",
                         help="Enable verbose logging")
@@ -524,7 +555,8 @@ def main():
        grader_model=args.grader_model,
        timeout=args.timeout,
        cache_dir=args.cache_dir,
-       output_dir=args.output_dir
+       output_dir=args.output_dir,
+       use_verified=args.verified
     )
 
     try:
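As a quick sanity check on the new dataset path, here is a standalone sketch (not part of the commit) that fetches the SimpleQA-Verified CSV from the URL added above and prints the columns load_dataset() now reads for the verified set; the column names ('original_index', 'topic', 'answer_type', 'multi_step', 'requires_reasoning', 'urls', plus 'problem' and 'answer') are taken from the diff.

# Standalone sketch: peek at the SimpleQA-Verified CSV schema used by the new --verified path.
import csv
import io

import requests

SIMPLEQA_VERIFIED_CSV_URL = "https://huggingface.co/datasets/codelion/SimpleQA-Verified/raw/main/simpleqa_verified.csv"

response = requests.get(SIMPLEQA_VERIFIED_CSV_URL, timeout=30)
response.raise_for_status()

reader = csv.DictReader(io.StringIO(response.text))
first_row = next(reader)
for key in ("original_index", "topic", "answer_type", "multi_step",
            "requires_reasoning", "urls", "problem", "answer"):
    # .get() keeps the sketch from crashing if a column name differs upstream.
    print(f"{key}: {first_row.get(key, '<missing>')}")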
