Skip to content

Commit 644682e

Browse files
committed
Use 5 repeats instead of 1 in GPQA example
1 parent 2ad0679 commit 644682e

File tree

3 files changed

+21
-6
lines changed

3 files changed

+21
-6
lines changed

examples/06_GPT-OSS-120B_SGLang_Example/eval_accuracy.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ def main(args):
3636
)
3737

3838
# Score the dataset
39-
score = scorer.score()
40-
print(f"Pass@1 Score: {score}")
39+
score, n_repeats = scorer.score()
40+
print(f"Pass@1 Score ({n_repeats} repeats): {score}")
4141

4242

4343
if __name__ == "__main__":

examples/06_GPT-OSS-120B_SGLang_Example/run.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ def run_benchmark_session(dataset: Dataset, issuer: HttpClientSampleIssuer, args
239239
name="gpqa_sglang_benchmark",
240240
report_dir=args.report_dir,
241241
dump_events_log=True,
242-
max_shutdown_timeout_s=600.0,
242+
max_shutdown_timeout_s=None,
243243
)
244244
sess.wait_for_test_end()
245245

@@ -267,7 +267,9 @@ def run_main(args):
267267
print("Creating dataset with transforms...")
268268
print(df.columns)
269269
df.to_parquet("datasets/gqpa_diamond_pre-transformed_gpt-oss.parquet")
270-
dataset = GPQA(df, transforms=transforms)
270+
dataset = GPQA(
271+
df, transforms=transforms, repeats=5
272+
) # Artificial Analysis uses 5 repeats
271273
dataset.load()
272274
print(f"Dataset loaded with {dataset.num_samples()} samples")
273275

src/inference_endpoint/evaluation/scoring.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,21 @@ def match_sample_index(self, row: pd.Series) -> pd.Series:
8585
def score_single_sample(self, value: str, ground_truth: str) -> float:
8686
raise NotImplementedError
8787

88-
def score(self) -> float:
88+
def score(self) -> tuple[float, int]:
89+
"""Scores the dataset and returns the mean score and the number of repeats.
90+
91+
Returns:
92+
tuple[float, int]: The mean score and the number of repeats.
93+
"""
8994
df = self.get_outputs()
95+
96+
# Outputs are for all samples, not just the target dataset
97+
valid_uuids = self.sample_index_map.keys()
98+
df = df[df["sample_uuid"].isin(valid_uuids)]
99+
100+
# Match to sample index from dataset
90101
df = df.apply(self.match_sample_index, axis=1)
102+
91103
empirical = df["output"]
92104
if self.extractor is not None:
93105
empirical = empirical.apply(self.extractor.extract)
@@ -101,7 +113,8 @@ def score(self) -> float:
101113
for i in range(len(empirical)):
102114
scores.append(self.score_single_sample(empirical[i], ground_truths[i]))
103115

104-
return np.mean(scores)
116+
n_repeats = len(scores) // self.dataset.num_samples()
117+
return np.mean(scores), n_repeats
105118

106119

107120
class PassAt1Scorer(Scorer):

0 commit comments

Comments
 (0)