Use 5 repeats instead of 1 in GPQA example

nv-alicheng · nv-alicheng · commit 644682ef2d60 · 2026-01-06T13:50:04.000-08:00
diff --git a/examples/06_GPT-OSS-120B_SGLang_Example/eval_accuracy.py b/examples/06_GPT-OSS-120B_SGLang_Example/eval_accuracy.py
@@ -36,8 +36,8 @@ def main(args):
     )
 
     # Score the dataset
-    score = scorer.score()
-    print(f"Pass@1 Score: {score}")
+    score, n_repeats = scorer.score()
+    print(f"Pass@1 Score ({n_repeats} repeats): {score}")
 
 
 if __name__ == "__main__":
diff --git a/examples/06_GPT-OSS-120B_SGLang_Example/run.py b/examples/06_GPT-OSS-120B_SGLang_Example/run.py
@@ -239,7 +239,7 @@ def run_benchmark_session(dataset: Dataset, issuer: HttpClientSampleIssuer, args
             name="gpqa_sglang_benchmark",
             report_dir=args.report_dir,
             dump_events_log=True,
-            max_shutdown_timeout_s=600.0,
+            max_shutdown_timeout_s=None,
         )
         sess.wait_for_test_end()
 
@@ -267,7 +267,9 @@ def run_main(args):
         print("Creating dataset with transforms...")
         print(df.columns)
         df.to_parquet("datasets/gqpa_diamond_pre-transformed_gpt-oss.parquet")
-        dataset = GPQA(df, transforms=transforms)
+        dataset = GPQA(
+            df, transforms=transforms, repeats=5
+        )  # Artificial Analysis uses 5 repeats
         dataset.load()
         print(f"Dataset loaded with {dataset.num_samples()} samples")
 
diff --git a/src/inference_endpoint/evaluation/scoring.py b/src/inference_endpoint/evaluation/scoring.py
@@ -85,9 +85,21 @@ def match_sample_index(self, row: pd.Series) -> pd.Series:
     def score_single_sample(self, value: str, ground_truth: str) -> float:
         raise NotImplementedError
 
-    def score(self) -> float:
+    def score(self) -> tuple[float, int]:
+        """Scores the dataset and returns the mean score and the number of repeats.
+
+        Returns:
+            tuple[float, int]: The mean score and the number of repeats.
+        """
         df = self.get_outputs()
+
+        # Outputs are for all samples, not just the target dataset
+        valid_uuids = self.sample_index_map.keys()
+        df = df[df["sample_uuid"].isin(valid_uuids)]
+
+        # Match to sample index from dataset
         df = df.apply(self.match_sample_index, axis=1)
+
         empirical = df["output"]
         if self.extractor is not None:
             empirical = empirical.apply(self.extractor.extract)
@@ -101,7 +113,8 @@ def score(self) -> float:
         for i in range(len(empirical)):
             scores.append(self.score_single_sample(empirical[i], ground_truths[i]))
 
-        return np.mean(scores)
+        n_repeats = len(scores) // self.dataset.num_samples()
+        return np.mean(scores), n_repeats
 
 
 class PassAt1Scorer(Scorer):

Original file line number	Diff line number	Diff line change
`@@ -36,8 +36,8 @@ def main(args):`
`36`	`36`	`)`
`37`	`37`
`38`	`38`	`# Score the dataset`
`39`		`- score = scorer.score()`
`40`		`- print(f"Pass@1 Score: {score}")`
	`39`	`+ score, n_repeats = scorer.score()`
	`40`	`+ print(f"Pass@1 Score ({n_repeats} repeats): {score}")`
`41`	`41`
`42`	`42`
`43`	`43`	`if __name__ == "__main__":`