|
21 | 21 | import argparse |
22 | 22 | import json |
23 | 23 | import logging |
| 24 | +import os |
24 | 25 | import shutil |
25 | 26 | import signal |
26 | 27 | import tempfile |
27 | 28 | import uuid |
| 29 | +from dataclasses import dataclass |
28 | 30 | from pathlib import Path |
29 | 31 | from urllib.parse import urljoin |
30 | 32 |
|
|
63 | 65 | from inference_endpoint.endpoint_client.http_client import HTTPEndpointClient |
64 | 66 | from inference_endpoint.endpoint_client.http_sample_issuer import HttpClientSampleIssuer |
65 | 67 | from inference_endpoint.evaluation import Extractor |
66 | | -from inference_endpoint.evaluation.scoring import PassAt1Scorer |
| 68 | +from inference_endpoint.evaluation.scoring import Scorer |
67 | 69 | from inference_endpoint.exceptions import ( |
68 | 70 | ExecutionError, |
69 | 71 | InputValidationError, |
@@ -141,6 +143,17 @@ def on_complete_hook(self, result: QueryResult): |
141 | 143 | self.pbar.update(1) |
142 | 144 |
|
143 | 145 |
|
@dataclass
class AccuracyConfiguration:
    """Bundle of everything needed to score one accuracy dataset after a run.

    Instances are built per-dataset in `_run_benchmark` (constructed
    positionally, so field order here is part of the interface) and consumed
    in the post-run scoring loop, which instantiates `scorer` with
    (dataset_name, dataset, report_dir, extractor=..., ground_truth_column=...).
    """

    # Scorer class (not instance) resolved via Scorer.get(eval_method);
    # instantiated later in the scoring loop.
    scorer: Scorer
    # Extractor class resolved via Extractor.get(...); passed through to the scorer.
    extractor: Extractor
    # Human-readable dataset identifier; also used as the key in the
    # accuracy_scores report dict.
    dataset_name: str
    # Loaded data loader for this dataset (created by DataLoaderFactory).
    dataset: Dataset
    # Directory where the scorer writes its report artifacts.
    report_dir: os.PathLike
    # Column in the dataset containing the ground-truth answers.
    ground_truth_column: str
    # Number of times each sample is repeated (forwarded to the loader;
    # also reported as n_repeats).
    num_repeats: int
| 156 | + |
144 | 157 | async def run_benchmark_command(args: argparse.Namespace) -> None: |
145 | 158 | """Run performance benchmark in offline, online, or YAML-configured mode. |
146 | 159 |
|
@@ -336,49 +349,6 @@ def _build_config_from_cli( |
336 | 349 | ) |
337 | 350 |
|
338 | 351 |
|
339 | | -def _get_dataset_path(args: argparse.Namespace, config: BenchmarkConfig) -> Path: |
340 | | - """Get dataset path from CLI args or config. |
341 | | -
|
342 | | - CURRENT LIMITATION: Only supports single dataset execution. |
343 | | - Priority: CLI args > config datasets[0] |
344 | | -
|
345 | | - Args: |
346 | | - args: Command arguments |
347 | | - config: BenchmarkConfig |
348 | | -
|
349 | | - Returns: |
350 | | - Path to dataset file |
351 | | -
|
352 | | - Raises: |
353 | | - InputValidationError: If no dataset specified or file doesn't exist |
354 | | -
|
355 | | - TODO: Multi-dataset support |
356 | | - When implemented, this should: |
357 | | - 1. Return list[Path] for multiple datasets |
358 | | - 2. Validate all dataset paths exist |
359 | | - 3. Support dataset interleaving strategies |
360 | | - """ |
361 | | - if hasattr(args, "dataset") and args.dataset: |
362 | | - dataset_path = Path(args.dataset) |
363 | | - else: |
364 | | - # TODO: Multi-dataset - currently just picks single dataset |
365 | | - single_dataset = config.get_single_dataset() |
366 | | - if single_dataset: |
367 | | - dataset_path = Path(single_dataset.path) |
368 | | - else: |
369 | | - logger.error("Dataset required: --dataset PATH or specify in config") |
370 | | - raise InputValidationError( |
371 | | - "Dataset required: --dataset PATH or specify in config" |
372 | | - ) |
373 | | - |
374 | | - # Validate file exists |
375 | | - if not dataset_path.exists(): |
376 | | - logger.error(f"Dataset not found: {dataset_path}") |
377 | | - raise InputValidationError(f"Dataset not found: {dataset_path}") |
378 | | - |
379 | | - return dataset_path |
380 | | - |
381 | | - |
382 | 352 | def _run_benchmark( |
383 | 353 | config: BenchmarkConfig, |
384 | 354 | collect_responses: bool, |
@@ -498,33 +468,31 @@ def _run_benchmark( |
498 | 468 | "top_k": config.model_params.top_k, |
499 | 469 | "repetition_penalty": config.model_params.repetition_penalty, |
500 | 470 | } |
501 | | - accuracy_datasets = [ |
502 | | - DataLoaderFactory.create_loader(dataset, metadata=metadata) |
503 | | - for dataset in accuracy_configs |
504 | | - ] |
505 | 471 |
|
506 | 472 | # Pack the evaluation parameters for each accuracy dataset |
507 | | - for i in range(len(accuracy_configs)): |
508 | | - dataset = accuracy_configs[i] |
509 | | - extractor = Extractor.get(dataset.accuracy_config.extractor) |
510 | | - ground_truth_column = dataset.accuracy_config.ground_truth |
511 | | - scorer = PassAt1Scorer # currently only PassAt1Scorer is supported |
512 | | - # TODO add support for other scorers |
| 473 | + for acc_config in accuracy_configs: |
| 474 | + extractor = Extractor.get(acc_config.accuracy_config.extractor) |
| 475 | + ground_truth_column = acc_config.accuracy_config.ground_truth |
| 476 | + scorer = Scorer.get(acc_config.accuracy_config.eval_method) |
| 477 | + num_repeats = acc_config.accuracy_config.num_repeats |
| 478 | + dataset = DataLoaderFactory.create_loader( |
| 479 | + acc_config, metadata=metadata, num_repeats=num_repeats |
| 480 | + ) |
| 481 | + accuracy_datasets.append(dataset) |
513 | 482 | # TODO add tests and defaults |
514 | 483 | eval_configs.append( |
515 | | - ( |
| 484 | + AccuracyConfiguration( |
516 | 485 | scorer, |
517 | 486 | extractor, |
518 | | - dataset.name, |
519 | | - accuracy_datasets[i], |
| 487 | + acc_config.name, |
| 488 | + dataset, |
520 | 489 | config.report_dir, |
521 | 490 | ground_truth_column, |
| 491 | + num_repeats, |
522 | 492 | ) |
523 | 493 | ) |
524 | | - accuracy_datasets[i].load() |
525 | | - logger.info( |
526 | | - f"Loaded {accuracy_datasets[i]} - {accuracy_datasets[i].num_samples()} samples" |
527 | | - ) |
| 494 | + dataset.load() |
| 495 | + logger.info(f"Loaded {dataset} - {dataset.num_samples()} samples") |
528 | 496 |
|
529 | 497 | else: |
530 | 498 | logger.info("No accuracy datasets provided") |
@@ -659,31 +627,26 @@ def signal_handler(signum, frame): |
659 | 627 | # Always restore original handler |
660 | 628 | signal.signal(signal.SIGINT, old_handler) |
661 | 629 | accuracy_scores = {} |
662 | | - for ( |
663 | | - scorer, |
664 | | - extractor, |
665 | | - dataset_id, |
666 | | - dataset, |
667 | | - report_dir, |
668 | | - ground_truth_column, |
669 | | - ) in eval_configs: |
670 | | - scorer_instance = scorer( |
671 | | - dataset_id, |
672 | | - dataset, |
673 | | - report_dir, |
674 | | - extractor=extractor, |
675 | | - ground_truth_column=ground_truth_column, |
| 630 | + for eval_config in eval_configs: |
| 631 | + scorer_instance = eval_config.scorer( |
| 632 | + eval_config.dataset_name, |
| 633 | + eval_config.dataset, |
| 634 | + eval_config.report_dir, |
| 635 | + extractor=eval_config.extractor, |
| 636 | + ground_truth_column=eval_config.ground_truth_column, |
676 | 637 | ) |
677 | 638 | score, n_repeats = scorer_instance.score() |
678 | | - accuracy_scores[dataset_id] = { |
679 | | - "dataset_id": dataset_id, |
680 | | - "num_samples": len(dataset.data), |
681 | | - "extractor": extractor.__name__, |
682 | | - "ground_truth_column": ground_truth_column, |
| 639 | + accuracy_scores[eval_config.dataset_name] = { |
| 640 | + "dataset_name": eval_config.dataset_name, |
| 641 | + "num_samples": len(eval_config.dataset.data), |
| 642 | + "extractor": eval_config.extractor.__name__, |
| 643 | + "ground_truth_column": eval_config.ground_truth_column, |
683 | 644 | "score": score, |
684 | 645 | "n_repeats": n_repeats, |
685 | 646 | } |
686 | | - logger.info(f"Score for {dataset_id}: {score} ({n_repeats} repeats)") |
| 647 | + logger.info( |
| 648 | + f"Score for {eval_config.dataset_name}: {score} ({n_repeats} repeats)" |
| 649 | + ) |
687 | 650 |
|
688 | 651 | # Prefer authoritative metrics from the session report |
689 | 652 | report = getattr(sess, "report", None) |
|
0 commit comments