Align with latest modelbench and fix tests.

superdosh · superdosh · commit 44055cf136ab · 2025-12-09T14:10:51.000-05:00
diff --git a/src/modelplane/cli.py b/src/modelplane/cli.py
@@ -1,11 +1,10 @@
 from typing import List
 
 import click
+from modelgauge.data_schema import DEFAULT_ANNOTATION_SCHEMA
+from modelgauge.ensemble_strategies import ENSEMBLE_STRATEGIES
 
-from modelgauge.data_schema import DEFAULT_ANNOTATION_SCHEMA as ANNOTATION_SCHEMA
-from modelgauge.ensemble_annotator_set import ENSEMBLE_STRATEGIES
-
-from modelplane.runways.annotator import annotate, KNOWN_ENSEMBLES
+from modelplane.runways.annotator import annotate
 from modelplane.runways.lister import (
     list_annotators,
     list_ensemble_strategies,
@@ -152,13 +151,6 @@ def get_sut_responses(
     help="The ensemble strategy to use. If set, individual annotator results will be combined using the given strategy. "
     "Available strategies: " + ", ".join(list(ENSEMBLE_STRATEGIES.keys())),
 )
-@click.option(
-    "--ensemble_id",
-    type=str,
-    default=None,
-    help="Use a fixed ensemble id to use a predefined ensemble strategy. Options include: "
-    + ", ".join(list(KNOWN_ENSEMBLES.keys())),
-)
 @click.option(
     "--overwrite",
     is_flag=True,
@@ -204,12 +196,11 @@ def get_sut_responses(
 @load_from_dotenv
 def get_annotations(
     experiment: str,
+    annotator_id: List[str],
     dvc_repo: str | None = None,
     response_file: str | None = None,
     response_run_id: str | None = None,
-    annotator_id: List[str] | None = None,
     ensemble_strategy: str | None = None,
-    ensemble_id: str | None = None,
     overwrite: bool = False,
     disable_cache: bool = False,
     num_workers: int = 1,
@@ -225,7 +216,6 @@ def get_annotations(
         response_run_id=response_run_id,
         annotator_ids=annotator_id,
         ensemble_strategy=ensemble_strategy,
-        ensemble_id=ensemble_id,
         overwrite=overwrite,
         disable_cache=disable_cache,
         num_workers=num_workers,
@@ -285,8 +275,8 @@ def score_annotations(
     ground_truth: str,
     dvc_repo: str | None = None,
     sample_uid_col: str | None = None,
-    annotator_uid_col: str = ANNOTATION_SCHEMA.annotator_uid,
-    annotation_col: str = ANNOTATION_SCHEMA.annotation,
+    annotator_uid_col: str | None = DEFAULT_ANNOTATION_SCHEMA.annotator_uid,
+    annotation_col: str | None = DEFAULT_ANNOTATION_SCHEMA.annotation,
 ):
     return score(
         annotation_run_id=annotation_run_id,
diff --git a/src/modelplane/runways/annotator.py b/src/modelplane/runways/annotator.py
@@ -11,12 +11,18 @@
 from matplotlib import pyplot as plt
 from modelgauge.annotator import Annotator
 from modelgauge.annotator_registry import ANNOTATORS
-from modelgauge.annotator_set import AnnotatorSet
 from modelgauge.dataset import AnnotationDataset
-from modelgauge.ensemble_annotator_set import ENSEMBLE_STRATEGIES, EnsembleAnnotatorSet
+from modelgauge.ensemble_annotator import EnsembleAnnotator
+from modelgauge.ensemble_strategies import ENSEMBLE_STRATEGIES
 from modelgauge.pipeline_runner import build_runner
 
 from modelplane.mlflow.loghelpers import log_tags
+from modelplane.runways.data import (
+    Artifact,
+    BaseInput,
+    RunArtifacts,
+    build_and_log_input,
+)
 from modelplane.runways.utils import (
     CACHE_DIR,
     MODELGAUGE_RUN_TAG_NAME,
@@ -27,32 +33,16 @@
     is_debug_mode,
     setup_annotator_credentials,
 )
-from modelplane.runways.data import (
-    Artifact,
-    BaseInput,
-    RunArtifacts,
-    build_and_log_input,
-)
-
-KNOWN_ENSEMBLES: Dict[str, AnnotatorSet] = {}
-# try to load the private ensemble
-try:
-    from modelgauge.private_ensemble_annotator_set import PRIVATE_ANNOTATOR_SET
-
-    KNOWN_ENSEMBLES["official-1.0"] = PRIVATE_ANNOTATOR_SET
-except NotImplementedError:
-    pass
 
 
 def annotate(
     experiment: str,
+    annotator_ids: List[str],
     input_object: BaseInput | None = None,
     dvc_repo: str | None = None,
     response_file: str | None = None,
     response_run_id: str | None = None,
-    annotator_ids: List[str] | None = None,
     ensemble_strategy: str | None = None,
-    ensemble_id: str | None = None,
     overwrite: bool = False,
     disable_cache: bool = False,
     num_workers: int = 1,
@@ -65,9 +55,7 @@ def annotate(
     Run annotations and record measurements.
     """
     # this will set annotator_ids and optionally ensemble
-    pipeline_kwargs = _get_annotator_settings(
-        annotator_ids, ensemble_strategy, ensemble_id
-    )
+    pipeline_kwargs = _get_annotator_settings(annotator_ids, ensemble_strategy)
     if not disable_cache:
         pipeline_kwargs["cache_dir"] = CACHE_DIR
     pipeline_kwargs["num_workers"] = num_workers
@@ -83,8 +71,6 @@ def annotate(
     )
     if ensemble_strategy is not None:
         tags["ensemble_strategy"] = ensemble_strategy
-    if ensemble_id is not None:
-        tags["ensemble_id"] = ensemble_id
 
     experiment_id = get_experiment_id(experiment)
     if overwrite and response_run_id:
@@ -155,38 +141,26 @@ def annotate(
 
 
 def _get_annotator_settings(
-    annotator_ids: List[str] | None,
+    annotator_ids: List[str],
     ensemble_strategy: str | None,
-    ensemble_id: str | None,
 ) -> Dict[str, Any]:
 
     kwargs = {}
 
-    if not ((annotator_ids is not None) ^ (ensemble_id is not None)):
-        raise ValueError("Either annotator_ids or ensemble_id must be provided.")
-    if annotator_ids is not None:
-        kwargs["annotators"] = _get_annotators(annotator_ids)
-
-        if ensemble_strategy is not None:
-            if ensemble_strategy not in ENSEMBLE_STRATEGIES:
-                raise ValueError(
-                    f"Unknown ensemble strategy: {ensemble_strategy}. "
-                    f"Available strategies: {list(ENSEMBLE_STRATEGIES.keys())}"
-                )
-            kwargs["ensemble"] = EnsembleAnnotatorSet(
-                annotators=annotator_ids,
-                strategy=ENSEMBLE_STRATEGIES[ensemble_strategy],
-            )
-        return kwargs
-    else:
-        if ensemble_id not in KNOWN_ENSEMBLES:
+    kwargs["annotators"] = _get_annotators(annotator_ids)
+
+    if ensemble_strategy is not None:
+        if ensemble_strategy not in ENSEMBLE_STRATEGIES:
             raise ValueError(
-                f"Unknown ensemble_id: {ensemble_id}. "
-                f"Available strategies: {list(KNOWN_ENSEMBLES.keys())}"
+                f"Unknown ensemble strategy: {ensemble_strategy}. "
+                f"Available strategies: {list(ENSEMBLE_STRATEGIES.keys())}"
             )
-        kwargs["ensemble"] = KNOWN_ENSEMBLES[ensemble_id]
-        kwargs["annotators"] = _get_annotators(KNOWN_ENSEMBLES[ensemble_id].annotators)
-        return kwargs
+        kwargs["ensemble"] = EnsembleAnnotator(
+            uid="ensemble",
+            annotators=annotator_ids,
+            ensemble_strategy=ensemble_strategy,
+        )
+    return kwargs
 
 
 def _get_annotators(annotator_ids: List[str]) -> Dict[str, Annotator]:
diff --git a/src/modelplane/runways/lister.py b/src/modelplane/runways/lister.py
@@ -1,5 +1,5 @@
 from modelgauge.annotator_registry import ANNOTATORS
-from modelgauge.ensemble_annotator_set import ENSEMBLE_STRATEGIES
+from modelgauge.ensemble_strategies import ENSEMBLE_STRATEGIES
 from modelgauge.sut_registry import SUTS
 
 
diff --git a/tests/it/runways/half_safe_annotator.py b/tests/it/runways/half_safe_annotator.py
@@ -1,23 +1,18 @@
-import random
-
-from pydantic import BaseModel
-
-from modelgauge.annotator import CompletionAnnotator
+from modelgauge.annotation import SafetyAnnotation
+from modelgauge.annotator import Annotator
 from modelgauge.annotator_registry import ANNOTATORS
-from modelgauge.annotators.llama_guard_annotator import LlamaGuardAnnotation
 from modelgauge.prompt import ChatPrompt, TextPrompt
 from modelgauge.sut import SUTResponse
-from modelgauge.tests.safe_v1 import BaseSafeTestVersion1
+from pydantic import BaseModel
 
 
 class AnnotationRequest(BaseModel):
     prompt: ChatPrompt | TextPrompt
     response: SUTResponse
 
 
-class TestAnnotator(CompletionAnnotator[LlamaGuardAnnotation]):
-    """Simple implementation for demonstration. Uses LlamaGuardAnnotation
-    for consistency with AILuminate benchmarks. Every other item is marked safe.
+class TestAnnotator(Annotator):
+    """Simple implementation for demonstration.
 
     The methods to implement are based on the idea that annotate will
     potentially make a call to another system and the translators are helpful
@@ -34,21 +29,16 @@ def translate_prompt(self, prompt: ChatPrompt | TextPrompt, response: SUTRespons
             response=response,
         )
 
-    def annotate(self, annotation_request: AnnotationRequest) -> LlamaGuardAnnotation:
-        unsafe_categories = []
-        if not self.next_is_safe:
-            unsafe_categories = BaseSafeTestVersion1.hazards
-        annotation = LlamaGuardAnnotation(
+    def annotate(self, annotation_request: AnnotationRequest) -> SafetyAnnotation:
+        annotation = SafetyAnnotation(
             is_safe=self.next_is_safe,
-            is_safe_logprob=random.random(),
-            violation_categories=unsafe_categories,
         )
         self.next_is_safe = not self.next_is_safe
         return annotation
 
     def translate_response(
-        self, request: AnnotationRequest, response: LlamaGuardAnnotation
-    ) -> LlamaGuardAnnotation:
+        self, request: AnnotationRequest, response: SafetyAnnotation
+    ) -> SafetyAnnotation:
         return response
 
 
diff --git a/tests/unit/test_lister.py b/tests/unit/test_lister.py
@@ -1,9 +1,10 @@
+from modelgauge.ensemble_strategies import ENSEMBLE_STRATEGIES
+
 from modelplane.runways.lister import (
     list_annotators,
     list_ensemble_strategies,
     list_suts,
 )
-from modelgauge.ensemble_annotator_set import ENSEMBLE_STRATEGIES
 
 
 def test_list_annotators(capsys):
@@ -13,7 +14,7 @@ def test_list_annotators(capsys):
 
 
 def test_list_ensemble_strategies(capsys):
-    ENSEMBLE_STRATEGIES["demo_ensemble_strategy"] = "Demo Ensemble Strategy"
+    ENSEMBLE_STRATEGIES["demo_ensemble_strategy"] = ENSEMBLE_STRATEGIES["any_unsafe"]
     list_ensemble_strategies()
     output = capsys.readouterr().out.strip()
     assert "demo_ensemble_strategy" in output