diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 1e17b62..8712d71 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -26,8 +26,14 @@ jobs:
       - name: Install poetry
         run: pipx install "poetry == 1.8.5"
 
+      - name: Remove existing virtual environment
+        run: |
+          poetry env remove python || true
+
       - name: Install dependencies
-        run: poetry install --with test --no-cache
+        run: |
+          poetry cache clear --no-interaction --all .
+          poetry install --with test --no-cache
 
       - name: Run all tests
         run: MLFLOW_TRACKING_URI=http://localhost:8080 poetry run pytest
diff --git a/flightpaths/data/fakegroundtruth.csv b/flightpaths/data/fakegroundtruth.csv
index 070c361..046bda3 100644
--- a/flightpaths/data/fakegroundtruth.csv
+++ b/flightpaths/data/fakegroundtruth.csv
@@ -1,4 +1,4 @@
-UID,SUT,is_safe
+prompt_uid,sut_uid,is_safe
 airr_practice_1_0_156733,demo_yes_no,unsafe
 airr_practice_1_0_91247,demo_yes_no,safe
 airr_practice_1_0_91240,demo_yes_no,unsafe
diff --git a/poetry.lock b/poetry.lock
index d4f235c..ae7fd9d 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -4291,25 +4291,25 @@ typing-extensions = "^4.10.0"
 zstandard = {version = "^0.23.0", extras = ["cffi"]}
 
 [package.extras]
-all-plugins = ["modelgauge_amazon", "modelgauge_anthropic", "modelgauge_azure", "modelgauge_baseten", "modelgauge_demo_plugin", "modelgauge_google", "modelgauge_huggingface", "modelgauge_mistral", "modelgauge_nvidia", "modelgauge_openai", "modelgauge_perspective_api", "modelgauge_vertexai"]
-amazon = ["modelgauge_amazon"]
-anthropic = ["modelgauge_anthropic"]
-azure = ["modelgauge_azure"]
-baseten = ["modelgauge_baseten"]
-demo = ["modelgauge_demo_plugin"]
-google = ["modelgauge_google"]
-huggingface = ["modelgauge_huggingface"]
-mistral = ["modelgauge_mistral"]
-nvidia = ["modelgauge_nvidia"]
-openai = ["modelgauge_openai"]
-perspective-api = ["modelgauge_perspective_api"]
-vertexai = ["modelgauge_vertexai"]
+all-plugins = ["modelgauge_amazon @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/amazon", "modelgauge_anthropic @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/anthropic", "modelgauge_azure @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/azure", "modelgauge_baseten @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/baseten", "modelgauge_demo_plugin @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/demo_plugin", "modelgauge_google @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/google", "modelgauge_huggingface @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/huggingface", "modelgauge_mistral @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/mistral", "modelgauge_nvidia @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/nvidia", "modelgauge_openai @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/openai", "modelgauge_perspective_api @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/perspective_api", "modelgauge_vertexai @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/vertexai"]
+amazon = ["modelgauge_amazon @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/amazon"]
+anthropic = ["modelgauge_anthropic @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/anthropic"]
+azure = ["modelgauge_azure @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/azure"]
+baseten = ["modelgauge_baseten @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/baseten"]
+demo = ["modelgauge_demo_plugin @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/demo_plugin"]
+google = ["modelgauge_google @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/google"]
+huggingface = ["modelgauge_huggingface @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/huggingface"]
+mistral = ["modelgauge_mistral @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/mistral"]
+nvidia = ["modelgauge_nvidia @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/nvidia"]
+openai = ["modelgauge_openai @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/openai"]
+perspective-api = ["modelgauge_perspective_api @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/perspective_api"]
+vertexai = ["modelgauge_vertexai @ file:///Users/Barbara_1/MLCommons/modelplane/.venv/src/modelbench/plugins/vertexai"]
 
 [package.source]
 type = "git"
 url = "https://github.com/mlcommons/modelbench.git"
-reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
-resolved_reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
+reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
+resolved_reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
 
 [[package]]
 name = "modelbench-private"
@@ -4327,8 +4327,8 @@ modelbench = []
 [package.source]
 type = "git"
 url = "git@github.com:mlcommons/modelbench-private.git"
-reference = "86d11ce8a15a813d4134cbfb573ac224fea5fd75"
-resolved_reference = "86d11ce8a15a813d4134cbfb573ac224fea5fd75"
+reference = "982973bd51e6c7ebb1bebaaa1eeb6a297d871bb3"
+resolved_reference = "982973bd51e6c7ebb1bebaaa1eeb6a297d871bb3"
 
 [[package]]
 name = "modelgauge-amazon"
@@ -4345,8 +4345,8 @@ boto3 = "^1.36.25"
 [package.source]
 type = "git"
 url = "https://github.com/mlcommons/modelbench.git"
-reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
-resolved_reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
+reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
+resolved_reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
 subdirectory = "plugins/amazon"
 
 [[package]]
@@ -4365,8 +4365,8 @@ modelgauge_openai = "*"
 [package.source]
 type = "git"
 url = "https://github.com/mlcommons/modelbench.git"
-reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
-resolved_reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
+reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
+resolved_reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
 subdirectory = "plugins/anthropic"
 
 [[package]]
@@ -4384,8 +4384,8 @@ azure-ai-ml = "^1.22"
 [package.source]
 type = "git"
 url = "https://github.com/mlcommons/modelbench.git"
-reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
-resolved_reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
+reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
+resolved_reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
 subdirectory = "plugins/azure"
 
 [[package]]
@@ -4400,8 +4400,8 @@ develop = false
 [package.source]
 type = "git"
 url = "https://github.com/mlcommons/modelbench.git"
-reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
-resolved_reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
+reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
+resolved_reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
 subdirectory = "plugins/baseten"
 
 [[package]]
@@ -4416,8 +4416,8 @@ develop = false
 [package.source]
 type = "git"
 url = "https://github.com/mlcommons/modelbench.git"
-reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
-resolved_reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
+reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
+resolved_reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
 subdirectory = "demo_plugin"
 
 [[package]]
@@ -4436,8 +4436,8 @@ google-generativeai = "^0.8.0"
 [package.source]
 type = "git"
 url = "https://github.com/mlcommons/modelbench.git"
-reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
-resolved_reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
+reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
+resolved_reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
 subdirectory = "plugins/google"
 
 [[package]]
@@ -4455,8 +4455,8 @@ huggingface-hub = "^0.30.2"
 [package.source]
 type = "git"
 url = "https://github.com/mlcommons/modelbench.git"
-reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
-resolved_reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
+reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
+resolved_reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
 subdirectory = "plugins/huggingface"
 
 [[package]]
@@ -4475,8 +4475,8 @@ typing-inspect = "^0.9.0"
 [package.source]
 type = "git"
 url = "https://github.com/mlcommons/modelbench.git"
-reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
-resolved_reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
+reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
+resolved_reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
 subdirectory = "plugins/mistral"
 
 [[package]]
@@ -4494,8 +4494,8 @@ openai = "^1.8.0"
 [package.source]
 type = "git"
 url = "https://github.com/mlcommons/modelbench.git"
-reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
-resolved_reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
+reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
+resolved_reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
 subdirectory = "plugins/nvidia"
 
 [[package]]
@@ -4513,8 +4513,8 @@ openai = "^1.8.0"
 [package.source]
 type = "git"
 url = "https://github.com/mlcommons/modelbench.git"
-reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
-resolved_reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
+reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
+resolved_reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
 subdirectory = "plugins/openai"
 
 [[package]]
@@ -4532,8 +4532,8 @@ google-api-python-client = ">=2.64.0,<2.65.0"
 [package.source]
 type = "git"
 url = "https://github.com/mlcommons/modelbench.git"
-reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
-resolved_reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
+reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
+resolved_reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
 subdirectory = "plugins/perspective_api"
 
 [[package]]
@@ -4551,8 +4551,8 @@ google-auth = "^2.36.0"
 [package.source]
 type = "git"
 url = "https://github.com/mlcommons/modelbench.git"
-reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
-resolved_reference = "2b19f4f79dfb51fe3db8d9d11af12beccc749844"
+reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
+resolved_reference = "afe1cbfa42eebe3363440d1ab5841c007384e244"
 subdirectory = "plugins/vertexai"
 
 [[package]]
@@ -8336,4 +8336,4 @@ modelbench-private = ["modelbench-private"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,!=3.12.5,<3.13"
-content-hash = "1c0e5d7e80172c80988a09c7da387f20bd0248accd5fcd66d495dfea6b6c64a3"
+content-hash = "e7dd29826732acdbba3c59daba48f8d981ebaf1552d2b1a86c78ba631dbbbd12"
diff --git a/pyproject.toml b/pyproject.toml
index 68cf468..cf2f3d5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@ packages = [
 python = ">=3.10,!=3.12.5,<3.13"
 click = "^8"
 dvc = {extras = ["gs"], version = "^3.60"}
-modelbench = {git = "https://github.com/mlcommons/modelbench.git", rev = "2b19f4f79dfb51fe3db8d9d11af12beccc749844" }
+modelbench = {git = "https://github.com/mlcommons/modelbench.git", rev = "afe1cbfa42eebe3363440d1ab5841c007384e244" }
 mlflow = "^3.1.1"
 python-dotenv = "^1"
 requests = "^2"
@@ -25,19 +25,19 @@ jupyter = "^1"
 scikit-learn = "^1.5.0"
 pandas = "^2.2.2"
 # plugins (would like to figure out a better way to manage these)
-modelgauge_anthropic = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/anthropic", rev = "2b19f4f79dfb51fe3db8d9d11af12beccc749844" }
-modelgauge-azure = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/azure", rev = "2b19f4f79dfb51fe3db8d9d11af12beccc749844" }
-modelgauge_baseten = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/baseten", rev = "2b19f4f79dfb51fe3db8d9d11af12beccc749844" }
-modelgauge_demo_plugin = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "demo_plugin", rev = "2b19f4f79dfb51fe3db8d9d11af12beccc749844" }
-modelgauge_nvidia = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/nvidia", rev = "2b19f4f79dfb51fe3db8d9d11af12beccc749844" }
-modelgauge_openai = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/openai", rev = "2b19f4f79dfb51fe3db8d9d11af12beccc749844" }
-modelgauge_huggingface = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/huggingface", rev = "2b19f4f79dfb51fe3db8d9d11af12beccc749844" }
-modelgauge_perspective_api = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/perspective_api", rev = "2b19f4f79dfb51fe3db8d9d11af12beccc749844" }
-modelgauge_google = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/google", rev = "2b19f4f79dfb51fe3db8d9d11af12beccc749844" }
-modelgauge_vertexai = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/vertexai", rev = "2b19f4f79dfb51fe3db8d9d11af12beccc749844" }
-modelgauge_mistral = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/mistral", rev = "2b19f4f79dfb51fe3db8d9d11af12beccc749844" }
-modelgauge_amazon = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/amazon", rev = "2b19f4f79dfb51fe3db8d9d11af12beccc749844" }
-modelbench-private = { git = "git@github.com:mlcommons/modelbench-private.git", rev = "86d11ce8a15a813d4134cbfb573ac224fea5fd75", optional = true }
+modelgauge_anthropic = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/anthropic", rev = "afe1cbfa42eebe3363440d1ab5841c007384e244" }
+modelgauge-azure = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/azure", rev = "afe1cbfa42eebe3363440d1ab5841c007384e244" }
+modelgauge_baseten = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/baseten", rev = "afe1cbfa42eebe3363440d1ab5841c007384e244" }
+modelgauge_demo_plugin = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "demo_plugin", rev = "afe1cbfa42eebe3363440d1ab5841c007384e244" }
+modelgauge_nvidia = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/nvidia", rev = "afe1cbfa42eebe3363440d1ab5841c007384e244" }
+modelgauge_openai = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/openai", rev = "afe1cbfa42eebe3363440d1ab5841c007384e244" }
+modelgauge_huggingface = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/huggingface", rev = "afe1cbfa42eebe3363440d1ab5841c007384e244" }
+modelgauge_perspective_api = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/perspective_api", rev = "afe1cbfa42eebe3363440d1ab5841c007384e244" }
+modelgauge_google = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/google", rev = "afe1cbfa42eebe3363440d1ab5841c007384e244" }
+modelgauge_vertexai = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/vertexai", rev = "afe1cbfa42eebe3363440d1ab5841c007384e244" }
+modelgauge_mistral = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/mistral", rev = "afe1cbfa42eebe3363440d1ab5841c007384e244" }
+modelgauge_amazon = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/amazon", rev = "afe1cbfa42eebe3363440d1ab5841c007384e244" }
+modelbench-private = { git = "git@github.com:mlcommons/modelbench-private.git", rev = "982973bd51e6c7ebb1bebaaa1eeb6a297d871bb3", optional = true }
 
 [tool.poetry.extras]
 modelbench-private = ["modelbench-private"]
diff --git a/src/modelplane/runways/annotator.py b/src/modelplane/runways/annotator.py
index 327a660..cb41a8d 100644
--- a/src/modelplane/runways/annotator.py
+++ b/src/modelplane/runways/annotator.py
@@ -1,23 +1,18 @@
-"""Runway for annotating responses from SUTs.
-
-TODO: PROMPT_CSV_INPUT_COLUMNS / ANNOTATOR_CSV_INPUT_COLUMNS should be aligned
-"""
+"""Runway for annotating responses from SUTs."""
 
 import collections
-import csv
 import os
 import pathlib
 import tempfile
 from typing import Any, Dict, List
 
-import jsonlines
 import mlflow
 import numpy as np
 from matplotlib import pyplot as plt
-from modelgauge.annotation_pipeline import ANNOTATOR_CSV_INPUT_COLUMNS
 from modelgauge.annotator import Annotator
 from modelgauge.annotator_registry import ANNOTATORS
 from modelgauge.annotator_set import AnnotatorSet
+from modelgauge.dataset import AnnotationDataset
 from modelgauge.ensemble_annotator_set import ENSEMBLE_STRATEGIES, EnsembleAnnotatorSet
 from modelgauge.pipeline_runner import build_runner
 
@@ -105,8 +100,7 @@ def annotate(
             dest_dir=tmp,
         )
         input_data.log_input()
-        # TODO: maybe the transformation should be handled by the dataset class?
-        input_path = transform_annotation_file(src=input_data.local_path(), dest_dir=tmp)  # type: ignore
+        input_path = input_data.local_path()  # type: ignore
         pipeline_kwargs["input_path"] = pathlib.Path(input_path)
         pipeline_kwargs["output_dir"] = pathlib.Path(tmp)
         pipeline_runner = build_runner(**pipeline_kwargs)
@@ -182,38 +176,6 @@ def _get_annotators(annotator_ids: List[str]) -> Dict[str, Annotator]:
     return annotators
 
 
-def transform_annotation_file(src: str, dest_dir: str) -> str:
-    transformed_suffix = "transformed.csv"
-    # rename columns
-    with open(src, "r") as f:
-        reader = csv.DictReader(f)
-        fieldnames = reader.fieldnames
-        if (
-            fieldnames is None
-            or len(fieldnames) < 3
-            or fieldnames[0] != "UID"
-            or fieldnames[1] != "Text"
-        ):
-            raise ValueError(
-                "Invalid input file. Must have columns: UID, Text, sut_id1, sut_id1."
-            )
-        suts = fieldnames[2:]
-        with open(os.path.join(dest_dir, transformed_suffix), "w") as f_out:
-            writer = csv.DictWriter(f_out, fieldnames=ANNOTATOR_CSV_INPUT_COLUMNS)
-            writer.writeheader()
-            for row in reader:
-                for sut in suts:
-                    writer.writerow(
-                        {
-                            "UID": row["UID"],
-                            "Prompt": row["Text"],
-                            "SUT": sut,
-                            "Response": row[sut],
-                        }
-                    )
-    return os.path.join(dest_dir, transformed_suffix)
-
-
 def log_safety_summary(
     annotator_uids: List[str],
     data_path: str,
@@ -223,35 +185,27 @@
     total_safe = collections.Counter()
     total = collections.Counter()
 
-    with open(data_path, "r") as f:
-        reader = jsonlines.Reader(f)
-        for row in reader:
-            for annotator_uid in annotator_uids:
-                if (
-                    annotator_uid not in row["Annotations"]
-                    or "is_safe" not in row["Annotations"][annotator_uid]
-                ):
-                    continue
-                is_safe = row["Annotations"][annotator_uid]["is_safe"]
-                if "logprobs" in row["Annotations"][annotator_uid]:
-                    all_log_probs[annotator_uid].append(
-                        row["Annotations"][annotator_uid]["logprobs"]
-                    )
-
-                total[annotator_uid] += 1
-                if is_safe:
-                    total_safe[annotator_uid] += 1
-
-    for annotator_uid in annotator_uids:
-        mlflow.log_metric(f"{annotator_uid}_total_safe", total_safe[annotator_uid])
-        mlflow.log_metric(f"{annotator_uid}_total_count", total[annotator_uid])
-        # TODO: the format for the log probs isn't always the same
-        # in particular, the private ensemble uses a different format
-        try:
-            log_stats(f"{annotator_uid}_logprobs_", all_log_probs[annotator_uid])
-            log_hist(dir, f"{annotator_uid}", all_log_probs[annotator_uid])
-        except Exception as e:
-            print(f"Failed to log stats for {annotator_uid}: {e}")
+    with AnnotationDataset(data_path, "r") as dataset:
+        for item in dataset:
+            if "is_safe" not in item.annotation:
+                continue
+            is_safe = item.annotation["is_safe"]
+            if "logprobs" in item.annotation:
+                all_log_probs[item.annotator_uid].append(item.annotation["logprobs"])
+            total[item.annotator_uid] += 1
+            if is_safe:
+                total_safe[item.annotator_uid] += 1
+    # Log metrics.
+    for annotator_uid in annotator_uids:
+        mlflow.log_metric(f"{annotator_uid}_total_safe", total_safe[annotator_uid])
+        mlflow.log_metric(f"{annotator_uid}_total_count", total[annotator_uid])
+        # TODO: the format for the log probs isn't always the same
+        # in particular, the private ensemble uses a different format
+        try:
+            log_stats(f"{annotator_uid}_logprobs_", all_log_probs[annotator_uid])
+            log_hist(dir, f"{annotator_uid}", all_log_probs[annotator_uid])
+        except Exception as e:
+            print(f"Failed to log stats for {annotator_uid}: {e}")
 
 
 def log_stats(tag_prefix, values):
diff --git a/src/modelplane/runways/scorer.py b/src/modelplane/runways/scorer.py
index 50e6436..be1f21f 100644
--- a/src/modelplane/runways/scorer.py
+++ b/src/modelplane/runways/scorer.py
@@ -10,6 +10,8 @@ import pandas as pd
 from sklearn import metrics
 
+from modelgauge.data_schema import DEFAULT_ANNOTATION_SCHEMA as ANNOTATION_SCHEMA
+
 from modelplane.mlflow.loghelpers import log_tags
 from modelplane.runways.utils import (
     ANNOTATION_RESPONSE_ARTIFACT_NAME,
@@ -78,15 +80,20 @@
 def score_annotator(
     annotator: str, annotations_df: pd.DataFrame, ground_truth_df: pd.DataFrame
 ):
     """Score an annotator's predictions against ground truth."""
+    # Filter DF for this annotator
+    annotations_df = annotations_df[annotations_df[ANNOTATION_SCHEMA.annotator_uid] == annotator]
+    assert annotations_df[
+        "sample_id"
+    ].is_unique, f"Error: prompt UID x SUT combination is not unique. There should be one SUT response per prompt."
     # Get matching samples between ground truth and annotations
-    samples = ground_truth_df["Sample_ID"]
+    samples = ground_truth_df["sample_id"]
     # Filter annotations to only include samples that exist in ground truth
-    matching_annotations = annotations_df[annotations_df["Sample_ID"].isin(samples)]
+    matching_annotations = annotations_df[annotations_df["sample_id"].isin(samples)]
     # Ensure ground truth is aligned with annotations
     matching_ground_truth = ground_truth_df[
-        ground_truth_df["Sample_ID"].isin(matching_annotations["Sample_ID"])
+        ground_truth_df["sample_id"].isin(matching_annotations["sample_id"])
     ]
 
    # TODO: What happens if not all ground truth samples are annotated? Proceed with scoring or no?
     assert len(
@@ -95,13 +102,11 @@
     ), f"No sample overlap found between {annotator} and ground truth."
 
     # Sort both dataframes by Sample_ID to ensure alignment
-    matching_annotations = matching_annotations.sort_values("Sample_ID")
-    matching_ground_truth = matching_ground_truth.sort_values("Sample_ID")
+    matching_annotations = matching_annotations.sort_values("sample_id")
+    matching_ground_truth = matching_ground_truth.sort_values("sample_id")
 
     # Get predictions and ground truth for unsafe classification.
-    y_pred_is_unsafe = matching_annotations[
-        f"Annotations_{annotator}_is_unsafe"
-    ].astype(bool)
+    y_pred_is_unsafe = matching_annotations["is_unsafe"].astype(bool)
     y_true_is_unsafe = matching_ground_truth["is_unsafe"].astype(bool)
 
     confusion_matrix = metrics.confusion_matrix(y_true_is_unsafe, y_pred_is_unsafe)
@@ -129,18 +134,11 @@
 def transform_mlflow_annotator_artifact(path: Path) -> tuple[list, pd.DataFrame]:
     """Transform annotator artifact into format for data analysis.
     Returns: list of annotator uids, dataframe
-    TODO: Save CSV as artifact (either here or in annotate step).
     """
-    with open(path, "r") as f:
-        data = [json.loads(line) for line in f]
-
-    annotators = data[0]["Annotations"].keys()
-
-    df = pd.json_normalize(data, sep="_")  # Flatten nested JSON structure.
-    label_cols = [col for col in df.columns if col.endswith("_is_safe")]
-
-    return annotators, format_df(df, label_cols)
-
+    df = pd.read_csv(path)
+    df["is_safe"] = df[ANNOTATION_SCHEMA.annotation].apply(lambda x: json.loads(x)["is_safe"])
+    annotators = list(df[ANNOTATION_SCHEMA.annotator_uid].unique())
+    return annotators, format_df(df, ["is_safe"])
 
 def ground_truth_to_df(file):
     if not os.path.exists(file):
@@ -155,17 +153,14 @@
 def format_df(df: pd.DataFrame, label_cols: list[str]) -> pd.DataFrame:
     """Validate and format dataframe to standardized schema for scoring."""
     assert len(label_cols) > 0, "No label columns provided"
-    expected_cols = ["UID", "SUT"] + label_cols
+    expected_cols = [ANNOTATION_SCHEMA.prompt_uid, ANNOTATION_SCHEMA.sut_uid] + label_cols
     missing_cols = [col for col in expected_cols if col not in df.columns]
     assert (
         len(missing_cols) == 0
     ), f"Expected columns {expected_cols}, but missing {missing_cols}."
 
-    # Add unique Sample_ID column.
-    df["Sample_ID"] = df["UID"].astype(str) + "_" + df["SUT"].astype(str)
-    assert df[
-        "Sample_ID"
-    ].is_unique, f"Error: prompt UID x SUT combination is not unique. There should be one SUT response per prompt."
+    # Add unique sample_id column.
+    df["sample_id"] = df[ANNOTATION_SCHEMA.prompt_uid].astype(str) + "_" + df[ANNOTATION_SCHEMA.sut_uid].astype(str)
 
     # Create new columns where unsafe is 1 and safe is 0.
     for col in label_cols:
diff --git a/src/modelplane/runways/utils.py b/src/modelplane/runways/utils.py
index f443392..d7c324a 100644
--- a/src/modelplane/runways/utils.py
+++ b/src/modelplane/runways/utils.py
@@ -16,7 +16,7 @@ SECRETS_PATH_ENV = "MODEL_SECRETS_PATH"
 DEBUG_MODE_ENV = "MODELPLANE_DEBUG_MODE"
 
 PROMPT_RESPONSE_ARTIFACT_NAME = "prompt-responses.csv"
-ANNOTATION_RESPONSE_ARTIFACT_NAME = "annotations.jsonl"
+ANNOTATION_RESPONSE_ARTIFACT_NAME = "annotations.csv"
 RUN_TYPE_TAG_NAME = "type"
 RUN_TYPE_RESPONDER = "get-sut-responses"
 RUN_TYPE_ANNOTATOR = "annotate"
diff --git a/src/modelplane/utils/input.py b/src/modelplane/utils/input.py
index 03e4ce3..a97ce46 100644
--- a/src/modelplane/utils/input.py
+++ b/src/modelplane/utils/input.py
@@ -46,7 +46,7 @@ class DVCInput(BaseInput):
 
     def __init__(self, path: str, repo: str, dest_dir: str):
         self.path = path
-        self.rev = "barbara-test"
+        self.rev = "main"
         self.url = dvc.api.get_url(path, repo=repo, rev=self.rev)  # For logging.
         self._local_path = self._download_dvc_file(path, repo, dest_dir)
 
diff --git a/tests/data/ground_truth.csv b/tests/data/ground_truth.csv
index 4a91966..7936907 100644
--- a/tests/data/ground_truth.csv
+++ b/tests/data/ground_truth.csv
@@ -1,4 +1,4 @@
-UID,SUT,is_safe
+prompt_uid,sut_uid,is_safe
 0,demo_yes_no,safe
 1,demo_yes_no,safe
 2,demo_yes_no,safe
diff --git a/tests/data/prompts-responses.csv b/tests/data/prompts-responses.csv
index f064d14..9c79c65 100644
--- a/tests/data/prompts-responses.csv
+++ b/tests/data/prompts-responses.csv
@@ -1,4 +1,4 @@
-UID,Text,sut
+prompt_uid,prompt_text,sut_response
 0,"What is the capital of France?","Paris"
 1,"Explain the process of photosynthesis.","No"
 2,"Write a Python function to reverse a string.","Why would you do that"
diff --git a/tests/it/runways/random_annotator.py b/tests/it/runways/half_safe_annotator.py
similarity index 73%
rename from tests/it/runways/random_annotator.py
rename to tests/it/runways/half_safe_annotator.py
index b014f47..b679227 100644
--- a/tests/it/runways/random_annotator.py
+++ b/tests/it/runways/half_safe_annotator.py
@@ -16,30 +16,32 @@ class AnnotationRequest(BaseModel):
 
 
 class TestAnnotator(CompletionAnnotator[LlamaGuardAnnotation]):
-    """Simple random implementation for demonstration. Uses LlamaGuardAnnotation
-    for consistency with AILuminate benchmarks.
+    """Simple implementation for demonstration. Uses LlamaGuardAnnotation
+    for consistency with AILuminate benchmarks. Every other item is marked safe.
 
     The methods to implement are based on the idea that annotate will
     potentially make a call to another system and the translators are helpful
     in preparing that call and converting the response to the needed annotation.
""" - # Probability of unsafe per category - PROB_UNSAFE = 0.01 + def __init__(self, uid): + super().__init__(uid) + self.next_is_safe = True def translate_request(self, test_item: TestItem, response: SUTResponse): return AnnotationRequest(test_item=test_item, response=response) def annotate(self, annotation_request: AnnotationRequest) -> LlamaGuardAnnotation: unsafe_categories = [] - for category in BaseSafeTestVersion1.hazards: - if random.random() < self.PROB_UNSAFE: - unsafe_categories.append(category) - return LlamaGuardAnnotation( - is_safe=not unsafe_categories, + if not self.next_is_safe: + unsafe_categories = BaseSafeTestVersion1.hazards + annotation = LlamaGuardAnnotation( + is_safe=self.next_is_safe, is_safe_logprob=random.random(), violation_categories=unsafe_categories, ) + self.next_is_safe = not self.next_is_safe + return annotation def translate_response( self, request: AnnotationRequest, response: LlamaGuardAnnotation @@ -47,6 +49,5 @@ def translate_response( return response -random.seed(42) # For reproducibility in tests TEST_ANNOTATOR_ID = "test_annotator" ANNOTATORS.register(TestAnnotator, TEST_ANNOTATOR_ID) diff --git a/tests/it/runways/test_e2e.py b/tests/it/runways/test_e2e.py index b78468f..6561dea 100644 --- a/tests/it/runways/test_e2e.py +++ b/tests/it/runways/test_e2e.py @@ -10,7 +10,7 @@ from modelplane.runways.responder import respond from modelplane.runways.scorer import score from modelplane.runways.utils import PROMPT_RESPONSE_ARTIFACT_NAME -from random_annotator import TEST_ANNOTATOR_ID +from half_safe_annotator import TEST_ANNOTATOR_ID def test_e2e(): @@ -86,11 +86,12 @@ def check_responder( responses = list(reader) assert len(responses) == 10 for response in responses: - expected = "no" if len(response["Text"].split()) % 2 else "yes" - yesno = response[sut_id] + assert response["sut_uid"] == sut_id + expected = "no" if len(response["prompt_text"].split()) % 2 else "yes" + yesno = response["sut_response"] assert ( yesno.lower() == expected - ), f"Unexpectedly got '{yesno} for prompt '{response['Text']}'" + ), f"Unexpectedly got '{yesno} for prompt '{response['prompt_text']}'" return run_id @@ -123,19 +124,20 @@ def check_annotator( assert params.get("n_jobs") == str(n_jobs) assert tags.get(f"annotator_{TEST_ANNOTATOR_ID}") == "true" - # expect 8 safe based on seed + # expect 5 safe (every other item) assert ( metrics.get(f"{TEST_ANNOTATOR_ID}_total_count") == 10 ), "Expected total_count to be 10" assert ( - metrics.get(f"{TEST_ANNOTATOR_ID}_total_safe") == 8 - ), "Expected total_safe to be 8" + metrics.get(f"{TEST_ANNOTATOR_ID}_total_safe") == 5 + ), "Expected total_safe to be 5" - # confirm annotations.jsonl exists + # confirm annotations.csv exists artifacts = mlflow.artifacts.list_artifacts(run_id=run_id) assert any( - artifact.path == "annotations.jsonl" for artifact in artifacts - ), "Expected 'annotations.jsonl' artifact not found in run" + artifact.path == "annotations.csv" for artifact in artifacts + ), "Expected 'annotations.csv' artifact not found in run" + # TODO: validate annotations.csv return run_id