diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b5dbf1f..ef0d7b2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -38,11 +38,10 @@ jobs: run: | poetry env remove python || true - # TODO: get rid of the POETRY_INSTALLER_MAX_WORKERS env var once we get rid of plugins - name: Install dependencies run: | poetry cache clear --no-interaction --all . - POETRY_INSTALLER_MAX_WORKERS=1 poetry install --with test --no-cache + poetry install --with test --no-cache - name: Run all tests run: MLFLOW_TRACKING_URI=http://localhost:8080 poetry run pytest diff --git a/flightpaths/Annotator Development Template.ipynb b/flightpaths/Annotator Development Template.ipynb index 7587259..9019f45 100644 --- a/flightpaths/Annotator Development Template.ipynb +++ b/flightpaths/Annotator Development Template.ipynb @@ -147,7 +147,7 @@ "source": [ "## Run the model\n", "\n", - "This step will get responses to the prompts from the given SUT.\n", + "This step will get responses to the prompts from the given SUT. You can optionally pass arguments `prompt_uid_col` and `prompt_text_col` if your prompts dataset has different column names than the default ones.\n", "\n", "Save this run_id to avoid having to re-run the model later. The results are saved as an artifact in mlflow.\n", "\n", diff --git a/poetry.lock b/poetry.lock index a20fcf0..04c1c78 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4305,16 +4305,24 @@ files = [] develop = false [package.dependencies] +anthropic = "*" +azure-ai-ml = "^1.22" +boto3 = "^1.36.25" casefy = "^1.0.0" click = "^8.1.7" diskcache = "^5.6.3" fastapi = "^0.115.0" gdown = ">=5.1.0" +google-api-python-client = ">=2.64.0,<2.65.0" +google-auth = "^2.36.0" +google-genai = "^1.17.0" +google-generativeai = "^0.8.0" huggingface-hub = "^0.30.2" jinja2 = "^3.1.3" jq = "^1.6.0" jsonlines = "^4.0.0" llama-api-client = "^0.1.1" +mistralai = "1.6.0" openai = "^1.8.0" pip = ">=24,<26" prometheus-client = "^0.21.1" @@ -4332,26 +4340,14 @@ tomli = "^2.0.1" tqdm = ">=4.66.1" types-tqdm = "^4.66.0.0" typing-extensions = "^4.10.0" +typing-inspect = "^0.9.0" zstandard = {version = "^0.23.0", extras = ["cffi"]} -[package.extras] -all-plugins = ["modelgauge_amazon @ file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/plugins/amazon", "modelgauge_anthropic @ file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/plugins/anthropic", "modelgauge_azure @ file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/plugins/azure", "modelgauge_baseten @ file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/plugins/baseten", "modelgauge_demo_plugin @ file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/demo_plugin", "modelgauge_google @ file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/plugins/google", "modelgauge_mistral @ file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/plugins/mistral", "modelgauge_nvidia @ file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/plugins/nvidia", "modelgauge_perspective_api @ file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/plugins/perspective_api", "modelgauge_vertexai @ 
file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/plugins/vertexai"] -amazon = ["modelgauge_amazon @ file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/plugins/amazon"] -anthropic = ["modelgauge_anthropic @ file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/plugins/anthropic"] -azure = ["modelgauge_azure @ file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/plugins/azure"] -baseten = ["modelgauge_baseten @ file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/plugins/baseten"] -demo = ["modelgauge_demo_plugin @ file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/demo_plugin"] -google = ["modelgauge_google @ file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/plugins/google"] -mistral = ["modelgauge_mistral @ file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/plugins/mistral"] -nvidia = ["modelgauge_nvidia @ file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/plugins/nvidia"] -perspective-api = ["modelgauge_perspective_api @ file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/plugins/perspective_api"] -vertexai = ["modelgauge_vertexai @ file:///Users/vishal/Library/Caches/pypoetry/virtualenvs/modelplane-8xT__OfZ-py3.12/src/modelbench/plugins/vertexai"] - [package.source] type = "git" url = "https://github.com/mlcommons/modelbench.git" -reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -resolved_reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" +reference = "a71a902ef2e503569696ea76d56ac40fad9ee97b" +resolved_reference = "a71a902ef2e503569696ea76d56ac40fad9ee97b" [[package]] name = "modelbench-private" @@ -4362,200 +4358,11 @@ python-versions = ">=3.10,!=3.12.5,<3.13" files = [] develop = false -[package.extras] -all-plugins = [] - [package.source] type = "git" url = "git@github.com:mlcommons/modelbench-private.git" -reference = "ecac4cfd343411c8011c0bba875d638fc22e9478" -resolved_reference = "ecac4cfd343411c8011c0bba875d638fc22e9478" - -[[package]] -name = "modelgauge-amazon" -version = "1.0.0" -description = "" -optional = false -python-versions = "^3.10" -files = [] -develop = false - -[package.dependencies] -boto3 = "^1.36.25" - -[package.source] -type = "git" -url = "https://github.com/mlcommons/modelbench.git" -reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -resolved_reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -subdirectory = "plugins/amazon" - -[[package]] -name = "modelgauge-anthropic" -version = "1.0.0" -description = "" -optional = false -python-versions = "^3.10" -files = [] -develop = false - -[package.dependencies] -anthropic = "*" - -[package.source] -type = "git" -url = "https://github.com/mlcommons/modelbench.git" -reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -resolved_reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -subdirectory = "plugins/anthropic" - -[[package]] -name = "modelgauge-azure" -version = "1.0.0" -description = "" -optional = false -python-versions = "^3.10" -files = [] -develop = false - -[package.dependencies] -azure-ai-ml = "^1.22" - -[package.source] -type = "git" -url = "https://github.com/mlcommons/modelbench.git" -reference = 
"ba582ed35a58cf6219786624c83e6915a9bcf98a" -resolved_reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -subdirectory = "plugins/azure" - -[[package]] -name = "modelgauge-baseten" -version = "1.0.0" -description = "" -optional = false -python-versions = "^3.10" -files = [] -develop = false - -[package.source] -type = "git" -url = "https://github.com/mlcommons/modelbench.git" -reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -resolved_reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -subdirectory = "plugins/baseten" - -[[package]] -name = "modelgauge-demo-plugin" -version = "1.0.0" -description = "" -optional = false -python-versions = "^3.10" -files = [] -develop = false - -[package.source] -type = "git" -url = "https://github.com/mlcommons/modelbench.git" -reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -resolved_reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -subdirectory = "demo_plugin" - -[[package]] -name = "modelgauge-google" -version = "1.0.0" -description = "" -optional = false -python-versions = "^3.10" -files = [] -develop = false - -[package.dependencies] -google-genai = "^1.17.0" -google-generativeai = "^0.8.0" - -[package.source] -type = "git" -url = "https://github.com/mlcommons/modelbench.git" -reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -resolved_reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -subdirectory = "plugins/google" - -[[package]] -name = "modelgauge-mistral" -version = "1.0.0" -description = "Mistral SUT" -optional = false -python-versions = "^3.10" -files = [] -develop = false - -[package.dependencies] -mistralai = "1.6.0" -typing-inspect = "^0.9.0" - -[package.source] -type = "git" -url = "https://github.com/mlcommons/modelbench.git" -reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -resolved_reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -subdirectory = "plugins/mistral" - -[[package]] -name = "modelgauge-nvidia" -version = "1.0.0" -description = "" -optional = false -python-versions = "^3.10" -files = [] -develop = false - -[package.dependencies] -openai = "^1.8.0" - -[package.source] -type = "git" -url = "https://github.com/mlcommons/modelbench.git" -reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -resolved_reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -subdirectory = "plugins/nvidia" - -[[package]] -name = "modelgauge-perspective-api" -version = "1.0.0" -description = "" -optional = false -python-versions = "^3.10" -files = [] -develop = false - -[package.dependencies] -google-api-python-client = ">=2.64.0,<2.65.0" - -[package.source] -type = "git" -url = "https://github.com/mlcommons/modelbench.git" -reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -resolved_reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -subdirectory = "plugins/perspective_api" - -[[package]] -name = "modelgauge-vertexai" -version = "1.0.0" -description = "Mistral SUT" -optional = false -python-versions = "^3.10" -files = [] -develop = false - -[package.dependencies] -google-auth = "^2.36.0" - -[package.source] -type = "git" -url = "https://github.com/mlcommons/modelbench.git" -reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -resolved_reference = "ba582ed35a58cf6219786624c83e6915a9bcf98a" -subdirectory = "plugins/vertexai" +reference = "c1e4f214f8a4ac3466a8a428a668e7847c94ee4d" +resolved_reference = "c1e4f214f8a4ac3466a8a428a668e7847c94ee4d" [[package]] name = "msal" @@ -8377,4 +8184,4 @@ modelbench-private = ["modelbench-private"] [metadata] lock-version = "2.0" python-versions = 
">=3.10,!=3.12.5,<3.13" -content-hash = "2da3a294b9467fac2b22afdd669f1d6de24b333c3b054de05f03eec75b6427f0" +content-hash = "21fca89e389b18bdfec909ef8d564f5ce02b5048adb04ccd6f82e1e45d21675c" diff --git a/pyproject.toml b/pyproject.toml index 4116981..76f4852 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ packages = [ python = ">=3.10,!=3.12.5,<3.13" click = "^8" dvc = {extras = ["gs"], version = "^3.60"} -modelbench = {git = "https://github.com/mlcommons/modelbench.git", rev = "ba582ed35a58cf6219786624c83e6915a9bcf98a" } +modelbench = {git = "https://github.com/mlcommons/modelbench.git", rev = "a71a902ef2e503569696ea76d56ac40fad9ee97b" } mlflow = "^3.1.1" python-dotenv = "^1" requests = "^2" @@ -25,18 +25,7 @@ jupyter = "^1" jupyterlab-git = "*" scikit-learn = "^1.5.0" pandas = "^2.2.2" -# plugins (would like to figure out a better way to manage these) -modelgauge_anthropic = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/anthropic", rev = "ba582ed35a58cf6219786624c83e6915a9bcf98a" } -modelgauge-azure = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/azure", rev = "ba582ed35a58cf6219786624c83e6915a9bcf98a" } -modelgauge_baseten = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/baseten", rev = "ba582ed35a58cf6219786624c83e6915a9bcf98a" } -modelgauge_demo_plugin = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "demo_plugin", rev = "ba582ed35a58cf6219786624c83e6915a9bcf98a" } -modelgauge_nvidia = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/nvidia", rev = "ba582ed35a58cf6219786624c83e6915a9bcf98a" } -modelgauge_perspective_api = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/perspective_api", rev = "ba582ed35a58cf6219786624c83e6915a9bcf98a" } -modelgauge_google = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/google", rev = "ba582ed35a58cf6219786624c83e6915a9bcf98a" } -modelgauge_vertexai = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/vertexai", rev = "ba582ed35a58cf6219786624c83e6915a9bcf98a" } -modelgauge_mistral = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/mistral", rev = "ba582ed35a58cf6219786624c83e6915a9bcf98a" } -modelgauge_amazon = { git = "https://github.com/mlcommons/modelbench.git", subdirectory = "plugins/amazon", rev = "ba582ed35a58cf6219786624c83e6915a9bcf98a" } -modelbench-private = { git = "git@github.com:mlcommons/modelbench-private.git", rev = "ecac4cfd343411c8011c0bba875d638fc22e9478", optional = true } +modelbench-private = { git = "git@github.com:mlcommons/modelbench-private.git", rev = "c1e4f214f8a4ac3466a8a428a668e7847c94ee4d", optional = true } [tool.poetry.extras] modelbench-private = ["modelbench-private"] diff --git a/src/modelplane/cli.py b/src/modelplane/cli.py index 52521ef..ca75622 100644 --- a/src/modelplane/cli.py +++ b/src/modelplane/cli.py @@ -1,6 +1,8 @@ from typing import List import click + +from modelgauge.data_schema import DEFAULT_ANNOTATION_SCHEMA as ANNOTATION_SCHEMA from modelgauge.ensemble_annotator_set import ENSEMBLE_STRATEGIES from modelplane.runways.annotator import annotate, KNOWN_ENSEMBLES @@ -71,6 +73,18 @@ def list_suts_cli(): default=1, help="The number of workers to run in parallel. 
Defaults to 1.", ) +@click.option( + "--prompt_uid_col", + type=str, + required=False, + help="The name of the prompt UID column in the dataset.", +) +@click.option( + "--prompt_text_col", + type=str, + required=False, + help="The name of the prompt text column in the dataset.", +) @load_from_dotenv def get_sut_responses( sut_id: str, @@ -79,6 +93,8 @@ def get_sut_responses( dvc_repo: str | None = None, disable_cache: bool = False, num_workers: int = 1, + prompt_uid_col: str | None = None, + prompt_text_col: str | None = None, ): """ Run the pipeline to get responses from SUTs. @@ -90,6 +106,8 @@ def get_sut_responses( dvc_repo=dvc_repo, disable_cache=disable_cache, num_workers=num_workers, + prompt_uid_col=prompt_uid_col, + prompt_text_col=prompt_text_col, ) @@ -159,6 +177,30 @@ def get_sut_responses( default=1, help="The number of workers to run in parallel. Defaults to 1.", ) +@click.option( + "--prompt_uid_col", + type=str, + required=False, + help="The name of the prompt UID column in the dataset.", +) +@click.option( + "--prompt_text_col", + type=str, + required=False, + help="The name of the prompt text column in the dataset.", +) +@click.option( + "--sut_uid_col", + type=str, + required=False, + help="The name of the SUT UID column in the dataset.", +) +@click.option( + "--sut_response_col", + type=str, + required=False, + help="The name of the SUT response column in the dataset.", +) @load_from_dotenv def get_annotations( experiment: str, @@ -171,6 +213,10 @@ def get_annotations( overwrite: bool = False, disable_cache: bool = False, num_workers: int = 1, + prompt_uid_col: str | None = None, + prompt_text_col: str | None = None, + sut_uid_col: str | None = None, + sut_response_col: str | None = None, ): return annotate( experiment=experiment, @@ -183,6 +229,10 @@ def get_annotations( overwrite=overwrite, disable_cache=disable_cache, num_workers=num_workers, + prompt_uid_col=prompt_uid_col, + prompt_text_col=prompt_text_col, + sut_uid_col=sut_uid_col, + sut_response_col=sut_response_col, ) @@ -210,18 +260,42 @@ def get_annotations( required=False, help="URL of the DVC repo to get the ground truth from. E.g. https://github.com/my-org/my-repo.git", ) +@click.option( + "--sample_uid_col", + type=str, + required=False, + help="The name of the sample uid columns in the annotations and ground truth files. 
prompt_uid x sut_uid will be used by default.", +) +@click.option( + "--annotator_uid_col", + type=str, + required=False, + help="The name of the annotator UID column in the annotations file.", +) +@click.option( + "--annotation_col", + type=str, + required=False, + help="The name of the JSON annotation column in the annotations file.", +) @load_from_dotenv def score_annotations( experiment: str, annotation_run_id: str, ground_truth: str, dvc_repo: str | None = None, + sample_uid_col: str | None = None, + annotator_uid_col: str = ANNOTATION_SCHEMA.annotator_uid, + annotation_col: str = ANNOTATION_SCHEMA.annotation, ): return score( annotation_run_id=annotation_run_id, experiment=experiment, ground_truth=ground_truth, dvc_repo=dvc_repo, + sample_uid_col=sample_uid_col, + annotator_uid_col=annotator_uid_col, + annotation_col=annotation_col, ) diff --git a/src/modelplane/runways/__init__.py b/src/modelplane/runways/__init__.py index 1c8795f..ee8a8c5 100644 --- a/src/modelplane/runways/__init__.py +++ b/src/modelplane/runways/__init__.py @@ -1,4 +1,4 @@ -from modelgauge.load_plugins import load_plugins +from modelgauge.load_namespaces import load_namespaces -load_plugins(disable_progress_bar=True) +load_namespaces(disable_progress_bar=True) diff --git a/src/modelplane/runways/annotator.py b/src/modelplane/runways/annotator.py index d078ca4..700b9fd 100644 --- a/src/modelplane/runways/annotator.py +++ b/src/modelplane/runways/annotator.py @@ -50,6 +50,10 @@ def annotate( overwrite: bool = False, disable_cache: bool = False, num_workers: int = 1, + prompt_uid_col=None, + prompt_text_col=None, + sut_uid_col=None, + sut_response_col=None, ) -> str: """ Run annotations and record measurements. @@ -102,7 +106,13 @@ def annotate( input_path = input_data.local_path() # type: ignore pipeline_kwargs["input_path"] = pathlib.Path(input_path) pipeline_kwargs["output_dir"] = pathlib.Path(tmp) - pipeline_runner = build_runner(**pipeline_kwargs) + pipeline_runner = build_runner( + prompt_uid_col=prompt_uid_col, + prompt_text_col=prompt_text_col, + sut_uid_col=sut_uid_col, + sut_response_col=sut_response_col, + **pipeline_kwargs, + ) pipeline_runner.run( progress_callback=mlflow.log_metrics, debug=is_debug_mode() diff --git a/src/modelplane/runways/responder.py b/src/modelplane/runways/responder.py index 9c81c41..fb31cc6 100644 --- a/src/modelplane/runways/responder.py +++ b/src/modelplane/runways/responder.py @@ -5,7 +5,7 @@ import mlflow -from modelgauge.pipeline_runner import PromptRunner +from modelgauge.pipeline_runner import build_runner from modelgauge.sut_registry import SUTS from modelplane.runways.utils import ( @@ -27,6 +27,8 @@ def respond( dvc_repo: str | None = None, disable_cache: bool = False, num_workers: int = 1, + prompt_uid_col=None, + prompt_text_col=None, ) -> str: secrets = setup_sut_credentials(sut_id) sut = SUTS.make_instance(uid=sut_id, secrets=secrets) @@ -41,12 +43,14 @@ def respond( with tempfile.TemporaryDirectory() as tmp: input_data = build_input(path=prompts, dvc_repo=dvc_repo, dest_dir=tmp) input_data.log_input() - pipeline_runner = PromptRunner( + pipeline_runner = build_runner( num_workers=num_workers, input_path=input_data.local_path(), output_dir=pathlib.Path(tmp), cache_dir=None if disable_cache else CACHE_DIR, suts={sut_id: sut}, + prompt_uid_col=prompt_uid_col, + prompt_text_col=prompt_text_col, ) pipeline_runner.run( diff --git a/src/modelplane/runways/scorer.py b/src/modelplane/runways/scorer.py index 5d1259a..4100ace 100644 --- a/src/modelplane/runways/scorer.py +++ 
b/src/modelplane/runways/scorer.py @@ -27,10 +27,15 @@ def score( experiment: str, ground_truth: str, dvc_repo: str | None = None, + sample_uid_col: str | None = None, + annotator_uid_col: str = ANNOTATION_SCHEMA.annotator_uid, + annotation_col: str = ANNOTATION_SCHEMA.annotation, ): """ Score annotations against ground truth. - ground_truth is a path to a CSV file with columns: UID, SUT, is_safe. + Annotations are expected to be in JSON format with an "is_safe" field. + Ground truth should have an "is_safe" column with values "safe" or "unsafe". + if `sample_uid_col` is not provided, samples will be keyed by prompt_uid X sut_uid. """ params = { "ground_truth": ground_truth, @@ -45,26 +50,25 @@ def score( with tempfile.TemporaryDirectory() as tmp: # Load annotations - annotation_dataset = build_input( + annotation_input = build_input( run_id=annotation_run_id, artifact_path=ANNOTATION_RESPONSE_ARTIFACT_NAME, dest_dir=tmp, ) - annotation_dataset.log_input() - # Maybe this should be handled by the dataset class? - annotators, annotations_df = transform_mlflow_annotator_artifact( - annotation_dataset.local_path() - ) + annotation_input.log_input() + annotation_data = AnnotationData(annotation_input.local_path(), is_json_annotation=True, sample_uid_col=sample_uid_col, annotator_uid_col=annotator_uid_col, annotation_col=annotation_col) + # Load ground truth - ground_truth_dataset = build_input( + ground_truth_input = build_input( path=ground_truth, dvc_repo=dvc_repo, dest_dir=tmp ) - ground_truth_dataset.log_input() - ground_truth_df = ground_truth_to_df(ground_truth_dataset.local_path()) - mlflow.log_metric("num_ground_truth_samples", len(ground_truth_df)) + ground_truth_input.log_input() + ground_truth_data = AnnotationData(ground_truth_input.local_path(), is_json_annotation=False, annotation_col="is_safe", annotator_uid_col=None, sample_uid_col=sample_uid_col) + mlflow.log_metric("num_ground_truth_samples", len(ground_truth_data.df)) + # Score each annotator in the annotation dataframe. - for annotator in annotators: - score = score_annotator(annotator, annotations_df, ground_truth_df) + for annotator in annotation_data.annotators: + score = score_annotator(annotator, annotation_data, ground_truth_data) for metric in score: # There's a bug in graphql (used by mlflow ui) that crashes # the UI if a metric is NaN or infinity. @@ -79,24 +83,25 @@ def score( return run.info.run_id -def score_annotator( - annotator: str, annotations_df: pd.DataFrame, ground_truth_df: pd.DataFrame -): +def score_annotator(annotator: str, annotation_data, ground_truth_data): """Score an annotator's predictions against ground truth.""" # Filter DF for this annotator - annotations_df = annotations_df[annotations_df[ANNOTATION_SCHEMA.annotator_uid] == annotator] + annotations_df = annotation_data.df[ + annotation_data.df[annotation_data.annotator_uid_col] == annotator + ] assert annotations_df[ - "sample_id" - ].is_unique, f"Error: prompt UID x SUT combination is not unique. There should be one SUT response per prompt." + "sample_uid" + ].is_unique, f"Error: sample UID for annotator {annotator} is not unique." 
+ # Get matching samples between ground truth and annotations - samples = ground_truth_df["sample_id"] + samples = ground_truth_data.df["sample_uid"] # Filter annotations to only include samples that exist in ground truth - matching_annotations = annotations_df[annotations_df["sample_id"].isin(samples)] + matching_annotations = annotations_df[annotations_df["sample_uid"].isin(samples)] # Ensure ground truth is aligned with annotations - matching_ground_truth = ground_truth_df[ - ground_truth_df["sample_id"].isin(matching_annotations["sample_id"]) + matching_ground_truth = ground_truth_data.df[ + ground_truth_data.df["sample_uid"].isin(matching_annotations["sample_uid"]) ] # TODO: What happens if not all ground truth samples are annotated? Proceed with scoring or no? @@ -105,8 +110,8 @@ def score_annotator( ), f"No sample overlap found between {annotator} and ground truth." # Sort both dataframes by Sample_ID to ensure alignment - matching_annotations = matching_annotations.sort_values("sample_id") - matching_ground_truth = matching_ground_truth.sort_values("sample_id") + matching_annotations = matching_annotations.sort_values("sample_uid") + matching_ground_truth = matching_ground_truth.sort_values("sample_uid") # Get predictions and ground truth for unsafe classification. y_pred_is_unsafe = matching_annotations["is_unsafe"].astype(bool) @@ -134,39 +139,66 @@ def score_annotator( } -def transform_mlflow_annotator_artifact(path: Path) -> tuple[list, pd.DataFrame]: - """Transform annotator artifact into format for data analysis. - Returns: list of annotator uids, dataframe - """ - df = pd.read_csv(path) - df["is_safe"] = df[ANNOTATION_SCHEMA.annotation].apply(lambda x: json.loads(x)["is_safe"]) - annotators = list(df[ANNOTATION_SCHEMA.annotator_uid].unique()) - return annotators, format_df(df, ["is_safe"]) - -def ground_truth_to_df(file): - if not os.path.exists(file): - raise FileNotFoundError(f"Ground truth file does not exist: {file}") - df = pd.read_csv(file) - # TODO: standardize ground truth schema. - # currently, is_safe is a str "safe" or "unsafe"; convert it to boolean. - df["is_safe"] = df["is_safe"].apply(lambda x: x.lower() == "safe") - return format_df(df, ["is_safe"]) - - -def format_df(df: pd.DataFrame, label_cols: list[str]) -> pd.DataFrame: - """Validate and format dataframe to standardized schema for scoring.""" - assert len(label_cols) > 0, "No label columns provided" - expected_cols = [ANNOTATION_SCHEMA.prompt_uid, ANNOTATION_SCHEMA.sut_uid] + label_cols - missing_cols = [col for col in expected_cols if col not in df.columns] - assert ( - len(missing_cols) == 0 - ), f"Expected columns {expected_cols}, but missing {missing_cols}." +class AnnotationData: + """Transform a CSV to a dataframe with columns `sample_uid` and `is_unsafe`.""" + sample_uid_col = "sample_uid" + unsafe_col = "is_unsafe" + + def __init__( + self, + path: Path, + is_json_annotation: bool, + sample_uid_col: str | None = None, + annotator_uid_col: str = ANNOTATION_SCHEMA.annotator_uid, + annotation_col: str = ANNOTATION_SCHEMA.annotation, + ): + self.annotator_uid_col = annotator_uid_col # Not used for ground truth data. - # Add unique sample_id column. 
- df["sample_id"] = df[ANNOTATION_SCHEMA.prompt_uid].astype(str) + "_" + df[ANNOTATION_SCHEMA.sut_uid].astype(str) + self.path = path + self.df = pd.read_csv(path) + + self._set_sample_uid(sample_uid_col) + self._format_annotation(is_json_annotation, annotation_col) + + + @property + def annotators(self) -> list[str]: + assert self.annotator_uid_col in self.df.columns, ( + f"Annotator UID column '{self.annotator_uid_col}' not found in dataframe for {self.path}. " + ) + return list(self.df[self.annotator_uid_col].unique()) + + def _set_sample_uid(self, sample_uid_col): + if sample_uid_col is not None: + assert sample_uid_col in self.df.columns, ( + f"Sample UID column '{sample_uid_col}' not found in dataframe for {self.path}. " + ) + self.df[self.sample_uid_col] = self.df[sample_uid_col] + else: + missing_cols = [] + required_cols = [ANNOTATION_SCHEMA.prompt_uid, ANNOTATION_SCHEMA.sut_uid] + for col in required_cols: + if col not in self.df.columns: + missing_cols.append(col) + assert len(missing_cols) == 0, f"If no sample_uid_col is passed, a sample UID column is made from {required_cols}. Missing columns {missing_cols} from {self.path}." + self.df[self.sample_uid_col] = ( + self.df[ANNOTATION_SCHEMA.prompt_uid].astype(str) + + "_" + + self.df[ANNOTATION_SCHEMA.sut_uid].astype(str) + ) + + def _format_annotation(self, is_json_annotation, annotation_col): + assert annotation_col in self.df.columns, ( + f"Annotation column '{annotation_col}' not found in dataframe for {self.path}." + ) + if is_json_annotation: + is_safe = self.df[annotation_col].apply( + lambda x: json.loads(x)["is_safe"] + ) + else: + assert self.df[annotation_col].str.lower().isin({"safe", "unsafe"}).all(), f"Invalid annotation values found in the {annotation_col} column of {self.path}" + # currently, is_safe is a str "safe" or "unsafe"; convert it to boolean. + is_safe = self.df[annotation_col].apply(lambda x: x.lower() == "safe") - # Create new columns where unsafe is 1 and safe is 0. - for col in label_cols: - unsafe_col = col.replace("is_safe", "is_unsafe") - df[unsafe_col] = ~df[col].astype(bool) - return df + # Create new column where unsafe is 1 and safe is 0. 
+ self.df[self.unsafe_col] = ~is_safe.astype(bool) diff --git a/tests/unit/test_scorer.py b/tests/unit/test_scorer.py index 020ea3d..214ad9a 100644 --- a/tests/unit/test_scorer.py +++ b/tests/unit/test_scorer.py @@ -1,128 +1,129 @@ import pytest -import pandas as pd -import json -import os -from pathlib import Path - -from modelplane.runways.scorer import ( - transform_mlflow_annotator_artifact, - ground_truth_to_df, - format_df, - score_annotator -) - -ANNOTATIONS_FILE = "tests/data/annotations.csv" -GROUND_TRUTH_FILE = "tests/data/ground_truth.csv" - -def test_transform_mlflow_annotator_artifact(): - annotators, df = transform_mlflow_annotator_artifact(ANNOTATIONS_FILE) - assert annotators == ["test_annotator"] - assert "is_safe" in df.columns - assert "is_unsafe" in df.columns - assert "sample_id" in df.columns - assert len(df) == 10 # Number of rows in test data - # Check that is_safe was properly extracted from JSON - first_row = df.iloc[0] - assert first_row["is_safe"] == True - assert first_row["is_unsafe"] == False - assert first_row["sample_id"] == "0_demo_yes_no" - -def test_ground_truth_to_df(): - df = ground_truth_to_df(GROUND_TRUTH_FILE) - assert "is_safe" in df.columns - assert "is_unsafe" in df.columns - assert "sample_id" in df.columns - assert len(df) == 10 # Number of rows in test data - # Check that string "safe" was converted to boolean - assert df["is_safe"].dtype == bool - assert df["is_unsafe"].dtype == bool - -def test_format_df(): - # Create a test dataframe - test_df = pd.DataFrame({ - "prompt_uid": ["1", "2"], - "sut_uid": ["a", "b"], - "is_safe": [True, False] - }) - - formatted_df = format_df(test_df, ["is_safe"]) - assert "sample_id" in formatted_df.columns - assert "is_unsafe" in formatted_df.columns - assert formatted_df["sample_id"].iloc[0] == "1_a" - assert formatted_df["is_unsafe"].iloc[0] == False - assert formatted_df["is_unsafe"].iloc[1] == True - -def test_format_df_missing_columns(): - # Test that format_df raises assertion error when required columns are missing - test_df = pd.DataFrame({ - "prompt_uid": ["1", "2"] # Missing sut_uid - }) - - with pytest.raises(AssertionError): - format_df(test_df, ["is_safe"]) - -def test_score_annotator(): - # Load test data - _, annotations_df = transform_mlflow_annotator_artifact(ANNOTATIONS_FILE) - ground_truth_df = ground_truth_to_df(GROUND_TRUTH_FILE) - - # Score test_annotator - scores = score_annotator("test_annotator", annotations_df, ground_truth_df) - - # Check that all expected metrics are present - expected_metrics = [ - "num_annotator_samples", - "num_samples_scored", - "peters_metric", - "false_safe_rate", - "false_unsafe_rate", - "precision", - "negative_predictive_value", - "false_safe", - "true_safe", - "false_unsafe", - "true_unsafe", - "recall", - "f1", - "accuracy" - ] - - for metric in expected_metrics: - assert metric in scores - - # Check basic metric properties - # Ground truth is all safe aka all negative. Annotations are half safe and half unsafe. 
- assert scores["num_annotator_samples"] == 10 - assert scores["num_samples_scored"] == 10 + +from modelplane.runways.scorer import AnnotationData, score_annotator + +@pytest.fixture +def annotations_csv(tmp_path): + file_path = tmp_path / "annotations.csv" + content = ( + "prompt_uid,sut_uid,annotator_uid,annotation_json\n" + "p1,s1,a1,{\"is_safe\": true}\n" + "p1,s2,a1,{\"is_safe\": true}\n" + "p1,s1,a2,{\"is_safe\": true}\n" + "p1,s2,a2,{\"is_safe\": false}\n" + ) + file_path.write_text(content) + return file_path + + +@pytest.fixture +def annotation_data(annotations_csv): + return AnnotationData(annotations_csv, is_json_annotation=True) + + +def test_annotation_data(annotation_data): + assert annotation_data.annotators == ["a1", "a2"] + assert len(annotation_data.df) == 4 + assert annotation_data.df["sample_uid"].tolist() == ["p1_s1", "p1_s2", "p1_s1", "p1_s2"] + assert annotation_data.df["is_unsafe"].tolist() == [False, False, False, True] + +def test_annotation_data_missing_columns(annotations_csv): + with pytest.raises(AssertionError, match="Sample UID column 'missing_col' not found"): + AnnotationData(annotations_csv, is_json_annotation=True, sample_uid_col="missing_col") + + with pytest.raises(AssertionError, match="Annotation column 'missing_col' not found"): + AnnotationData(annotations_csv, is_json_annotation=True, annotation_col="missing_col") + +def test_annotation_data_custom_sample_uid(tmp_path): + file_path = tmp_path / "annotations.csv" + content = ( + "sample_uid,annotator_uid,annotation_json\n" + "x1,a1,{\"is_safe\": true}\n" + "x2,a1,{\"is_safe\": true}\n" + "x1,a2,{\"is_safe\": true}\n" + "x2,a2,{\"is_safe\": false}\n" + ) + file_path.write_text(content) + + data = AnnotationData(file_path, is_json_annotation=True, sample_uid_col="sample_uid") + assert data.annotators == ["a1", "a2"] + assert len(data.df) == 4 + assert data.df["sample_uid"].tolist() == ["x1", "x2", "x1", "x2"] + assert data.df["is_unsafe"].tolist() == [False, False, False, True] + +@pytest.fixture +def ground_truth_csv(tmp_path): + file_path = tmp_path / "groundtruth.csv" + content = ( + "prompt_uid,sut_uid,is_safe\n" + "p1,s1,safe\n" + "p1,s2,unsafe\n" + ) + file_path.write_text(content) + return file_path + + +@pytest.fixture +def ground_truth_data(ground_truth_csv): + return AnnotationData(ground_truth_csv, is_json_annotation=False, annotator_uid_col=None, annotation_col="is_safe") + + +def test_ground_truth_data(ground_truth_data): + assert len(ground_truth_data.df) == 2 + assert ground_truth_data.df["sample_uid"].tolist() == ["p1_s1", "p1_s2"] + assert ground_truth_data.df["is_unsafe"].tolist() == [False, True] + + +def test_ground_truth_data_invalid_annotation(tmp_path): + file_path = tmp_path / "bad_groundtruth.csv" + content = ( + "prompt_uid,sut_uid,is_safe\n" + "p1,s2,oops\n" + ) + file_path.write_text(content) + with pytest.raises(AssertionError, match="Invalid annotation values"): + data = AnnotationData(file_path, is_json_annotation=False, annotator_uid_col=None,annotation_col="is_safe") + + +def test_score_annotator(annotation_data, ground_truth_data): + # A1: safe, safe + # Ground truth: safe, unsafe + scores = score_annotator("a1", annotation_data, ground_truth_data) + + assert scores["num_annotator_samples"] == 2 + assert scores["num_samples_scored"] == 2 + assert scores["peters_metric"] == 0.5 + assert scores["false_unsafe_rate"] == 0.0 + assert scores["false_safe_rate"] == 1.0 + assert scores["precision"] == 0.0 # No true positives ie unsafes + assert 
scores["negative_predictive_value"] == 0.5 + assert scores["false_safe"] == 1 + assert scores["true_safe"] == 1 + assert scores["false_unsafe"] == 0 + assert scores["true_unsafe"] == 0 + assert scores["recall"] == 0 + assert scores["f1"] == 0 assert scores["accuracy"] == 0.5 - assert scores["false_unsafe_rate"] == 0.5 - assert scores["precision"] == 0.0 # No true positives - assert scores["false_safe"] == 0 - assert scores["true_safe"] == 5 - assert scores["false_unsafe"] == 5 - -def test_score_annotator_no_overlap(): - # Create test dataframes with no overlapping samples - annotations_df = pd.DataFrame({ - "sample_id": ["1_a", "2_b"], - "prompt_uid": ["1", "2"], - "sut_uid": ["a", "b"], - "annotator_uid": ["test_annotator", "test_annotator"], - "is_safe": [True, False], - "is_unsafe": [False, True] - }) - - ground_truth_df = pd.DataFrame({ - "sample_id": ["3_c", "4_d"], - "prompt_uid": ["3", "4"], - "sut_uid": ["c", "d"], - "is_safe": [True, True], - "is_unsafe": [False, False] - }) - - # Test that score_annotator raises assertion error when no overlapping samples - with pytest.raises(AssertionError): - score_annotator("test_annotator", annotations_df, ground_truth_df) +def test_score_annotator_no_overlap(tmp_path): + # Create data files with no overlapping samples + ground_truth_path = tmp_path / "groundtruth.csv" + content = ( + "prompt_uid,sut_uid,is_safe\n" + "p1,s1,safe\n" + ) + ground_truth_path.write_text(content) + ground_truth_data = AnnotationData(ground_truth_path, is_json_annotation=False, annotator_uid_col=None, annotation_col="is_safe") + annotations_path = tmp_path / "annotations.csv" + content = ( + "prompt_uid,sut_uid,annotator_uid,annotation_json\n" + "p5,s5,a1,{\"is_safe\": true}\n" + ) + annotations_path.write_text(content) + annotation_data = AnnotationData(annotations_path, is_json_annotation=True) + + # Test that score_annotator raises assertion error when no overlapping samples + with pytest.raises(AssertionError): + score_annotator("a1", annotation_data, ground_truth_data)