Skip to content

Commit 0c0fb3d

Browse files
tarilabs and Claude authored
feat: use Pydantic models for MCP submit_evaluation tool parameters (#102)
* feat: use Pydantic models for MCP submit_evaluation tool parameters Replace parameters types with typed Pydantic models this way FastMCP generates congruent JSON Schema for AI agents Co-Authored-By: Claude <noreply@anthropic.com> Signed-off-by: tarilabs <matteo.mortari@gmail.com> * chore: impl code review feedback Co-Authored-By: Claude <noreply@anthropic.com> Signed-off-by: tarilabs <matteo.mortari@gmail.com> * chore: linting Co-Authored-By: Claude <noreply@anthropic.com> Signed-off-by: tarilabs <matteo.mortari@gmail.com> --------- Signed-off-by: tarilabs <matteo.mortari@gmail.com> Co-authored-by: Claude <noreply@anthropic.com>
1 parent 6539586 commit 0c0fb3d

File tree

2 files changed

+121
-65
lines changed

2 files changed

+121
-65
lines changed

src/evalhub/mcp/server.py

Lines changed: 20 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -241,71 +241,46 @@ async def handle_completion(
241241
)
242242
async def submit_evaluation(
243243
name: str,
244-
model: dict[str, Any],
245-
benchmarks: list[dict[str, Any]] | None = None,
246-
collection: dict[str, Any] | None = None,
244+
model: ModelConfig,
245+
benchmarks: list[BenchmarkConfig] | None = None,
246+
collection: CollectionRef | None = None,
247247
description: str | None = None,
248248
tags: list[str] | None = None,
249-
experiment: dict[str, Any] | None = None,
249+
experiment: ExperimentConfig | None = None,
250250
) -> str:
251251
"""Submit a new evaluation job.
252252
253-
Evaluation Job fields have been separated for easier fill by AI agents.
253+
Provide either 'benchmarks' or 'collection', not both.
254+
Use the providers and benchmarks resources to discover available
255+
provider_id and benchmark id values.
254256
255257
Args:
256258
name: Job name.
257-
model: Model to evaluate. Keys: "url" (model endpoint), "name" (model identifier),
258-
optional "auth" with "secret_ref" (Kubernetes Secret name for model credentials).
259-
Examples:
260-
Remote vLLM: {"url": "http://vllm-server.models.svc.cluster.local:8000/v1", "name": "meta-llama/Llama-3.2-1B-Instruct"}
261-
With auth: {"url": "http://model:8000/v1", "name": "my-model", "auth": {"secret_ref": "model-api-key"}}
262-
benchmarks: List of benchmarks to run. Each entry has "id", "provider_id", and optional "parameters".
263-
Mutually exclusive with 'collection'.
264-
Examples:
265-
Simple: [{"id": "demo_benchmark", "provider_id": "demo"}]
266-
With params: [{"id": "quick_perf_test", "provider_id": "guidellm", "parameters": {"profile": "constant", "rate": 5, "max_seconds": 10, "max_requests": 20}}]
267-
Multiple: [{"id": "gsm8k", "provider_id": "lm_eval"}, {"id": "mmlu", "provider_id": "lm_eval"}]
268-
collection: Collection reference to run all benchmarks in a predefined collection.
269-
Mutually exclusive with 'benchmarks'. Keys: "id" (collection identifier),
270-
optional "benchmarks" to run only a subset.
271-
Examples:
272-
Full collection: {"id": "standard"}
273-
Subset: {"id": "standard", "benchmarks": [{"id": "gsm8k", "provider_id": "lm_eval"}]}
259+
model: Model to evaluate (url and name are required).
260+
benchmarks: List of benchmarks to run. Mutually exclusive with 'collection'.
261+
collection: Collection reference. Mutually exclusive with 'benchmarks'.
274262
description: Optional job description.
275-
tags: Optional list of tags for organizing jobs, e.g. ["nightly", "regression"].
276-
experiment: Optional MLflow experiment config. Keys: "name", optional "tags" (list of {"key": ..., "value": ...}),
277-
optional "artifact_location".
278-
Example: {"name": "llama3-eval-experiment", "tags": [{"key": "team", "value": "nlp"}]}
263+
tags: Optional list of tags for organizing jobs.
264+
experiment: Optional MLflow experiment configuration.
279265
"""
280-
has_benchmarks = bool(benchmarks)
266+
has_benchmarks = benchmarks is not None
281267
has_collection = collection is not None
282268
if has_benchmarks == has_collection:
283269
raise ValueError("Provide exactly one of 'benchmarks' or 'collection'.")
284270

285-
client = _get_client()
286-
287-
model_config = ModelConfig(**model)
288-
289-
benchmark_configs = None
290-
if benchmarks is not None:
291-
benchmark_configs = [BenchmarkConfig(**b) for b in benchmarks]
271+
if benchmarks is not None and len(benchmarks) == 0:
272+
raise ValueError("'benchmarks' cannot be empty when provided.")
292273

293-
collection_ref = None
294-
if collection is not None:
295-
collection_ref = CollectionRef(**collection)
296-
297-
experiment_config = None
298-
if experiment is not None:
299-
experiment_config = ExperimentConfig(**experiment)
274+
client = _get_client()
300275

301276
request = JobSubmissionRequest(
302277
name=name,
303278
description=description,
304279
tags=tags or [],
305-
model=model_config,
306-
benchmarks=benchmark_configs,
307-
collection=collection_ref,
308-
experiment=experiment_config,
280+
model=model,
281+
benchmarks=benchmarks,
282+
collection=collection,
283+
experiment=experiment,
309284
)
310285

311286
job = await client.jobs.submit(request)

tests/unit/test_mcp_server.py

Lines changed: 101 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,13 @@
3232
BenchmarkConfig,
3333
BenchmarkReference,
3434
Collection,
35+
CollectionRef,
3536
EvaluationJob,
3637
EvaluationJobResource,
3738
EvaluationJobStatus,
39+
ExperimentConfig,
3840
JobStatus,
41+
ModelAuth,
3942
ModelConfig,
4043
Provider,
4144
Resource,
@@ -269,16 +272,83 @@ async def test_list_tools() -> None:
269272
assert len(tool_names) == 2
270273

271274

275+
async def test_submit_evaluation_schema() -> None:
276+
"""Verify the generated inputSchema contains typed $defs for Pydantic models."""
277+
tools = await mcp.list_tools()
278+
tool = next(t for t in tools if t.name == "submit_evaluation")
279+
schema = tool.inputSchema
280+
281+
# Required top-level params
282+
assert "name" in schema["required"]
283+
assert "model" in schema["required"]
284+
285+
# Pydantic models generate $defs with full property definitions
286+
defs = schema["$defs"]
287+
assert "ModelConfig" in defs
288+
assert "BenchmarkConfig" in defs
289+
290+
# ModelConfig has url and name as required
291+
model_def = defs["ModelConfig"]
292+
assert "url" in model_def["properties"]
293+
assert "name" in model_def["properties"]
294+
assert "url" in model_def["required"]
295+
assert "name" in model_def["required"]
296+
297+
# BenchmarkConfig has id and provider_id as required
298+
bench_def = defs["BenchmarkConfig"]
299+
assert "id" in bench_def["properties"]
300+
assert "provider_id" in bench_def["properties"]
301+
assert "id" in bench_def["required"]
302+
assert "provider_id" in bench_def["required"]
303+
304+
305+
async def test_submit_evaluation_wire_path(mock_client: MagicMock) -> None:
306+
"""Invoke submit_evaluation through FastMCP's call_tool with JSON-like dicts."""
307+
await mcp.call_tool(
308+
"submit_evaluation",
309+
{
310+
"name": "wire-eval",
311+
"model": {"url": "http://model:8000/v1", "name": "llama3"},
312+
"benchmarks": [
313+
{"id": "gsm8k", "provider_id": "lm_eval"},
314+
{
315+
"id": "mmlu",
316+
"provider_id": "lm_eval",
317+
"parameters": {"num_few_shot": 5},
318+
},
319+
],
320+
"experiment": {
321+
"name": "my-experiment",
322+
"tags": [{"key": "team", "value": "nlp"}],
323+
},
324+
},
325+
)
326+
327+
mock_client.jobs.submit.assert_awaited_once()
328+
request = mock_client.jobs.submit.call_args[0][0]
329+
assert isinstance(request.model, ModelConfig)
330+
assert request.model.url == "http://model:8000/v1"
331+
assert request.model.name == "llama3"
332+
assert len(request.benchmarks) == 2
333+
assert isinstance(request.benchmarks[0], BenchmarkConfig)
334+
assert request.benchmarks[0].id == "gsm8k"
335+
assert request.benchmarks[1].parameters == {"num_few_shot": 5}
336+
assert isinstance(request.experiment, ExperimentConfig)
337+
assert request.experiment.name == "my-experiment"
338+
assert len(request.experiment.tags) == 1
339+
assert request.experiment.tags[0].key == "team"
340+
341+
272342
# ---------------------------------------------------------------------------
273-
# Tool call tests
343+
# Tool call tests (direct invocation)
274344
# ---------------------------------------------------------------------------
275345

276346

277347
async def test_submit_evaluation(mock_client: MagicMock) -> None:
278348
result = await submit_evaluation(
279349
name="my-eval",
280-
model={"url": "http://model:8000", "name": "llama3"},
281-
benchmarks=[{"id": "gsm8k", "provider_id": "lm_eval"}],
350+
model=ModelConfig(url="http://model:8000", name="llama3"),
351+
benchmarks=[BenchmarkConfig(id="gsm8k", provider_id="lm_eval")],
282352
)
283353
data = json.loads(result)
284354
assert data["name"] == "test-eval"
@@ -297,8 +367,8 @@ async def test_submit_evaluation(mock_client: MagicMock) -> None:
297367
async def test_submit_evaluation_with_collection(mock_client: MagicMock) -> None:
298368
result = await submit_evaluation(
299369
name="collection-eval",
300-
model={"url": "http://model:8000", "name": "llama3"},
301-
collection={"id": "standard"},
370+
model=ModelConfig(url="http://model:8000", name="llama3"),
371+
collection=CollectionRef(id="standard"),
302372
)
303373
json.loads(result) # validate JSON output
304374

@@ -312,12 +382,12 @@ async def test_submit_evaluation_with_collection(mock_client: MagicMock) -> None
312382
async def test_submit_evaluation_with_model_auth(mock_client: MagicMock) -> None:
313383
await submit_evaluation(
314384
name="auth-eval",
315-
model={
316-
"url": "http://model:8000",
317-
"name": "llama3",
318-
"auth": {"secret_ref": "my-secret"},
319-
},
320-
benchmarks=[{"id": "gsm8k", "provider_id": "lm_eval"}],
385+
model=ModelConfig(
386+
url="http://model:8000",
387+
name="llama3",
388+
auth=ModelAuth(secret_ref="my-secret"),
389+
),
390+
benchmarks=[BenchmarkConfig(id="gsm8k", provider_id="lm_eval")],
321391
)
322392

323393
call_args = mock_client.jobs.submit.call_args
@@ -329,9 +399,9 @@ async def test_submit_evaluation_with_model_auth(mock_client: MagicMock) -> None
329399
async def test_submit_evaluation_with_experiment(mock_client: MagicMock) -> None:
330400
await submit_evaluation(
331401
name="exp-eval",
332-
model={"url": "http://model:8000", "name": "llama3"},
333-
benchmarks=[{"id": "gsm8k", "provider_id": "lm_eval"}],
334-
experiment={"name": "my-experiment"},
402+
model=ModelConfig(url="http://model:8000", name="llama3"),
403+
benchmarks=[BenchmarkConfig(id="gsm8k", provider_id="lm_eval")],
404+
experiment=ExperimentConfig(name="my-experiment"),
335405
)
336406

337407
call_args = mock_client.jobs.submit.call_args
@@ -341,24 +411,35 @@ async def test_submit_evaluation_with_experiment(mock_client: MagicMock) -> None
341411

342412

343413
async def test_submit_evaluation_both_benchmarks_and_collection(
344-
mock_client: MagicMock
414+
mock_client: MagicMock,
345415
) -> None:
346416
with pytest.raises(ValueError, match="exactly one"):
347417
await submit_evaluation(
348418
name="bad-eval",
349-
model={"url": "http://model:8000", "name": "llama3"},
350-
benchmarks=[{"id": "gsm8k", "provider_id": "lm_eval"}],
351-
collection={"id": "standard"},
419+
model=ModelConfig(url="http://model:8000", name="llama3"),
420+
benchmarks=[BenchmarkConfig(id="gsm8k", provider_id="lm_eval")],
421+
collection=CollectionRef(id="standard"),
352422
)
353423

354424

355425
async def test_submit_evaluation_neither_benchmarks_nor_collection(
356-
mock_client: MagicMock
426+
mock_client: MagicMock,
357427
) -> None:
358428
with pytest.raises(ValueError, match="exactly one"):
359429
await submit_evaluation(
360430
name="bad-eval",
361-
model={"url": "http://model:8000", "name": "llama3"},
431+
model=ModelConfig(url="http://model:8000", name="llama3"),
432+
)
433+
434+
435+
async def test_submit_evaluation_empty_benchmarks(
436+
mock_client: MagicMock,
437+
) -> None:
438+
with pytest.raises(ValueError, match="cannot be empty"):
439+
await submit_evaluation(
440+
name="bad-eval",
441+
model=ModelConfig(url="http://model:8000", name="llama3"),
442+
benchmarks=[],
362443
)
363444

364445

0 commit comments

Comments (0)