Skip to content

Commit 0c0fb3d

Browse files
tarilabs and Claude authored
feat: use Pydantic models for MCP submit_evaluation tool parameters (#102)
* feat: use Pydantic models for MCP submit_evaluation tool parameters Replace parameters types with typed Pydantic models this way FastMCP generates congruent JSON Schema for AI agents Co-Authored-By: Claude <noreply@anthropic.com> Signed-off-by: tarilabs <matteo.mortari@gmail.com> * chore: impl code review feedback Co-Authored-By: Claude <noreply@anthropic.com> Signed-off-by: tarilabs <matteo.mortari@gmail.com> * chore: linting Co-Authored-By: Claude <noreply@anthropic.com> Signed-off-by: tarilabs <matteo.mortari@gmail.com> --------- Signed-off-by: tarilabs <matteo.mortari@gmail.com> Co-authored-by: Claude <noreply@anthropic.com>
1 parent 6539586 commit 0c0fb3d

File tree

2 files changed

+121
-65
lines changed

2 files changed

+121
-65
lines changed

src/evalhub/mcp/server.py

Lines changed: 20 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -241,71 +241,46 @@ async def handle_completion(
241241
)
242242
async def submit_evaluation(
243243
name: str,
244-
model: dict[str, Any],
245-
benchmarks: list[dict[str, Any]] | None = None,
246-
collection: dict[str, Any] | None = None,
244+
model: ModelConfig,
245+
benchmarks: list[BenchmarkConfig] | None = None,
246+
collection: CollectionRef | None = None,
247247
description: str | None = None,
248248
tags: list[str] | None = None,
249-
experiment: dict[str, Any] | None = None,
249+
experiment: ExperimentConfig | None = None,
250250
) -> str:
251251
"""Submit a new evaluation job.
252252
253-
Evaluation Job fields have been separated for easier fill by AI agents.
253+
Provide either 'benchmarks' or 'collection', not both.
254+
Use the providers and benchmarks resources to discover available
255+
provider_id and benchmark id values.
254256
255257
Args:
256258
name: Job name.
257-
model: Model to evaluate. Keys: "url" (model endpoint), "name" (model identifier),
258-
optional "auth" with "secret_ref" (Kubernetes Secret name for model credentials).
259-
Examples:
260-
Remote vLLM: {"url": "http://vllm-server.models.svc.cluster.local:8000/v1", "name": "meta-llama/Llama-3.2-1B-Instruct"}
261-
With auth: {"url": "http://model:8000/v1", "name": "my-model", "auth": {"secret_ref": "model-api-key"}}
262-
benchmarks: List of benchmarks to run. Each entry has "id", "provider_id", and optional "parameters".
263-
Mutually exclusive with 'collection'.
264-
Examples:
265-
Simple: [{"id": "demo_benchmark", "provider_id": "demo"}]
266-
With params: [{"id": "quick_perf_test", "provider_id": "guidellm", "parameters": {"profile": "constant", "rate": 5, "max_seconds": 10, "max_requests": 20}}]
267-
Multiple: [{"id": "gsm8k", "provider_id": "lm_eval"}, {"id": "mmlu", "provider_id": "lm_eval"}]
268-
collection: Collection reference to run all benchmarks in a predefined collection.
269-
Mutually exclusive with 'benchmarks'. Keys: "id" (collection identifier),
270-
optional "benchmarks" to run only a subset.
271-
Examples:
272-
Full collection: {"id": "standard"}
273-
Subset: {"id": "standard", "benchmarks": [{"id": "gsm8k", "provider_id": "lm_eval"}]}
259+
model: Model to evaluate (url and name are required).
260+
benchmarks: List of benchmarks to run. Mutually exclusive with 'collection'.
261+
collection: Collection reference. Mutually exclusive with 'benchmarks'.
274262
description: Optional job description.
275-
tags: Optional list of tags for organizing jobs, e.g. ["nightly", "regression"].
276-
experiment: Optional MLflow experiment config. Keys: "name", optional "tags" (list of {"key": ..., "value": ...}),
277-
optional "artifact_location".
278-
Example: {"name": "llama3-eval-experiment", "tags": [{"key": "team", "value": "nlp"}]}
263+
tags: Optional list of tags for organizing jobs.
264+
experiment: Optional MLflow experiment configuration.
279265
"""
280-
has_benchmarks = bool(benchmarks)
266+
has_benchmarks = benchmarks is not None
281267
has_collection = collection is not None
282268
if has_benchmarks == has_collection:
283269
raise ValueError("Provide exactly one of 'benchmarks' or 'collection'.")
284270

285-
client = _get_client()
286-
287-
model_config = ModelConfig(**model)
288-
289-
benchmark_configs = None
290-
if benchmarks is not None:
291-
benchmark_configs = [BenchmarkConfig(**b) for b in benchmarks]
271+
if benchmarks is not None and len(benchmarks) == 0:
272+
raise ValueError("'benchmarks' cannot be empty when provided.")
292273

293-
collection_ref = None
294-
if collection is not None:
295-
collection_ref = CollectionRef(**collection)
296-
297-
experiment_config = None
298-
if experiment is not None:
299-
experiment_config = ExperimentConfig(**experiment)
274+
client = _get_client()
300275

301276
request = JobSubmissionRequest(
302277
name=name,
303278
description=description,
304279
tags=tags or [],
305-
model=model_config,
306-
benchmarks=benchmark_configs,
307-
collection=collection_ref,
308-
experiment=experiment_config,
280+
model=model,
281+
benchmarks=benchmarks,
282+
collection=collection,
283+
experiment=experiment,
309284
)
310285

311286
job = await client.jobs.submit(request)

tests/unit/test_mcp_server.py

Lines changed: 101 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,13 @@
3232
BenchmarkConfig,
3333
BenchmarkReference,
3434
Collection,
35+
CollectionRef,
3536
EvaluationJob,
3637
EvaluationJobResource,
3738
EvaluationJobStatus,
39+
ExperimentConfig,
3840
JobStatus,
41+
ModelAuth,
3942
ModelConfig,
4043
Provider,
4144
Resource,
@@ -269,16 +272,83 @@ async def test_list_tools() -> None:
269272
assert len(tool_names) == 2
270273

271274

275+
async def test_submit_evaluation_schema() -> None:
276+
"""Verify the generated inputSchema contains typed $defs for Pydantic models."""
277+
tools = await mcp.list_tools()
278+
tool = next(t for t in tools if t.name == "submit_evaluation")
279+
schema = tool.inputSchema
280+
281+
# Required top-level params
282+
assert "name" in schema["required"]
283+
assert "model" in schema["required"]
284+
285+
# Pydantic models generate $defs with full property definitions
286+
defs = schema["$defs"]
287+
assert "ModelConfig" in defs
288+
assert "BenchmarkConfig" in defs
289+
290+
# ModelConfig has url and name as required
291+
model_def = defs["ModelConfig"]
292+
assert "url" in model_def["properties"]
293+
assert "name" in model_def["properties"]
294+
assert "url" in model_def["required"]
295+
assert "name" in model_def["required"]
296+
297+
# BenchmarkConfig has id and provider_id as required
298+
bench_def = defs["BenchmarkConfig"]
299+
assert "id" in bench_def["properties"]
300+
assert "provider_id" in bench_def["properties"]
301+
assert "id" in bench_def["required"]
302+
assert "provider_id" in bench_def["required"]
303+
304+
305+
async def test_submit_evaluation_wire_path(mock_client: MagicMock) -> None:
306+
"""Invoke submit_evaluation through FastMCP's call_tool with JSON-like dicts."""
307+
await mcp.call_tool(
308+
"submit_evaluation",
309+
{
310+
"name": "wire-eval",
311+
"model": {"url": "http://model:8000/v1", "name": "llama3"},
312+
"benchmarks": [
313+
{"id": "gsm8k", "provider_id": "lm_eval"},
314+
{
315+
"id": "mmlu",
316+
"provider_id": "lm_eval",
317+
"parameters": {"num_few_shot": 5},
318+
},
319+
],
320+
"experiment": {
321+
"name": "my-experiment",
322+
"tags": [{"key": "team", "value": "nlp"}],
323+
},
324+
},
325+
)
326+
327+
mock_client.jobs.submit.assert_awaited_once()
328+
request = mock_client.jobs.submit.call_args[0][0]
329+
assert isinstance(request.model, ModelConfig)
330+
assert request.model.url == "http://model:8000/v1"
331+
assert request.model.name == "llama3"
332+
assert len(request.benchmarks) == 2
333+
assert isinstance(request.benchmarks[0], BenchmarkConfig)
334+
assert request.benchmarks[0].id == "gsm8k"
335+
assert request.benchmarks[1].parameters == {"num_few_shot": 5}
336+
assert isinstance(request.experiment, ExperimentConfig)
337+
assert request.experiment.name == "my-experiment"
338+
assert len(request.experiment.tags) == 1
339+
assert request.experiment.tags[0].key == "team"
340+
341+
272342
# ---------------------------------------------------------------------------
273-
# Tool call tests
343+
# Tool call tests (direct invocation)
274344
# ---------------------------------------------------------------------------
275345

276346

277347
async def test_submit_evaluation(mock_client: MagicMock) -> None:
278348
result = await submit_evaluation(
279349
name="my-eval",
280-
model={"url": "http://model:8000", "name": "llama3"},
281-
benchmarks=[{"id": "gsm8k", "provider_id": "lm_eval"}],
350+
model=ModelConfig(url="http://model:8000", name="llama3"),
351+
benchmarks=[BenchmarkConfig(id="gsm8k", provider_id="lm_eval")],
282352
)
283353
data = json.loads(result)
284354
assert data["name"] == "test-eval"
@@ -297,8 +367,8 @@ async def test_submit_evaluation(mock_client: MagicMock) -> None:
297367
async def test_submit_evaluation_with_collection(mock_client: MagicMock) -> None:
298368
result = await submit_evaluation(
299369
name="collection-eval",
300-
model={"url": "http://model:8000", "name": "llama3"},
301-
collection={"id": "standard"},
370+
model=ModelConfig(url="http://model:8000", name="llama3"),
371+
collection=CollectionRef(id="standard"),
302372
)
303373
json.loads(result) # validate JSON output
304374

@@ -312,12 +382,12 @@ async def test_submit_evaluation_with_collection(mock_client: MagicMock) -> None
312382
async def test_submit_evaluation_with_model_auth(mock_client: MagicMock) -> None:
313383
await submit_evaluation(
314384
name="auth-eval",
315-
model={
316-
"url": "http://model:8000",
317-
"name": "llama3",
318-
"auth": {"secret_ref": "my-secret"},
319-
},
320-
benchmarks=[{"id": "gsm8k", "provider_id": "lm_eval"}],
385+
model=ModelConfig(
386+
url="http://model:8000",
387+
name="llama3",
388+
auth=ModelAuth(secret_ref="my-secret"),
389+
),
390+
benchmarks=[BenchmarkConfig(id="gsm8k", provider_id="lm_eval")],
321391
)
322392

323393
call_args = mock_client.jobs.submit.call_args
@@ -329,9 +399,9 @@ async def test_submit_evaluation_with_model_auth(mock_client: MagicMock) -> None
329399
async def test_submit_evaluation_with_experiment(mock_client: MagicMock) -> None:
330400
await submit_evaluation(
331401
name="exp-eval",
332-
model={"url": "http://model:8000", "name": "llama3"},
333-
benchmarks=[{"id": "gsm8k", "provider_id": "lm_eval"}],
334-
experiment={"name": "my-experiment"},
402+
model=ModelConfig(url="http://model:8000", name="llama3"),
403+
benchmarks=[BenchmarkConfig(id="gsm8k", provider_id="lm_eval")],
404+
experiment=ExperimentConfig(name="my-experiment"),
335405
)
336406

337407
call_args = mock_client.jobs.submit.call_args
@@ -341,24 +411,35 @@ async def test_submit_evaluation_with_experiment(mock_client: MagicMock) -> None
341411

342412

343413
async def test_submit_evaluation_both_benchmarks_and_collection(
344-
mock_client: MagicMock
414+
mock_client: MagicMock,
345415
) -> None:
346416
with pytest.raises(ValueError, match="exactly one"):
347417
await submit_evaluation(
348418
name="bad-eval",
349-
model={"url": "http://model:8000", "name": "llama3"},
350-
benchmarks=[{"id": "gsm8k", "provider_id": "lm_eval"}],
351-
collection={"id": "standard"},
419+
model=ModelConfig(url="http://model:8000", name="llama3"),
420+
benchmarks=[BenchmarkConfig(id="gsm8k", provider_id="lm_eval")],
421+
collection=CollectionRef(id="standard"),
352422
)
353423

354424

355425
async def test_submit_evaluation_neither_benchmarks_nor_collection(
356-
mock_client: MagicMock
426+
mock_client: MagicMock,
357427
) -> None:
358428
with pytest.raises(ValueError, match="exactly one"):
359429
await submit_evaluation(
360430
name="bad-eval",
361-
model={"url": "http://model:8000", "name": "llama3"},
431+
model=ModelConfig(url="http://model:8000", name="llama3"),
432+
)
433+
434+
435+
async def test_submit_evaluation_empty_benchmarks(
436+
mock_client: MagicMock,
437+
) -> None:
438+
with pytest.raises(ValueError, match="cannot be empty"):
439+
await submit_evaluation(
440+
name="bad-eval",
441+
model=ModelConfig(url="http://model:8000", name="llama3"),
442+
benchmarks=[],
362443
)
363444

364445

0 commit comments

Comments (0)