     BenchmarkConfig,
     BenchmarkReference,
     Collection,
+    CollectionRef,
     EvaluationJob,
     EvaluationJobResource,
     EvaluationJobStatus,
+    ExperimentConfig,
     JobStatus,
+    ModelAuth,
     ModelConfig,
     Provider,
     Resource,
@@ -269,16 +272,83 @@ async def test_list_tools() -> None:
     assert len(tool_names) == 2
 
 
+async def test_submit_evaluation_schema() -> None:
+    """Verify the generated inputSchema contains typed $defs for Pydantic models."""
+    tools = await mcp.list_tools()
+    tool = next(t for t in tools if t.name == "submit_evaluation")
+    schema = tool.inputSchema
+
+    # Required top-level params
+    assert "name" in schema["required"]
+    assert "model" in schema["required"]
+
+    # Pydantic models generate $defs with full property definitions
+    defs = schema["$defs"]
+    assert "ModelConfig" in defs
+    assert "BenchmarkConfig" in defs
+
+    # ModelConfig has url and name as required
+    model_def = defs["ModelConfig"]
+    assert "url" in model_def["properties"]
+    assert "name" in model_def["properties"]
+    assert "url" in model_def["required"]
+    assert "name" in model_def["required"]
+
+    # BenchmarkConfig has id and provider_id as required
+    bench_def = defs["BenchmarkConfig"]
+    assert "id" in bench_def["properties"]
+    assert "provider_id" in bench_def["properties"]
+    assert "id" in bench_def["required"]
+    assert "provider_id" in bench_def["required"]
+
+
+async def test_submit_evaluation_wire_path(mock_client: MagicMock) -> None:
+    """Invoke submit_evaluation through FastMCP's call_tool with JSON-like dicts."""
+    await mcp.call_tool(
+        "submit_evaluation",
+        {
+            "name": "wire-eval",
+            "model": {"url": "http://model:8000/v1", "name": "llama3"},
+            "benchmarks": [
+                {"id": "gsm8k", "provider_id": "lm_eval"},
+                {
+                    "id": "mmlu",
+                    "provider_id": "lm_eval",
+                    "parameters": {"num_few_shot": 5},
+                },
+            ],
+            "experiment": {
+                "name": "my-experiment",
+                "tags": [{"key": "team", "value": "nlp"}],
+            },
+        },
+    )
+
+    mock_client.jobs.submit.assert_awaited_once()
+    request = mock_client.jobs.submit.call_args[0][0]
+    assert isinstance(request.model, ModelConfig)
+    assert request.model.url == "http://model:8000/v1"
+    assert request.model.name == "llama3"
+    assert len(request.benchmarks) == 2
+    assert isinstance(request.benchmarks[0], BenchmarkConfig)
+    assert request.benchmarks[0].id == "gsm8k"
+    assert request.benchmarks[1].parameters == {"num_few_shot": 5}
+    assert isinstance(request.experiment, ExperimentConfig)
+    assert request.experiment.name == "my-experiment"
+    assert len(request.experiment.tags) == 1
+    assert request.experiment.tags[0].key == "team"
+
+
 # ---------------------------------------------------------------------------
-# Tool call tests
+# Tool call tests (direct invocation)
 # ---------------------------------------------------------------------------
 
 
 async def test_submit_evaluation(mock_client: MagicMock) -> None:
     result = await submit_evaluation(
         name="my-eval",
-        model={"url": "http://model:8000", "name": "llama3"},
-        benchmarks=[{"id": "gsm8k", "provider_id": "lm_eval"}],
+        model=ModelConfig(url="http://model:8000", name="llama3"),
+        benchmarks=[BenchmarkConfig(id="gsm8k", provider_id="lm_eval")],
     )
     data = json.loads(result)
     assert data["name"] == "test-eval"
@@ -297,8 +367,8 @@ async def test_submit_evaluation(mock_client: MagicMock) -> None:
 async def test_submit_evaluation_with_collection(mock_client: MagicMock) -> None:
     result = await submit_evaluation(
         name="collection-eval",
-        model={"url": "http://model:8000", "name": "llama3"},
-        collection={"id": "standard"},
+        model=ModelConfig(url="http://model:8000", name="llama3"),
+        collection=CollectionRef(id="standard"),
     )
     json.loads(result)  # validate JSON output
 
@@ -312,12 +382,12 @@ async def test_submit_evaluation_with_collection(mock_client: MagicMock) -> None
 async def test_submit_evaluation_with_model_auth(mock_client: MagicMock) -> None:
     await submit_evaluation(
         name="auth-eval",
-        model={
-            "url": "http://model:8000",
-            "name": "llama3",
-            "auth": {"secret_ref": "my-secret"},
-        },
-        benchmarks=[{"id": "gsm8k", "provider_id": "lm_eval"}],
+        model=ModelConfig(
+            url="http://model:8000",
+            name="llama3",
+            auth=ModelAuth(secret_ref="my-secret"),
+        ),
+        benchmarks=[BenchmarkConfig(id="gsm8k", provider_id="lm_eval")],
     )
 
     call_args = mock_client.jobs.submit.call_args
@@ -329,9 +399,9 @@ async def test_submit_evaluation_with_model_auth(mock_client: MagicMock) -> None
 async def test_submit_evaluation_with_experiment(mock_client: MagicMock) -> None:
     await submit_evaluation(
         name="exp-eval",
-        model={"url": "http://model:8000", "name": "llama3"},
-        benchmarks=[{"id": "gsm8k", "provider_id": "lm_eval"}],
-        experiment={"name": "my-experiment"},
+        model=ModelConfig(url="http://model:8000", name="llama3"),
+        benchmarks=[BenchmarkConfig(id="gsm8k", provider_id="lm_eval")],
+        experiment=ExperimentConfig(name="my-experiment"),
     )
 
     call_args = mock_client.jobs.submit.call_args
@@ -341,24 +411,35 @@ async def test_submit_evaluation_with_experiment(mock_client: MagicMock) -> None
 
 
 async def test_submit_evaluation_both_benchmarks_and_collection(
-    mock_client: MagicMock
+    mock_client: MagicMock,
 ) -> None:
     with pytest.raises(ValueError, match="exactly one"):
         await submit_evaluation(
             name="bad-eval",
-            model={"url": "http://model:8000", "name": "llama3"},
-            benchmarks=[{"id": "gsm8k", "provider_id": "lm_eval"}],
-            collection={"id": "standard"},
+            model=ModelConfig(url="http://model:8000", name="llama3"),
+            benchmarks=[BenchmarkConfig(id="gsm8k", provider_id="lm_eval")],
+            collection=CollectionRef(id="standard"),
         )
 
 
 async def test_submit_evaluation_neither_benchmarks_nor_collection(
-    mock_client: MagicMock
+    mock_client: MagicMock,
 ) -> None:
     with pytest.raises(ValueError, match="exactly one"):
         await submit_evaluation(
             name="bad-eval",
-            model={"url": "http://model:8000", "name": "llama3"},
+            model=ModelConfig(url="http://model:8000", name="llama3"),
+        )
+
+
+async def test_submit_evaluation_empty_benchmarks(
+    mock_client: MagicMock,
+) -> None:
+    with pytest.raises(ValueError, match="cannot be empty"):
+        await submit_evaluation(
+            name="bad-eval",
+            model=ModelConfig(url="http://model:8000", name="llama3"),
+            benchmarks=[],
        )
 
 
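For readers skimming test_submit_evaluation_schema above: its assertions pin down roughly the shape below for the generated inputSchema. This is a minimal sketch, not FastMCP output; only the required lists and the presence of the checked properties and $defs entries are guaranteed by the test, and the concrete "type" values shown are assumptions.

# Sketch of the schema shape asserted by test_submit_evaluation_schema.
# Only the keys the test checks are guaranteed; "type" values are assumed.
expected_shape = {
    "required": ["name", "model"],
    "$defs": {
        "ModelConfig": {
            "properties": {"url": {"type": "string"}, "name": {"type": "string"}},
            "required": ["url", "name"],
        },
        "BenchmarkConfig": {
            "properties": {"id": {"type": "string"}, "provider_id": {"type": "string"}},
            "required": ["id", "provider_id"],
        },
    },
}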
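The three error-path tests together imply a small validation step inside submit_evaluation: reject both-or-neither of benchmarks/collection, and reject an empty benchmarks list. A minimal sketch of logic that would satisfy them follows; the helper name _validate_selection and any message wording beyond the matched fragments "exactly one" and "cannot be empty" are assumptions, not the actual server code.

# Hypothetical helper, not the actual implementation; only the matched
# fragments "exactly one" and "cannot be empty" come from the tests.
def _validate_selection(
    benchmarks: list[BenchmarkConfig] | None,
    collection: CollectionRef | None,
) -> None:
    if (benchmarks is None) == (collection is None):
        # Rejects both the "both given" and the "neither given" cases.
        raise ValueError("Provide exactly one of 'benchmarks' or 'collection'.")
    if benchmarks is not None and not benchmarks:
        # An explicit empty list is distinct from omitting benchmarks.
        raise ValueError("'benchmarks' cannot be empty.")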