Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 16 additions & 44 deletions src/evalhub/mcp/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,41 +241,27 @@ async def handle_completion(
)
async def submit_evaluation(
name: str,
model: dict[str, Any],
benchmarks: list[dict[str, Any]] | None = None,
collection: dict[str, Any] | None = None,
model: ModelConfig,
benchmarks: list[BenchmarkConfig] | None = None,
collection: CollectionRef | None = None,
description: str | None = None,
tags: list[str] | None = None,
experiment: dict[str, Any] | None = None,
experiment: ExperimentConfig | None = None,
) -> str:
"""Submit a new evaluation job.

Evaluation job fields are separated so that AI agents can fill them in more easily.
Provide either 'benchmarks' or 'collection', not both.
Use the providers and benchmarks resources to discover available
provider_id and benchmark id values.

Args:
name: Job name.
model: Model to evaluate. Keys: "url" (model endpoint), "name" (model identifier),
optional "auth" with "secret_ref" (Kubernetes Secret name for model credentials).
Examples:
Remote vLLM: {"url": "http://vllm-server.models.svc.cluster.local:8000/v1", "name": "meta-llama/Llama-3.2-1B-Instruct"}
With auth: {"url": "http://model:8000/v1", "name": "my-model", "auth": {"secret_ref": "model-api-key"}}
benchmarks: List of benchmarks to run. Each entry has "id", "provider_id", and optional "parameters".
Mutually exclusive with 'collection'.
Examples:
Simple: [{"id": "demo_benchmark", "provider_id": "demo"}]
With params: [{"id": "quick_perf_test", "provider_id": "guidellm", "parameters": {"profile": "constant", "rate": 5, "max_seconds": 10, "max_requests": 20}}]
Multiple: [{"id": "gsm8k", "provider_id": "lm_eval"}, {"id": "mmlu", "provider_id": "lm_eval"}]
collection: Collection reference to run all benchmarks in a predefined collection.
Mutually exclusive with 'benchmarks'. Keys: "id" (collection identifier),
optional "benchmarks" to run only a subset.
Examples:
Full collection: {"id": "standard"}
Subset: {"id": "standard", "benchmarks": [{"id": "gsm8k", "provider_id": "lm_eval"}]}
model: Model to evaluate (url and name are required).
benchmarks: List of benchmarks to run. Mutually exclusive with 'collection'.
collection: Collection reference. Mutually exclusive with 'benchmarks'.
description: Optional job description.
tags: Optional list of tags for organizing jobs, e.g. ["nightly", "regression"].
experiment: Optional MLflow experiment config. Keys: "name", optional "tags" (list of {"key": ..., "value": ...}),
optional "artifact_location".
Example: {"name": "llama3-eval-experiment", "tags": [{"key": "team", "value": "nlp"}]}
tags: Optional list of tags for organizing jobs.
experiment: Optional MLflow experiment configuration.
"""
has_benchmarks = bool(benchmarks)
has_collection = collection is not None
Expand All @@ -284,28 +270,14 @@ async def submit_evaluation(

client = _get_client()

model_config = ModelConfig(**model)

benchmark_configs = None
if benchmarks is not None:
benchmark_configs = [BenchmarkConfig(**b) for b in benchmarks]

collection_ref = None
if collection is not None:
collection_ref = CollectionRef(**collection)

experiment_config = None
if experiment is not None:
experiment_config = ExperimentConfig(**experiment)

request = JobSubmissionRequest(
name=name,
description=description,
tags=tags or [],
model=model_config,
benchmarks=benchmark_configs,
collection=collection_ref,
experiment=experiment_config,
model=model,
benchmarks=benchmarks,
collection=collection,
experiment=experiment,
)

job = await client.jobs.submit(request)
Expand Down
41 changes: 22 additions & 19 deletions tests/unit/test_mcp_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,13 @@
BenchmarkConfig,
BenchmarkReference,
Collection,
CollectionRef,
EvaluationJob,
EvaluationJobResource,
EvaluationJobStatus,
ExperimentConfig,
JobStatus,
ModelAuth,
ModelConfig,
Provider,
Resource,
Expand Down Expand Up @@ -277,8 +280,8 @@ async def test_list_tools() -> None:
async def test_submit_evaluation(mock_client: MagicMock) -> None:
result = await submit_evaluation(
name="my-eval",
model={"url": "http://model:8000", "name": "llama3"},
benchmarks=[{"id": "gsm8k", "provider_id": "lm_eval"}],
model=ModelConfig(url="http://model:8000", name="llama3"),
benchmarks=[BenchmarkConfig(id="gsm8k", provider_id="lm_eval")],
)
data = json.loads(result)
assert data["name"] == "test-eval"
Expand All @@ -297,8 +300,8 @@ async def test_submit_evaluation(mock_client: MagicMock) -> None:
async def test_submit_evaluation_with_collection(mock_client: MagicMock) -> None:
result = await submit_evaluation(
name="collection-eval",
model={"url": "http://model:8000", "name": "llama3"},
collection={"id": "standard"},
model=ModelConfig(url="http://model:8000", name="llama3"),
collection=CollectionRef(id="standard"),
)
json.loads(result) # validate JSON output

Expand All @@ -312,12 +315,12 @@ async def test_submit_evaluation_with_collection(mock_client: MagicMock) -> None
async def test_submit_evaluation_with_model_auth(mock_client: MagicMock) -> None:
await submit_evaluation(
name="auth-eval",
model={
"url": "http://model:8000",
"name": "llama3",
"auth": {"secret_ref": "my-secret"},
},
benchmarks=[{"id": "gsm8k", "provider_id": "lm_eval"}],
model=ModelConfig(
url="http://model:8000",
name="llama3",
auth=ModelAuth(secret_ref="my-secret"),
),
benchmarks=[BenchmarkConfig(id="gsm8k", provider_id="lm_eval")],
)

call_args = mock_client.jobs.submit.call_args
Expand All @@ -329,9 +332,9 @@ async def test_submit_evaluation_with_model_auth(mock_client: MagicMock) -> None
async def test_submit_evaluation_with_experiment(mock_client: MagicMock) -> None:
await submit_evaluation(
name="exp-eval",
model={"url": "http://model:8000", "name": "llama3"},
benchmarks=[{"id": "gsm8k", "provider_id": "lm_eval"}],
experiment={"name": "my-experiment"},
model=ModelConfig(url="http://model:8000", name="llama3"),
benchmarks=[BenchmarkConfig(id="gsm8k", provider_id="lm_eval")],
experiment=ExperimentConfig(name="my-experiment"),
)

call_args = mock_client.jobs.submit.call_args
Expand All @@ -341,24 +344,24 @@ async def test_submit_evaluation_with_experiment(mock_client: MagicMock) -> None


async def test_submit_evaluation_both_benchmarks_and_collection(
mock_client: MagicMock
mock_client: MagicMock,
) -> None:
with pytest.raises(ValueError, match="exactly one"):
await submit_evaluation(
name="bad-eval",
model={"url": "http://model:8000", "name": "llama3"},
benchmarks=[{"id": "gsm8k", "provider_id": "lm_eval"}],
collection={"id": "standard"},
model=ModelConfig(url="http://model:8000", name="llama3"),
benchmarks=[BenchmarkConfig(id="gsm8k", provider_id="lm_eval")],
collection=CollectionRef(id="standard"),
)


async def test_submit_evaluation_neither_benchmarks_nor_collection(
mock_client: MagicMock
mock_client: MagicMock,
) -> None:
with pytest.raises(ValueError, match="exactly one"):
await submit_evaluation(
name="bad-eval",
model={"url": "http://model:8000", "name": "llama3"},
model=ModelConfig(url="http://model:8000", name="llama3"),
)


Expand Down
Loading