Skip to content

Commit 0820a9c

Browse files
committed
move mcp-eval into opslevel-mcp
1 parent 46d5966 commit 0820a9c

File tree

17 files changed

+3594
-0
lines changed

17 files changed

+3594
-0
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,7 @@ src/*.yaml
88
.vscode/
99
src/cli
1010
src/opslevel-mcp
11+
mcp-eval/node_modules/
12+
.env
13+
mcp-eval/results/*
14+
!mcp-eval/results/.keep

mcp-eval/.node-version

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
20.17.0

mcp-eval/.prettierignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Ignore artifacts:
2+
build
3+
coverage

mcp-eval/.prettierrc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{}

mcp-eval/README.md

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
# mcp-eval
2+
3+
Evaluate result of various queries/prompts using opslevel-mcp.
4+
5+
At the moment it only uses Anthropic's Claude API, but OpenAI support should come soon.
6+
7+
## Setup
8+
9+
Copy the `template.env` to `.env` and set the necessary variables.
10+
11+
```sh
12+
ANTHROPIC_API_KEY='' # required
13+
OPSLEVEL_APP_URL='' # optional
14+
OPSLEVEL_API_TOKEN='' # required
15+
MCP_SERVER_PATH='' # required
16+
```
17+
18+
You can also pass these as flags to the command line.
19+
See `yarn eval --help` for more details.
20+
21+
## Prompts
22+
23+
Prompts are kept in `prompts.js`. Each prompt has a `slug` and a `query`.
24+
25+
## Running
26+
27+
The `MCP_SERVER_PATH` environment variable is the path on your local file system that points to the MCP server you want to test.
28+
29+
```
30+
yarn run eval
31+
```
32+
33+
If you don't set the `MCP_SERVER_PATH` environment variable, you must provide the path with the `--mcp-server-path` command-line option.
34+
35+
```
36+
yarn run eval --mcp-server-path path/to/opslevel-mcp
37+
```
38+
39+
If you provide both the `MCP_SERVER_PATH` variable and the command-line argument, the command-line argument takes precedence.
40+
41+
For more detailed logging:
42+
43+
```
44+
DEBUG=true yarn run eval
45+
```
46+
47+
## Results
48+
49+
The results are in the `results` folder. Each run creates a new folder named after the UTC time of the run (e.g. `2025-04-21_17-57/`).
50+
51+
For each slug in `prompts.js`, there's a `<slug>.json` file. Example:
52+
53+
```json
54+
{
55+
"prompt": {
56+
"slug": "employees",
57+
"query": "Who works at opslevel"
58+
},
59+
"response": "Based on the information retrieved from the OpsLevel account, there are 2 users registered:\n\n1. **Alice**\n - Email: [email protected]\n - Role: Admin\n\n2. **Foobar**\n - Email: [email protected]\n - Role: Team Member\n\nThese are the individuals who have accounts in this OpsLevel system. Alice appears to have administrator privileges, while Foobar is a regular team member.",
60+
"raw_messages": [
61+
{
62+
"role": "user",
63+
"content": "Who works at opslevel"
64+
},
65+
{
66+
"role": "assistant",
67+
"content": [
68+
{
69+
"type": "text",
70+
"text": "I'll help you find information about who works at OpsLevel. Let me retrieve the list of users in the OpsLevel account."
71+
},
72+
{
73+
"type": "tool_use",
74+
"id": "toolu_013HeznQfrB3tLQbRaDfvPdi",
75+
"name": "users",
76+
"input": {}
77+
}
78+
]
79+
},
80+
{
81+
"role": "user",
82+
"content": [
83+
{
84+
"type": "tool_result",
85+
"tool_use_id": "toolu_013HeznQfrB3tLQbRaDfvPdi",
86+
"content": [
87+
{
88+
"type": "text",
89+
"text": "[{\"Id\":\"Z2lkOi8vb3BzbGV2ZWwvVXNlci8x\",\"Email\":\"[email protected]\",\"HTMLUrl\":\"http://app.opslevel.local:3000/users/1\",\"Name\":\"Alice\",\"Role\":\"admin\"},{\"Id\":\"Z2lkOi8vb3BzbGV2ZWwvVXNlci8y\",\"Email\":\"[email protected]\",\"HTMLUrl\":\"http://app.opslevel.local:3000/users/2\",\"Name\":\"Foobar\",\"Role\":\"team_member\"}]"
90+
}
91+
]
92+
}
93+
]
94+
},
95+
{
96+
"role": "assistant",
97+
"content": [
98+
{
99+
"type": "text",
100+
"text": "Based on the information retrieved from the OpsLevel account, there are 2 users registered:\n\n1. **Alice**\n - Email: [email protected]\n - Role: Admin\n\n2. **Foobar**\n - Email: [email protected]\n - Role: Team Member\n\nThese are the individuals who have accounts in this OpsLevel system. Alice appears to have administrator privileges, while Foobar is a regular team member."
101+
}
102+
]
103+
}
104+
],
105+
"run_at": "2025-04-21T17:57:49.157Z",
106+
"ops_level_mcp_version": "unknown",
107+
"model_config": {
108+
"model": "claude-3-7-sonnet-20250219",
109+
"max_tokens": 1000
110+
}
111+
}
112+
```

mcp-eval/jsconfig.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"compilerOptions": {
3+
"module": "Node16",
4+
"target": "ES2022",
5+
"checkJs": false
6+
},
7+
"exclude": ["node_modules"]
8+
}

mcp-eval/package.json

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"name": "mcp-eval",
3+
"version": "0.0.1",
4+
"description": "Framework for evaluating ops level mcp server",
5+
"main": "src/index.js",
6+
"author": "OpsLevel",
7+
"license": "UNLICENSED",
8+
"private": true,
9+
"type": "module",
10+
"scripts": {
11+
"eval": "node ./src/eval_cli.js",
12+
"eval-debug": "node inspect ./src/eval_cli.js",
13+
"report": "node ./src/report_cli.js",
14+
"test": "node --experimental-vm-modules node_modules/jest/bin/jest.js",
15+
"lint": "prettier --write ."
16+
},
17+
"dependencies": {
18+
"@anthropic-ai/sdk": "^0.39.0",
19+
"@modelcontextprotocol/sdk": "^1.10.0",
20+
"commander": "^13.1.0",
21+
"dotenv": "^16.5.0",
22+
"handlebars": "^4.7.8",
23+
"marked": "^15.0.9"
24+
},
25+
"devDependencies": {
26+
"jest": "^29.7.0",
27+
"prettier": "^3.5.3"
28+
}
29+
}

mcp-eval/results/.keep

Whitespace-only changes.

mcp-eval/src/eval_cli.js

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
import { mkdir, writeFile } from "node:fs/promises";
2+
3+
import { Command, Option } from "commander";
4+
import dotenv from "dotenv";
5+
6+
import { MCPClient } from "./mcp_client.js";
7+
import { PROMPTS } from "./prompts.js";
8+
import { renderReport, saveReport } from "./report.js";
9+
10+
// Load variables from a local `.env` file into process.env so the
// `.env(...)` bindings on the options below can pick them up.
dotenv.config();

// Every option the CLI understands, listed in the order it is registered.
// Each one can be supplied as a flag or through the named environment variable.
const cliOptions = [
  new Option("--debug", "Enable debug logging").env("DEBUG").default(false),

  new Option("--anthropic-api-key <key>", "Anthropic API key")
    .env("ANTHROPIC_API_KEY")
    .makeOptionMandatory(true),

  new Option("--anthropic-model <model>", "Anthropic model")
    .env("ANTHROPIC_MODEL")
    .default("claude-3-7-sonnet-20250219"),

  new Option("--mcp-server-path <path>", "Path to MCP server binary")
    .env("MCP_SERVER_PATH")
    .makeOptionMandatory(true),

  new Option("--opslevel-api-token <token>", "OpsLevel API token")
    .env("OPSLEVEL_API_TOKEN")
    .makeOptionMandatory(true),

  new Option(
    "--opslevel-app-url <url>",
    "OpsLevel app URL for local development or self-hosted",
  ).env("OPSLEVEL_APP_URL"),

  new Option("--slugs <slugs...>", "Prompt Slugs to run, defaults to all"),
];

// The command-line program itself; parsed later inside main().
const program = new Command();

program
  .name("mcp-eval")
  .description("mcp evaluation tool from opslevel")
  .version("0.0.1");

for (const cliOption of cliOptions) {
  program.addOption(cliOption);
}
48+
49+
/**
 * Writes `data` as pretty-printed (2-space) JSON to `<folder>/<fileName>.json`
 * and logs the destination path once the write has completed.
 *
 * @param {string} folder - Directory the file is written into.
 * @param {string} fileName - File name without the `.json` extension.
 * @param {unknown} data - Value to serialise with JSON.stringify.
 * @returns {Promise<void>} Settles when the file has been written.
 */
async function saveResult(folder, fileName, data) {
  const destination = [folder, `${fileName}.json`].join("/");
  const serialized = JSON.stringify(data, null, 2);

  await writeFile(destination, serialized);
  console.log("Result saved to:", destination);
}
54+
55+
/**
 * Pulls the human-readable text out of the last message of a conversation.
 *
 * Message `content` is either a plain string or an array of content blocks.
 * The first block carrying a string `text` is used, so a leading non-text
 * block (such as a `tool_use` block) no longer yields `undefined`.
 *
 * @param {Array<{content: unknown}>} messages - Conversation messages.
 * @returns {string} The response text, or "" when none can be found.
 */
const extractResponseText = (messages) => {
  const content = messages?.at(-1)?.content;

  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return "";
  }

  const textBlock = content.find((block) => typeof block?.text === "string");
  return textBlock?.text ?? "";
};

/**
 * Bundles everything known about one evaluated prompt into the object that
 * is written to disk and fed to the report.
 *
 * @param {object} prompt - The prompt that was run.
 * @param {Array<object>} messages - Full message exchange for the prompt.
 * @param {string} version - Version reported by the MCP server.
 * @param {object} model_config - Model settings used for the run.
 * @returns {object} Result with `prompt`, `response`, `raw_messages`,
 *   `run_at` (ISO-8601 timestamp taken at packaging time),
 *   `ops_level_mcp_version` and `model_config`.
 */
const packageResult = (prompt, messages, version, model_config) => {
  // maybe we can extract tool calls and results
  return {
    prompt: prompt,
    response: extractResponseText(messages),
    raw_messages: messages,
    run_at: new Date().toISOString(),
    ops_level_mcp_version: version,
    model_config: model_config,
  };
};
66+
67+
/**
 * Runs the evaluation end to end:
 *
 * 1. parses CLI flags / environment variables,
 * 2. creates a timestamped folder under `results/`,
 * 3. connects to the MCP server and runs every selected prompt, saving one
 *    `<slug>.json` per prompt,
 * 4. renders and saves a report covering all collected results.
 *
 * The MCP client is always cleaned up, even when a prompt fails. Errors are
 * not caught here; they reject the returned promise.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log("Starting MCP eval client");

  program.parse(process.argv);
  const {
    anthropicApiKey,
    anthropicModel,
    mcpServerPath,
    opslevelApiToken,
    opslevelAppUrl,
    debug,
    slugs,
  } = program.opts();

  const mcpClient = new MCPClient(anthropicApiKey, anthropicModel, debug);
  const results = [];
  const now = new Date();

  // Folder name is the UTC timestamp as yyyy-mm-dd_hh-mm: the first 16
  // characters of the ISO string with ":" and "T" swapped for characters
  // that are safe in file names.
  const folder = `results/${now.toISOString().slice(0, 16)}`
    .replace(/:/g, "-")
    .replace("T", "_");
  await mkdir(folder, { recursive: true });
  console.log("Saving results to folder:", folder);

  try {
    await mcpClient.connectToServer(mcpServerPath, [], {
      OPSLEVEL_API_TOKEN: opslevelApiToken,
      OPSLEVEL_APP_URL: opslevelAppUrl,
    });

    // Resolve the server info first, then read `version` from it. Awaiting
    // the call result works whether it is returned directly or as a promise,
    // and a missing server info no longer throws.
    const serverInfo = await mcpClient.mcp.getServerVersion();
    const version = serverInfo?.version ?? "unknown";

    // Filter prompts based on slugs if provided
    const filteredPrompts = slugs
      ? PROMPTS.filter((prompt) => slugs.includes(prompt.slug))
      : PROMPTS;

    for (const prompt of filteredPrompts) {
      console.log("Prompt: ", prompt.query);
      const messages = await mcpClient.processQuery(prompt.query);
      const result = packageResult(
        prompt,
        messages,
        version,
        mcpClient.modelConfig(),
      );
      // Await the write so a failure rejects main() instead of becoming an
      // unhandled rejection from a floating promise.
      await saveResult(folder, prompt.slug, result);
      results.push(result);
    }
  } finally {
    await mcpClient.cleanup();
  }

  const report = renderReport(results);
  // Awaiting is harmless if saveReport is synchronous and required if it
  // returns a promise.
  await saveReport(folder, report);
}
124+
125+
// Start the CLI. Failures are reported explicitly and the process exits with
// a non-zero status rather than leaving the promise without a handler.
main().catch((error) => {
  console.error("mcp-eval failed:", error);
  process.exit(1);
});

0 commit comments

Comments
 (0)