OpsLevel
diff --git a/‎.gitignore‎
Lines changed: 4 additions & 0 deletions b/‎.gitignore‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎mcp-eval/.node-version‎
Lines changed: 1 addition & 0 deletions b/‎mcp-eval/.node-version‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎mcp-eval/.prettierignore‎
Lines changed: 3 additions & 0 deletions b/‎mcp-eval/.prettierignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎mcp-eval/.prettierrc‎
Lines changed: 1 addition & 0 deletions b/‎mcp-eval/.prettierrc‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎mcp-eval/README.md‎
Lines changed: 112 additions & 0 deletions b/‎mcp-eval/README.md‎
Lines changed: 112 additions & 0 deletions
diff --git a/‎mcp-eval/jsconfig.json‎
Lines changed: 8 additions & 0 deletions b/‎mcp-eval/jsconfig.json‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎mcp-eval/package.json‎
Lines changed: 29 additions & 0 deletions b/‎mcp-eval/package.json‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎mcp-eval/results/.keep‎ b/‎mcp-eval/results/.keep‎
diff --git a/‎mcp-eval/src/eval_cli.js‎
Lines changed: 125 additions & 0 deletions b/‎mcp-eval/src/eval_cli.js‎
Lines changed: 125 additions & 0 deletions
@@ -8,3 +8,7 @@ src/*.yaml
 .vscode/
 src/cli
 src/opslevel-mcp
+mcp-eval/node_modules/
+.env
+mcp-eval/results/*
+!mcp-eval/results/.keep
@@ -0,0 +1 @@
+20.17.0
@@ -0,0 +1,3 @@
+# Ignore artifacts:
+build
+coverage
@@ -0,0 +1 @@
+{}
@@ -0,0 +1,112 @@
+# mcp-eval
+
+Evaluate the results of various queries/prompts using opslevel-mcp.
+
+At the moment it just uses Anthropic's Claude API, but OpenAI support should come soon.
+
+## Setup
+
+Copy the `template.env` to `.env` and set the necessary variables.
+
+```sh
+ANTHROPIC_API_KEY='' # required
+OPSLEVEL_APP_URL='' # optional
+OPSLEVEL_API_TOKEN='' # required
+MCP_SERVER_PATH='' # required
+```
+
+You can also pass these as flags to the command line.
+See `yarn eval --help` for more details.
+
+## Prompts
+
+Prompts are kept in `prompts.js`. Each prompt has a `slug` and a `query`.
+
+## Running
+
+The `MCP_SERVER_PATH` environment variable is the path on your local file system that points to the MCP server you want to test.
+
+```
+yarn run eval
+```
+
+If you don't set the `MCP_SERVER_PATH` environment variable, you must provide the path with the `--mcp-server-path` flag.
+
+```
+yarn run eval --mcp-server-path path/to/opslevel-mcp
+```
+
+If you provide both the `MCP_SERVER_PATH` variable and the `--mcp-server-path` flag, the flag takes precedence.
+
+For more detailed logging:
+
+```
+DEBUG=true yarn run eval
+```
+
+## Results
+
+The results are in the `results` folder. Each run creates a new folder named after the UTC time of the run (eg: `2025-04-21_17-57/`).
+
+For each slug in `prompts.js`, there's a `<slug>.json` file. Example:
+
+```json
+{
+  "prompt": {
+    "slug": "employees",
+    "query": "Who works at opslevel"
+  },
+  "response": "Based on the information retrieved from the OpsLevel account, there are 2 users registered:\n\n1. **Alice**\n   - Email: [email protected]\n   - Role: Admin\n\n2. **Foobar**\n   - Email: [email protected]\n   - Role: Team Member\n\nThese are the individuals who have accounts in this OpsLevel system. Alice appears to have administrator privileges, while Foobar is a regular team member.",
+  "raw_messages": [
+    {
+      "role": "user",
+      "content": "Who works at opslevel"
+    },
+    {
+      "role": "assistant",
+      "content": [
+        {
+          "type": "text",
+          "text": "I'll help you find information about who works at OpsLevel. Let me retrieve the list of users in the OpsLevel account."
+        },
+        {
+          "type": "tool_use",
+          "id": "toolu_013HeznQfrB3tLQbRaDfvPdi",
+          "name": "users",
+          "input": {}
+        }
+      ]
+    },
+    {
+      "role": "user",
+      "content": [
+        {
+          "type": "tool_result",
+          "tool_use_id": "toolu_013HeznQfrB3tLQbRaDfvPdi",
+          "content": [
+            {
+              "type": "text",
+              "text": "[{\"Id\":\"Z2lkOi8vb3BzbGV2ZWwvVXNlci8x\",\"Email\":\"[email protected]\",\"HTMLUrl\":\"http://app.opslevel.local:3000/users/1\",\"Name\":\"Alice\",\"Role\":\"admin\"},{\"Id\":\"Z2lkOi8vb3BzbGV2ZWwvVXNlci8y\",\"Email\":\"[email protected]\",\"HTMLUrl\":\"http://app.opslevel.local:3000/users/2\",\"Name\":\"Foobar\",\"Role\":\"team_member\"}]"
+            }
+          ]
+        }
+      ]
+    },
+    {
+      "role": "assistant",
+      "content": [
+        {
+          "type": "text",
+          "text": "Based on the information retrieved from the OpsLevel account, there are 2 users registered:\n\n1. **Alice**\n   - Email: [email protected]\n   - Role: Admin\n\n2. **Foobar**\n   - Email: [email protected]\n   - Role: Team Member\n\nThese are the individuals who have accounts in this OpsLevel system. Alice appears to have administrator privileges, while Foobar is a regular team member."
+        }
+      ]
+    }
+  ],
+  "run_at": "2025-04-21T17:57:49.157Z",
+  "ops_level_mcp_version": "unknown",
+  "model_config": {
+    "model": "claude-3-7-sonnet-20250219",
+    "max_tokens": 1000
+  }
+}
+```
@@ -0,0 +1,8 @@
+{
+  "compilerOptions": {
+    "module": "Node16",
+    "target": "ES2022",
+    "checkJs": false
+  },
+  "exclude": ["node_modules"]
+}
@@ -0,0 +1,29 @@
+{
+  "name": "mcp-eval",
+  "version": "0.0.1",
+  "description": "Framework for evaluating ops level mcp server",
+  "main": "src/index.js",
+  "author": "OpsLevel",
+  "license": "UNLICENSED",
+  "private": true,
+  "type": "module",
+  "scripts": {
+    "eval": "node ./src/eval_cli.js",
+    "eval-debug": "node inspect ./src/eval_cli.js",
+    "report": "node ./src/report_cli.js",
+    "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js",
+    "lint": "prettier --write ."
+  },
+  "dependencies": {
+    "@anthropic-ai/sdk": "^0.39.0",
+    "@modelcontextprotocol/sdk": "^1.10.0",
+    "commander": "^13.1.0",
+    "dotenv": "^16.5.0",
+    "handlebars": "^4.7.8",
+    "marked": "^15.0.9"
+  },
+  "devDependencies": {
+    "jest": "^29.7.0",
+    "prettier": "^3.5.3"
+  }
+}
@@ -0,0 +1,125 @@
+import { mkdir, writeFile } from "node:fs/promises";
+
+import { Command, Option } from "commander";
+import dotenv from "dotenv";
+
+import { MCPClient } from "./mcp_client.js";
+import { PROMPTS } from "./prompts.js";
+import { renderReport, saveReport } from "./report.js";
+
+// Load a local `.env` file (see the README setup section) before any option
+// values are read in main().
+dotenv.config();
+
+// All CLI options, listed in the order they are registered on the program.
+// Options with `.env(...)` can also be supplied through that environment variable.
+const cliOptions = [
+  new Option("--debug", "Enable debug logging").env("DEBUG").default(false),
+  new Option("--anthropic-api-key <key>", "Anthropic API key")
+    .env("ANTHROPIC_API_KEY")
+    .makeOptionMandatory(true),
+  new Option("--anthropic-model <model>", "Anthropic model")
+    .env("ANTHROPIC_MODEL")
+    .default("claude-3-7-sonnet-20250219"),
+  new Option("--mcp-server-path <path>", "Path to MCP server binary")
+    .env("MCP_SERVER_PATH")
+    .makeOptionMandatory(true),
+  new Option("--opslevel-api-token <token>", "OpsLevel API token")
+    .env("OPSLEVEL_API_TOKEN")
+    .makeOptionMandatory(true),
+  new Option(
+    "--opslevel-app-url <url>",
+    "OpsLevel app URL for local development or self-hosted",
+  ).env("OPSLEVEL_APP_URL"),
+  new Option("--slugs <slugs...>", "Prompt Slugs to run, defaults to all"),
+];
+
+// The command-line program; it is parsed inside main().
+const program = new Command();
+program.name("mcp-eval");
+program.description("mcp evaluation tool from opslevel");
+program.version("0.0.1");
+for (const cliOption of cliOptions) {
+  program.addOption(cliOption);
+}
+
+/**
+ * Writes one evaluation result to `<folder>/<fileName>.json` as
+ * pretty-printed JSON (2-space indent) and logs where it was written.
+ *
+ * @param {string} folder - Directory that receives the file.
+ * @param {string} fileName - File name without the `.json` extension.
+ * @param {unknown} data - Value to serialise.
+ * @returns {Promise<void>} Resolves after the file has been written.
+ */
+const saveResult = async (folder, fileName, data) => {
+  const destination = [folder, "/", fileName, ".json"].join("");
+  const serialized = JSON.stringify(data, null, 2);
+
+  await writeFile(destination, serialized);
+  console.log("Result saved to:", destination);
+};
+
+/**
+ * Returns the text of the final message in a conversation.
+ *
+ * Message content is either a plain string or an array of content blocks
+ * (both shapes appear in the README example output). For block arrays the
+ * first block carrying a string `text` is used, so a leading non-text block
+ * such as `tool_use` no longer hides the answer.
+ *
+ * @param {Array<{content?: unknown}>} messages - Conversation, oldest first.
+ * @returns {string|null} The text, or null when there is none to extract.
+ */
+const extractResponseText = (messages) => {
+  const content = messages.at(-1)?.content;
+  if (typeof content === "string") {
+    return content;
+  }
+  if (!Array.isArray(content)) {
+    return null;
+  }
+  const textBlock = content.find((block) => typeof block?.text === "string");
+  return textBlock?.text ?? null;
+};
+
+/**
+ * Bundles a prompt and the conversation it produced into the record that is
+ * saved per prompt and later fed to the report.
+ *
+ * @param {object} prompt - The prompt that was run (stored as-is).
+ * @param {Array<object>} messages - Full conversation, stored as `raw_messages`.
+ * @param {string} version - MCP server version, stored as `ops_level_mcp_version`.
+ * @param {object} model_config - Model settings used for the run.
+ * @returns {object} Record with `prompt`, `response`, `raw_messages`,
+ *   `run_at` (ISO-8601 timestamp taken when this function runs),
+ *   `ops_level_mcp_version` and `model_config`.
+ */
+const packageResult = (prompt, messages, version, model_config) => {
+  // maybe we can extract tool calls and results
+  return {
+    prompt,
+    response: extractResponseText(messages),
+    raw_messages: messages,
+    run_at: new Date().toISOString(),
+    ops_level_mcp_version: version,
+    model_config,
+  };
+};
+
+/**
+ * Entry point of the eval CLI.
+ *
+ * Parses the command line, connects to the MCP server, runs every selected
+ * prompt in sequence, writes one JSON result per prompt into a fresh results
+ * folder, and finally renders and saves a report over all results.
+ *
+ * @returns {Promise<void>} Resolves once every result and the report are saved.
+ */
+async function main() {
+  console.log("Starting MCP eval client");
+
+  program.parse(process.argv);
+  const {
+    anthropicApiKey,
+    anthropicModel,
+    mcpServerPath,
+    opslevelApiToken,
+    opslevelAppUrl,
+    debug,
+    slugs,
+  } = program.opts();
+
+  const mcpClient = new MCPClient(anthropicApiKey, anthropicModel, debug);
+  const results = [];
+
+  // Folder name is the UTC start time formatted as yyyy-mm-dd_hh-mm,
+  // e.g. results/2025-04-21_17-57.
+  const timestamp = new Date()
+    .toISOString()
+    .slice(0, 16)
+    .replaceAll(":", "-")
+    .replace("T", "_");
+  const folder = `results/${timestamp}`;
+  await mkdir(folder, { recursive: true });
+  console.log("Saving results to folder:", folder);
+
+  try {
+    await mcpClient.connectToServer(mcpServerPath, [], {
+      OPSLEVEL_API_TOKEN: opslevelApiToken,
+      OPSLEVEL_APP_URL: opslevelAppUrl,
+    });
+
+    // Resolve the server info first and only then read `.version`: member
+    // access binds tighter than `await`, so `await f()["version"]` indexes the
+    // un-awaited return value and throws when that value is undefined.
+    const serverInfo = await mcpClient.mcp.getServerVersion();
+    const version = serverInfo?.version ?? "unknown";
+
+    // Filter prompts based on slugs if provided
+    const filteredPrompts = slugs
+      ? PROMPTS.filter((prompt) => slugs.includes(prompt.slug))
+      : PROMPTS;
+
+    for (const prompt of filteredPrompts) {
+      console.log("Prompt: ", prompt.query);
+      const messages = await mcpClient.processQuery(prompt.query);
+      const result = packageResult(
+        prompt,
+        messages,
+        version,
+        mcpClient.modelConfig(),
+      );
+      // Await the write so a failure surfaces here rather than as an
+      // unhandled rejection after the loop has moved on.
+      await saveResult(folder, prompt.slug, result);
+      results.push(result);
+    }
+  } finally {
+    // Always release the MCP connection, even when a prompt fails.
+    await mcpClient.cleanup();
+  }
+
+  const report = renderReport(results);
+  // NOTE(review): saveReport is defined in report.js; if it is synchronous
+  // this `await` is a harmless no-op, otherwise it keeps write errors inside main().
+  await saveReport(folder, report);
+}
+
+// Report a failed run explicitly and signal it through the exit code.
+main().catch((error) => {
+  console.error(error);
+  process.exitCode = 1;
+});
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# Ignore artifacts:`
	`2`	`+build`
	`3`	`+coverage`