chore: add example accuracy test

kmruiz · kmruiz · commit cda8caabada2 · 2025-06-18T11:22:45.000+02:00
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -29,7 +29,10 @@
     "check:types": "tsc --noEmit --project tsconfig.json",
     "reformat": "prettier --write .",
     "generate": "./scripts/generate.sh",
-    "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage"
+    "test": "npm run test:unit && npm run test:integration",
+    "test:accuracy": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage --testPathPattern=tests/accuracy",
+    "test:unit": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage --testPathPattern=tests/unit",
+    "test:integration": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage --testPathPattern=tests/integration"
   },
   "license": "Apache-2.0",
   "devDependencies": {
@@ -57,7 +60,8 @@
     "tsx": "^4.19.3",
     "typescript": "^5.8.2",
     "typescript-eslint": "^8.29.1",
-    "yaml": "^2.7.1"
+    "yaml": "^2.7.1",
+    "zod-to-json-schema": "^3.24.5"
   },
   "dependencies": {
     "@modelcontextprotocol/sdk": "^1.11.2",
diff --git a/tests/accuracy/1-step/simple-find-query.test.ts b/tests/accuracy/1-step/simple-find-query.test.ts
@@ -0,0 +1,25 @@
+import { describeAccuracyTest } from "../test-sdk.js";
+
+// Single-step scenarios: each prompt is expected to lead the model to a `find` tool call whose
+// arguments equal the object passed to `verifyCalled`.
+describeAccuracyTest("1 step find queries", ({ prompt }) => {
+    // No filter is mentioned, so only the namespace and a `limit` of 10 are expected.
+    prompt("find all users in database 'my' and collection 'users'", (tool) => {
+        tool("find").verifyCalled({ database: "my", collection: "users", limit: 10 });
+    });
+
+    // The wording matches the 'cars' collection that is being queried.
+    prompt("find all red cars in database 'production' and collection 'cars'", (tool) => {
+        tool("find").verifyCalled({ filter: { color: "red" }, database: "production", collection: "cars", limit: 10 });
+    });
+
+    // An explicit count in the prompt is expected to show up as `limit`.
+    prompt("get 100 books in database 'prod' and collection 'books' where the author is J.R.R Tolkien", (tool) => {
+        tool("find").verifyCalled({
+            filter: { author: "J.R.R Tolkien" },
+            database: "prod",
+            collection: "books",
+            limit: 100,
+        });
+    });
+});
diff --git a/tests/accuracy/models/gemini.ts b/tests/accuracy/models/gemini.ts
@@ -0,0 +1,97 @@
+import { ModelFacade, ToolCall, ToolDefinition } from "./model.js";
+
+type GeminiModel = "gemini-2.0-flash" | "gemini-1.5-flash";
+
+/** Subset of the `generateContent` response body that this facade reads. */
+type GeminiResponse = {
+    candidates?: Array<{
+        content?: {
+            parts?: Array<{
+                text?: string;
+                functionCall?: {
+                    name: string;
+                    args?: Record<string, unknown>;
+                };
+            }>;
+        };
+    }>;
+};
+
+/**
+ * ModelFacade that talks to the Gemini `generateContent` REST endpoint, authenticating with the
+ * key found in the MONGODB_MCP_TEST_GEMINI_API_KEY environment variable.
+ */
+export class GeminiModelFacade implements ModelFacade {
+    readonly name: GeminiModel;
+
+    constructor(modelName: GeminiModel) {
+        this.name = modelName;
+    }
+
+    /** Returns true when MONGODB_MCP_TEST_GEMINI_API_KEY is set. */
+    available(): boolean {
+        return process.env.MONGODB_MCP_TEST_GEMINI_API_KEY !== undefined;
+    }
+
+    /**
+     * Sends `prompt` as a single user turn, together with the declarations of `tools`.
+     *
+     * @param prompt - Text of the user message.
+     * @param tools - Tools the model may ask to call.
+     * @returns Every function call found in the first candidate; when there is none, the first
+     *   text part. HTTP errors and fetch failures are logged; they, and empty responses, are
+     *   reported through `text` with an empty `toolCall`.
+     */
+    async generateContent(prompt: string, tools: ToolDefinition[]): Promise<{ toolCall: ToolCall[]; text?: string }> {
+        const toolDefinitions = tools.map((tool) => ({
+            name: tool.name,
+            description: tool.description,
+            parameters: tool.parameters || {},
+        }));
+
+        const chatHistory = [{ role: "user", parts: [{ text: prompt }] }];
+        const payload = {
+            contents: chatHistory,
+            // `tools` is a list of Tool objects, each carrying a flat list of function declarations.
+            tools: [{ function_declarations: toolDefinitions }],
+        };
+
+        const apiKey = process.env.MONGODB_MCP_TEST_GEMINI_API_KEY;
+        const apiUrl = `https://generativelanguage.googleapis.com/v1beta/models/${this.name}:generateContent?key=${apiKey}`;
+
+        try {
+            const response = await fetch(apiUrl, {
+                method: "POST",
+                headers: { "Content-Type": "application/json" },
+                body: JSON.stringify(payload),
+            });
+
+            if (!response.ok) {
+                const errorData = await response.text();
+                console.error(`[Gemini API Error] HTTP error! status: ${response.status}, data: ${errorData}`);
+                return { toolCall: [], text: `Gemini API error: ${response.status}` };
+            }
+
+            const result = (await response.json()) as GeminiResponse;
+
+            // A candidate may come back without content or with several parts (for example text
+            // followed by a function call), so scan all parts instead of reading only parts[0].
+            const parts = result.candidates?.[0]?.content?.parts ?? [];
+            const toolCall: ToolCall[] = parts.flatMap((part) =>
+                part.functionCall ? [{ name: part.functionCall.name, args: part.functionCall.args ?? {} }] : []
+            );
+            if (toolCall.length > 0) {
+                return { toolCall };
+            }
+
+            const text = parts.find((part) => part.text)?.text;
+            if (text) {
+                return { toolCall: [], text };
+            }
+            return { toolCall: [], text: "Gemini response was empty or unexpected." };
+        } catch (error: unknown) {
+            console.error("[Gemini API Fetch Error]", error);
+            return { toolCall: [], text: `Error contacting Gemini LLM.` };
+        }
+    }
+}
diff --git a/tests/accuracy/models/index.ts b/tests/accuracy/models/index.ts
@@ -0,0 +1,19 @@
+import { ModelFacade } from "./model.js";
+import { GeminiModelFacade } from "./gemini.js";
+
+// Every model the accuracy suite knows about, whether or not it can currently be used.
+const ALL_MODELS: ModelFacade[] = [
+    new GeminiModelFacade("gemini-2.0-flash"),
+    new GeminiModelFacade("gemini-1.5-flash"),
+];
+
+/** Returns the entries of ALL_MODELS whose `available()` check passes, in declaration order. */
+export function availableModels(): ModelFacade[] {
+    const usable: ModelFacade[] = [];
+    for (const candidate of ALL_MODELS) {
+        if (candidate.available()) {
+            usable.push(candidate);
+        }
+    }
+    return usable;
+}
diff --git a/tests/accuracy/models/model.ts b/tests/accuracy/models/model.ts
@@ -0,0 +1,12 @@
+export type ToolCall = { name: string; args: Record<string, unknown> }; // one tool invocation requested by a model: tool name plus its arguments
+export type ToolDefinition = { // description of a tool as it is handed to a model
+    name: string;
+    description: string;
+    parameters: Record<string, unknown>; // the accuracy test-sdk fills this with a JSON Schema derived from the tool's zod input schema
+};
+
+export interface ModelFacade { // contract implemented by each model under test
+    name: string; // used as the prefix of the describe() title in the accuracy test-sdk
+    available(): boolean; // models returning false are filtered out by availableModels()
+    generateContent(prompt: string, tools: ToolDefinition[]): Promise<{ toolCall: ToolCall[]; text?: string }>; // toolCall lists the requested tool invocations; text carries a plain-text reply or an error message
+}
diff --git a/tests/accuracy/test-sdk.ts b/tests/accuracy/test-sdk.ts
@@ -0,0 +1,242 @@
+import { Client } from "@modelcontextprotocol/sdk/client/index.js";
+import { McpServer, RegisteredTool, ToolCallback } from "@modelcontextprotocol/sdk/server/mcp.js";
+import { InMemoryTransport } from "@modelcontextprotocol/sdk/inMemory.js";
+import { Server } from "../../src/server.js";
+import { Session } from "../../src/session.js";
+import { Telemetry } from "../../src/telemetry/telemetry.js";
+import { config, UserConfig } from "../../src/config.js";
+import { availableModels } from "./models/index.js";
+import { ToolDefinition } from "./models/model.js";
+import { zodToJsonSchema } from "zod-to-json-schema";
+
+// describe/it/beforeEach/afterEach/expect below are the Jest globals. `afterEach` must not be imported
+// from "node:test": that would register the hook with Node's own test runner, and Jest would not run
+// the per-test cleanup.
+
+/** Holds the expectation for one MCP tool and records how the tool was actually invoked. */
+class ToolMock {
+    readonly name: string;
+    /** Arguments the tool is expected to receive; set through verifyCalled. */
+    arguments: unknown;
+    /** Value the mocked tool callback returns; set through thenReturn. */
+    returns: unknown;
+    /** Arguments of the most recent invocation; undefined until the tool has run. */
+    wasCalledWith: unknown;
+    /** True once verifyCalled has been used, i.e. the test requires an invocation. */
+    private expectsCall = false;
+    /** True once the mocked tool callback has run. */
+    private called = false;
+
+    constructor(name: string) {
+        this.name = name;
+        this.arguments = {};
+        this.returns = {};
+    }
+
+    /** Declares that the tool must be called and that it must receive arguments equal to `args`. */
+    verifyCalled(args: unknown): this {
+        this.arguments = args;
+        this.expectsCall = true;
+        return this;
+    }
+
+    /** Sets the value the mocked tool hands back when it is called. */
+    thenReturn(value: unknown): this {
+        this.returns = value;
+        return this;
+    }
+
+    /** Records an invocation; used by the callback installed on the registered tool. */
+    _wasCalledWith(args: unknown): this {
+        this.wasCalledWith = args;
+        this.called = true;
+        return this;
+    }
+
+    /**
+     * Asserts the recorded invocation against the expectation.
+     *
+     * @throws Error when verifyCalled was used but the tool never ran, so that a model which skips
+     *   the tool fails the test instead of passing vacuously.
+     */
+    _verify(): void {
+        if (this.called) {
+            expect(this.wasCalledWith).toEqual(this.arguments);
+        } else if (this.expectsCall) {
+            throw new Error(`Expected tool "${this.name}" to be called, but it never was.`);
+        }
+    }
+}
+
+interface McpServerUnsafe {
+    mcpServer: McpServer;
+}
+
+type RegisteredTools = { [toolName: string]: RegisteredTool };
+type AccuracyToolSetupFunction = (toolName: string) => ToolMock;
+type AccuracyTestCaseFn = (tools: AccuracyToolSetupFunction) => void;
+type AccuracyItFn = (prompt: string, testCase: AccuracyTestCaseFn) => void;
+type AccuracyTestSuite = { prompt: AccuracyItFn };
+
+/** Reads the tool registry out of the McpServer wrapped by `server` (not part of its public API). */
+function registeredToolsOf(server: Server): RegisteredTools {
+    const mcpServerUnsafe = (server as unknown as McpServerUnsafe).mcpServer;
+    return mcpServerUnsafe["_registeredTools"] as RegisteredTools;
+}
+
+/**
+ * Builds the definition sent to the model for one registered tool, converting its zod input
+ * schema to JSON Schema with `$schema`, `const` and `additionalProperties` blanked out.
+ *
+ * @throws Error when the tool has no input schema.
+ */
+function toToolDefinition(toolName: string, tool: RegisteredTool): ToolDefinition {
+    if (!tool.inputSchema) {
+        throw new Error(`Tool ${toolName} does not have an input schema defined.`);
+    }
+
+    const toolForApi: ToolDefinition = {
+        name: toolName,
+        description: tool.description ?? "",
+        parameters: zodToJsonSchema(tool.inputSchema, {
+            target: "jsonSchema7",
+            allowedAdditionalProperties: undefined,
+            rejectedAdditionalProperties: undefined,
+            postProcess: (schema) => {
+                if (schema && typeof schema === "object") {
+                    return {
+                        ...schema,
+                        $schema: undefined,
+                        const: undefined,
+                        additionalProperties: undefined,
+                    };
+                }
+                return schema;
+            },
+        }),
+    };
+    delete toolForApi.parameters.$schema;
+    return toolForApi;
+}
+
+/**
+ * Defines one Jest `describe` block per available model and hands `testCaseFn` a `prompt`
+ * function that registers a test: the prompt is sent to the model, requested tool calls are
+ * executed against an in-memory MCP server, and every ToolMock created by the test case is
+ * verified at the end.
+ *
+ * @param useCase - Label appended to the model name in the `describe` title.
+ * @param testCaseFn - Callback that declares the prompts of the suite.
+ * @throws Error when no model is available.
+ */
+export function describeAccuracyTest(useCase: string, testCaseFn: (testSuite: AccuracyTestSuite) => void) {
+    const models = availableModels();
+    if (models.length === 0) {
+        throw new Error("No models available for accuracy tests.");
+    }
+
+    models.forEach((model) => {
+        describe(`${model.name}: ${useCase}`, () => {
+            let mcpServer: Server;
+            let mcpClient: Client;
+            let userConfig: UserConfig;
+            let session: Session;
+            let telemetry: Telemetry;
+
+            beforeEach(async () => {
+                mcpClient = new Client(
+                    {
+                        name: "test-client",
+                        version: "1.2.3",
+                    },
+                    {
+                        capabilities: {},
+                    }
+                );
+
+                userConfig = { ...config };
+                session = new Session(userConfig);
+                telemetry = Telemetry.create(session, userConfig);
+
+                mcpServer = new Server({
+                    session,
+                    userConfig,
+                    telemetry,
+                    mcpServer: new McpServer({
+                        name: "test-server",
+                        version: "5.2.3",
+                    }),
+                });
+
+                const [clientTransport, serverTransport] = InMemoryTransport.createLinkedPair();
+
+                await Promise.all([mcpServer.connect(serverTransport), mcpClient.connect(clientTransport)]);
+            });
+
+            afterEach(async () => {
+                await Promise.all([mcpServer.close(), mcpClient.close()]);
+            });
+
+            const promptFn: AccuracyItFn = (prompt: string, testCase: AccuracyTestCaseFn) => {
+                it(prompt, async () => {
+                    const toolDefinitions = Object.entries(registeredToolsOf(mcpServer)).map(([toolName, tool]) =>
+                        toToolDefinition(toolName, tool)
+                    );
+
+                    const mocks: Array<ToolMock> = [];
+                    const toolFn: AccuracyToolSetupFunction = (toolName: string) => {
+                        const mock = new ToolMock(toolName);
+
+                        // Swap the real implementation for a callback that only records its arguments.
+                        const registeredTool = registeredToolsOf(mcpServer)[toolName];
+                        if (registeredTool !== undefined) {
+                            registeredTool.callback = ((args: unknown) => {
+                                mock._wasCalledWith(args);
+                                return mock.returns;
+                            }) as unknown as ToolCallback;
+                        }
+
+                        mocks.push(mock);
+                        return mock;
+                    };
+
+                    testCase(toolFn);
+
+                    // Feeds the model, runs the tool calls it asks for, and repeats with the textual
+                    // tool output as the next prompt until no call is made or no text comes back.
+                    const consumePromptUntilNoMoreCall = async (prompt: string[]): Promise<void> => {
+                        const promptStr = prompt.join("\n");
+                        const response = await model.generateContent(promptStr, toolDefinitions);
+
+                        if (response.toolCall.length > 0) {
+                            const toolCallResults = await Promise.all(
+                                response.toolCall.map((tc) =>
+                                    mcpClient.callTool({
+                                        name: tc.name,
+                                        arguments: tc.args,
+                                    })
+                                )
+                            );
+                            // A mocked tool may return a result without `content`, and content items
+                            // are not guaranteed to carry text, so only keep the string payloads.
+                            const newPrompt = toolCallResults.flatMap((result) =>
+                                ((result.content as Array<{ text?: unknown }> | undefined) ?? []).flatMap((c) =>
+                                    typeof c.text === "string" ? [c.text] : []
+                                )
+                            );
+
+                            if (newPrompt.join("\n").trim().length > 0) {
+                                return consumePromptUntilNoMoreCall(newPrompt);
+                            }
+                        }
+                    };
+
+                    await consumePromptUntilNoMoreCall([prompt]);
+                    mocks.forEach((mock) => mock._verify());
+                });
+            };
+
+            testCaseFn({ prompt: promptFn });
+        });
+    });
+}