chore: add plans for multistep actions

kmruiz · kmruiz · commit bbe2b838bfa7 · 2025-06-20T14:20:13.000+02:00
diff --git a/tests/accuracy/1-step/simple-delete-query.test.ts b/tests/accuracy/1-step/simple-delete-query.test.ts
@@ -0,0 +1,11 @@
+import { describeAccuracyTest } from "../test-sdk.js";
+
+// Accuracy scenario: one natural-language delete request, checked against the delete-many mock.
+describeAccuracyTest("1 step delete queries", (suite) => {
+    // Arguments the delete-many call is verified against.
+    const expectedArgs = { database: "my", collection: "users", filter: { disabled: true } };
+
+    suite.prompt("delete all disabled users (disabled = true) in database 'my' and collection 'users'", (tool) => {
+        tool("delete-many").verifyCalled(expectedArgs);
+    });
+});
diff --git a/tests/accuracy/1-step/simple-find-query.test.ts b/tests/accuracy/1-step/simple-find-query.test.ts
@@ -5,7 +5,7 @@ describeAccuracyTest("1 step find queries", ({ prompt }) => {
         tool("find").verifyCalled({ database: "my", collection: "users", limit: 10 });
     });
 
-    prompt("find all red cards in database 'production' and collection 'cars'", (tool) => {
+    prompt("find all red cars in database 'production' and collection 'cars'", (tool) => {
         tool("find").verifyCalled({ filter: { color: "red" }, database: "production", collection: "cars", limit: 10 });
     });
 
diff --git a/tests/accuracy/1-step/simple-update-query.test.ts b/tests/accuracy/1-step/simple-update-query.test.ts
@@ -0,0 +1,12 @@
+import { describeAccuracyTest } from "../test-sdk.js";
+
+// Accuracy scenario: one natural-language update request, checked against the update-many mock.
+describeAccuracyTest("1 step update queries", (suite) => {
+    // Arguments the update-many call is verified against.
+    const filter = { email: "" };
+    const update = { $set: { disabled: true } };
+
+    suite.prompt("set all users with an empty email to disabled in database 'my' and collection 'users'", (tool) => {
+        tool("update-many").verifyCalled({ database: "my", collection: "users", filter, update });
+    });
+});
diff --git a/tests/accuracy/2-step/create-collection-with-sample-data.test.ts b/tests/accuracy/2-step/create-collection-with-sample-data.test.ts
@@ -0,0 +1,32 @@
+import { describeAccuracyTest } from "../test-sdk.js";
+
+// Accuracy scenario: a single prompt expected to trigger both create-collection and insert-many.
+describeAccuracyTest("2 step create collection", (suite) => {
+    // Namespace shared by both expected calls, and the sample document listed in the prompt.
+    const target = {
+        database: "my",
+        collection: "users",
+    };
+
+    const sampleUser = {
+        username: "john_doe",
+        email: "test@mongodb.com",
+        password: "password123",
+        disabled: false,
+    };
+
+    // The prompt text includes the template literal's leading newline and indentation.
+    const request = `
+        create a new collection named 'users' in database 'my' and afterwards create a sample document with the following data:
+        - username: "john_doe"
+        - email: test@mongodb.com
+        - password: "password123"
+        - disabled: false
+    `;
+
+    suite.prompt(request, (tool) => {
+        tool("create-collection").verifyCalled(target);
+
+        tool("insert-many").verifyCalled({ ...target, documents: [sampleUser] });
+    });
+});
diff --git a/tests/accuracy/models/gemini.ts b/tests/accuracy/models/gemini.ts
@@ -13,14 +13,79 @@ export class GeminiModelFacade implements ModelFacade {
         return process.env.MONGODB_MCP_TEST_GEMINI_API_KEY !== undefined;
     }
 
-    async generateContent(prompt: string, tools: ToolDefinition[]): Promise<{ toolCall: ToolCall[]; text?: string }> {
+    /**
+     * Asks Gemini to split `prompt` into ordered steps and returns them as strings.
+     * Resolves to [] (after logging) on HTTP errors, fetch failures, or an unusable reply.
+     */
+    async generatePlan(prompt: string, tools: ToolDefinition[]): Promise<string[]> {
+        const planPrompt = `You are an expert MongoDB developer. Create a plan for the following task: \n ${prompt} \n Return the plan as a list of steps, as a JSON array. For example: [ "Step 1: ...", "Step 2: ...", "Step 3: ..." ]. Only return the JSON array, nothing else. Do not include any wrapper markdown or anything, just the plain JSON array.`;
+        const chatHistory = [{ role: "user", parts: [{ text: planPrompt }] }];
+
+        const apiKey = process.env.MONGODB_MCP_TEST_GEMINI_API_KEY;
+        const apiUrl = `https://generativelanguage.googleapis.com/v1beta/models/${this.name}:generateContent?key=${apiKey}`;
+
+        const toolDefinitions = tools.map((tool) => ({
+            name: tool.name,
+            description: tool.description,
+            parameters: tool.parameters || {},
+        }));
+
+        const payload = {
+            contents: chatHistory,
+            tools: {
+                function_declarations: [toolDefinitions],
+            },
+        };
+
+        try {
+            const response = await fetch(apiUrl, {
+                method: "POST",
+                headers: { "Content-Type": "application/json" },
+                body: JSON.stringify(payload),
+            });
+
+            if (!response.ok) {
+                const errorData = await response.text();
+                console.error(`[Gemini API Error] HTTP error! status: ${response.status}, data: ${errorData}`);
+                return [];
+            }
+
+            // The body is only cast, never validated, so every level is treated as optional.
+            const result = (await response.json()) as {
+                candidates?: Array<{ content?: { parts?: Array<{ text?: string }> } }>;
+            };
+
+            const responseString = (result.candidates ?? [])
+                .flatMap((candidate) => (candidate.content?.parts ?? []).map((part) => part.text ?? ""))
+                .join("")
+                .replace("```json", "")
+                .replace("```", "");
+
+            try {
+                const parsed: unknown = JSON.parse(responseString);
+                // The declared result is string[], so any other JSON value counts as a failed plan.
+                if (Array.isArray(parsed) && parsed.every((step): step is string => typeof step === "string")) {
+                    return parsed;
+                }
+                console.error("[Gemini API Plan Error] Expected a JSON array of strings", responseString);
+            } catch (parseError) {
+                console.error("[Gemini API JSON.parse Error]", responseString, parseError);
+            }
+            return [];
+        } catch (error: unknown) {
+            console.error("[Gemini API Fetch Error]", error);
+            return [];
+        }
+    }
+
+    async generateContent(parts: string[], tools: ToolDefinition[]): Promise<{ toolCall: ToolCall[]; text?: string }> {
         const toolDefinitions = tools.map((tool) => ({
             name: tool.name,
             description: tool.description,
             parameters: tool.parameters || {},
         }));
 
-        const chatHistory = [{ role: "user", parts: [{ text: prompt }] }];
+        const chatHistory = [{ role: "user", parts: parts.map((part) => ({ text: part })) }];
         const payload = {
             contents: chatHistory,
             tools: {
diff --git a/tests/accuracy/models/model.ts b/tests/accuracy/models/model.ts
@@ -8,5 +8,7 @@ export type ToolDefinition = {
 export interface ModelFacade {
     name: string;
     available(): boolean;
-    generateContent(prompt: string, tools: ToolDefinition[]): Promise<{ toolCall: ToolCall[]; text?: string }>;
+    // generatePlan resolves to plan steps for a prompt; generateContent takes the text parts of one request.
+    generatePlan(prompt: string, tools: ToolDefinition[]): Promise<string[]>;
+    generateContent(parts: string[], tools: ToolDefinition[]): Promise<{ toolCall: ToolCall[]; text?: string }>;
 }
diff --git a/tests/accuracy/test-sdk.ts b/tests/accuracy/test-sdk.ts
@@ -10,6 +10,7 @@ import { availableModels } from "./models/index.js";
 import { ToolDefinition } from "./models/model.js";
 import { zodToJsonSchema } from "zod-to-json-schema";
 
+type ToolMockReturn = { content: Array<{ type: string; text: string }> }; // value accepted by ToolMock.thenReturn
 class ToolMock {
     readonly name: string;
     arguments: unknown;
@@ -27,7 +28,7 @@ class ToolMock {
         return this;
     }
 
-    thenReturn(value: unknown): this {
+    thenReturn(value: ToolMockReturn): this {
         this.returns = value;
         return this;
     }
@@ -55,6 +56,36 @@ type AccuracyTestCaseFn = (tools: AccuracyToolSetupFunction) => void;
 type AccuracyItFn = (prompt: string, testCase: AccuracyTestCaseFn) => void;
 type AccuracyTestSuite = { prompt: AccuracyItFn };
 
+type NonMockedCallError = { tool: string; args: unknown }; // a tool call from the model that had no mock registered
+
+/** Prints `args` via console.log only when MONGODB_MCP_TEST_VERBOSE is exactly "true". */
+function logVerbose(...args: unknown[]): void {
+    if (process.env.MONGODB_MCP_TEST_VERBOSE !== "true") return;
+    console.log(...args);
+}
+
+function printModelPlanIfVerbose(model: string, plan: string[]): void {
+    logVerbose(model, "📝: ", plan.join("\n")); // verbose-only: plan steps, one per line
+}
+
+function testPromptIsVerbose(model: string, prompt: string): void {
+    logVerbose(model, "📜: ", prompt); // verbose-only: the prompt under test
+}
+
+/** Verbose-only: prints the model's text reply; an empty `response` prints nothing. */
+function modelSaidVerbose(model: string, response: string): void {
+    if (response.length === 0) return;
+    logVerbose(model, "🗣️: ", response);
+}
+
+function modelToolCalledVerbose(model: string, toolCall: string, args: unknown): void {
+    logVerbose(model, "🛠️: ", toolCall, JSON.stringify(args)); // verbose-only: tool name plus JSON-encoded args
+}
+
+function toolCallsReturnedVerbose(model: string, answer: string): void {
+    logVerbose(model, "📋: ", answer); // verbose-only: the text passed in as `answer`
+}
+
 export function describeAccuracyTest(useCase: string, testCaseFn: (testSuite: AccuracyTestSuite) => void) {
     const models = availableModels();
     if (models.length === 0) {
@@ -105,8 +136,13 @@ export function describeAccuracyTest(useCase: string, testCaseFn: (testSuite: Ac
 
             const promptFn: AccuracyItFn = (prompt: string, testCase: AccuracyTestCaseFn) => {
                 it(prompt, async () => {
+                    testPromptIsVerbose(model.name, prompt);
+
                     const mcpServerUnsafe = (mcpServer as unknown as McpServerUnsafe).mcpServer;
                     const tools = mcpServerUnsafe["_registeredTools"] as { [toolName: string]: RegisteredTool };
+                    const mockedTools = new Set<string>();
+                    const nonMockedCallErrors = new Array<NonMockedCallError>();
+
                     const toolDefinitions = Object.entries(tools).map(([toolName, tool]) => {
                         if (!tool.inputSchema) {
                             throw new Error(`Tool ${toolName} does not have an input schema defined.`);
@@ -136,17 +172,22 @@ export function describeAccuracyTest(useCase: string, testCaseFn: (testSuite: Ac
                         return toolForApi;
                     });
 
-                    const mocks: Array<ToolMock> = [];
+                    // Ask the model for a plan up front; its steps are replayed one by one further down.
+                    const plan = await model.generatePlan(prompt, toolDefinitions);
+                    printModelPlanIfVerbose(model.name, plan);
+
+                    const mocks: Array<ToolMock> = [];
                     const toolFn: AccuracyToolSetupFunction = (toolName: string) => {
                         const mock = new ToolMock(toolName);
+                        mockedTools.add(toolName);
 
                         const mcpServerUnsafe = (mcpServer as unknown as McpServerUnsafe).mcpServer;
                         const tools = mcpServerUnsafe["_registeredTools"] as { [toolName: string]: RegisteredTool };
 
                         if (tools[toolName] !== undefined) {
                             tools[toolName].callback = ((args: unknown) => {
                                 mock._wasCalledWith(args);
-                                return mock.returns;
+                                return Promise.resolve(mock.returns);
                             }) as unknown as ToolCallback;
                         }
 
@@ -157,30 +198,55 @@ export function describeAccuracyTest(useCase: string, testCaseFn: (testSuite: Ac
                     testCase(toolFn);
 
                     const consumePromptUntilNoMoreCall = async (prompt: string[]) => {
-                        const promptStr = prompt.join("\n");
-                        const response = await model.generateContent(promptStr, toolDefinitions);
+                        const response = await model.generateContent(prompt, toolDefinitions);
 
+                        modelSaidVerbose(model.name, response.text || "<no text>");
                         if (response.toolCall.length > 0) {
                             const toolCallResults = await Promise.all(
-                                response.toolCall.map((tc) =>
-                                    mcpClient.callTool({
+                                response.toolCall.map((tc) => {
+                                    modelToolCalledVerbose(model.name, tc.name, tc.args);
+                                    // Remember calls to tools the test never mocked; they are reported after the run.
+                                    if (!mockedTools.has(tc.name)) {
+                                        nonMockedCallErrors.push({ tool: tc.name, args: tc.args });
+                                    }
+
+                                    return mcpClient.callTool({
                                         name: tc.name,
                                         arguments: tc.args,
-                                    })
-                                )
+                                    });
+                                })
                             );
-                            const newPrompt = toolCallResults.flatMap((result) =>
+                            // Collect the text of every content item returned by the tool calls.
+                            const responseParts = toolCallResults.flatMap((result) =>
                                 (result.content as Array<{ text: string }>).map((c) => c.text)
                             );
 
-                            if (newPrompt.join("\n").trim().length > 0) {
+                            const newPrompt = prompt.concat(responseParts);
+                            toolCallsReturnedVerbose(model.name, newPrompt.join("\n"));
+                            // Recurse with the accumulated prompt while the tool calls returned content.
+                            if (responseParts.length > 0) {
                                 return consumePromptUntilNoMoreCall(newPrompt);
                             }
                         }
                     };
 
+                    // NOTE(review): the full prompt still runs after these steps — confirm that is intended.
+                    for (const step of plan) {
+                        await consumePromptUntilNoMoreCall([step]);
+                    }
                     await consumePromptUntilNoMoreCall([prompt]);
+
                     mocks.forEach((mock) => mock._verify());
+                    if (nonMockedCallErrors.length > 0) {
+                        for (const call of nonMockedCallErrors) {
+                            console.error(
+                                `Non-mocked tool call detected: ${call.tool} with args:`,
+                                JSON.stringify(call.args, null, 2)
+                            );
+                        }
+
+                        throw new Error("Non-mocked tool calls detected. Check the console for details.");
+                    }
                 });
             };