chore: simplified toolCallingAccuracy calculation

himanshusinghs · himanshusinghs · commit 7cd61aa6b81e · 2025-07-08T23:39:12.000+02:00
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -56,6 +56,7 @@
     "jest-environment-node": "^29.7.0",
     "jest-extended": "^6.0.0",
     "json-schema": "^0.4.0",
+    "microdiff": "^1.5.0",
     "mongodb-runner": "^5.8.2",
     "ollama-ai-provider": "^1.2.0",
     "openapi-types": "^12.1.3",
diff --git a/tests/accuracy/sdk/accuracy-scorers.ts b/tests/accuracy/sdk/accuracy-scorers.ts
@@ -1,133 +1,60 @@
-export type ToolCall = {
-    toolCallId: string;
-    toolName: string;
-    parameters: unknown;
-};
-export type ExpectedToolCall = Omit<ToolCall, "toolCallId">;
+import diff from "microdiff";
+import { ExpectedToolCall, ActualToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js";
 
-export function toolCallingAccuracyScorer(expectedToolCalls: ExpectedToolCall[], actualToolCalls: ToolCall[]): number {
-    if (actualToolCalls.length < expectedToolCalls.length) {
-        return 0;
-    }
-
-    const possibleScore = actualToolCalls.length > expectedToolCalls.length ? 0.75 : 1;
-    const checkedToolCallIds = new Set<string>();
-    for (const expectedToolCall of expectedToolCalls) {
-        const matchingActualToolCall = actualToolCalls.find(
-            (actualToolCall) =>
-                actualToolCall.toolName === expectedToolCall.toolName &&
-                !checkedToolCallIds.has(actualToolCall.toolCallId)
-        );
-
-        if (!matchingActualToolCall) {
-            return 0;
-        }
-
-        checkedToolCallIds.add(matchingActualToolCall.toolCallId);
-    }
-
-    return possibleScore;
-}
-
-export function parameterMatchingAccuracyScorer(
+/**
+ * Returns the lowest per-expected-call score (1 exact, 0.75 extra params, 0 unmatched); at most 0.75 on surplus calls.
+ */
+export function calculateToolCallingAccuracy(
     expectedToolCalls: ExpectedToolCall[],
-    actualToolCalls: ToolCall[]
+    actualToolCalls: ActualToolCall[]
 ): number {
     if (expectedToolCalls.length === 0) {
-        return 1;
+        return actualToolCalls.length === 0 ? 1 : 0.75;
     }
 
-    const usedActualIndexes = new Set<number>();
-    const scores: number[] = [];
+    const maxAccuracy = actualToolCalls.length > expectedToolCalls.length ? 0.75 : 1;
+    const individualAccuracies: number[] = [];
+    const checkedActualToolCallIndexes = new Set<number>();
 
     for (const expectedCall of expectedToolCalls) {
-        // Find all unmatched actual tool calls with the same tool name
         const candidates = actualToolCalls
             .map((call, index) => ({ call, index }))
-            .filter(({ call, index }) => !usedActualIndexes.has(index) && call.toolName === expectedCall.toolName);
-
-        if (candidates.length === 0) {
-            scores.push(0);
-            continue;
-        }
-
-        // Pick the candidate with the best parameter match
-        let bestScore = -1;
-        let bestIndex = -1;
-        for (const { call, index } of candidates) {
-            const score = compareParams(expectedCall.parameters, call.parameters);
-            if (score > bestScore) {
-                bestScore = score;
-                bestIndex = index;
-            }
-        }
-
-        usedActualIndexes.add(bestIndex);
-        scores.push(bestScore);
-    }
-
-    const totalScore = scores.reduce((sum, score) => sum + score, 0);
-    return totalScore / scores.length;
+            .filter(
+                ({ call, index }) => !checkedActualToolCallIndexes.has(index) && call.toolName === expectedCall.toolName
+            )
+            .map(({ call, index }) => ({
+                call,
+                index,
+                score: compareParams(expectedCall.parameters, call.parameters),
+            }))
+            .filter(({ score }) => score >= 0.75)
+            .sort((a, b) => b.score - a.score);
+        const bestMatch = candidates[0];
+        if (!bestMatch) {
+            individualAccuracies.push(0);
+        } else {
+            checkedActualToolCallIndexes.add(bestMatch.index);
+            const individualAccuracy = Math.min(bestMatch.score, maxAccuracy);
+            individualAccuracies.push(individualAccuracy);
+        }
+    }
+    return Math.min(...individualAccuracies);
 }
 
-/**
- * Recursively compares expected and actual parameters and returns a score.
- * - 1: Perfect match.
- * - 0.75: All expected parameters are present and match, but there are extra actual parameters.
- * - 0: Missing parameters or mismatched values.
- */
-function compareParams(expected: unknown, actual: unknown): number {
-    if (expected === null || expected === undefined) {
-        return actual === null || actual === undefined ? 1 : 0;
-    }
-    if (actual === null || actual === undefined) {
-        return 0;
-    }
+/** Param score: 1 if identical, 0.75 if `actual` only adds entries, 0 if anything expected is missing or changed. */
+function compareParams(expected: Record<string, unknown>, actual: Record<string, unknown>): number {
+    const changes = diff(expected, actual);
 
-    if (Array.isArray(expected)) {
-        if (!Array.isArray(actual) || actual.length < expected.length) {
-            return 0;
-        }
-        let minScore = 1;
-        for (let i = 0; i < expected.length; i++) {
-            minScore = Math.min(minScore, compareParams(expected[i], actual[i]));
-        }
-        if (minScore === 0) {
-            return 0;
-        }
-        if (actual.length > expected.length) {
-            minScore = Math.min(minScore, 0.75);
-        }
-        return minScore;
+    if (changes.length === 0) {
+        return 1;
     }
 
-    if (typeof expected === "object") {
-        if (typeof actual !== "object" || Array.isArray(actual)) {
-            return 0;
-        }
-        const expectedKeys = Object.keys(expected as Record<string, unknown>);
-        const actualKeys = Object.keys(actual as Record<string, unknown>);
-
-        let minScore = 1;
-        for (const key of expectedKeys) {
-            if (!Object.prototype.hasOwnProperty.call(actual, key)) {
-                return 0;
-            }
-            minScore = Math.min(
-                minScore,
-                compareParams((expected as Record<string, unknown>)[key], (actual as Record<string, unknown>)[key])
-            );
-        }
+    // Only CREATE entries are tolerated; a REMOVE or CHANGE means an expected value is missing or differs.
+    const hasMismatch = changes.some((entry) => entry.type !== "CREATE");
 
-        if (minScore === 0) {
-            return 0;
-        }
-
-        if (actualKeys.length > expectedKeys.length) {
-            minScore = Math.min(minScore, 0.75);
-        }
-        return minScore;
+    if (hasMismatch) {
+        return 0;
     }
 
-    return expected == actual ? 1 : 0;
+    return 0.75;
 }
diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts
@@ -28,11 +28,13 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage {
     async createSnapshotEntry(
         snapshotEntry: Pick<
             AccuracySnapshotEntry,
+            | "provider"
             | "requestedModel"
             | "test"
             | "prompt"
             | "toolCallingAccuracy"
-            | "parameterAccuracy"
+            | "expectedToolCalls"
+            | "actualToolCalls"
             | "llmResponseTime"
             | "tokensUsage"
             | "respondingModel"
diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts
@@ -1,16 +1,30 @@
 import z from "zod";
 
+const ExpectedToolCallSchema = z.object({
+    toolCallId: z.string(),
+    toolName: z.string(),
+    parameters: z.record(z.string(), z.unknown()),
+});
+// The omit mask value must be `true`: zod skips falsy mask entries, which would keep toolCallId required.
+// NOTE(review): recorded (actual) tool calls carry a toolCallId, so these two names look swapped; confirm.
+const ActualToolCallSchema = ExpectedToolCallSchema.omit({ toolCallId: true });
+export type ExpectedToolCall = z.infer<typeof ExpectedToolCallSchema>;
+export type ActualToolCall = z.infer<typeof ActualToolCallSchema>;
+
 export const AccuracySnapshotEntrySchema = z.object({
     // Git and meta information for snapshot entries
     accuracyRunId: z.string(),
     createdOn: z.number(),
     commitSHA: z.string(),
     // Accuracy info
+    provider: z.string(),
     requestedModel: z.string(),
     test: z.string(),
     prompt: z.string(),
     toolCallingAccuracy: z.number(),
-    parameterAccuracy: z.number(),
+    // debug info for further investigations
+    expectedToolCalls: ExpectedToolCallSchema.array(),
+    actualToolCalls: ActualToolCallSchema.array(),
     llmResponseTime: z.number(),
     tokensUsage: z
         .object({
@@ -30,11 +44,13 @@ export interface AccuracySnapshotStorage {
     createSnapshotEntry(
         snapshotEntry: Pick<
             AccuracySnapshotEntry,
+            | "provider"
             | "requestedModel"
             | "test"
             | "prompt"
             | "toolCallingAccuracy"
-            | "parameterAccuracy"
+            | "expectedToolCalls"
+            | "actualToolCalls"
             | "llmResponseTime"
             | "tokensUsage"
             | "respondingModel"
diff --git a/tests/accuracy/sdk/accuracy-testing-client.ts b/tests/accuracy/sdk/accuracy-testing-client.ts
@@ -5,7 +5,7 @@ import { experimental_createMCPClient as createMCPClient, tool as createVercelTo
 import { CallToolResult } from "@modelcontextprotocol/sdk/types.js";
 import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";
 
-import { ToolCall } from "./accuracy-scorers.js";
+import { ExpectedToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js";
 
 const __dirname = fileURLToPath(import.meta.url);
 const distPath = path.join(__dirname, "..", "..", "..", "..", "dist");
@@ -16,7 +16,7 @@ export type MockedTools = Record<string, ToolResultGeneratorFn>;
 
 export class AccuracyTestingClient {
     private mockedTools: MockedTools = {};
-    private recordedToolCalls: ToolCall[] = [];
+    private recordedToolCalls: ExpectedToolCall[] = [];
     private constructor(private readonly vercelMCPClient: Awaited<ReturnType<typeof createMCPClient>>) {}
 
     async close() {
@@ -33,7 +33,7 @@ export class AccuracyTestingClient {
                     this.recordedToolCalls.push({
                         toolCallId: uuid(),
                         toolName: toolName,
-                        parameters: args,
+                        parameters: args as Record<string, unknown>,
                     });
                     try {
                         const toolResultGeneratorFn = this.mockedTools[toolName];
diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts
@@ -1,10 +1,10 @@
 import { TestableModels } from "./models.js";
-import { ExpectedToolCall, parameterMatchingAccuracyScorer, toolCallingAccuracyScorer } from "./accuracy-scorers.js";
+import { calculateToolCallingAccuracy } from "./accuracy-scorers.js";
 import { getVercelToolCallingAgent, VercelAgent } from "./agent.js";
 import { prepareTestData, setupMongoDBIntegrationTest } from "../../integration/tools/mongodb/mongodbHelpers.js";
 import { AccuracyTestingClient, MockedTools } from "./accuracy-testing-client.js";
 import { getAccuracySnapshotStorage } from "./accuracy-snapshot-storage/get-snapshot-storage.js";
-import { AccuracySnapshotStorage } from "./accuracy-snapshot-storage/snapshot-storage.js";
+import { AccuracySnapshotStorage, ExpectedToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js";
 
 export interface AccuracyTestConfig {
     systemPrompt?: string;
@@ -33,7 +33,7 @@ export function describeAccuracyTests(
     const eachModel = describe.each(models);
     const eachSuite = describe.each(Object.keys(accuracyTestConfigs));
 
-    eachModel(`$modelName`, function (model) {
+    eachModel(`$displayName`, function (model) {
         const mdbIntegration = setupMongoDBIntegrationTest();
         const { populateTestData, cleanupTestDatabases } = prepareTestData(mdbIntegration);
 
@@ -72,20 +72,18 @@ export function describeAccuracyTests(
                 const result = await agent.prompt(promptForModel, model, toolsForModel);
                 const timeAfterPrompt = Date.now();
                 const toolCalls = testMCPClient.getToolCalls();
-                const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls);
-                const parameterMatchingAccuracy = parameterMatchingAccuracyScorer(
-                    testConfig.expectedToolCalls,
-                    toolCalls
-                );
+                const toolCallingAccuracy = calculateToolCallingAccuracy(testConfig.expectedToolCalls, toolCalls);
 
                 const responseTime = timeAfterPrompt - timeBeforePrompt;
                 await accuracySnapshotStorage.createSnapshotEntry({
+                    provider: model.provider,
                     requestedModel: model.modelName,
                     test: suiteName,
                     prompt: testConfig.prompt,
                     llmResponseTime: responseTime,
-                    toolCallingAccuracy,
-                    parameterAccuracy: parameterMatchingAccuracy,
+                    toolCallingAccuracy: toolCallingAccuracy,
+                    actualToolCalls: toolCalls,
+                    expectedToolCalls: testConfig.expectedToolCalls,
                     ...result,
                 });
             });
diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts