Skip to content

Commit 545067a

Browse files
chore: MongoDB based snapshot storage for accuracy runs
Introduces the following required environment variables:
- MDB_ACCURACY_RUN_ID: the accuracy run id
- MDB_ACCURACY_MDB_URL: the connection string of the MongoDB instance where the snapshots will be stored
- MDB_ACCURACY_MDB_DB: the database for snapshots
- MDB_ACCURACY_MDB_COLLECTION: the collection for snapshots
1 parent 4ed06a9 commit 545067a

File tree

8 files changed

+244
-16
lines changed

8 files changed

+244
-16
lines changed

package-lock.json

Lines changed: 34 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
"generate": "./scripts/generate.sh",
3232
"test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage --testPathIgnorePatterns=/tests/accuracy/",
3333
"test:accuracy": "node --experimental-vm-modules node_modules/jest/bin/jest.js --testPathPattern tests/accuracy",
34-
"test:accuracy-file": "node --experimental-vm-modules node_modules/jest/bin/jest.js"
34+
"test:accuracy-file": "MDB_ACCURACY_RUN_ID=$(npx uuid v4) node --experimental-vm-modules node_modules/jest/bin/jest.js"
3535
},
3636
"license": "Apache-2.0",
3737
"devDependencies": {
@@ -62,6 +62,7 @@
6262
"openapi-types": "^12.1.3",
6363
"openapi-typescript": "^7.6.1",
6464
"prettier": "^3.5.3",
65+
"simple-git": "^3.28.0",
6566
"ts-jest": "^29.3.1",
6667
"tsx": "^4.19.3",
6768
"typescript": "^5.8.2",
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import { getCommitSHA } from "../git-info.js";
2+
import { MongoDBSnapshotStorage } from "./mdb-snapshot-storage.js";
3+
import { AccuracySnapshotStorage } from "./snapshot-storage.js";
4+
5+
/**
 * Builds the snapshot storage used by an accuracy test run.
 *
 * The run id is read from the `MDB_ACCURACY_RUN_ID` environment variable and
 * the commit hash from `getCommitSHA()`; both are then handed to
 * `MongoDBSnapshotStorage.getStorage`.
 *
 * @returns The storage produced by `MongoDBSnapshotStorage.getStorage`.
 * @throws Error when the run id variable is unset or empty, or when no commit
 *   SHA could be resolved.
 */
export async function getAccuracySnapshotStorage(): Promise<AccuracySnapshotStorage> {
    const { MDB_ACCURACY_RUN_ID: runId } = process.env;
    if (!runId) {
        throw new Error(
            "Cannot create AccuracySnapshotStorage without an accuracyRunId - ensure that the relevant env variable is present."
        );
    }

    const sha = await getCommitSHA();
    if (!sha) {
        throw new Error("Cannot create AccuracySnapshotStorage without a commitSHA.");
    }

    return MongoDBSnapshotStorage.getStorage(sha, runId);
}
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
import { Collection, MongoClient } from "mongodb";
2+
import { AccuracySnapshotEntry, AccuracySnapshotEntrySchema, AccuracySnapshotStorage } from "./snapshot-storage.js";
3+
4+
/**
 * `AccuracySnapshotStorage` implementation that persists snapshot entries in a
 * MongoDB collection.
 *
 * Instances are created through {@link MongoDBSnapshotStorage.getStorage},
 * which reads the connection settings from the environment. Every entry
 * written by an instance is stamped with the commit SHA and accuracy run id
 * the instance was created with.
 */
export class MongoDBSnapshotStorage implements AccuracySnapshotStorage {
    private readonly client: MongoClient;
    private readonly snapshotCollection: Collection;
    private readonly accuracyRunId: string;
    private readonly commitSHA: string;

    private constructor({
        mongodbUrl,
        database,
        collection,
        accuracyRunId,
        commitSHA,
    }: {
        mongodbUrl: string;
        database: string;
        collection: string;
        accuracyRunId: string;
        commitSHA: string;
    }) {
        this.client = new MongoClient(mongodbUrl);
        this.snapshotCollection = this.client.db(database).collection(collection);
        this.accuracyRunId = accuracyRunId;
        this.commitSHA = commitSHA;
    }

    /**
     * Inserts one snapshot entry, adding this storage's `commitSHA` and
     * `accuracyRunId` plus a `createdOn` timestamp (`Date.now()`).
     *
     * @param snapshotEntry The accuracy measurements and LLM response details to store.
     */
    async createSnapshotEntry(
        snapshotEntry: Pick<
            AccuracySnapshotEntry,
            | "requestedModel"
            | "test"
            | "prompt"
            | "toolCallingAccuracy"
            | "parameterAccuracy"
            | "llmResponseTime"
            | "tokensUsage"
            | "respondingModel"
            | "text"
            | "messages"
        >
    ): Promise<void> {
        const snapshotWithMeta: AccuracySnapshotEntry = {
            ...snapshotEntry,
            commitSHA: this.commitSHA,
            accuracyRunId: this.accuracyRunId,
            createdOn: Date.now(),
        };
        await this.snapshotCollection.insertOne(snapshotWithMeta);
    }

    /**
     * Looks up the run id of the most recently created entry for a commit.
     *
     * @param commit The commit SHA to search for.
     * @returns The stringified `accuracyRunId` of the newest matching entry, or
     *   `undefined` when there is no matching entry (or it has no run id).
     */
    async getLastRunIdForCommit(commit: string): Promise<string | undefined> {
        const document = await this.snapshotCollection.findOne(
            // Entries are written with the hash under `commitSHA` (see
            // createSnapshotEntry); filtering on any other key never matches.
            { commitSHA: commit },
            { sort: { createdOn: -1 }, projection: { accuracyRunId: 1 } }
        );

        return document?.accuracyRunId ? `${document.accuracyRunId}` : undefined;
    }

    /**
     * Fetches every entry recorded under the given run id.
     *
     * @param accuracyRunId The run id whose entries should be returned.
     * @returns The matching documents, parsed with `AccuracySnapshotEntrySchema`.
     * @throws If any stored document fails schema parsing.
     */
    async getSnapshotEntriesForRunId(accuracyRunId: string): Promise<AccuracySnapshotEntry[]> {
        const snapshotEntries = await this.snapshotCollection.find({ accuracyRunId }).toArray();
        return AccuracySnapshotEntrySchema.array().parse(snapshotEntries);
    }

    /**
     * Creates a storage configured from `MDB_ACCURACY_MDB_URL`,
     * `MDB_ACCURACY_MDB_DB` and `MDB_ACCURACY_MDB_COLLECTION`.
     *
     * @param commitSHA Commit hash stamped on every entry written by the storage.
     * @param accuracyRunId Run id stamped on every entry written by the storage.
     * @throws Error when any of the three environment variables is unset or empty.
     */
    static getStorage(commitSHA: string, accuracyRunId: string): MongoDBSnapshotStorage {
        const mongodbUrl = process.env.MDB_ACCURACY_MDB_URL;
        const database = process.env.MDB_ACCURACY_MDB_DB;
        const collection = process.env.MDB_ACCURACY_MDB_COLLECTION;
        if (!mongodbUrl || !database || !collection) {
            throw new Error("Cannot create MongoDBAccuracySnapshot storage without relevant configuration provided");
        }

        return new MongoDBSnapshotStorage({
            mongodbUrl,
            database,
            collection,
            commitSHA,
            accuracyRunId,
        });
    }

    /** Closes the underlying MongoDB client. */
    async close(): Promise<void> {
        await this.client.close();
    }
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import z from "zod";
2+
3+
/** Token counters reported for a prompt; each counter may be absent. */
const TokensUsageSchema = z.object({
    promptTokens: z.number().optional(),
    completionTokens: z.number().optional(),
    totalTokens: z.number().optional(),
});

/**
 * Zod schema for a single accuracy snapshot entry: run/commit metadata
 * followed by the accuracy scores and the LLM response details.
 */
export const AccuracySnapshotEntrySchema = z.object({
    // Git and meta information for snapshot entries
    accuracyRunId: z.string(),
    createdOn: z.number(),
    commitSHA: z.string(),
    // Accuracy info
    requestedModel: z.string(),
    test: z.string(),
    prompt: z.string(),
    toolCallingAccuracy: z.number(),
    parameterAccuracy: z.number(),
    llmResponseTime: z.number(),
    tokensUsage: TokensUsageSchema.optional(),
    respondingModel: z.string(),
    text: z.string(),
    messages: z.array(z.record(z.string(), z.unknown())),
});
26+
27+
/** Static type of one snapshot entry, inferred from `AccuracySnapshotEntrySchema`. */
export type AccuracySnapshotEntry = z.infer<typeof AccuracySnapshotEntrySchema>;
28+
29+
/**
 * Contract for a store of accuracy snapshot entries.
 */
export interface AccuracySnapshotStorage {
    /**
     * Persists one snapshot entry. Callers pass only the accuracy and LLM
     * response fields; `accuracyRunId`, `createdOn` and `commitSHA` are not
     * part of the accepted input.
     */
    createSnapshotEntry(
        snapshotEntry: Pick<
            AccuracySnapshotEntry,
            | "requestedModel"
            | "test"
            | "prompt"
            | "toolCallingAccuracy"
            | "parameterAccuracy"
            | "llmResponseTime"
            | "tokensUsage"
            | "respondingModel"
            | "text"
            | "messages"
        >
    ): Promise<void>;

    /** Resolves to a run id recorded for `commit`, or `undefined` when there is none. */
    getLastRunIdForCommit(commit: string): Promise<string | undefined>;

    /** Resolves to the snapshot entries recorded under `accuracyRunId`. */
    getSnapshotEntriesForRunId(accuracyRunId: string): Promise<AccuracySnapshotEntry[]>;

    /** Releases whatever resources the storage holds. */
    close(): Promise<void>;
}

tests/accuracy/sdk/agent.ts

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { generateText, Tool, Schema, LanguageModelV1 } from "ai";
1+
import { generateText, LanguageModelV1, experimental_createMCPClient } from "ai";
22
import { Model } from "./models.js";
33

44
const systemPrompt = [
@@ -10,15 +10,32 @@ const systemPrompt = [
1010
'If you do not know the answer or the request cannot be fulfilled, you MUST reply with "I don\'t know"',
1111
];
1212

13-
export interface Agent<M = unknown, T = unknown, R = unknown> {
14-
prompt(prompt: string, model: M, tools: T): Promise<R>;
13+
// Some necessary types from Vercel SDK
14+
export type VercelMCPClient = Awaited<ReturnType<typeof experimental_createMCPClient>>;
15+
export type VercelMCPClientTools = Awaited<ReturnType<VercelMCPClient["tools"]>>;
16+
export type VercelAgent = ReturnType<typeof getVercelToolCallingAgent>;
17+
18+
// Generic interface for Agent, in case we need to switch to some other agent
19+
// development SDK
20+
/** Outcome of a single agent prompt. */
export interface AgentPromptResult {
    /** Identifier of the model that produced the response. */
    respondingModel: string;
    /** Token counters for the prompt, when available; each counter is optional. */
    tokensUsage?: {
        promptTokens?: number;
        completionTokens?: number;
        totalTokens?: number;
    };
    /** Text of the response. */
    text: string;
    /** Response messages, kept as loosely typed records. */
    messages: Record<string, unknown>[];
}
30+
/**
 * SDK-agnostic agent contract.
 *
 * Type parameters carry a `T` prefix so they are not confused with the
 * `Model` type imported from `./models.js`.
 */
export interface Agent<TModel = unknown, TTools = unknown, TResult = unknown> {
    /** Sends `prompt` to `model` with `tools` available and resolves to the agent's result. */
    prompt(prompt: string, model: TModel, tools: TTools): Promise<TResult>;
}
1633

1734
export function getVercelToolCallingAgent(
1835
requestedSystemPrompt?: string
19-
): Agent<Model<LanguageModelV1>, Record<string, Tool<Schema<unknown>>>, { text: string; messages: unknown[] }> {
36+
): Agent<Model<LanguageModelV1>, VercelMCPClientTools, AgentPromptResult> {
2037
return {
21-
async prompt(prompt: string, model: Model<LanguageModelV1>, tools: Record<string, Tool<Schema<unknown>>>) {
38+
async prompt(prompt: string, model: Model<LanguageModelV1>, tools: VercelMCPClientTools) {
2239
const result = await generateText({
2340
model: model.getModel(),
2441
system: [...systemPrompt, requestedSystemPrompt].join("\n"),
@@ -29,6 +46,8 @@ export function getVercelToolCallingAgent(
2946
return {
3047
text: result.text,
3148
messages: result.response.messages,
49+
respondingModel: result.response.modelId,
50+
tokensUsage: result.usage,
3251
};
3352
},
3453
};

tests/accuracy/sdk/describe-accuracy-tests.ts

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import { TestableModels } from "./models.js";
22
import { ExpectedToolCall, parameterMatchingAccuracyScorer, toolCallingAccuracyScorer } from "./accuracy-scorers.js";
3-
import { Agent, getVercelToolCallingAgent } from "./agent.js";
3+
import { getVercelToolCallingAgent, VercelAgent } from "./agent.js";
44
import { prepareTestData, setupMongoDBIntegrationTest } from "../../integration/tools/mongodb/mongodbHelpers.js";
55
import { AccuracyTestingClient, MockedTools } from "./accuracy-testing-client.js";
6+
import { getAccuracySnapshotStorage } from "./accuracy-snapshot-storage/get-snapshot-storage.js";
7+
import { AccuracySnapshotStorage } from "./accuracy-snapshot-storage/snapshot-storage.js";
68

79
export interface AccuracyTestConfig {
810
systemPrompt?: string;
@@ -35,10 +37,12 @@ export function describeAccuracyTests(
3537
const mdbIntegration = setupMongoDBIntegrationTest();
3638
const { populateTestData, cleanupTestDatabases } = prepareTestData(mdbIntegration);
3739

40+
let accuracySnapshotStorage: AccuracySnapshotStorage;
3841
let testMCPClient: AccuracyTestingClient;
39-
let agent: Agent;
42+
let agent: VercelAgent;
4043

4144
beforeAll(async () => {
45+
accuracySnapshotStorage = await getAccuracySnapshotStorage();
4246
testMCPClient = await AccuracyTestingClient.initializeClient(mdbIntegration.connectionString());
4347
agent = getVercelToolCallingAgent();
4448
});
@@ -50,6 +54,7 @@ export function describeAccuracyTests(
5054
});
5155

5256
afterAll(async () => {
57+
await accuracySnapshotStorage.close();
5358
await testMCPClient.close();
5459
});
5560

@@ -62,21 +67,27 @@ export function describeAccuracyTests(
6267
const promptForModel = testConfig.injectConnectedAssumption
6368
? [testConfig.prompt, "(Assume that you are already connected to a MongoDB cluster!)"].join(" ")
6469
: testConfig.prompt;
65-
const conversation = await agent.prompt(promptForModel, model, toolsForModel);
70+
71+
const timeBeforePrompt = Date.now();
72+
const result = await agent.prompt(promptForModel, model, toolsForModel);
73+
const timeAfterPrompt = Date.now();
6674
const toolCalls = testMCPClient.getToolCalls();
6775
const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls);
6876
const parameterMatchingAccuracy = parameterMatchingAccuracyScorer(
6977
testConfig.expectedToolCalls,
7078
toolCalls
7179
);
72-
console.debug(testConfig.prompt);
73-
// console.debug(`Conversation`, JSON.stringify(conversation, null, 2));
74-
// console.debug(`Tool calls`, JSON.stringify(toolCalls, null, 2));
75-
console.debug(
76-
"Tool calling accuracy: %s, Parameter Accuracy: %s",
80+
81+
const responseTime = timeAfterPrompt - timeBeforePrompt;
82+
await accuracySnapshotStorage.createSnapshotEntry({
83+
requestedModel: model.modelName,
84+
test: suiteName,
85+
prompt: testConfig.prompt,
86+
llmResponseTime: responseTime,
7787
toolCallingAccuracy,
78-
parameterMatchingAccuracy
79-
);
88+
parameterAccuracy: parameterMatchingAccuracy,
89+
...result,
90+
});
8091
});
8192
});
8293
});

tests/accuracy/sdk/git-info.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import { simpleGit } from "simple-git";
2+
3+
/**
 * Resolves the hash of the latest commit reported by `simpleGit().log()`.
 *
 * @returns The latest commit's hash, or `undefined` when the log reports no
 *   latest commit.
 */
export async function getCommitSHA(): Promise<string | undefined> {
    // Only the newest commit is read, so cap the log at a single entry
    // instead of loading the whole history.
    const { latest } = await simpleGit().log({ maxCount: 1 });
    return latest?.hash;
}

0 commit comments

Comments
 (0)