Skip to content

Commit e51d339

Browse files
chore: more consolidated prompt tests
1 parent 14567e8 commit e51d339

File tree

4 files changed

+48
-38
lines changed

4 files changed

+48
-38
lines changed

tests/accuracy/list-databases.test.ts

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,38 @@
11
import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
22
import { getAvailableModels } from "./sdk/models.js";
3+
import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
34

4-
describeAccuracyTests("list-databases", getAvailableModels(), [
5-
{
6-
prompt: "Assume that you're already connected. How many collections are there in sample_mflix database",
5+
/**
 * Builds one accuracy-test configuration for the `list-databases` tool.
 *
 * Every configuration produced here shares the same system prompt, the same
 * mocked `list-databases` tool and the same expected tool call; only the
 * user-facing prompt differs.
 *
 * @param prompt - The user prompt to send to the model for this test case.
 * @returns A config whose `expectedToolCalls` lists a single `list-databases`
 *          call with an empty parameter object.
 */
function describeListDatabasesAccuracyTests(prompt: string): AccuracyTestConfig {
6+
return {
7+
systemPrompt: "Assume that you're already connected.",
8+
prompt: prompt,
79
// Mocked tool: returns two text entries ("Name: db1", "Name: db2").
mockedTools: {
8-
"list-collections": function listCollections() {
10+
"list-databases": function listDatabases() {
911
return {
1012
content: [
1113
{
1214
type: "text",
13-
text: "Name: coll1",
15+
text: "Name: db1",
16+
},
17+
{
18+
type: "text",
19+
text: "Name: db2",
1420
},
1521
],
1622
};
1723
},
1824
},
1925
// The tool call the test expects the model to make.
expectedToolCalls: [
2026
{
21-
toolName: "list-collections",
22-
parameters: { database: "sample_mflix" },
27+
toolName: "list-databases",
28+
parameters: {},
2329
},
2430
],
25-
},
31+
};
32+
}
33+
34+
// Registers the "list-databases" suite against the models returned by
// getAvailableModels(). The three prompts are different phrasings that all go
// through describeListDatabasesAccuracyTests, so each carries the same mocked
// tool and the same expected `list-databases` call.
describeAccuracyTests("list-databases", getAvailableModels(), [
35+
describeListDatabasesAccuracyTests("How many databases do I have?"),
36+
describeListDatabasesAccuracyTests("List all the databases in my cluster."),
37+
describeListDatabasesAccuracyTests("Is there a sample_mflix database in my cluster?"),
2638
]);

tests/accuracy/sdk/agent.ts

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,23 +8,20 @@ const systemPrompt = [
88
"When calling a tool, you MUST strictly follow its input schema and MUST provide all required arguments",
99
"If a task requires multiple steps, you MUST call the necessary tools in sequence",
1010
'If you do not know the answer or the request cannot be fulfilled, you MUST reply with "I don\'t know"',
11-
"You SHOULD assume that you are already connected to a MongoDB connection",
12-
].join("\n");
11+
];
1312

1413
export interface Agent<M = unknown, T = unknown, R = unknown> {
1514
prompt(prompt: string, model: M, tools: T): Promise<R>;
1615
}
1716

18-
export function getVercelToolCallingAgent(): Agent<
19-
Model<LanguageModelV1>,
20-
Record<string, Tool<Schema<unknown>>>,
21-
{ text: string; messages: unknown[] }
22-
> {
17+
export function getVercelToolCallingAgent(
18+
requestedSystemPrompt?: string
19+
): Agent<Model<LanguageModelV1>, Record<string, Tool<Schema<unknown>>>, { text: string; messages: unknown[] }> {
2320
return {
2421
async prompt(prompt: string, model: Model<LanguageModelV1>, tools: Record<string, Tool<Schema<unknown>>>) {
2522
const result = await generateText({
2623
model: model.getModel(),
27-
system: systemPrompt,
24+
system: [...systemPrompt, requestedSystemPrompt].join("\n"),
2825
prompt,
2926
tools,
3027
maxSteps: 100,

tests/accuracy/sdk/describe-accuracy-tests.ts

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ import { ExpectedToolCall, parameterMatchingAccuracyScorer, toolCallingAccuracyS
55
import { Agent, getVercelToolCallingAgent } from "./agent.js";
66
import { appendAccuracySnapshot } from "./accuracy-snapshot.js";
77

8-
interface AccuracyTestConfig {
8+
export interface AccuracyTestConfig {
9+
systemPrompt?: string;
910
prompt: string;
1011
expectedToolCalls: ExpectedToolCall[];
1112
mockedTools: MockedTools;
@@ -17,13 +18,7 @@ export function describeAccuracyTests(
1718
accuracyTestConfigs: AccuracyTestConfig[]
1819
) {
1920
const accuracyDatetime = process.env.MDB_ACCURACY_DATETIME;
20-
if (!accuracyDatetime) {
21-
throw new Error("MDB_ACCURACY_DATETIME environment variable is not set");
22-
}
2321
const accuracyCommit = process.env.MDB_ACCURACY_COMMIT;
24-
if (!accuracyCommit) {
25-
throw new Error("MDB_ACCURACY_COMMIT environment variable is not set");
26-
}
2722

2823
if (!models.length) {
2924
console.warn(`No models available to test ${suiteName}`);
@@ -53,25 +48,31 @@ export function describeAccuracyTests(
5348
const toolCalls = testTools.getToolCalls();
5449
const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls);
5550
const parameterMatchingAccuracy = parameterMatchingAccuracyScorer(testConfig.expectedToolCalls, toolCalls);
56-
await appendAccuracySnapshot({
57-
datetime: accuracyDatetime,
58-
commit: accuracyCommit,
59-
model: model.modelName,
60-
suite: suiteName,
61-
test: testConfig.prompt,
62-
toolCallingAccuracy,
63-
parameterAccuracy: parameterMatchingAccuracy,
64-
});
51+
if (accuracyDatetime && accuracyCommit) {
52+
await appendAccuracySnapshot({
53+
datetime: accuracyDatetime,
54+
commit: accuracyCommit,
55+
model: model.modelName,
56+
suite: suiteName,
57+
test: testConfig.prompt,
58+
toolCallingAccuracy,
59+
parameterAccuracy: parameterMatchingAccuracy,
60+
});
61+
} else {
62+
console.info(
63+
`Skipping accuracy snapshot update for ${model.modelName} - ${suiteName} - ${testConfig.prompt}`
64+
);
65+
}
6566

6667
try {
6768
expect(toolCallingAccuracy).not.toEqual(0);
6869
expect(parameterMatchingAccuracy).toBeGreaterThanOrEqual(0.5);
6970
} catch (error) {
7071
console.warn(`Accuracy test failed for ${model.modelName} - ${suiteName} - ${testConfig.prompt}`);
71-
console.warn(`Conversation`, JSON.stringify(conversation, null, 2));
72-
console.warn(`Tool calls`, JSON.stringify(toolCalls, null, 2));
73-
console.warn(`Tool calling accuracy`, toolCallingAccuracy);
74-
console.warn(`Parameter matching accuracy`, parameterMatchingAccuracy);
72+
console.debug(`Conversation`, JSON.stringify(conversation, null, 2));
73+
console.debug(`Tool calls`, JSON.stringify(toolCalls, null, 2));
74+
console.debug(`Tool calling accuracy`, toolCallingAccuracy);
75+
console.debug(`Parameter matching accuracy`, parameterMatchingAccuracy);
7576
throw error;
7677
}
7778
});

tests/accuracy/sdk/models.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ export class OllamaModel implements Model {
3636

3737
// Model instances declared for accuracy testing. After this change only
// "gemini-1.5-flash" is active; the other two entries are commented out.
// NOTE(review): the commit does not say why they were disabled — confirm
// whether this is temporary before relying on single-model results.
const ALL_TESTABLE_MODELS = [
3838
new GeminiModel("gemini-1.5-flash"),
39-
new GeminiModel("gemini-2.0-flash"),
40-
new OllamaModel("qwen3:latest"),
39+
// new GeminiModel("gemini-2.0-flash"),
40+
// new OllamaModel("qwen3:latest"),
4141
];
4242

4343
export type TestableModels = ReturnType<typeof getAvailableModels>;

0 commit comments

Comments
 (0)