Skip to content

Commit ab61d1b

Browse files
committed
fix: minor changes and a couple of more test cases
1 parent 82df1ef commit ab61d1b

File tree

6 files changed

+298
-179
lines changed

6 files changed

+298
-179
lines changed

evals/README.md

Lines changed: 63 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,67 @@ npm run evals:run
5656
- console: pass/fail per model + evaluator
5757
- exit code: 0 = success, 1 = failure
5858

59-
## Updating test cases
59+
## Adding new test cases
6060

61-
to add/modify test cases:
62-
1. edit `test-cases.json`
63-
2. run `npm run evals:create-dataset` to update Phoenix dataset
64-
3. run `npm run evals:run` to test changes
61+
### How to contribute?
62+
63+
1. **Create an issue or PR** with your new test cases
64+
2. **Explain why it should pass** - add a `reference` field with clear reasoning
65+
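3. **Test locally** before submitting - run `npm run evals:create-dataset` to update the Phoenix dataset, then `npm run evals:run` to evaluate your changes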
66+
4. **Publish** - we'll review and merge
67+
68+
### Test case structure
69+
70+
Each test case in `test-cases.json` has this structure:
71+
72+
```json
73+
{
74+
"id": "unique-test-id",
75+
"category": "tool-category",
76+
"query": "user query text",
77+
"expectedTools": ["tool-name"],
78+
"reference": "explanation of why this should pass (optional)",
79+
"context": [/* conversation history (optional) */]
80+
}
81+
```
82+
83+
### Simple examples
84+
85+
**Basic tool selection:**
86+
```json
87+
{
88+
"id": "fetch-actor-details-1",
89+
"category": "fetch-actor-details",
90+
"query": "What are the details of apify/instagram-scraper?",
91+
"expectedTools": ["fetch-actor-details"]
92+
}
93+
```
94+
95+
**With reference explanation:**
96+
```json
97+
{
98+
"id": "fetch-actor-details-3",
99+
"category": "fetch-actor-details",
100+
"query": "Scrape details of apify/google-search-scraper",
101+
"expectedTools": ["fetch-actor-details"],
102+
"reference": "It should call the fetch-actor-details with the actor ID 'apify/google-search-scraper' and return the actor's documentation."
103+
}
104+
```
105+
106+
### Advanced examples with context
107+
108+
**Multi-step conversation flow:**
109+
```json
110+
{
111+
"id": "weather-mcp-search-then-call-1",
112+
"category": "flow",
113+
"query": "Now, use the mcp to check the weather in Prague, Czechia?",
114+
"expectedTools": ["call-actor"],
115+
"context": [
116+
{ "role": "user", "content": "Search for weather MCP server" },
117+
{ "role": "assistant", "content": "I'll help you to do that" },
118+
{ "role": "tool_use", "tool": "search-actors", "input": {"search": "weather mcp", "limit": 5} },
119+
{ "role": "tool_result", "tool_use_id": 12, "content": "Tool 'search-actors' successful, Actor found: jiri.spilka/weather-mcp-server" }
120+
]
121+
}
122+
```

evals/config.ts

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ export const MODELS_TO_EVALUATE = [
3333

3434
export const TOOL_SELECTION_EVAL_MODEL = 'openai/gpt-4o-mini';
3535

36-
export const PASS_THRESHOLD = 0.6;
36+
export const PASS_THRESHOLD = 0.7;
3737

3838
export const DATASET_NAME = `mcp_server_dataset_v${getTestCasesVersion()}`;
3939

@@ -50,11 +50,11 @@ the right tool to call.
5050
5151
[BEGIN DATA]
5252
************
53-
{{context}}
54-
{{query}}
53+
[User's previous interaction with the assistant]: {{context}}
54+
[User query]: {{query}}
5555
************
56-
{{tool_calls}}
57-
{{llm_response}}
56+
[LLM decided to call these tools]: {{tool_calls}}
57+
[LLM response]: {{llm_response}}
5858
************
5959
[END DATA]
6060
@@ -78,9 +78,9 @@ Base your decision solely on the information provided in [BEGIN DATA] ... [END D
7878
the [Tool Definitions], and the [Reference instructions] (if provided).
7979
Reference instructions are optional and are intended to help you understand the use case and make your decision.
8080
81-
{{reference}}
81+
[Reference instructions]: {{reference}}
8282
83-
{{tool_definitions}}
83+
[Tool Definitions]: {{tool_definitions}}
8484
`
8585
export function getRequiredEnvVars(): Record<string, string | undefined> {
8686
return {

evals/create-dataset.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ interface TestCase {
2727
id: string;
2828
category: string;
2929
query: string;
30-
context?: string;
30+
context?: string | string[];
3131
expectedTools?: string[];
3232
reference?: string;
3333
}
@@ -68,7 +68,7 @@ async function createDatasetFromTestCases(): Promise<void> {
6868

6969
// Convert to format expected by Phoenix
7070
const examples = testCases.map((testCase) => ({
71-
input: { query: testCase.query },
71+
input: { query: testCase.query, context: testCase.context || '' },
7272
output: { expectedTools: testCase.expectedTools?.join(', '), reference: testCase.reference || '' },
7373
metadata: { category: testCase.category },
7474
}));

evals/run-evaluation.ts

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -85,27 +85,37 @@ function createOpenRouterTask(modelName: string, tools: ToolBase[]) {
8585
apiKey: sanitizeHeaderValue(process.env.OPENROUTER_API_KEY),
8686
});
8787

88-
console.log(`Input: ${JSON.stringify(example)}`);
88+
log.info(`Input: ${JSON.stringify(example)}`);
8989

9090
const context = String(example.input?.context ?? '');
9191
const query = String(example.input?.query ?? '');
9292

93-
let content = context ? `Context: ${context}\n\n` : '';
94-
content += query ? `User query: ${query}` : '';
95-
9693
const messages: OpenAI.Chat.Completions.ChatCompletionMessageParam[] = [
9794
{ role: 'system', content: SYSTEM_PROMPT },
98-
{ role: 'user', content },
9995
];
10096

101-
console.log(`Model: ${modelName}, Messages: ${JSON.stringify(messages)}`);
97+
if (context) {
98+
messages.push({
99+
role: 'user',
100+
content: `My previous interaction with the assistant: ${context}`
101+
});
102+
}
103+
104+
messages.push({
105+
role: 'user',
106+
content: `${query}`,
107+
});
108+
109+
log.info(`Messages to model: ${JSON.stringify(messages)}`);
102110

103111
const response = await client.chat.completions.create({
104112
model: modelName,
105113
messages,
106114
tools: toolsOpenAI,
107115
});
108116

117+
log.info(`Model response: ${JSON.stringify(response.choices[0])}`);
118+
109119
return {
110120
tool_calls: response.choices[0].message.tool_calls || [],
111121
llm_response: response.choices[0].message.content || '',
@@ -121,6 +131,8 @@ const toolsExactMatch = asEvaluator({
121131
name: EVALUATOR_NAMES.TOOLS_EXACT_MATCH,
122132
kind: 'CODE',
123133
evaluate: async ({ output, expected }: any) => {
134+
log.info(`Evaluating tools match. Expected: ${JSON.stringify(expected)}, Output: ${JSON.stringify(output)}`);
135+
124136
let expectedTools = expected?.expectedTools || [];
125137
if (typeof expectedTools === 'string') {
126138
expectedTools = expectedTools.split(', ');
@@ -144,7 +156,7 @@ const toolsExactMatch = asEvaluator({
144156
const score = isCorrect ? 1.0 : 0.0;
145157
const explanation = `Expected: ${JSON.stringify(expectedTools)}, Got: ${JSON.stringify(outputTools)}`;
146158

147-
log.debug(`🕵 Tools exact match: score=${score}, output=${JSON.stringify(outputTools)}, expected=${JSON.stringify(expectedTools)}`);
159+
log.debug(`🤖 Tools exact match: score=${score}, output=${JSON.stringify(outputTools)}, expected=${JSON.stringify(expectedTools)}`);
148160

149161
return {
150162
score,
@@ -170,26 +182,26 @@ const createToolSelectionLLMEvaluator = (tools: ToolBase[]) => asEvaluator({
170182
name: EVALUATOR_NAMES.TOOL_SELECTION_LLM,
171183
kind: 'LLM',
172184
evaluate: async ({ input, output, expected }: any) => {
173-
console.log(`Evaluating tool selection. Input: ${JSON.stringify(input)}, Output: ${JSON.stringify(output)}, Expected: ${JSON.stringify(expected)}`);
185+
log.info(`Evaluating tool selection. Input: ${JSON.stringify(input)}, Output: ${JSON.stringify(output)}, Expected: ${JSON.stringify(expected)}`);
174186

175187
const evalInput = {
176-
query: `[User query] ${input?.query}` || '',
177-
context: `[Context]: ${input?.context}` || '',
178-
tool_calls: `[Tool calls]: ${JSON.stringify(output?.tool_calls)}` || '',
179-
llm_response: `[LLM response]: ${output?.llm_response}` || '',
180-
reference: `[Reference instructions]: ${expected?.reference}` || '',
181-
tool_definitions: `[Tool Definitions]: ${JSON.stringify(tools)}`
188+
query: input?.query || '',
189+
context: input?.context || '',
190+
tool_calls: JSON.stringify(output?.tool_calls || []),
191+
llm_response: output?.llm_response || '',
192+
reference: expected?.reference || '',
193+
tool_definitions: JSON.stringify(tools)
182194
};
183195

184196
try {
185197
const result = await evaluator(evalInput);
186-
console.log(`🕵 Tool selection: score: ${result.score}: ${JSON.stringify(result)}`);
198+
log.info(`🕵 Tool selection: score: ${result.score}: ${JSON.stringify(result)}`);
187199
return {
188200
score: result.score || 0.0,
189201
explanation: result.explanation || 'No explanation returned by model'
190202
};
191203
} catch (error) {
192-
console.log(`Tool selection evaluation failed: ${error}`);
204+
log.info(`Tool selection evaluation failed: ${error}`);
193205
return {
194206
score: 0.0,
195207
explanation: `Evaluation failed: ${error}`

evals/test-cases.json

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,78 @@
385385
"category": "fetch-apify-docs",
386386
"query": "Get configuration info from: https://docs.apify.com/platform/integrations/mcp",
387387
"expectedTools": ["fetch-apify-docs"]
388+
},
389+
{
390+
"id": "get-actor-output-basic-2",
391+
"category": "get-actor-output",
392+
"query": "Get query and markdown fields from dataset UvsU",
393+
"expectedTools": ["get-actor-output"]
394+
},
395+
{
396+
"id": "fetch-apify-docs-edge-1",
397+
"category": "fetch-apify-docs",
398+
"query": "Get content from: https://docs.apify.com/nonexistent-page",
399+
"expectedTools": ["fetch-apify-docs"]
400+
},
401+
{
402+
"id": "misleading-query-1",
403+
"category": "misleading",
404+
"query": "What's the weather like today?",
405+
"expectedTools": ["search-actors"]
406+
},
407+
{
408+
"id": "misleading-query-2",
409+
"category": "misleading",
410+
"query": "How do I scrape Instagram without using Apify?",
411+
"expectedTools": ["search-actors"]
412+
},
413+
{
414+
"id": "misleading-query-3",
415+
"category": "misleading",
416+
"query": "I need to build my own scraper from scratch",
417+
"expectedTools": ["search-apify-docs"]
418+
},
419+
{
420+
"id": "ambiguous-query-1",
421+
"category": "ambiguous",
422+
"query": "Instagram",
423+
"expectedTools": ["search-actors"]
424+
},
425+
{
426+
"id": "ambiguous-query-3",
427+
"category": "ambiguous",
428+
"query": "documentation",
429+
"expectedTools": ["search-apify-docs"]
430+
},
431+
{
432+
"id": "tool-selection-confusion-1",
433+
"category": "tool-selection",
434+
"query": "Find posts about AI on Instagram",
435+
"expectedTools": ["search-actors"]
436+
},
437+
{
438+
"id": "tool-selection-confusion-2",
439+
"category": "tool-selection",
440+
"query": "Search for AI articles on tech blogs",
441+
"expectedTools": ["apify-slash-rag-web-browser"]
442+
},
443+
{
444+
"id": "tool-selection-confusion-3",
445+
"category": "tool-selection",
446+
"query": "Get the latest weather forecast for New York",
447+
"expectedTools": ["apify-slash-rag-web-browser"]
448+
},
449+
{
450+
"id": "weather-mcp-search-then-call-1",
451+
"category": "flow",
452+
"query": "Now, use the mcp to check the weather in Prague, Czechia?",
453+
"expectedTools": ["call-actor"],
454+
"context": [
455+
{ "role": "user", "content": "Search for weather MCP server" },
456+
{ "role": "assistant", "content": "I'll help you to do that" },
457+
{ "role": "tool_use", "tool": "search-actors", "input": {"search": "weather mcp", "limit": 5} },
458+
{ "role": "tool_result", "tool_use_id": 12, "content": "Tool 'search-actors' successful, Actor found: jiri.spilka/weather-mcp-server" }
459+
]
388460
}
389461
]
390462
}

0 commit comments

Comments
 (0)