fix: minor changes and a couple more test cases

jirispilka · jirispilka · commit ab61d1bcfe8a · 2025-10-16T22:35:47.000+02:00
diff --git a/evals/README.md b/evals/README.md
@@ -56,9 +56,67 @@ npm run evals:run
 - console: pass/fail per model + evaluator
 - exit code: 0 = success, 1 = failure
 
-## Updating test cases
+## Adding new test cases
 
-to add/modify test cases:
-1. edit `test-cases.json`
-2. run `npm run evals:create-dataset` to update Phoenix dataset
-3. run `npm run evals:run` to test changes
+### How to contribute?
+
+1. **Create an issue or PR** with your new test cases
+2. **Explain why it should pass** - add a `reference` field with clear reasoning
+3. **Test locally** before submitting
+4. **Publish** - we'll review and merge
+
+### Test case structure
+
+Each test case in `test-cases.json` has this structure:
+
+```json
+{
+  "id": "unique-test-id",
+  "category": "tool-category",
+  "query": "user query text",
+  "expectedTools": ["tool-name"],
+  "reference": "explanation of why this should pass (optional)",
+  "context": [/* conversation history (optional) */]
+}
+```
+
+### Simple examples
+
+**Basic tool selection:**
+```json
+{
+  "id": "fetch-actor-details-1",
+  "category": "fetch-actor-details",
+  "query": "What are the details of apify/instagram-scraper?",
+  "expectedTools": ["fetch-actor-details"]
+}
+```
+
+**With reference explanation:**
+```json
+{
+  "id": "fetch-actor-details-3",
+  "category": "fetch-actor-details",
+  "query": "Scrape details of apify/google-search-scraper",
+  "expectedTools": ["fetch-actor-details"],
+  "reference": "It should call the fetch-actor-details with the actor ID 'apify/google-search-scraper' and return the actor's documentation."
+}
+```
+
+### Advanced examples with context
+
+**Multi-step conversation flow:**
+```json
+{
+  "id": "weather-mcp-search-then-call-1",
+  "category": "flow",
+  "query": "Now, use the mcp to check the weather in Prague, Czechia?",
+  "expectedTools": ["call-actor"],
+  "context": [
+    { "role": "user", "content": "Search for weather MCP server" },
+    { "role": "assistant", "content": "I'll help you to do that" },
+    { "role": "tool_use", "tool": "search-actors", "input": {"search": "weather mcp", "limit": 5} },
+    { "role": "tool_result", "tool_use_id": 12, "content": "Tool 'search-actors' successful, Actor found: jiri.spilka/weather-mcp-server" }
+  ]
+}
+```
diff --git a/evals/config.ts b/evals/config.ts
@@ -33,7 +33,7 @@ export const MODELS_TO_EVALUATE = [
 
 export const TOOL_SELECTION_EVAL_MODEL = 'openai/gpt-4o-mini';
 
-export const PASS_THRESHOLD = 0.6;
+export const PASS_THRESHOLD = 0.7;
 
 export const DATASET_NAME = `mcp_server_dataset_v${getTestCasesVersion()}`;
 
@@ -50,11 +50,11 @@ the right tool to call.
 
 [BEGIN DATA]
 ************
-{{context}}
-{{query}}
+[User's previous interaction with the assistant]: {{context}}
+[User query]: {{query}}
 ************
-{{tool_calls}}
-{{llm_response}}
+[LLM decided to call these tools]: {{tool_calls}}
+[LLM response]: {{llm_response}}
 ************
 [END DATA]
 
@@ -78,9 +78,9 @@ Base your decision solely on the information provided in [BEGIN DATA] ... [END D
 the [Tool Definitions], and the [Reference instructions] (if provided).
 Reference instructions are optional and are intended to help you understand the use case and make your decision.
 
-{{reference}}
+[Reference instructions]: {{reference}}
 
-{{tool_definitions}}
+[Tool definitions]: {{tool_definitions}}
 `
 export function getRequiredEnvVars(): Record<string, string | undefined> {
     return {
diff --git a/evals/create-dataset.ts b/evals/create-dataset.ts
@@ -27,7 +27,7 @@ interface TestCase {
     id: string;
     category: string;
     query: string;
-    context?: string;
+    context?: string | string[];
     expectedTools?: string[];
     reference?: string;
 }
@@ -68,7 +68,7 @@ async function createDatasetFromTestCases(): Promise<void> {
 
     // Convert to format expected by Phoenix
     const examples = testCases.map((testCase) => ({
-        input: { query: testCase.query },
+        input: { query: testCase.query, context: testCase.context || '' },
         output: { expectedTools: testCase.expectedTools?.join(', '), reference: testCase.reference || '' },
         metadata: { category: testCase.category },
     }));
diff --git a/evals/run-evaluation.ts b/evals/run-evaluation.ts
@@ -85,27 +85,37 @@ function createOpenRouterTask(modelName: string, tools: ToolBase[]) {
             apiKey: sanitizeHeaderValue(process.env.OPENROUTER_API_KEY),
         });
 
-        console.log(`Input: ${JSON.stringify(example)}`);
+        log.info(`Input: ${JSON.stringify(example)}`);
 
         const context = String(example.input?.context ?? '');
         const query = String(example.input?.query ?? '');
 
-        let content = context ? `Context: ${context}\n\n` : '';
-        content += query ? `User query: ${query}` : '';
-
         const messages: OpenAI.Chat.Completions.ChatCompletionMessageParam[] = [
             { role: 'system', content: SYSTEM_PROMPT },
-            { role: 'user', content },
         ];
 
-        console.log(`Model: ${modelName}, Messages: ${JSON.stringify(messages)}`);
+        if (context) {
+            messages.push({
+                role: 'user',
+                content: `My previous interaction with the assistant: ${context}`
+            });
+        }
+
+        messages.push({
+            role: 'user',
+            content: `${query}`,
+        });
+
+        log.info(`Messages to model: ${JSON.stringify(messages)}`);
 
         const response = await client.chat.completions.create({
             model: modelName,
             messages,
             tools: toolsOpenAI,
         });
 
+        log.info(`Model response: ${JSON.stringify(response.choices[0])}`);
+
         return {
             tool_calls: response.choices[0].message.tool_calls || [],
             llm_response: response.choices[0].message.content || '',
@@ -121,6 +131,8 @@ const toolsExactMatch = asEvaluator({
     name: EVALUATOR_NAMES.TOOLS_EXACT_MATCH,
     kind: 'CODE',
     evaluate: async ({ output, expected }: any) => {
+        log.info(`Evaluating tools match. Expected: ${JSON.stringify(expected)}, Output: ${JSON.stringify(output)}`);
+
         let expectedTools = expected?.expectedTools || [];
         if (typeof expectedTools === 'string') {
             expectedTools = expectedTools.split(', ');
@@ -144,7 +156,7 @@ const toolsExactMatch = asEvaluator({
         const score = isCorrect ? 1.0 : 0.0;
         const explanation = `Expected: ${JSON.stringify(expectedTools)}, Got: ${JSON.stringify(outputTools)}`;
 
-        log.debug(`🕵 Tools exact match: score=${score}, output=${JSON.stringify(outputTools)}, expected=${JSON.stringify(expectedTools)}`);
+        log.debug(`🤖 Tools exact match: score=${score}, output=${JSON.stringify(outputTools)}, expected=${JSON.stringify(expectedTools)}`);
 
         return {
             score,
@@ -170,26 +182,26 @@ const createToolSelectionLLMEvaluator = (tools: ToolBase[]) => asEvaluator({
     name: EVALUATOR_NAMES.TOOL_SELECTION_LLM,
     kind: 'LLM',
     evaluate: async ({ input, output, expected }: any) => {
-        console.log(`Evaluating tool selection. Input: ${JSON.stringify(input)}, Output: ${JSON.stringify(output)}, Expected: ${JSON.stringify(expected)}`);
+        log.info(`Evaluating tool selection. Input: ${JSON.stringify(input)}, Output: ${JSON.stringify(output)}, Expected: ${JSON.stringify(expected)}`);
 
         const evalInput = {
-            query: `[User query] ${input?.query}` || '',
-            context: `[Context]: ${input?.context}` || '',
-            tool_calls: `[Tool calls]: ${JSON.stringify(output?.tool_calls)}` || '',
-            llm_response: `[LLM response]: ${output?.llm_response}` || '',
-            reference: `[Reference instructions]: ${expected?.reference}` || '',
-            tool_definitions: `[Tool Definitions]: ${JSON.stringify(tools)}`
+            query: input?.query || '',
+            context: input?.context || '',
+            tool_calls: JSON.stringify(output?.tool_calls || []),
+            llm_response: output?.llm_response || '',
+            reference: expected?.reference || '',
+            tool_definitions: JSON.stringify(tools)
         };
 
         try {
             const result = await evaluator(evalInput);
-            console.log(`🕵 Tool selection: score: ${result.score}: ${JSON.stringify(result)}`);
+            log.info(`🕵 Tool selection: score: ${result.score}: ${JSON.stringify(result)}`);
             return {
                 score: result.score || 0.0,
                 explanation: result.explanation || 'No explanation returned by model'
             };
         } catch (error) {
-            console.log(`Tool selection evaluation failed: ${error}`);
+            log.info(`Tool selection evaluation failed: ${error}`);
             return {
                 score: 0.0,
                 explanation: `Evaluation failed: ${error}`
diff --git a/evals/test-cases.json b/evals/test-cases.json
@@ -385,6 +385,78 @@
       "category": "fetch-apify-docs",
       "query": "Get configuration info from: https://docs.apify.com/platform/integrations/mcp",
       "expectedTools": ["fetch-apify-docs"]
+    },
+    {
+      "id": "get-actor-output-basic-2",
+      "category": "get-actor-output",
+      "query": "Get query and markdown fields from dataset UvsU",
+      "expectedTools": ["get-actor-output"]
+    },
+    {
+      "id": "fetch-apify-docs-edge-1",
+      "category": "fetch-apify-docs",
+      "query": "Get content from: https://docs.apify.com/nonexistent-page",
+      "expectedTools": ["fetch-apify-docs"]
+    },
+    {
+      "id": "misleading-query-1",
+      "category": "misleading",
+      "query": "What's the weather like today?",
+      "expectedTools": ["search-actors"]
+    },
+    {
+      "id": "misleading-query-2",
+      "category": "misleading",
+      "query": "How do I scrape Instagram without using Apify?",
+      "expectedTools": ["search-actors"]
+    },
+    {
+      "id": "misleading-query-3",
+      "category": "misleading",
+      "query": "I need to build my own scraper from scratch",
+      "expectedTools": ["search-apify-docs"]
+    },
+    {
+      "id": "ambiguous-query-1",
+      "category": "ambiguous",
+      "query": "Instagram",
+      "expectedTools": ["search-actors"]
+    },
+    {
+      "id": "ambiguous-query-3",
+      "category": "ambiguous",
+      "query": "documentation",
+      "expectedTools": ["search-apify-docs"]
+    },
+    {
+      "id": "tool-selection-confusion-1",
+      "category": "tool-selection",
+      "query": "Find posts about AI on Instagram",
+      "expectedTools": ["search-actors"]
+    },
+    {
+      "id": "tool-selection-confusion-2",
+      "category": "tool-selection",
+      "query": "Search for AI articles on tech blogs",
+      "expectedTools": ["apify-slash-rag-web-browser"]
+    },
+    {
+      "id": "tool-selection-confusion-3",
+      "category": "tool-selection",
+      "query": "Get the latest weather forecast for New York",
+      "expectedTools": ["apify-slash-rag-web-browser"]
+    },
+    {
+      "id": "weather-mcp-search-then-call-1",
+      "category": "flow",
+      "query": "Now, use the mcp to check the weather in Prague, Czechia?",
+      "expectedTools": ["call-actor"],
+      "context": [
+          { "role": "user", "content": "Search for weather MCP server" },
+          { "role": "assistant", "content": "I'll help you to do that" },
+          { "role": "tool_use", "tool": "search-actors", "input": {"search": "weather mcp", "limit": 5} },
+          { "role": "tool_result", "tool_use_id": 12, "content": "Tool 'search-actors' successful, Actor found: jiri.spilka/weather-mcp-server" }
+      ]
     }
   ]
 }
diff --git a/notebooks/evaluation_2025.ipynb b/notebooks/evaluation_2025.ipynb