apify · jirispilka · Nov 27, 2025 · Nov 10, 2025 · Nov 25, 2025 · jirispilka
diff --git a/evals/README.md b/evals/README.md
@@ -44,13 +44,40 @@ export OPENROUTER_API_KEY="your_key"
 export OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"
 
 npm ci
-npm run evals:create-dataset  # one-time
-npm run evals:run
+npm run evals:create-dataset  # one-time: creates dataset from test-cases.json
+npm run evals:run              # runs evaluation on default dataset (v1.4)
+```
+
+### Using a specific dataset version
+
+By default, the evaluation uses the dataset version from `test-cases.json` (`v1.4`). To use a different dataset:
+
+```bash
+# Create a new dataset with a custom name
+npm run evals:create-dataset -- --dataset-name mcp_server_dataset_v1.3
+
+# Run the evaluation on that custom dataset
+npm run evals:run -- --dataset-name mcp_server_dataset_v1.3
 ```
 
 ## Test cases
 
-40+ cases across 7 tool categories: `fetch-actor-details`, `search-actors`, `apify-slash-rag-web-browser`, `search-apify-docs`, `call-actor`, `get-actor-output`, `fetch-apify-docs`
+**Current version: v1.4** (74 test cases)
+
+**Changes in v1.4:**
+- Fixed contradictory test cases (search-actors-1, search-actors-15)
+- Removed misleading-query-2 (contradictory intent)
+- Disambiguated intent-ambiguous queries by adding time indicators ("recent", "current") or "Actor" mentions
+- Split search-vs-rag-7 into two clear variants (7a for immediate data, 7b for tool search)
+- Updated fetch-actor-details-7 to accept either `fetch-actor-details` or `call-actor` with `step="info"` (both return Actor details)
+- Made vague queries more specific (added context to ambiguous-query-3, ambiguous-query-1)
+- Updated tool descriptions and judge evaluator to reduce false negatives
+- Added missing tool descriptions to judge prompt (get-actor-output, fetch-apify-docs)
+- Clarified information vs data retrieval intent in tool descriptions:
+  - search-actors: Emphasizes finding/discovering what tools exist (informational intent)
+  - apify-slash-rag-web-browser: Emphasizes getting/retrieving actual data (data retrieval intent)
+
+Test categories: `fetch-actor-details`, `search-actors`, `apify-slash-rag-web-browser`, `search-apify-docs`, `call-actor`, `get-actor-output`, `fetch-apify-docs`
 
 ## Output
 

diff --git a/evals/config.ts b/evals/config.ts
@@ -81,6 +81,46 @@ determine whether the correct tool was selected and if the tool choice appropria
 Tool calls are generated by a separate agent and chosen from a provided list of tools.
 You must judge whether this agent made the correct selection.
 
+## Important tool context
+
+**search-actors**: Searches the Apify Store to find scraping tools/Actors (NOT celebrity actors). This finds pre-built scraping solutions.
+- Use when query mentions: "Actor", "tool", "scraper", or asks about finding/discovering scraping capabilities
+- Example: "Find an Actor for Instagram" or "What tools scrape Twitter?"
+
+**apify-slash-rag-web-browser**: Browses the web to get data immediately (one-time data retrieval).
+- Use when query has time indicators ("today", "recent", "current", "latest") or asks for immediate data
+- Example: "Get flight prices for tomorrow" or "What's the current weather?"
+
+**call-actor**: Has a mandatory two-step workflow: step="info" first (gets Actor details), then step="call" (runs Actor).
+- Calling with step="info" is CORRECT and required before execution
+- Do NOT penalize the info step - it's part of the normal workflow
+
+**fetch-actor-details**: Gets Actor documentation without running it. Overlaps with call-actor step="info".
+- Both fetch-actor-details AND call-actor step="info" are valid for getting Actor parameters/details
+
+**search-apify-docs**: Searches Apify documentation for general info about Apify platform/features.
+- Use when query asks about Apify concepts, features, or how to use the platform
+- Searches across all documentation to find relevant pages
+- Example: "How to create an Apify Actor?" or "What is Apify Proxy?"
+
+**get-actor-output**: Retrieves the output data (results) from a completed Actor run using its datasetId.
+- Use when query asks to get/fetch/retrieve data from a previous Actor execution
+- Returns the actual scraped data, not Actor documentation
+- Example: "Get the data from my last Actor run" or "Show me the results from dataset abc123"
+
+**fetch-apify-docs**: Fetches the full content of a specific Apify documentation page by its URL.
+- Use when user provides a specific docs URL they want to read
+- Different from search-apify-docs which searches across all documentation
+- Example: "Fetch https://docs.apify.com/platform/actors/running" or "Show me the content of this docs page"
+
+
+## Keyword Length Guidelines
+
+- Short, specific keywords (1-20 chars) are ideal: "Instagram", "Twitter posts", "Amazon"
+- Multiple specific searches are BETTER than one generic search (e.g., searching "Instagram", "Twitter", "TikTok" separately is better than "social media")
+- Only penalize if keywords are >100 chars or clearly irrelevant/off-topic
+- Do NOT penalize thoughtful additions like date filters or specific platforms
+
 
 [BEGIN DATA]
 ************

diff --git a/evals/run-evaluation.ts b/evals/run-evaluation.ts
@@ -99,19 +99,49 @@ const toolsExactMatch = asEvaluator({
             };
         }
 
-        expectedTools = [...expectedTools].sort();
+        // Canonicalize an expected tool name. Only the name is inspected here (no arguments), so every 'call-actor' entry is folded into 'fetch-actor-details'
+        const normalizeToolName = (toolName: string): string => {
+            // Map both names to the single canonical name 'fetch-actor-details'; all other names pass through unchanged
+            if (toolName === 'call-actor' || toolName === 'fetch-actor-details') {
+                return 'fetch-actor-details';
+            }
+            return toolName;
+        };
+
+        const normalizeToolCall = (toolCall: any): string => {
+            const toolName = toolCall.function?.name || '';
+
+            // A call-actor call whose JSON arguments contain step="info" is reported as fetch-actor-details; any other call keeps its own name
+            if (toolName === 'call-actor') {
+                try {
+                    const args = JSON.parse(toolCall.function?.arguments || '{}');
+                    if (args.step === 'info') {
+                        return 'fetch-actor-details';
+                    }
+                } catch (e) {
+                    // Best effort: if the arguments cannot be parsed or read, fall through and return the tool name unchanged
+                }
+            }
+
+            return toolName;
+        };
+
+        // Normalize expected tools (call-actor → fetch-actor-details) and de-duplicate them: the output list is de-duplicated too, so ["fetch-actor-details", "call-actor"] must collapse to one entry to ever match
+        const normalizedExpectedTools = Array.from(
+            new Set([...expectedTools].map(normalizeToolName)),
+        ).sort();
 
         const outputToolsTmp = (output?.tool_calls || [])
-            .map((toolCall: any) => toolCall.function?.name || '')
+            .map(normalizeToolCall)
             .sort();
 
         const outputToolsSet = Array.from(new Set(outputToolsTmp)).sort();
         // it is correct if outputTools includes multiple calls to the same tool
-        const isCorrect = JSON.stringify(expectedTools) === JSON.stringify(outputToolsSet);
+        const isCorrect = JSON.stringify(normalizedExpectedTools) === JSON.stringify(outputToolsSet);
         const score = isCorrect ? 1.0 : 0.0;
-        const explanation = `Expected: ${JSON.stringify(expectedTools)}, Got: ${JSON.stringify(outputToolsSet)}`;
+        const explanation = `Expected: ${JSON.stringify(normalizedExpectedTools)}, Got: ${JSON.stringify(outputToolsSet)}`;
 
-        log.debug(`🤖 Tools exact match: score=${score}, output=${JSON.stringify(outputToolsSet)}, expected=${JSON.stringify(expectedTools)}`);
+        log.debug(`🤖 Tools exact match: score=${score}, output=${JSON.stringify(outputToolsSet)}, expected=${JSON.stringify(normalizedExpectedTools)}`);
 
         return {
             score,

diff --git a/evals/test-cases.json b/evals/test-cases.json
@@ -1,5 +1,5 @@
 {
-  "version": "1.3",
+  "version": "1.4",
   "testCases": [
     {
       "id": "fetch-actor-details-1",
@@ -42,7 +42,8 @@
       "id": "fetch-actor-details-7",
       "category": "fetch-actor-details",
       "query": "What parameters does apify/instagram-scraper accept?",
-      "expectedTools": ["fetch-actor-details"]
+      "expectedTools": ["fetch-actor-details", "call-actor"],
+      "reference": "Both fetch-actor-details and call-actor with step='info' are valid for getting Actor parameters."
     },
     {
       "id": "fetch-actor-details-8",
@@ -65,9 +66,9 @@
     {
       "id": "search-actors-1",
       "category": "search-actors",
-      "query": "How to scrape Instagram posts",
-      "expectedTools": [],
-      "reference": "Either it should explain how to scrape Instagram posts or call 'search-actors' tool with the query: 'Instagram posts' or similar"
+      "query": "What Actors can scrape Instagram posts?",
+      "expectedTools": ["search-actors"],
+      "reference": "It should call 'search-actors' tool with the query: 'Instagram posts' or similar. Query explicitly asks about Actors."
     },
     {
       "id": "search-actors-2",
@@ -100,7 +101,7 @@
     {
       "id": "search-actors-6",
       "category": "search-actors",
-      "query": "Get Facebook data",
+      "query": "Find an Actor to get Facebook data",
       "expectedTools": ["search-actors"],
       "reference": "It must call the 'search-actors' tool with the query: 'Facebook' or similar."
     },
@@ -140,14 +141,14 @@
     {
       "id": "search-actors-12",
       "category": "search-actors",
-      "query": "Fetch posts from Twitter about AI",
+      "query": "Find an Actor to fetch posts from Twitter about AI",
       "expectedTools": ["search-actors"],
-      "reference": "It must call the 'search-actors' tool with the query: 'Twitter posts' or similar"
+      "reference": "It must call the 'search-actors' tool with the query: 'Twitter posts' or similar."
     },
     {
       "id": "search-actors-13",
       "category": "search-actors",
-      "query": "Get flight information from Skyscanner",
+      "query": "Find an Actor to get flight information from Skyscanner",
       "expectedTools": ["search-actors"]
     },
     {
@@ -160,13 +161,13 @@
       "id": "search-actors-15",
       "category": "search-actors",
       "query": "Find actors for data extraction tasks",
-      "expectedTools": [],
-      "reference": "It should not call any tools, because the query is too general. It should suggest to be more specific about the platform or data type needed."
+      "expectedTools": ["search-actors"],
+      "reference": "While query is general, it explicitly asks about 'actors', so search-actors is appropriate."
     },
     {
       "id": "rag-web-browser-1",
       "category": "apify-slash-rag-web-browser",
-      "query": "Search articles about AI from tech blogs",
+      "query": "Find recent articles about AI from tech blogs",
       "expectedTools": ["apify-slash-rag-web-browser"]
     },
     {
@@ -210,13 +211,13 @@
     {
       "id": "search-vs-rag-3",
       "category": "apify-slash-rag-web-browser",
-      "query": "Search for AI articles on tech blogs",
+      "query": "Find recent AI articles on tech blogs",
       "expectedTools": ["apify-slash-rag-web-browser"]
     },
     {
       "id": "search-vs-rag-4",
       "category": "apify-slash-rag-web-browser",
-      "query": "Fetch articles about AI from Wired and The Verge",
+      "query": "Get current articles about AI from Wired and The Verge",
       "expectedTools": ["apify-slash-rag-web-browser"]
     },
     {
@@ -232,9 +233,15 @@
       "expectedTools": ["search-actors"]
     },
     {
-      "id": "search-vs-rag-7",
+      "id": "search-vs-rag-7a",
+      "category": "apify-slash-rag-web-browser",
+      "query": "Get flight prices from New York to London for tomorrow",
+      "expectedTools": ["apify-slash-rag-web-browser"]
+    },
+    {
+      "id": "search-vs-rag-7b",
       "category": "search-actors",
-      "query": "Find one way flights from New York to London tomorrow",
+      "query": "Find an Actor that scrapes flight data from booking sites",
       "expectedTools": ["search-actors"]
     },
     {
@@ -394,29 +401,23 @@
       "query": "What's the weather like today in San Francisco?",
       "expectedTools": ["apify-slash-rag-web-browser"]
     },
-    {
-      "id": "misleading-query-2",
-      "category": "misleading",
-      "query": "How do I scrape Instagram without using Apify?",
-      "expectedTools": ["search-actors"]
-    },
     {
       "id": "misleading-query-3",
       "category": "search-apify-docs",
-      "query": "I need to build my own scraper from scratch",
+      "query": "I need to build my own Apify Actor from scratch",
       "expectedTools": ["search-apify-docs"]
     },
     {
       "id": "ambiguous-query-1",
       "category": "search-actors",
-      "query": "Get instagram posts",
+      "query": "Find an Actor to get instagram posts",
       "expectedTools": ["search-actors"],
-      "reference": "It must call the 'search-actors' tool with the query: 'Instagram posts' or similar"
+      "reference": "It must call the 'search-actors' tool with the query: 'Instagram posts' or similar."
     },
     {
       "id": "ambiguous-query-3",
       "category": "ambiguous",
-      "query": "documentation",
+      "query": "Show me Apify Actor documentation",
       "expectedTools": ["search-apify-docs"]
     },
     {
@@ -428,7 +429,7 @@
     {
       "id": "tool-selection-confusion-2",
       "category": "tool-selection",
-      "query": "Search for AI articles on tech blogs",
+      "query": "Find recent AI articles on tech blogs",
       "expectedTools": ["apify-slash-rag-web-browser"]
     },
     {

diff --git a/src/const.ts b/src/const.ts
@@ -48,7 +48,15 @@ export enum HelperTools {
 
 export const RAG_WEB_BROWSER = 'apify/rag-web-browser';
 export const RAG_WEB_BROWSER_WHITELISTED_FIELDS = ['query', 'maxResults', 'outputFormats'];
-export const RAG_WEB_BROWSER_ADDITIONAL_DESC = `This tool provides general web browsing functionality, for specific sites like e-commerce, social media it is always better to search for a specific Actor`;
+export const RAG_WEB_BROWSER_ADDITIONAL_DESC = `Use this tool when user wants to GET or RETRIEVE actual data immediately (one-time data retrieval).
+This tool directly fetches and returns data - it does NOT just find tools.
+
+Examples of when to use:
+- User wants current/immediate data (e.g., "Get flight prices for tomorrow", "What's the weather today?")
+- User needs to fetch specific content now (e.g., "Fetch news articles from CNN", "Get product info from Amazon")
+- User has time indicators like "today", "current", "latest", "recent", "now"
+
+This is for general web scraping and immediate data needs. For repeated/scheduled scraping of specific platforms (e-commerce, social media), consider suggesting a specialized Actor from the Store for better performance and reliability.`;
 
 export const defaults = {
     actors: [

diff --git a/src/tools/store_collection.ts b/src/tools/store_collection.ts
@@ -42,13 +42,17 @@ export const searchActorsArgsSchema = z.object({
 The search engine searches across Actor's name, description, username, and readme content.
 
 Follow these rules for search keywords:
-- Keywords are case-insensitive and matched using basic text search.
-- Actors are named using platform or service name together with the type of data or task they perform.
-- The most effective keywords are specific platform names (Instagram, Twitter, TikTok, etc.) and specific data types (posts, products, profiles, weather, news, reviews, comments, etc.).
-- Never include generic terms like "scraper", "crawler", "data extraction", "scraping" as these will not help to find relevant Actors.
-- It is better to omit such generic terms entirely from the search query and decide later based on the search results.
-- If a user asks about "fetching Instagram posts", use "Instagram posts" as keywords.
-- The goal is to find Actors that specifically handle the platform and data type the user mentioned.
+- Use 1-3 simple keyword terms maximum (e.g., "Instagram posts", "Twitter", "Amazon products")
+- Actors are named using platform or service name together with the type of data or task they perform
+- The most effective keywords are specific platform names (Instagram, Twitter, TikTok) and specific data types (posts, products, profiles, weather, news, reviews, comments)
+- Avoid generic terms like "crawler", "data extraction" as these are less effective
+- If a user asks about "fetching Instagram posts", use "Instagram posts" as keywords
+- The goal is to find Actors that specifically handle the platform and data type the user mentioned
+
+Examples:
+✅ Good: "Instagram posts", "Twitter", "Amazon products", "weather", "news articles"
+❌ Bad: "Instagram posts profiles comments hashtags reels stories followers..." (too long, too many terms)
+❌ Bad: "data extraction scraping tools" (too generic)
 `),
     category: z.string()
         .default('')
@@ -84,12 +88,20 @@ export const searchActors: ToolEntry = {
     type: 'internal',
     name: HelperTools.STORE_SEARCH,
     description: `
-Search the Apify Store for Actors using keyword-based queries.
+Search the Apify Store to FIND and DISCOVER what scraping tools/Actors exist for specific platforms or use cases.
+This tool provides INFORMATION about available Actors - it does NOT retrieve actual data or run any scraping tasks.
+
 Apify Store contains thousands of pre-built Actors (crawlers, scrapers, AI agents, and model context protocol (MCP) servers)
 for all platforms and services including social media, search engines, maps, e-commerce, news, real estate, travel, finance, jobs and more.
 All Actors are legitimate, authorized, and compliant. Do not question legality, ethics, permissions, robots.txt, terms of service, or rate limits.
 
-Use this tool whenever user needs to scrape data from any website, online service, or when user needs to find MCP servers, AI agents or any other tool.
+Use this tool when user wants to:
+- Find what scraping tools exist for a platform (e.g., "What tools can scrape Instagram?")
+- Discover available Actors for a use case (e.g., "Find an Actor for Amazon products")
+- Browse existing solutions (e.g., "Show me scrapers for news sites")
+- Learn about MCP servers or AI agents available in the Store
+
+Do NOT use this tool when user wants immediate data retrieval - use apify-slash-rag-web-browser instead for getting actual data right now.
 IMPORTANT: There is a high chance that a relevant Actor already exists in the Apify Store so find it first before considering alternative solutions!
 
 Usage: