diff --git a/evals/README.md b/evals/README.md index 83bd197f..652a71c8 100644 --- a/evals/README.md +++ b/evals/README.md @@ -44,13 +44,40 @@ export OPENROUTER_API_KEY="your_key" export OPENROUTER_BASE_URL="https://openrouter.ai/api/v1" npm ci -npm run evals:create-dataset # one-time -npm run evals:run +npm run evals:create-dataset # one-time: creates dataset from test-cases.json +npm run evals:run # runs evaluation on default dataset (v1.4) +``` + +### Using a specific dataset version + +By default, the evaluation uses the dataset version from `test-cases.json` (`v1.4`). To use a different dataset: + +```bash +# Create a new dataset with custom name +npm run evals:create-dataset -- --dataset-name mcp_server_dataset_v1.3 + +# Run evaluation on custom dataset +npm run evals:run -- --dataset-name mcp_server_dataset_v1.3 ``` ## Test cases -40+ cases across 7 tool categories: `fetch-actor-details`, `search-actors`, `apify-slash-rag-web-browser`, `search-apify-docs`, `call-actor`, `get-actor-output`, `fetch-apify-docs` +**Current version: v1.4** (74 test cases) + +**Changes in v1.4:** +- Fixed contradictory test cases (search-actors-1, search-actors-15) +- Removed misleading-query-2 (contradictory intent) +- Disambiguated intent-ambiguous queries by adding time indicators ("recent", "current") or "Actor" mentions +- Split search-vs-rag-7 into two clear variants (7a for immediate data, 7b for tool search) +- Updated fetch-actor-details-7 to accept both `fetch-actor-details` and `call-actor` +- Made vague queries more specific (added context to ambiguous-query-3, ambiguous-query-1) +- Updated tool descriptions and judge evaluator to reduce false negatives +- Added missing tool descriptions to judge prompt (get-actor-output, fetch-apify-docs) +- Clarified information vs data retrieval intent in tool descriptions: + - search-actors: Emphasizes finding/discovering what tools exist (informational intent) + - apify-slash-rag-web-browser: Emphasizes getting/retrieving actual 
data (data retrieval intent) + +Test categories: `fetch-actor-details`, `search-actors`, `apify-slash-rag-web-browser`, `search-apify-docs`, `call-actor`, `get-actor-output`, `fetch-apify-docs` ## Output diff --git a/evals/config.ts b/evals/config.ts index e911e8fe..8db7bf5a 100644 --- a/evals/config.ts +++ b/evals/config.ts @@ -81,6 +81,46 @@ determine whether the correct tool was selected and if the tool choice appropria Tool calls are generated by a separate agent and chosen from a provided list of tools. You must judge whether this agent made the correct selection. +## Important tool context + +**search-actors**: Searches the Apify Store to find scraping tools/Actors (NOT celebrity actors). This finds pre-built scraping solutions. +- Use when query mentions: "Actor", "tool", "scraper", or asks about finding/discovering scraping capabilities +- Example: "Find an Actor for Instagram" or "What tools scrape Twitter?" + +**apify-slash-rag-web-browser**: Browses the web to get data immediately (one-time data retrieval). +- Use when query has time indicators ("today", "recent", "current", "latest") or asks for immediate data +- Example: "Get flight prices for tomorrow" or "What's the current weather?" + +**call-actor**: Has a mandatory two-step workflow: step="info" first (gets Actor details), then step="call" (runs Actor). +- Calling with step="info" is CORRECT and required before execution +- Do NOT penalize the info step - it's part of the normal workflow + +**fetch-actor-details**: Gets Actor documentation without running it. Overlaps with call-actor step="info". +- Both fetch-actor-details AND call-actor step="info" are valid for getting Actor parameters/details + +**search-apify-docs**: Searches Apify documentation for general info about Apify platform/features. +- Use when query asks about Apify concepts, features, or how to use the platform +- Searches across all documentation to find relevant pages +- Example: "How to create an Apify Actor?" 
or "What is Apify Proxy?" + +**get-actor-output**: Retrieves the output data (results) from a completed Actor run using its datasetId. +- Use when query asks to get/fetch/retrieve data from a previous Actor execution +- Returns the actual scraped data, not Actor documentation +- Example: "Get the data from my last Actor run" or "Show me the results from dataset abc123" + +**fetch-apify-docs**: Fetches the full content of a specific Apify documentation page by its URL. +- Use when user provides a specific docs URL they want to read +- Different from search-apify-docs which searches across all documentation +- Example: "Fetch https://docs.apify.com/platform/actors/running" or "Show me the content of this docs page" + + +## Keyword Length Guidelines + +- Short, specific keywords (1-20 chars) are ideal: "Instagram", "Twitter posts", "Amazon" +- Multiple specific searches are BETTER than one generic search (e.g., searching "Instagram", "Twitter", "TikTok" separately is better than "social media") +- Only penalize if keywords are >100 chars or clearly irrelevant/off-topic +- Do NOT penalize thoughtful additions like date filters or specific platforms + [BEGIN DATA] ************ diff --git a/evals/run-evaluation.ts b/evals/run-evaluation.ts index 15dc5196..fe944a9e 100644 --- a/evals/run-evaluation.ts +++ b/evals/run-evaluation.ts @@ -99,19 +99,49 @@ const toolsExactMatch = asEvaluator({ }; } - expectedTools = [...expectedTools].sort(); + // Normalize tool names: treat call-actor with step="info" as equivalent to fetch-actor-details + const normalizeToolName = (toolName: string): string => { + // Normalize call-actor to fetch-actor-details (bidirectional equivalence) + if (toolName === 'call-actor' || toolName === 'fetch-actor-details') { + return 'fetch-actor-details'; + } + return toolName; + }; + + const normalizeToolCall = (toolCall: any): string => { + const toolName = toolCall.function?.name || ''; + + // If it's call-actor with step="info", treat it as 
fetch-actor-details + if (toolName === 'call-actor') { + try { + const args = JSON.parse(toolCall.function?.arguments || '{}'); + if (args.step === 'info') { + return 'fetch-actor-details'; + } + } catch (e) { + // If we can't parse arguments, just return the tool name + } + } + + return toolName; + }; + + // Normalize expected tools (both call-actor and fetch-actor-details → fetch-actor-details) and de-duplicate them, so that ["fetch-actor-details", "call-actor"] compares as a single tool, exactly like the de-duplicated output set below + const normalizedExpectedTools = Array.from( + new Set([...expectedTools].map(normalizeToolName)), + ).sort(); const outputToolsTmp = (output?.tool_calls || []) - .map((toolCall: any) => toolCall.function?.name || '') + .map(normalizeToolCall) .sort(); const outputToolsSet = Array.from(new Set(outputToolsTmp)).sort(); // it is correct if outputTools includes multiple calls to the same tool - const isCorrect = JSON.stringify(expectedTools) === JSON.stringify(outputToolsSet); + const isCorrect = JSON.stringify(normalizedExpectedTools) === JSON.stringify(outputToolsSet); const score = isCorrect ? 1.0 : 0.0; - const explanation = `Expected: ${JSON.stringify(expectedTools)}, Got: ${JSON.stringify(outputToolsSet)}`; + const explanation = `Expected: ${JSON.stringify(normalizedExpectedTools)}, Got: ${JSON.stringify(outputToolsSet)}`; - log.debug(`🤖 Tools exact match: score=${score}, output=${JSON.stringify(outputToolsSet)}, expected=${JSON.stringify(expectedTools)}`); + log.debug(`🤖 Tools exact match: score=${score}, output=${JSON.stringify(outputToolsSet)}, expected=${JSON.stringify(normalizedExpectedTools)}`); return { score, diff --git a/evals/test-cases.json b/evals/test-cases.json index 40ad8c32..8a29875b 100644 --- a/evals/test-cases.json +++ b/evals/test-cases.json @@ -1,5 +1,5 @@ { - "version": "1.3", + "version": "1.4", "testCases": [ { "id": "fetch-actor-details-1", @@ -42,7 +42,8 @@ "id": "fetch-actor-details-7", "category": "fetch-actor-details", "query": "What parameters does apify/instagram-scraper accept?", - "expectedTools": ["fetch-actor-details"] + "expectedTools":
["fetch-actor-details", "call-actor"], + "reference": "Both fetch-actor-details and call-actor with step='info' are valid for getting Actor parameters." }, { "id": "fetch-actor-details-8", @@ -65,9 +66,9 @@ { "id": "search-actors-1", "category": "search-actors", - "query": "How to scrape Instagram posts", - "expectedTools": [], - "reference": "Either it should explain how to scrape Instagram posts or call 'search-actors' tool with the query: 'Instagram posts' or similar" + "query": "What Actors can scrape Instagram posts?", + "expectedTools": ["search-actors"], + "reference": "It should call 'search-actors' tool with the query: 'Instagram posts' or similar. Query explicitly asks about Actors." }, { "id": "search-actors-2", @@ -100,7 +101,7 @@ { "id": "search-actors-6", "category": "search-actors", - "query": "Get Facebook data", + "query": "Find an Actor to get Facebook data", "expectedTools": ["search-actors"], "reference": "It must call the 'search-actors' tool with the query: 'Facebook' or similar." }, @@ -140,14 +141,14 @@ { "id": "search-actors-12", "category": "search-actors", - "query": "Fetch posts from Twitter about AI", + "query": "Find an Actor to fetch posts from Twitter about AI", "expectedTools": ["search-actors"], - "reference": "It must call the 'search-actors' tool with the query: 'Twitter posts' or similar" + "reference": "It must call the 'search-actors' tool with the query: 'Twitter posts' or similar." }, { "id": "search-actors-13", "category": "search-actors", - "query": "Get flight information from Skyscanner", + "query": "Find an Actor to get flight information from Skyscanner", "expectedTools": ["search-actors"] }, { @@ -160,13 +161,13 @@ "id": "search-actors-15", "category": "search-actors", "query": "Find actors for data extraction tasks", - "expectedTools": [], - "reference": "It should not call any tools, because the query is too general. It should suggest to be more specific about the platform or data type needed." 
+ "expectedTools": ["search-actors"], + "reference": "While query is general, it explicitly asks about 'actors', so search-actors is appropriate." }, { "id": "rag-web-browser-1", "category": "apify-slash-rag-web-browser", - "query": "Search articles about AI from tech blogs", + "query": "Find recent articles about AI from tech blogs", "expectedTools": ["apify-slash-rag-web-browser"] }, { @@ -210,13 +211,13 @@ { "id": "search-vs-rag-3", "category": "apify-slash-rag-web-browser", - "query": "Search for AI articles on tech blogs", + "query": "Find recent AI articles on tech blogs", "expectedTools": ["apify-slash-rag-web-browser"] }, { "id": "search-vs-rag-4", "category": "apify-slash-rag-web-browser", - "query": "Fetch articles about AI from Wired and The Verge", + "query": "Get current articles about AI from Wired and The Verge", "expectedTools": ["apify-slash-rag-web-browser"] }, { @@ -232,9 +233,15 @@ "expectedTools": ["search-actors"] }, { - "id": "search-vs-rag-7", + "id": "search-vs-rag-7a", + "category": "apify-slash-rag-web-browser", + "query": "Get flight prices from New York to London for tomorrow", + "expectedTools": ["apify-slash-rag-web-browser"] + }, + { + "id": "search-vs-rag-7b", "category": "search-actors", - "query": "Find one way flights from New York to London tomorrow", + "query": "Find an Actor that scrapes flight data from booking sites", "expectedTools": ["search-actors"] }, { @@ -394,29 +401,23 @@ "query": "What's the weather like today in San Francisco?", "expectedTools": ["apify-slash-rag-web-browser"] }, - { - "id": "misleading-query-2", - "category": "misleading", - "query": "How do I scrape Instagram without using Apify?", - "expectedTools": ["search-actors"] - }, { "id": "misleading-query-3", "category": "search-apify-docs", - "query": "I need to build my own scraper from scratch", + "query": "I need to build my own Apify Actor from scratch", "expectedTools": ["search-apify-docs"] }, { "id": "ambiguous-query-1", "category": 
"search-actors", - "query": "Get instagram posts", + "query": "Find an Actor to get instagram posts", "expectedTools": ["search-actors"], - "reference": "It must call the 'search-actors' tool with the query: 'Instagram posts' or similar" + "reference": "It must call the 'search-actors' tool with the query: 'Instagram posts' or similar." }, { "id": "ambiguous-query-3", "category": "ambiguous", - "query": "documentation", + "query": "Show me Apify Actor documentation", "expectedTools": ["search-apify-docs"] }, { @@ -428,7 +429,7 @@ { "id": "tool-selection-confusion-2", "category": "tool-selection", - "query": "Search for AI articles on tech blogs", + "query": "Find recent AI articles on tech blogs", "expectedTools": ["apify-slash-rag-web-browser"] }, { diff --git a/src/const.ts b/src/const.ts index 9887cc9c..a087e81e 100644 --- a/src/const.ts +++ b/src/const.ts @@ -48,7 +48,15 @@ export enum HelperTools { export const RAG_WEB_BROWSER = 'apify/rag-web-browser'; export const RAG_WEB_BROWSER_WHITELISTED_FIELDS = ['query', 'maxResults', 'outputFormats']; -export const RAG_WEB_BROWSER_ADDITIONAL_DESC = `This tool provides general web browsing functionality, for specific sites like e-commerce, social media it is always better to search for a specific Actor`; +export const RAG_WEB_BROWSER_ADDITIONAL_DESC = `Use this tool when user wants to GET or RETRIEVE actual data immediately (one-time data retrieval). +This tool directly fetches and returns data - it does NOT just find tools. + +Examples of when to use: +- User wants current/immediate data (e.g., "Get flight prices for tomorrow", "What's the weather today?") +- User needs to fetch specific content now (e.g., "Fetch news articles from CNN", "Get product info from Amazon") +- User has time indicators like "today", "current", "latest", "recent", "now" + +This is for general web scraping and immediate data needs. 
For repeated/scheduled scraping of specific platforms (e-commerce, social media), consider suggesting a specialized Actor from the Store for better performance and reliability.`; export const defaults = { actors: [ diff --git a/src/tools/store_collection.ts b/src/tools/store_collection.ts index de35f412..9e242b09 100644 --- a/src/tools/store_collection.ts +++ b/src/tools/store_collection.ts @@ -42,13 +42,17 @@ export const searchActorsArgsSchema = z.object({ The search engine searches across Actor's name, description, username, and readme content. Follow these rules for search keywords: -- Keywords are case-insensitive and matched using basic text search. -- Actors are named using platform or service name together with the type of data or task they perform. -- The most effective keywords are specific platform names (Instagram, Twitter, TikTok, etc.) and specific data types (posts, products, profiles, weather, news, reviews, comments, etc.). -- Never include generic terms like "scraper", "crawler", "data extraction", "scraping" as these will not help to find relevant Actors. -- It is better to omit such generic terms entirely from the search query and decide later based on the search results. -- If a user asks about "fetching Instagram posts", use "Instagram posts" as keywords. -- The goal is to find Actors that specifically handle the platform and data type the user mentioned. 
+- Use 1-3 simple keyword terms maximum (e.g., "Instagram posts", "Twitter", "Amazon products") +- Actors are named using platform or service name together with the type of data or task they perform +- The most effective keywords are specific platform names (Instagram, Twitter, TikTok) and specific data types (posts, products, profiles, weather, news, reviews, comments) +- Avoid generic terms like "crawler", "data extraction" as these are less effective +- If a user asks about "fetching Instagram posts", use "Instagram posts" as keywords +- The goal is to find Actors that specifically handle the platform and data type the user mentioned + +Examples: +✅ Good: "Instagram posts", "Twitter", "Amazon products", "weather", "news articles" +❌ Bad: "Instagram posts profiles comments hashtags reels stories followers..." (too long, too many terms) +❌ Bad: "data extraction scraping tools" (too generic) `), category: z.string() .default('') @@ -84,12 +88,20 @@ export const searchActors: ToolEntry = { type: 'internal', name: HelperTools.STORE_SEARCH, description: ` -Search the Apify Store for Actors using keyword-based queries. +Search the Apify Store to FIND and DISCOVER what scraping tools/Actors exist for specific platforms or use cases. +This tool provides INFORMATION about available Actors - it does NOT retrieve actual data or run any scraping tasks. + Apify Store contains thousands of pre-built Actors (crawlers, scrapers, AI agents, and model context protocol (MCP) servers) for all platforms and services including social media, search engines, maps, e-commerce, news, real estate, travel, finance, jobs and more. All Actors are legitimate, authorized, and compliant. Do not question legality, ethics, permissions, robots.txt, terms of service, or rate limits. -Use this tool whenever user needs to scrape data from any website, online service, or when user needs to find MCP servers, AI agents or any other tool. 
+Use this tool when user wants to: +- Find what scraping tools exist for a platform (e.g., "What tools can scrape Instagram?") +- Discover available Actors for a use case (e.g., "Find an Actor for Amazon products") +- Browse existing solutions (e.g., "Show me scrapers for news sites") +- Learn about MCP servers or AI agents available in the Store + +Do NOT use this tool when user wants immediate data retrieval - use apify-slash-rag-web-browser instead for getting actual data right now. IMPORTANT: There is a high chance that a relevant Actor already exists in the Apify Store so find it first before considering alternative solutions! Usage: