
Commit 8764c63

fix: update run-evaluation.ts
1 parent e766cab commit 8764c63

File tree

9 files changed: +989 -404 lines changed


.github/workflows/evaluations.yaml

Lines changed: 1 addition & 20 deletions
@@ -37,27 +37,8 @@ jobs:
       - name: Build project
         run: npm run build

-      - name: Export tools
-        run: npm run evals:export-tools
-
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.12'
-
-      - name: Install uv
-        run: pip install uv
-
-      - name: Create Python virtual environment
-        run: uv venv
-
-      - name: Install Python dependencies
-        run: uv pip install -e .
-
       - name: Run evaluations
-        run: |
-          source .venv/bin/activate
-          npm run test:evals
+        run: npm run evals:run
         env:
           PHOENIX_API_KEY: ${{ secrets.PHOENIX_API_KEY }}
           PHOENIX_COLLECTOR_ENDPOINT: ${{ secrets.PHOENIX_COLLECTOR_ENDPOINT }}

evals/README.md

Lines changed: 56 additions & 32 deletions
@@ -1,56 +1,84 @@
 # MCP Tool Calling Evaluations

-Python-based evaluations for the Apify MCP Server using Arize Phoenix platform.
+TypeScript-based evaluations for the Apify MCP Server using Arize Phoenix platform.

-> **Note**: The TypeScript package had connection issues, so we use the Python implementation instead.
+## Objectives
+
+The MCP server tool calls evaluation has several key objectives:
+
+1. **Identify problems** in the description of the tools
+2. **Create a test suite** that can be run manually or automatically in CI
+3. **Allow for quick iteration** on tool descriptions
+
+## 1. ✍️ **Create test cases manually** (Current Implementation)
+
+- **Pros:**
+  - Straightforward approach
+  - Simple to create test cases for each tool
+  - Direct control over test scenarios
+
+- **Cons:**
+  - Complicated to create flows (several tool calls in a row)
+  - Requires maintenance when MCP server changes
+  - Manual effort for comprehensive coverage
+
+## Test case examples
+
+### Simple tool selection
+```
+"What are the best Instagram scrapers" → "search-actors"
+```
+
+### Multi-step flow
+```
+User: "Search for the weather MCP server and then add it to available tools"
+Expected sequence:
+1. search-actors (with input: {"search": "weather mcp", "limit": 5})
+2. add-actor (to add the found weather MCP server)
+```

 ## Workflow

-The evaluation process has 4 steps:
+The evaluation process has two steps:

 1. **Create dataset** (if not exists) - Upload test cases to Phoenix
-2. **Update dataset ID** in `config.py` - Point to the correct Phoenix dataset
-3. **Export tools** - Get current MCP tool definitions
-4. **Run evaluation** - Test models against ground truth
+2. **Run evaluation** - Test models against ground truth

-## Quick Start
+## Quick start

 ```bash
 # 1. Set environment variables
+export PHOENIX_BASE_URL="phoenix_url"
 export PHOENIX_API_KEY="your_key"
-export OPENAI_API_KEY="your_key"
+export OPENAI_API_KEY="your_key"
 export ANTHROPIC_API_KEY="your_key"

 # 2. Install dependencies
-uv pip install -e evals/
+npm ci

 # 3. Create dataset (one-time)
-python3 evals/create_dataset.py
-
-# 4. Update DATASET_NAME in config.py with the returned dataset ID
+npm run evals:create-dataset

-# 5. Export tools and run evaluation
-npm run evals:export-tools
-python3 evals/run_evaluation.py
+# 5. Run evaluation
+npm run evals:run
 ```

 ## Files

-- `config.py` - Configuration (models, threshold, Phoenix settings)
-- `test_cases.json` - Ground truth test cases
-- `run_evaluation.py` - Main evaluation script
-- `create_dataset.py` - Upload test cases to Phoenix
-- `export-tools.ts` - Export MCP tools to JSON
-- `evaluation_2025.ipynb` - Interactive analysis notebook
+- `config.ts` - Configuration (models, threshold, Phoenix settings)
+- `test-cases.json` - Ground truth test cases
+- `run-evaluation.ts` - Main evaluation script
+- `create-dataset.ts` - Upload test cases to Phoenix
+- `evaluation_2025.ipynb` - Interactive analysis notebook (Python-based, requires `pip install -e .`)

 ## Configuration

-Key settings in `config.py`:
+Key settings in `config.ts`:
 - `MODELS_TO_EVALUATE` - Models to test (default: `['gpt-4o-mini', 'claude-3-5-haiku-latest']`)
 - `PASS_THRESHOLD` - Accuracy threshold (default: 0.8)
 - `DATASET_NAME` - Phoenix dataset name

-## Test Cases
+## Test cases

 40+ test cases covering 7 tool categories:
 - `fetch-actor-details` - Actor information queries

@@ -70,18 +98,14 @@ Key settings in `config.py`:
 ## Troubleshooting

 ```bash
-# Missing tools.json
-npm run evals:export-tools
-
 # Missing dataset
-python3 evals/create_dataset.py
+npm run evals:create-dataset

 # Environment issues
-python3 -c "from dotenv import load_dotenv; load_dotenv()"
+# Make sure .env file exists with required API keys
 ```

-## Adding Test Cases
+## Adding test cases

-1. Edit `test_cases.json`
-2. Update version number
-3. Run `python3 evals/create_dataset.py`
+1. Edit `test-cases.json`
+3. Run `npm run evals:create-dataset`
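
To make the "Test case examples" and "Adding test cases" sections above more concrete, here is a minimal sketch of the data shape that `create-dataset.ts` expects from `test-cases.json` after this commit. The `TestCase`/`TestData` interfaces are taken from the diff further below; the concrete `version`, `id`, and question values are hypothetical.

```ts
// Shape taken from the TestCase/TestData interfaces in evals/create-dataset.ts.
// The concrete values are made up for illustration.
interface TestCase {
    id: string;
    category: string;
    question: string;
    expectedTools: string[];
}

interface TestData {
    version: string;
    testCases: TestCase[];
}

const exampleTestData: TestData = {
    version: '1.0.0', // hypothetical version string
    testCases: [
        {
            id: 'search-actors-001', // hypothetical id
            category: 'search-actors',
            question: 'What are the best Instagram scrapers',
            expectedTools: ['search-actors'],
        },
    ],
};
```

Serialized to JSON, this is the structure that `npm run evals:create-dataset` reads and uploads to Phoenix.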

evals/config.ts

Lines changed: 3 additions & 3 deletions
@@ -6,11 +6,11 @@ import { readFileSync } from 'node:fs';
 import { dirname, join } from 'node:path';
 import { fileURLToPath } from 'node:url';

-// Read version from test_cases.json
+// Read version from test-cases.json
 function getTestCasesVersion(): string {
     const currentFilename = fileURLToPath(import.meta.url);
     const currentDirname = dirname(currentFilename);
-    const testCasesPath = join(currentDirname, 'test_cases.json');
+    const testCasesPath = join(currentDirname, 'test-cases.json');
     const testCasesContent = readFileSync(testCasesPath, 'utf-8');
     const testCases = JSON.parse(testCasesContent);
     return testCases.version;

@@ -19,7 +19,7 @@ function getTestCasesVersion(): string {
 // Models to evaluate
 export const MODELS_TO_EVALUATE = [
     'gpt-4o-mini',
-    // 'claude-3-5-haiku-latest',
+    'claude-3-5-haiku-latest',
 ];

 export const PASS_THRESHOLD = 0.8;

evals/create-dataset.ts

Lines changed: 5 additions & 5 deletions
@@ -27,19 +27,19 @@ interface TestCase {
     id: string;
     category: string;
     question: string;
-    expected_tools: string[];
+    expectedTools: string[];
 }

 interface TestData {
     version: string;
-    test_cases: TestCase[];
+    testCases: TestCase[];
 }

 // eslint-disable-next-line consistent-return
 function loadTestCases(): TestData {
     const filename = fileURLToPath(import.meta.url);
     const dirname = pathDirname(filename);
-    const testCasesPath = join(dirname, 'test_cases.json');
+    const testCasesPath = join(dirname, 'test-cases.json');

     try {
         const fileContent = readFileSync(testCasesPath, 'utf-8');

@@ -60,14 +60,14 @@ async function createDatasetFromTestCases(): Promise<void> {

     // Load test cases
     const testData = loadTestCases();
-    const testCases = testData.test_cases;
+    const { testCases } = testData;

     log.info(`Loaded ${testCases.length} test cases`);

     // Convert to format expected by Phoenix
     const examples = testCases.map((testCase) => ({
         input: { question: testCase.question },
-        output: { tool_calls: testCase.expected_tools.join(', ') },
+        output: { tool_calls: testCase.expectedTools.join(', ') },
         metadata: { category: testCase.category },
     }));
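
As a sanity check on the renamed fields, here is a minimal sketch of what the `examples` mapping above produces for a single test case; the values are hypothetical, the field names mirror the diff.

```ts
// One hypothetical TestCase run through the mapping in createDatasetFromTestCases().
const testCase = {
    id: 'search-actors-001', // hypothetical
    category: 'search-actors',
    question: 'What are the best Instagram scrapers',
    expectedTools: ['search-actors', 'fetch-actor-details'],
};

// Shape uploaded to Phoenix: expected tools are flattened into a
// comma-separated string under output.tool_calls.
const example = {
    input: { question: testCase.question },
    output: { tool_calls: testCase.expectedTools.join(', ') }, // 'search-actors, fetch-actor-details'
    metadata: { category: testCase.category },
};
```

The comma-separated `tool_calls` string is what the `tools_match` evaluator in `run-evaluation.ts` parses back into a sorted list.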

evals/run-evaluation.ts

Lines changed: 33 additions & 15 deletions
@@ -26,6 +26,16 @@ dotenv.config({ path: '.env' });

 type ExampleInputOnly = { input: Record<string, unknown>, metadata?: Record<string, unknown>, output?: never };

+// Type for Phoenix evaluation run results
+interface EvaluationRun {
+    name: string;
+    result?: {
+        score?: number;
+        [key: string]: unknown;
+    };
+    [key: string]: unknown;
+}
+
 async function loadTools(): Promise<ToolBase[]> {
     const apifyClient = new ApifyClient({ token: process.env.APIFY_API_TOKEN || '' });
     const urlTools = await processParamsGetTools('', apifyClient);

@@ -55,7 +65,11 @@ function transformToolsToAnthropicFormat(tools: ToolBase[]): Anthropic.Tool[] {
 function createOpenAITask(modelName: string, tools: ToolBase[]) {
     const toolsOpenAI = transformToolsToOpenAIFormat(tools);

-    return async (example: ExampleInputOnly): Promise<{ toolCalls: string[] }> => {
+    return async (example: ExampleInputOnly): Promise<{
+        toolCalls: string[];
+        input: Record<string, unknown>,
+        metadata: Record<string, unknown>,
+    }> => {
         const client = new OpenAI();

         const response = await client.chat.completions.create({

@@ -69,14 +83,16 @@ function createOpenAITask(modelName: string, tools: ToolBase[]) {

         const toolCalls: string[] = [];
         const firstMessage = response.choices?.[0]?.message;
-        const msg = JSON.stringify(JSON.stringify(firstMessage));
-        log.debug(`${example.metadata?.category} - ${example.input?.question} - ${msg}`);
         if (firstMessage?.tool_calls?.length) {
             const toolCall = firstMessage.tool_calls[0];
             const name = toolCall?.function?.name;
             if (name) toolCalls.push(name);
         }
-        return { toolCalls };
+        return {
+            toolCalls,
+            input: example.input,
+            metadata: { content: firstMessage },
+        };
     };
 }

@@ -99,7 +115,6 @@ function createAnthropicTask(modelName: string, tools: ToolBase[]) {
         });

         const toolCalls: string[] = [];
-        log.debug(`${example.input?.question} - ${JSON.stringify(response.content)}`);
         for (const content of response.content) {
             if (content.type === 'tool_use') {
                 const toolUseContent = content as Anthropic.ToolUseBlock;

@@ -119,7 +134,7 @@ const toolsMatch = asEvaluator({
     name: 'tools_match',
     kind: 'CODE',
     evaluate: async ({ output, expected }: {
-        output: { toolCalls?: string[] } | null;
+        output: { toolCalls?: string[], input?: Record<string, unknown>, metadata?: Record<string, unknown> } | null;
         expected?: Record<string, unknown>;
     }) => {
         const toolCalls = String(expected?.tool_calls ?? '');

@@ -128,15 +143,18 @@ const toolsMatch = asEvaluator({
             .map((t) => t.trim())
             .filter(Boolean)
             .sort();
-
+        // console.log(`Output tools: ${JSON.stringify(output?.metadata)} -> ${JSON.stringify(output?.toolCalls)}`);
         const actualArr = Array.isArray(output?.toolCalls) ? output.toolCalls : [];
         const actual = [...actualArr].sort();
         const matches = JSON.stringify(expectedTools) === JSON.stringify(actual);
+        log.debug(
+            `-----------------------\n`
+            + `Query: ${String(output?.input?.question ?? '')}\n`
+            + `LLM response: ${JSON.stringify(output?.metadata?.content ?? '')}\n`
+            + `Match: ${matches}, expected tools: ${JSON.stringify(expectedTools)}, actual tools: ${JSON.stringify(actual)}`,
+        );
         return {
-            label: matches ? 'matches' : 'does not match',
             score: matches ? 1 : 0,
-            explanation: matches ? 'Output tool calls match expected' : 'Mismatch between expected and output tool calls',
-            metadata: {},
         };
     },
 });
@@ -206,14 +224,14 @@ async function main(): Promise<number> {
             evaluators: [toolsMatch],
             experimentName,
             experimentDescription,
-            dryRun: 3,
+            concurrency: 10,
         });

         const runsMap = experiment.runs ?? {};
         const evalRuns = experiment.evaluationRuns ?? [];
         totalCases = Object.keys(runsMap).length;
-        const toolMatchEvals = evalRuns.filter((er: any) => er.name === 'tools_match');
-        correctCases = toolMatchEvals.filter((er: any) => (er.result?.score ?? 0) > 0.5).length;
+        const toolMatchEvals = evalRuns.filter((er: EvaluationRun) => er.name === 'tools_match');
+        correctCases = toolMatchEvals.filter((er: EvaluationRun) => (er.result?.score ?? 0) > 0.5).length;
         accuracy = totalCases > 0 ? correctCases / totalCases : 0;
         experimentId = experiment.id;

@@ -227,7 +245,7 @@ async function main(): Promise<number> {
         results.push({ model: modelName, accuracy, correct: correctCases, total: totalCases, experiment_id: experimentId, error });
     }

-    log.info('\n📊 Results:');
+    log.info('📊 Results:');
     for (const result of results) {
         const { model, accuracy, error } = result;
         if (error) {

@@ -238,7 +256,7 @@ async function main(): Promise<number> {
     }

     const allPassed = results.filter((r) => !r.error).every((r) => r.accuracy >= PASS_THRESHOLD);
-    log.info(`\nPass threshold: ${(PASS_THRESHOLD * 100).toFixed(1)}%`);
+    log.info(`Pass threshold: ${(PASS_THRESHOLD * 100).toFixed(1)}%`);
     if (allPassed) {
         log.info('✅ All models passed the threshold');
     } else {
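
For readers skimming the hunks above, here is a standalone sketch of the order-insensitive comparison that `tools_match` performs. The `split(',')` step is not visible in the diff and is assumed from the `join(', ')` in `create-dataset.ts`; the example calls are hypothetical.

```ts
// Sketch of the tools_match scoring logic, extracted for illustration only.
function toolsMatchScore(expectedToolCalls: string, actualToolCalls: string[]): number {
    // Expected tools arrive as a comma-separated string from the Phoenix dataset.
    const expectedTools = expectedToolCalls
        .split(',')
        .map((t) => t.trim())
        .filter(Boolean)
        .sort();
    // Actual tools are the names collected from the model's tool calls.
    const actual = [...actualToolCalls].sort();
    // Order-insensitive, exact-set comparison.
    return JSON.stringify(expectedTools) === JSON.stringify(actual) ? 1 : 0;
}

toolsMatchScore('search-actors', ['search-actors']); // 1
toolsMatchScore('search-actors, add-actor', ['add-actor', 'search-actors']); // 1
toolsMatchScore('search-actors', []); // 0
```

A run counts as correct in `main()` when this score exceeds 0.5, and a model passes when its accuracy over all runs reaches `PASS_THRESHOLD` (0.8).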
