apify
diff --git a/.github/workflows/evaluations.yaml b/.github/workflows/evaluations.yaml
Lines changed: 1 addition & 0 deletions
diff --git a/evals/config.ts b/evals/config.ts
Lines changed: 31 additions & 19 deletions
diff --git a/evals/create-dataset.ts b/evals/create-dataset.ts
Lines changed: 8 additions & 6 deletions
@@ -39,6 +39,7 @@ jobs:
             -   name: Run evaluations
                 run: npm run evals:run
                 env:
+                    GITHUB_PR_NUMBER: ${{ github.event_name == 'pull_request' && github.event.number || 'master' }}
                     PHOENIX_API_KEY: ${{ secrets.PHOENIX_API_KEY }}
                     PHOENIX_BASE_URL: ${{ secrets.PHOENIX_BASE_URL }}
                     OPENROUTER_BASE_URL: ${{ secrets.OPENROUTER_BASE_URL }}
 
@@ -35,41 +35,53 @@ export const TOOL_SELECTION_EVAL_MODEL = 'openai/gpt-4o-mini';
 
 export const PASS_THRESHOLD = 0.6;
 
-export const DATASET_NAME = `mcp_tool_calling_ground_truth_v${getTestCasesVersion()}`;
+export const DATASET_NAME = `mcp_server_dataset_v${getTestCasesVersion()}`;
 
 // System prompt
 export const SYSTEM_PROMPT = 'You are a helpful assistant';
 
 export const TOOL_CALLING_BASE_TEMPLATE = `
-You are an evaluation assistant evaluating questions and tool calls to
-determine whether the tool called would answer the question. The tool
-calls have been generated by a separate agent, and chosen from the list of
+You are an evaluation assistant evaluating user queries and tool calls to
+determine whether a tool was chosen and whether it was the right tool.
+
+The tool calls have been generated by a separate agent, and chosen from the list of
 tools provided below. It is your job to decide whether that agent chose
 the right tool to call.
 
-    [BEGIN DATA]
-    ************
-    [Context]: {input}
-    ************
-    [LLM response]: {llm_response}
-    [END DATA]
+[BEGIN DATA]
+************
+{{context}}
+{{query}}
+************
+{{tool_calls}}
+{{llm_response}}
+************
+[END DATA]
+
+DECISION: [correct or incorrect]
+EXPLANATION: [Super short explanation of why the tool choice was correct or incorrect]
 
-Your response must be single word, either "correct" or "incorrect",
+The DECISION must be a single word, either "correct" or "incorrect",
 and should not contain any text or characters aside from that word.
-"incorrect" means that the chosen tool would not answer the question,
-the tool includes information that is not presented in the question,
+
+"correct" means the correct tool call was chosen, the correct parameters
+were extracted from the query, the tool call generated is runnable and correct,
+and that no outside information not present in the query was used
+in the generated tool call.
+
+"incorrect" means that the chosen tool was not correct
 or that the tool signature includes parameter values that don't match
 the formats specified in the tool signatures below.
 
-"correct" means the correct tool call was chosen, the correct parameters
-were extracted from the question, the tool call generated is runnable and correct,
-and that no outside information not present in the question was used
-in the generated question.
+You must not use any outside information or make assumptions.
+Base your decision solely on the information provided in [BEGIN DATA] ... [END DATA],
+the [Tool Definitions], and the [Reference instructions] (if provided).
+Reference instructions are optional and are intended to help you understand the use case and make your decision.
 
-[Reference instructions]: {reference}
+{{reference}}
 
-[Tool Definitions]: {tool_definitions}
-`;
+{{tool_definitions}}
+`;
 export function getRequiredEnvVars(): Record<string, string | undefined> {
     return {
         PHOENIX_BASE_URL: process.env.PHOENIX_BASE_URL,
 
@@ -26,8 +26,10 @@ dotenv.config({ path: '.env' });
 interface TestCase {
     id: string;
     category: string;
-    question: string;
-    expectedTools: string[];
+    query: string;
+    context?: string;
+    expectedTools?: string[];
+    reference?: string;
 }
 
 interface TestData {
@@ -66,8 +68,8 @@ async function createDatasetFromTestCases(): Promise<void> {
 
     // Convert to format expected by Phoenix
     const examples = testCases.map((testCase) => ({
-        input: { question: testCase.question },
-        output: { tool_calls: testCase.expectedTools.join(', ') },
+        input: { query: testCase.query },
+        output: { expectedTools: testCase.expectedTools?.join(', '), reference: testCase.reference || '' },
         metadata: { category: testCase.category },
     }));
 
@@ -80,15 +82,15 @@ async function createDatasetFromTestCases(): Promise<void> {
     });
 
     // Upload dataset
-    const datasetName = `mcp_tool_calling_ground_truth_v${testData.version}`;
+    const datasetName = `mcp_server_dataset_v${testData.version}`;
 
     log.info(`Uploading dataset '${datasetName}' to Phoenix...`);
 
     try {
         const { datasetId } = await createDataset({
             client,
             name: datasetName,
-            description: `MCP tool calling ground truth dataset version ${testData.version}`,
+            description: `MCP server dataset: version ${testData.version}`,
             examples,
         });