Skip to content

Commit 82df1ef

Browse files
committed
fix: value interpolation in the template! It was not working and failing silently
1 parent 4794ca4 commit 82df1ef

File tree

6 files changed

+169
-159
lines changed

6 files changed

+169
-159
lines changed

.github/workflows/evaluations.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,7 @@ jobs:
3939
- name: Run evaluations
4040
run: npm run evals:run
4141
env:
42+
GITHUB_PR_NUMBER: ${{ github.event_name == 'pull_request' && github.event.number || 'master' }}
4243
PHOENIX_API_KEY: ${{ secrets.PHOENIX_API_KEY }}
4344
PHOENIX_BASE_URL: ${{ secrets.PHOENIX_BASE_URL }}
4445
OPENROUTER_BASE_URL: ${{ secrets.OPENROUTER_BASE_URL }}

evals/config.ts

Lines changed: 31 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -35,41 +35,53 @@ export const TOOL_SELECTION_EVAL_MODEL = 'openai/gpt-4o-mini';
3535

3636
export const PASS_THRESHOLD = 0.6;
3737

38-
export const DATASET_NAME = `mcp_tool_calling_ground_truth_v${getTestCasesVersion()}`;
38+
export const DATASET_NAME = `mcp_server_dataset_v${getTestCasesVersion()}`;
3939

4040
// System prompt
4141
export const SYSTEM_PROMPT = 'You are a helpful assistant';
4242

4343
export const TOOL_CALLING_BASE_TEMPLATE = `
44-
You are an evaluation assistant evaluating questions and tool calls to
45-
determine whether the tool called would answer the question. The tool
46-
calls have been generated by a separate agent, and chosen from the list of
44+
You are an evaluation assistant evaluating user queries and tool calls to
45+
determine whether a tool was chosen and if it was a right tool.
46+
47+
The tool calls have been generated by a separate agent, and chosen from the list of
4748
tools provided below. It is your job to decide whether that agent chose
4849
the right tool to call.
4950
50-
[BEGIN DATA]
51-
************
52-
[Context]: {input}
53-
************
54-
[LLM response]: {llm_response}
55-
[END DATA]
51+
[BEGIN DATA]
52+
************
53+
{{context}}
54+
{{query}}
55+
************
56+
{{tool_calls}}
57+
{{llm_response}}
58+
************
59+
[END DATA]
60+
61+
DECISION: [correct or incorrect]
62+
EXPLANATION: [Super short explanation of why the tool choice was correct or incorrect]
5663
5764
Your response must be single word, either "correct" or "incorrect",
5865
and should not contain any text or characters aside from that word.
59-
"incorrect" means that the chosen tool would not answer the question,
60-
the tool includes information that is not presented in the question,
66+
67+
"correct" means the correct tool call was chosen, the correct parameters
68+
were extracted from the query, the tool call generated is runnable and correct,
69+
and that no outside information not present in the query was used
70+
in the generated query.
71+
72+
"incorrect" means that the chosen tool was not correct
6173
or that the tool signature includes parameter values that don't match
6274
the formats specified in the tool signatures below.
6375
64-
"correct" means the correct tool call was chosen, the correct parameters
65-
were extracted from the question, the tool call generated is runnable and correct,
66-
and that no outside information not present in the question was used
67-
in the generated question.
76+
You must not use any outside information or make assumptions.
77+
Base your decision solely on the information provided in [BEGIN DATA] ... [END DATA],
78+
the [Tool Definitions], and the [Reference instructions] (if provided).
79+
Reference instructions are optional and are intended to help you understand the use case and make your decision.
6880
69-
[Reference instructions]: {reference}
81+
{{reference}}
7082
71-
[Tool Definitions]: {tool_definitions}
72-
`;
83+
{{tool_definitions}}
84+
`
7385
export function getRequiredEnvVars(): Record<string, string | undefined> {
7486
return {
7587
PHOENIX_BASE_URL: process.env.PHOENIX_BASE_URL,

evals/create-dataset.ts

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -26,8 +26,10 @@ dotenv.config({ path: '.env' });
2626
interface TestCase {
2727
id: string;
2828
category: string;
29-
question: string;
30-
expectedTools: string[];
29+
query: string;
30+
context?: string;
31+
expectedTools?: string[];
32+
reference?: string;
3133
}
3234

3335
interface TestData {
@@ -66,8 +68,8 @@ async function createDatasetFromTestCases(): Promise<void> {
6668

6769
// Convert to format expected by Phoenix
6870
const examples = testCases.map((testCase) => ({
69-
input: { question: testCase.question },
70-
output: { tool_calls: testCase.expectedTools.join(', ') },
71+
input: { query: testCase.query },
72+
output: { expectedTools: testCase.expectedTools?.join(', '), reference: testCase.reference || '' },
7173
metadata: { category: testCase.category },
7274
}));
7375

@@ -80,15 +82,15 @@ async function createDatasetFromTestCases(): Promise<void> {
8082
});
8183

8284
// Upload dataset
83-
const datasetName = `mcp_tool_calling_ground_truth_v${testData.version}`;
85+
const datasetName = `mcp_server_dataset_v${testData.version}`;
8486

8587
log.info(`Uploading dataset '${datasetName}' to Phoenix...`);
8688

8789
try {
8890
const { datasetId } = await createDataset({
8991
client,
9092
name: datasetName,
91-
description: `MCP tool calling ground truth dataset version ${testData.version}`,
93+
description: `MCP server dataset: version ${testData.version}`,
9294
examples,
9395
});
9496

0 commit comments

Comments
 (0)