@@ -35,41 +35,53 @@ export const TOOL_SELECTION_EVAL_MODEL = 'openai/gpt-4o-mini';
3535
3636export const PASS_THRESHOLD = 0.6 ;
3737
38- export const DATASET_NAME = `mcp_tool_calling_ground_truth_v ${ getTestCasesVersion ( ) } ` ;
38+ export const DATASET_NAME = `mcp_server_dataset_v ${ getTestCasesVersion ( ) } ` ;
3939
4040// System prompt
4141export const SYSTEM_PROMPT = 'You are a helpful assistant' ;
4242
4343export const TOOL_CALLING_BASE_TEMPLATE = `
44- You are an evaluation assistant evaluating questions and tool calls to
45- determine whether the tool called would answer the question. The tool
46- calls have been generated by a separate agent, and chosen from the list of
44+ You are an evaluation assistant evaluating user queries and tool calls to
45+ determine whether a tool was chosen and whether it was the right tool.
46+
47+ The tool calls have been generated by a separate agent, and chosen from the list of
4748tools provided below. It is your job to decide whether that agent chose
4849the right tool to call.
4950
50- [BEGIN DATA]
51- ************
52- [Context]: {input}
53- ************
54- [LLM response]: {llm_response}
55- [END DATA]
51+ [BEGIN DATA]
52+ ************
53+ {{context}}
54+ {{query}}
55+ ************
56+ {{tool_calls}}
57+ {{llm_response}}
58+ ************
59+ [END DATA]
60+
61+ DECISION: [correct or incorrect]
62+ EXPLANATION: [Super short explanation of why the tool choice was correct or incorrect]
5663
5764The DECISION must be a single word, either "correct" or "incorrect",
5865and should not contain any text or characters aside from that word.
59- "incorrect" means that the chosen tool would not answer the question,
60- the tool includes information that is not presented in the question,
66+
67+ "correct" means the correct tool call was chosen, the correct parameters
68+ were extracted from the query, the tool call generated is runnable and correct,
69+ and that no outside information not present in the query was used
70+ in the generated tool call.
71+
72+ "incorrect" means that the chosen tool was not correct
6173or that the tool signature includes parameter values that don't match
6274the formats specified in the tool signatures below.
6375
64- "correct" means the correct tool call was chosen, the correct parameters
65- were extracted from the question, the tool call generated is runnable and correct ,
66- and that no outside information not present in the question was used
67- in the generated question .
76+ You must not use any outside information or make assumptions.
77+ Base your decision solely on the information provided in [BEGIN DATA] ... [END DATA] ,
78+ the [Tool Definitions], and the [Reference instructions] (if provided).
79+ Reference instructions are optional and are intended to help you understand the use case and make your decision .
6880
69- [Reference instructions]: { reference}
81+ {{ reference} }
7082
71- [Tool Definitions]: { tool_definitions}
72- ` ;
83+ {{ tool_definitions} }
84+ `
7385export function getRequiredEnvVars ( ) : Record < string , string | undefined > {
7486 return {
7587 PHOENIX_BASE_URL : process . env . PHOENIX_BASE_URL ,
0 commit comments