@@ -26,6 +26,16 @@ dotenv.config({ path: '.env' });
2626
2727type ExampleInputOnly = { input : Record < string , unknown > , metadata ?: Record < string , unknown > , output ?: never } ;
2828
29+ // Type for Phoenix evaluation run results
30+ interface EvaluationRun {
31+ name : string ;
32+ result ?: {
33+ score ?: number ;
34+ [ key : string ] : unknown ;
35+ } ;
36+ [ key : string ] : unknown ;
37+ }
38+
2939async function loadTools ( ) : Promise < ToolBase [ ] > {
3040 const apifyClient = new ApifyClient ( { token : process . env . APIFY_API_TOKEN || '' } ) ;
3141 const urlTools = await processParamsGetTools ( '' , apifyClient ) ;
@@ -55,7 +65,11 @@ function transformToolsToAnthropicFormat(tools: ToolBase[]): Anthropic.Tool[] {
5565function createOpenAITask ( modelName : string , tools : ToolBase [ ] ) {
5666 const toolsOpenAI = transformToolsToOpenAIFormat ( tools ) ;
5767
58- return async ( example : ExampleInputOnly ) : Promise < { toolCalls : string [ ] } > => {
68+ return async ( example : ExampleInputOnly ) : Promise < {
69+ toolCalls : string [ ] ;
70+ input : Record < string , unknown > ,
71+ metadata : Record < string , unknown > ,
72+ } > => {
5973 const client = new OpenAI ( ) ;
6074
6175 const response = await client . chat . completions . create ( {
@@ -69,14 +83,16 @@ function createOpenAITask(modelName: string, tools: ToolBase[]) {
6983
7084 const toolCalls : string [ ] = [ ] ;
7185 const firstMessage = response . choices ?. [ 0 ] ?. message ;
72- const msg = JSON . stringify ( JSON . stringify ( firstMessage ) ) ;
73- log . debug ( `${ example . metadata ?. category } - ${ example . input ?. question } - ${ msg } ` ) ;
7486 if ( firstMessage ?. tool_calls ?. length ) {
7587 const toolCall = firstMessage . tool_calls [ 0 ] ;
7688 const name = toolCall ?. function ?. name ;
7789 if ( name ) toolCalls . push ( name ) ;
7890 }
79- return { toolCalls } ;
91+ return {
92+ toolCalls,
93+ input : example . input ,
94+ metadata : { content : firstMessage } ,
95+ } ;
8096 } ;
8197}
8298
@@ -99,7 +115,6 @@ function createAnthropicTask(modelName: string, tools: ToolBase[]) {
99115 } ) ;
100116
101117 const toolCalls : string [ ] = [ ] ;
102- log . debug ( `${ example . input ?. question } - ${ JSON . stringify ( response . content ) } ` ) ;
103118 for ( const content of response . content ) {
104119 if ( content . type === 'tool_use' ) {
105120 const toolUseContent = content as Anthropic . ToolUseBlock ;
@@ -119,7 +134,7 @@ const toolsMatch = asEvaluator({
119134 name : 'tools_match' ,
120135 kind : 'CODE' ,
121136 evaluate : async ( { output, expected } : {
122- output : { toolCalls ?: string [ ] } | null ;
137+ output : { toolCalls ?: string [ ] , input ?: Record < string , unknown > , metadata ?: Record < string , unknown > } | null ;
123138 expected ?: Record < string , unknown > ;
124139 } ) => {
125140 const toolCalls = String ( expected ?. tool_calls ?? '' ) ;
@@ -128,15 +143,18 @@ const toolsMatch = asEvaluator({
128143 . map ( ( t ) => t . trim ( ) )
129144 . filter ( Boolean )
130145 . sort ( ) ;
131-
146+ // console.log(`Output tools: ${JSON.stringify(output?.metadata)} -> ${JSON.stringify(output?.toolCalls)}`);
132147 const actualArr = Array . isArray ( output ?. toolCalls ) ? output . toolCalls : [ ] ;
133148 const actual = [ ...actualArr ] . sort ( ) ;
134149 const matches = JSON . stringify ( expectedTools ) === JSON . stringify ( actual ) ;
150+ log . debug (
151+ `-----------------------\n`
152+ + `Query: ${ String ( output ?. input ?. question ?? '' ) } \n`
153+ + `LLM response: ${ JSON . stringify ( output ?. metadata ?. content ?? '' ) } \n`
154+ + `Match: ${ matches } , expected tools: ${ JSON . stringify ( expectedTools ) } , actual tools: ${ JSON . stringify ( actual ) } ` ,
155+ ) ;
135156 return {
136- label : matches ? 'matches' : 'does not match' ,
137157 score : matches ? 1 : 0 ,
138- explanation : matches ? 'Output tool calls match expected' : 'Mismatch between expected and output tool calls' ,
139- metadata : { } ,
140158 } ;
141159 } ,
142160} ) ;
@@ -206,14 +224,14 @@ async function main(): Promise<number> {
206224 evaluators : [ toolsMatch ] ,
207225 experimentName,
208226 experimentDescription,
209- dryRun : 3 ,
227+ concurrency : 10 ,
210228 } ) ;
211229
212230 const runsMap = experiment . runs ?? { } ;
213231 const evalRuns = experiment . evaluationRuns ?? [ ] ;
214232 totalCases = Object . keys ( runsMap ) . length ;
215- const toolMatchEvals = evalRuns . filter ( ( er : any ) => er . name === 'tools_match' ) ;
216- correctCases = toolMatchEvals . filter ( ( er : any ) => ( er . result ?. score ?? 0 ) > 0.5 ) . length ;
233+ const toolMatchEvals = evalRuns . filter ( ( er : EvaluationRun ) => er . name === 'tools_match' ) ;
234+ correctCases = toolMatchEvals . filter ( ( er : EvaluationRun ) => ( er . result ?. score ?? 0 ) > 0.5 ) . length ;
217235 accuracy = totalCases > 0 ? correctCases / totalCases : 0 ;
218236 experimentId = experiment . id ;
219237
@@ -227,7 +245,7 @@ async function main(): Promise<number> {
227245 results . push ( { model : modelName , accuracy, correct : correctCases , total : totalCases , experiment_id : experimentId , error } ) ;
228246 }
229247
230- log . info ( '\n 📊 Results:' ) ;
248+ log . info ( '📊 Results:' ) ;
231249 for ( const result of results ) {
232250 const { model, accuracy, error } = result ;
233251 if ( error ) {
@@ -238,7 +256,7 @@ async function main(): Promise<number> {
238256 }
239257
240258 const allPassed = results . filter ( ( r ) => ! r . error ) . every ( ( r ) => r . accuracy >= PASS_THRESHOLD ) ;
241- log . info ( `\nPass threshold: ${ ( PASS_THRESHOLD * 100 ) . toFixed ( 1 ) } %` ) ;
259+ log . info ( `Pass threshold: ${ ( PASS_THRESHOLD * 100 ) . toFixed ( 1 ) } %` ) ;
242260 if ( allPassed ) {
243261 log . info ( '✅ All models passed the threshold' ) ;
244262 } else {
0 commit comments