@@ -39,6 +39,7 @@ describe('Judge', () => {
3939
4040 // Create a basic judge config
4141 judgeConfig = {
42+ key : 'test-judge' ,
4243 enabled : true ,
4344 messages : [
4445 { role : 'system' , content : 'You are a helpful judge that evaluates AI responses.' } ,
@@ -106,9 +107,21 @@ describe('Judge', () => {
106107
107108 expect ( result ) . toEqual ( {
108109 evals : {
109- relevance : { score : 0.8 , reasoning : 'The response is relevant to the question' } ,
110- accuracy : { score : 0.9 , reasoning : 'The response is factually accurate' } ,
111- helpfulness : { score : 0.7 , reasoning : 'The response provides helpful information' } ,
110+ relevance : {
111+ score : 0.8 ,
112+ reasoning : 'The response is relevant to the question' ,
113+ judgeConfigKey : 'test-judge' ,
114+ } ,
115+ accuracy : {
116+ score : 0.9 ,
117+ reasoning : 'The response is factually accurate' ,
118+ judgeConfigKey : 'test-judge' ,
119+ } ,
120+ helpfulness : {
121+ score : 0.7 ,
122+ reasoning : 'The response provides helpful information' ,
123+ judgeConfigKey : 'test-judge' ,
124+ } ,
112125 } ,
113126 success : true ,
114127 } ) ;
@@ -254,8 +267,8 @@ describe('Judge', () => {
254267 // When one metric is missing, it returns the partial evals it has with success: false
255268 expect ( result ) . toEqual ( {
256269 evals : {
257- relevance : { score : 0.8 , reasoning : 'Good' } ,
258- helpfulness : { score : 0.7 , reasoning : 'Helpful' } ,
270+ relevance : { score : 0.8 , reasoning : 'Good' , judgeConfigKey : 'test-judge' } ,
271+ helpfulness : { score : 0.7 , reasoning : 'Helpful' , judgeConfigKey : 'test-judge' } ,
259272 } ,
260273 success : false ,
261274 } ) ;
@@ -364,9 +377,21 @@ describe('Judge', () => {
364377
365378 expect ( result ) . toEqual ( {
366379 evals : {
367- relevance : { score : 0.8 , reasoning : 'The response is relevant to the question' } ,
368- accuracy : { score : 0.9 , reasoning : 'The response is factually accurate' } ,
369- helpfulness : { score : 0.7 , reasoning : 'The response provides helpful information' } ,
380+ relevance : {
381+ score : 0.8 ,
382+ reasoning : 'The response is relevant to the question' ,
383+ judgeConfigKey : 'test-judge' ,
384+ } ,
385+ accuracy : {
386+ score : 0.9 ,
387+ reasoning : 'The response is factually accurate' ,
388+ judgeConfigKey : 'test-judge' ,
389+ } ,
390+ helpfulness : {
391+ score : 0.7 ,
392+ reasoning : 'The response provides helpful information' ,
393+ judgeConfigKey : 'test-judge' ,
394+ } ,
370395 } ,
371396 success : true ,
372397 } ) ;
@@ -454,9 +479,9 @@ describe('Judge', () => {
454479 const result = parseResponse ( responseData ) ;
455480
456481 expect ( result ) . toEqual ( {
457- relevance : { score : 0.8 , reasoning : 'Good' } ,
458- accuracy : { score : 0.9 , reasoning : 'Accurate' } ,
459- helpfulness : { score : 0.7 , reasoning : 'Helpful' } ,
482+ relevance : { score : 0.8 , reasoning : 'Good' , judgeConfigKey : 'test-judge' } ,
483+ accuracy : { score : 0.9 , reasoning : 'Accurate' , judgeConfigKey : 'test-judge' } ,
484+ helpfulness : { score : 0.7 , reasoning : 'Helpful' , judgeConfigKey : 'test-judge' } ,
460485 } ) ;
461486 } ) ;
462487
@@ -489,7 +514,7 @@ describe('Judge', () => {
489514
490515 // Only helpfulness passes validation, relevance and accuracy are skipped
491516 expect ( result ) . toEqual ( {
492- helpfulness : { score : 0.7 , reasoning : 'Helpful' } ,
517+ helpfulness : { score : 0.7 , reasoning : 'Helpful' , judgeConfigKey : 'test-judge' } ,
493518 } ) ;
494519 } ) ;
495520 } ) ;
0 commit comments