Skip to content

Commit a5e82d0

Browse files
committed
move key to judge response
1 parent 6cf3980 commit a5e82d0

File tree

6 files changed

+41
-24
lines changed

6 files changed

+41
-24
lines changed

packages/sdk/server-ai/__tests__/Judge.test.ts

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -110,20 +110,18 @@ describe('Judge', () => {
110110
relevance: {
111111
score: 0.8,
112112
reasoning: 'The response is relevant to the question',
113-
judgeConfigKey: 'test-judge',
114113
},
115114
accuracy: {
116115
score: 0.9,
117116
reasoning: 'The response is factually accurate',
118-
judgeConfigKey: 'test-judge',
119117
},
120118
helpfulness: {
121119
score: 0.7,
122120
reasoning: 'The response provides helpful information',
123-
judgeConfigKey: 'test-judge',
124121
},
125122
},
126123
success: true,
124+
judgeConfigKey: 'test-judge',
127125
});
128126

129127
expect(mockProvider.invokeStructuredModel).toHaveBeenCalledWith(
@@ -267,10 +265,11 @@ describe('Judge', () => {
267265
// When one metric is missing, it returns the partial evals it has with success: false
268266
expect(result).toEqual({
269267
evals: {
270-
relevance: { score: 0.8, reasoning: 'Good', judgeConfigKey: 'test-judge' },
271-
helpfulness: { score: 0.7, reasoning: 'Helpful', judgeConfigKey: 'test-judge' },
268+
relevance: { score: 0.8, reasoning: 'Good' },
269+
helpfulness: { score: 0.7, reasoning: 'Helpful' },
272270
},
273271
success: false,
272+
judgeConfigKey: 'test-judge',
274273
});
275274
});
276275

@@ -302,6 +301,7 @@ describe('Judge', () => {
302301
expect(result).toEqual({
303302
evals: {},
304303
success: false,
304+
judgeConfigKey: 'test-judge',
305305
});
306306
});
307307

@@ -315,6 +315,7 @@ describe('Judge', () => {
315315
evals: {},
316316
success: false,
317317
error: 'Provider error',
318+
judgeConfigKey: 'test-judge',
318319
});
319320
expect(mockLogger.error).toHaveBeenCalledWith('Judge evaluation failed:', error);
320321
});
@@ -328,6 +329,7 @@ describe('Judge', () => {
328329
evals: {},
329330
success: false,
330331
error: 'Unknown error',
332+
judgeConfigKey: 'test-judge',
331333
});
332334
});
333335
});
@@ -380,20 +382,18 @@ describe('Judge', () => {
380382
relevance: {
381383
score: 0.8,
382384
reasoning: 'The response is relevant to the question',
383-
judgeConfigKey: 'test-judge',
384385
},
385386
accuracy: {
386387
score: 0.9,
387388
reasoning: 'The response is factually accurate',
388-
judgeConfigKey: 'test-judge',
389389
},
390390
helpfulness: {
391391
score: 0.7,
392392
reasoning: 'The response provides helpful information',
393-
judgeConfigKey: 'test-judge',
394393
},
395394
},
396395
success: true,
396+
judgeConfigKey: 'test-judge',
397397
});
398398

399399
expect(mockProvider.invokeStructuredModel).toHaveBeenCalledWith(
@@ -479,9 +479,9 @@ describe('Judge', () => {
479479
const result = parseResponse(responseData);
480480

481481
expect(result).toEqual({
482-
relevance: { score: 0.8, reasoning: 'Good', judgeConfigKey: 'test-judge' },
483-
accuracy: { score: 0.9, reasoning: 'Accurate', judgeConfigKey: 'test-judge' },
484-
helpfulness: { score: 0.7, reasoning: 'Helpful', judgeConfigKey: 'test-judge' },
482+
relevance: { score: 0.8, reasoning: 'Good' },
483+
accuracy: { score: 0.9, reasoning: 'Accurate' },
484+
helpfulness: { score: 0.7, reasoning: 'Helpful' },
485485
});
486486
});
487487

@@ -514,7 +514,7 @@ describe('Judge', () => {
514514

515515
// Only helpfulness passes validation, relevance and accuracy are skipped
516516
expect(result).toEqual({
517-
helpfulness: { score: 0.7, reasoning: 'Helpful', judgeConfigKey: 'test-judge' },
517+
helpfulness: { score: 0.7, reasoning: 'Helpful' },
518518
});
519519
});
520520
});

packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { LDContext } from '@launchdarkly/js-server-sdk-common';
33
import { name as aiSdkName, version as aiSdkVersion } from '../package.json';
44
import { LDAIConfigTracker } from './api/config';
55
import { LDAIMetricSummary } from './api/config/LDAIConfigTracker';
6-
import { EvalScore } from './api/judge/types';
6+
import { EvalScore, JudgeResponse } from './api/judge/types';
77
import {
88
createBedrockTokenUsage,
99
createOpenAiUsage,
@@ -76,12 +76,17 @@ export class LDAIConfigTrackerImpl implements LDAIConfigTracker {
7676
}
7777

7878
trackEvalScores(scores: Record<string, EvalScore>) {
79-
// Track each evaluation score individually
8079
Object.entries(scores).forEach(([metricKey, evalScore]) => {
80+
this._ldClient.track(metricKey, this._context, this.getTrackData(), evalScore.score);
81+
});
82+
}
83+
84+
trackJudgeResponse(response: JudgeResponse) {
85+
Object.entries(response.evals).forEach(([metricKey, evalScore]) => {
8186
this._ldClient.track(
8287
metricKey,
8388
this._context,
84-
{ ...this.getTrackData(), judgeConfigKey: evalScore.judgeConfigKey },
89+
{ ...this.getTrackData(), judgeConfigKey: response.judgeConfigKey },
8590
evalScore.score,
8691
);
8792
});

packages/sdk/server-ai/src/api/chat/TrackedChat.ts

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -84,13 +84,17 @@ export class TrackedChat {
8484
return undefined;
8585
}
8686

87-
const evalResult = await judge.evaluateMessages(messages, response, judgeConfig.samplingRate);
88-
89-
if (evalResult && evalResult.success) {
90-
this.tracker.trackEvalScores(evalResult.evals);
87+
const judgeResponse = await judge.evaluateMessages(
88+
messages,
89+
response,
90+
judgeConfig.samplingRate,
91+
);
92+
93+
if (judgeResponse && judgeResponse.success) {
94+
this.tracker.trackJudgeResponse(judgeResponse);
9195
}
9296

93-
return evalResult;
97+
return judgeResponse;
9498
});
9599

96100
// ensure all evaluations complete even if some fail

packages/sdk/server-ai/src/api/config/LDAIConfigTracker.ts

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { EvalScore } from '../judge/types';
1+
import { EvalScore, JudgeResponse } from '../judge/types';
22
import { LDAIMetrics, LDFeedbackKind, LDTokenUsage } from '../metrics';
33

44
/**
@@ -94,6 +94,13 @@ export interface LDAIConfigTracker {
9494
*/
9595
trackEvalScores(scores: Record<string, EvalScore>): void;
9696

97+
/**
98+
* Track a judge response containing evaluation scores and judge configuration key.
99+
*
100+
* @param response Judge response containing evaluation scores and judge configuration key
101+
*/
102+
trackJudgeResponse(response: JudgeResponse): void;
103+
97104
/**
98105
* Track the duration of execution of the provided function.
99106
*

packages/sdk/server-ai/src/api/judge/Judge.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,13 +91,15 @@ export class Judge {
9191
return {
9292
evals,
9393
success,
94+
judgeConfigKey: this._aiConfig.key,
9495
};
9596
} catch (error) {
9697
this._logger?.error('Judge evaluation failed:', error);
9798
return {
9899
evals: {},
99100
success: false,
100101
error: error instanceof Error ? error.message : 'Unknown error',
102+
judgeConfigKey: this._aiConfig.key,
101103
};
102104
}
103105
}
@@ -208,7 +210,6 @@ export class Judge {
208210
results[metricKey] = {
209211
score: evalData.score,
210212
reasoning: evalData.reasoning,
211-
judgeConfigKey: this._aiConfig.key,
212213
};
213214
});
214215

packages/sdk/server-ai/src/api/judge/types.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,14 @@ export interface EvalScore {
2424
score: number;
2525
/** Reasoning behind the provided score for this metric */
2626
reasoning: string;
27-
/** The key of the judge configuration that was used to evaluate this metric */
28-
judgeConfigKey?: string;
2927
}
3028

3129
/**
3230
* Response from a judge evaluation containing scores and reasoning for multiple metrics.
3331
*/
3432
export interface JudgeResponse {
33+
/** The key of the judge configuration that was used to generate this response */
34+
judgeConfigKey?: string;
3535
/** Dictionary where keys are metric names and values contain score and reasoning */
3636
evals: Record<string, EvalScore>;
3737
/** Whether the evaluation completed successfully */

0 commit comments

Comments
 (0)