Skip to content

Commit 213fc79

Browse files
authored
fix: Include the AI Judge Config key with tracked metrics (#986)
1 parent dc7745c commit 213fc79

File tree

11 files changed

+96
-25
lines changed

11 files changed

+96
-25
lines changed

packages/sdk/server-ai/__tests__/Judge.test.ts

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ describe('Judge', () => {
3939

4040
// Create a basic judge config
4141
judgeConfig = {
42+
key: 'test-judge',
4243
enabled: true,
4344
messages: [
4445
{ role: 'system', content: 'You are a helpful judge that evaluates AI responses.' },
@@ -106,11 +107,21 @@ describe('Judge', () => {
106107

107108
expect(result).toEqual({
108109
evals: {
109-
relevance: { score: 0.8, reasoning: 'The response is relevant to the question' },
110-
accuracy: { score: 0.9, reasoning: 'The response is factually accurate' },
111-
helpfulness: { score: 0.7, reasoning: 'The response provides helpful information' },
110+
relevance: {
111+
score: 0.8,
112+
reasoning: 'The response is relevant to the question',
113+
},
114+
accuracy: {
115+
score: 0.9,
116+
reasoning: 'The response is factually accurate',
117+
},
118+
helpfulness: {
119+
score: 0.7,
120+
reasoning: 'The response provides helpful information',
121+
},
112122
},
113123
success: true,
124+
judgeConfigKey: 'test-judge',
114125
});
115126

116127
expect(mockProvider.invokeStructuredModel).toHaveBeenCalledWith(
@@ -258,6 +269,7 @@ describe('Judge', () => {
258269
helpfulness: { score: 0.7, reasoning: 'Helpful' },
259270
},
260271
success: false,
272+
judgeConfigKey: 'test-judge',
261273
});
262274
});
263275

@@ -289,6 +301,7 @@ describe('Judge', () => {
289301
expect(result).toEqual({
290302
evals: {},
291303
success: false,
304+
judgeConfigKey: 'test-judge',
292305
});
293306
});
294307

@@ -302,6 +315,7 @@ describe('Judge', () => {
302315
evals: {},
303316
success: false,
304317
error: 'Provider error',
318+
judgeConfigKey: 'test-judge',
305319
});
306320
expect(mockLogger.error).toHaveBeenCalledWith('Judge evaluation failed:', error);
307321
});
@@ -315,6 +329,7 @@ describe('Judge', () => {
315329
evals: {},
316330
success: false,
317331
error: 'Unknown error',
332+
judgeConfigKey: 'test-judge',
318333
});
319334
});
320335
});
@@ -364,11 +379,21 @@ describe('Judge', () => {
364379

365380
expect(result).toEqual({
366381
evals: {
367-
relevance: { score: 0.8, reasoning: 'The response is relevant to the question' },
368-
accuracy: { score: 0.9, reasoning: 'The response is factually accurate' },
369-
helpfulness: { score: 0.7, reasoning: 'The response provides helpful information' },
382+
relevance: {
383+
score: 0.8,
384+
reasoning: 'The response is relevant to the question',
385+
},
386+
accuracy: {
387+
score: 0.9,
388+
reasoning: 'The response is factually accurate',
389+
},
390+
helpfulness: {
391+
score: 0.7,
392+
reasoning: 'The response provides helpful information',
393+
},
370394
},
371395
success: true,
396+
judgeConfigKey: 'test-judge',
372397
});
373398

374399
expect(mockProvider.invokeStructuredModel).toHaveBeenCalledWith(

packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,7 @@ describe('agents method', () => {
372372

373373
expect(result).toEqual({
374374
'research-agent': {
375+
key: 'research-agent',
375376
model: {
376377
name: 'research-model',
377378
parameters: { temperature: 0.3, maxTokens: 2048 },
@@ -382,6 +383,7 @@ describe('agents method', () => {
382383
enabled: true,
383384
},
384385
'writing-agent': {
386+
key: 'writing-agent',
385387
model: {
386388
name: 'writing-model',
387389
parameters: { temperature: 0.7, maxTokens: 1024 },
@@ -482,6 +484,7 @@ describe('createJudge method', () => {
482484
};
483485

484486
const mockJudgeConfig = {
487+
key: 'test-judge',
485488
enabled: true,
486489
model: { name: 'gpt-4' },
487490
provider: { name: 'openai' },
@@ -525,6 +528,7 @@ describe('createJudge method', () => {
525528
};
526529

527530
const mockJudgeConfig = {
531+
key: 'test-judge',
528532
enabled: false,
529533
evaluationMetricKeys: [],
530534
};
@@ -548,6 +552,7 @@ describe('createJudge method', () => {
548552
};
549553

550554
const mockJudgeConfig = {
555+
key: 'test-judge',
551556
enabled: true,
552557
model: { name: 'gpt-4' },
553558
provider: { name: 'openai' },

packages/sdk/server-ai/__tests__/TrackedChat.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ describe('TrackedChat', () => {
3333

3434
// Create a basic AI config
3535
aiConfig = {
36+
key: 'test-config',
3637
enabled: true,
3738
messages: [{ role: 'system', content: 'You are a helpful assistant.' }],
3839
model: { name: 'gpt-4' },

packages/sdk/server-ai/src/LDAIClientImpl.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ export class LDAIClientImpl implements LDAIClient {
6363
this._logger?.warn(
6464
`AI Config mode mismatch for ${key}: expected ${mode}, got ${flagMode}. Returning disabled config.`,
6565
);
66-
return LDAIConfigUtils.createDisabledConfig(mode);
66+
return LDAIConfigUtils.createDisabledConfig(key, mode);
6767
}
6868

6969
const tracker = new LDAIConfigTrackerImpl(
@@ -78,7 +78,7 @@ export class LDAIClientImpl implements LDAIClient {
7878
context,
7979
);
8080

81-
const config = LDAIConfigUtils.fromFlagValue(value, tracker);
81+
const config = LDAIConfigUtils.fromFlagValue(key, value, tracker);
8282

8383
// Apply variable interpolation (always needed for ldctx)
8484
return this._applyInterpolation(config, context, variables);

packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { LDContext } from '@launchdarkly/js-server-sdk-common';
33
import { name as aiSdkName, version as aiSdkVersion } from '../package.json';
44
import { LDAIConfigTracker } from './api/config';
55
import { LDAIMetricSummary } from './api/config/LDAIConfigTracker';
6-
import { EvalScore } from './api/judge/types';
6+
import { EvalScore, JudgeResponse } from './api/judge/types';
77
import {
88
createBedrockTokenUsage,
99
createOpenAiUsage,
@@ -76,12 +76,22 @@ export class LDAIConfigTrackerImpl implements LDAIConfigTracker {
7676
}
7777

7878
trackEvalScores(scores: Record<string, EvalScore>) {
79-
// Track each evaluation score individually
8079
Object.entries(scores).forEach(([metricKey, evalScore]) => {
8180
this._ldClient.track(metricKey, this._context, this.getTrackData(), evalScore.score);
8281
});
8382
}
8483

84+
trackJudgeResponse(response: JudgeResponse) {
85+
Object.entries(response.evals).forEach(([metricKey, evalScore]) => {
86+
this._ldClient.track(
87+
metricKey,
88+
this._context,
89+
{ ...this.getTrackData(), judgeConfigKey: response.judgeConfigKey },
90+
evalScore.score,
91+
);
92+
});
93+
}
94+
8595
trackFeedback(feedback: { kind: LDFeedbackKind }): void {
8696
this._trackedMetrics.feedback = feedback;
8797
if (feedback.kind === LDFeedbackKind.Positive) {

packages/sdk/server-ai/src/api/chat/TrackedChat.ts

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -84,13 +84,17 @@ export class TrackedChat {
8484
return undefined;
8585
}
8686

87-
const evalResult = await judge.evaluateMessages(messages, response, judgeConfig.samplingRate);
88-
89-
if (evalResult && evalResult.success) {
90-
this.tracker.trackEvalScores(evalResult.evals);
87+
const judgeResponse = await judge.evaluateMessages(
88+
messages,
89+
response,
90+
judgeConfig.samplingRate,
91+
);
92+
93+
if (judgeResponse && judgeResponse.success) {
94+
this.tracker.trackJudgeResponse(judgeResponse);
9195
}
9296

93-
return evalResult;
97+
return judgeResponse;
9498
});
9599

96100
// ensure all evaluations complete even if some fail

packages/sdk/server-ai/src/api/config/LDAIConfigTracker.ts

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { EvalScore } from '../judge/types';
1+
import { EvalScore, JudgeResponse } from '../judge/types';
22
import { LDAIMetrics, LDFeedbackKind, LDTokenUsage } from '../metrics';
33

44
/**
@@ -94,6 +94,13 @@ export interface LDAIConfigTracker {
9494
*/
9595
trackEvalScores(scores: Record<string, EvalScore>): void;
9696

97+
/**
98+
* Track a judge response containing the evaluation scores and the judge configuration key.
99+
*
100+
* @param response Judge response containing the evaluation scores and the judge configuration key
101+
*/
102+
trackJudgeResponse(response: JudgeResponse): void;
103+
97104
/**
98105
* Track the duration of execution of the provided function.
99106
*

packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -82,19 +82,23 @@ export class LDAIConfigUtils {
8282
* @param tracker The tracker to add to the config
8383
* @returns The appropriate AI configuration type
8484
*/
85-
static fromFlagValue(flagValue: LDAIConfigFlagValue, tracker: LDAIConfigTracker): LDAIConfigKind {
85+
static fromFlagValue(
86+
key: string,
87+
flagValue: LDAIConfigFlagValue,
88+
tracker: LDAIConfigTracker,
89+
): LDAIConfigKind {
8690
// Determine the actual mode from flag value
8791
// eslint-disable-next-line no-underscore-dangle
8892
const flagValueMode = flagValue._ldMeta?.mode;
8993

9094
switch (flagValueMode) {
9195
case 'agent':
92-
return this.toAgentConfig(flagValue, tracker);
96+
return this.toAgentConfig(key, flagValue, tracker);
9397
case 'judge':
94-
return this.toJudgeConfig(flagValue, tracker);
98+
return this.toJudgeConfig(key, flagValue, tracker);
9599
case 'completion':
96100
default:
97-
return this.toCompletionConfig(flagValue, tracker);
101+
return this.toCompletionConfig(key, flagValue, tracker);
98102
}
99103
}
100104

@@ -104,15 +108,17 @@ export class LDAIConfigUtils {
104108
* @param mode The mode for the disabled config
105109
* @returns A disabled config of the appropriate type
106110
*/
107-
static createDisabledConfig(mode: LDAIConfigMode): LDAIConfigKind {
111+
static createDisabledConfig(key: string, mode: LDAIConfigMode): LDAIConfigKind {
108112
switch (mode) {
109113
case 'agent':
110114
return {
115+
key,
111116
enabled: false,
112117
tracker: undefined,
113118
} as LDAIAgentConfig;
114119
case 'judge':
115120
return {
121+
key,
116122
enabled: false,
117123
tracker: undefined,
118124
evaluationMetricKeys: [],
@@ -121,6 +127,7 @@ export class LDAIConfigUtils {
121127
default:
122128
// Default to completion config for completion mode or any unexpected mode
123129
return {
130+
key,
124131
enabled: false,
125132
tracker: undefined,
126133
} as LDAICompletionConfig;
@@ -133,8 +140,9 @@ export class LDAIConfigUtils {
133140
* @param flagValue The flag value from LaunchDarkly
134141
* @returns Base configuration object
135142
*/
136-
private static _toBaseConfig(flagValue: LDAIConfigFlagValue) {
143+
private static _toBaseConfig(key: string, flagValue: LDAIConfigFlagValue) {
137144
return {
145+
key,
138146
// eslint-disable-next-line no-underscore-dangle
139147
enabled: flagValue._ldMeta?.enabled ?? false,
140148
model: flagValue.model,
@@ -150,11 +158,12 @@ export class LDAIConfigUtils {
150158
* @returns A completion configuration
151159
*/
152160
static toCompletionConfig(
161+
key: string,
153162
flagValue: LDAIConfigFlagValue,
154163
tracker: LDAIConfigTracker,
155164
): LDAICompletionConfig {
156165
return {
157-
...this._toBaseConfig(flagValue),
166+
...this._toBaseConfig(key, flagValue),
158167
tracker,
159168
messages: flagValue.messages,
160169
judgeConfiguration: flagValue.judgeConfiguration,
@@ -169,11 +178,12 @@ export class LDAIConfigUtils {
169178
* @returns An agent configuration
170179
*/
171180
static toAgentConfig(
181+
key: string,
172182
flagValue: LDAIConfigFlagValue,
173183
tracker: LDAIConfigTracker,
174184
): LDAIAgentConfig {
175185
return {
176-
...this._toBaseConfig(flagValue),
186+
...this._toBaseConfig(key, flagValue),
177187
tracker,
178188
instructions: flagValue.instructions,
179189
judgeConfiguration: flagValue.judgeConfiguration,
@@ -188,11 +198,12 @@ export class LDAIConfigUtils {
188198
* @returns A judge configuration
189199
*/
190200
static toJudgeConfig(
201+
key: string,
191202
flagValue: LDAIConfigFlagValue,
192203
tracker: LDAIConfigTracker,
193204
): LDAIJudgeConfig {
194205
return {
195-
...this._toBaseConfig(flagValue),
206+
...this._toBaseConfig(key, flagValue),
196207
tracker,
197208
messages: flagValue.messages,
198209
evaluationMetricKeys: flagValue.evaluationMetricKeys || [],

packages/sdk/server-ai/src/api/config/types.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,10 @@ export interface LDAIConfigDefault {
9595
* Base AI Config interface without mode-specific fields.
9696
*/
9797
export interface LDAIConfig extends Omit<LDAIConfigDefault, 'enabled'> {
98+
/**
99+
* The key of the AI Config.
100+
*/
101+
key: string;
98102
/**
99103
* Whether the configuration is enabled.
100104
*/

packages/sdk/server-ai/src/api/judge/Judge.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,13 +91,15 @@ export class Judge {
9191
return {
9292
evals,
9393
success,
94+
judgeConfigKey: this._aiConfig.key,
9495
};
9596
} catch (error) {
9697
this._logger?.error('Judge evaluation failed:', error);
9798
return {
9899
evals: {},
99100
success: false,
100101
error: error instanceof Error ? error.message : 'Unknown error',
102+
judgeConfigKey: this._aiConfig.key,
101103
};
102104
}
103105
}

0 commit comments

Comments
 (0)