From 5a4258aaba25120a3fbcda443f509f5b039d87e0 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Thu, 6 Nov 2025 22:34:42 +0000 Subject: [PATCH 1/2] fix: Include the AI Judge Config key with tracked metrics --- packages/sdk/server-ai/src/LDAIClientImpl.ts | 4 +-- .../server-ai/src/LDAIConfigTrackerImpl.ts | 7 ++++- .../src/api/config/LDAIConfigUtils.ts | 29 +++++++++++++------ .../sdk/server-ai/src/api/config/types.ts | 4 +++ packages/sdk/server-ai/src/api/judge/Judge.ts | 1 + packages/sdk/server-ai/src/api/judge/types.ts | 2 ++ 6 files changed, 35 insertions(+), 12 deletions(-) diff --git a/packages/sdk/server-ai/src/LDAIClientImpl.ts b/packages/sdk/server-ai/src/LDAIClientImpl.ts index 54cb054fe..6955e1a05 100644 --- a/packages/sdk/server-ai/src/LDAIClientImpl.ts +++ b/packages/sdk/server-ai/src/LDAIClientImpl.ts @@ -63,7 +63,7 @@ export class LDAIClientImpl implements LDAIClient { this._logger?.warn( `AI Config mode mismatch for ${key}: expected ${mode}, got ${flagMode}. Returning disabled config.`, ); - return LDAIConfigUtils.createDisabledConfig(mode); + return LDAIConfigUtils.createDisabledConfig(key, mode); } const tracker = new LDAIConfigTrackerImpl( @@ -78,7 +78,7 @@ export class LDAIClientImpl implements LDAIClient { context, ); - const config = LDAIConfigUtils.fromFlagValue(value, tracker); + const config = LDAIConfigUtils.fromFlagValue(key, value, tracker); // Apply variable interpolation (always needed for ldctx) return this._applyInterpolation(config, context, variables); diff --git a/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts b/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts index f4e6624a7..5dcdf4405 100644 --- a/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts +++ b/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts @@ -78,7 +78,12 @@ export class LDAIConfigTrackerImpl implements LDAIConfigTracker { trackEvalScores(scores: Record) { // Track each evaluation score individually Object.entries(scores).forEach(([metricKey, evalScore]) => { - this._ldClient.track(metricKey, this._context, this.getTrackData(), evalScore.score); + this._ldClient.track( + metricKey, + this._context, + { ...this.getTrackData(), judgeConfigKey: evalScore.judgeConfigKey }, + evalScore.score, + ); }); } diff --git a/packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts b/packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts index b03ee2882..cd943be6c 100644 --- a/packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts +++ b/packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts @@ -82,19 +82,23 @@ export class LDAIConfigUtils { * @param tracker The tracker to add to the config * @returns The appropriate AI configuration type */ - static fromFlagValue(flagValue: LDAIConfigFlagValue, tracker: LDAIConfigTracker): LDAIConfigKind { + static fromFlagValue( + key: string, + flagValue: LDAIConfigFlagValue, + tracker: LDAIConfigTracker, + ): LDAIConfigKind { // Determine the actual mode from flag value // eslint-disable-next-line no-underscore-dangle const flagValueMode = flagValue._ldMeta?.mode; switch (flagValueMode) { case 'agent': - return this.toAgentConfig(flagValue, tracker); + return this.toAgentConfig(key, flagValue, tracker); case 'judge': - return this.toJudgeConfig(flagValue, tracker); + return this.toJudgeConfig(key, flagValue, tracker); case 'completion': default: - return this.toCompletionConfig(flagValue, tracker); + return this.toCompletionConfig(key, flagValue, tracker); } } @@ -104,15 +108,17 @@ export class LDAIConfigUtils { * @param mode The mode for the disabled config * @returns A disabled config of the appropriate type */ - static createDisabledConfig(mode: LDAIConfigMode): LDAIConfigKind { + static createDisabledConfig(key: string, mode: LDAIConfigMode): LDAIConfigKind { switch (mode) { case 'agent': return { + key, enabled: false, tracker: undefined, } as LDAIAgentConfig; case 'judge': return { + key, enabled: false, tracker: undefined, evaluationMetricKeys: [], @@ -121,6 +127,7 @@ export class LDAIConfigUtils { default: // Default to completion config for completion mode or any unexpected mode return { + key, enabled: false, tracker: undefined, } as LDAICompletionConfig; @@ -133,8 +140,9 @@ export class LDAIConfigUtils { * @param flagValue The flag value from LaunchDarkly * @returns Base configuration object */ - private static _toBaseConfig(flagValue: LDAIConfigFlagValue) { + private static _toBaseConfig(key: string, flagValue: LDAIConfigFlagValue) { return { + key, // eslint-disable-next-line no-underscore-dangle enabled: flagValue._ldMeta?.enabled ?? false, model: flagValue.model, @@ -150,11 +158,12 @@ export class LDAIConfigUtils { * @returns A completion configuration */ static toCompletionConfig( + key: string, flagValue: LDAIConfigFlagValue, tracker: LDAIConfigTracker, ): LDAICompletionConfig { return { - ...this._toBaseConfig(flagValue), + ...this._toBaseConfig(key, flagValue), tracker, messages: flagValue.messages, judgeConfiguration: flagValue.judgeConfiguration, @@ -169,11 +178,12 @@ export class LDAIConfigUtils { * @returns An agent configuration */ static toAgentConfig( + key: string, flagValue: LDAIConfigFlagValue, tracker: LDAIConfigTracker, ): LDAIAgentConfig { return { - ...this._toBaseConfig(flagValue), + ...this._toBaseConfig(key, flagValue), tracker, instructions: flagValue.instructions, judgeConfiguration: flagValue.judgeConfiguration, @@ -188,11 +198,12 @@ export class LDAIConfigUtils { * @returns A judge configuration */ static toJudgeConfig( + key: string, flagValue: LDAIConfigFlagValue, tracker: LDAIConfigTracker, ): LDAIJudgeConfig { return { - ...this._toBaseConfig(flagValue), + ...this._toBaseConfig(key, flagValue), tracker, messages: flagValue.messages, evaluationMetricKeys: flagValue.evaluationMetricKeys || [], diff --git a/packages/sdk/server-ai/src/api/config/types.ts b/packages/sdk/server-ai/src/api/config/types.ts index d6682676e..ade099037 100644 --- a/packages/sdk/server-ai/src/api/config/types.ts +++ b/packages/sdk/server-ai/src/api/config/types.ts @@ -95,6 +95,10 @@ export interface LDAIConfigDefault { * Base AI Config interface without mode-specific fields. */ export interface LDAIConfig extends Omit { + /** + * The key of the AI Config. + */ + key: string; /** * Whether the configuration is enabled. */ diff --git a/packages/sdk/server-ai/src/api/judge/Judge.ts b/packages/sdk/server-ai/src/api/judge/Judge.ts index 581a36d57..3762519fe 100644 --- a/packages/sdk/server-ai/src/api/judge/Judge.ts +++ b/packages/sdk/server-ai/src/api/judge/Judge.ts @@ -208,6 +208,7 @@ export class Judge { results[metricKey] = { score: evalData.score, reasoning: evalData.reasoning, + judgeConfigKey: this._aiConfig.key, }; }); diff --git a/packages/sdk/server-ai/src/api/judge/types.ts b/packages/sdk/server-ai/src/api/judge/types.ts index a265506b1..ea61f8bca 100644 --- a/packages/sdk/server-ai/src/api/judge/types.ts +++ b/packages/sdk/server-ai/src/api/judge/types.ts @@ -24,6 +24,8 @@ export interface EvalScore { score: number; /** Reasoning behind the provided score for this metric */ reasoning: string; + /** The key of the judge configuration that was used to evaluate this metric */ + judgeConfigKey?: string; } /** From 6cf398039196facc6a44792834e9f5a3da1398da Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Thu, 6 Nov 2025 23:25:37 +0000 Subject: [PATCH 2/2] fix unit tests --- .../sdk/server-ai/__tests__/Judge.test.ts | 49 ++++++++++++++----- .../__tests__/LDAIClientImpl.test.ts | 5 ++ .../server-ai/__tests__/TrackedChat.test.ts | 1 + 3 files changed, 43 insertions(+), 12 deletions(-) diff --git a/packages/sdk/server-ai/__tests__/Judge.test.ts b/packages/sdk/server-ai/__tests__/Judge.test.ts index ed04f1fc4..23efd3eec 100644 --- a/packages/sdk/server-ai/__tests__/Judge.test.ts +++ b/packages/sdk/server-ai/__tests__/Judge.test.ts @@ -39,6 +39,7 @@ describe('Judge', () => { // Create a basic judge config judgeConfig = { + key: 'test-judge', enabled: true, messages: [ { role: 'system', content: 'You are a helpful judge that evaluates AI responses.' }, @@ -106,9 +107,21 @@ describe('Judge', () => { expect(result).toEqual({ evals: { - relevance: { score: 0.8, reasoning: 'The response is relevant to the question' }, - accuracy: { score: 0.9, reasoning: 'The response is factually accurate' }, - helpfulness: { score: 0.7, reasoning: 'The response provides helpful information' }, + relevance: { + score: 0.8, + reasoning: 'The response is relevant to the question', + judgeConfigKey: 'test-judge', + }, + accuracy: { + score: 0.9, + reasoning: 'The response is factually accurate', + judgeConfigKey: 'test-judge', + }, + helpfulness: { + score: 0.7, + reasoning: 'The response provides helpful information', + judgeConfigKey: 'test-judge', + }, }, success: true, }); @@ -254,8 +267,8 @@ describe('Judge', () => { // When one metric is missing, it returns the partial evals it has with success: false expect(result).toEqual({ evals: { - relevance: { score: 0.8, reasoning: 'Good' }, - helpfulness: { score: 0.7, reasoning: 'Helpful' }, + relevance: { score: 0.8, reasoning: 'Good', judgeConfigKey: 'test-judge' }, + helpfulness: { score: 0.7, reasoning: 'Helpful', judgeConfigKey: 'test-judge' }, }, success: false, }); @@ -364,9 +377,21 @@ describe('Judge', () => { expect(result).toEqual({ evals: { - relevance: { score: 0.8, reasoning: 'The response is relevant to the question' }, - accuracy: { score: 0.9, reasoning: 'The response is factually accurate' }, - helpfulness: { score: 0.7, reasoning: 'The response provides helpful information' }, + relevance: { + score: 0.8, + reasoning: 'The response is relevant to the question', + judgeConfigKey: 'test-judge', + }, + accuracy: { + score: 0.9, + reasoning: 'The response is factually accurate', + judgeConfigKey: 'test-judge', + }, + helpfulness: { + score: 0.7, + reasoning: 'The response provides helpful information', + judgeConfigKey: 'test-judge', + }, }, success: true, }); @@ -454,9 +479,9 @@ describe('Judge', () => { const result = parseResponse(responseData); expect(result).toEqual({ - relevance: { score: 0.8, reasoning: 'Good' }, - accuracy: { score: 0.9, reasoning: 'Accurate' }, - helpfulness: { score: 0.7, reasoning: 'Helpful' }, + relevance: { score: 0.8, reasoning: 'Good', judgeConfigKey: 'test-judge' }, + accuracy: { score: 0.9, reasoning: 'Accurate', judgeConfigKey: 'test-judge' }, + helpfulness: { score: 0.7, reasoning: 'Helpful', judgeConfigKey: 'test-judge' }, }); }); @@ -489,7 +514,7 @@ describe('Judge', () => { // Only helpfulness passes validation, relevance and accuracy are skipped expect(result).toEqual({ - helpfulness: { score: 0.7, reasoning: 'Helpful' }, + helpfulness: { score: 0.7, reasoning: 'Helpful', judgeConfigKey: 'test-judge' }, }); }); }); diff --git a/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts b/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts index 2f99ed3a4..bfb5e13ff 100644 --- a/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts +++ b/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts @@ -372,6 +372,7 @@ describe('agents method', () => { expect(result).toEqual({ 'research-agent': { + key: 'research-agent', model: { name: 'research-model', parameters: { temperature: 0.3, maxTokens: 2048 }, @@ -382,6 +383,7 @@ describe('agents method', () => { enabled: true, }, 'writing-agent': { + key: 'writing-agent', model: { name: 'writing-model', parameters: { temperature: 0.7, maxTokens: 1024 }, @@ -482,6 +484,7 @@ describe('createJudge method', () => { }; const mockJudgeConfig = { + key: 'test-judge', enabled: true, model: { name: 'gpt-4' }, provider: { name: 'openai' }, @@ -525,6 +528,7 @@ describe('createJudge method', () => { }; const mockJudgeConfig = { + key: 'test-judge', enabled: false, evaluationMetricKeys: [], }; @@ -548,6 +552,7 @@ describe('createJudge method', () => { }; const mockJudgeConfig = { + key: 'test-judge', enabled: true, model: { name: 'gpt-4' }, provider: { name: 'openai' }, diff --git a/packages/sdk/server-ai/__tests__/TrackedChat.test.ts b/packages/sdk/server-ai/__tests__/TrackedChat.test.ts index e70835e4f..d750a47e6 100644 --- a/packages/sdk/server-ai/__tests__/TrackedChat.test.ts +++ b/packages/sdk/server-ai/__tests__/TrackedChat.test.ts @@ -33,6 +33,7 @@ describe('TrackedChat', () => { // Create a basic AI config aiConfig = { + key: 'test-config', enabled: true, messages: [{ role: 'system', content: 'You are a helpful assistant.' }], model: { name: 'gpt-4' },