feat: Automatically judge chat results based on AI Config

jsonbailey · jsonbailey · commit 2bf8872064af · 2025-10-31T21:23:55.000Z
diff --git a/packages/sdk/server-ai/src/LDAIClientImpl.ts b/packages/sdk/server-ai/src/LDAIClientImpl.ts
@@ -217,8 +217,25 @@ export class LDAIClientImpl implements LDAIClient {
       return undefined;
     }
 
-    // Create the TrackedChat instance with the provider
-    return new TrackedChat(config, config.tracker, provider);
+    // Initialize judges if they are configured
+    const judges: Record<string, Judge> = {};
+    if (config.judgeConfiguration?.judges) {
+      for (const judgeConfig of config.judgeConfiguration.judges) {
+        const judge = await this.initJudge(
+          judgeConfig.key,
+          context,
+          { enabled: false },
+          variables,
+          defaultAiProvider,
+        );
+        if (judge) {
+          judges[judgeConfig.key] = judge;
+        }
+      }
+    }
+
+    // Create the TrackedChat instance with the provider, judges, and logger
+    return new TrackedChat(config, config.tracker, provider, judges, this._logger);
   }
 
   async initJudge(
@@ -232,7 +249,19 @@ export class LDAIClientImpl implements LDAIClient {
     this._ldClient.track(TRACK_JUDGE_INIT, context, key, 1);
 
     try {
-      // Add standard judge variables to incoming variables
+      // Log a warning for each reserved key present in the incoming variables
+      if (variables?.message_history !== undefined) {
+        this._logger?.warn(
+          "The variable 'message_history' is reserved by the judge and will be ignored."
+        );
+      }
+      if (variables?.response_to_evaluate !== undefined) {
+        this._logger?.warn(
+          "The variable 'response_to_evaluate' is reserved by the judge and will be ignored."
+        );
+      }
+
+      // Add the standard judge variables, overwriting any incoming variables with the same keys
       const extendedVariables = {
         ...variables,
         message_history: '{{message_history}}',
diff --git a/packages/sdk/server-ai/src/api/chat/TrackedChat.ts b/packages/sdk/server-ai/src/api/chat/TrackedChat.ts
@@ -1,5 +1,9 @@
+import { LDLogger } from '@launchdarkly/js-server-sdk-common';
+
 import { LDAIConfigTracker } from '../config/LDAIConfigTracker';
 import { LDAIConversationConfig, LDMessage } from '../config/types';
+import { Judge } from '../judge/Judge';
+import { JudgeResponse } from '../judge/types';
 import { AIProvider } from '../providers/AIProvider';
 import { ChatResponse } from './types';
 
@@ -11,13 +15,19 @@ import { ChatResponse } from './types';
  */
 export class TrackedChat {
   protected messages: LDMessage[];
+  protected judges: Record<string, Judge>;
+  private readonly _logger?: LDLogger;
 
   constructor(
     protected readonly aiConfig: LDAIConversationConfig,
     protected readonly tracker: LDAIConfigTracker,
     protected readonly provider: AIProvider,
+    judges?: Record<string, Judge>,
+    logger?: LDLogger,
   ) {
     this.messages = [];
+    this.judges = judges || {};
+    this._logger = logger;
   }
 
   /**
@@ -45,9 +55,63 @@ export class TrackedChat {
     // Add the assistant response to the conversation history
     this.messages.push(response.message);
 
+    // Start judge evaluations if configured
+    if (
+      this.aiConfig.judgeConfiguration?.judges &&
+      this.aiConfig.judgeConfiguration.judges.length > 0
+    ) {
+      response.evaluations = this._evaluateWithJudges(this.messages, response);
+    }
+
     return response;
   }
 
+  /**
+   * Evaluates the response with every configured judge, in parallel.
+   * A missing judge or a failed evaluation yields `undefined` in its slot;
+   * failures are logged instead of being dropped silently.
+   *
+   * @param messages Array of messages representing the conversation history
+   * @param response The AI response to be evaluated
+   * @returns Promise resolving to one result per configured judge, in order
+   */
+  private async _evaluateWithJudges(
+    messages: LDMessage[],
+    response: ChatResponse,
+  ): Promise<Array<JudgeResponse | undefined>> {
+    const judgeConfigs = this.aiConfig.judgeConfiguration?.judges ?? [];
+
+    // Each evaluation contains its own failure, so Promise.all is not expected
+    // to reject and one failing judge never discards the results of the others.
+    return Promise.all(
+      judgeConfigs.map(async (judgeConfig) => {
+        try {
+          const judge = this.judges[judgeConfig.key];
+          if (!judge) {
+            this._logger?.warn(
+              `Judge configuration is not enabled: ${judgeConfig.key}`,
+              this.tracker.getTrackData(),
+            );
+            return undefined;
+          }
+
+          const evalResult = await judge.evaluateMessages(
+            messages,
+            response,
+            judgeConfig.samplingRate,
+          );
+          if (evalResult?.success) {
+            this.tracker.trackEvalScores(evalResult.evals);
+          }
+          return evalResult;
+        } catch (error) {
+          this._logger?.warn(`Judge evaluation failed: ${judgeConfig.key}`, error);
+          return undefined;
+        }
+      }),
+    );
+  }
+
   /**
    * Get the underlying AI configuration used to initialize this TrackedChat.
    */
@@ -70,6 +134,14 @@ export class TrackedChat {
     return this.provider;
   }
 
+  /**
+   * Accessor for the judges of this TrackedChat; the live record, keyed by configuration key.
+   */
+  getJudges(): Record<string, Judge> {
+    const { judges } = this;
+    return judges;
+  }
+
   /**
    * Append messages to the conversation history.
    * Adds messages to the conversation history without invoking the model,
diff --git a/packages/sdk/server-ai/src/api/chat/types.ts b/packages/sdk/server-ai/src/api/chat/types.ts
@@ -1,4 +1,5 @@
 import { LDMessage } from '../config/types';
+import { JudgeResponse } from '../judge/types';
 import { LDAIMetrics } from '../metrics/LDAIMetrics';
 
 /**
@@ -14,4 +15,10 @@ export interface ChatResponse {
    * Metrics information including success status and token usage.
    */
   metrics: LDAIMetrics;
+
+  /**
+   * Promise that resolves to judge evaluation results.
+   * Only present when judges are configured for evaluation.
+   */
+  evaluations?: Promise<Array<JudgeResponse | undefined>>;
 }
diff --git a/packages/sdk/server-ai/src/api/config/types.ts b/packages/sdk/server-ai/src/api/config/types.ts
@@ -114,7 +114,7 @@ export interface LDAIJudgeConfigDefault extends LDAIConfigDefault {
    * Evaluation metric keys for judge configurations.
    * The keys of the metrics that this judge can evaluate.
    */
-  evaluationMetricKeys: string[];
+  evaluationMetricKeys?: string[];
 }
 
 /**

Original file line number	Diff line number	Diff line change
`@@ -114,7 +114,7 @@ export interface LDAIJudgeConfigDefault extends LDAIConfigDefault {`
`114`	`114`	`* Evaluation metric keys for judge configurations.`
`115`	`115`	`* The keys of the metrics that this judge can evaluate.`
`116`	`116`	`*/`
`117`		`- evaluationMetricKeys: string[];`
	`117`	`+ evaluationMetricKeys?: string[];`
`118`	`118`	`}`
`119`	`119`
`120`	`120`	`/**`