
Commit 59cc861

[OneChat] Evaluation error handling and judge prompt fixes (#232940)
Closes: elastic/search-team#10864

## Summary

Updates error handling in OneChat evaluations and improves the LLM judge prompt construction:

- Added retries on the `converse` API calls.
- Upon retry exhaustion, a default message is provided to the LLM judge.
- Fixed the user prompt template to inject the string form of the input, ground truth, and agent response.
- Other: removed the Criteria evaluator from OneChat evaluations (it was there as a first, dummy evaluator and is no longer needed).

## Testing

- Ran experiments with the new configuration:
  - Confirmed the user prompt for correctness analysis now looks correct:

    <img width="709" height="760" alt="image" src="https://github.com/user-attachments/assets/f35339b2-b833-4f14-9afd-c54152b699c7" />

  - Confirmed the default error message propagates to the LLM judge:

    <img width="709" height="760" alt="image" src="https://github.com/user-attachments/assets/bc3e8703-6991-44ca-a58b-4694857a54df" />
1 parent: b665bd5 · commit: 59cc861

File tree: 3 files changed (+62, −42 lines)

x-pack/platform/packages/shared/kbn-evals/src/evaluators/correctness/index.ts

Lines changed: 8 additions & 3 deletions
```diff
@@ -27,12 +27,17 @@ export function createCorrectnessAnalysisEvaluator({
   return {
     evaluate: async ({ input, output, expected }) => {
       async function runCorrectnessAnalysis(): Promise<CorrectnessAnalysis> {
+        const userQuery = input.question;
+        const messages = (output as any)?.messages ?? [];
+        const latestMessage = messages[messages.length - 1]?.message;
+        const groundTruthResponse = expected?.expected;
+
         const response = await inferenceClient.prompt({
           prompt: LlmCorrectnessEvaluationPrompt,
           input: {
-            user_query: JSON.stringify(input),
-            agent_response: JSON.stringify(output),
-            ground_truth_response: JSON.stringify(expected),
+            user_query: `${userQuery}`,
+            agent_response: `${latestMessage}`,
+            ground_truth_response: `${groundTruthResponse}`,
           },
         });
 
```
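For readers skimming the diff: the point of this change is that the judge prompt now receives plain strings instead of escaped JSON envelopes. A minimal TypeScript sketch of the difference, using hypothetical values and the object shapes inferred from the diff above (not the evaluator code itself):

```ts
// Sketch only; input/output/expected shapes are inferred from the diff,
// and the values are hypothetical.
const input = { question: 'What is an index?' };
const output = { messages: [{ message: 'An index is a collection of documents.' }] };
const expected = { expected: 'An index is a collection of documents.' };

// Before: the judge prompt received escaped JSON envelopes.
console.log(JSON.stringify(input)); // {"question":"What is an index?"}

// After: it receives the plain strings a judge should actually read.
const userQuery = input.question;
const latestMessage = output.messages[output.messages.length - 1]?.message;
console.log(`${userQuery}`); // What is an index?
console.log(`${latestMessage}`); // An index is a collection of documents.
```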

x-pack/platform/packages/shared/onechat/kbn-evals-suite-onechat/src/chat_client.ts

Lines changed: 44 additions & 8 deletions
```diff
@@ -8,22 +8,23 @@
 import type { ToolingLog } from '@kbn/tooling-log';
 import type { HttpHandler } from '@kbn/core/public';
 import { oneChatDefaultAgentId } from '@kbn/onechat-common';
+import pRetry from 'p-retry';
 
-type StringOrMessageList = string;
+type Messages = { message: string }[];
 
 interface Options {
   agentId?: string;
 }
 
 interface ConverseFunctionParams {
-  messages: StringOrMessageList;
+  messages: Messages;
   conversationId?: string;
   options?: Options;
 }
 
 type ConverseFunction = (params: ConverseFunctionParams) => Promise<{
   conversationId?: string;
-  messages: string[];
+  messages: Messages;
   errors: any[];
 }>;
 
@@ -39,7 +40,11 @@ export class OnechatEvaluationChatClient {
 
     const { agentId = oneChatDefaultAgentId } = options;
 
-    try {
+    const callConverseApi = async (): Promise<{
+      conversationId?: string;
+      messages: { message: string }[];
+      errors: any[];
+    }> => {
       // Use the non-async OneChat API endpoint
       const response = await this.fetch('/api/chat/converse', {
         method: 'POST',
@@ -48,7 +53,7 @@
           agent_id: agentId,
           connector_id: this.connectorId,
           conversation_id: conversationId,
-          input: messages,
+          input: messages[messages.length - 1].message,
         }),
       });
 
@@ -57,21 +62,52 @@
         conversation_id: string;
         trace_id?: string;
         steps: any[];
-        response: string;
+        response: { message: string };
       };
       const { conversation_id: conversationIdFromResponse, response: latestResponse } =
         chatResponse;
 
       return {
         conversationId: conversationIdFromResponse,
-        messages: [messages, latestResponse],
+        messages: [...messages, latestResponse],
         errors: [],
       };
+    };
+
+    try {
+      return await pRetry(callConverseApi, {
+        retries: 2,
+        minTimeout: 2000,
+        onFailedAttempt: (error) => {
+          const isLastAttempt = error.attemptNumber === error.retriesLeft + error.attemptNumber;
+
+          if (isLastAttempt) {
+            this.log.error(
+              new Error(`Failed to call converse API after ${error.attemptNumber} attempts`, {
+                cause: error,
+              })
+            );
+            throw error;
+          } else {
+            this.log.warning(
+              new Error(`Converse API call failed on attempt ${error.attemptNumber}; retrying...`, {
+                cause: error,
+              })
+            );
+          }
+        },
+      });
     } catch (error) {
       this.log.error('Error occurred while calling converse API');
       return {
         conversationId,
-        messages: [messages],
+        messages: [
+          ...messages,
+          {
+            message:
+              'This question could not be answered as an internal error occurred. Please try again.',
+          },
+        ],
         errors: [
           {
             error: {
```
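A note on the retry wiring: with `p-retry`, `retries: 2` allows up to three attempts in total, and `minTimeout: 2000` is the base backoff delay in milliseconds. The `isLastAttempt` expression in the diff reduces algebraically to `error.retriesLeft === 0`. A standalone sketch of the behavior; `flakyCall` and `main` are hypothetical stand-ins, not part of the commit:

```ts
import pRetry from 'p-retry';

// Hypothetical stand-in for the converse API call; fails most of the time.
async function flakyCall(): Promise<string> {
  if (Math.random() < 0.7) {
    throw new Error('transient failure');
  }
  return 'ok';
}

async function main() {
  const result = await pRetry(flakyCall, {
    retries: 2, // up to 3 attempts in total
    minTimeout: 2000, // base backoff delay in ms
    onFailedAttempt: (error) => {
      // attemptNumber counts from 1; retriesLeft reaches 0 on the final
      // attempt, which is what the commit's isLastAttempt expression detects.
      console.warn(`attempt ${error.attemptNumber} failed, ${error.retriesLeft} retries left`);
    },
  });
  console.log(result);
}

main().catch(console.error);
```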

x-pack/platform/packages/shared/onechat/kbn-evals-suite-onechat/src/evaluate_dataset.ts

Lines changed: 10 additions & 31 deletions
```diff
@@ -42,7 +42,7 @@ export function createEvaluateDataset({
   phoenixClient: KibanaPhoenixClient;
   chatClient: OnechatEvaluationChatClient;
 }): EvaluateDataset {
-  return async function evaluateEsqlDataset({
+  return async function evaluateDataset({
     dataset: { name, description, examples },
   }: {
     dataset: {
@@ -62,20 +62,17 @@
        dataset,
        task: async ({ input, output, metadata }) => {
          const response = await chatClient.converse({
-            messages: input.question,
+            messages: [{ message: input.question }],
          });
 
          // Running correctness evaluator as part of the task since quantitative correctness evaluators need its output
-          let correctnessAnalysis = null;
-          if (!response.errors?.length) {
-            const correctnessResult = await evaluators.correctnessAnalysis().evaluate({
-              input,
-              expected: output,
-              output: response,
-              metadata,
-            });
-            correctnessAnalysis = correctnessResult.metadata;
-          }
+          const correctnessResult = await evaluators.correctnessAnalysis().evaluate({
+            input,
+            expected: output,
+            output: response,
+            metadata,
+          });
+          const correctnessAnalysis = correctnessResult.metadata;
 
          return {
            errors: response.errors,
@@ -84,25 +81,7 @@
          };
        },
      },
-      [
-        {
-          name: 'Criteria',
-          kind: 'LLM',
-          evaluate: async ({ input, output, expected, metadata }) => {
-            const result = await evaluators
-              .criteria([`The response contains the following information: ${expected.expected}`])
-              .evaluate({
-                input,
-                expected,
-                output,
-                metadata,
-              });
-
-            return result;
-          },
-        },
-        ...createQuantitativeCorrectnessEvaluators(),
-      ]
+      createQuantitativeCorrectnessEvaluators()
    );
  };
}
```
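Why the `if (!response.errors?.length)` guard could be dropped: the chat client now guarantees that `converse` resolves with a trailing message even when every retry fails, so the correctness analysis always has something to grade (and will grade the fallback text as incorrect rather than being skipped). A hedged sketch of that contract, with a mock in place of the real client; all names here are hypothetical:

```ts
// Mock illustrating the contract the task now relies on (not real code).
type Messages = { message: string }[];

async function converseOrFallback(
  call: () => Promise<Messages>,
  messages: Messages
): Promise<{ messages: Messages; errors: any[] }> {
  try {
    return { messages: await call(), errors: [] };
  } catch (error) {
    // Mirror of the client's catch branch: append the default message so the
    // LLM judge always receives an agent response to grade.
    return {
      messages: [
        ...messages,
        {
          message:
            'This question could not be answered as an internal error occurred. Please try again.',
        },
      ],
      errors: [{ error }],
    };
  }
}
```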
