
Commit 59cc861

[OneChat] Evaluation error handling and judge prompt fixes (#232940)
Closes: elastic/search-team#10864

## Summary

Updates error handling in OneChat evaluations and improves the LLM judge prompt construction:

- Added retries on the `converse` API calls.
- Upon retry exhaustion, a default message is provided to the LLM judge.
- Fixed the user prompt template to inject the string form of the input, ground truth, and agent response.
- Other: removed the Criteria evaluator from OneChat evaluations (it was there as a first, dummy evaluator and is no longer needed).

## Testing

- Ran experiments with the new configuration:
  - Confirmed the user prompt for correctness analysis now looks correct:

    <img width="709" height="760" alt="image" src="https://github.com/user-attachments/assets/f35339b2-b833-4f14-9afd-c54152b699c7" />

  - Confirmed the default error message propagates to the LLM judge:

    <img width="709" height="760" alt="image" src="https://github.com/user-attachments/assets/bc3e8703-6991-44ca-a58b-4694857a54df" />
1 parent: b665bd5 · commit: 59cc861

File tree: 3 files changed (+62, −42 lines)

x-pack/platform/packages/shared/kbn-evals/src/evaluators/correctness/index.ts

Lines changed: 8 additions & 3 deletions
```diff
@@ -27,12 +27,17 @@ export function createCorrectnessAnalysisEvaluator({
   return {
     evaluate: async ({ input, output, expected }) => {
       async function runCorrectnessAnalysis(): Promise<CorrectnessAnalysis> {
+        const userQuery = input.question;
+        const messages = (output as any)?.messages ?? [];
+        const latestMessage = messages[messages.length - 1]?.message;
+        const groundTruthResponse = expected?.expected;
+
         const response = await inferenceClient.prompt({
           prompt: LlmCorrectnessEvaluationPrompt,
           input: {
-            user_query: JSON.stringify(input),
-            agent_response: JSON.stringify(output),
-            ground_truth_response: JSON.stringify(expected),
+            user_query: `${userQuery}`,
+            agent_response: `${latestMessage}`,
+            ground_truth_response: `${groundTruthResponse}`,
           },
         });
 
```
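For readers skimming the diff: the point of this change is that the judge prompt now receives plain strings instead of escaped JSON envelopes. A minimal TypeScript sketch of the difference, using hypothetical values and the object shapes inferred from the diff above (not the evaluator code itself):

```ts
// Sketch only; input/output/expected shapes are inferred from the diff,
// and the values are hypothetical.
const input = { question: 'What is an index?' };
const output = { messages: [{ message: 'An index is a collection of documents.' }] };
const expected = { expected: 'An index is a collection of documents.' };

// Before: the judge prompt received escaped JSON envelopes.
console.log(JSON.stringify(input)); // {"question":"What is an index?"}

// After: it receives the plain strings a judge should actually read.
const userQuery = input.question;
const latestMessage = output.messages[output.messages.length - 1]?.message;
console.log(`${userQuery}`); // What is an index?
console.log(`${latestMessage}`); // An index is a collection of documents.
```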

x-pack/platform/packages/shared/onechat/kbn-evals-suite-onechat/src/chat_client.ts

Lines changed: 44 additions & 8 deletions
```diff
@@ -8,22 +8,23 @@
 import type { ToolingLog } from '@kbn/tooling-log';
 import type { HttpHandler } from '@kbn/core/public';
 import { oneChatDefaultAgentId } from '@kbn/onechat-common';
+import pRetry from 'p-retry';
 
-type StringOrMessageList = string;
+type Messages = { message: string }[];
 
 interface Options {
   agentId?: string;
 }
 
 interface ConverseFunctionParams {
-  messages: StringOrMessageList;
+  messages: Messages;
   conversationId?: string;
   options?: Options;
 }
 
 type ConverseFunction = (params: ConverseFunctionParams) => Promise<{
   conversationId?: string;
-  messages: string[];
+  messages: Messages;
   errors: any[];
 }>;
 
@@ -39,7 +40,11 @@ export class OnechatEvaluationChatClient {
 
     const { agentId = oneChatDefaultAgentId } = options;
 
-    try {
+    const callConverseApi = async (): Promise<{
+      conversationId?: string;
+      messages: { message: string }[];
+      errors: any[];
+    }> => {
       // Use the non-async OneChat API endpoint
       const response = await this.fetch('/api/chat/converse', {
         method: 'POST',
@@ -48,7 +53,7 @@
           agent_id: agentId,
           connector_id: this.connectorId,
           conversation_id: conversationId,
-          input: messages,
+          input: messages[messages.length - 1].message,
         }),
       });
 
@@ -57,21 +62,52 @@
         conversation_id: string;
         trace_id?: string;
         steps: any[];
-        response: string;
+        response: { message: string };
       };
       const { conversation_id: conversationIdFromResponse, response: latestResponse } =
         chatResponse;
 
       return {
         conversationId: conversationIdFromResponse,
-        messages: [messages, latestResponse],
+        messages: [...messages, latestResponse],
         errors: [],
       };
+    };
+
+    try {
+      return await pRetry(callConverseApi, {
+        retries: 2,
+        minTimeout: 2000,
+        onFailedAttempt: (error) => {
+          const isLastAttempt = error.attemptNumber === error.retriesLeft + error.attemptNumber;
+
+          if (isLastAttempt) {
+            this.log.error(
+              new Error(`Failed to call converse API after ${error.attemptNumber} attempts`, {
+                cause: error,
+              })
+            );
+            throw error;
+          } else {
+            this.log.warning(
+              new Error(`Converse API call failed on attempt ${error.attemptNumber}; retrying...`, {
+                cause: error,
+              })
+            );
+          }
+        },
+      });
     } catch (error) {
       this.log.error('Error occurred while calling converse API');
       return {
         conversationId,
-        messages: [messages],
+        messages: [
+          ...messages,
+          {
+            message:
+              'This question could not be answered as an internal error occurred. Please try again.',
+          },
+        ],
         errors: [
           {
             error: {
```
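A note on the retry wiring: with `p-retry`, `retries: 2` allows up to three attempts in total, and `minTimeout: 2000` is the base backoff delay in milliseconds. The `isLastAttempt` expression in the diff reduces algebraically to `error.retriesLeft === 0`. A standalone sketch of the behavior; `flakyCall` and `main` are hypothetical stand-ins, not part of the commit:

```ts
import pRetry from 'p-retry';

// Hypothetical stand-in for the converse API call; fails most of the time.
async function flakyCall(): Promise<string> {
  if (Math.random() < 0.7) {
    throw new Error('transient failure');
  }
  return 'ok';
}

async function main() {
  const result = await pRetry(flakyCall, {
    retries: 2, // up to 3 attempts in total
    minTimeout: 2000, // base backoff delay in ms
    onFailedAttempt: (error) => {
      // attemptNumber counts from 1; retriesLeft reaches 0 on the final
      // attempt, which is what the commit's isLastAttempt expression detects.
      console.warn(`attempt ${error.attemptNumber} failed, ${error.retriesLeft} retries left`);
    },
  });
  console.log(result);
}

main().catch(console.error);
```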

x-pack/platform/packages/shared/onechat/kbn-evals-suite-onechat/src/evaluate_dataset.ts

Lines changed: 10 additions & 31 deletions
```diff
@@ -42,7 +42,7 @@ export function createEvaluateDataset({
   phoenixClient: KibanaPhoenixClient;
   chatClient: OnechatEvaluationChatClient;
 }): EvaluateDataset {
-  return async function evaluateEsqlDataset({
+  return async function evaluateDataset({
     dataset: { name, description, examples },
   }: {
     dataset: {
@@ -62,20 +62,17 @@
        dataset,
        task: async ({ input, output, metadata }) => {
          const response = await chatClient.converse({
-            messages: input.question,
+            messages: [{ message: input.question }],
          });
 
          // Running correctness evaluator as part of the task since quantitative correctness evaluators need its output
-          let correctnessAnalysis = null;
-          if (!response.errors?.length) {
-            const correctnessResult = await evaluators.correctnessAnalysis().evaluate({
-              input,
-              expected: output,
-              output: response,
-              metadata,
-            });
-            correctnessAnalysis = correctnessResult.metadata;
-          }
+          const correctnessResult = await evaluators.correctnessAnalysis().evaluate({
+            input,
+            expected: output,
+            output: response,
+            metadata,
+          });
+          const correctnessAnalysis = correctnessResult.metadata;
 
          return {
            errors: response.errors,
@@ -84,25 +81,7 @@
          };
        },
      },
-      [
-        {
-          name: 'Criteria',
-          kind: 'LLM',
-          evaluate: async ({ input, output, expected, metadata }) => {
-            const result = await evaluators
-              .criteria([`The response contains the following information: ${expected.expected}`])
-              .evaluate({
-                input,
-                expected,
-                output,
-                metadata,
-              });
-
-            return result;
-          },
-        },
-        ...createQuantitativeCorrectnessEvaluators(),
-      ]
+      createQuantitativeCorrectnessEvaluators()
    );
  };
}
```
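Why the `if (!response.errors?.length)` guard could be dropped: the chat client now guarantees that `converse` resolves with a trailing message even when every retry fails, so the correctness analysis always has something to grade (and will grade the fallback text as incorrect rather than being skipped). A hedged sketch of that contract, with a mock in place of the real client; all names here are hypothetical:

```ts
// Mock illustrating the contract the task now relies on (not real code).
type Messages = { message: string }[];

async function converseOrFallback(
  call: () => Promise<Messages>,
  messages: Messages
): Promise<{ messages: Messages; errors: any[] }> {
  try {
    return { messages: await call(), errors: [] };
  } catch (error) {
    // Mirror of the client's catch branch: append the default message so the
    // LLM judge always receives an agent response to grade.
    return {
      messages: [
        ...messages,
        {
          message:
            'This question could not be answered as an internal error occurred. Please try again.',
        },
      ],
      errors: [{ error }],
    };
  }
}
```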
