Commit 1d47a8e

dgieselaar and NicholasPeretti authored and committed
[Evals] Optimize tracing for Phoenix (elastic#238599)
## What

Optimizes inference instrumentation for use in Phoenix by no longer requiring a single root trace. Additionally, retries on missing scores in the criteria evaluator.

## Why

We previously required a single root trace for its events to be exported. This was done to avoid ingesting all the unrelated HTTP requests that have no value for what we use Phoenix for: debugging LLM-based workflows. The change here is to use context to flag events as inference-related, instead of requiring a root span. For this reason we no longer need a single `RunExperiment` span, which was skewing the token metrics in Phoenix for evaluations.

I also added retries around scoring: in some cases the LLM did not score all criteria. To that end, I've added `executeUntilValid`, which asks the LLM to keep calling a tool until it no longer encounters errors (see the sketch below).

Some other minor changes:
- experiments can now contain metadata
- export types for `@kbn/evals` worker fixtures

I've targeted this at 9.2.0 as it is a dev-only change (mostly) and I want to limit the chance of conflicts.
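For context, here is a minimal sketch of the retry loop this enables. It mirrors the call site in the criteria evaluator diff below; `MyPrompt`, `inferenceClient`, and `validateScores` are hypothetical stand-ins, and the option names are taken from this diff rather than from any published API docs:

```ts
import { executeUntilValid } from '@kbn/inference-prompt-utils';
import type { BoundInferenceClient } from '@kbn/inference-common';

// Hypothetical stand-ins: a real caller passes its own prompt,
// bound inference client, and validation logic.
declare const MyPrompt: any;
declare const inferenceClient: BoundInferenceClient;
declare function validateScores(args: unknown): void; // throws when scores are missing

async function scoreUntilValid() {
  return executeUntilValid({
    prompt: MyPrompt,
    inferenceClient,
    input: { question: '...' },
    finalToolChoice: { function: 'score' }, // the model must finish via the `score` tool
    maxRetries: 3,
    toolCallbacks: {
      score: async (toolCall) => {
        // Throwing here marks the attempt as invalid, so the model is asked
        // to call the tool again until it succeeds or retries run out.
        validateScores(toolCall.function.arguments);
        return { response: { ok: true } };
      },
    },
  });
}
```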
1 parent 9d3f206 commit 1d47a8e

File tree

17 files changed: +648 −79 lines changed

x-pack/platform/packages/shared/kbn-evals/index.ts

Lines changed: 1 addition & 1 deletion
@@ -11,4 +11,4 @@ export { createPlaywrightEvalsConfig } from './src/config/create_playwright_eval
 export type { KibanaPhoenixClient } from './src/kibana_phoenix_client/client';
 export { createQuantitativeCorrectnessEvaluators } from './src/evaluators/correctness';
 export { createQuantitativeGroundednessEvaluator } from './src/evaluators/groundedness';
-export type { EvaluationDataset } from './src/types';
+export type { EvaluationDataset, EvaluationWorkerFixtures } from './src/types';

x-pack/platform/packages/shared/kbn-evals/src/evaluate.ts

Lines changed: 3 additions & 22 deletions
@@ -6,22 +6,15 @@
  */

 import type { InferenceConnectorType, InferenceConnector, Model } from '@kbn/inference-common';
-import {
-  getConnectorModel,
-  type BoundInferenceClient,
-  getConnectorFamily,
-  getConnectorProvider,
-} from '@kbn/inference-common';
+import { getConnectorModel, getConnectorFamily, getConnectorProvider } from '@kbn/inference-common';
 import { createRestClient } from '@kbn/inference-plugin/common';
 import { test as base } from '@kbn/scout';
-import type { HttpHandler } from '@kbn/core/public';
-import type { AvailableConnectorWithId } from '@kbn/gen-ai-functional-testing';
 import { getPhoenixConfig } from './utils/get_phoenix_config';
 import { KibanaPhoenixClient } from './kibana_phoenix_client/client';
 import type { EvaluationTestOptions } from './config/create_playwright_eval_config';
 import { httpHandlerFromKbnClient } from './utils/http_handler_from_kbn_client';
 import { createCriteriaEvaluator } from './evaluators/criteria';
-import type { DefaultEvaluators } from './types';
+import type { DefaultEvaluators, EvaluationSpecificWorkerFixtures } from './types';
 import { reportModelScore } from './utils/report_model_score';
 import { createConnectorFixture } from './utils/create_connector_fixture';
 import { createCorrectnessAnalysisEvaluator } from './evaluators/correctness';
@@ -33,19 +26,7 @@ import { createGroundednessAnalysisEvaluator } from './evaluators/groundedness';
  * Test type for evaluations. Loads an inference client and a
  * (Kibana-flavored) Phoenix client.
  */
-export const evaluate = base.extend<
-  {},
-  {
-    inferenceClient: BoundInferenceClient;
-    phoenixClient: KibanaPhoenixClient;
-    evaluators: DefaultEvaluators;
-    fetch: HttpHandler;
-    connector: AvailableConnectorWithId;
-    evaluationConnector: AvailableConnectorWithId;
-    repetitions: number;
-    evaluationAnalysisService: EvaluationAnalysisService;
-  }
->({
+export const evaluate = base.extend<{}, EvaluationSpecificWorkerFixtures>({
   fetch: [
     async ({ kbnClient, log }, use) => {
       // add a HttpHandler as a fixture, so consumers can use

x-pack/platform/packages/shared/kbn-evals/src/evaluators/criteria/index.ts

Lines changed: 49 additions & 23 deletions
@@ -8,8 +8,8 @@
 import type { BoundInferenceClient } from '@kbn/inference-common';
 import { ShortIdTable } from '@kbn/inference-common';
 import type { ToolingLog } from '@kbn/tooling-log';
-import { sumBy, uniqBy } from 'lodash';
-import pRetry from 'p-retry';
+import { difference, sumBy, uniqBy } from 'lodash';
+import { executeUntilValid } from '@kbn/inference-prompt-utils';
 import type { Evaluator } from '../../types';
 import { LlmCriteriaEvaluationPrompt } from './prompt';

@@ -55,22 +55,20 @@

   return {
     evaluate: async ({ input, output }) => {
-      async function scoreTask() {
-        const response = await inferenceClient.prompt({
-          prompt: LlmCriteriaEvaluationPrompt,
-          input: {
-            input: JSON.stringify(input),
-            output: JSON.stringify(output),
-            criteria: structuredCriteria.map((criterion) => {
-              return `${criterion.id}: ${criterion.text}`;
-            }),
-          },
-        });
+      function toScores(
+        evaluatedCriteria: Array<{ id: string; result: 'PASS' | 'FAIL' | 'N/A'; reason?: string }>
+      ) {
+        const evaluations = uniqBy(evaluatedCriteria, (criterion) => criterion.id);
+
+        const evaluatedCriteriaIds = evaluations.map((evaluation) => evaluation.id);
+
+        const criteriaIds = Array.from(criteriaById.keys());

-        const evaluations = uniqBy(
-          response.toolCalls.flatMap((toolCall) => toolCall.function.arguments.criteria),
-          (criterion) => criterion.id
-        );
+        const unscored = difference(criteriaIds, evaluatedCriteriaIds);
+
+        if (unscored.length) {
+          throw new Error(`Missing scores for ${unscored.join(', ')}`);
+        }

         return evaluations.map((evaluation) => {
           const criterion = criteriaById.get(evaluation.id);
@@ -85,12 +83,40 @@
         });
       }

-      const results = await pRetry(scoreTask, {
-        retries: 0,
-        onFailedAttempt: (error) => {
-          log.error(new Error(`Failed to score task`, { cause: error }));
-        },
-      });
+      async function scoreTask() {
+        const response = await executeUntilValid({
+          prompt: LlmCriteriaEvaluationPrompt,
+          inferenceClient,
+          input: {
+            input: JSON.stringify(input),
+            output: JSON.stringify(output),
+            criteria: structuredCriteria.map((criterion) => {
+              return `${criterion.id}: ${criterion.text}`;
+            }),
+          },
+          finalToolChoice: {
+            function: 'score',
+          },
+          maxRetries: 3,
+          toolCallbacks: {
+            score: async (toolCall) => {
+              return {
+                response: {
+                  scores: toScores(toolCall.function.arguments.criteria),
+                },
+              };
+            },
+          },
+        });
+
+        return response;
+      }
+
+      const { toolCalls } = await scoreTask();
+
+      const results = toolCalls.flatMap((toolCall) =>
+        toScores(toolCall.function.arguments.criteria)
+      );

       function normalize(val: number) {
         if (!isFinite(val)) {
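Worth noting about the flow above: `toScores` doubles as validation, and throwing from inside the `score` tool callback is what makes `executeUntilValid` re-prompt the model. A condensed illustration of that check, with hypothetical criterion IDs:

```ts
import { difference, uniqBy } from 'lodash';

// Condensed version of the validation in `toScores` above, with
// hypothetical criterion IDs; the throw is what drives the retry.
const criteriaIds = ['relevance', 'tone'];

function assertAllScored(evaluated: Array<{ id: string }>) {
  const scoredIds = uniqBy(evaluated, (e) => e.id).map((e) => e.id);
  const unscored = difference(criteriaIds, scoredIds);
  if (unscored.length) {
    throw new Error(`Missing scores for ${unscored.join(', ')}`);
  }
}

assertAllScored([{ id: 'relevance' }]); // throws: Missing scores for tone
```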

x-pack/platform/packages/shared/kbn-evals/src/kibana_phoenix_client/client.ts

Lines changed: 7 additions & 2 deletions
@@ -11,7 +11,7 @@ import type { RanExperiment, TaskOutput } from '@arizeai/phoenix-client/dist/esm
 import type { DatasetInfo, Example } from '@arizeai/phoenix-client/dist/esm/types/datasets';
 import type { SomeDevLog } from '@kbn/some-dev-log';
 import type { Model } from '@kbn/inference-common';
-import { withActiveInferenceSpan } from '@kbn/inference-tracing';
+import { withInferenceContext } from '@kbn/inference-tracing';
 import type { Evaluator, EvaluationDataset, ExperimentTask } from '../types';
 import { upsertDataset } from './upsert_dataset';
 import type { PhoenixConfig } from '../utils/get_phoenix_config';
@@ -86,9 +86,11 @@ export class KibanaPhoenixClient {
   async runExperiment<TEvaluationDataset extends EvaluationDataset, TTaskOutput extends TaskOutput>(
     {
       dataset,
+      metadata,
       task,
     }: {
       dataset: TEvaluationDataset;
+      metadata?: Record<string, unknown>;
       task: ExperimentTask<TEvaluationDataset['examples'][number], TTaskOutput>;
     },
     evaluators: Array<Evaluator<TEvaluationDataset['examples'][number], TTaskOutput>>
@@ -98,13 +100,15 @@ export class KibanaPhoenixClient {
     {
       dataset,
       task,
+      metadata: experimentMetadata,
     }: {
       dataset: EvaluationDataset;
       task: ExperimentTask<Example, TaskOutput>;
+      metadata?: Record<string, unknown>;
     },
     evaluators: Evaluator[]
   ): Promise<RanExperiment> {
-    return await withActiveInferenceSpan('RunExperiment', async (span) => {
+    return withInferenceContext(async () => {
       const { datasetId } = await this.syncDataSet(dataset);

       const experiments = await import('@arizeai/phoenix-client/experiments');
@@ -115,6 +119,7 @@ export class KibanaPhoenixClient {
         experimentName: `Run ID: ${this.options.runId} - Dataset: ${dataset.name}`,
         task,
         experimentMetadata: {
+          ...experimentMetadata,
           model: this.options.model,
           runId: this.options.runId,
         },
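Two things to note here: `withInferenceContext` flags everything inside the callback as inference-related without opening a `RunExperiment` root span, and callers can now attach their own metadata bag, which is spread before the built-in `model` and `runId` keys (so the built-ins win on conflict). A hedged usage sketch, with hypothetical dataset, task, and evaluator stand-ins:

```ts
import type { KibanaPhoenixClient } from '@kbn/evals';

// Hypothetical stand-ins for a real experiment setup.
declare const phoenixClient: KibanaPhoenixClient;
declare const dataset: any;
declare const task: any;
declare const evaluators: any[];

async function run() {
  // `metadata` is merged into experimentMetadata; the built-in `model`
  // and `runId` keys are spread after it, so they win on conflict.
  return phoenixClient.runExperiment(
    { dataset, task, metadata: { promptVersion: 'v2' } },
    evaluators
  );
}
```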

x-pack/platform/packages/shared/kbn-evals/src/kibana_phoenix_client/diff_examples.ts

Lines changed: 2 additions & 1 deletion
@@ -7,13 +7,14 @@

 import type { Example } from '@arizeai/phoenix-client/dist/esm/types/datasets';
 import objectHash from 'object-hash';
+import { isEmpty, omitBy } from 'lodash';
 import type { ExampleWithId } from '../types';

 function normaliseExample(example: Example | ExampleWithId) {
   return {
     input: example.input,
     output: example.output,
-    metadata: example.metadata,
+    metadata: omitBy(example.metadata, isEmpty),
   };
 }
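The effect of the `omitBy(..., isEmpty)` change is that examples whose metadata differs only by empty values now hash identically, so they are not flagged as changed when diffing datasets. A small illustration, assuming only lodash and object-hash semantics:

```ts
import objectHash from 'object-hash';
import { isEmpty, omitBy } from 'lodash';

// An example with `metadata: {}` and one whose metadata holds only empty
// values normalise to the same shape, so their hashes match.
const normalise = (metadata: Record<string, unknown>) => ({
  input: 'question',
  output: 'answer',
  metadata: omitBy(metadata, isEmpty),
});

console.log(objectHash(normalise({})) === objectHash(normalise({ tags: [] }))); // true
```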

x-pack/platform/packages/shared/kbn-evals/src/types.ts

Lines changed: 31 additions & 1 deletion
@@ -7,11 +7,17 @@

 import type { Example } from '@arizeai/phoenix-client/dist/esm/types/datasets';
 import type {
-  EvaluationResult,
+  EvaluationResult as PhoenixEvaluationResult,
   Evaluator as PhoenixEvaluator,
   TaskOutput,
 } from '@arizeai/phoenix-client/dist/esm/types/experiments';
+import type { BoundInferenceClient } from '@kbn/inference-common';
+import type { HttpHandler } from '@kbn/core/public';
+import type { AvailableConnectorWithId } from '@kbn/gen-ai-functional-testing';
+import type { ScoutWorkerFixtures } from '@kbn/scout';
+import type { KibanaPhoenixClient } from './kibana_phoenix_client/client';
 import type { EvaluationCriterion } from './evaluators/criteria';
+import type { EvaluationAnalysisService } from './utils/analysis';

 export interface EvaluationDataset {
   name: string;
@@ -31,6 +37,8 @@ export interface EvaluatorParams<TExample extends Example, TTaskOutput extends T
   metadata: TExample['metadata'];
 }

+export type EvaluationResult = PhoenixEvaluationResult;
+
 type EvaluatorCallback<TExample extends Example, TTaskOutput extends TaskOutput> = (
   params: EvaluatorParams<TExample, TTaskOutput>
 ) => Promise<EvaluationResult>;
@@ -54,3 +62,25 @@ export type ExperimentTask<TExample extends Example, TTaskOutput extends TaskOut

 // simple version of Phoenix's ExampleWithId
 export type ExampleWithId = Example & { id: string };
+
+export interface EvaluationSpecificWorkerFixtures {
+  inferenceClient: BoundInferenceClient;
+  phoenixClient: KibanaPhoenixClient;
+  evaluators: DefaultEvaluators;
+  fetch: HttpHandler;
+  connector: AvailableConnectorWithId;
+  evaluationConnector: AvailableConnectorWithId;
+  repetitions: number;
+  evaluationAnalysisService: EvaluationAnalysisService;
+}
+
+export interface EvaluationWorkerFixtures extends ScoutWorkerFixtures {
+  inferenceClient: BoundInferenceClient;
+  phoenixClient: KibanaPhoenixClient;
+  evaluators: DefaultEvaluators;
+  fetch: HttpHandler;
+  connector: AvailableConnectorWithId;
+  evaluationConnector: AvailableConnectorWithId;
+  repetitions: number;
+  evaluationAnalysisService: EvaluationAnalysisService;
+}
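With `EvaluationWorkerFixtures` now exported from `@kbn/evals` (see the index.ts diff above), downstream suites can type helpers against the worker fixtures instead of redeclaring them. A minimal sketch; the helper name is hypothetical, and an `id` field on the connector is assumed from the `AvailableConnectorWithId` name:

```ts
import type { EvaluationWorkerFixtures } from '@kbn/evals';

// Hypothetical helper typed against the exported fixtures; `connector`
// and `repetitions` are real fixture names from this diff, and
// `connector.id` is assumed from the AvailableConnectorWithId type name.
function describeRun({
  connector,
  repetitions,
}: Pick<EvaluationWorkerFixtures, 'connector' | 'repetitions'>): string {
  return `connector=${connector.id}, repetitions=${repetitions}`;
}
```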

x-pack/platform/packages/shared/kbn-evals/tsconfig.json

Lines changed: 1 addition & 0 deletions
@@ -29,5 +29,6 @@
     "@kbn/repo-info",
     "@kbn/std",
     "@kbn/test",
+    "@kbn/inference-prompt-utils",
   ]
 }

x-pack/platform/packages/shared/kbn-inference-prompt-utils/index.ts

Lines changed: 2 additions & 0 deletions
@@ -10,3 +10,5 @@ export type {
   ReasoningPromptResponse,
   ReasoningPromptResponseOf,
 } from './src/flows/reasoning/types';
+
+export { executeUntilValid } from './src/flows/until_valid/execute_until_valid';
