Skip to content

Commit abe73f5

Browse files
committed
fix: evaluation and lint
1 parent a3f04aa commit abe73f5

File tree

3 files changed: 8 additions and 17 deletions.

eslint.config.mjs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import apifyTypeScriptConfig from '@apify/eslint-config/ts.js';
22

33
// eslint-disable-next-line import/no-default-export
44
export default [
5-
{ ignores: ['**/dist', '**/.venv'] }, // Ignores need to happen first
5+
{ ignores: ['**/dist', '**/.venv', 'evals/**'] }, // Ignores need to happen first
66
...apifyTypeScriptConfig,
77
{
88
languageOptions: {

evals/config.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ function getTestCasesVersion(): string {
1919
// Models to evaluate
2020
export const MODELS_TO_EVALUATE = [
2121
'gpt-4o-mini',
22-
// 'claude-3-5-haiku-latest',
22+
'claude-3-5-haiku-latest',
2323
];
2424

2525
export const PASS_THRESHOLD = 0.8;

evals/run-evaluation.ts

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ import { createClient } from '@arizeai/phoenix-client';
99
import { getDatasetInfo } from '@arizeai/phoenix-client/datasets';
1010
// eslint-disable-next-line import/extensions
1111
import { asEvaluator, runExperiment } from '@arizeai/phoenix-client/experiments';
12-
import type { ExperimentTask } from '@arizeai/phoenix-client/types/experiments';
12+
import type { ExperimentEvaluationRun, ExperimentTask } from '@arizeai/phoenix-client/types/experiments';
1313
import dotenv from 'dotenv';
1414
import OpenAI from 'openai';
1515

@@ -31,16 +31,6 @@ process.env.PHOENIX_API_KEY = sanitizeHeaderValue(process.env.PHOENIX_API_KEY);
3131

3232
type ExampleInputOnly = { input: Record<string, unknown>, metadata?: Record<string, unknown>, output?: never };
3333

34-
// Type for Phoenix evaluation run results
35-
interface EvaluationRun {
36-
name: string;
37-
result?: {
38-
score?: number;
39-
[key: string]: unknown;
40-
};
41-
[key: string]: unknown;
42-
}
43-
4434
async function loadTools(): Promise<ToolBase[]> {
4535
const apifyClient = new ApifyClient({ token: process.env.APIFY_API_TOKEN || '' });
4636
const urlTools = await processParamsGetTools('', apifyClient);
@@ -187,12 +177,13 @@ async function main(): Promise<number> {
187177
try {
188178
const info = await getDatasetInfo({ client, dataset: { datasetName: DATASET_NAME } });
189179
datasetId = info?.id as string | undefined;
190-
if (!datasetId) throw new Error(`Dataset "${DATASET_NAME}" not found`);
191180
} catch (e) {
192181
log.error(`Error loading dataset: ${e}`);
193182
return 1;
194183
}
195184

185+
if (!datasetId) throw new Error(`Dataset "${DATASET_NAME}" not found`);
186+
196187
log.info(`Loaded dataset "${DATASET_NAME}" with ID: ${datasetId}`);
197188

198189
const results: { model: string; accuracy: number; correct: number; total: number; experiment_id?: string; error?: string }[] = [];
@@ -236,8 +227,8 @@ async function main(): Promise<number> {
236227
const runsMap = experiment.runs ?? {};
237228
const evalRuns = experiment.evaluationRuns ?? [];
238229
totalCases = Object.keys(runsMap).length;
239-
const toolMatchEvals = evalRuns.filter((er: EvaluationRun) => er.name === 'tools_match');
240-
correctCases = toolMatchEvals.filter((er: EvaluationRun) => (er.result?.score ?? 0) > 0.5).length;
230+
const toolMatchEvals = evalRuns.filter((er: ExperimentEvaluationRun) => er.name === 'tools_match');
231+
correctCases = toolMatchEvals.filter((er: ExperimentEvaluationRun) => (er.result?.score ?? 0) > 0.5).length;
241232
accuracy = totalCases > 0 ? correctCases / totalCases : 0;
242233
experimentId = experiment.id;
243234

@@ -261,7 +252,7 @@ async function main(): Promise<number> {
261252
}
262253
}
263254

264-
const allPassed = results.filter((r) => !r.error).every((r) => r.accuracy >= PASS_THRESHOLD);
255+
const allPassed = results.every((r) => !r.error && r.accuracy >= PASS_THRESHOLD);
265256
log.info(`Pass threshold: ${(PASS_THRESHOLD * 100).toFixed(1)}%`);
266257
if (allPassed) {
267258
log.info('✅ All models passed the threshold');

0 commit comments

Comments
 (0)