Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions packages/ai/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,14 @@
},
"peerDependencies": {
"@opentelemetry/api": "^1.9.0",
"typescript": ">=5.4",
"zod": "^3.25.0 || ^4.0.0"
},
"peerDependenciesMeta": {
"typescript": {
"optional": true
}
},
"devDependencies": {
"@ai-sdk/anthropicv1": "npm:@ai-sdk/anthropic@^1.2.12",
"@ai-sdk/anthropicv2": "npm:@ai-sdk/anthropic@^2.0.57",
Expand Down
2 changes: 1 addition & 1 deletion packages/ai/src/evals/builder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ class EvalBuilderImpl<
}

// Call existing Eval function - this handles all Vitest registration
Eval<TInput, TExpected, TOutput>(finalName, finalParams);
Eval(finalName, finalParams);
}
}

Expand Down
3 changes: 2 additions & 1 deletion packages/ai/src/evals/eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,10 @@ export function Eval<
Step extends string = string,
>(
name: ValidateName<Name>,
params: Omit<EvalParams<TInput, TExpected, TOutput>, 'capability' | 'step'> & {
params: Omit<EvalParams<TInput, TExpected, TOutput>, 'capability' | 'step' | 'scorers'> & {
capability: ValidateName<Capability>;
step?: ValidateName<Step> | undefined;
scorers: ReadonlyArray<ScorerLike<NoInfer<TInput>, NoInfer<TExpected>, TOutput>>;
},
): void {
// Record eval name for validation
Expand Down
138 changes: 138 additions & 0 deletions packages/ai/test/evals/eval.types.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import { describe, it, expectTypeOf } from 'vitest';
import { Eval } from '../../src/evals';
import { Scorer } from '../../src/scorers/scorers';

describe('Eval type inference', () => {
// Type-level case: the scorer declares only `output` and `expected`, while the
// task is explicitly annotated with `input: string` and `data` is a function
// returning string `input`/`expected` rows.
it('infers task input and expected from data when scorer omits input', () => {
  const similarityScorer = Scorer(
    'answer-similarity',
    ({ output, expected }: { output: string; expected: string }) => {
      // Touch both bindings so neither destructured field is left unused.
      void output;
      void expected;
      return 1;
    },
  );

  // The thunk is referenced below but never called within this test.
  const typecheckOnly = () =>
    Eval('name-apl-query', {
      capability: 'name_query',
      data: () => [
        {
          input: "['nginx-access-logs'] | where status >= 500",
          expected: 'Nginx 5xx Errors',
        },
      ],
      task: async ({ input }: { input: string }) => input,
      scorers: [similarityScorer],
    });

  void typecheckOnly;
});

// Type-level case: the task parameter carries no annotation, and the body
// asserts via `expectTypeOf` that `input` is typed as `string`, matching the
// string `input` in the `data` array. The scorer declares no `input` field.
it('preserves contextual task input typing from data when scorer omits input', () => {
  const equalityScorer = Scorer(
    'exact-match',
    ({ expected, output }: { expected: string; output: string }) => {
      return expected === output;
    },
  );

  // The thunk is referenced below but never called within this test.
  const typecheckOnly = () =>
    Eval('categorize-messages', {
      capability: 'support-agent',
      data: [
        {
          input: 'Hello world',
          expected: 'support',
        },
      ],
      // Left unannotated on purpose: the parameter type must come from context.
      task: ({ input }) => {
        expectTypeOf(input).toEqualTypeOf<string>();
        return input;
      },
      scorers: [equalityScorer],
    });

  void typecheckOnly;
});

// Type-level case with object-shaped rows: the scorer declares only `expected`
// and `output`, and the unannotated task asserts via `expectTypeOf` that
// `input` and `expected` keep the shapes (including the `as const` literals)
// written in the `data` array.
it('keeps structured task input inference anchored to data when scorers only use output', () => {
  const queueScorer = Scorer(
    'queue-match',
    ({ expected, output }: { expected: { queue: string }; output: { queue: string } }) => {
      return expected.queue === output.queue;
    },
  );

  // The thunk is referenced below but never called within this test.
  const typecheckOnly = () =>
    Eval('route-support-ticket', {
      capability: 'support-routing',
      data: [
        {
          input: {
            ticketId: 'ticket-123',
            message: 'Need help with a refund',
            customer: {
              tier: 'enterprise' as const,
            },
          },
          expected: {
            queue: 'billing' as const,
          },
        },
      ],
      task: ({ input, expected }) => {
        // Literal types declared with `as const` in `data` are asserted here.
        expectTypeOf(expected.queue).toEqualTypeOf<'billing'>();
        expectTypeOf(input.customer.tier).toEqualTypeOf<'enterprise'>();
        expectTypeOf(input.ticketId).toEqualTypeOf<string>();

        return {
          queue: input.customer.tier === 'enterprise' ? 'billing' : 'general',
        };
      },
      scorers: [queueScorer],
    });

  void typecheckOnly;
});

// Negative type-level case: `data` returns rows whose `input` is a string while
// the task is annotated with `input: number`. The `@ts-expect-error` directive
// asserts that the compiler reports an error on the line directly below it, so
// the directive must stay immediately above the `data:` property.
it('rejects task input that conflicts with the data source', () => {
  // camelCase, matching the other scorer locals in this file.
  const outputOnlyScorer = Scorer(
    'output-only',
    ({ output }: { output: string }) => output.length > 0,
  );

  // The thunk is referenced below but never called within this test.
  const invalid = () =>
    Eval('mismatched-task-input', {
      capability: 'name_query',
      // @ts-expect-error task input must match the data input type
      data: () => [
        {
          input: 'foo',
          expected: 'bar',
        },
      ],
      task: async ({ input }: { input: number }) => String(input),
      scorers: [outputOnlyScorer],
    });

  invalid;
});

// Negative type-level case: the scorer declares `input: { id: number }` while
// `data` supplies a string `input`. The `@ts-expect-error` directive asserts
// that the compiler reports an error on the line directly below it, so the
// directive must stay immediately above the `scorers:` property.
it('rejects a scorer whose input type conflicts with data', () => {
  const idCheckingScorer = Scorer(
    'input-aware',
    ({ input, output }: { input: { id: number }; output: string }) =>
      input.id > 0 && output.length > 0,
  );

  // The thunk is referenced below but never called within this test.
  const mismatched = () =>
    Eval('scorer-input-mismatch', {
      capability: 'test',
      data: [{ input: 'hello', expected: 'world' }],
      task: ({ input }) => input,
      // @ts-expect-error scorer input type conflicts with data input type
      scorers: [idCheckingScorer],
    });

  void mismatched;
});
});
Loading