Skip to content

Commit bd6d396

Browse files
committed
refactor: remove json-schema scorer from eval testing
- Remove the `json-schema` scorer type from the `ResponseScorer` interface
- Update the `test-config.json` schema to support only the `regex` and `llm-judge` scorers
- Remove the `runJsonSchemaScorer` method and related logic from `EvalTestRunner`
- Replace `any` types with proper TypeScript types (`CoreMessage`, `ToolCall`, `DisplayManager`)
- Add a `ScorerResult` interface for better type safety
- Update the schema validation tests to reflect the new scorer types

JSON schema validation doesn't serve a realistic use case for LLM response evaluation, since responses should be natural language, not structured data requiring schema validation.
1 parent 6c1c02c commit bd6d396

File tree

4 files changed

+25
-66
lines changed

4 files changed

+25
-66
lines changed

src/core/types.ts

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
* Focus on capabilities testing initially
44
*/
55

6+
import type { CoreMessage } from 'ai';
7+
68
// Server configuration (standard MCP format)
79
export interface ServerConfig {
810
command: string;
@@ -136,20 +138,26 @@ export interface EvalTest {
136138
}
137139

138140
export interface ResponseScorer {
139-
type: 'regex' | 'json-schema' | 'llm-judge';
141+
type: 'regex' | 'llm-judge';
140142
pattern?: string;
141-
schema?: any;
142143
criteria?: string;
143144
threshold?: number;
144145
}
145146

147+
export interface ScorerResult {
148+
type: 'regex' | 'llm-judge';
149+
passed: boolean;
150+
score?: number;
151+
details?: string;
152+
}
153+
146154
export interface EvalResult {
147155
name: string;
148156
model: string;
149157
passed: boolean;
150158
errors: string[];
151-
scorer_results: any[];
152-
messages?: any[];
159+
scorer_results: ScorerResult[];
160+
messages?: CoreMessage[];
153161
}
154162

155163
// Main test configuration

src/schemas/test-config.json

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -226,16 +226,13 @@
226226
"properties": {
227227
"type": {
228228
"type": "string",
229-
"enum": ["regex", "json-schema", "llm-judge"],
229+
"enum": ["regex", "llm-judge"],
230230
"description": "Type of scorer to use"
231231
},
232232
"pattern": {
233233
"type": "string",
234234
"description": "Regex pattern to match (required for regex type)"
235235
},
236-
"schema": {
237-
"description": "JSON schema to validate against (required for json-schema type)"
238-
},
239236
"criteria": {
240237
"type": "string",
241238
"description": "Criteria for LLM judge (required for llm-judge type)"
@@ -257,14 +254,6 @@
257254
"required": ["pattern"]
258255
}
259256
},
260-
{
261-
"if": {
262-
"properties": { "type": { "const": "json-schema" } }
263-
},
264-
"then": {
265-
"required": ["schema"]
266-
}
267-
},
268257
{
269258
"if": {
270259
"properties": { "type": { "const": "llm-judge" } }

src/testing/evals/runner.ts

Lines changed: 11 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ import {
1010
import { ConfigLoader } from '../../config/loader.js';
1111
import { AnthropicProvider } from './providers/anthropic-provider.js';
1212
import type { EvalsConfig, EvalTest, EvalResult } from '../../core/types.js';
13+
import type { DisplayManager } from '../display/DisplayManager.js';
14+
import type { CoreMessage, ToolCall } from 'ai';
1315

1416
export interface EvalSummary {
1517
total: number;
@@ -34,9 +36,13 @@ export class EvalTestRunner {
3436
private serverOptions: EvalServerOptions;
3537
private models: string[];
3638
private llmProvider: AnthropicProvider;
37-
private displayManager?: any; // Will be injected from outside
39+
private displayManager?: DisplayManager;
3840

39-
constructor(config: EvalsConfig, serverOptions: EvalServerOptions, displayManager?: any) {
41+
constructor(
42+
config: EvalsConfig,
43+
serverOptions: EvalServerOptions,
44+
displayManager?: DisplayManager
45+
) {
4046
this.config = config;
4147
this.serverOptions = serverOptions;
4248
this.displayManager = displayManager;
@@ -213,7 +219,7 @@ export class EvalTestRunner {
213219
}
214220

215221
private validateToolCalls(
216-
actualToolCalls: any[],
222+
actualToolCalls: ToolCall<string, Record<string, unknown>>[],
217223
expectedToolCalls: NonNullable<EvalTest['expected_tool_calls']>
218224
): string[] {
219225
const errors: string[] = [];
@@ -249,7 +255,7 @@ export class EvalTestRunner {
249255
}
250256

251257
private async runResponseScorers(
252-
messages: any[],
258+
messages: CoreMessage[],
253259
scorers: NonNullable<EvalTest['response_scorers']>
254260
): Promise<string[]> {
255261
const errors: string[] = [];
@@ -272,11 +278,6 @@ export class EvalTestRunner {
272278
`LLM judge failed: score ${result.score} < threshold ${scorer.threshold || 0.7}. Rationale: ${result.rationale}`
273279
);
274280
}
275-
} else if (scorer.type === 'json-schema') {
276-
const success = await this.runJsonSchemaScorer(messages, scorer.schema!);
277-
if (!success) {
278-
errors.push(`JSON schema scorer failed: response does not match schema`);
279-
}
280281
}
281282
} catch (error) {
282283
errors.push(
@@ -288,7 +289,7 @@ export class EvalTestRunner {
288289
return errors;
289290
}
290291

291-
private async runRegexScorer(messages: any[], pattern: string): Promise<boolean> {
292+
private async runRegexScorer(messages: CoreMessage[], pattern: string): Promise<boolean> {
292293
const regex = new RegExp(pattern, 'i');
293294

294295
for (const message of messages) {
@@ -304,43 +305,4 @@ export class EvalTestRunner {
304305

305306
return false;
306307
}
307-
308-
private async runJsonSchemaScorer(messages: any[], schema: any): Promise<boolean> {
309-
// Basic JSON schema validation - would need full implementation with ajv
310-
for (const message of messages) {
311-
if (message.role === 'assistant') {
312-
const content =
313-
typeof message.content === 'string' ? message.content : JSON.stringify(message.content);
314-
315-
// Try to parse as JSON and do basic validation
316-
try {
317-
const jsonData = JSON.parse(content);
318-
319-
// Very basic schema validation - in practice would use ajv
320-
if (schema.type === 'string' && typeof jsonData === 'string') {
321-
if (schema.minLength && jsonData.length < schema.minLength) {
322-
continue;
323-
}
324-
if (schema.pattern && !new RegExp(schema.pattern).test(jsonData)) {
325-
continue;
326-
}
327-
return true;
328-
}
329-
330-
if (schema.type === 'object' && typeof jsonData === 'object') {
331-
return true; // Basic object validation
332-
}
333-
} catch {
334-
// If schema expects a string but content is not JSON, check if it matches
335-
if (schema.type === 'string') {
336-
if (schema.pattern && new RegExp(schema.pattern).test(content)) {
337-
return true;
338-
}
339-
}
340-
}
341-
}
342-
}
343-
344-
return false;
345-
}
346308
}

test/unit/schema.test.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ describe('Schema Validation', () => {
6262
const testWithScorers = config.evals.tests.find(t => t.response_scorers);
6363
if (testWithScorers?.response_scorers) {
6464
for (const scorer of testWithScorers.response_scorers) {
65-
expect(['regex', 'json-schema', 'llm-judge']).toContain(scorer.type);
65+
expect(['regex', 'llm-judge']).toContain(scorer.type);
6666
}
6767
}
6868
}

0 commit comments

Comments
 (0)