refactor: remove json-schema scorer from eval testing

steviec · steviec · commit bd6d396ed421 · 2025-07-11T12:52:42.000-07:00
- Remove json-schema scorer type from ResponseScorer interface
- Update test-config.json schema to only support regex and llm-judge scorers
- Remove runJsonSchemaScorer method and related logic from EvalTestRunner
- Replace `any` types with proper TypeScript types (CoreMessage, ToolCall, DisplayManager)
- Add ScorerResult interface for better type safety
- Update schema validation tests to reflect new scorer types

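JSON schema validation doesn't serve a realistic use case for LLM response evaluation, because responses are expected to be natural language rather than structured data that needs schema validation. The removed scorer was also only a basic stub; its own comments note that a full implementation would need ajv.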
diff --git a/src/core/types.ts b/src/core/types.ts
@@ -3,6 +3,8 @@
  * Focus on capabilities testing initially
  */
 
+import type { CoreMessage } from 'ai';
+
 // Server configuration (standard MCP format)
 export interface ServerConfig {
   command: string;
@@ -136,20 +138,26 @@ export interface EvalTest {
 }
 
 export interface ResponseScorer {
-  type: 'regex' | 'json-schema' | 'llm-judge';
+  type: 'regex' | 'llm-judge';
   pattern?: string;
-  schema?: any;
   criteria?: string;
   threshold?: number;
 }
 
+export interface ScorerResult {
+  type: 'regex' | 'llm-judge';
+  passed: boolean;
+  score?: number;
+  details?: string;
+}
+
 export interface EvalResult {
   name: string;
   model: string;
   passed: boolean;
   errors: string[];
-  scorer_results: any[];
-  messages?: any[];
+  scorer_results: ScorerResult[];
+  messages?: CoreMessage[];
 }
 
 // Main test configuration
diff --git a/src/schemas/test-config.json b/src/schemas/test-config.json
@@ -226,16 +226,13 @@
       "properties": {
         "type": {
           "type": "string",
-          "enum": ["regex", "json-schema", "llm-judge"],
+          "enum": ["regex", "llm-judge"],
           "description": "Type of scorer to use"
         },
         "pattern": {
           "type": "string",
           "description": "Regex pattern to match (required for regex type)"
         },
-        "schema": {
-          "description": "JSON schema to validate against (required for json-schema type)"
-        },
         "criteria": {
           "type": "string",
           "description": "Criteria for LLM judge (required for llm-judge type)"
@@ -257,14 +254,6 @@
             "required": ["pattern"]
           }
         },
-        {
-          "if": {
-            "properties": { "type": { "const": "json-schema" } }
-          },
-          "then": {
-            "required": ["schema"]
-          }
-        },
         {
           "if": {
             "properties": { "type": { "const": "llm-judge" } }
diff --git a/src/testing/evals/runner.ts b/src/testing/evals/runner.ts
@@ -10,6 +10,8 @@ import {
 import { ConfigLoader } from '../../config/loader.js';
 import { AnthropicProvider } from './providers/anthropic-provider.js';
 import type { EvalsConfig, EvalTest, EvalResult } from '../../core/types.js';
+import type { DisplayManager } from '../display/DisplayManager.js';
+import type { CoreMessage, ToolCall } from 'ai';
 
 export interface EvalSummary {
   total: number;
@@ -34,9 +36,13 @@ export class EvalTestRunner {
   private serverOptions: EvalServerOptions;
   private models: string[];
   private llmProvider: AnthropicProvider;
-  private displayManager?: any; // Will be injected from outside
+  private displayManager?: DisplayManager;
 
-  constructor(config: EvalsConfig, serverOptions: EvalServerOptions, displayManager?: any) {
+  constructor(
+    config: EvalsConfig,
+    serverOptions: EvalServerOptions,
+    displayManager?: DisplayManager
+  ) {
     this.config = config;
     this.serverOptions = serverOptions;
     this.displayManager = displayManager;
@@ -213,7 +219,7 @@ export class EvalTestRunner {
   }
 
   private validateToolCalls(
-    actualToolCalls: any[],
+    actualToolCalls: ToolCall<string, Record<string, unknown>>[],
     expectedToolCalls: NonNullable<EvalTest['expected_tool_calls']>
   ): string[] {
     const errors: string[] = [];
@@ -249,7 +255,7 @@ export class EvalTestRunner {
   }
 
   private async runResponseScorers(
-    messages: any[],
+    messages: CoreMessage[],
     scorers: NonNullable<EvalTest['response_scorers']>
   ): Promise<string[]> {
     const errors: string[] = [];
@@ -272,11 +278,6 @@ export class EvalTestRunner {
               `LLM judge failed: score ${result.score} < threshold ${scorer.threshold || 0.7}. Rationale: ${result.rationale}`
             );
           }
-        } else if (scorer.type === 'json-schema') {
-          const success = await this.runJsonSchemaScorer(messages, scorer.schema!);
-          if (!success) {
-            errors.push(`JSON schema scorer failed: response does not match schema`);
-          }
         }
       } catch (error) {
         errors.push(
@@ -288,7 +289,7 @@ export class EvalTestRunner {
     return errors;
   }
 
-  private async runRegexScorer(messages: any[], pattern: string): Promise<boolean> {
+  private async runRegexScorer(messages: CoreMessage[], pattern: string): Promise<boolean> {
     const regex = new RegExp(pattern, 'i');
 
     for (const message of messages) {
@@ -304,43 +305,4 @@ export class EvalTestRunner {
 
     return false;
   }
-
-  private async runJsonSchemaScorer(messages: any[], schema: any): Promise<boolean> {
-    // Basic JSON schema validation - would need full implementation with ajv
-    for (const message of messages) {
-      if (message.role === 'assistant') {
-        const content =
-          typeof message.content === 'string' ? message.content : JSON.stringify(message.content);
-
-        // Try to parse as JSON and do basic validation
-        try {
-          const jsonData = JSON.parse(content);
-
-          // Very basic schema validation - in practice would use ajv
-          if (schema.type === 'string' && typeof jsonData === 'string') {
-            if (schema.minLength && jsonData.length < schema.minLength) {
-              continue;
-            }
-            if (schema.pattern && !new RegExp(schema.pattern).test(jsonData)) {
-              continue;
-            }
-            return true;
-          }
-
-          if (schema.type === 'object' && typeof jsonData === 'object') {
-            return true; // Basic object validation
-          }
-        } catch {
-          // If schema expects a string but content is not JSON, check if it matches
-          if (schema.type === 'string') {
-            if (schema.pattern && new RegExp(schema.pattern).test(content)) {
-              return true;
-            }
-          }
-        }
-      }
-    }
-
-    return false;
-  }
 }
diff --git a/test/unit/schema.test.ts b/test/unit/schema.test.ts
@@ -62,7 +62,7 @@ describe('Schema Validation', () => {
         const testWithScorers = config.evals.tests.find(t => t.response_scorers);
         if (testWithScorers?.response_scorers) {
           for (const scorer of testWithScorers.response_scorers) {
-            expect(['regex', 'json-schema', 'llm-judge']).toContain(scorer.type);
+            expect(['regex', 'llm-judge']).toContain(scorer.type);
           }
         }
       }

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -62,7 +62,7 @@ describe('Schema Validation', () => {` |
| `62` | `62` | `const testWithScorers = config.evals.tests.find(t => t.response_scorers);` |
| `63` | `63` | `if (testWithScorers?.response_scorers) {` |
| `64` | `64` | `for (const scorer of testWithScorers.response_scorers) {` |
| `65` | | `- expect(['regex', 'json-schema', 'llm-judge']).toContain(scorer.type);` |
| | `65` | `+ expect(['regex', 'llm-judge']).toContain(scorer.type);` |
| `66` | `66` | `}` |
| `67` | `67` | `}` |
| `68` | `68` | `}` |