Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/evals.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ The evals tool is included with the TypeScript package. No additional dependenci
| `--azure-api-version` | ❌ | Azure OpenAI API version (default: 2025-01-01-preview) |
| `--models` | ❌ | Models for benchmark mode (benchmark only) |
| `--latency-iterations` | ❌ | Latency test samples (default: 25) (benchmark only) |
| `--max-parallel-models` | ❌ | Maximum concurrent models in benchmark mode (default: CPU count) (benchmark only) |
| `--benchmark-chunk-size` | ❌ | Sample chunk size per model for memory-efficient benchmarking (benchmark only) |

## Configuration

Expand Down Expand Up @@ -154,6 +156,8 @@ npm run eval -- --config-path config.json --dataset-path data.jsonl --base-url h
- **Multi-stage evaluation**: pre_flight, input, output stages
- **Automatic stage detection**: Evaluates all stages found in configuration
- **Batch processing**: Configurable parallel processing
- **Parallel benchmarking**: Run multiple models concurrently with CPU-aware defaults
- **Memory-efficient chunking**: Process large datasets in smaller chunks during benchmarking
- **Benchmark mode**: Model performance comparison with ROC AUC, precision at recall thresholds
- **Latency testing**: End-to-end guardrail performance measurement
- **Visualization**: Automatic chart and graph generation
Expand Down
94 changes: 94 additions & 0 deletions src/__tests__/unit/evals/guardrail-evals.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/**
* Unit tests for guardrail evaluation utilities.
*/

import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import { GuardrailEval } from '../../../evals/guardrail-evals';
import type { Sample } from '../../../evals/core/types';
import * as os from 'os';

// Mock the 'os' module so tests can control the reported CPU count.
// A single mock fn is shared between the default and named `cpus` exports:
// vi.mock factories are hoisted above imports, so the shared fn must be
// created with vi.hoisted. Using two separate vi.fn() instances here would
// mean code that default-imports 'os' sees a mock the tests never configure
// (only the named export is set up via vi.mocked(os.cpus) below).
const cpusMock = vi.hoisted(() => vi.fn());

vi.mock('os', () => {
  return {
    default: {
      cpus: cpusMock,
    },
    cpus: cpusMock,
  };
});

/**
 * Build synthetic samples for chunking tests.
 *
 * @param count - Number of synthetic samples to build.
 * @returns List of Sample instances configured for evaluation.
 */
function buildSamples(count: number): Sample[] {
  const samples: Sample[] = [];
  for (let i = 0; i < count; i++) {
    // Alternate expected trigger outcomes so both branches are represented.
    samples.push({
      id: `sample-${i}`,
      data: `payload-${i}`,
      expectedTriggers: { g: Boolean(i % 2) },
    });
  }
  return samples;
}

describe('GuardrailEval._determineParallelModelLimit', () => {
  // Keep mock call history isolated per test, and undo all mocking afterwards
  // so the os.cpus mock never leaks into other suites.
  beforeEach(() => vi.clearAllMocks());
  afterEach(() => vi.restoreAllMocks());

  it('should use cpu_count when explicit parallelism is not provided', () => {
    // Simulate a 4-core machine; the limit should be min(modelCount, cpus).
    const fourCores = Array(4).fill({}) as os.CpuInfo[];
    vi.mocked(os.cpus).mockReturnValue(fourCores);

    expect(GuardrailEval._determineParallelModelLimit(10, null)).toBe(4);
    expect(GuardrailEval._determineParallelModelLimit(2, null)).toBe(2);
  });

  it('should honor user-provided parallelism constraints', () => {
    // An explicit cap wins over the CPU-derived default; zero is rejected.
    expect(GuardrailEval._determineParallelModelLimit(5, 3)).toBe(3);
    expect(() => GuardrailEval._determineParallelModelLimit(5, 0)).toThrow(
      'maxParallelModels must be positive'
    );
  });

  it('should throw error for invalid model count', () => {
    expect(() => GuardrailEval._determineParallelModelLimit(0, null)).toThrow(
      'modelCount must be positive'
    );
  });
});

describe('GuardrailEval._chunkSamples', () => {
  it('should return the original sample list when no chunk size is provided', () => {
    const samples = buildSamples(3);
    const chunks = [...GuardrailEval._chunkSamples(samples, null)];
    expect(chunks).toHaveLength(1);
    // Identity check: with no chunking, the very same array is yielded.
    expect(chunks[0]).toBe(samples);
  });

  it('should split samples into evenly sized chunks', () => {
    const chunks = [...GuardrailEval._chunkSamples(buildSamples(5), 2)];
    // 5 samples at chunk size 2 -> two full chunks plus a remainder of one.
    expect(chunks.map((chunk) => chunk.length)).toEqual([2, 2, 1]);
    expect(chunks.map((chunk) => chunk[0].id)).toEqual([
      'sample-0',
      'sample-2',
      'sample-4',
    ]);
  });

  it('should reject invalid chunk sizes', () => {
    const samples = buildSamples(2);
    expect(() => [...GuardrailEval._chunkSamples(samples, 0)]).toThrow(
      'chunkSize must be positive when provided'
    );
  });

  it('should return single chunk when chunk size is larger than samples', () => {
    const samples = buildSamples(3);
    const chunks = [...GuardrailEval._chunkSamples(samples, 10)];
    expect(chunks).toHaveLength(1);
    // Oversized chunk size degenerates to the identity case.
    expect(chunks[0]).toBe(samples);
  });

  it('should handle empty samples', () => {
    const samples: Sample[] = [];
    const chunks = [...GuardrailEval._chunkSamples(samples, 2)];
    expect(chunks).toHaveLength(1);
    expect(chunks[0]).toEqual([]);
  });
});

142 changes: 142 additions & 0 deletions src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,16 @@ interface CliArgs {
batchSize?: number;
outputDir?: string;
multiTurn?: boolean;
maxParallelModels?: number | null;
benchmarkChunkSize?: number | null;
mode?: 'evaluate' | 'benchmark';
stages?: string[];
models?: string[];
latencyIterations?: number;
apiKey?: string | null;
baseUrl?: string | null;
azureEndpoint?: string | null;
azureApiVersion?: string;
help?: boolean;
}

Expand Down Expand Up @@ -72,6 +82,52 @@ function parseArgs(argv: string[]): CliArgs {
args.outputDir = argv[++i];
} else if (arg === '--multi-turn') {
args.multiTurn = true;
} else if (arg === '--max-parallel-models') {
const value = parseInt(argv[++i], 10);
if (isNaN(value) || value <= 0) {
console.error(`❌ Error: max-parallel-models must be positive, got: ${argv[i]}`);
process.exit(1);
}
args.maxParallelModels = value;
} else if (arg === '--benchmark-chunk-size') {
const value = parseInt(argv[++i], 10);
if (isNaN(value) || value <= 0) {
console.error(`❌ Error: benchmark-chunk-size must be positive, got: ${argv[i]}`);
process.exit(1);
}
args.benchmarkChunkSize = value;
} else if (arg === '--mode') {
const mode = argv[++i];
if (mode !== 'evaluate' && mode !== 'benchmark') {
console.error(`❌ Error: Invalid mode: ${mode}. Must be 'evaluate' or 'benchmark'`);
process.exit(1);
}
args.mode = mode as 'evaluate' | 'benchmark';
} else if (arg === '--stages') {
args.stages = [];
while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
args.stages.push(argv[++i]);
}
} else if (arg === '--models') {
args.models = [];
while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
args.models.push(argv[++i]);
}
} else if (arg === '--latency-iterations') {
const value = parseInt(argv[++i], 10);
if (isNaN(value) || value <= 0) {
console.error(`❌ Error: latency-iterations must be positive, got: ${argv[i]}`);
process.exit(1);
}
args.latencyIterations = value;
} else if (arg === '--api-key') {
args.apiKey = argv[++i];
} else if (arg === '--base-url') {
args.baseUrl = argv[++i];
} else if (arg === '--azure-endpoint') {
args.azureEndpoint = argv[++i];
} else if (arg === '--azure-api-version') {
args.azureApiVersion = argv[++i];
} else if (!args.configFile && !arg.startsWith('-')) {
args.configFile = arg;
}
Expand Down Expand Up @@ -119,6 +175,12 @@ function showHelp(): void {
console.log(
' --dataset-path <path> Path to evaluation dataset (required)'
);
console.log(
' --mode <mode> Evaluation mode: "evaluate" or "benchmark" (default: evaluate)'
);
console.log(
' --stages <stage>... Pipeline stages to evaluate: pre_flight, input, output'
);
console.log(
' --batch-size <number> Number of samples to process in parallel (default: 32)'
);
Expand All @@ -128,6 +190,32 @@ function showHelp(): void {
console.log(
' --multi-turn Evaluate conversation-aware guardrails turn-by-turn (default: single-pass)'
);
console.log('Benchmark Options:');
console.log(
' --models <model>... Models to test in benchmark mode (default: gpt-5, gpt-5-mini, gpt-4.1, gpt-4.1-mini)'
);
console.log(
' --latency-iterations <number> Number of iterations for latency testing (default: 25)'
);
console.log(
' --max-parallel-models <number> Maximum number of models to benchmark concurrently (default: min(models, cpu_count))'
);
console.log(
' --benchmark-chunk-size <number> Optional number of samples per chunk when benchmarking to limit long-running runs'
);
console.log('API Configuration:');
console.log(
' --api-key <key> API key for OpenAI, Azure OpenAI, or OpenAI-compatible API'
);
console.log(
' --base-url <url> Base URL for OpenAI-compatible API (e.g., http://localhost:11434/v1)'
);
console.log(
' --azure-endpoint <endpoint> Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com)'
);
console.log(
' --azure-api-version <version> Azure OpenAI API version (default: 2025-01-01-preview)'
);
console.log('');
console.log('Examples:');
console.log(' guardrails validate config.json');
Expand All @@ -136,6 +224,12 @@ function showHelp(): void {
console.log(
' guardrails eval --config-path config.json --dataset-path dataset.jsonl --batch-size 16 --output-dir my-results'
);
console.log(
' guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --models gpt-5 gpt-5-mini'
);
console.log(
' guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --azure-endpoint https://your-resource.openai.azure.com --api-key your-key'
);
console.log(' guardrails validate-dataset dataset.jsonl');
}

Expand All @@ -154,13 +248,61 @@ async function handleEvalCommand(args: CliArgs): Promise<void> {
process.exit(1);
}

if (args.maxParallelModels !== undefined && args.maxParallelModels !== null && args.maxParallelModels <= 0) {
console.error(`❌ Error: max-parallel-models must be positive, got: ${args.maxParallelModels}`);
process.exit(1);
}

if (args.benchmarkChunkSize !== undefined && args.benchmarkChunkSize !== null && args.benchmarkChunkSize <= 0) {
console.error(`❌ Error: benchmark-chunk-size must be positive, got: ${args.benchmarkChunkSize}`);
process.exit(1);
}

if (args.latencyIterations !== undefined && args.latencyIterations <= 0) {
console.error(`❌ Error: latency-iterations must be positive, got: ${args.latencyIterations}`);
process.exit(1);
}

if (args.stages) {
const validStages = new Set(['pre_flight', 'input', 'output']);
const invalidStages = args.stages.filter((s) => !validStages.has(s));
if (invalidStages.length > 0) {
console.error(`❌ Error: Invalid stages: ${invalidStages.join(', ')}. Valid stages are: ${Array.from(validStages).join(', ')}`);
process.exit(1);
}
}

if (args.mode === 'benchmark' && args.stages && args.stages.length > 1) {
console.warn('⚠️ Warning: Benchmark mode only uses the first specified stage. Additional stages will be ignored.');
}

if (args.azureEndpoint && args.baseUrl) {
console.error('❌ Error: Cannot specify both --azure-endpoint and --base-url. Choose one provider.');
process.exit(1);
}

if (args.azureEndpoint && !args.apiKey) {
console.error('❌ Error: --api-key is required when using --azure-endpoint');
process.exit(1);
}

try {
await runEvaluationCLI({
configPath: args.configPath,
datasetPath: args.datasetPath,
stages: args.stages || null,
batchSize: args.batchSize || 32,
outputDir: args.outputDir || 'results',
apiKey: args.apiKey || null,
baseUrl: args.baseUrl || null,
azureEndpoint: args.azureEndpoint || null,
azureApiVersion: args.azureApiVersion || '2025-01-01-preview',
mode: args.mode || 'evaluate',
models: args.models || null,
latencyIterations: args.latencyIterations,
multiTurn: args.multiTurn,
maxParallelModels: args.maxParallelModels,
benchmarkChunkSize: args.benchmarkChunkSize,
});

console.log('Evaluation completed successfully!');
Expand Down
7 changes: 3 additions & 4 deletions src/evals/core/async-engine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,17 @@ export class AsyncRunEngine implements RunEngine {
}

const results: SampleResult[] = [];
let processed = 0;
const totalSamples = samples.length;

console.log(`${desc}: ${samples.length} samples, batch size: ${batchSize}`);
console.log(`${desc}: ${totalSamples} samples, batch size: ${batchSize}`);

for (let i = 0; i < samples.length; i += batchSize) {
const batch = samples.slice(i, i + batchSize);
const batchResults = await Promise.all(
batch.map((sample) => this.evaluateSample(context, sample))
);
results.push(...batchResults);
processed += batch.length;
console.log(`Processed ${processed}/${samples.length} samples`);
console.log(`Processed ${results.length}/${totalSamples} samples`);
}

return results;
Expand Down
Loading
Loading