
Commit 6f5c8db

Run models in parallel when benchmarking (#43)
* Run models in parallel when benchmarking
* Adding benchmarking functionality
* Update eval doc
* Fix Azure api key warning
* Unused context
* Remove cli-progress import
1 parent 0e95a1c commit 6f5c8db

File tree

10 files changed: +1701 −36 lines


docs/evals.md

Lines changed: 4 additions & 0 deletions

```diff
@@ -35,6 +35,8 @@ The evals tool is included with the TypeScript package. No additional dependencies
 | `--azure-api-version` || Azure OpenAI API version (default: 2025-01-01-preview) |
 | `--models` || Models for benchmark mode (benchmark only) |
 | `--latency-iterations` || Latency test samples (default: 25) (benchmark only) |
+| `--max-parallel-models` || Maximum concurrent models in benchmark mode (default: CPU count) (benchmark only) |
+| `--benchmark-chunk-size` || Sample chunk size per model for memory-efficient benchmarking (benchmark only) |

 ## Configuration

@@ -154,6 +156,8 @@ npm run eval -- --config-path config.json --dataset-path data.jsonl --base-url h
 - **Multi-stage evaluation**: pre_flight, input, output stages
 - **Automatic stage detection**: Evaluates all stages found in configuration
 - **Batch processing**: Configurable parallel processing
+- **Parallel benchmarking**: Run multiple models concurrently with CPU-aware defaults
+- **Memory-efficient chunking**: Process large datasets in smaller chunks during benchmarking
 - **Benchmark mode**: Model performance comparison with ROC AUC, precision at recall thresholds
 - **Latency testing**: End-to-end guardrail performance measurement
 - **Visualization**: Automatic chart and graph generation
```
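The table and feature list above describe the new flags; for orientation, a representative benchmark invocation combining them (file names and values are illustrative, not taken from the commit) might look like:

```bash
npm run eval -- --config-path config.json --dataset-path data.jsonl \
  --mode benchmark --models gpt-5 gpt-5-mini \
  --max-parallel-models 4 --benchmark-chunk-size 500
```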
Lines changed: 94 additions & 0 deletions

```ts
/**
 * Unit tests for guardrail evaluation utilities.
 */

import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import { GuardrailEval } from '../../../evals/guardrail-evals';
import type { Sample } from '../../../evals/core/types';
import * as os from 'os';

vi.mock('os', () => {
  return {
    default: {
      cpus: vi.fn(),
    },
    cpus: vi.fn(),
  };
});

/**
 * Build synthetic samples for chunking tests.
 *
 * @param count - Number of synthetic samples to build.
 * @returns List of Sample instances configured for evaluation.
 */
function buildSamples(count: number): Sample[] {
  return Array.from({ length: count }, (_, idx) => ({
    id: `sample-${idx}`,
    data: `payload-${idx}`,
    expectedTriggers: { g: Boolean(idx % 2) },
  }));
}

describe('GuardrailEval._determineParallelModelLimit', () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  afterEach(() => {
    vi.restoreAllMocks();
  });

  it('should use cpu_count when explicit parallelism is not provided', () => {
    vi.mocked(os.cpus).mockReturnValue(Array(4).fill({}) as os.CpuInfo[]);

    expect(GuardrailEval._determineParallelModelLimit(10, null)).toBe(4);
    expect(GuardrailEval._determineParallelModelLimit(2, null)).toBe(2);
  });

  it('should honor user-provided parallelism constraints', () => {
    expect(GuardrailEval._determineParallelModelLimit(5, 3)).toBe(3);
    expect(() => GuardrailEval._determineParallelModelLimit(5, 0)).toThrow('maxParallelModels must be positive');
  });

  it('should throw error for invalid model count', () => {
    expect(() => GuardrailEval._determineParallelModelLimit(0, null)).toThrow('modelCount must be positive');
  });
});

describe('GuardrailEval._chunkSamples', () => {
  it('should return the original sample list when no chunk size is provided', () => {
    const samples = buildSamples(3);
    const chunks = Array.from(GuardrailEval._chunkSamples(samples, null));
    expect(chunks.length).toBe(1);
    expect(chunks[0]).toBe(samples);
  });

  it('should split samples into evenly sized chunks', () => {
    const samples = buildSamples(5);
    const chunks = Array.from(GuardrailEval._chunkSamples(samples, 2));
    expect(chunks.map((chunk) => chunk.length)).toEqual([2, 2, 1]);
    expect(chunks[0][0].id).toBe('sample-0');
    expect(chunks[1][0].id).toBe('sample-2');
    expect(chunks[2][0].id).toBe('sample-4');
  });

  it('should reject invalid chunk sizes', () => {
    const samples = buildSamples(2);
    expect(() => Array.from(GuardrailEval._chunkSamples(samples, 0))).toThrow('chunkSize must be positive when provided');
  });

  it('should return single chunk when chunk size is larger than samples', () => {
    const samples = buildSamples(3);
    const chunks = Array.from(GuardrailEval._chunkSamples(samples, 10));
    expect(chunks.length).toBe(1);
    expect(chunks[0]).toBe(samples);
  });

  it('should handle empty samples', () => {
    const samples: Sample[] = [];
    const chunks = Array.from(GuardrailEval._chunkSamples(samples, 2));
    expect(chunks.length).toBe(1);
    expect(chunks[0]).toEqual([]);
  });
});
```
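This excerpt does not include the implementation in `guardrail-evals.ts` itself. As a reading aid, here is a minimal sketch consistent with the behavior the tests pin down — the method names, error messages, and identity-preserving single-chunk behavior come from the assertions above; everything else is assumed:

```ts
import * as os from 'os';

// Sketch only: the real GuardrailEval static methods may differ in details
// the tests above do not cover.
function determineParallelModelLimit(
  modelCount: number,
  maxParallelModels: number | null
): number {
  if (modelCount <= 0) throw new Error('modelCount must be positive');
  if (maxParallelModels !== null) {
    if (maxParallelModels <= 0) throw new Error('maxParallelModels must be positive');
    return Math.min(modelCount, maxParallelModels);
  }
  // CPU-aware default: never run more concurrent models than cores.
  return Math.min(modelCount, os.cpus().length);
}

function* chunkSamples<T>(samples: T[], chunkSize: number | null): Generator<T[]> {
  if (chunkSize === null) {
    yield samples; // No chunking requested: pass the list through untouched.
    return;
  }
  if (chunkSize <= 0) throw new Error('chunkSize must be positive when provided');
  if (chunkSize >= samples.length) {
    yield samples; // Single chunk: the tests assert the original array is yielded.
    return;
  }
  for (let i = 0; i < samples.length; i += chunkSize) {
    yield samples.slice(i, i + chunkSize);
  }
}
```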

src/cli.ts

Lines changed: 142 additions & 0 deletions

```diff
@@ -36,6 +36,16 @@ interface CliArgs {
   batchSize?: number;
   outputDir?: string;
   multiTurn?: boolean;
+  maxParallelModels?: number | null;
+  benchmarkChunkSize?: number | null;
+  mode?: 'evaluate' | 'benchmark';
+  stages?: string[];
+  models?: string[];
+  latencyIterations?: number;
+  apiKey?: string | null;
+  baseUrl?: string | null;
+  azureEndpoint?: string | null;
+  azureApiVersion?: string;
   help?: boolean;
 }

@@ -72,6 +82,52 @@ function parseArgs(argv: string[]): CliArgs {
       args.outputDir = argv[++i];
     } else if (arg === '--multi-turn') {
       args.multiTurn = true;
+    } else if (arg === '--max-parallel-models') {
+      const value = parseInt(argv[++i], 10);
+      if (isNaN(value) || value <= 0) {
+        console.error(`❌ Error: max-parallel-models must be positive, got: ${argv[i]}`);
+        process.exit(1);
+      }
+      args.maxParallelModels = value;
+    } else if (arg === '--benchmark-chunk-size') {
+      const value = parseInt(argv[++i], 10);
+      if (isNaN(value) || value <= 0) {
+        console.error(`❌ Error: benchmark-chunk-size must be positive, got: ${argv[i]}`);
+        process.exit(1);
+      }
+      args.benchmarkChunkSize = value;
+    } else if (arg === '--mode') {
+      const mode = argv[++i];
+      if (mode !== 'evaluate' && mode !== 'benchmark') {
+        console.error(`❌ Error: Invalid mode: ${mode}. Must be 'evaluate' or 'benchmark'`);
+        process.exit(1);
+      }
+      args.mode = mode as 'evaluate' | 'benchmark';
+    } else if (arg === '--stages') {
+      args.stages = [];
+      while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
+        args.stages.push(argv[++i]);
+      }
+    } else if (arg === '--models') {
+      args.models = [];
+      while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
+        args.models.push(argv[++i]);
+      }
+    } else if (arg === '--latency-iterations') {
+      const value = parseInt(argv[++i], 10);
+      if (isNaN(value) || value <= 0) {
+        console.error(`❌ Error: latency-iterations must be positive, got: ${argv[i]}`);
+        process.exit(1);
+      }
+      args.latencyIterations = value;
+    } else if (arg === '--api-key') {
+      args.apiKey = argv[++i];
+    } else if (arg === '--base-url') {
+      args.baseUrl = argv[++i];
+    } else if (arg === '--azure-endpoint') {
+      args.azureEndpoint = argv[++i];
+    } else if (arg === '--azure-api-version') {
+      args.azureApiVersion = argv[++i];
     } else if (!args.configFile && !arg.startsWith('-')) {
       args.configFile = arg;
     }
```
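The `--stages` and `--models` branches greedily consume tokens until the next `--`-prefixed argument, so values are passed space-separated (`--models gpt-5 gpt-5-mini`) rather than comma-separated. A self-contained illustration of the pattern — `collectMultiValue` is a hypothetical helper for demonstration, not part of the CLI:

```ts
// Hypothetical helper mirroring the greedy multi-value loop used above.
function collectMultiValue(argv: string[], start: number): { values: string[]; next: number } {
  const values: string[] = [];
  let i = start;
  // Consume tokens until the next flag (anything starting with '--').
  while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
    values.push(argv[++i]);
  }
  return { values, next: i };
}

const argv = ['--models', 'gpt-5', 'gpt-5-mini', '--latency-iterations', '10'];
console.log(collectMultiValue(argv, 0).values); // ['gpt-5', 'gpt-5-mini']
```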
```diff
@@ -119,6 +175,12 @@ function showHelp(): void {
   console.log(
     ' --dataset-path <path> Path to evaluation dataset (required)'
   );
+  console.log(
+    ' --mode <mode> Evaluation mode: "evaluate" or "benchmark" (default: evaluate)'
+  );
+  console.log(
+    ' --stages <stage>... Pipeline stages to evaluate: pre_flight, input, output'
+  );
   console.log(
     ' --batch-size <number> Number of samples to process in parallel (default: 32)'
   );
@@ -128,6 +190,32 @@ function showHelp(): void {
   console.log(
     ' --multi-turn Evaluate conversation-aware guardrails turn-by-turn (default: single-pass)'
   );
+  console.log('Benchmark Options:');
+  console.log(
+    ' --models <model>... Models to test in benchmark mode (default: gpt-5, gpt-5-mini, gpt-4.1, gpt-4.1-mini)'
+  );
+  console.log(
+    ' --latency-iterations <number> Number of iterations for latency testing (default: 25)'
+  );
+  console.log(
+    ' --max-parallel-models <number> Maximum number of models to benchmark concurrently (default: min(models, cpu_count))'
+  );
+  console.log(
+    ' --benchmark-chunk-size <number> Optional number of samples per chunk when benchmarking to limit long-running runs'
+  );
+  console.log('API Configuration:');
+  console.log(
+    ' --api-key <key> API key for OpenAI, Azure OpenAI, or OpenAI-compatible API'
+  );
+  console.log(
+    ' --base-url <url> Base URL for OpenAI-compatible API (e.g., http://localhost:11434/v1)'
+  );
+  console.log(
+    ' --azure-endpoint <endpoint> Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com)'
+  );
+  console.log(
+    ' --azure-api-version <version> Azure OpenAI API version (default: 2025-01-01-preview)'
+  );
   console.log('');
   console.log('Examples:');
   console.log(' guardrails validate config.json');
@@ -136,6 +224,12 @@ function showHelp(): void {
   console.log(
     ' guardrails eval --config-path config.json --dataset-path dataset.jsonl --batch-size 16 --output-dir my-results'
   );
+  console.log(
+    ' guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --models gpt-5 gpt-5-mini'
+  );
+  console.log(
+    ' guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --azure-endpoint https://your-resource.openai.azure.com --api-key your-key'
+  );
   console.log(' guardrails validate-dataset dataset.jsonl');
 }
```

```diff
@@ -154,13 +248,61 @@ async function handleEvalCommand(args: CliArgs): Promise<void> {
     process.exit(1);
   }

+  if (args.maxParallelModels !== undefined && args.maxParallelModels !== null && args.maxParallelModels <= 0) {
+    console.error(`❌ Error: max-parallel-models must be positive, got: ${args.maxParallelModels}`);
+    process.exit(1);
+  }
+
+  if (args.benchmarkChunkSize !== undefined && args.benchmarkChunkSize !== null && args.benchmarkChunkSize <= 0) {
+    console.error(`❌ Error: benchmark-chunk-size must be positive, got: ${args.benchmarkChunkSize}`);
+    process.exit(1);
+  }
+
+  if (args.latencyIterations !== undefined && args.latencyIterations <= 0) {
+    console.error(`❌ Error: latency-iterations must be positive, got: ${args.latencyIterations}`);
+    process.exit(1);
+  }
+
+  if (args.stages) {
+    const validStages = new Set(['pre_flight', 'input', 'output']);
+    const invalidStages = args.stages.filter((s) => !validStages.has(s));
+    if (invalidStages.length > 0) {
+      console.error(`❌ Error: Invalid stages: ${invalidStages.join(', ')}. Valid stages are: ${Array.from(validStages).join(', ')}`);
+      process.exit(1);
+    }
+  }
+
+  if (args.mode === 'benchmark' && args.stages && args.stages.length > 1) {
+    console.warn('⚠️ Warning: Benchmark mode only uses the first specified stage. Additional stages will be ignored.');
+  }
+
+  if (args.azureEndpoint && args.baseUrl) {
+    console.error('❌ Error: Cannot specify both --azure-endpoint and --base-url. Choose one provider.');
+    process.exit(1);
+  }
+
+  if (args.azureEndpoint && !args.apiKey) {
+    console.error('❌ Error: --api-key is required when using --azure-endpoint');
+    process.exit(1);
+  }
+
   try {
     await runEvaluationCLI({
       configPath: args.configPath,
       datasetPath: args.datasetPath,
+      stages: args.stages || null,
       batchSize: args.batchSize || 32,
       outputDir: args.outputDir || 'results',
+      apiKey: args.apiKey || null,
+      baseUrl: args.baseUrl || null,
+      azureEndpoint: args.azureEndpoint || null,
+      azureApiVersion: args.azureApiVersion || '2025-01-01-preview',
+      mode: args.mode || 'evaluate',
+      models: args.models || null,
+      latencyIterations: args.latencyIterations,
       multiTurn: args.multiTurn,
+      maxParallelModels: args.maxParallelModels,
+      benchmarkChunkSize: args.benchmarkChunkSize,
     });

     console.log('Evaluation completed successfully!');
```
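The full options object passed to `runEvaluationCLI` is visible above; for callers driving the evaluation programmatically rather than through the CLI, an equivalent call might look like the sketch below. The import path and the concrete values are assumptions, not shown in this commit:

```ts
// Illustrative import path; the actual module layout may differ.
import { runEvaluationCLI } from './evals/guardrail-evals';

await runEvaluationCLI({
  configPath: 'config.json',
  datasetPath: 'dataset.jsonl',
  stages: null, // null: evaluate every stage found in the config
  batchSize: 32,
  outputDir: 'results',
  apiKey: process.env.OPENAI_API_KEY ?? null,
  baseUrl: null,
  azureEndpoint: null,
  azureApiVersion: '2025-01-01-preview',
  mode: 'benchmark',
  models: ['gpt-5', 'gpt-5-mini'],
  latencyIterations: 25,
  multiTurn: false,
  maxParallelModels: 2,    // cap concurrent models below the CPU count
  benchmarkChunkSize: 500, // process samples in chunks of 500 per model
});
```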

src/evals/core/async-engine.ts

Lines changed: 3 additions & 4 deletions

```diff
@@ -46,18 +46,17 @@ export class AsyncRunEngine implements RunEngine {
     }

     const results: SampleResult[] = [];
-    let processed = 0;
+    const totalSamples = samples.length;

-    console.log(`${desc}: ${samples.length} samples, batch size: ${batchSize}`);
+    console.log(`${desc}: ${totalSamples} samples, batch size: ${batchSize}`);

     for (let i = 0; i < samples.length; i += batchSize) {
       const batch = samples.slice(i, i + batchSize);
       const batchResults = await Promise.all(
         batch.map((sample) => this.evaluateSample(context, sample))
       );
       results.push(...batchResults);
-      processed += batch.length;
-      console.log(`Processed ${processed}/${samples.length} samples`);
+      console.log(`Processed ${results.length}/${totalSamples} samples`);
     }

     return results;
```
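The batch loop above bounds concurrency within a single model's run; benchmarking several models in parallel additionally needs a cap across models, whose implementation is not part of this excerpt. A minimal promise-pool sketch of that pattern, with `benchmarkModel` standing in for the real per-model work:

```ts
// Sketch only: a generic concurrency cap over per-model benchmark tasks.
async function runModelsInParallel<R>(
  models: string[],
  limit: number,
  benchmarkModel: (model: string) => Promise<R>
): Promise<R[]> {
  const results: R[] = new Array(models.length);
  let next = 0;

  // Start `limit` workers; each pulls the next unclaimed model index.
  // `next++` is safe here: it runs synchronously between awaits.
  const workers = Array.from({ length: Math.min(limit, models.length) }, async () => {
    while (next < models.length) {
      const idx = next++;
      results[idx] = await benchmarkModel(models[idx]);
    }
  });

  await Promise.all(workers);
  return results;
}
```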
