Commit f55e103

Adding benchmarking functionality
1 parent cbac21a commit f55e103

9 files changed (+1528 −63 lines)


src/__tests__/unit/evals/guardrail-evals.test.ts

Lines changed: 3 additions & 3 deletions
@@ -47,11 +47,11 @@ describe('GuardrailEval._determineParallelModelLimit', () => {
 
   it('should honor user-provided parallelism constraints', () => {
     expect(GuardrailEval._determineParallelModelLimit(5, 3)).toBe(3);
-    expect(() => GuardrailEval._determineParallelModelLimit(5, 0)).toThrow('max_parallel_models must be positive');
+    expect(() => GuardrailEval._determineParallelModelLimit(5, 0)).toThrow('maxParallelModels must be positive');
   });
 
   it('should throw error for invalid model count', () => {
-    expect(() => GuardrailEval._determineParallelModelLimit(0, null)).toThrow('model_count must be positive');
+    expect(() => GuardrailEval._determineParallelModelLimit(0, null)).toThrow('modelCount must be positive');
   });
 });
 

@@ -74,7 +74,7 @@ describe('GuardrailEval._chunkSamples', () => {
 
   it('should reject invalid chunk sizes', () => {
     const samples = buildSamples(2);
-    expect(() => Array.from(GuardrailEval._chunkSamples(samples, 0))).toThrow('chunk_size must be positive when provided');
+    expect(() => Array.from(GuardrailEval._chunkSamples(samples, 0))).toThrow('chunkSize must be positive when provided');
   });
 
   it('should return single chunk when chunk size is larger than samples', () => {
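For orientation, here is a minimal sketch of the behaviour these assertions pin down, inferred only from the tests and from the CLI help text later in this commit. The function name and the cpu-count fallback are assumptions for illustration, not the actual GuardrailEval implementation:

import * as os from 'os';

// Hypothetical sketch of the parallel-model limit the tests describe.
function determineParallelModelLimit(
  modelCount: number,
  maxParallelModels: number | null
): number {
  if (modelCount <= 0) {
    throw new Error('modelCount must be positive');
  }
  if (maxParallelModels !== null) {
    if (maxParallelModels <= 0) {
      throw new Error('maxParallelModels must be positive');
    }
    // A user-provided cap wins, but never exceeds the number of models.
    return Math.min(maxParallelModels, modelCount);
  }
  // Default per the help text: min(models, cpu_count).
  return Math.min(modelCount, os.cpus().length);
}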

src/cli.ts

Lines changed: 108 additions & 0 deletions
@@ -38,6 +38,14 @@ interface CliArgs {
   multiTurn?: boolean;
   maxParallelModels?: number | null;
   benchmarkChunkSize?: number | null;
+  mode?: 'evaluate' | 'benchmark';
+  stages?: string[];
+  models?: string[];
+  latencyIterations?: number;
+  apiKey?: string | null;
+  baseUrl?: string | null;
+  azureEndpoint?: string | null;
+  azureApiVersion?: string;
   help?: boolean;
 }
 

@@ -88,6 +96,38 @@ function parseArgs(argv: string[]): CliArgs {
         process.exit(1);
       }
       args.benchmarkChunkSize = value;
+    } else if (arg === '--mode') {
+      const mode = argv[++i];
+      if (mode !== 'evaluate' && mode !== 'benchmark') {
+        console.error(`❌ Error: Invalid mode: ${mode}. Must be 'evaluate' or 'benchmark'`);
+        process.exit(1);
+      }
+      args.mode = mode as 'evaluate' | 'benchmark';
+    } else if (arg === '--stages') {
+      args.stages = [];
+      while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
+        args.stages.push(argv[++i]);
+      }
+    } else if (arg === '--models') {
+      args.models = [];
+      while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
+        args.models.push(argv[++i]);
+      }
+    } else if (arg === '--latency-iterations') {
+      const value = parseInt(argv[++i], 10);
+      if (isNaN(value) || value <= 0) {
+        console.error(`❌ Error: latency-iterations must be positive, got: ${argv[i]}`);
+        process.exit(1);
+      }
+      args.latencyIterations = value;
+    } else if (arg === '--api-key') {
+      args.apiKey = argv[++i];
+    } else if (arg === '--base-url') {
+      args.baseUrl = argv[++i];
+    } else if (arg === '--azure-endpoint') {
+      args.azureEndpoint = argv[++i];
+    } else if (arg === '--azure-api-version') {
+      args.azureApiVersion = argv[++i];
     } else if (!args.configFile && !arg.startsWith('-')) {
       args.configFile = arg;
     }
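One consequence of the parsing above: --stages and --models consume every following token until the next -- flag, so their values are space-separated rather than comma-separated. A self-contained sketch of that collection pattern, with an illustrative helper name that is not part of this commit:

// Collect space-separated values for a multi-value flag, stopping at the next '--' option.
function collectMultiValue(argv: string[], flagIndex: number): { values: string[]; lastIndex: number } {
  const values: string[] = [];
  let i = flagIndex;
  while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
    values.push(argv[++i]);
  }
  return { values, lastIndex: i };
}

// Example: for ['--models', 'gpt-5', 'gpt-5-mini', '--latency-iterations', '10'],
// collectMultiValue(argv, 0) returns { values: ['gpt-5', 'gpt-5-mini'], lastIndex: 2 }.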
@@ -135,6 +175,12 @@ function showHelp(): void {
   console.log(
     ' --dataset-path <path> Path to evaluation dataset (required)'
   );
+  console.log(
+    ' --mode <mode> Evaluation mode: "evaluate" or "benchmark" (default: evaluate)'
+  );
+  console.log(
+    ' --stages <stage>... Pipeline stages to evaluate: pre_flight, input, output'
+  );
   console.log(
     ' --batch-size <number> Number of samples to process in parallel (default: 32)'
   );
@@ -144,12 +190,32 @@ function showHelp(): void {
   console.log(
     ' --multi-turn Evaluate conversation-aware guardrails turn-by-turn (default: single-pass)'
   );
+  console.log('Benchmark Options:');
+  console.log(
+    ' --models <model>... Models to test in benchmark mode (default: gpt-5, gpt-5-mini, gpt-4.1, gpt-4.1-mini)'
+  );
+  console.log(
+    ' --latency-iterations <number> Number of iterations for latency testing (default: 25)'
+  );
   console.log(
     ' --max-parallel-models <number> Maximum number of models to benchmark concurrently (default: min(models, cpu_count))'
   );
   console.log(
     ' --benchmark-chunk-size <number> Optional number of samples per chunk when benchmarking to limit long-running runs'
   );
+  console.log('API Configuration:');
+  console.log(
+    ' --api-key <key> API key for OpenAI, Azure OpenAI, or OpenAI-compatible API'
+  );
+  console.log(
+    ' --base-url <url> Base URL for OpenAI-compatible API (e.g., http://localhost:11434/v1)'
+  );
+  console.log(
+    ' --azure-endpoint <endpoint> Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com)'
+  );
+  console.log(
+    ' --azure-api-version <version> Azure OpenAI API version (default: 2025-01-01-preview)'
+  );
   console.log('');
   console.log('Examples:');
   console.log(' guardrails validate config.json');
@@ -158,6 +224,12 @@ function showHelp(): void {
   console.log(
     ' guardrails eval --config-path config.json --dataset-path dataset.jsonl --batch-size 16 --output-dir my-results'
   );
+  console.log(
+    ' guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --models gpt-5 gpt-5-mini'
+  );
+  console.log(
+    ' guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --azure-endpoint https://your-resource.openai.azure.com --api-key your-key'
+  );
   console.log(' guardrails validate-dataset dataset.jsonl');
 }
 

@@ -186,12 +258,48 @@ async function handleEvalCommand(args: CliArgs): Promise<void> {
     process.exit(1);
   }
 
+  if (args.latencyIterations !== undefined && args.latencyIterations <= 0) {
+    console.error(`❌ Error: latency-iterations must be positive, got: ${args.latencyIterations}`);
+    process.exit(1);
+  }
+
+  if (args.stages) {
+    const validStages = new Set(['pre_flight', 'input', 'output']);
+    const invalidStages = args.stages.filter((s) => !validStages.has(s));
+    if (invalidStages.length > 0) {
+      console.error(`❌ Error: Invalid stages: ${invalidStages.join(', ')}. Valid stages are: ${Array.from(validStages).join(', ')}`);
+      process.exit(1);
+    }
+  }
+
+  if (args.mode === 'benchmark' && args.stages && args.stages.length > 1) {
+    console.warn('⚠️ Warning: Benchmark mode only uses the first specified stage. Additional stages will be ignored.');
+  }
+
+  if (args.azureEndpoint && args.baseUrl) {
+    console.error('❌ Error: Cannot specify both --azure-endpoint and --base-url. Choose one provider.');
+    process.exit(1);
+  }
+
+  if (args.azureEndpoint && !args.apiKey) {
+    console.error('❌ Error: --api-key is required when using --azure-endpoint');
+    process.exit(1);
+  }
+
   try {
     await runEvaluationCLI({
       configPath: args.configPath,
       datasetPath: args.datasetPath,
+      stages: args.stages || null,
       batchSize: args.batchSize || 32,
       outputDir: args.outputDir || 'results',
+      apiKey: args.apiKey || null,
+      baseUrl: args.baseUrl || null,
+      azureEndpoint: args.azureEndpoint || null,
+      azureApiVersion: args.azureApiVersion || '2025-01-01-preview',
+      mode: args.mode || 'evaluate',
+      models: args.models || null,
+      latencyIterations: args.latencyIterations,
       multiTurn: args.multiTurn,
       maxParallelModels: args.maxParallelModels,
       benchmarkChunkSize: args.benchmarkChunkSize,
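The new guard clauses above encode a handful of rules: latency iterations must be positive, stages must come from pre_flight/input/output, benchmark mode only honours the first stage, and --azure-endpoint is mutually exclusive with --base-url but requires --api-key. A compact restatement as a pure function, purely for illustration; the committed code logs and calls process.exit instead of returning errors:

interface EvalFlags {
  latencyIterations?: number;
  stages?: string[];
  azureEndpoint?: string | null;
  baseUrl?: string | null;
  apiKey?: string | null;
}

// Illustrative re-statement of the CLI validation rules; not the committed code.
function validateEvalFlags(flags: EvalFlags): string[] {
  const errors: string[] = [];
  const validStages = new Set(['pre_flight', 'input', 'output']);
  if (flags.latencyIterations !== undefined && flags.latencyIterations <= 0) {
    errors.push('latency-iterations must be positive');
  }
  const invalidStages = (flags.stages ?? []).filter((s) => !validStages.has(s));
  if (invalidStages.length > 0) {
    errors.push(`Invalid stages: ${invalidStages.join(', ')}`);
  }
  if (flags.azureEndpoint && flags.baseUrl) {
    errors.push('Cannot specify both --azure-endpoint and --base-url');
  }
  if (flags.azureEndpoint && !flags.apiKey) {
    errors.push('--api-key is required when using --azure-endpoint');
  }
  return errors;
}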

src/evals/core/async-engine.ts

Lines changed: 3 additions & 4 deletions
@@ -46,18 +46,17 @@ export class AsyncRunEngine implements RunEngine {
     }
 
     const results: SampleResult[] = [];
-    let processed = 0;
+    const totalSamples = samples.length;
 
-    console.log(`${desc}: ${samples.length} samples, batch size: ${batchSize}`);
+    console.log(`${desc}: ${totalSamples} samples, batch size: ${batchSize}`);
 
     for (let i = 0; i < samples.length; i += batchSize) {
       const batch = samples.slice(i, i + batchSize);
       const batchResults = await Promise.all(
         batch.map((sample) => this.evaluateSample(context, sample))
       );
       results.push(...batchResults);
-      processed += batch.length;
-      console.log(`Processed ${processed}/${samples.length} samples`);
+      console.log(`Processed ${results.length}/${totalSamples} samples`);
     }
 
     return results;
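The loop above is the generic batch pattern: slice the input, run each batch concurrently through Promise.all, and report progress from results.length rather than a separate counter. A standalone sketch of the same pattern; the helper name and generics are illustrative, not part of this commit:

// Run items in sequential batches, with each batch's work executed concurrently.
async function processInBatches<T, R>(
  items: T[],
  batchSize: number,
  worker: (item: T) => Promise<R>
): Promise<R[]> {
  const results: R[] = [];
  const total = items.length;
  for (let i = 0; i < total; i += batchSize) {
    const batch = items.slice(i, i + batchSize);
    const batchResults = await Promise.all(batch.map(worker));
    results.push(...batchResults);
    console.log(`Processed ${results.length}/${total} items`);
  }
  return results;
}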
