@@ -36,6 +36,16 @@ interface CliArgs {
   batchSize?: number;
   outputDir?: string;
   multiTurn?: boolean;
+  maxParallelModels?: number | null;
+  benchmarkChunkSize?: number | null;
+  mode?: 'evaluate' | 'benchmark';
+  stages?: string[];
+  models?: string[];
+  latencyIterations?: number;
+  apiKey?: string | null;
+  baseUrl?: string | null;
+  azureEndpoint?: string | null;
+  azureApiVersion?: string;
   help?: boolean;
 }
 
@@ -72,6 +82,52 @@ function parseArgs(argv: string[]): CliArgs {
       args.outputDir = argv[++i];
     } else if (arg === '--multi-turn') {
       args.multiTurn = true;
+    } else if (arg === '--max-parallel-models') {
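+      // parseInt returns NaN for a missing or non-numeric value, so this
+      // guard also rejects a flag passed without an argument.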
+      const value = parseInt(argv[++i], 10);
+      if (isNaN(value) || value <= 0) {
+        console.error(`❌ Error: max-parallel-models must be positive, got: ${argv[i]}`);
+        process.exit(1);
+      }
+      args.maxParallelModels = value;
+    } else if (arg === '--benchmark-chunk-size') {
+      const value = parseInt(argv[++i], 10);
+      if (isNaN(value) || value <= 0) {
+        console.error(`❌ Error: benchmark-chunk-size must be positive, got: ${argv[i]}`);
+        process.exit(1);
+      }
+      args.benchmarkChunkSize = value;
+    } else if (arg === '--mode') {
+      const mode = argv[++i];
+      if (mode !== 'evaluate' && mode !== 'benchmark') {
+        console.error(`❌ Error: Invalid mode: ${mode}. Must be 'evaluate' or 'benchmark'`);
+        process.exit(1);
+      }
+      args.mode = mode as 'evaluate' | 'benchmark';
+    } else if (arg === '--stages') {
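+      // Greedily collect every following token until the next '--' flag,
+      // so stages can be passed space-separated after a single flag.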
+      args.stages = [];
+      while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
+        args.stages.push(argv[++i]);
+      }
+    } else if (arg === '--models') {
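+      // Same greedy collection as --stages: consume tokens until the next flag.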
+      args.models = [];
+      while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
+        args.models.push(argv[++i]);
+      }
+    } else if (arg === '--latency-iterations') {
+      const value = parseInt(argv[++i], 10);
+      if (isNaN(value) || value <= 0) {
+        console.error(`❌ Error: latency-iterations must be positive, got: ${argv[i]}`);
+        process.exit(1);
+      }
+      args.latencyIterations = value;
+    } else if (arg === '--api-key') {
+      args.apiKey = argv[++i];
+    } else if (arg === '--base-url') {
+      args.baseUrl = argv[++i];
+    } else if (arg === '--azure-endpoint') {
+      args.azureEndpoint = argv[++i];
+    } else if (arg === '--azure-api-version') {
+      args.azureApiVersion = argv[++i];
     } else if (!args.configFile && !arg.startsWith('-')) {
       args.configFile = arg;
     }
@@ -119,6 +175,12 @@ function showHelp(): void {
   console.log(
     '  --dataset-path <path>            Path to evaluation dataset (required)'
   );
+  console.log(
+    '  --mode <mode>                    Evaluation mode: "evaluate" or "benchmark" (default: evaluate)'
+  );
+  console.log(
+    '  --stages <stage>...              Pipeline stages to evaluate: pre_flight, input, output'
+  );
   console.log(
     '  --batch-size <number>            Number of samples to process in parallel (default: 32)'
   );
@@ -128,6 +190,32 @@ function showHelp(): void {
   console.log(
     '  --multi-turn                     Evaluate conversation-aware guardrails turn-by-turn (default: single-pass)'
   );
+  console.log('Benchmark Options:');
+  console.log(
+    '  --models <model>...              Models to test in benchmark mode (default: gpt-5, gpt-5-mini, gpt-4.1, gpt-4.1-mini)'
+  );
+  console.log(
+    '  --latency-iterations <number>    Number of iterations for latency testing (default: 25)'
+  );
+  console.log(
+    '  --max-parallel-models <number>   Maximum number of models to benchmark concurrently (default: min(models, cpu_count))'
+  );
+  console.log(
+    '  --benchmark-chunk-size <number>  Optional number of samples per chunk when benchmarking, to limit long-running runs'
+  );
+  console.log('API Configuration:');
+  console.log(
+    '  --api-key <key>                  API key for OpenAI, Azure OpenAI, or an OpenAI-compatible API'
+  );
+  console.log(
+    '  --base-url <url>                 Base URL for an OpenAI-compatible API (e.g., http://localhost:11434/v1)'
+  );
+  console.log(
+    '  --azure-endpoint <endpoint>      Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com)'
+  );
+  console.log(
+    '  --azure-api-version <version>    Azure OpenAI API version (default: 2025-01-01-preview)'
+  );
   console.log('');
   console.log('Examples:');
   console.log('  guardrails validate config.json');
@@ -136,6 +224,12 @@ function showHelp(): void {
   console.log(
     '  guardrails eval --config-path config.json --dataset-path dataset.jsonl --batch-size 16 --output-dir my-results'
   );
+  console.log(
+    '  guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --models gpt-5 gpt-5-mini'
+  );
+  console.log(
+    '  guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --azure-endpoint https://your-resource.openai.azure.com --api-key your-key'
+  );
   console.log('  guardrails validate-dataset dataset.jsonl');
 }
 
@@ -154,13 +248,61 @@ async function handleEvalCommand(args: CliArgs): Promise<void> {
     process.exit(1);
   }
 
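+  // Re-validate numeric options here as well as in parseArgs, in case
+  // CliArgs was constructed without going through the parser.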
+  if (args.maxParallelModels !== undefined && args.maxParallelModels !== null && args.maxParallelModels <= 0) {
+    console.error(`❌ Error: max-parallel-models must be positive, got: ${args.maxParallelModels}`);
+    process.exit(1);
+  }
+
+  if (args.benchmarkChunkSize !== undefined && args.benchmarkChunkSize !== null && args.benchmarkChunkSize <= 0) {
+    console.error(`❌ Error: benchmark-chunk-size must be positive, got: ${args.benchmarkChunkSize}`);
+    process.exit(1);
+  }
+
+  if (args.latencyIterations !== undefined && args.latencyIterations <= 0) {
+    console.error(`❌ Error: latency-iterations must be positive, got: ${args.latencyIterations}`);
+    process.exit(1);
+  }
+
+  if (args.stages) {
+    const validStages = new Set(['pre_flight', 'input', 'output']);
+    const invalidStages = args.stages.filter((s) => !validStages.has(s));
+    if (invalidStages.length > 0) {
+      console.error(`❌ Error: Invalid stages: ${invalidStages.join(', ')}. Valid stages are: ${Array.from(validStages).join(', ')}`);
+      process.exit(1);
+    }
+  }
+
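+  // Benchmark mode uses only the first stage; extra stages get a warning rather than an error.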
+  if (args.mode === 'benchmark' && args.stages && args.stages.length > 1) {
+    console.warn('⚠️ Warning: Benchmark mode only uses the first specified stage. Additional stages will be ignored.');
+  }
+
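+  // Azure OpenAI and a generic OpenAI-compatible base URL are mutually exclusive providers.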
+  if (args.azureEndpoint && args.baseUrl) {
+    console.error('❌ Error: Cannot specify both --azure-endpoint and --base-url. Choose one provider.');
+    process.exit(1);
+  }
+
+  if (args.azureEndpoint && !args.apiKey) {
+    console.error('❌ Error: --api-key is required when using --azure-endpoint');
+    process.exit(1);
+  }
+
   try {
     await runEvaluationCLI({
       configPath: args.configPath,
       datasetPath: args.datasetPath,
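+      // Defaults below mirror the values documented in showHelp().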
+      stages: args.stages || null,
       batchSize: args.batchSize || 32,
       outputDir: args.outputDir || 'results',
+      apiKey: args.apiKey || null,
+      baseUrl: args.baseUrl || null,
+      azureEndpoint: args.azureEndpoint || null,
+      azureApiVersion: args.azureApiVersion || '2025-01-01-preview',
+      mode: args.mode || 'evaluate',
+      models: args.models || null,
+      latencyIterations: args.latencyIterations,
       multiTurn: args.multiTurn,
+      maxParallelModels: args.maxParallelModels,
+      benchmarkChunkSize: args.benchmarkChunkSize,
     });
 
     console.log('Evaluation completed successfully!');