@@ -38,6 +38,14 @@ interface CliArgs {
   multiTurn?: boolean;
   maxParallelModels?: number | null;
   benchmarkChunkSize?: number | null;
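+  // New optional fields for benchmark mode and provider configuration.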
+  mode?: 'evaluate' | 'benchmark';
+  stages?: string[];
+  models?: string[];
+  latencyIterations?: number;
+  apiKey?: string | null;
+  baseUrl?: string | null;
+  azureEndpoint?: string | null;
+  azureApiVersion?: string;
   help?: boolean;
 }
 
@@ -88,6 +96,38 @@ function parseArgs(argv: string[]): CliArgs {
         process.exit(1);
       }
       args.benchmarkChunkSize = value;
+    } else if (arg === '--mode') {
+      const mode = argv[++i];
+      if (mode !== 'evaluate' && mode !== 'benchmark') {
+        console.error(`❌ Error: Invalid mode: ${mode}. Must be 'evaluate' or 'benchmark'`);
+        process.exit(1);
+      }
+      args.mode = mode as 'evaluate' | 'benchmark';
+    } else if (arg === '--stages') {
+      args.stages = [];
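+      // Collect every following token up to the next '--' flag as a stage name.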
+      while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
+        args.stages.push(argv[++i]);
+      }
+    } else if (arg === '--models') {
+      args.models = [];
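+      // Same greedy collection, this time for model names.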
+      while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
+        args.models.push(argv[++i]);
+      }
+    } else if (arg === '--latency-iterations') {
+      const value = parseInt(argv[++i], 10);
+      if (isNaN(value) || value <= 0) {
+        console.error(`❌ Error: latency-iterations must be positive, got: ${argv[i]}`);
+        process.exit(1);
+      }
+      args.latencyIterations = value;
+    } else if (arg === '--api-key') {
+      args.apiKey = argv[++i];
+    } else if (arg === '--base-url') {
+      args.baseUrl = argv[++i];
+    } else if (arg === '--azure-endpoint') {
+      args.azureEndpoint = argv[++i];
+    } else if (arg === '--azure-api-version') {
+      args.azureApiVersion = argv[++i];
     } else if (!args.configFile && !arg.startsWith('-')) {
       args.configFile = arg;
     }
@@ -135,6 +175,12 @@ function showHelp(): void {
   console.log(
     ' --dataset-path <path> Path to evaluation dataset (required)'
   );
+  console.log(
+    ' --mode <mode> Evaluation mode: "evaluate" or "benchmark" (default: evaluate)'
+  );
+  console.log(
+    ' --stages <stage>... Pipeline stages to evaluate: pre_flight, input, output'
+  );
   console.log(
     ' --batch-size <number> Number of samples to process in parallel (default: 32)'
   );
@@ -144,12 +190,32 @@ function showHelp(): void {
   console.log(
     ' --multi-turn Evaluate conversation-aware guardrails turn-by-turn (default: single-pass)'
   );
+  console.log('Benchmark Options:');
+  console.log(
+    ' --models <model>... Models to test in benchmark mode (default: gpt-5, gpt-5-mini, gpt-4.1, gpt-4.1-mini)'
+  );
+  console.log(
+    ' --latency-iterations <number> Number of iterations for latency testing (default: 25)'
+  );
   console.log(
     ' --max-parallel-models <number> Maximum number of models to benchmark concurrently (default: min(models, cpu_count))'
   );
   console.log(
     ' --benchmark-chunk-size <number> Optional number of samples per chunk when benchmarking to limit long-running runs'
   );
+  console.log('API Configuration:');
+  console.log(
+    ' --api-key <key> API key for OpenAI, Azure OpenAI, or OpenAI-compatible API'
+  );
+  console.log(
+    ' --base-url <url> Base URL for OpenAI-compatible API (e.g., http://localhost:11434/v1)'
+  );
+  console.log(
+    ' --azure-endpoint <endpoint> Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com)'
+  );
+  console.log(
+    ' --azure-api-version <version> Azure OpenAI API version (default: 2025-01-01-preview)'
+  );
   console.log('');
   console.log('Examples:');
   console.log(' guardrails validate config.json');
@@ -158,6 +224,12 @@ function showHelp(): void {
   console.log(
     ' guardrails eval --config-path config.json --dataset-path dataset.jsonl --batch-size 16 --output-dir my-results'
   );
+  console.log(
+    ' guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --models gpt-5 gpt-5-mini'
+  );
+  console.log(
+    ' guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --azure-endpoint https://your-resource.openai.azure.com --api-key your-key'
+  );
   console.log(' guardrails validate-dataset dataset.jsonl');
 }
 
@@ -186,12 +258,48 @@ async function handleEvalCommand(args: CliArgs): Promise<void> {
     process.exit(1);
   }
 
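+  // Validate the new benchmark and provider flags before kicking off the run.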
+  if (args.latencyIterations !== undefined && args.latencyIterations <= 0) {
+    console.error(`❌ Error: latency-iterations must be positive, got: ${args.latencyIterations}`);
+    process.exit(1);
+  }
+
+  if (args.stages) {
+    const validStages = new Set(['pre_flight', 'input', 'output']);
+    const invalidStages = args.stages.filter((s) => !validStages.has(s));
+    if (invalidStages.length > 0) {
+      console.error(`❌ Error: Invalid stages: ${invalidStages.join(', ')}. Valid stages are: ${Array.from(validStages).join(', ')}`);
+      process.exit(1);
+    }
+  }
+
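+  // Benchmark runs only use the first stage, so extra stages get a warning rather than an error.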
+  if (args.mode === 'benchmark' && args.stages && args.stages.length > 1) {
+    console.warn('⚠️ Warning: Benchmark mode only uses the first specified stage. Additional stages will be ignored.');
+  }
+
+  if (args.azureEndpoint && args.baseUrl) {
+    console.error('❌ Error: Cannot specify both --azure-endpoint and --base-url. Choose one provider.');
+    process.exit(1);
+  }
+
+  if (args.azureEndpoint && !args.apiKey) {
+    console.error('❌ Error: --api-key is required when using --azure-endpoint');
+    process.exit(1);
+  }
+
   try {
     await runEvaluationCLI({
       configPath: args.configPath,
       datasetPath: args.datasetPath,
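+      // Unspecified flags fall back to null/undefined or the defaults documented in the help text.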
+      stages: args.stages || null,
       batchSize: args.batchSize || 32,
       outputDir: args.outputDir || 'results',
+      apiKey: args.apiKey || null,
+      baseUrl: args.baseUrl || null,
+      azureEndpoint: args.azureEndpoint || null,
+      azureApiVersion: args.azureApiVersion || '2025-01-01-preview',
+      mode: args.mode || 'evaluate',
+      models: args.models || null,
+      latencyIterations: args.latencyIterations,
       multiTurn: args.multiTurn,
       maxParallelModels: args.maxParallelModels,
       benchmarkChunkSize: args.benchmarkChunkSize,