Commit f55e103

Adding benchmarking functionality
1 parent cbac21a commit f55e103

9 files changed (+1528 −63 lines)


src/__tests__/unit/evals/guardrail-evals.test.ts

Lines changed: 3 additions & 3 deletions
@@ -47,11 +47,11 @@ describe('GuardrailEval._determineParallelModelLimit', () => {
 
   it('should honor user-provided parallelism constraints', () => {
     expect(GuardrailEval._determineParallelModelLimit(5, 3)).toBe(3);
-    expect(() => GuardrailEval._determineParallelModelLimit(5, 0)).toThrow('max_parallel_models must be positive');
+    expect(() => GuardrailEval._determineParallelModelLimit(5, 0)).toThrow('maxParallelModels must be positive');
   });
 
   it('should throw error for invalid model count', () => {
-    expect(() => GuardrailEval._determineParallelModelLimit(0, null)).toThrow('model_count must be positive');
+    expect(() => GuardrailEval._determineParallelModelLimit(0, null)).toThrow('modelCount must be positive');
   });
 });
 

@@ -74,7 +74,7 @@ describe('GuardrailEval._chunkSamples', () => {
 
   it('should reject invalid chunk sizes', () => {
     const samples = buildSamples(2);
-    expect(() => Array.from(GuardrailEval._chunkSamples(samples, 0))).toThrow('chunk_size must be positive when provided');
+    expect(() => Array.from(GuardrailEval._chunkSamples(samples, 0))).toThrow('chunkSize must be positive when provided');
   });
 
   it('should return single chunk when chunk size is larger than samples', () => {
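For orientation, here is a minimal sketch of the behaviour these assertions pin down, inferred only from the tests and from the CLI help text later in this commit. The function name and the cpu-count fallback are assumptions for illustration, not the actual GuardrailEval implementation:

import * as os from 'os';

// Hypothetical sketch of the parallel-model limit the tests describe.
function determineParallelModelLimit(
  modelCount: number,
  maxParallelModels: number | null
): number {
  if (modelCount <= 0) {
    throw new Error('modelCount must be positive');
  }
  if (maxParallelModels !== null) {
    if (maxParallelModels <= 0) {
      throw new Error('maxParallelModels must be positive');
    }
    // A user-provided cap wins, but never exceeds the number of models.
    return Math.min(maxParallelModels, modelCount);
  }
  // Default per the help text: min(models, cpu_count).
  return Math.min(modelCount, os.cpus().length);
}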

src/cli.ts

Lines changed: 108 additions & 0 deletions
@@ -38,6 +38,14 @@ interface CliArgs {
   multiTurn?: boolean;
   maxParallelModels?: number | null;
   benchmarkChunkSize?: number | null;
+  mode?: 'evaluate' | 'benchmark';
+  stages?: string[];
+  models?: string[];
+  latencyIterations?: number;
+  apiKey?: string | null;
+  baseUrl?: string | null;
+  azureEndpoint?: string | null;
+  azureApiVersion?: string;
   help?: boolean;
 }
 

@@ -88,6 +96,38 @@ function parseArgs(argv: string[]): CliArgs {
         process.exit(1);
       }
       args.benchmarkChunkSize = value;
+    } else if (arg === '--mode') {
+      const mode = argv[++i];
+      if (mode !== 'evaluate' && mode !== 'benchmark') {
+        console.error(`❌ Error: Invalid mode: ${mode}. Must be 'evaluate' or 'benchmark'`);
+        process.exit(1);
+      }
+      args.mode = mode as 'evaluate' | 'benchmark';
+    } else if (arg === '--stages') {
+      args.stages = [];
+      while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
+        args.stages.push(argv[++i]);
+      }
+    } else if (arg === '--models') {
+      args.models = [];
+      while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
+        args.models.push(argv[++i]);
+      }
+    } else if (arg === '--latency-iterations') {
+      const value = parseInt(argv[++i], 10);
+      if (isNaN(value) || value <= 0) {
+        console.error(`❌ Error: latency-iterations must be positive, got: ${argv[i]}`);
+        process.exit(1);
+      }
+      args.latencyIterations = value;
+    } else if (arg === '--api-key') {
+      args.apiKey = argv[++i];
+    } else if (arg === '--base-url') {
+      args.baseUrl = argv[++i];
+    } else if (arg === '--azure-endpoint') {
+      args.azureEndpoint = argv[++i];
+    } else if (arg === '--azure-api-version') {
+      args.azureApiVersion = argv[++i];
     } else if (!args.configFile && !arg.startsWith('-')) {
       args.configFile = arg;
     }
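One consequence of the parsing above: --stages and --models consume every following token until the next -- flag, so their values are space-separated rather than comma-separated. A self-contained sketch of that collection pattern, with an illustrative helper name that is not part of this commit:

// Collect space-separated values for a multi-value flag, stopping at the next '--' option.
function collectMultiValue(argv: string[], flagIndex: number): { values: string[]; lastIndex: number } {
  const values: string[] = [];
  let i = flagIndex;
  while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
    values.push(argv[++i]);
  }
  return { values, lastIndex: i };
}

// Example: for ['--models', 'gpt-5', 'gpt-5-mini', '--latency-iterations', '10'],
// collectMultiValue(argv, 0) returns { values: ['gpt-5', 'gpt-5-mini'], lastIndex: 2 }.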
@@ -135,6 +175,12 @@ function showHelp(): void {
   console.log(
     ' --dataset-path <path> Path to evaluation dataset (required)'
   );
+  console.log(
+    ' --mode <mode> Evaluation mode: "evaluate" or "benchmark" (default: evaluate)'
+  );
+  console.log(
+    ' --stages <stage>... Pipeline stages to evaluate: pre_flight, input, output'
+  );
   console.log(
     ' --batch-size <number> Number of samples to process in parallel (default: 32)'
   );
@@ -144,12 +190,32 @@ function showHelp(): void {
   console.log(
     ' --multi-turn Evaluate conversation-aware guardrails turn-by-turn (default: single-pass)'
   );
+  console.log('Benchmark Options:');
+  console.log(
+    ' --models <model>... Models to test in benchmark mode (default: gpt-5, gpt-5-mini, gpt-4.1, gpt-4.1-mini)'
+  );
+  console.log(
+    ' --latency-iterations <number> Number of iterations for latency testing (default: 25)'
+  );
   console.log(
     ' --max-parallel-models <number> Maximum number of models to benchmark concurrently (default: min(models, cpu_count))'
   );
   console.log(
     ' --benchmark-chunk-size <number> Optional number of samples per chunk when benchmarking to limit long-running runs'
   );
+  console.log('API Configuration:');
+  console.log(
+    ' --api-key <key> API key for OpenAI, Azure OpenAI, or OpenAI-compatible API'
+  );
+  console.log(
+    ' --base-url <url> Base URL for OpenAI-compatible API (e.g., http://localhost:11434/v1)'
+  );
+  console.log(
+    ' --azure-endpoint <endpoint> Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com)'
+  );
+  console.log(
+    ' --azure-api-version <version> Azure OpenAI API version (default: 2025-01-01-preview)'
+  );
   console.log('');
   console.log('Examples:');
   console.log(' guardrails validate config.json');
@@ -158,6 +224,12 @@ function showHelp(): void {
   console.log(
     ' guardrails eval --config-path config.json --dataset-path dataset.jsonl --batch-size 16 --output-dir my-results'
   );
+  console.log(
+    ' guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --models gpt-5 gpt-5-mini'
+  );
+  console.log(
+    ' guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --azure-endpoint https://your-resource.openai.azure.com --api-key your-key'
+  );
   console.log(' guardrails validate-dataset dataset.jsonl');
 }
 

@@ -186,12 +258,48 @@ async function handleEvalCommand(args: CliArgs): Promise<void> {
     process.exit(1);
   }
 
+  if (args.latencyIterations !== undefined && args.latencyIterations <= 0) {
+    console.error(`❌ Error: latency-iterations must be positive, got: ${args.latencyIterations}`);
+    process.exit(1);
+  }
+
+  if (args.stages) {
+    const validStages = new Set(['pre_flight', 'input', 'output']);
+    const invalidStages = args.stages.filter((s) => !validStages.has(s));
+    if (invalidStages.length > 0) {
+      console.error(`❌ Error: Invalid stages: ${invalidStages.join(', ')}. Valid stages are: ${Array.from(validStages).join(', ')}`);
+      process.exit(1);
+    }
+  }
+
+  if (args.mode === 'benchmark' && args.stages && args.stages.length > 1) {
+    console.warn('⚠️ Warning: Benchmark mode only uses the first specified stage. Additional stages will be ignored.');
+  }
+
+  if (args.azureEndpoint && args.baseUrl) {
+    console.error('❌ Error: Cannot specify both --azure-endpoint and --base-url. Choose one provider.');
+    process.exit(1);
+  }
+
+  if (args.azureEndpoint && !args.apiKey) {
+    console.error('❌ Error: --api-key is required when using --azure-endpoint');
+    process.exit(1);
+  }
+
   try {
     await runEvaluationCLI({
       configPath: args.configPath,
       datasetPath: args.datasetPath,
+      stages: args.stages || null,
       batchSize: args.batchSize || 32,
       outputDir: args.outputDir || 'results',
+      apiKey: args.apiKey || null,
+      baseUrl: args.baseUrl || null,
+      azureEndpoint: args.azureEndpoint || null,
+      azureApiVersion: args.azureApiVersion || '2025-01-01-preview',
+      mode: args.mode || 'evaluate',
+      models: args.models || null,
+      latencyIterations: args.latencyIterations,
       multiTurn: args.multiTurn,
       maxParallelModels: args.maxParallelModels,
       benchmarkChunkSize: args.benchmarkChunkSize,
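The new guard clauses above encode a handful of rules: latency iterations must be positive, stages must come from pre_flight/input/output, benchmark mode only honours the first stage, and --azure-endpoint is mutually exclusive with --base-url but requires --api-key. A compact restatement as a pure function, purely for illustration; the committed code logs and calls process.exit instead of returning errors:

interface EvalFlags {
  latencyIterations?: number;
  stages?: string[];
  azureEndpoint?: string | null;
  baseUrl?: string | null;
  apiKey?: string | null;
}

// Illustrative re-statement of the CLI validation rules; not the committed code.
function validateEvalFlags(flags: EvalFlags): string[] {
  const errors: string[] = [];
  const validStages = new Set(['pre_flight', 'input', 'output']);
  if (flags.latencyIterations !== undefined && flags.latencyIterations <= 0) {
    errors.push('latency-iterations must be positive');
  }
  const invalidStages = (flags.stages ?? []).filter((s) => !validStages.has(s));
  if (invalidStages.length > 0) {
    errors.push(`Invalid stages: ${invalidStages.join(', ')}`);
  }
  if (flags.azureEndpoint && flags.baseUrl) {
    errors.push('Cannot specify both --azure-endpoint and --base-url');
  }
  if (flags.azureEndpoint && !flags.apiKey) {
    errors.push('--api-key is required when using --azure-endpoint');
  }
  return errors;
}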

src/evals/core/async-engine.ts

Lines changed: 3 additions & 4 deletions
@@ -46,18 +46,17 @@ export class AsyncRunEngine implements RunEngine {
     }
 
     const results: SampleResult[] = [];
-    let processed = 0;
+    const totalSamples = samples.length;
 
-    console.log(`${desc}: ${samples.length} samples, batch size: ${batchSize}`);
+    console.log(`${desc}: ${totalSamples} samples, batch size: ${batchSize}`);
 
     for (let i = 0; i < samples.length; i += batchSize) {
       const batch = samples.slice(i, i + batchSize);
       const batchResults = await Promise.all(
         batch.map((sample) => this.evaluateSample(context, sample))
       );
       results.push(...batchResults);
-      processed += batch.length;
-      console.log(`Processed ${processed}/${samples.length} samples`);
+      console.log(`Processed ${results.length}/${totalSamples} samples`);
     }
 
     return results;
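The loop above is the generic batch pattern: slice the input, run each batch concurrently through Promise.all, and report progress from results.length rather than a separate counter. A standalone sketch of the same pattern; the helper name and generics are illustrative, not part of this commit:

// Run items in sequential batches, with each batch's work executed concurrently.
async function processInBatches<T, R>(
  items: T[],
  batchSize: number,
  worker: (item: T) => Promise<R>
): Promise<R[]> {
  const results: R[] = [];
  const total = items.length;
  for (let i = 0; i < total; i += batchSize) {
    const batch = items.slice(i, i + batchSize);
    const batchResults = await Promise.all(batch.map(worker));
    results.push(...batchResults);
    console.log(`Processed ${results.length}/${total} items`);
  }
  return results;
}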
