evalops
diff --git a/.claude/settings.local.json b/.claude/settings.local.json
Lines changed: 22 additions & 2 deletions
diff --git a/package-lock.json b/package-lock.json
Lines changed: 2 additions & 5 deletions
diff --git a/src/cli.ts b/src/cli.ts
Lines changed: 21 additions & 0 deletions
diff --git a/src/commands/cost.ts b/src/commands/cost.ts
Lines changed: 150 additions & 14 deletions
@@ -19,8 +19,28 @@
       "Bash(node dist/cli.js --help)",
       "Bash(npm run format:*)",
       "Bash(npm run lint)",
-      "Bash(npm run lint:*)"
+      "Bash(npm run lint:*)",
+      "Bash(gh repo create:*)",
+      "Bash(gh repo list:*)",
+      "Bash(gh auth:*)",
+      "Bash(gh repo delete:*)",
+      "Bash(gh search repos:*)",
+      "Bash(gh repo clone:*)",
+      "mcp__filesystem__create_directory",
+      "Bash(gh repo view:*)",
+      "Bash(gh run list:*)",
+      "Bash(gh run view:*)",
+      "Bash(node:*)",
+      "Bash(git clone:*)",
+      "Bash(grep:*)",
+      "Bash(npm link:*)",
+      "Bash(evalops upload:*)",
+      "Bash(evalops:*)",
+      "Bash(EVALOPS_API_KEY=test-api-key evalops upload --dry-run -f /Users/jonathanhaas/evalops_cli/test-example/evalops.yaml)"
     ],
-    "deny": []
+    "deny": [],
+    "additionalDirectories": [
+      "/Users/jonathanhaas/platform"
+    ]
   }
 }
@@ -16,6 +16,7 @@ import dotenv from 'dotenv';
 import { BudgetCommand } from './commands/budget';
 import { CostCommand } from './commands/cost';
 import { InitCommand } from './commands/init';
+import { RunCommand } from './commands/run';
 import { UploadCommand } from './commands/upload';
 import { ValidateCommand } from './commands/validate';
 
@@ -63,6 +64,8 @@ program
   .option('--dry-run', 'Preview what would be uploaded without actually uploading')
   .option('--check-budget', 'Enforce budget constraints before and after evaluation')
   .option('--budget-file <file>', 'Path to budget.yaml file', './budget.yaml')
+  .option('--run', 'Run the evaluation immediately after upload')
+  .option('--wait', 'Wait for evaluation to complete and display results')
   .action(async (options) => {
     try {
       await UploadCommand.execute(options);
@@ -104,6 +107,24 @@ program
     }
   });
 
+program
+  .command('run <test-suite-id>')
+  .description('Run an existing test suite on the EvalOps platform')
+  .option('--api-key <key>', 'EvalOps API key')
+  .option('--api-url <url>', 'EvalOps API URL', 'https://api.evalops.dev')
+  .option('--wait', 'Wait for evaluation to complete and display results')
+  .option('--check-budget', 'Enforce budget constraints after evaluation')
+  .option('--budget-file <file>', 'Path to budget.yaml file', './budget.yaml')
+  .option('--environment <env>', 'Environment for budget validation')
+  .action(async (testSuiteId, options) => {
+    try {
+      await RunCommand.execute({ ...options, testSuiteId });
+    } catch (error) {
+      console.error(chalk.red('Error:'), error instanceof Error ? error.message : error);
+      process.exit(1);
+    }
+  });
+
 program.on('command:*', (operands) => {
   console.error(chalk.red(`Unknown command: ${operands[0]}`));
   console.log('Available commands:');
 
@@ -42,39 +42,175 @@ interface CostSummary {
 export class CostCommand {
   // Pricing data (updated as of January 2025)
   private static readonly PROVIDER_COSTS: ProviderCosts = {
+    // OpenAI Models - All prices per 1K tokens
     'openai/gpt-4': {
-      inputTokenCost: 0.03, // $0.03 per 1K tokens
-      outputTokenCost: 0.06, // $0.06 per 1K tokens
+      inputTokenCost: 0.03, // $30 per 1M tokens
+      outputTokenCost: 0.06, // $60 per 1M tokens
       requestCost: 0,
     },
     'openai/gpt-4-turbo': {
-      inputTokenCost: 0.01,
-      outputTokenCost: 0.03,
+      inputTokenCost: 0.01, // $10 per 1M tokens
+      outputTokenCost: 0.03, // $30 per 1M tokens
+      requestCost: 0,
+    },
+    'openai/gpt-4o': {
+      inputTokenCost: 0.0025, // $2.50 per 1M tokens
+      outputTokenCost: 0.01, // $10 per 1M tokens
+      requestCost: 0,
+    },
+    'openai/gpt-4o-mini': {
+      inputTokenCost: 0.00015, // $0.15 per 1M tokens
+      outputTokenCost: 0.0006, // $0.60 per 1M tokens
       requestCost: 0,
     },
     'openai/gpt-3.5-turbo': {
-      inputTokenCost: 0.0015,
-      outputTokenCost: 0.002,
+      inputTokenCost: 0.0005, // $0.50 per 1M tokens
+      outputTokenCost: 0.0015, // $1.50 per 1M tokens
       requestCost: 0,
     },
+    'openai/gpt-3.5-turbo-16k': {
+      inputTokenCost: 0.003, // $3 per 1M tokens
+      outputTokenCost: 0.004, // $4 per 1M tokens
+      requestCost: 0,
+    },
+    
+    // Anthropic Models - All prices per 1K tokens
     'anthropic/claude-3-opus': {
-      inputTokenCost: 0.015,
-      outputTokenCost: 0.075,
+      inputTokenCost: 0.015, // $15 per 1M tokens
+      outputTokenCost: 0.075, // $75 per 1M tokens
       requestCost: 0,
     },
     'anthropic/claude-3-sonnet': {
-      inputTokenCost: 0.003,
-      outputTokenCost: 0.015,
+      inputTokenCost: 0.003, // $3 per 1M tokens
+      outputTokenCost: 0.015, // $15 per 1M tokens
+      requestCost: 0,
+    },
+    'anthropic/claude-3.5-sonnet': {
+      inputTokenCost: 0.003, // $3 per 1M tokens
+      outputTokenCost: 0.015, // $15 per 1M tokens
+      requestCost: 0,
+    },
+    'anthropic/claude-3.7-sonnet': {
+      inputTokenCost: 0.003, // $3 per 1M tokens
+      outputTokenCost: 0.015, // $15 per 1M tokens (includes thinking tokens)
+      requestCost: 0,
+    },
+    'anthropic/claude-sonnet-4': {
+      inputTokenCost: 0.003, // $3 per 1M tokens
+      outputTokenCost: 0.015, // $15 per 1M tokens
       requestCost: 0,
     },
     'anthropic/claude-3-haiku': {
-      inputTokenCost: 0.00025,
-      outputTokenCost: 0.00125,
+      inputTokenCost: 0.00025, // $0.25 per 1M tokens
+      outputTokenCost: 0.00125, // $1.25 per 1M tokens
+      requestCost: 0,
+    },
+    'anthropic/claude-3.5-haiku': {
+      inputTokenCost: 0.0008, // $0.80 per 1M tokens
+      outputTokenCost: 0.004, // $4 per 1M tokens
+      requestCost: 0,
+    },
+    'anthropic/claude-2.1': {
+      inputTokenCost: 0.008, // $8 per 1M tokens
+      outputTokenCost: 0.024, // $24 per 1M tokens
       requestCost: 0,
     },
     'anthropic/claude-2': {
-      inputTokenCost: 0.008,
-      outputTokenCost: 0.024,
+      inputTokenCost: 0.008, // $8 per 1M tokens
+      outputTokenCost: 0.024, // $24 per 1M tokens
+      requestCost: 0,
+    },
+    
+    // Google Models - All prices per 1K tokens
+    'google/gemini-pro': {
+      inputTokenCost: 0.0005, // $0.50 per 1M tokens
+      outputTokenCost: 0.0015, // $1.50 per 1M tokens
+      requestCost: 0,
+    },
+    'google/gemini-pro-vision': {
+      inputTokenCost: 0.00025, // $0.25 per 1M tokens
+      outputTokenCost: 0.00125, // $1.25 per 1M tokens
+      requestCost: 0,
+    },
+    'google/gemini-1.5-pro': {
+      inputTokenCost: 0.00125, // $1.25 per 1M tokens (up to 128K)
+      outputTokenCost: 0.005, // $5.00 per 1M tokens (up to 128K)
+      requestCost: 0,
+    },
+    'google/gemini-1.5-flash': {
+      inputTokenCost: 0.000075, // $0.075 per 1M tokens (up to 128K)
+      outputTokenCost: 0.0003, // $0.30 per 1M tokens (up to 128K)
+      requestCost: 0,
+    },
+    'google/gemini-2.0-flash': {
+      inputTokenCost: 0.0001, // $0.10 per 1M tokens
+      outputTokenCost: 0.0004, // $0.40 per 1M tokens
+      requestCost: 0,
+    },
+    'google/gemini-2.5-pro': {
+      inputTokenCost: 0.00125, // $1.25 per 1M tokens (up to 200K)
+      outputTokenCost: 0.01, // $10 per 1M tokens (up to 200K)
+      requestCost: 0,
+    },
+    
+    // Cohere Models - All prices per 1K tokens
+    'cohere/command': {
+      inputTokenCost: 0.0015, // $1.50 per 1M tokens
+      outputTokenCost: 0.002, // $2.00 per 1M tokens
+      requestCost: 0,
+    },
+    'cohere/command-light': {
+      inputTokenCost: 0.0003, // $0.30 per 1M tokens
+      outputTokenCost: 0.0006, // $0.60 per 1M tokens
+      requestCost: 0,
+    },
+    'cohere/command-r': {
+      inputTokenCost: 0.0005, // $0.50 per 1M tokens
+      outputTokenCost: 0.0015, // $1.50 per 1M tokens
+      requestCost: 0,
+    },
+    'cohere/command-r-plus': {
+      inputTokenCost: 0.003, // $3.00 per 1M tokens
+      outputTokenCost: 0.015, // $15.00 per 1M tokens
+      requestCost: 0,
+    },
+    
+    // Meta Llama Models (via various providers) - All prices per 1K tokens
+    'meta/llama-3-8b': {
+      inputTokenCost: 0.0002, // $0.20 per 1M tokens
+      outputTokenCost: 0.0002, // $0.20 per 1M tokens
+      requestCost: 0,
+    },
+    'meta/llama-3-70b': {
+      inputTokenCost: 0.0008, // $0.80 per 1M tokens
+      outputTokenCost: 0.0008, // $0.80 per 1M tokens
+      requestCost: 0,
+    },
+    'meta/llama-3-405b': {
+      inputTokenCost: 0.002, // $2.00 per 1M tokens
+      outputTokenCost: 0.002, // $2.00 per 1M tokens
+      requestCost: 0,
+    },
+    
+    // Mistral Models - All prices per 1K tokens
+    'mistral/mistral-tiny': {
+      inputTokenCost: 0.00025, // $0.25 per 1M tokens
+      outputTokenCost: 0.00025, // $0.25 per 1M tokens
+      requestCost: 0,
+    },
+    'mistral/mistral-small': {
+      inputTokenCost: 0.001, // $1.00 per 1M tokens
+      outputTokenCost: 0.003, // $3.00 per 1M tokens
+      requestCost: 0,
+    },
+    'mistral/mistral-medium': {
+      inputTokenCost: 0.0027, // $2.70 per 1M tokens
+      outputTokenCost: 0.0081, // $8.10 per 1M tokens
+      requestCost: 0,
+    },
+    'mistral/mistral-large': {
+      inputTokenCost: 0.008, // $8.00 per 1M tokens
+      outputTokenCost: 0.024, // $24.00 per 1M tokens
       requestCost: 0,
     },
   };