Skip to content

Commit 7858a76

Browse files
committed
feat: implement real API integration and add run command
- Connect CLI to actual EvalOps platform backend
- Add new 'run' command to execute existing test suites
- Transform evalops.yaml format to platform schema
- Implement test run execution with polling for results
- Extract real metrics from test runs for budget validation
- Update pricing data for all major LLM models (2025 rates)
  - OpenAI: GPT-4, GPT-4o, GPT-4o-mini, GPT-3.5-turbo
  - Anthropic: Claude 3/3.5/3.7/4 Opus/Sonnet/Haiku models
  - Google: Gemini Pro/Flash 1.5/2.0/2.5 models
  - Added Cohere, Meta Llama, and Mistral models
- Add --run and --wait options to upload command for immediate execution
- Integrate real-time metrics with budget validation system
1 parent fe3e569 commit 7858a76

File tree

9 files changed

+718
-41
lines changed

9 files changed

+718
-41
lines changed

.claude/settings.local.json

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,28 @@
1919
"Bash(node dist/cli.js --help)",
2020
"Bash(npm run format:*)",
2121
"Bash(npm run lint)",
22-
"Bash(npm run lint:*)"
22+
"Bash(npm run lint:*)",
23+
"Bash(gh repo create:*)",
24+
"Bash(gh repo list:*)",
25+
"Bash(gh auth:*)",
26+
"Bash(gh repo delete:*)",
27+
"Bash(gh search repos:*)",
28+
"Bash(gh repo clone:*)",
29+
"mcp__filesystem__create_directory",
30+
"Bash(gh repo view:*)",
31+
"Bash(gh run list:*)",
32+
"Bash(gh run view:*)",
33+
"Bash(node:*)",
34+
"Bash(git clone:*)",
35+
"Bash(grep:*)",
36+
"Bash(npm link:*)",
37+
"Bash(evalops upload:*)",
38+
"Bash(evalops:*)",
39+
"Bash(EVALOPS_API_KEY=test-api-key evalops upload --dry-run -f /Users/jonathanhaas/evalops_cli/test-example/evalops.yaml)"
2340
],
24-
"deny": []
41+
"deny": [],
42+
"additionalDirectories": [
43+
"/Users/jonathanhaas/platform"
44+
]
2545
}
2646
}

package-lock.json

Lines changed: 2 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/cli.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import dotenv from 'dotenv';
1616
import { BudgetCommand } from './commands/budget';
1717
import { CostCommand } from './commands/cost';
1818
import { InitCommand } from './commands/init';
19+
import { RunCommand } from './commands/run';
1920
import { UploadCommand } from './commands/upload';
2021
import { ValidateCommand } from './commands/validate';
2122

@@ -63,6 +64,8 @@ program
6364
.option('--dry-run', 'Preview what would be uploaded without actually uploading')
6465
.option('--check-budget', 'Enforce budget constraints before and after evaluation')
6566
.option('--budget-file <file>', 'Path to budget.yaml file', './budget.yaml')
67+
.option('--run', 'Run the evaluation immediately after upload')
68+
.option('--wait', 'Wait for evaluation to complete and display results')
6669
.action(async (options) => {
6770
try {
6871
await UploadCommand.execute(options);
@@ -104,6 +107,24 @@ program
104107
}
105108
});
106109

110+
program
111+
.command('run <test-suite-id>')
112+
.description('Run an existing test suite on the EvalOps platform')
113+
.option('--api-key <key>', 'EvalOps API key')
114+
.option('--api-url <url>', 'EvalOps API URL', 'https://api.evalops.dev')
115+
.option('--wait', 'Wait for evaluation to complete and display results')
116+
.option('--check-budget', 'Enforce budget constraints after evaluation')
117+
.option('--budget-file <file>', 'Path to budget.yaml file', './budget.yaml')
118+
.option('--environment <env>', 'Environment for budget validation')
119+
.action(async (testSuiteId, options) => {
120+
try {
121+
await RunCommand.execute({ ...options, testSuiteId });
122+
} catch (error) {
123+
console.error(chalk.red('Error:'), error instanceof Error ? error.message : error);
124+
process.exit(1);
125+
}
126+
});
127+
107128
program.on('command:*', (operands) => {
108129
console.error(chalk.red(`Unknown command: ${operands[0]}`));
109130
console.log('Available commands:');

src/commands/cost.ts

Lines changed: 150 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -42,39 +42,175 @@ interface CostSummary {
4242
export class CostCommand {
4343
// Pricing data (updated as of January 2025)
4444
private static readonly PROVIDER_COSTS: ProviderCosts = {
45+
// OpenAI Models - values are USD per 1K tokens (trailing comments give the equivalent per-1M rate)
4546
'openai/gpt-4': {
46-
inputTokenCost: 0.03, // $0.03 per 1K tokens
47-
outputTokenCost: 0.06, // $0.06 per 1K tokens
47+
inputTokenCost: 0.03, // $30 per 1M tokens
48+
outputTokenCost: 0.06, // $60 per 1M tokens
4849
requestCost: 0,
4950
},
5051
'openai/gpt-4-turbo': {
51-
inputTokenCost: 0.01,
52-
outputTokenCost: 0.03,
52+
inputTokenCost: 0.01, // $10 per 1M tokens
53+
outputTokenCost: 0.03, // $30 per 1M tokens
54+
requestCost: 0,
55+
},
56+
'openai/gpt-4o': {
57+
inputTokenCost: 0.0025, // $2.50 per 1M tokens
58+
outputTokenCost: 0.01, // $10 per 1M tokens
59+
requestCost: 0,
60+
},
61+
'openai/gpt-4o-mini': {
62+
inputTokenCost: 0.00015, // $0.15 per 1M tokens
63+
outputTokenCost: 0.0006, // $0.60 per 1M tokens
5364
requestCost: 0,
5465
},
5566
'openai/gpt-3.5-turbo': {
56-
inputTokenCost: 0.0015,
57-
outputTokenCost: 0.002,
67+
inputTokenCost: 0.0005, // $0.50 per 1M tokens
68+
outputTokenCost: 0.0015, // $1.50 per 1M tokens
5869
requestCost: 0,
5970
},
71+
'openai/gpt-3.5-turbo-16k': {
72+
inputTokenCost: 0.003, // $3 per 1M tokens
73+
outputTokenCost: 0.004, // $4 per 1M tokens
74+
requestCost: 0,
75+
},
76+
77+
// Anthropic Models - values are USD per 1K tokens (trailing comments give the equivalent per-1M rate)
6078
'anthropic/claude-3-opus': {
61-
inputTokenCost: 0.015,
62-
outputTokenCost: 0.075,
79+
inputTokenCost: 0.015, // $15 per 1M tokens
80+
outputTokenCost: 0.075, // $75 per 1M tokens
6381
requestCost: 0,
6482
},
6583
'anthropic/claude-3-sonnet': {
66-
inputTokenCost: 0.003,
67-
outputTokenCost: 0.015,
84+
inputTokenCost: 0.003, // $3 per 1M tokens
85+
outputTokenCost: 0.015, // $15 per 1M tokens
86+
requestCost: 0,
87+
},
88+
'anthropic/claude-3.5-sonnet': {
89+
inputTokenCost: 0.003, // $3 per 1M tokens
90+
outputTokenCost: 0.015, // $15 per 1M tokens
91+
requestCost: 0,
92+
},
93+
'anthropic/claude-3.7-sonnet': {
94+
inputTokenCost: 0.003, // $3 per 1M tokens
95+
outputTokenCost: 0.015, // $15 per 1M tokens (includes thinking tokens)
96+
requestCost: 0,
97+
},
98+
'anthropic/claude-sonnet-4': {
99+
inputTokenCost: 0.003, // $3 per 1M tokens
100+
outputTokenCost: 0.015, // $15 per 1M tokens
68101
requestCost: 0,
69102
},
70103
'anthropic/claude-3-haiku': {
71-
inputTokenCost: 0.00025,
72-
outputTokenCost: 0.00125,
104+
inputTokenCost: 0.00025, // $0.25 per 1M tokens
105+
outputTokenCost: 0.00125, // $1.25 per 1M tokens
106+
requestCost: 0,
107+
},
108+
'anthropic/claude-3.5-haiku': {
109+
inputTokenCost: 0.0008, // $0.80 per 1M tokens
110+
outputTokenCost: 0.004, // $4 per 1M tokens
111+
requestCost: 0,
112+
},
113+
'anthropic/claude-2.1': {
114+
inputTokenCost: 0.008, // $8 per 1M tokens
115+
outputTokenCost: 0.024, // $24 per 1M tokens
73116
requestCost: 0,
74117
},
75118
'anthropic/claude-2': {
76-
inputTokenCost: 0.008,
77-
outputTokenCost: 0.024,
119+
inputTokenCost: 0.008, // $8 per 1M tokens
120+
outputTokenCost: 0.024, // $24 per 1M tokens
121+
requestCost: 0,
122+
},
123+
124+
// Google Models - values are USD per 1K tokens (trailing comments give the equivalent per-1M rate)
125+
'google/gemini-pro': {
126+
inputTokenCost: 0.0005, // $0.50 per 1M tokens
127+
outputTokenCost: 0.0015, // $1.50 per 1M tokens
128+
requestCost: 0,
129+
},
130+
'google/gemini-pro-vision': {
131+
inputTokenCost: 0.00025, // $0.25 per 1M tokens
132+
outputTokenCost: 0.00125, // $1.25 per 1M tokens
133+
requestCost: 0,
134+
},
135+
'google/gemini-1.5-pro': {
136+
inputTokenCost: 0.00125, // $1.25 per 1M tokens (up to 128K)
137+
outputTokenCost: 0.005, // $5.00 per 1M tokens (up to 128K)
138+
requestCost: 0,
139+
},
140+
'google/gemini-1.5-flash': {
141+
inputTokenCost: 0.000075, // $0.075 per 1M tokens (up to 128K)
142+
outputTokenCost: 0.0003, // $0.30 per 1M tokens (up to 128K)
143+
requestCost: 0,
144+
},
145+
'google/gemini-2.0-flash': {
146+
inputTokenCost: 0.0001, // $0.10 per 1M tokens
147+
outputTokenCost: 0.0004, // $0.40 per 1M tokens
148+
requestCost: 0,
149+
},
150+
'google/gemini-2.5-pro': {
151+
inputTokenCost: 0.00125, // $1.25 per 1M tokens (up to 200K)
152+
outputTokenCost: 0.01, // $10 per 1M tokens (up to 200K)
153+
requestCost: 0,
154+
},
155+
156+
// Cohere Models
157+
'cohere/command': {
158+
inputTokenCost: 0.0015, // $1.50 per 1M tokens
159+
outputTokenCost: 0.002, // $2.00 per 1M tokens
160+
requestCost: 0,
161+
},
162+
'cohere/command-light': {
163+
inputTokenCost: 0.0003, // $0.30 per 1M tokens
164+
outputTokenCost: 0.0006, // $0.60 per 1M tokens
165+
requestCost: 0,
166+
},
167+
'cohere/command-r': {
168+
inputTokenCost: 0.0005, // $0.50 per 1M tokens
169+
outputTokenCost: 0.0015, // $1.50 per 1M tokens
170+
requestCost: 0,
171+
},
172+
'cohere/command-r-plus': {
173+
inputTokenCost: 0.003, // $3.00 per 1M tokens
174+
outputTokenCost: 0.015, // $15.00 per 1M tokens
175+
requestCost: 0,
176+
},
177+
178+
// Meta Llama Models (via various providers)
179+
'meta/llama-3-8b': {
180+
inputTokenCost: 0.0002, // $0.20 per 1M tokens
181+
outputTokenCost: 0.0002, // $0.20 per 1M tokens
182+
requestCost: 0,
183+
},
184+
'meta/llama-3-70b': {
185+
inputTokenCost: 0.0008, // $0.80 per 1M tokens
186+
outputTokenCost: 0.0008, // $0.80 per 1M tokens
187+
requestCost: 0,
188+
},
189+
'meta/llama-3-405b': {
190+
inputTokenCost: 0.002, // $2.00 per 1M tokens
191+
outputTokenCost: 0.002, // $2.00 per 1M tokens
192+
requestCost: 0,
193+
},
194+
195+
// Mistral Models
196+
'mistral/mistral-tiny': {
197+
inputTokenCost: 0.00025, // $0.25 per 1M tokens
198+
outputTokenCost: 0.00025, // $0.25 per 1M tokens
199+
requestCost: 0,
200+
},
201+
'mistral/mistral-small': {
202+
inputTokenCost: 0.001, // $1.00 per 1M tokens
203+
outputTokenCost: 0.003, // $3.00 per 1M tokens
204+
requestCost: 0,
205+
},
206+
'mistral/mistral-medium': {
207+
inputTokenCost: 0.0027, // $2.70 per 1M tokens
208+
outputTokenCost: 0.0081, // $8.10 per 1M tokens
209+
requestCost: 0,
210+
},
211+
'mistral/mistral-large': {
212+
inputTokenCost: 0.008, // $8.00 per 1M tokens
213+
outputTokenCost: 0.024, // $24.00 per 1M tokens
78214
requestCost: 0,
79215
},
80216
};

0 commit comments

Comments
 (0)