
Commit 4bc6f87

feat: Improve default costs and UI representation

1 parent 87bdffc · commit 4bc6f87

File tree: 2 files changed, +270 −44 lines
Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
# Prices are per 1 million tokens
# Gemini prices from https://ai.google.dev/gemini-api/docs/pricing
# OpenAI prices from https://openai.com/api/pricing/
MODEL_COSTS = {
    "gemini-2.5-pro-preview-05-06": {
        "input": [
            {"up_to": 200000, "price": 1.25},
            {"up_to": float('inf'), "price": 2.50},
        ],
        "output": {
            "default": [
                {"up_to": 200000, "price": 10.00},
                {"up_to": float('inf'), "price": 15.00},
            ]
        },
    },
    "gemini-2.5-pro-preview-06-05": {
        "input": [
            {"up_to": 200000, "price": 1.25},
            {"up_to": float('inf'), "price": 2.50},
        ],
        "output": {
            "default": [
                {"up_to": 200000, "price": 10.00},
                {"up_to": float('inf'), "price": 15.00},
            ]
        },
    },
    "gemini-2.5-pro-preview": {
        "input": [
            {"up_to": 200000, "price": 1.25},
            {"up_to": float('inf'), "price": 2.50},
        ],
        "output": {
            "default": [
                {"up_to": 200000, "price": 10.00},
                {"up_to": float('inf'), "price": 15.00},
            ]
        },
    },
    "gemini-1.5-pro": {
        "input": [
            {"up_to": 128000, "price": 1.25},
            {"up_to": float('inf'), "price": 2.50},
        ],
        "output": {
            "default": [
                {"up_to": 128000, "price": 5.00},
                {"up_to": float('inf'), "price": 10.00},
            ]
        },
    },
    "gemini-1.5-flash": {
        "input": [
            {"up_to": 128000, "price": 0.075},
            {"up_to": float('inf'), "price": 0.15},
        ],
        "output": {
            "default": [
                {"up_to": 128000, "price": 0.30},
                {"up_to": float('inf'), "price": 0.60},
            ]
        },
    },
    "gemini-2.0-flash": {
        "input": [{"up_to": float('inf'), "price": 0.10}],
        "output": {"default": [{"up_to": float('inf'), "price": 0.40}]},
    },
    "gemini-2.5-flash-preview-04-17": {
        "input": [{"up_to": float('inf'), "price": 0.15}],
        "output": {
            "non_thinking": [{"up_to": float('inf'), "price": 0.60}],
            "thinking": [{"up_to": float('inf'), "price": 3.50}],
        },
    },
    "gemini-2.5-flash-preview": {
        "input": [{"up_to": float('inf'), "price": 0.15}],
        "output": {
            "non_thinking": [{"up_to": float('inf'), "price": 0.60}],
            "thinking": [{"up_to": float('inf'), "price": 3.50}],
        },
    },
    "openai:o4-mini": {
        "input": [{"up_to": float('inf'), "price": 1.10}],
        "output": {"default": [{"up_to": float('inf'), "price": 4.40}]},
    },
    "openai:o3": {
        "input": [{"up_to": float('inf'), "price": 10.00}],
        "output": {"default": [{"up_to": float('inf'), "price": 40.00}]},
    },
    "openai:gpt-4.1": {
        "input": [{"up_to": float('inf'), "price": 2.00}],
        "output": {"default": [{"up_to": float('inf'), "price": 8.00}]},
    },
    "openai:gpt-4.1-mini": {
        "input": [{"up_to": float('inf'), "price": 0.40}],
        "output": {"default": [{"up_to": float('inf'), "price": 1.60}]},
    },
    "openai:gpt-4.1-nano": {
        "input": [{"up_to": float('inf'), "price": 0.10}],
        "output": {"default": [{"up_to": float('inf'), "price": 0.40}]},
    },
}
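
For orientation, here is a minimal sketch of how one of these tiered entries resolves to a price: the first tier whose up_to bound covers the token count wins. This mirrors the get_price_for_tokens helper added to merbench_ui.py below; the token counts are made up for illustration and this snippet is not part of the commit.

# Editor's illustration: resolving a price from a tiered MODEL_COSTS entry.
tiers = [
    {"up_to": 200000, "price": 1.25},
    {"up_to": float("inf"), "price": 2.50},
]

def price_for(token_count, price_tiers):
    # The first tier whose bound covers the token count wins.
    for tier in price_tiers:
        if token_count <= tier["up_to"]:
            return tier["price"]
    return price_tiers[-1]["price"]

print(price_for(150_000, tiers))  # 1.25 (within the 200k tier)
print(price_for(350_000, tiers))  # 2.5  (falls through to the inf tier)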

agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py

Lines changed: 167 additions & 44 deletions
@@ -8,6 +8,8 @@
 import json
 import re
 from pydantic import ValidationError
+import csv
+import io

 from agents_mcp_usage.multi_mcp.eval_multi_mcp.dashboard_config import (
     DEFAULT_CONFIG,
@@ -27,15 +29,37 @@
     page_title=EVAL_CONFIG.title, page_icon=EVAL_CONFIG.icon, layout="wide"
 )

-# Default model costs (per 1M tokens)
-DEFAULT_COSTS = {
-    "gemini-2.5-pro-preview-06-05": {"input": 3.50, "output": 10.50},
-    "gemini-2.0-flash": {"input": 0.075, "output": 0.30},
-    "gemini-2.5-flash-preview-04-17": {"input": 0.075, "output": 0.30},
-    "openai:o4-mini": {"input": 0.15, "output": 0.60},
-    "openai:gpt-4.1-mini": {"input": 0.15, "output": 0.60},
-    "openai:gpt-4.1": {"input": 2.50, "output": 10.00},
-}
+# --- Cost Loading ---
+
+
+def load_model_costs(file_path: str) -> Dict:
+    """Loads model costs from a CSV file and returns a structured dictionary."""
+    try:
+        with open(file_path, "r", encoding="utf-8") as f:
+            # Read lines, skipping comments and empty lines
+            lines = [
+                line for line in f if not line.strip().startswith("#") and line.strip()
+            ]
+
+        # Find the start of the dictionary-like definition
+        dict_str = "".join(lines)
+        match = re.search(r"MODEL_COSTS\s*=\s*({.*})", dict_str, re.DOTALL)
+        if not match:
+            st.error(f"Could not find 'MODEL_COSTS' dictionary in {file_path}")
+            return {}
+
+        # Safely evaluate the dictionary string
+        model_costs_raw = eval(match.group(1), {"float": float})
+
+        return model_costs_raw
+
+    except FileNotFoundError:
+        st.warning(f"Cost file not found at {file_path}. Using empty cost config.")
+        return {}
+    except (SyntaxError, NameError, Exception) as e:
+        st.error(f"Error parsing cost file {file_path}: {e}")
+        return {}
+

 # --- Data Loading and Processing ---
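
Note that load_model_costs parses the cost file as a Python-literal snippet rather than CSV rows: comment lines are stripped, the MODEL_COSTS = {...} body is captured by regex, and it is evaluated with float in scope so float('inf') resolves. A self-contained sketch of that flow (an editor's illustration; the sample text and model name are made up):

import re

# Editor's sketch of the parsing approach used above (sample content is illustrative).
sample = """
# comment lines and blank lines are skipped
MODEL_COSTS = {
    "example-model": {
        "input": [{"up_to": 200000, "price": 1.25},
                  {"up_to": float('inf'), "price": 2.50}],
        "output": {"default": [{"up_to": float('inf'), "price": 10.00}]},
    },
}
"""

lines = [
    ln for ln in sample.splitlines(True) if not ln.strip().startswith("#") and ln.strip()
]
match = re.search(r"MODEL_COSTS\s*=\s*({.*})", "".join(lines), re.DOTALL)
costs = eval(match.group(1), {"float": float})  # mirrors the eval call above
print(costs["example-model"]["input"][1]["price"])  # 2.5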

@@ -112,10 +136,18 @@ def parse_metric_details(metric_details_str: str) -> Dict:
         return {}


+def get_price_for_tokens(token_count: int, price_tiers: List[Dict]) -> float:
+    """Finds the correct price for a given number of tokens from a list of tiers."""
+    for tier in price_tiers:
+        if token_count <= tier["up_to"]:
+            return tier["price"]
+    return price_tiers[-1]["price"]  # Fallback to the highest tier price
+
+
 def calculate_costs(
     df: pd.DataFrame, cost_config: Dict, eval_config: Dict
 ) -> pd.DataFrame:
-    """Calculates input, output, and total costs for each run based on eval config."""
+    """Calculates input, output, and total costs for each run based on new tiered pricing."""
     df_with_costs = df.copy()
     cost_calc_config = eval_config.get("cost_calculation", {})
     input_token_cols = cost_calc_config.get("input_token_cols", [])
@@ -127,21 +159,56 @@

     for idx, row in df_with_costs.iterrows():
         model = row.get("Model")
-        if model in cost_config:
-            try:
-                input_tokens = sum(row.get(col, 0) or 0 for col in input_token_cols)
-                output_tokens = sum(row.get(col, 0) or 0 for col in output_token_cols)
-
-                input_cost = (input_tokens / 1_000_000) * cost_config[model]["input"]
-                output_cost = (output_tokens / 1_000_000) * cost_config[model]["output"]
-
-                df_with_costs.at[idx, "input_cost"] = input_cost
-                df_with_costs.at[idx, "output_cost"] = output_cost
-                df_with_costs.at[idx, "total_cost"] = input_cost + output_cost
-            except (TypeError, KeyError) as e:
-                st.warning(
-                    f"Cost calculation error for model {model} at row {idx}: {e}"
+        model_costs = cost_config.get(model)
+
+        if not model_costs:
+            continue
+
+        try:
+            input_tokens = sum(row.get(col, 0) or 0 for col in input_token_cols)
+            output_tokens = sum(row.get(col, 0) or 0 for col in output_token_cols)
+            thinking_tokens = row.get("thinking_tokens", 0) or 0
+            non_thinking_output_tokens = output_tokens - thinking_tokens
+
+            total_tokens = input_tokens + output_tokens
+
+            # Determine input cost
+            input_price_tiers = model_costs.get("input", [])
+            input_price = get_price_for_tokens(total_tokens, input_price_tiers)
+            input_cost = (input_tokens / 1_000_000) * input_price
+
+            # Determine output cost
+            output_cost = 0
+            output_pricing = model_costs.get("output", {})
+
+            if "thinking" in output_pricing and thinking_tokens > 0:
+                thinking_price_tiers = output_pricing["thinking"]
+                thinking_price = get_price_for_tokens(
+                    total_tokens, thinking_price_tiers
+                )
+                output_cost += (thinking_tokens / 1_000_000) * thinking_price
+
+            if "non_thinking" in output_pricing and non_thinking_output_tokens > 0:
+                non_thinking_price_tiers = output_pricing["non_thinking"]
+                non_thinking_price = get_price_for_tokens(
+                    total_tokens, non_thinking_price_tiers
                 )
+                output_cost += (
+                    non_thinking_output_tokens / 1_000_000
+                ) * non_thinking_price
+
+            elif "default" in output_pricing:
+                default_price_tiers = output_pricing["default"]
+                default_price = get_price_for_tokens(total_tokens, default_price_tiers)
+                output_cost += (output_tokens / 1_000_000) * default_price
+
+            df_with_costs.at[idx, "input_cost"] = input_cost
+            df_with_costs.at[idx, "output_cost"] = output_cost
+            df_with_costs.at[idx, "total_cost"] = input_cost + output_cost
+
+        except (TypeError, KeyError, IndexError) as e:
+            st.warning(f"Cost calculation error for model {model} at row {idx}: {e}")
+
     return df_with_costs
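
A worked example of the tiered calculation above, under made-up token counts (editor's sketch; note that the committed code selects the price tier by total tokens, input plus output, for both sides, and splits thinking tokens out of the output when thinking/non_thinking tiers are present):

def get_price_for_tokens(token_count, price_tiers):
    # Same tier-selection rule as the helper above.
    for tier in price_tiers:
        if token_count <= tier["up_to"]:
            return tier["price"]
    return price_tiers[-1]["price"]

# Hypothetical run: 150k input tokens, 20k output tokens of which 5k are "thinking".
input_tokens, output_tokens, thinking_tokens = 150_000, 20_000, 5_000
total_tokens = input_tokens + output_tokens  # 170,000 -> stays in the first tier below

costs = {
    "input": [{"up_to": 200_000, "price": 1.25}, {"up_to": float("inf"), "price": 2.50}],
    "output": {
        "thinking": [{"up_to": float("inf"), "price": 3.50}],
        "non_thinking": [{"up_to": float("inf"), "price": 0.60}],
    },
}

input_cost = (input_tokens / 1_000_000) * get_price_for_tokens(total_tokens, costs["input"])
output_cost = (thinking_tokens / 1_000_000) * get_price_for_tokens(
    total_tokens, costs["output"]["thinking"]
) + ((output_tokens - thinking_tokens) / 1_000_000) * get_price_for_tokens(
    total_tokens, costs["output"]["non_thinking"]
)
print(round(input_cost, 4), round(output_cost, 4))  # 0.1875 0.0265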

@@ -176,6 +243,15 @@ def process_data(
         processed_df.get("Metric_response_tokens", 0) + processed_df["thinking_tokens"]
     )

+    # Calculate total tokens for leaderboard
+    cost_calc_config = eval_config.cost_calculation
+    input_token_cols = cost_calc_config.input_token_cols
+    output_token_cols = cost_calc_config.output_token_cols
+
+    processed_df["total_tokens"] = 0
+    for col in input_token_cols + output_token_cols:
+        processed_df["total_tokens"] += processed_df.get(col, 0).fillna(0)
+
     # Standardize primary metric score
     primary_metric_config = eval_config.primary_metric
     if (
@@ -228,7 +304,7 @@ def create_leaderboard(
         "Correct": (primary_metric_name, "mean"),
         "Cost": ("total_cost", "mean"),
         "Duration": ("Duration", "mean"),
-        "Tokens": ("total_response_tokens", "mean"),
+        "Avg Total Tokens": ("total_tokens", "mean"),
         "Runs": ("Model", "size"),
     }

@@ -459,7 +535,7 @@ def main():
     st.subheader("LLM Evaluation Benchmark Dashboard")

     # --- Sidebar Setup ---
-    st.sidebar.header("⚙️ Configuration")
+    st.sidebar.header("⚙️ Data Configuration")

     # File selection
     default_dir_path = (
@@ -493,40 +569,85 @@
         st.error("No data loaded. Please check the selected files.")
         return

-    available_models = sorted(df_initial["Model"].unique())
+    # Grouping filter
+    grouping_config = eval_config.grouping
+    st.sidebar.subheader(f"🎯 {grouping_config.label} Filter")
+
+    # Ensure the target column exists before trying to access it
+    if grouping_config.target_column not in df_initial.columns:
+        df_initial = extract_grouping_column(df_initial, eval_config.model_dump())
+
+    available_groups = sorted(df_initial[grouping_config.target_column].unique())
+    selected_groups = st.sidebar.multiselect(
+        f"Filter by {grouping_config.label.lower()}:",
+        options=available_groups,
+        default=available_groups,
+    )

     # Cost configuration in sidebar
     st.sidebar.subheader("💰 Cost Configuration")
+    cost_file_path = os.path.join(os.path.dirname(__file__), "costs.csv")
+    model_costs = load_model_costs(cost_file_path)
+    available_models = sorted(df_initial["Model"].unique())
+
     cost_config = {}
+    user_cost_override = {}
+
     with st.sidebar.expander("Edit Model Costs (per 1M tokens)", expanded=False):
+        for model in available_models:
+            if model in model_costs:
+                cost_config[model] = model_costs[model]
+            else:
+                st.warning(f"No cost data found for model: {model}. Using zeros.")
+                cost_config[model] = {
+                    "input": [{"up_to": float("inf"), "price": 0.0}],
+                    "output": {"default": [{"up_to": float("inf"), "price": 0.0}]},
+                }
+
+        st.markdown("---")
+        st.markdown("Override costs below (optional, simplified):")
+
         for model in available_models:
             cols = st.columns(2)
-            default = DEFAULT_COSTS.get(model, {"input": 0.0, "output": 0.0})
+            default_input = (
+                cost_config.get(model, {}).get("input", [{}])[0].get("price", 0.0)
+            )
+            output_pricing = cost_config.get(model, {}).get("output", {})
+            if "default" in output_pricing:
+                default_output = output_pricing["default"][0].get("price", 0.0)
+            elif "non_thinking" in output_pricing:
+                default_output = output_pricing["non_thinking"][0].get("price", 0.0)
+            else:
+                default_output = 0.0
+
             input_cost = cols[0].number_input(
                 f"{model} Input",
-                value=float(default["input"]),
+                value=float(default_input),
                 step=0.01,
-                format="%.2f",
+                format="%.4f",
+                key=f"{model}_input_cost",
             )
             output_cost = cols[1].number_input(
                 f"{model} Output",
-                value=float(default["output"]),
+                value=float(default_output),
                 step=0.01,
-                format="%.2f",
+                format="%.4f",
+                key=f"{model}_output_cost",
             )
-            cost_config[model] = {"input": input_cost, "output": output_cost}

-    df = process_data(df_initial, cost_config, eval_config)
+            if input_cost != default_input or output_cost != default_output:
+                user_cost_override[model] = {
+                    "input": [{"up_to": float("inf"), "price": input_cost}],
+                    "output": {
+                        "default": [{"up_to": float("inf"), "price": output_cost}]
+                    },
+                }

-    # Grouping filter
-    grouping_config = eval_config.grouping
-    st.sidebar.subheader(f"🎯 {grouping_config.label} Filter")
-    available_groups = sorted(df[grouping_config.target_column].unique())
-    selected_groups = st.sidebar.multiselect(
-        f"Filter by {grouping_config.label.lower()}:",
-        options=available_groups,
-        default=available_groups,
-    )
+    # Apply overrides
+    final_cost_config = cost_config.copy()
+    final_cost_config.update(user_cost_override)
+
+    df = process_data(df_initial, final_cost_config, eval_config)

     # --- Main Panel ---
     st.header("📊 Overview")
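
One consequence of the override path above, worth noting: a manual sidebar override replaces that model's tiered entry with a single flat up_to=inf tier, so any breakpoints loaded from the cost file are dropped for that model. A small sketch of the merge (editor's illustration; values and model name are made up):

cost_config = {
    "example-model": {
        "input": [{"up_to": 200000, "price": 1.25}, {"up_to": float("inf"), "price": 2.50}],
        "output": {"default": [{"up_to": float("inf"), "price": 10.00}]},
    }
}
user_cost_override = {
    "example-model": {
        "input": [{"up_to": float("inf"), "price": 1.50}],
        "output": {"default": [{"up_to": float("inf"), "price": 12.00}]},
    }
}
final_cost_config = cost_config.copy()
final_cost_config.update(user_cost_override)
print(len(final_cost_config["example-model"]["input"]))  # 1 -> tiers flattened by the override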
@@ -566,7 +687,9 @@
                 "Duration": st.column_config.NumberColumn(
                     "Avg Duration (s)", format="%.2fs"
                 ),
-                "Tokens": st.column_config.NumberColumn("Avg Tokens", format="%.0f"),
+                "Avg Total Tokens": st.column_config.NumberColumn(
+                    "Avg Total Tokens", format="%.0f"
+                ),
             },
             use_container_width=True,
         )
