Skip to content

Commit d825b2d

Browse files
committed
feat: Improve legibility and swap in friendly model names
1 parent b88254f commit d825b2d

File tree

2 files changed

+183
-39
lines changed

2 files changed

+183
-39
lines changed

agents_mcp_usage/multi_mcp/eval_multi_mcp/costs.csv

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
# Prices are per 1 million tokens
22
# Gemini prices from https://ai.google.dev/gemini-api/docs/pricing
33
# OpenAI prices from https://openai.com/api/pricing/
4+
45
MODEL_COSTS = {
56
"gemini-2.5-pro-preview-03-25": {
7+
"friendly_name": "Gemini 2.5 Pro Preview (Mar)",
68
"input": [
79
{"up_to": 200000, "price": 1.25},
810
{"up_to": float('inf'), "price": 2.50},
@@ -15,6 +17,7 @@ MODEL_COSTS = {
1517
},
1618
},
1719
"gemini-2.5-pro-preview-05-06": {
20+
"friendly_name": "Gemini 2.5 Pro Preview (May)",
1821
"input": [
1922
{"up_to": 200000, "price": 1.25},
2023
{"up_to": float('inf'), "price": 2.50},
@@ -27,6 +30,7 @@ MODEL_COSTS = {
2730
},
2831
},
2932
"gemini-2.5-pro-preview-06-05": {
33+
"friendly_name": "Gemini 2.5 Pro Preview (Jun)",
3034
"input": [
3135
{"up_to": 200000, "price": 1.25},
3236
{"up_to": float('inf'), "price": 2.50},
@@ -39,6 +43,7 @@ MODEL_COSTS = {
3943
},
4044
},
4145
"gemini-2.5-pro-preview": {
46+
"friendly_name": "Gemini 2.5 Pro Preview",
4247
"input": [
4348
{"up_to": 200000, "price": 1.25},
4449
{"up_to": float('inf'), "price": 2.50},
@@ -51,6 +56,7 @@ MODEL_COSTS = {
5156
},
5257
},
5358
"gemini-1.5-pro": {
59+
"friendly_name": "Gemini 1.5 Pro",
5460
"input": [
5561
{"up_to": 128000, "price": 1.25},
5662
{"up_to": float('inf'), "price": 2.50},
@@ -63,6 +69,7 @@ MODEL_COSTS = {
6369
},
6470
},
6571
"gemini-1.5-flash": {
72+
"friendly_name": "Gemini 1.5 Flash",
6673
"input": [
6774
{"up_to": 128000, "price": 0.075},
6875
{"up_to": float('inf'), "price": 0.15},
@@ -75,61 +82,74 @@ MODEL_COSTS = {
7582
},
7683
},
7784
"gemini-2.0-flash": {
85+
"friendly_name": "Gemini 2.0 Flash",
7886
"input": [{"up_to": float('inf'), "price": 0.10}],
7987
"output": {"default": [{"up_to": float('inf'), "price": 0.40}]},
8088
},
8189
"gemini-2.5-flash-preview-04-17": {
90+
"friendly_name": "Gemini 2.5 Flash Preview (Apr)",
8291
"input": [{"up_to": float('inf'), "price": 0.15}],
8392
"output": {
8493
"non_thinking": [{"up_to": float('inf'), "price": 0.60}],
8594
"thinking": [{"up_to": float('inf'), "price": 3.50}],
8695
},
8796
},
8897
"gemini-2.5-flash-preview": {
98+
"friendly_name": "Gemini 2.5 Flash Preview",
8999
"input": [{"up_to": float('inf'), "price": 0.15}],
90100
"output": {
91101
"non_thinking": [{"up_to": float('inf'), "price": 0.60}],
92102
"thinking": [{"up_to": float('inf'), "price": 3.50}],
93103
},
94104
},
95105
"openai:o4-mini": {
106+
"friendly_name": "OpenAI o4-mini",
96107
"input": [{"up_to": float('inf'), "price": 1.10}],
97108
"output": {"default": [{"up_to": float('inf'), "price": 4.40}]},
98109
},
99110
"openai:o3": {
111+
"friendly_name": "OpenAI o3",
100112
"input": [{"up_to": float('inf'), "price": 10.00}],
101113
"output": {"default": [{"up_to": float('inf'), "price": 40.00}]},
102114
},
103115
"openai:gpt-4.1": {
116+
"friendly_name": "GPT-4.1",
104117
"input": [{"up_to": float('inf'), "price": 2.00}],
105118
"output": {"default": [{"up_to": float('inf'), "price": 8.00}]},
106119
},
107120
"openai:gpt-4.1-mini": {
121+
"friendly_name": "GPT-4.1 Mini",
108122
"input": [{"up_to": float('inf'), "price": 0.40}],
109123
"output": {"default": [{"up_to": float('inf'), "price": 1.60}]},
110124
},
111125
"openai:gpt-4.1-nano": {
126+
"friendly_name": "GPT-4.1 Nano",
112127
"input": [{"up_to": float('inf'), "price": 0.10}],
113128
"output": {"default": [{"up_to": float('inf'), "price": 0.40}]},
114129
},
115130
"bedrock:us.anthropic.claude-sonnet-4-20250514-v1:0": {
131+
"friendly_name": "Claude 4 Sonnet",
116132
"input": [{"up_to": float('inf'), "price": 3.00}],
117133
"output": {"default": [{"up_to": float('inf'), "price": 15.00}]},
118134
},
119135
"bedrock:us.anthropic.claude-opus-4-20250514-v1:0": {
136+
"friendly_name": "Claude 4 Opus",
120137
"input": [{"up_to": float('inf'), "price": 15.00}],
121138
"output": {"default": [{"up_to": float('inf'), "price": 75.00}]},
122139
},
123140
"bedrock:us.anthropic.claude-3-7-sonnet-20250219-v1:0": {
141+
"friendly_name": "Claude 3.7 Sonnet",
124142
"input": [{"up_to": float('inf'), "price": 3.00}],
125143
"output": {"default": [{"up_to": float('inf'), "price": 15.00}]},
126144
},
127145
"bedrock:us.anthropic.claude-3-5-sonnet-20240620-v1:0": {
146+
"friendly_name": "Claude 3.5 Sonnet",
128147
"input": [{"up_to": float('inf'), "price": 3.00}],
129148
"output": {"default": [{"up_to": float('inf'), "price": 15.00}]},
130149
},
131150
"bedrock:us.anthropic.claude-3-5-haiku-20241022-v1:0": {
151+
"friendly_name": "Claude 3.5 Haiku",
132152
"input": [{"up_to": float('inf'), "price": 1.00}],
133153
"output": {"default": [{"up_to": float('inf'), "price": 4.00}]},
134154
},
135-
}
155+
}

agents_mcp_usage/multi_mcp/eval_multi_mcp/merbench_ui.py

Lines changed: 162 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,14 @@
2929
# --- Cost Loading ---
3030

3131

32-
def load_model_costs(file_path: str) -> Dict:
33-
"""Loads model costs from a CSV file and returns a structured dictionary.
32+
def load_model_costs(file_path: str) -> tuple[Dict, Dict]:
33+
"""Loads model costs and friendly names from a CSV file and returns structured dictionaries.
3434
3535
Args:
3636
file_path: The path to the cost file.
3737
3838
Returns:
39-
A dictionary containing the model costs.
39+
A tuple containing (model_costs_dict, friendly_names_dict).
4040
"""
4141
try:
4242
with open(file_path, "r", encoding="utf-8") as f:
@@ -45,24 +45,45 @@ def load_model_costs(file_path: str) -> Dict:
4545
line for line in f if not line.strip().startswith("#") and line.strip()
4646
]
4747

48-
# Find the start of the dictionary-like definition
48+
# Find the start of the dictionary-like definitions
4949
dict_str = "".join(lines)
50-
match = re.search(r"MODEL_COSTS\s*=\s*({.*})", dict_str, re.DOTALL)
51-
if not match:
52-
st.error(f"Could not find 'MODEL_COSTS' dictionary in {file_path}")
53-
return {}
54-
55-
# Safely evaluate the dictionary string
56-
model_costs_raw = eval(match.group(1), {"float": float})
5750

58-
return model_costs_raw
51+
# Extract MODEL_COSTS
52+
costs_match = re.search(r"MODEL_COSTS\s*=\s*({.*})", dict_str, re.DOTALL)
53+
if not costs_match:
54+
st.error(f"Could not find 'MODEL_COSTS' dictionary in {file_path}")
55+
return {}, {}
56+
57+
# Safely evaluate the dictionary strings
58+
model_costs_raw = eval(costs_match.group(1), {"float": float})
59+
60+
# Extract friendly names from the inline entries
61+
friendly_names = {}
62+
model_costs_clean = {}
63+
64+
for model_id, model_data in model_costs_raw.items():
65+
# Extract friendly name if it exists
66+
if isinstance(model_data, dict) and "friendly_name" in model_data:
67+
friendly_names[model_id] = model_data["friendly_name"]
68+
# Create a clean copy without the friendly_name for cost calculations
69+
model_costs_clean[model_id] = {
70+
key: value
71+
for key, value in model_data.items()
72+
if key != "friendly_name"
73+
}
74+
else:
75+
# No friendly name, use model_id as fallback
76+
friendly_names[model_id] = model_id
77+
model_costs_clean[model_id] = model_data
78+
79+
return model_costs_clean, friendly_names
5980

6081
except FileNotFoundError:
6182
st.warning(f"Cost file not found at {file_path}. Using empty cost config.")
62-
return {}
83+
return {}, {}
6384
except (SyntaxError, NameError, Exception) as e:
6485
st.error(f"Error parsing cost file {file_path}: {e}")
65-
return {}
86+
return {}, {}
6687

6788

6889
# --- Data Loading and Processing ---
@@ -388,8 +409,76 @@ def create_leaderboard(
388409
return leaderboard.sort_values("Correct", ascending=sort_ascending)
389410

390411

412+
def _calculate_smart_label_positions(
413+
x_data, y_data, labels, min_distance_threshold=0.1
414+
):
415+
"""Calculate optimal label positions to avoid overlaps.
416+
417+
Args:
418+
x_data: Array of x coordinates (normalized to 0-1 range for distance calc)
419+
y_data: Array of y coordinates (normalized to 0-1 range for distance calc)
420+
labels: Array of label strings
421+
min_distance_threshold: Minimum distance threshold for considering overlap
422+
423+
Returns:
424+
List of textposition strings for each point
425+
"""
426+
import numpy as np
427+
428+
# Normalize coordinates to 0-1 range for distance calculations
429+
x_norm = (
430+
(x_data - x_data.min()) / (x_data.max() - x_data.min())
431+
if x_data.max() != x_data.min()
432+
else x_data * 0
433+
)
434+
y_norm = (
435+
(y_data - y_data.min()) / (y_data.max() - y_data.min())
436+
if y_data.max() != y_data.min()
437+
else y_data * 0
438+
)
439+
440+
positions = ["top center"] * len(x_data)
441+
position_options = [
442+
"top center",
443+
"bottom center",
444+
"middle left",
445+
"middle right",
446+
"top left",
447+
"top right",
448+
"bottom left",
449+
"bottom right",
450+
]
451+
452+
# Calculate distances between all pairs of points
453+
for i in range(len(x_data)):
454+
for j in range(i + 1, len(x_data)):
455+
distance = np.sqrt(
456+
(x_norm[i] - x_norm[j]) ** 2 + (y_norm[i] - y_norm[j]) ** 2
457+
)
458+
459+
if distance < min_distance_threshold:
460+
# Points are close, try different positions
461+
for pos_idx, position in enumerate(position_options):
462+
if positions[i] == "top center":
463+
positions[i] = position_options[pos_idx % len(position_options)]
464+
break
465+
466+
for pos_idx, position in enumerate(position_options):
467+
if positions[j] == "top center" or positions[j] == positions[i]:
468+
positions[j] = position_options[
469+
(pos_idx + 1) % len(position_options)
470+
]
471+
break
472+
473+
return positions
474+
475+
391476
def create_pareto_frontier_plot(
392-
df: pd.DataFrame, selected_groups: List[str], x_axis_mode: str, config: Dict
477+
df: pd.DataFrame,
478+
selected_groups: List[str],
479+
x_axis_mode: str,
480+
config: Dict,
481+
friendly_names: Dict = None,
393482
) -> go.Figure:
394483
"""Visualizes the trade-off between model performance and cost/token usage.
395484
@@ -436,29 +525,64 @@ def create_pareto_frontier_plot(
436525
else:
437526
hover_format = ":.0f"
438527

439-
fig.add_trace(
440-
go.Scatter(
441-
x=x_data,
442-
y=model_metrics["y_axis"],
443-
mode="markers+text",
444-
marker=dict(
445-
size=18,
446-
color=model_metrics["color_axis"],
447-
colorscale="RdYlGn_r",
448-
showscale=True,
449-
colorbar=dict(title=f"Avg {plot_config['color_axis']} (s)"),
450-
),
451-
text=model_metrics["Model"],
452-
textposition="top center",
453-
hovertemplate=(
454-
"<b>%{text}</b><br>"
455-
f"{y_axis_label}: %{{y:.1f}}%<br>"
456-
f"{hover_label}: %{{x{hover_format}}}<br>"
457-
f"Avg {plot_config['color_axis']}: %{{marker.color:.1f}}s<extra></extra>"
458-
),
459-
)
528+
# Calculate smart label positions to avoid overlaps
529+
label_positions = _calculate_smart_label_positions(
530+
x_data.values, model_metrics["y_axis"].values, model_metrics["Model"].values
460531
)
461532

533+
# Group data by text position to create separate traces
534+
from collections import defaultdict
535+
536+
position_groups = defaultdict(list)
537+
538+
for i, position in enumerate(label_positions):
539+
position_groups[position].append(i)
540+
541+
# Create a trace for each text position group
542+
first_trace = True
543+
for position, indices in position_groups.items():
544+
x_vals = [x_data.iloc[i] for i in indices]
545+
y_vals = [model_metrics["y_axis"].iloc[i] for i in indices]
546+
colors = [model_metrics["color_axis"].iloc[i] for i in indices]
547+
548+
# Get model names for this position group
549+
original_names = [model_metrics["Model"].iloc[i] for i in indices]
550+
551+
# Use friendly names for display if available, otherwise use original names
552+
if friendly_names:
553+
display_texts = [friendly_names.get(name, name) for name in original_names]
554+
else:
555+
display_texts = original_names
556+
557+
fig.add_trace(
558+
go.Scatter(
559+
x=x_vals,
560+
y=y_vals,
561+
mode="markers+text",
562+
marker=dict(
563+
size=18,
564+
color=colors,
565+
colorscale="RdYlGn_r",
566+
showscale=first_trace, # Show colorbar only on first trace
567+
colorbar=dict(title=f"Avg {plot_config['color_axis']} (s)")
568+
if first_trace
569+
else None,
570+
),
571+
text=display_texts, # Use friendly names for display
572+
textposition=position,
573+
customdata=original_names, # Store original names for hover
574+
hovertemplate=(
575+
"<b>%{text}</b><br>" # Friendly name as title
576+
"API Name: %{customdata}<br>" # Original API name
577+
f"{y_axis_label}: %{{y:.1f}}%<br>"
578+
f"{hover_label}: %{{x{hover_format}}}<br>"
579+
f"Avg {plot_config['color_axis']}: %{{marker.color:.1f}}s<extra></extra>"
580+
),
581+
showlegend=False, # Don't show legend for individual position groups
582+
)
583+
)
584+
first_trace = False
585+
462586
fig.update_layout(
463587
title=plot_config["title"].format(x_axis_label=x_title),
464588
xaxis_title=f"Average {x_title}",
@@ -714,7 +838,7 @@ def main() -> None:
714838
# Cost configuration in sidebar
715839
st.sidebar.subheader("💰 Cost Configuration")
716840
cost_file_path = os.path.join(os.path.dirname(__file__), "costs.csv")
717-
model_costs = load_model_costs(cost_file_path)
841+
model_costs, friendly_names = load_model_costs(cost_file_path)
718842
available_models = sorted(df_initial["Model"].unique())
719843

720844
cost_config = {}
@@ -833,7 +957,7 @@ def main() -> None:
833957
)
834958
st.plotly_chart(
835959
create_pareto_frontier_plot(
836-
df, selected_groups, x_axis_mode, eval_config.model_dump()
960+
df, selected_groups, x_axis_mode, eval_config.model_dump(), friendly_names
837961
),
838962
use_container_width=True,
839963
)

0 commit comments

Comments
 (0)