8
8
import json
9
9
import re
10
10
from pydantic import ValidationError
11
+ import csv
12
+ import io
11
13
12
14
from agents_mcp_usage .multi_mcp .eval_multi_mcp .dashboard_config import (
13
15
DEFAULT_CONFIG ,
27
29
page_title = EVAL_CONFIG .title , page_icon = EVAL_CONFIG .icon , layout = "wide"
28
30
)
29
31
30
- # Default model costs (per 1M tokens)
31
- DEFAULT_COSTS = {
32
- "gemini-2.5-pro-preview-06-05" : {"input" : 3.50 , "output" : 10.50 },
33
- "gemini-2.0-flash" : {"input" : 0.075 , "output" : 0.30 },
34
- "gemini-2.5-flash-preview-04-17" : {"input" : 0.075 , "output" : 0.30 },
35
- "openai:o4-mini" : {"input" : 0.15 , "output" : 0.60 },
36
- "openai:gpt-4.1-mini" : {"input" : 0.15 , "output" : 0.60 },
37
- "openai:gpt-4.1" : {"input" : 2.50 , "output" : 10.00 },
38
- }
32
+ # --- Cost Loading ---
33
+
34
+
35
def load_model_costs(file_path: str) -> Dict:
    """Load model cost definitions from a file containing a ``MODEL_COSTS`` dict.

    The file is expected to contain, possibly among ``#`` comment lines, a
    Python dictionary literal assigned to ``MODEL_COSTS``.  Comment and blank
    lines are stripped before the dictionary is located with a regex.

    Args:
        file_path: Path to the cost-definition file.

    Returns:
        The parsed cost dictionary, or ``{}`` when the file is missing,
        the ``MODEL_COSTS`` assignment cannot be found, or parsing fails
        (a Streamlit warning/error is surfaced in those cases).
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            # Read lines, skipping comments and empty lines
            lines = [
                line for line in f if not line.strip().startswith("#") and line.strip()
            ]

        # Find the start of the dictionary-like definition
        dict_str = "".join(lines)
        match = re.search(r"MODEL_COSTS\s*=\s*({.*})", dict_str, re.DOTALL)
        if not match:
            st.error(f"Could not find 'MODEL_COSTS' dictionary in {file_path}")
            return {}

        # Evaluate the dictionary literal with builtins explicitly disabled.
        # Passing a globals dict WITHOUT a "__builtins__" key makes CPython
        # inject the full builtins, so the previous {"float": float} globals
        # actually allowed arbitrary code execution from the cost file.
        # Only float() is exposed (needed for float("inf") tier bounds).
        # NOTE(review): eval on file content is still fragile — a JSON/TOML
        # cost format with a real parser would be safer long-term.
        model_costs_raw = eval(  # noqa: S307 - local config file, builtins disabled
            match.group(1), {"__builtins__": {}, "float": float}
        )

        return model_costs_raw

    except FileNotFoundError:
        st.warning(f"Cost file not found at {file_path}. Using empty cost config.")
        return {}
    except Exception as e:
        # Covers SyntaxError/NameError from eval, I/O errors, etc.; the
        # original (SyntaxError, NameError, Exception) tuple was redundant.
        st.error(f"Error parsing cost file {file_path}: {e}")
        return {}
62
+
39
63
40
64
# --- Data Loading and Processing ---
41
65
@@ -112,10 +136,18 @@ def parse_metric_details(metric_details_str: str) -> Dict:
112
136
return {}
113
137
114
138
139
def get_price_for_tokens(token_count: int, price_tiers: List[Dict]) -> float:
    """Return the per-1M-token price applicable to ``token_count``.

    Args:
        token_count: Total token count used to select the pricing tier.
        price_tiers: Tier dicts with ``up_to`` (inclusive upper bound) and
            ``price`` keys, assumed sorted by ascending ``up_to``.

    Returns:
        The price of the first tier whose ``up_to`` covers ``token_count``;
        the last tier's price when no tier matches; ``0.0`` for an empty
        tier list (the original raised IndexError on ``price_tiers[-1]``).
    """
    # Robustness: models without configured pricing get zero cost instead of
    # crashing the per-row cost loop with an IndexError.
    if not price_tiers:
        return 0.0
    for tier in price_tiers:
        if token_count <= tier["up_to"]:
            return tier["price"]
    return price_tiers[-1]["price"]  # Fallback to the highest tier price
145
+
146
+
115
147
def calculate_costs (
116
148
df : pd .DataFrame , cost_config : Dict , eval_config : Dict
117
149
) -> pd .DataFrame :
118
- """Calculates input, output, and total costs for each run based on eval config ."""
150
+ """Calculates input, output, and total costs for each run based on new tiered pricing ."""
119
151
df_with_costs = df .copy ()
120
152
cost_calc_config = eval_config .get ("cost_calculation" , {})
121
153
input_token_cols = cost_calc_config .get ("input_token_cols" , [])
@@ -127,21 +159,56 @@ def calculate_costs(
127
159
128
160
for idx , row in df_with_costs .iterrows ():
129
161
model = row .get ("Model" )
130
- if model in cost_config :
131
- try :
132
- input_tokens = sum (row .get (col , 0 ) or 0 for col in input_token_cols )
133
- output_tokens = sum (row .get (col , 0 ) or 0 for col in output_token_cols )
134
-
135
- input_cost = (input_tokens / 1_000_000 ) * cost_config [model ]["input" ]
136
- output_cost = (output_tokens / 1_000_000 ) * cost_config [model ]["output" ]
137
-
138
- df_with_costs .at [idx , "input_cost" ] = input_cost
139
- df_with_costs .at [idx , "output_cost" ] = output_cost
140
- df_with_costs .at [idx , "total_cost" ] = input_cost + output_cost
141
- except (TypeError , KeyError ) as e :
142
- st .warning (
143
- f"Cost calculation error for model { model } at row { idx } : { e } "
162
+ model_costs = cost_config .get (model )
163
+
164
+ if not model_costs :
165
+ continue
166
+
167
+ try :
168
+ input_tokens = sum (row .get (col , 0 ) or 0 for col in input_token_cols )
169
+ output_tokens = sum (row .get (col , 0 ) or 0 for col in output_token_cols )
170
+ thinking_tokens = row .get ("thinking_tokens" , 0 ) or 0
171
+ non_thinking_output_tokens = output_tokens - thinking_tokens
172
+
173
+ total_tokens = input_tokens + output_tokens
174
+
175
+ # Determine input cost
176
+ input_price_tiers = model_costs .get ("input" , [])
177
+ input_price = get_price_for_tokens (total_tokens , input_price_tiers )
178
+ input_cost = (input_tokens / 1_000_000 ) * input_price
179
+
180
+ # Determine output cost
181
+ output_cost = 0
182
+ output_pricing = model_costs .get ("output" , {})
183
+
184
+ if "thinking" in output_pricing and thinking_tokens > 0 :
185
+ thinking_price_tiers = output_pricing ["thinking" ]
186
+ thinking_price = get_price_for_tokens (
187
+ total_tokens , thinking_price_tiers
188
+ )
189
+ output_cost += (thinking_tokens / 1_000_000 ) * thinking_price
190
+
191
+ if "non_thinking" in output_pricing and non_thinking_output_tokens > 0 :
192
+ non_thinking_price_tiers = output_pricing ["non_thinking" ]
193
+ non_thinking_price = get_price_for_tokens (
194
+ total_tokens , non_thinking_price_tiers
144
195
)
196
+ output_cost += (
197
+ non_thinking_output_tokens / 1_000_000
198
+ ) * non_thinking_price
199
+
200
+ elif "default" in output_pricing :
201
+ default_price_tiers = output_pricing ["default" ]
202
+ default_price = get_price_for_tokens (total_tokens , default_price_tiers )
203
+ output_cost += (output_tokens / 1_000_000 ) * default_price
204
+
205
+ df_with_costs .at [idx , "input_cost" ] = input_cost
206
+ df_with_costs .at [idx , "output_cost" ] = output_cost
207
+ df_with_costs .at [idx , "total_cost" ] = input_cost + output_cost
208
+
209
+ except (TypeError , KeyError , IndexError ) as e :
210
+ st .warning (f"Cost calculation error for model { model } at row { idx } : { e } " )
211
+
145
212
return df_with_costs
146
213
147
214
@@ -176,6 +243,15 @@ def process_data(
176
243
processed_df .get ("Metric_response_tokens" , 0 ) + processed_df ["thinking_tokens" ]
177
244
)
178
245
246
+ # Calculate total tokens for leaderboard
247
+ cost_calc_config = eval_config .cost_calculation
248
+ input_token_cols = cost_calc_config .input_token_cols
249
+ output_token_cols = cost_calc_config .output_token_cols
250
+
251
+ processed_df ["total_tokens" ] = 0
252
+ for col in input_token_cols + output_token_cols :
253
+ processed_df ["total_tokens" ] += processed_df .get (col , 0 ).fillna (0 )
254
+
179
255
# Standardize primary metric score
180
256
primary_metric_config = eval_config .primary_metric
181
257
if (
@@ -228,7 +304,7 @@ def create_leaderboard(
228
304
"Correct" : (primary_metric_name , "mean" ),
229
305
"Cost" : ("total_cost" , "mean" ),
230
306
"Duration" : ("Duration" , "mean" ),
231
- "Tokens" : ("total_response_tokens " , "mean" ),
307
+ "Avg Total Tokens" : ("total_tokens " , "mean" ),
232
308
"Runs" : ("Model" , "size" ),
233
309
}
234
310
@@ -459,7 +535,7 @@ def main():
459
535
st .subheader ("LLM Evaluation Benchmark Dashboard" )
460
536
461
537
# --- Sidebar Setup ---
462
- st .sidebar .header ("⚙️ Configuration" )
538
+ st .sidebar .header ("⚙️ Data Configuration" )
463
539
464
540
# File selection
465
541
default_dir_path = (
@@ -493,40 +569,85 @@ def main():
493
569
st .error ("No data loaded. Please check the selected files." )
494
570
return
495
571
496
- available_models = sorted (df_initial ["Model" ].unique ())
572
+ # Grouping filter
573
+ grouping_config = eval_config .grouping
574
+ st .sidebar .subheader (f"🎯 { grouping_config .label } Filter" )
575
+
576
+ # Ensure the target column exists before trying to access it
577
+ if grouping_config .target_column not in df_initial .columns :
578
+ df_initial = extract_grouping_column (df_initial , eval_config .model_dump ())
579
+
580
+ available_groups = sorted (df_initial [grouping_config .target_column ].unique ())
581
+ selected_groups = st .sidebar .multiselect (
582
+ f"Filter by { grouping_config .label .lower ()} :" ,
583
+ options = available_groups ,
584
+ default = available_groups ,
585
+ )
497
586
498
587
# Cost configuration in sidebar
499
588
st .sidebar .subheader ("💰 Cost Configuration" )
589
+ cost_file_path = os .path .join (os .path .dirname (__file__ ), "costs.csv" )
590
+ model_costs = load_model_costs (cost_file_path )
591
+ available_models = sorted (df_initial ["Model" ].unique ())
592
+
500
593
cost_config = {}
594
+ user_cost_override = {}
595
+
501
596
with st .sidebar .expander ("Edit Model Costs (per 1M tokens)" , expanded = False ):
597
+ for model in available_models :
598
+ if model in model_costs :
599
+ cost_config [model ] = model_costs [model ]
600
+ else :
601
+ st .warning (f"No cost data found for model: { model } . Using zeros." )
602
+ cost_config [model ] = {
603
+ "input" : [{"up_to" : float ("inf" ), "price" : 0.0 }],
604
+ "output" : {"default" : [{"up_to" : float ("inf" ), "price" : 0.0 }]},
605
+ }
606
+
607
+ st .markdown ("---" )
608
+ st .markdown ("Override costs below (optional, simplified):" )
609
+
502
610
for model in available_models :
503
611
cols = st .columns (2 )
504
- default = DEFAULT_COSTS .get (model , {"input" : 0.0 , "output" : 0.0 })
612
+ default_input = (
613
+ cost_config .get (model , {}).get ("input" , [{}])[0 ].get ("price" , 0.0 )
614
+ )
615
+ output_pricing = cost_config .get (model , {}).get ("output" , {})
616
+ if "default" in output_pricing :
617
+ default_output = output_pricing ["default" ][0 ].get ("price" , 0.0 )
618
+ elif "non_thinking" in output_pricing :
619
+ default_output = output_pricing ["non_thinking" ][0 ].get ("price" , 0.0 )
620
+ else :
621
+ default_output = 0.0
622
+
505
623
input_cost = cols [0 ].number_input (
506
624
f"{ model } Input" ,
507
- value = float (default [ "input" ] ),
625
+ value = float (default_input ),
508
626
step = 0.01 ,
509
- format = "%.2f" ,
627
+ format = "%.4f" ,
628
+ key = f"{ model } _input_cost" ,
510
629
)
511
630
output_cost = cols [1 ].number_input (
512
631
f"{ model } Output" ,
513
- value = float (default [ "output" ] ),
632
+ value = float (default_output ),
514
633
step = 0.01 ,
515
- format = "%.2f" ,
634
+ format = "%.4f" ,
635
+ key = f"{ model } _output_cost" ,
516
636
)
517
- cost_config [model ] = {"input" : input_cost , "output" : output_cost }
518
637
519
- df = process_data (df_initial , cost_config , eval_config )
638
+ if input_cost != default_input or output_cost != default_output :
639
+ user_cost_override [model ] = {
640
+ "input" : [{"up_to" : float ("inf" ), "price" : input_cost }],
641
+ "output" : {
642
+ "default" : [{"up_to" : float ("inf" ), "price" : output_cost }]
643
+ },
644
+ }
520
645
521
- # Grouping filter
522
- grouping_config = eval_config .grouping
523
- st .sidebar .subheader (f"🎯 { grouping_config .label } Filter" )
524
- available_groups = sorted (df [grouping_config .target_column ].unique ())
525
- selected_groups = st .sidebar .multiselect (
526
- f"Filter by { grouping_config .label .lower ()} :" ,
527
- options = available_groups ,
528
- default = available_groups ,
529
- )
646
+ # Apply overrides
647
+ final_cost_config = cost_config .copy ()
648
+ final_cost_config .update (user_cost_override )
649
+
650
+ df = process_data (df_initial , final_cost_config , eval_config )
530
651
531
652
# --- Main Panel ---
532
653
st .header ("📊 Overview" )
@@ -566,7 +687,9 @@ def main():
566
687
"Duration" : st .column_config .NumberColumn (
567
688
"Avg Duration (s)" , format = "%.2fs"
568
689
),
569
- "Tokens" : st .column_config .NumberColumn ("Avg Tokens" , format = "%.0f" ),
690
+ "Avg Total Tokens" : st .column_config .NumberColumn (
691
+ "Avg Total Tokens" , format = "%.0f"
692
+ ),
570
693
},
571
694
use_container_width = True ,
572
695
)
0 commit comments