
Commit 87bdffc

feat: Make benchmark dashboard more flexible for eval tasks
1 parent 7ada57c commit 87bdffc

File tree

3 files changed (+574, -174 lines)

Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
"""
Dashboard Configuration for the Generic Evaluation UI

This file defines the "personality" of the Streamlit dashboard. By modifying this
configuration, you can adapt the UI to display results from any evaluation
that produces a CSV file with a compatible format.

Each section of the configuration is documented to explain its purpose and
the available options.
"""

# ==============================================================================
# MERBENCH DASHBOARD CONFIGURATION
# ==============================================================================
# This is an example configuration for the "Merbench" evaluation.
# You can duplicate and modify this structure to create configurations
# for other evaluations.
# ==============================================================================

MERBENCH_CONFIG = {
    # --- General Dashboard Settings ---
    "title": "Merbench - LLM Evaluation Benchmark",
    "icon": "🏆",  # Emoji for the browser tab
    # --- Primary Metric Configuration ---
    # The primary metric is the main score used for the leaderboard and
    # the y-axis of the Pareto frontier plot.
    "primary_metric": {
        "name": "correctness_score",  # The column name in the CSV
        "label": "Avg. Success Rate",  # How the metric is displayed in the UI
        "goal": "maximize",  # 'maximize' or 'minimize'
        "score_column": "Score_MermaidDiagramValid",
    },
    # --- Grouping and Filtering ---
    # Defines how to group test cases (e.g., by difficulty).
    "grouping": {
        "column": "Case",  # The column containing the case names
        "label": "Difficulty",  # The label for the filter in the sidebar
        # A regex to extract group names from the 'column'. The first
        # capture group will be used as the group name.
        "extractor_regex": r".*_(easy|medium|hard)",
        "target_column": "difficulty",
    },
    # --- Plot and Analysis Tab Configuration ---
    # This section defines which plots are displayed in the UI.
    "plots": {
        "pareto": {
            "enabled": True,
            "title": "Performance vs. {x_axis_label}",
            "y_axis": "primary_metric",  # Uses the primary_metric defined above
            "x_axis_options": {
                "cost": {"column": "total_cost", "label": "Cost"},
                "tokens": {"column": "total_response_tokens", "label": "Tokens"},
            },
            "color_axis": "Duration",  # Column to use for the color scale
        },
        "success_rates": {
            "enabled": True,
            "title": "Success Rate by Metric",
            "type": "grouped_bar",
            # Finds all columns starting with this prefix to create a bar for each.
            "y_prefix": "Score_",
        },
        "failure_analysis": {
            "enabled": True,
            "title": "Failure Analysis by Reason",
            "type": "stacked_bar",
            # Defines the series for the stacked bar chart.
            # Each item represents a condition that counts as a "failure".
            "series": [
                {
                    "name": "Invalid Diagram",
                    "column": "Score_MermaidDiagramValid",
                    "condition": "== 0",
                },
                {
                    "name": "MCP Tool Failure",
                    "column": "Score_UsedBothMCPTools",
                    "condition": "< 1",
                },
                {
                    "name": "Usage Limit Exceeded",
                    "column": "Score_UsageLimitNotExceeded",
                    "condition": "== 0",
                },
            ],
        },
        "token_breakdown": {
            "enabled": True,
            "title": "Average Token Usage by Type",
            "type": "stacked_bar",
            "series": [
                {"name": "Request", "column": "Metric_request_tokens"},
                {"name": "Response", "column": "Metric_response_tokens"},
                {"name": "Thinking", "column": "thinking_tokens"},
            ],
        },
        "cost_breakdown": {
            "enabled": True,
            "title": "Average Cost Breakdown by Type",
            "type": "stacked_bar",
            "series": [
                {"name": "Input Cost", "column": "input_cost"},
                {"name": "Output Cost", "column": "output_cost"},
            ],
        },
    },
    # --- Cost Calculation ---
    # Defines which columns are used to calculate the total cost.
    "cost_calculation": {
        "input_token_cols": ["Metric_request_tokens"],
        "output_token_cols": ["Metric_response_tokens", "thinking_tokens"],
    },
}

# --- Add other configurations for different evaluations below ---
# EXAMPLE_OTHER_EVAL_CONFIG = { ... }

# The default configuration to use when the dashboard starts.
# You can change this to point to a different configuration.
DEFAULT_CONFIG = MERBENCH_CONFIG
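
For illustration, here is a minimal sketch of how the "grouping" section might be consumed, assuming pandas and a hypothetical apply_grouping helper (neither the helper name nor the loading code appears in this diff):

import pandas as pd

def apply_grouping(df: pd.DataFrame, grouping: dict) -> pd.DataFrame:
    # Hypothetical helper, not part of this commit: extract the first capture
    # group of extractor_regex from the source column and store it in
    # target_column (e.g. "flowchart_easy" -> "easy").
    df[grouping["target_column"]] = df[grouping["column"]].str.extract(
        grouping["extractor_regex"], expand=False
    )
    return df

cases = pd.DataFrame({"Case": ["flowchart_easy", "sequence_hard"]})
apply_grouping(cases, MERBENCH_CONFIG["grouping"])
print(cases["difficulty"].tolist())  # ['easy', 'hard']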
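
Similarly, each failure_analysis series pairs a column with a condition string such as "== 0" or "< 1". One plausible way a renderer could interpret those strings, again an assumption rather than the dashboard's actual parser:

import operator

import pandas as pd

# Assumed operator table; the real condition parser may differ.
_OPS = {"==": operator.eq, "!=": operator.ne, "<": operator.lt,
        "<=": operator.le, ">": operator.gt, ">=": operator.ge}

def failure_counts(df: pd.DataFrame, series: list[dict]) -> dict[str, int]:
    # Count rows matching each failure condition,
    # e.g. Score_MermaidDiagramValid == 0.
    counts = {}
    for entry in series:
        op_str, threshold = entry["condition"].split()
        mask = _OPS[op_str](df[entry["column"]], float(threshold))
        counts[entry["name"]] = int(mask.sum())
    return counts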
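
Finally, cost_calculation only names which token columns count as input versus output; per-token prices are not part of this file. A sketch of how a total could be derived, with the price arguments invented for illustration:

def estimate_total_cost(row, cost_cfg, input_usd_per_mtok, output_usd_per_mtok):
    # The price parameters are hypothetical: this config defines the token
    # columns, not per-model pricing.
    input_tokens = sum(row[c] for c in cost_cfg["input_token_cols"])
    output_tokens = sum(row[c] for c in cost_cfg["output_token_cols"])
    return (input_tokens * input_usd_per_mtok
            + output_tokens * output_usd_per_mtok) / 1_000_000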
