File tree: 3 files changed (+12 -4 lines changed)
agents_mcp_usage/multi_mcp/eval_multi_mcp: 3 files changed (+12 -4 lines changed)
Original file line number | Diff line number | Diff line change
19
19
20
20
MERBENCH_CONFIG = {
21
21
# --- General Dashboard Settings ---
22
- "title" : "Merbench - LLM Evaluation Benchmark" ,
23
- "icon" : "🏆" , # Emoji for the browser tab
22
+ "title" : "🧜♀️ Merbench - LLM Evaluation " ,
23
+ "description" : (
24
+ "Getting LLMs to consistently nail the mermaid diagram syntax can be... an adventure. "
25
+ "\n \n Merbench tests this ability by providing an LLM Agent access to an MCP server that both validates "
26
+ "and provides error messages to guide correction of syntax. There are three different difficulty levels (test cases), "
27
+ "and the LLM is given a fixed number of attempts to fix the diagram, if this is exceeded, the test case is considered failed. "
28
+ "\n \n This leaderboard shows the average success rate across all selected models and difficulty levels."
29
+ ),
30
+ "icon" : "🧜♀️" , # Emoji for the browser tab
24
31
# --- Primary Metric Configuration ---
25
32
# The primary metric is the main score used for the leaderboard and
26
33
# the y-axis of the Pareto frontier plot.
Original file line number Diff line number Diff line change @@ -653,7 +653,7 @@ def main() -> None:
653
653
eval_config = EVAL_CONFIG # Use the validated config
654
654
655
655
st .title (eval_config .title )
656
- st .subheader ( "LLM Evaluation Benchmark Dashboard" )
656
+ st .markdown ( eval_config . description )
657
657
658
658
# --- Sidebar Setup ---
659
659
st .sidebar .header ("⚙️ Data Configuration" )
@@ -817,7 +817,7 @@ def main() -> None:
817
817
else :
818
818
st .warning ("No data available for the current filter selection." )
819
819
820
- st .header ("📈 Pareto Frontier Analysis " )
820
+ st .header ("📈 Pareto Frontier" )
821
821
pareto_config = eval_config .plots .pareto
822
822
x_axis_mode = st .radio (
823
823
"Compare performance against:" ,
Original file line number Diff line number Diff line change @@ -93,6 +93,7 @@ class CostCalculationConfig(BaseModel):
93
93
94
94
class DashboardConfig (BaseModel ):
95
95
title : str
96
+ description : str
96
97
icon : str
97
98
primary_metric : PrimaryMetricConfig
98
99
grouping : GroupingConfig
You can’t perform that action at this time.
0 commit comments