Commit 851ef88

[WIP] Fixed Leaderboard dropdown bug and update theme (#3291)
1 parent 827aaba commit 851ef88

File tree: 2 files changed (+127, -58 lines changed)

fastchat/serve/monitor/monitor.py

Lines changed: 126 additions & 58 deletions
@@ -32,16 +32,28 @@
 leader_component_values = [None] * 5


-def make_default_md(arena_df, elo_results, mirror=False):
-    mirror_str = "<span style='color: red; font-weight: bold;'>This is a mirror of the live leaderboard created and maintained by the [LMSYS Organization](https://lmsys.org).</span>"
+def make_default_md_1(arena_df, elo_results, mirror=False):
+    link_color = "#1976D2"  # This color should be clear in both light and dark mode
     leaderboard_md = f"""
-# 🏆 LMSYS Chatbot Arena Leaderboard
-| [Website](https://lmsys.org) | [Vote](https://chat.lmsys.org) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
+# 🏆 LMSYS Chatbot Arena Leaderboard
+<a href='https://lmsys.org/blog/2023-05-03-arena/' style='color: {link_color}; text-decoration: none;'>Blog</a> |
+<a href='https://arxiv.org/abs/2403.04132' style='color: {link_color}; text-decoration: none;'>Paper</a> |
+<a href='https://github.com/lm-sys/FastChat' style='color: {link_color}; text-decoration: none;'>GitHub</a> |
+<a href='https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md' style='color: {link_color}; text-decoration: none;'>Dataset</a> |
+<a href='https://twitter.com/lmsysorg' style='color: {link_color}; text-decoration: none;'>Twitter</a> |
+<a href='https://discord.gg/HSWAKCrnFx' style='color: {link_color}; text-decoration: none;'>Discord</a>
+"""
+
+    return leaderboard_md

+
+def make_default_md_2(arena_df, elo_results, mirror=False):
+    mirror_str = "<span style='color: red; font-weight: bold'>This is a mirror of the live leaderboard created and maintained by the <a href='https://lmsys.org' style='color: red; text-decoration: none;'>LMSYS Organization</a>. Please link to <a href='https://leaderboard.lmsys.org' style='color: #B00020; text-decoration: none;'>leaderboard.lmsys.org</a> for citation purposes.</span>"
+    leaderboard_md = f"""
 {mirror_str if mirror else ""}

-LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals. We've collected over **800,000** human pairwise comparisons to rank LLMs with the [Bradley-Terry model](https://en.wikipedia.org/wiki/Bradley%E2%80%93Terry_model) and display the model ratings in Elo-scale.
-You can find more details in our [paper](https://arxiv.org/abs/2403.04132).
+LMSYS Chatbot Arena is a crowdsourced open platform for LLM evals. We've collected over 800,000 human pairwise comparisons to rank LLMs with the Bradley-Terry model and display the model ratings in Elo-scale.
+You can find more details in our paper. **Chatbot arena is dependent on community participation, please contribute by casting your vote!**
 """

     return leaderboard_md
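A quick usage sketch of the split helpers, assuming FastChat at this commit is importable: both functions build static markdown, and neither actually reads its arena_df / elo_results arguments, so they can be previewed with placeholder values.

from fastchat.serve.monitor.monitor import make_default_md_1, make_default_md_2

print(make_default_md_1(None, None))               # title plus the styled link row
print(make_default_md_2(None, None, mirror=True))  # mirror banner plus the intro blurb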
@@ -57,11 +69,11 @@ def make_arena_leaderboard_md(arena_df):

 📣 **NEW!** View leaderboard for different categories (e.g., coding, long user query)! This is still in preview and subject to change.

-Code to recreate leaderboard tables and plots in this [notebook]({notebook_url}). You can contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)!
+Code to recreate leaderboard tables and plots in this [notebook]({notebook_url}). You can contribute your vote at [chat.lmsys.org](https://chat.lmsys.org)!

 ***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model.
 Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval).
-See Figure 3 below for visualization of the confidence intervals of model scores.
+See Figure 1 below for visualization of the confidence intervals of model scores.
 """
     return leaderboard_md

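The Rank (UB) rule described in this markdown can be sanity-checked with a few lines of pandas. This is a minimal sketch, not the repository's recompute_final_ranking, and the CI column names rating_q025 / rating_q975 are assumed for illustration.

import pandas as pd

def upper_bound_ranks(df):
    ranks = {}
    for a in df.index:
        # models statistically better than `a`: their lower CI bound exceeds a's upper CI bound
        better = sum(
            df.loc[b, "rating_q025"] > df.loc[a, "rating_q975"]
            for b in df.index
            if b != a
        )
        ranks[a] = 1 + better  # Rank (UB) = 1 + number of statistically better models
    return pd.Series(ranks)

elo = pd.DataFrame(
    {"rating_q025": [1240, 1228, 1180], "rating_q975": [1260, 1252, 1210]},
    index=["model-a", "model-b", "model-c"],
)
print(upper_bound_ranks(elo))  # model-a: 1, model-b: 1, model-c: 3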
@@ -250,10 +262,8 @@ def get_full_table(arena_df, model_table_df):

 def create_ranking_str(ranking, ranking_difference):
     if ranking_difference > 0:
-        # return f"{int(ranking)} (\u2191{int(ranking_difference)})"
         return f"{int(ranking)} \u2191"
     elif ranking_difference < 0:
-        # return f"{int(ranking)} (\u2193{int(-ranking_difference)})"
         return f"{int(ranking)} \u2193"
     else:
         return f"{int(ranking)}"
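For reference, a small sketch of what create_ranking_str returns (assuming FastChat at this commit is importable). A positive ranking_difference means the model ranks better in the selected category than in the overall ranking, so it gets an up arrow; this mixed number-and-arrow output is also why later hunks cast final_ranking to str and switch the Gradio column datatype from "number" to "str".

from fastchat.serve.monitor.monitor import create_ranking_str

print(create_ranking_str(4, 2))   # "4 ↑"  (better category rank than overall)
print(create_ranking_str(7, -1))  # "7 ↓"  (worse category rank than overall)
print(create_ranking_str(3, 0))   # "3"    (unchanged)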
@@ -275,21 +285,31 @@ def recompute_final_ranking(arena_df):
     return list(ranking.values())


+def highlight_top_models(df):
+    def highlight_max_rank(s):
+        # Pastel Yellow with transparency, rgba(red, green, blue, alpha)
+        highlight_color = "rgba(255, 255, 128, 0.2)"  # 50% transparent
+        if int(s["Rank* (UB)"].replace("↑", "").replace("↓", "")) == 1:
+            return [f"background-color: {highlight_color}" for _ in s]
+        else:
+            return ["" for _ in s]
+
+    # Apply and return the styled DataFrame
+    return df.apply(highlight_max_rank, axis=1)
+
+
 def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
     arena_df = arena_df.sort_values(
         by=["final_ranking", "rating"], ascending=[True, False]
     )
     arena_df["final_ranking"] = recompute_final_ranking(arena_df)
     arena_df = arena_df.sort_values(by=["final_ranking"], ascending=True)

-    # arena_df["final_ranking"] = range(1, len(arena_df) + 1)
     # sort by rating
     if arena_subset_df is not None:
         # filter out models not in the arena_df
         arena_subset_df = arena_subset_df[arena_subset_df.index.isin(arena_df.index)]
         arena_subset_df = arena_subset_df.sort_values(by=["rating"], ascending=False)
-        # arena_subset_df = arena_subset_df.sort_values(by=["final_ranking"], ascending=True)
-        # arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 500]
         arena_subset_df["final_ranking"] = recompute_final_ranking(arena_subset_df)
         # keep only the models in the subset in arena_df and recompute final_ranking
         arena_df = arena_df[arena_df.index.isin(arena_subset_df.index)]
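highlight_top_models relies on pandas' row-wise Styler.apply: the inner function receives a row as a Series and must return one CSS string per cell. A self-contained sketch of the same pattern, with invented sample data:

import pandas as pd

df = pd.DataFrame(
    {"Rank* (UB)": ["1", "2 ↓", "1 ↑"], "Model": ["model-a", "model-b", "model-c"]}
)

def highlight_rank_one(row):
    # strip the arrow decorations before comparing the numeric rank
    is_top = int(row["Rank* (UB)"].replace("↑", "").replace("↓", "")) == 1
    return ["background-color: rgba(255, 255, 128, 0.2)" if is_top else "" for _ in row]

styled = df.style.apply(highlight_rank_one, axis=1)  # one style string per cell, row by row
print(styled.to_html())  # the highlight survives HTML rendering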
@@ -307,10 +327,6 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
             arena_df["final_ranking_global"] - arena_df["final_ranking"]
         )

-        # no tie version
-        # arena_df = arena_subset_df.join(arena_df["final_ranking_no_tie"], rsuffix="_global", how="inner")
-        # arena_df["ranking_difference"] = arena_df["final_ranking_no_tie_global"] - arena_df["final_ranking_no_tie"]
-
         arena_df = arena_df.sort_values(
             by=["final_ranking", "rating"], ascending=[True, False]
         )
@@ -319,6 +335,8 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
             axis=1,
         )

+    arena_df["final_ranking"] = arena_df["final_ranking"].astype(str)
+
     values = []
     for i in range(len(arena_df)):
         row = []
@@ -417,11 +435,19 @@ def build_leaderboard_tab(
     p3 = category_elo_results["Overall"]["bootstrap_elo_rating"]
     p4 = category_elo_results["Overall"]["average_win_rate_bar"]
     arena_df = arena_dfs["Overall"]
-    default_md = make_default_md(
+    default_md = make_default_md_1(
+        arena_df, category_elo_results["Overall"], mirror=mirror
+    )
+    default_md_2 = make_default_md_2(
         arena_df, category_elo_results["Overall"], mirror=mirror
     )

-    md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
+    with gr.Row():
+        with gr.Column(scale=4):
+            md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
+        with gr.Column(scale=1):
+            vote_button = gr.Button("Vote!", link="https://chat.lmsys.org")
+    md2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown")
     if leaderboard_table_file:
         data = load_leaderboard_table_csv(leaderboard_table_file)
         model_table_df = pd.DataFrame(data)
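The header rework uses a common Gradio layout idiom: unequal-scale columns inside a row, plus a Button whose link parameter turns the click into plain navigation. A minimal self-contained sketch of that layout (placeholder text, not the leaderboard's actual content):

import gradio as gr

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=4):
            gr.Markdown("# 🏆 Leaderboard title and links go here")
        with gr.Column(scale=1):
            # link= makes the button navigate to the URL instead of firing a click event
            gr.Button("Vote!", link="https://chat.lmsys.org")
    gr.Markdown("The second markdown block renders below the header row.")

if __name__ == "__main__":
    demo.launch()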
@@ -447,13 +473,26 @@ def build_leaderboard_tab(
                 default_category_details, elem_id="category_deets"
             )

+            arena_vals = pd.DataFrame(
+                arena_table_vals,
+                columns=[
+                    "Rank* (UB)",
+                    "Model",
+                    "Arena Elo",
+                    "95% CI",
+                    "Votes",
+                    "Organization",
+                    "License",
+                    "Knowledge Cutoff",
+                ],
+            )
             elo_display_df = gr.Dataframe(
                 headers=[
                     "Rank* (UB)",
-                    "🤖 Model",
-                    "Arena Elo",
-                    "📊 95% CI",
-                    "🗳️ Votes",
+                    "Model",
+                    "Arena Elo",
+                    "95% CI",
+                    "Votes",
                     "Organization",
                     "License",
                     "Knowledge Cutoff",
@@ -468,12 +507,14 @@ def build_leaderboard_tab(
                     "str",
                     "str",
                 ],
-                value=arena_table_vals,
+                # value=highlight_top_models(arena_vals.style),
+                value=arena_vals.style,
                 elem_id="arena_leaderboard_dataframe",
                 height=700,
                 column_widths=[70, 190, 100, 100, 90, 130, 150, 100],
                 wrap=True,
             )
+
             gr.Markdown(
                 f"""Note: in each category, we exclude models with fewer than 500 votes as their confidence intervals can be large.""",
                 elem_id="leaderboard_markdown",
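Because value is now arena_vals.style rather than a raw list, gr.Dataframe receives a pandas Styler. A short sketch, assuming the installed Gradio version accepts Styler values (as this commit relies on), reusing the row-highlight pattern sketched earlier:

import gradio as gr
import pandas as pd

df = pd.DataFrame(
    {"Rank* (UB)": ["1", "1 ↑", "3"], "Model": ["model-a", "model-b", "model-c"], "Arena Elo": [1251, 1248, 1190]}
)

def shade_top_rank(row):
    top = row["Rank* (UB)"].replace("↑", "").replace("↓", "").strip() == "1"
    return ["background-color: rgba(255, 255, 128, 0.2)" if top else "" for _ in row]

with gr.Blocks() as demo:
    gr.Dataframe(value=df.style.apply(shade_top_rank, axis=1), wrap=True)

if __name__ == "__main__":
    demo.launch()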
@@ -489,42 +530,41 @@ def build_leaderboard_tab(
             with gr.Row():
                 with gr.Column():
                     gr.Markdown(
-                        "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles",
+                        "#### Figure 1: Confidence Intervals on Model Strength (via Bootstrapping)",
                         elem_id="plot-title",
                     )
-                    plot_1 = gr.Plot(
-                        p1, show_label=False, elem_id="plot-container"
-                    )
+                    plot_3 = gr.Plot(p3, show_label=False)
                 with gr.Column():
                     gr.Markdown(
-                        "#### Figure 2: Battle Count for Each Combination of Models (without Ties)",
+                        "#### Figure 2: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
                         elem_id="plot-title",
                     )
-                    plot_2 = gr.Plot(p2, show_label=False)
+                    plot_4 = gr.Plot(p4, show_label=False)
             with gr.Row():
                 with gr.Column():
                     gr.Markdown(
-                        "#### Figure 3: Confidence Intervals on Model Strength (via Bootstrapping)",
+                        "#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles",
                         elem_id="plot-title",
                     )
-                    plot_3 = gr.Plot(p3, show_label=False)
+                    plot_1 = gr.Plot(
+                        p1, show_label=False, elem_id="plot-container"
+                    )
                 with gr.Column():
                     gr.Markdown(
-                        "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
+                        "#### Figure 4: Battle Count for Each Combination of Models (without Ties)",
                         elem_id="plot-title",
                     )
-                    plot_4 = gr.Plot(p4, show_label=False)
-
+                    plot_2 = gr.Plot(p2, show_label=False)
         with gr.Tab("Full Leaderboard", id=1):
             md = make_full_leaderboard_md(elo_results)
             gr.Markdown(md, elem_id="leaderboard_markdown")
             full_table_vals = get_full_table(arena_df, model_table_df)
             gr.Dataframe(
                 headers=[
-                    "🤖 Model",
-                    "Arena Elo",
-                    "📈 MT-bench",
-                    "📚 MMLU",
+                    "Model",
+                    "Arena Elo",
+                    "MT-bench",
+                    "MMLU",
                     "Organization",
                     "License",
                 ],
@@ -551,10 +591,10 @@ def update_leaderboard_df(arena_table_vals):
         columns=[
             "Rank* (UB)",
             "Delta",
-            "🤖 Model",
-            "Arena Elo",
-            "📊 95% CI",
-            "🗳️ Votes",
+            "Model",
+            "Arena Elo",
+            "95% CI",
+            "Votes",
             "Organization",
             "License",
             "Knowledge Cutoff",
@@ -599,20 +639,21 @@ def update_leaderboard_and_plots(category):
         )
         if category != "Overall":
            arena_values = update_leaderboard_df(arena_values)
+            # arena_values = highlight_top_models(arena_values)
             arena_values = gr.Dataframe(
                 headers=[
                     "Rank* (UB)",
                     "Delta",
-                    "🤖 Model",
-                    "Arena Elo",
-                    "📊 95% CI",
-                    "🗳️ Votes",
+                    "Model",
+                    "Arena Elo",
+                    "95% CI",
+                    "Votes",
                     "Organization",
                     "License",
                     "Knowledge Cutoff",
                 ],
                 datatype=[
-                    "number",
+                    "str",
                     "number",
                     "markdown",
                     "number",
@@ -629,19 +670,29 @@ def update_leaderboard_and_plots(category):
                 wrap=True,
             )
         else:
+            # not_arena_values = pd.DataFrame(arena_values, columns=["Rank* (UB)",
+            #     "Model",
+            #     "Arena Elo",
+            #     "95% CI",
+            #     "Votes",
+            #     "Organization",
+            #     "License",
+            #     "Knowledge Cutoff",],
+            # )
+            # arena_values = highlight_top_models(not_arena_values.style)
             arena_values = gr.Dataframe(
                 headers=[
                     "Rank* (UB)",
-                    "🤖 Model",
-                    "Arena Elo",
-                    "📊 95% CI",
-                    "🗳️ Votes",
+                    "Model",
+                    "Arena Elo",
+                    "95% CI",
+                    "Votes",
                     "Organization",
                     "License",
                     "Knowledge Cutoff",
                 ],
                 datatype=[
-                    "number",
+                    "str",
                     "markdown",
                     "number",
                     "str",
@@ -685,7 +736,7 @@ def update_leaderboard_and_plots(category):
     from fastchat.serve.gradio_web_server import acknowledgment_md

     with gr.Accordion(
-        "📝 Citation",
+        "Citation",
         open=True,
     ):
         citation_md = """
@@ -713,15 +764,31 @@ def build_demo(elo_results_file, leaderboard_table_file):
     from fastchat.serve.gradio_web_server import block_css

     text_size = gr.themes.sizes.text_lg
-    theme = gr.themes.Base(text_size=text_size)
+    # load theme from theme.json
+    theme = gr.themes.Default.load("theme.json")
+    # set text size to large
+    theme.text_size = text_size
     theme.set(
-        button_secondary_background_fill_hover="*primary_300",
-        button_secondary_background_fill_hover_dark="*primary_700",
+        button_large_text_size="40px",
+        button_small_text_size="40px",
+        button_large_text_weight="1000",
+        button_small_text_weight="1000",
+        button_shadow="*shadow_drop_lg",
+        button_shadow_hover="*shadow_drop_lg",
+        checkbox_label_shadow="*shadow_drop_lg",
+        button_shadow_active="*shadow_inset",
+        button_secondary_background_fill="*primary_300",
+        button_secondary_background_fill_dark="*primary_700",
+        button_secondary_background_fill_hover="*primary_200",
+        button_secondary_background_fill_hover_dark="*primary_500",
+        button_secondary_text_color="*primary_800",
+        button_secondary_text_color_dark="white",
     )

     with gr.Blocks(
         title="Chatbot Arena Leaderboard",
-        theme=gr.themes.Default(text_size=text_size),
+        # theme=gr.themes.Default(text_size=text_size),
+        theme=theme,
         css=block_css,
     ) as demo:
         with gr.Tabs() as tabs:
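The commit swaps the inline gr.themes.Base for a theme loaded from theme.json and then overrides individual variables with theme.set. A hedged sketch of how such a theme.json can be produced with Gradio's own serializer; the builder settings below are placeholders, not the actual theme file this commit ships:

import gradio as gr

theme = gr.themes.Default(text_size=gr.themes.sizes.text_lg)
theme.set(
    button_secondary_background_fill="*primary_300",
    button_secondary_text_color="*primary_800",
)
theme.dump(filename="theme.json")                # write the JSON that build_demo later loads
restored = gr.themes.Default.load("theme.json")  # same call used above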
@@ -730,6 +797,7 @@ def build_demo(elo_results_file, leaderboard_table_file):
                 elo_results_file,
                 leaderboard_table_file,
                 show_plot=True,
+                mirror=False,
             )

             with gr.Tab("Basic Stats", id=1):
