 leader_component_values = [None] * 5


-def make_default_md(arena_df, elo_results, mirror=False):
-    mirror_str = "<span style='color: red; font-weight: bold;'>This is a mirror of the live leaderboard created and maintained by the [LMSYS Organization](https://lmsys.org).</span>"
+def make_default_md_1(arena_df, elo_results, mirror=False):
+    link_color = "#1976D2"  # This color should be clear in both light and dark mode
     leaderboard_md = f"""
-# 🏆 LMSYS Chatbot Arena Leaderboard
-| [Website](https://lmsys.org) | [Vote](https://chat.lmsys.org) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
+# 🏆 LMSYS Chatbot Arena Leaderboard
+<a href='https://lmsys.org/blog/2023-05-03-arena/' style='color: {link_color}; text-decoration: none;'>Blog</a> |
+<a href='https://arxiv.org/abs/2403.04132' style='color: {link_color}; text-decoration: none;'>Paper</a> |
+<a href='https://github.com/lm-sys/FastChat' style='color: {link_color}; text-decoration: none;'>GitHub</a> |
+<a href='https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md' style='color: {link_color}; text-decoration: none;'>Dataset</a> |
+<a href='https://twitter.com/lmsysorg' style='color: {link_color}; text-decoration: none;'>Twitter</a> |
+<a href='https://discord.gg/HSWAKCrnFx' style='color: {link_color}; text-decoration: none;'>Discord</a>
+"""
+
+    return leaderboard_md

+
+def make_default_md_2(arena_df, elo_results, mirror=False):
+    mirror_str = "<span style='color: red; font-weight: bold'>This is a mirror of the live leaderboard created and maintained by the <a href='https://lmsys.org' style='color: red; text-decoration: none;'>LMSYS Organization</a>. Please link to <a href='https://leaderboard.lmsys.org' style='color: #B00020; text-decoration: none;'>leaderboard.lmsys.org</a> for citation purposes.</span>"
+    leaderboard_md = f"""
 {mirror_str if mirror else ""}

-LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals. We've collected over **800,000** human pairwise comparisons to rank LLMs with the [Bradley-Terry model](https://en.wikipedia.org/wiki/Bradley%E2%80%93Terry_model) and display the model ratings in Elo-scale.
-You can find more details in our [paper](https://arxiv.org/abs/2403.04132).
+LMSYS Chatbot Arena is a crowdsourced open platform for LLM evals. We've collected over 800,000 human pairwise comparisons to rank LLMs with the Bradley-Terry model and display the model ratings in Elo-scale.
+You can find more details in our paper. **Chatbot Arena depends on community participation, so please contribute by casting your vote!**
 """

     return leaderboard_md
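The text above describes the ranking recipe: fit a Bradley-Terry model to pairwise human preferences, then report the strengths on an Elo-like scale. A toy sketch of that idea is below; it is not the FastChat pipeline, and the win counts, iteration count, and the 400/1000 Elo-style constants are illustrative assumptions.

```python
import numpy as np

# Toy Bradley-Terry fit on pairwise win counts using the classic MM update
#   p_i <- W_i / sum_{j != i} n_ij / (p_i + p_j)
# wins[i, j] = number of battles model i won against model j (made-up numbers).
wins = np.array([
    [0.0, 7.0, 9.0],
    [3.0, 0.0, 6.0],
    [1.0, 4.0, 0.0],
])

n = wins.shape[0]
games = wins + wins.T          # total battles per pair
strength = np.ones(n)          # BT strengths p_i

for _ in range(200):
    total_wins = wins.sum(axis=1)
    denom = np.array([
        sum(games[i, j] / (strength[i] + strength[j]) for j in range(n) if j != i)
        for i in range(n)
    ])
    strength = total_wins / denom
    strength /= strength.sum()  # BT is only identified up to a scale factor

# Map onto an Elo-style scale for display (constants are a convention, not from the code).
elo_like = 400 * np.log10(strength / strength.mean()) + 1000
print(elo_like.round(1))        # the first model, with the most wins, ends up ahead
```

The confidence intervals shown later on the leaderboard come from bootstrapping a fit of this kind over resampled battles (the `bootstrap_elo_rating` plot further down).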
@@ -57,11 +69,11 @@ def make_arena_leaderboard_md(arena_df):

 📣 **NEW!** View leaderboard for different categories (e.g., coding, long user query)! This is still in preview and subject to change.

-Code to recreate leaderboard tables and plots in this [notebook]({notebook_url}). You can contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)!
+Code to recreate leaderboard tables and plots in this [notebook]({notebook_url}). You can contribute your vote at [chat.lmsys.org](https://chat.lmsys.org)!

 ***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model.
 Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval).
-See Figure 3 below for visualization of the confidence intervals of model scores.
+See Figure 1 below for visualization of the confidence intervals of model scores.
 """
     return leaderboard_md

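The "Rank (UB)" note above is effectively an algorithm: a model's rank is one plus the number of models whose lower confidence bound exceeds its upper confidence bound, so models with overlapping intervals tie. A minimal sketch of that rule, assuming interval columns named `rating_q025`/`rating_q975` (the actual column names in `elo_results` may differ):

```python
import pandas as pd

def rank_upper_bound(df: pd.DataFrame) -> list:
    """Rank (UB) = 1 + number of models that are statistically better,
    i.e. whose lower CI bound exceeds this model's upper CI bound."""
    ranks = []
    for a in df.index:
        better = sum(
            df.loc[b, "rating_q025"] > df.loc[a, "rating_q975"]
            for b in df.index
            if b != a
        )
        ranks.append(1 + better)
    return ranks

# Two overlapping intervals tie at rank 1; a clearly separated model drops to rank 3.
toy = pd.DataFrame(
    {"rating_q025": [1190, 1185, 1100], "rating_q975": [1210, 1205, 1120]},
    index=["model_a", "model_b", "model_c"],
)
print(rank_upper_bound(toy))  # [1, 1, 3]
```

This mirrors what `recompute_final_ranking` is used for in the table-building code below.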
@@ -250,10 +262,8 @@ def get_full_table(arena_df, model_table_df):

 def create_ranking_str(ranking, ranking_difference):
     if ranking_difference > 0:
-        # return f"{int(ranking)} (\u2191{int(ranking_difference)})"
         return f"{int(ranking)} \u2191"
     elif ranking_difference < 0:
-        # return f"{int(ranking)} (\u2193{int(-ranking_difference)})"
         return f"{int(ranking)} \u2193"
     else:
         return f"{int(ranking)}"
@@ -275,21 +285,31 @@ def recompute_final_ranking(arena_df):
     return list(ranking.values())


+def highlight_top_models(df):
+    def highlight_max_rank(s):
+        # Pastel yellow with transparency, rgba(red, green, blue, alpha)
+        highlight_color = "rgba(255, 255, 128, 0.2)"  # low-opacity highlight (alpha 0.2)
+        if int(s["Rank* (UB)"].replace("↑", "").replace("↓", "")) == 1:
+            return [f"background-color: {highlight_color}" for _ in s]
+        else:
+            return ["" for _ in s]
+
+    # Apply and return the styled DataFrame
+    return df.apply(highlight_max_rank, axis=1)
+
+
 def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
     arena_df = arena_df.sort_values(
         by=["final_ranking", "rating"], ascending=[True, False]
     )
     arena_df["final_ranking"] = recompute_final_ranking(arena_df)
     arena_df = arena_df.sort_values(by=["final_ranking"], ascending=True)

-    # arena_df["final_ranking"] = range(1, len(arena_df) + 1)
     # sort by rating
     if arena_subset_df is not None:
         # filter out models not in the arena_df
         arena_subset_df = arena_subset_df[arena_subset_df.index.isin(arena_df.index)]
         arena_subset_df = arena_subset_df.sort_values(by=["rating"], ascending=False)
-        # arena_subset_df = arena_subset_df.sort_values(by=["final_ranking"], ascending=True)
-        # arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 500]
         arena_subset_df["final_ranking"] = recompute_final_ranking(arena_subset_df)
         # keep only the models in the subset in arena_df and recompute final_ranking
         arena_df = arena_df[arena_df.index.isin(arena_subset_df.index)]
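The new `highlight_top_models` helper above returns one CSS string per cell and is meant to run through a pandas `Styler` (a later, commented-out call feeds it `arena_vals.style`). A self-contained sketch of that mechanism, using a made-up two-row frame:

```python
import pandas as pd

def highlight_max_rank(s):
    # Same rule as in the diff: pastel-yellow rows whose upper-bound rank is 1.
    highlight_color = "rgba(255, 255, 128, 0.2)"
    if int(s["Rank* (UB)"].replace("↑", "").replace("↓", "")) == 1:
        return [f"background-color: {highlight_color}" for _ in s]
    return ["" for _ in s]

df = pd.DataFrame({"Rank* (UB)": ["1 ↑", "2"], "Model": ["model_x", "model_y"]})
styled = df.style.apply(highlight_max_rank, axis=1)  # row-wise, one CSS string per cell
html = styled.to_html()  # or pass the Styler itself as the `value` of gr.Dataframe
```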
@@ -307,10 +327,6 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
             arena_df["final_ranking_global"] - arena_df["final_ranking"]
         )

-        # no tie version
-        # arena_df = arena_subset_df.join(arena_df["final_ranking_no_tie"], rsuffix="_global", how="inner")
-        # arena_df["ranking_difference"] = arena_df["final_ranking_no_tie_global"] - arena_df["final_ranking_no_tie"]
-
         arena_df = arena_df.sort_values(
             by=["final_ranking", "rating"], ascending=[True, False]
         )
@@ -319,6 +335,8 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
             axis=1,
         )

+        arena_df["final_ranking"] = arena_df["final_ranking"].astype(str)
+
     values = []
     for i in range(len(arena_df)):
         row = []
@@ -417,11 +435,19 @@ def build_leaderboard_tab(
     p3 = category_elo_results["Overall"]["bootstrap_elo_rating"]
     p4 = category_elo_results["Overall"]["average_win_rate_bar"]
     arena_df = arena_dfs["Overall"]
-    default_md = make_default_md(
+    default_md = make_default_md_1(
+        arena_df, category_elo_results["Overall"], mirror=mirror
+    )
+    default_md_2 = make_default_md_2(
         arena_df, category_elo_results["Overall"], mirror=mirror
     )

-    md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
+    with gr.Row():
+        with gr.Column(scale=4):
+            md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
+        with gr.Column(scale=1):
+            vote_button = gr.Button("Vote!", link="https://chat.lmsys.org")
+    md2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown")
     if leaderboard_table_file:
         data = load_leaderboard_table_csv(leaderboard_table_file)
         model_table_df = pd.DataFrame(data)
@@ -447,13 +473,26 @@ def build_leaderboard_tab(
                             default_category_details, elem_id="category_deets"
                         )

+                arena_vals = pd.DataFrame(
+                    arena_table_vals,
+                    columns=[
+                        "Rank* (UB)",
+                        "Model",
+                        "Arena Elo",
+                        "95% CI",
+                        "Votes",
+                        "Organization",
+                        "License",
+                        "Knowledge Cutoff",
+                    ],
+                )
                 elo_display_df = gr.Dataframe(
                     headers=[
                         "Rank* (UB)",
-                        "🤖 Model",
-                        "⭐ Arena Elo",
-                        "📊 95% CI",
-                        "🗳️ Votes",
+                        "Model",
+                        "Arena Elo",
+                        "95% CI",
+                        "Votes",
                         "Organization",
                         "License",
                         "Knowledge Cutoff",
@@ -468,12 +507,14 @@ def build_leaderboard_tab(
                         "str",
                         "str",
                     ],
-                    value=arena_table_vals,
+                    # value=highlight_top_models(arena_vals.style),
+                    value=arena_vals.style,
                     elem_id="arena_leaderboard_dataframe",
                     height=700,
                     column_widths=[70, 190, 100, 100, 90, 130, 150, 100],
                     wrap=True,
                 )
+
                 gr.Markdown(
                     f"""Note: in each category, we exclude models with fewer than 500 votes as their confidence intervals can be large.""",
                     elem_id="leaderboard_markdown",
@@ -489,42 +530,41 @@ def build_leaderboard_tab(
                 with gr.Row():
                     with gr.Column():
                         gr.Markdown(
-                            "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles",
+                            "#### Figure 1: Confidence Intervals on Model Strength (via Bootstrapping)",
                             elem_id="plot-title",
                         )
-                        plot_1 = gr.Plot(
-                            p1, show_label=False, elem_id="plot-container"
-                        )
+                        plot_3 = gr.Plot(p3, show_label=False)
                     with gr.Column():
                         gr.Markdown(
-                            "#### Figure 2: Battle Count for Each Combination of Models (without Ties)",
+                            "#### Figure 2: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
                             elem_id="plot-title",
                         )
-                        plot_2 = gr.Plot(p2, show_label=False)
+                        plot_4 = gr.Plot(p4, show_label=False)
                 with gr.Row():
                     with gr.Column():
                         gr.Markdown(
-                            "#### Figure 3: Confidence Intervals on Model Strength (via Bootstrapping)",
+                            "#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles",
                             elem_id="plot-title",
                         )
-                        plot_3 = gr.Plot(p3, show_label=False)
+                        plot_1 = gr.Plot(
+                            p1, show_label=False, elem_id="plot-container"
+                        )
                     with gr.Column():
                         gr.Markdown(
-                            "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
+                            "#### Figure 4: Battle Count for Each Combination of Models (without Ties)",
                             elem_id="plot-title",
                         )
-                        plot_4 = gr.Plot(p4, show_label=False)
-
+                        plot_2 = gr.Plot(p2, show_label=False)
             with gr.Tab("Full Leaderboard", id=1):
                 md = make_full_leaderboard_md(elo_results)
                 gr.Markdown(md, elem_id="leaderboard_markdown")
                 full_table_vals = get_full_table(arena_df, model_table_df)
                 gr.Dataframe(
                     headers=[
-                        "🤖 Model",
-                        "⭐ Arena Elo",
-                        "📈 MT-bench",
-                        "📚 MMLU",
+                        "Model",
+                        "Arena Elo",
+                        "MT-bench",
+                        "MMLU",
                         "Organization",
                         "License",
                     ],
@@ -551,10 +591,10 @@ def update_leaderboard_df(arena_table_vals):
             columns=[
                 "Rank* (UB)",
                 "Delta",
-                "🤖 Model",
-                "⭐ Arena Elo",
-                "📊 95% CI",
-                "🗳️ Votes",
+                "Model",
+                "Arena Elo",
+                "95% CI",
+                "Votes",
                 "Organization",
                 "License",
                 "Knowledge Cutoff",
@@ -599,20 +639,21 @@ def update_leaderboard_and_plots(category):
         )
         if category != "Overall":
             arena_values = update_leaderboard_df(arena_values)
+            # arena_values = highlight_top_models(arena_values)
             arena_values = gr.Dataframe(
                 headers=[
                     "Rank* (UB)",
                     "Delta",
-                    "🤖 Model",
-                    "⭐ Arena Elo",
-                    "📊 95% CI",
-                    "🗳️ Votes",
+                    "Model",
+                    "Arena Elo",
+                    "95% CI",
+                    "Votes",
                     "Organization",
                     "License",
                     "Knowledge Cutoff",
                 ],
                 datatype=[
-                    "number",
+                    "str",
                     "number",
                     "markdown",
                     "number",
@@ -629,19 +670,29 @@ def update_leaderboard_and_plots(category):
                 wrap=True,
             )
         else:
+            # not_arena_values = pd.DataFrame(arena_values, columns=["Rank* (UB)",
+            # "Model",
+            # "Arena Elo",
+            # "95% CI",
+            # "Votes",
+            # "Organization",
+            # "License",
+            # "Knowledge Cutoff",],
+            # )
+            # arena_values = highlight_top_models(not_arena_values.style)
             arena_values = gr.Dataframe(
                 headers=[
                     "Rank* (UB)",
-                    "🤖 Model",
-                    "⭐ Arena Elo",
-                    "📊 95% CI",
-                    "🗳️ Votes",
+                    "Model",
+                    "Arena Elo",
+                    "95% CI",
+                    "Votes",
                     "Organization",
                     "License",
                     "Knowledge Cutoff",
                 ],
                 datatype=[
-                    "number",
+                    "str",
                     "markdown",
                     "number",
                     "str",
@@ -685,7 +736,7 @@ def update_leaderboard_and_plots(category):
     from fastchat.serve.gradio_web_server import acknowledgment_md

     with gr.Accordion(
-        "📝 Citation",
+        "Citation",
         open=True,
     ):
         citation_md = """
@@ -713,15 +764,31 @@ def build_demo(elo_results_file, leaderboard_table_file):
     from fastchat.serve.gradio_web_server import block_css

     text_size = gr.themes.sizes.text_lg
-    theme = gr.themes.Base(text_size=text_size)
+    # load theme from theme.json
+    theme = gr.themes.Default.load("theme.json")
+    # set text size to large
+    theme.text_size = text_size
     theme.set(
-        button_secondary_background_fill_hover="*primary_300",
-        button_secondary_background_fill_hover_dark="*primary_700",
+        button_large_text_size="40px",
+        button_small_text_size="40px",
+        button_large_text_weight="1000",
+        button_small_text_weight="1000",
+        button_shadow="*shadow_drop_lg",
+        button_shadow_hover="*shadow_drop_lg",
+        checkbox_label_shadow="*shadow_drop_lg",
+        button_shadow_active="*shadow_inset",
+        button_secondary_background_fill="*primary_300",
+        button_secondary_background_fill_dark="*primary_700",
+        button_secondary_background_fill_hover="*primary_200",
+        button_secondary_background_fill_hover_dark="*primary_500",
+        button_secondary_text_color="*primary_800",
+        button_secondary_text_color_dark="white",
     )

     with gr.Blocks(
         title="Chatbot Arena Leaderboard",
-        theme=gr.themes.Default(text_size=text_size),
+        # theme=gr.themes.Default(text_size=text_size),
+        theme=theme,
         css=block_css,
     ) as demo:
         with gr.Tabs() as tabs:
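`build_demo` now loads its Gradio theme from a `theme.json` file instead of constructing `gr.themes.Base` inline. The commit does not show where that file comes from; one plausible way to produce it is to dump a configured theme ahead of time (a sketch under that assumption; the chosen hue is a placeholder):

```python
import gradio as gr

# One-off helper (assumed to be run separately) to generate the theme.json that
# gr.themes.Default.load("theme.json") reads when the leaderboard starts.
theme = gr.themes.Default(primary_hue=gr.themes.colors.blue)
theme.dump(filename="theme.json")
```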
@@ -730,6 +797,7 @@ def build_demo(elo_results_file, leaderboard_table_file):
                     elo_results_file,
                     leaderboard_table_file,
                     show_plot=True,
+                    mirror=False,
                 )

             with gr.Tab("Basic Stats", id=1):