Skip to content

Commit e27dd24

Browse files
authored
Merge branch 'lm-sys:main' into feature/jab-api-0.3
2 parents 2179cce + 8664268 commit e27dd24

File tree

2 files changed

+102
-0
lines changed

2 files changed

+102
-0
lines changed
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
import gradio as gr
2+
import pandas as pd
3+
import requests
4+
import os
5+
6+
from fastchat.serve.monitor.monitor import recompute_final_ranking
7+
8+
copilot_arena_leaderboard_url = os.getenv("COPILOT_ARENA_LEADERBOARD_URL")
9+
10+
11+
def process_copilot_arena_leaderboard(leaderboard):
    """Turn the raw Copilot Arena table into a display-ready leaderboard.

    The input frame is left untouched. The returned frame:

    * keeps only rows whose ``visibility`` equals ``"public"``;
    * has ``score`` rounded to an integer, with the rounded ``upper`` and
      ``lower`` values stored as ``rating_q975`` and ``rating_q025``;
    * gains ``upper_diff`` / ``lower_diff`` (distance from the score to each
      bound) and a ``confidence_interval`` string of the form ``+X / -Y``;
    * has a leading ``Rank* (UB)`` column holding whatever
      ``recompute_final_ranking`` returns for the processed frame;
    * is sorted by rank ascending, ties broken by score descending.
    """
    is_public = leaderboard["visibility"] == "public"
    table = leaderboard[is_public].copy()

    def _rounded_int(column):
        # Round first so the integer cast does not truncate toward zero.
        return table[column].round().astype(int)

    table["score"] = _rounded_int("score")
    table["rating_q975"] = _rounded_int("upper")
    table["rating_q025"] = _rounded_int("lower")

    table["upper_diff"] = table["rating_q975"] - table["score"]
    table["lower_diff"] = table["score"] - table["rating_q025"]

    above = table["upper_diff"].astype(str)
    below = table["lower_diff"].astype(str)
    table["confidence_interval"] = "+" + above + " / -" + below

    # The ranking is computed from the rounded columns before being inserted
    # as the first column.
    table.insert(
        loc=0,
        column="Rank* (UB)",
        value=recompute_final_ranking(table),
    )

    return table.sort_values(
        by=["Rank* (UB)", "score"],
        ascending=[True, False],
    )
35+
36+
37+
def build_copilot_arena_tab(timeout=10):
    """Render the Copilot Arena leaderboard into the active Gradio context.

    Fetches JSON from ``copilot_arena_leaderboard_url``, reads its
    ``elo_data`` entry into a DataFrame, runs it through
    ``process_copilot_arena_leaderboard`` and emits an intro paragraph, the
    leaderboard table and an explanatory footnote.

    If the request fails, times out, returns a non-200 status, or the
    payload is not JSON containing ``elo_data``, a short error message is
    rendered instead so the rest of the page can still be built.

    Args:
        timeout: Seconds to wait for the leaderboard endpoint. ``requests``
            waits indefinitely when no timeout is given, which would stall
            UI construction on an unresponsive server.
    """
    fetch_error_md = "Error with fetching Copilot Arena data. Check back in later."

    try:
        response = requests.get(copilot_arena_leaderboard_url, timeout=timeout)
    except requests.RequestException as err:
        # Covers connection failures, timeouts and malformed/missing URLs.
        print(f"Failed to fetch Copilot Arena leaderboard: {err}")
        gr.Markdown(fetch_error_md)
        return

    if response.status_code != 200:
        gr.Markdown(fetch_error_md)
        return

    try:
        elo_data = response.json()["elo_data"]
    except (ValueError, KeyError, TypeError) as err:
        # ValueError: body is not JSON; KeyError/TypeError: unexpected shape.
        print(f"Unexpected Copilot Arena leaderboard payload: {err!r}")
        gr.Markdown(fetch_error_md)
        return

    leaderboard = process_copilot_arena_leaderboard(pd.DataFrame(elo_data))
    leaderboard = leaderboard.rename(
        columns={
            "name": "Model",
            "confidence_interval": "Confidence Interval",
            "score": "Arena Score",
            "organization": "Organization",
            "votes": "Votes",
        }
    )

    column_order = [
        "Rank* (UB)",
        "Model",
        "Arena Score",
        "Confidence Interval",
        "Votes",
        "Organization",
    ]
    leaderboard = leaderboard[column_order]

    num_models = len(leaderboard)
    # The summed per-model vote count is halved to report battles.
    total_battles = int(leaderboard["Votes"].sum()) // 2
    md = f"""
[Copilot Arena](https://blog.lmarena.ai/blog/2024/copilot-arena/) is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs. This leaderboard contains the relative performance and ranking of {num_models} models over {total_battles} battles.
"""

    gr.Markdown(md, elem_id="leaderboard_markdown")
    gr.DataFrame(
        leaderboard,
        datatype=["str"] * len(leaderboard.columns),
        elem_id="arena_hard_leaderboard",
        height=600,
        wrap=True,
        interactive=False,
        column_widths=[70, 130, 60, 80, 50, 80],
    )

    gr.Markdown(
        """
***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model.
Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval). \n
**Confidence Interval**: represents the range of uncertainty around the Arena Score. It's displayed as +X / -Y, where X is the difference between the upper bound and the score, and Y is the difference between the score and the lower bound.
""",
        elem_id="leaderboard_markdown",
    )

fastchat/serve/monitor/monitor.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1035,6 +1035,21 @@ def build_leaderboard_tab(
10351035
elo_results_text, model_table_df, model_to_score
10361036
)
10371037

1038+
from fastchat.serve.monitor.copilot_arena import (
1039+
build_copilot_arena_tab,
1040+
copilot_arena_leaderboard_url,
1041+
)
1042+
1043+
if copilot_arena_leaderboard_url:
1044+
with gr.Tab("Copilot Arena Leaderboard", id=5):
1045+
build_copilot_arena_tab()
1046+
else:
1047+
print(
1048+
"Unable to build Copilot Arena's Leaderboard. "
1049+
"COPILOT_ARENA_LEADERBOARD_URL environment variable is not set. "
1050+
"Please configure it to a valid URL."
1051+
)
1052+
10381053
if not show_plot:
10391054
gr.Markdown(
10401055
""" ## Visit our [HF space](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) for more analysis!

0 commit comments

Comments
 (0)