Skip to content

Commit 7a912c7

Browse files
Add Copilot Arena leaderboard (#3618)
Co-authored-by: Wayne Chi <[email protected]>
1 parent 1cd4b74 commit 7a912c7

File tree

2 files changed

+105
-0
lines changed

2 files changed

+105
-0
lines changed
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
import gradio as gr
2+
import pandas as pd
3+
import requests
4+
import os
5+
6+
from fastchat.serve.monitor.monitor import recompute_final_ranking
7+
8+
# URL of the endpoint that serves the Copilot Arena leaderboard data. It is
# read once at import time; importing this module fails fast with a
# ValueError when the variable is unset or empty.
copilot_arena_leaderboard_url = os.environ.get("COPILOT_ARENA_LEADERBOARD_URL")

if copilot_arena_leaderboard_url is None or copilot_arena_leaderboard_url == "":
    _missing_url_message = (
        "COPILOT_ARENA_LEADERBOARD_URL environment variable is not set. "
        "Please configure it to a valid URL."
    )
    raise ValueError(_missing_url_message)
15+
16+
17+
def process_copilot_arena_leaderboard(leaderboard):
    """Turn the raw Copilot Arena table into a display-ready leaderboard.

    Steps, in order:
      1. keep only rows whose ``visibility`` column equals ``"public"``;
      2. round ``score``, ``upper`` and ``lower`` to integers (stored as
         ``score``, ``rating_q975`` and ``rating_q025`` respectively);
      3. derive ``upper_diff`` / ``lower_diff`` and the textual
         ``confidence_interval`` column in the form ``+X / -Y``;
      4. prepend the ``Rank* (UB)`` column returned by
         ``recompute_final_ranking``;
      5. sort by rank ascending, breaking ties by score descending.

    Args:
        leaderboard: DataFrame with at least the columns ``visibility``,
            ``score``, ``upper`` and ``lower``. It is not modified.

    Returns:
        A new, sorted DataFrame containing the original columns plus the
        derived ones described above.
    """
    is_public = leaderboard["visibility"] == "public"
    table = leaderboard.loc[is_public].copy()

    # (destination column, source column) pairs, rounded to whole numbers.
    rounded_columns = (
        ("score", "score"),
        ("rating_q975", "upper"),
        ("rating_q025", "lower"),
    )
    for destination, source in rounded_columns:
        table[destination] = table[source].round().astype(int)

    table["upper_diff"] = table["rating_q975"] - table["score"]
    table["lower_diff"] = table["score"] - table["rating_q025"]

    plus_part = "+" + table["upper_diff"].astype(str)
    minus_part = " / -" + table["lower_diff"].astype(str)
    table["confidence_interval"] = plus_part + minus_part

    # The ranking is computed on the frame that already carries the rounded
    # rating_q025 / rating_q975 columns, then placed as the first column.
    table.insert(loc=0, column="Rank* (UB)", value=recompute_final_ranking(table))

    return table.sort_values(by=["Rank* (UB)", "score"], ascending=[True, False])
41+
42+
43+
def build_copilot_arena_tab():
    """Render the Copilot Arena leaderboard into the current gradio context.

    Fetches the leaderboard JSON from ``copilot_arena_leaderboard_url``,
    processes its ``elo_data`` entry with
    ``process_copilot_arena_leaderboard`` and emits an introductory Markdown
    paragraph, the leaderboard table and an explanatory footnote.

    If the URL is not configured the function prints a notice and returns
    without rendering anything. If the request fails, times out, returns a
    non-200 status, or the payload cannot be decoded into a table, a short
    error message is rendered instead of the leaderboard so the tab is never
    left empty.

    Returns:
        None. The function only creates gradio components.
    """
    if not copilot_arena_leaderboard_url:
        print("Copilot Arena Leaderboard URL is not set. Skipping this leaderboard.")
        return

    fetch_error_md = "Error with fetching Copilot Arena data. Check back in later."
    # Without a timeout requests waits indefinitely, which would block the
    # whole UI from being built if the endpoint is unresponsive.
    request_timeout_seconds = 10

    try:
        response = requests.get(
            copilot_arena_leaderboard_url, timeout=request_timeout_seconds
        )
    except requests.RequestException as e:
        print(f"Failed to fetch Copilot Arena leaderboard: {e}")
        gr.Markdown(fetch_error_md)
        return

    if response.status_code != 200:
        print(
            "Failed to fetch Copilot Arena leaderboard: "
            f"HTTP status {response.status_code}"
        )
        gr.Markdown(fetch_error_md)
        return

    try:
        leaderboard = pd.DataFrame(response.json()["elo_data"])
    except (KeyError, TypeError, ValueError) as e:
        # Invalid JSON, a payload that is not a mapping, or a missing
        # "elo_data" entry: treat it like any other fetch failure.
        print(f"Unexpected Copilot Arena leaderboard payload: {e}")
        gr.Markdown(fetch_error_md)
        return

    leaderboard = process_copilot_arena_leaderboard(leaderboard)
    leaderboard = leaderboard.rename(
        columns={
            "name": "Model",
            "confidence_interval": "Confidence Interval",
            "score": "Arena Score",
            "organization": "Organization",
            "votes": "Votes",
        }
    )

    column_order = [
        "Rank* (UB)",
        "Model",
        "Arena Score",
        "Confidence Interval",
        "Votes",
        "Organization",
    ]
    leaderboard = leaderboard[column_order]
    num_models = len(leaderboard)
    # The summed per-model votes are halved to obtain the number of battles.
    total_battles = int(leaderboard["Votes"].sum()) // 2
    # The text is kept flush-left: any leading indentation inside the literal
    # would become part of the string.
    md = f"""
[Copilot Arena](https://blog.lmarena.ai/blog/2024/copilot-arena/) is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs. This leaderboard contains the relative performance and ranking of {num_models} models over {total_battles} battles.
"""

    gr.Markdown(md, elem_id="leaderboard_markdown")
    gr.DataFrame(
        leaderboard,
        datatype=["str"] * len(leaderboard.columns),
        elem_id="arena_hard_leaderboard",
        height=600,
        wrap=True,
        interactive=False,
        column_widths=[70, 130, 60, 80, 50, 80],
    )

    gr.Markdown(
        """
***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model.
Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval). \n
**Confidence Interval**: represents the range of uncertainty around the Arena Score. It's displayed as +X / -Y, where X is the difference between the upper bound and the score, and Y is the difference between the score and the lower bound.
""",
        elem_id="leaderboard_markdown",
    )

fastchat/serve/monitor/monitor.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1034,6 +1034,15 @@ def build_leaderboard_tab(
10341034
build_full_leaderboard_tab(
10351035
elo_results_text, model_table_df, model_to_score
10361036
)
1037+
try:
1038+
with gr.Tab("Copilot Arena Leaderboard", id=5):
1039+
from fastchat.serve.monitor.copilot_arena import (
1040+
build_copilot_arena_tab,
1041+
)
1042+
1043+
build_copilot_arena_tab()
1044+
except Exception as e:
1045+
print(f"Unable to build Copilot Arena's Leaderboard. Error: {e}")
10371046

10381047
if not show_plot:
10391048
gr.Markdown(

0 commit comments

Comments
 (0)