Skip to content

Commit ca73dbe

Browse files
fix pr comments
1 parent 7273783 commit ca73dbe

File tree

2 files changed

+87
-114
lines changed

2 files changed

+87
-114
lines changed
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import gradio as gr
2+
import pandas as pd
3+
import requests
4+
5+
from fastchat.serve.monitor.monitor import recompute_final_ranking
6+
7+
# Hosted endpoint that serves the Copilot Arena elo results as JSON.
copilot_arena_leaderboard_url = "https://leaderboard-server.fly.dev/elo"
10+
11+
def process_copilot_arena_leaderboard(leaderboard):
    """Prepare the raw Copilot Arena elo frame for display.

    Rounds the score and its bounds to ints, derives a "+X / -Y"
    confidence-interval string, attaches the upper-bound ranking and a
    plain score ranking, and sorts rows by the latter.

    Args:
        leaderboard: DataFrame with "score", "upper" and "lower" columns.
            Mutated in place in addition to being returned.

    Returns:
        The same DataFrame with display columns added, sorted by "Rank".
    """
    leaderboard["score"] = leaderboard["score"].round().astype(int)
    # NOTE(review): presumably recompute_final_ranking reads the bounds
    # under the rating_q975/rating_q025 names — confirm against monitor.py.
    leaderboard["rating_q975"] = leaderboard["upper"].round().astype(int)
    leaderboard["rating_q025"] = leaderboard["lower"].round().astype(int)

    leaderboard["upper_diff"] = leaderboard["rating_q975"] - leaderboard["score"]
    leaderboard["lower_diff"] = leaderboard["score"] - leaderboard["rating_q025"]

    leaderboard["confidence_interval"] = (
        "+"
        + leaderboard["upper_diff"].astype(str)
        + " / -"
        + leaderboard["lower_diff"].astype(str)
    )

    rankings_ub = recompute_final_ranking(leaderboard)
    leaderboard.insert(loc=0, column="Rank* (UB)", value=rankings_ub)
    # method="min" yields standard competition ranking on ties (1, 1, 3).
    # The default "average" method produces fractional ranks that astype(int)
    # truncates inconsistently (a three-way tie at the top became 2, 2, 2).
    leaderboard["Rank"] = (
        leaderboard["score"].rank(ascending=False, method="min").astype(int)
    )

    leaderboard = leaderboard.sort_values(by=["Rank"], ascending=[True])

    return leaderboard
33+
34+
35+
def build_copilot_arena_tab():
    """Build the gradio tab showing the Copilot Arena leaderboard.

    Fetches elo data from the hosted leaderboard server, renders it as a
    DataFrame plus explanatory markdown, and falls back to an error
    message when the fetch fails for any reason.
    """
    try:
        # A hung or unreachable server must not stall the whole leaderboard
        # page: bound the request and degrade gracefully on network errors.
        response = requests.get(copilot_arena_leaderboard_url, timeout=10)
    except requests.RequestException:
        gr.Markdown("Error with fetching Copilot Arena data. Check back in later.")
        return
    if response.status_code == 200:
        leaderboard = pd.DataFrame(response.json()["elo_data"])
        leaderboard = process_copilot_arena_leaderboard(leaderboard)
        leaderboard = leaderboard.rename(
            columns={
                "name": "Model",
                "confidence_interval": "Confidence Interval",
                "score": "Arena Score",
                "organization": "Organization",
                "votes": "Votes",
            }
        )

        column_order = [
            "Rank* (UB)",
            "Model",
            "Arena Score",
            "Confidence Interval",
            "Votes",
            "Organization",
        ]
        leaderboard = leaderboard[column_order]
        num_models = len(leaderboard)
        # Each battle records one vote for each of the two models shown.
        total_battles = int(leaderboard["Votes"].sum()) // 2
        md = f"""
[Copilot Arena](https://blog.lmarena.ai/blog/2024/copilot-arena/) is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs. This leaderboard contains the relative performance and ranking of {num_models} models over {total_battles} battles.
"""

        gr.Markdown(md, elem_id="leaderboard_markdown")
        gr.DataFrame(
            leaderboard,
            datatype=["str" for _ in leaderboard.columns],
            elem_id="arena_hard_leaderboard",
            height=600,
            wrap=True,
            interactive=False,
            column_widths=[70, 130, 60, 80, 50, 80],
        )

        gr.Markdown(
            """
***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model.
Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval). \n
**Confidence Interval**: represents the range of uncertainty around the Arena Score. It's displayed as +X / -Y, where X is the difference between the upper bound and the score, and Y is the difference between the score and the lower bound.
""",
            elem_id="leaderboard_markdown",
        )
    else:
        gr.Markdown("Error with fetching Copilot Arena data. Check back in later.")

fastchat/serve/monitor/monitor.py

Lines changed: 2 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
import pandas as pd
1818
import gradio as gr
1919
import numpy as np
20-
import requests
2120

2221
from fastchat.constants import SURVEY_LINK
2322
from fastchat.serve.monitor.basic_stats import report_basic_stats, get_log_files
@@ -857,119 +856,6 @@ def build_category_leaderboard_tab(
857856
)
858857

859858

860-
def compute_ub_ranking(arena_df):
    """Return {model: rank} where models overlapping the group leader's
    lower bound share the leader's rank.

    Models are walked in descending score order. A group starts at the
    highest unranked model; every following model whose upper bound
    reaches that model's lower bound joins the group and shares its rank.
    """
    ordered = arena_df.sort_values("score", ascending=False).index.tolist()
    total = len(ordered)

    rankings = {}
    rank = 1
    start = 0
    while start < total:
        # The first model of the group anchors the tie threshold.
        anchor_lower = arena_df.loc[ordered[start]]["lower"]

        # Extend the group while upper bounds still reach the anchor's lower.
        end = start + 1
        while end < total and arena_df.loc[ordered[end]]["upper"] >= anchor_lower:
            end += 1

        # Everyone in the group shares the current rank.
        for model in ordered[start:end]:
            rankings[model] = rank

        # Jump past the group; the next rank never trails the list position.
        start = end
        rank = max(rank + 1, start + 1)

    return rankings
893-
894-
895-
def process_copilot_arena_leaderboard(leaderboard):
    """Round scores and bounds, attach confidence-interval text and both
    rankings, then return the frame sorted by score rank."""
    # Integer-round the score and its interval bounds in place.
    for col in ("score", "upper", "lower"):
        leaderboard[col] = leaderboard[col].round().astype(int)

    leaderboard["upper_diff"] = leaderboard["upper"] - leaderboard["score"]
    leaderboard["lower_diff"] = leaderboard["score"] - leaderboard["lower"]

    # Human-readable "+X / -Y" interval string.
    plus_part = "+" + leaderboard["upper_diff"].astype(str)
    minus_part = " / -" + leaderboard["lower_diff"].astype(str)
    leaderboard["confidence_interval"] = plus_part + minus_part

    leaderboard.insert(
        loc=0, column="Rank* (UB)", value=compute_ub_ranking(leaderboard)
    )
    leaderboard["Rank"] = leaderboard["score"].rank(ascending=False).astype(int)

    return leaderboard.sort_values(by=["Rank"], ascending=[True])
917-
918-
919-
def build_copilot_arena_tab():
    """Render the Copilot Arena leaderboard tab from the hosted elo endpoint."""
    copilot_arena_leaderboard_url = "https://leaderboard-server.fly.dev/elo"
    resp = requests.get(copilot_arena_leaderboard_url)
    if resp.status_code != 200:
        # Fetch failed: show a notice instead of the table.
        gr.Markdown("Error with fetching Copilot Arena data. Check back in later.")
        return

    board = process_copilot_arena_leaderboard(pd.DataFrame(resp.json()["elo_data"]))
    board = board.rename(
        columns={
            "name": "Model",
            "confidence_interval": "Confidence Interval",
            "score": "Arena Score",
            "organization": "Organization",
            "votes": "Votes",
        }
    )

    # Fixed display order for the table columns.
    board = board[
        [
            "Rank* (UB)",
            "Model",
            "Arena Score",
            "Confidence Interval",
            "Votes",
            "Organization",
        ]
    ]
    model_count = len(board)
    battle_count = int(board["Votes"].sum()) // 2
    intro = f"""
[Copilot Arena](https://blog.lmarena.ai/blog/2024/copilot-arena/) is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs. This leaderboard contains the relative performance and ranking of {model_count} models over {battle_count} battles.
"""

    gr.Markdown(intro, elem_id="leaderboard_markdown")
    gr.DataFrame(
        board,
        datatype=["str"] * len(board.columns),
        elem_id="arena_hard_leaderboard",
        height=600,
        wrap=True,
        interactive=False,
        column_widths=[70, 130, 60, 80, 50, 80],
    )

    gr.Markdown(
        """
***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model.
Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval). \n
**Confidence Interval**: represents the range of uncertainty around the Arena Score. It's displayed as +X / -Y, where X is the difference between the upper bound and the score, and Y is the difference between the score and the lower bound.
""",
        elem_id="leaderboard_markdown",
    )
971-
972-
973859
selected_categories = [
974860
"full",
975861
"full_style_control",
@@ -1167,6 +1053,8 @@ def build_leaderboard_tab(
11671053
elo_results_text, model_table_df, model_to_score
11681054
)
11691055
with gr.Tab("Copilot Arena Leaderboard", id=5):
1056+
from fastchat.serve.monitor.copilot_arena import build_copilot_arena_tab
1057+
11701058
build_copilot_arena_tab()
11711059

11721060
if not show_plot:

0 commit comments

Comments
 (0)