Commit 759dfbe

Update monitor & plots (#2506)
1 parent f5eee7d commit 759dfbe

6 files changed: +46 -18 lines

6 files changed

+46
-18
lines changed

docs/commands/leaderboard.md

Lines changed: 11 additions & 0 deletions

@@ -24,3 +24,14 @@ scp atlas:/data/lmzheng/FastChat/fastchat/serve/monitor/elo_results_20230905.pkl
 ```
 wget https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard/raw/main/leaderboard_table_20230905.csv
 ```
+
+### Update files on webserver
+```
+DATE=20231002
+
+rm -rf elo_results.pkl leaderboard_table.csv
+wget https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard/resolve/main/elo_results_$DATE.pkl
+wget https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard/resolve/main/leaderboard_table_$DATE.csv
+ln -s leaderboard_table_$DATE.csv leaderboard_table.csv
+ln -s elo_results_$DATE.pkl elo_results.pkl
+```
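
The added section is a manual shell recipe for refreshing the webserver copies of the Elo results and leaderboard table. As a rough automation sketch only (not part of this commit; the date and base URL are taken from the snippet above), the same steps could be done from Python:

```python
# Hypothetical Python equivalent of the documented shell steps (not in this commit).
import os
import urllib.request

DATE = "20231002"  # example date from the docs snippet
BASE = "https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard/resolve/main"

for dated_name, stable_link in [
    (f"elo_results_{DATE}.pkl", "elo_results.pkl"),
    (f"leaderboard_table_{DATE}.csv", "leaderboard_table.csv"),
]:
    # Download the dated artifact, then repoint the stable filename at it.
    urllib.request.urlretrieve(f"{BASE}/{dated_name}", dated_name)
    if os.path.lexists(stable_link):
        os.remove(stable_link)
    os.symlink(dated_name, stable_link)
```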

fastchat/serve/huggingface_api_worker.py

Lines changed: 10 additions & 6 deletions

@@ -1,18 +1,19 @@
 """
-A model worker to call huggingface api.
-JSON file format:
+A model worker that calls huggingface inference endpoint.
+
+Register models in a JSON file with the following format:
 {
     "falcon-180b-chat": {
         "model_path": "tiiuae/falcon-180B-chat",
         "api_base": "https://api-inference.huggingface.co/models",
         "token": "hf_xxx",
-        "context_length": 2048
+        "context_length": 2048,
         "model_names": "falcon-180b-chat",
-        "conv_template": null,
+        "conv_template": null
     }
 }
 
-Only "model_path", "api_base", and "token" are necessary, others are optional.
+"model_path", "api_base", "token", and "context_length" are necessary, while others are optional.
 """
 import argparse
 import asyncio
@@ -116,6 +117,9 @@ def __init__(
             f"Connecting with huggingface api {self.model_path} as {self.model_names} on worker {worker_id} ..."
         )
 
+        if not no_register:
+            self.init_heart_beat()
+
     def count_token(self, params):
         # No tokenizer here
         ret = {
@@ -312,7 +316,7 @@ def create_huggingface_api_worker():
         api_base_list.append(model_info[m]["api_base"])
         token_list.append(model_info[m]["token"])
 
-        context_length = model_info[m].get("context_length", 1024)
+        context_length = model_info[m]["context_length"]
         model_names = model_info[m].get("model_names", [m.split("/")[-1]])
         if isinstance(model_names, str):
             model_names = [model_names]
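
With this change, "context_length" joins "model_path", "api_base", and "token" as a required field: the worker now indexes `model_info[m]["context_length"]` directly instead of falling back to 1024, so a registration file that omits it fails with a `KeyError`. A minimal sketch of that stricter loading, assuming a hypothetical `load_registry` helper and file name (neither is part of the worker):

```python
# Sketch of the stricter registration loading implied by this commit:
# "context_length" is now mandatory for every registered model.
import json

REQUIRED_KEYS = ("model_path", "api_base", "token", "context_length")


def load_registry(path):  # hypothetical helper, not part of huggingface_api_worker.py
    with open(path) as f:
        model_info = json.load(f)
    for name, info in model_info.items():
        missing = [k for k in REQUIRED_KEYS if k not in info]
        if missing:
            raise KeyError(f"model {name!r} is missing required keys: {missing}")
    return model_info


# Example usage (assumed file name): load_registry("api_endpoints.json")
```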

fastchat/serve/monitor/clean_battle_data.py

Lines changed: 2 additions & 2 deletions

@@ -44,8 +44,8 @@
 
 def get_log_files(max_num_files=None):
     dates = []
-    for month in [4, 5, 6, 7, 8, 9]:
-        for day in range(1, 32):
+    for month in range(4, 12):
+        for day in range(1, 33):
             dates.append(f"2023-{month:02d}-{day:02d}")
 
     filenames = []

fastchat/serve/monitor/clean_chat_data.py

Lines changed: 2 additions & 2 deletions

@@ -28,8 +28,8 @@
 
 def get_log_files(max_num_files=None):
     dates = []
-    for month in [4, 5, 6, 7, 8, 9, 10]:
-        for day in range(1, 32):
+    for month in range(4, 12):
+        for day in range(1, 33):
             dates.append(f"2023-{month:02d}-{day:02d}")
 
     filenames = []
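
Both cleaners get the same widening of the log-file date scan: months now run April through November and days 1 through 32. Over-generating dates is deliberate and harmless, since a date with no matching log file is simply skipped. A small standalone sketch of the enumeration:

```python
# Sketch of the widened date enumeration shared by clean_battle_data.py and
# clean_chat_data.py: range(4, 12) covers April-November 2023, and range(1, 33)
# intentionally over-generates days (e.g. "2023-04-31"); non-existent dates
# just match no log files.
dates = []
for month in range(4, 12):
    for day in range(1, 33):
        dates.append(f"2023-{month:02d}-{day:02d}")

print(len(dates))           # 256 candidate dates
print(dates[0], dates[-1])  # 2023-04-01 2023-11-32
```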

fastchat/serve/monitor/elo_analysis.py

Lines changed: 19 additions & 6 deletions

@@ -58,7 +58,7 @@ def get_median_elo_from_bootstrap(bootstrap_df):
     return median
 
 
-def compute_pairwise_win_fraction(battles, model_order):
+def compute_pairwise_win_fraction(battles, model_order, limit_show_number=None):
     # Times each model wins as Model A
     a_win_ptbl = pd.pivot_table(
         battles[battles["winner"] == "model_a"],
@@ -92,6 +92,9 @@ def compute_pairwise_win_fraction(battles, model_order):
     prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
     model_order = list(prop_wins.keys())
 
+    if limit_show_number is not None:
+        model_order = model_order[:limit_show_number]
+
     # Arrange ordering according to proprition of wins
     row_beats_col = row_beats_col_freq.loc[model_order, model_order]
     return row_beats_col
@@ -166,8 +169,10 @@ def visualize_battle_count(battles, model_order):
     return fig
 
 
-def visualize_average_win_rate(battles):
-    row_beats_col_freq = compute_pairwise_win_fraction(battles, None)
+def visualize_average_win_rate(battles, limit_show_number):
+    row_beats_col_freq = compute_pairwise_win_fraction(
+        battles, None, limit_show_number=limit_show_number
+    )
     fig = px.bar(
         row_beats_col_freq.mean(axis=1).sort_values(ascending=False),
         text_auto=".2f",
@@ -180,7 +185,7 @@ def visualize_average_win_rate(battles):
     return fig
 
 
-def visualize_bootstrap_elo_rating(df):
+def visualize_bootstrap_elo_rating(df, limit_show_number):
    bars = (
         pd.DataFrame(
             dict(
@@ -192,6 +197,7 @@ def visualize_bootstrap_elo_rating(df):
         .reset_index(names="model")
         .sort_values("rating", ascending=False)
     )
+    bars = bars[:limit_show_number]
     bars["error_y"] = bars["upper"] - bars["rating"]
     bars["error_y_minus"] = bars["rating"] - bars["lower"]
     bars["rating_rounded"] = np.round(bars["rating"], 2)
@@ -225,12 +231,19 @@ def report_elo_analysis_results(battles_json):
     model_order = list(elo_rating_median.keys())
     model_order.sort(key=lambda k: -elo_rating_median[k])
 
+    limit_show_number = 25  # limit show number to make plots smaller
+    model_order = model_order[:limit_show_number]
+
     # Plots
     leaderboard_table = visualize_leaderboard_table(elo_rating_median)
     win_fraction_heatmap = visualize_pairwise_win_fraction(battles_no_ties, model_order)
     battle_count_heatmap = visualize_battle_count(battles_no_ties, model_order)
-    average_win_rate_bar = visualize_average_win_rate(battles_no_ties)
-    bootstrap_elo_rating = visualize_bootstrap_elo_rating(bootstrap_df)
+    average_win_rate_bar = visualize_average_win_rate(
+        battles_no_ties, limit_show_number
+    )
+    bootstrap_elo_rating = visualize_bootstrap_elo_rating(
+        bootstrap_df, limit_show_number
+    )
 
     last_updated_tstamp = battles["tstamp"].max()
     last_updated_datetime = datetime.datetime.fromtimestamp(
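
The new `limit_show_number` plumbing caps the pairwise heatmaps, the win-rate bar chart, and the bootstrap Elo plot at the top 25 models by median Elo, keeping the plots readable as more models enter the arena. A minimal sketch of the truncation step, using made-up ratings:

```python
# Minimal sketch of the new truncation step, with made-up ratings:
# sort models by median Elo (descending), then keep only the top N for plotting.
elo_rating_median = {"model-a": 1200.0, "model-b": 1150.0, "model-c": 1100.0}

model_order = list(elo_rating_median.keys())
model_order.sort(key=lambda k: -elo_rating_median[k])

limit_show_number = 2  # the commit uses 25 to keep the plots small
model_order = model_order[:limit_show_number]
print(model_order)  # ['model-a', 'model-b']
```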

fastchat/serve/monitor/monitor.py

Lines changed: 2 additions & 2 deletions

@@ -35,11 +35,11 @@ def make_leaderboard_md(elo_results):
 | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
 
 🏆 This leaderboard is based on the following three benchmarks.
-- [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) - a crowdsourced, randomized battle platform. We use 70K+ user votes to compute Elo ratings.
+- [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) - a crowdsourced, randomized battle platform. We use 90K+ user votes to compute Elo ratings.
 - [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
 - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.
 
-💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: Sept, 2023.
+💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: October, 2023.
 """
     return leaderboard_md