
Commit 2179cce

Merge branch 'lm-sys:main' into feature/jab-api-0.3
2 parents: 0af9f11 + 1cd4b74

5 files changed: +71 −77 lines changed


fastchat/constants.py

Lines changed: 2 additions & 2 deletions
@@ -9,8 +9,8 @@
 
 # Survey Link URL (to be removed) #00729c
 SURVEY_LINK = """<div style='text-align: left; margin: 20px 0;'>
-    <div style='display: inline-block; border: 2px solid #C41E3A; padding: 20px; padding-bottom: 10px; padding-top: 10px; border-radius: 5px;'>
-        <span style='color: #C41E3A; font-weight: bold;'>New Launch! Jailbreak models at <a href='https://redarena.ai' style='color: #C41E3A; text-decoration: underline;'>RedTeam Arena</a>. </span>
+    <div style='display: inline-block; border: 2px solid #00729c; padding: 20px; padding-bottom: 10px; padding-top: 10px; border-radius: 5px;'>
+        <span style='color: #00729c; font-weight: bold;'>New Launch! Copilot Arena: <a href='https://marketplace.visualstudio.com/items?itemName=copilot-arena.copilot-arena' style='color: #00729c; text-decoration: underline;'>VS Code Extension</a> to compare Top LLMs</span>
     </div>
 </div>"""
 # SURVEY_LINK = ""

fastchat/serve/monitor/elo_analysis.py

Lines changed: 0 additions & 1 deletion
@@ -12,7 +12,6 @@
 import pandas as pd
 import plotly.express as px
 from tqdm import tqdm
-from transformers import AutoTokenizer
 
 from fastchat.model.model_registry import get_model_info
 from fastchat.serve.monitor.basic_stats import get_log_files

fastchat/serve/monitor/monitor.py

Lines changed: 14 additions & 32 deletions
@@ -54,19 +54,16 @@
 
 
 def recompute_final_ranking(arena_df):
-    # compute ranking based on CI
-    ranking = {}
-    for i, model_a in enumerate(arena_df.index):
-        ranking[model_a] = 1
-        for j, model_b in enumerate(arena_df.index):
-            if i == j:
-                continue
-            if (
-                arena_df.loc[model_b]["rating_q025"]
-                > arena_df.loc[model_a]["rating_q975"]
-            ):
-                ranking[model_a] += 1
-    return list(ranking.values())
+    q025 = arena_df["rating_q025"].values
+    q975 = arena_df["rating_q975"].values
+
+    sorted_q025 = np.sort(q025)
+    insertion_indices = np.searchsorted(sorted_q025, q975, side="right")
+    counts = len(sorted_q025) - insertion_indices
+
+    rankings = 1 + counts
+    ranking_series = pd.Series(rankings, index=arena_df.index)
+    return ranking_series.tolist()
 
 
 def arena_hard_title(date):
@@ -81,22 +78,6 @@ def arena_hard_title(date):
     return arena_hard_title
 
 
-def recompute_final_ranking(arena_df):
-    # compute ranking based on CI
-    ranking = {}
-    for i, model_a in enumerate(arena_df.index):
-        ranking[model_a] = 1
-        for j, model_b in enumerate(arena_df.index):
-            if i == j:
-                continue
-            if (
-                arena_df.loc[model_b]["rating_q025"]
-                > arena_df.loc[model_a]["rating_q975"]
-            ):
-                ranking[model_a] += 1
-    return list(ranking.values())
-
-
 def update_elo_components(
     max_num_files, elo_results_file, ban_ip_file, exclude_model_names
 ):
@@ -861,14 +842,15 @@ def build_category_leaderboard_tab(
         "full_style_control",
         "hard_6",
         "hard_6_style_control",
-        "if",
         "coding",
         "math",
-        "multiturn",
+        "creative_writing",
+        "if",
         "long_user",
+        "multiturn",
         # "no_refusal",
     ]
-    selected_categories_width = [110, 110, 110, 110, 110, 80, 80, 80, 80]
+    selected_categories_width = [110, 110, 110, 110, 80, 80, 80, 110, 80, 80]
 
     language_categories = [
         "english",

fastchat/serve/monitor/monitor_md.py

Lines changed: 8 additions & 5 deletions
@@ -9,6 +9,7 @@
     "gemini-1.5-pro-api-0409-preview",
     "bard-jan-24-gemini-pro",
     "chatgpt-4o-latest-20240808",
+    "chatgpt-4o-latest-20240903",
 ]
 
 key_to_category_name = {
@@ -18,11 +19,12 @@
     "math": "Math",
     "if": "Instruction Following",
     "multiturn": "Multi-Turn",
+    "creative_writing": "Creative Writing",
     "coding": "Coding",
     "coding_style_control": "Coding w/ Style Control",
-    "hard_6": "Hard Prompts (Overall)",
+    "hard_6": "Hard Prompts",
     "hard_english_6": "Hard Prompts (English)",
-    "hard_6_style_control": "Hard Prompts (Overall) w/ Style Control",
+    "hard_6_style_control": "Hard Prompts w/ Style Control",
     "long_user": "Longer Query",
     "english": "English",
     "chinese": "Chinese",
@@ -47,8 +49,8 @@
     "Multi-Turn": "Multi-Turn Conversation (>= 2 turns)",
     "Coding": "Coding: whether conversation contains code snippets",
     "Coding w/ Style Control": "Coding with Style Control",
-    "Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
-    "Hard Prompts (Overall) w/ Style Control": "Hard Prompts with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).",
+    "Hard Prompts": "Hard Prompts: details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
+    "Hard Prompts w/ Style Control": "Hard Prompts with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).",
     "Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
     "Longer Query": "Longer Query (>= 500 tokens)",
     "English": "English Prompts",
@@ -64,6 +66,7 @@
     "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
     "overall_limit_5_user_vote": "overall_limit_5_user_vote",
     "Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
+    "Creative Writing": "Creative Writing",
 }
 cat_name_to_baseline = {
     "Hard Prompts (English)": "English",
@@ -81,7 +84,7 @@ def make_default_md_1(mirror=False):
     link_color = "#1976D2"  # This color should be clear in both light and dark mode
     leaderboard_md = f"""
 # 🏆 Chatbot Arena LLM Leaderboard: Community-driven Evaluation for Best LLM and AI chatbots
-[Blog](https://blog.lmarena.ai/blog/2023/arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/6GXcFg3TH8) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena)
+[Twitter](https://twitter.com/lmarena_ai) | [Discord](https://discord.gg/6GXcFg3TH8) | [Blog](https://blog.lmarena.ai/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Kaggle Competition](https://www.kaggle.com/competitions/wsdm-cup-multilingual-chatbot-arena)
 """
 
     return leaderboard_md
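
Note that the "Hard Prompts (Overall)" to "Hard Prompts" rename has to land in two places at once: the value in key_to_category_name and the key of the explanation dict edited in the third hunk, because the display name produced by the first lookup keys the second. A sketch of the chain (assuming the explanation mapping is named cat_name_to_explanation, as elsewhere in this file):

# Display-name lookup chain after the rename; both dicts must agree.
name = key_to_category_name["hard_6"]   # -> "Hard Prompts"
blurb = cat_name_to_explanation[name]   # -> "Hard Prompts: details in [blog post](...)"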

tests/load_test.py

Lines changed: 47 additions & 37 deletions
@@ -1,18 +1,15 @@
 import argparse
-import time, asyncio
-from openai import AsyncOpenAI, AsyncAzureOpenAI
+import time
+import threading
+from concurrent.futures import ThreadPoolExecutor
 import uuid
 import traceback
 import numpy as np
 from transformers import AutoTokenizer
+from litellm import completion
 
-# base_url - litellm proxy endpoint
-# api_key - litellm proxy api-key, is created proxy with auth
-litellm_client = None
 
-
-async def litellm_completion(args, tokenizer, image_url=None):
-    # Your existing code for litellm_completion goes here
+def litellm_completion(args, tokenizer, image_url=None):
     try:
         if image_url:
             messages = [
@@ -30,16 +27,24 @@ async def litellm_completion(args, tokenizer, image_url=None):
             ]
 
         start = time.time()
-        response = await litellm_client.chat.completions.create(
+
+        additional_api_kwargs = {}
+        if args.api_key:
+            additional_api_kwargs["api_key"] = args.api_key
+        if args.api_base:
+            additional_api_kwargs["api_base"] = args.api_base
+
+        response = completion(
             model=args.model,
             messages=messages,
             stream=True,
+            **additional_api_kwargs,
         )
         ttft = None
 
         itl_list = []
         content = ""
-        async for chunk in response:
+        for chunk in response:
             if chunk.choices[0].delta.content:
                 end_time = time.time()
                 if ttft is None:
@@ -52,43 +57,48 @@ async def litellm_completion(args, tokenizer, image_url=None):
         return content, ttft, itl_list
 
     except Exception as e:
-        # If there's an exception, log the error message
         print(e)
         with open("error_log.txt", "a") as error_log:
             error_log.write(f"Error during completion: {str(e)}\n")
         return str(e)
 
 
-async def main(args):
+def main(args):
     n = args.num_total_responses
     batch_size = args.req_per_sec  # Requests per second
     start = time.time()
 
-    all_tasks = []
+    all_results = []
     tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    for i in range(0, n, batch_size):
-        batch = range(i, min(i + batch_size, n))
-        for _ in batch:
-            if args.include_image:
-                # Generate a random dimension for the image
-                if args.randomize_image_dimensions:
-                    y_dimension = np.random.randint(100, 1025)
+
+    with ThreadPoolExecutor(max_workers=batch_size) as executor:
+        for i in range(0, n, batch_size):
+            batch_futures = []
+            batch = range(i, min(i + batch_size, n))
+
+            for _ in batch:
+                if args.include_image:
+                    if args.randomize_image_dimensions:
+                        y_dimension = np.random.randint(100, 1025)
+                    else:
+                        y_dimension = 512
+                    image_url = f"https://placehold.co/1024x{y_dimension}/png"
+                    future = executor.submit(
+                        litellm_completion, args, tokenizer, image_url
+                    )
                 else:
-                    y_dimension = 512
-                image_url = f"https://placehold.co/1024x{y_dimension}/png"
-                task = asyncio.create_task(
-                    litellm_completion(args, tokenizer, image_url)
-                )
-            else:
-                task = asyncio.create_task(litellm_completion(args, tokenizer))
-            all_tasks.append(task)
-        if i + batch_size < n:
-            await asyncio.sleep(1)  # Wait 1 second before the next batch
-
-    all_completions = await asyncio.gather(*all_tasks)
+                    future = executor.submit(litellm_completion, args, tokenizer)
+                batch_futures.append(future)
+
+            # Wait for batch to complete
+            for future in batch_futures:
+                all_results.append(future.result())
+
+            if i + batch_size < n:
+                time.sleep(1)  # Wait 1 second before next batch
 
     successful_completions = [
-        c for c in all_completions if isinstance(c, tuple) and len(c) == 3
+        c for c in all_results if isinstance(c, tuple) and len(c) == 3
     ]
     ttft_list = np.array([float(c[1]) for c in successful_completions])
    itl_list_flattened = np.array(
@@ -101,7 +111,7 @@ async def main(args):
 
     # Write errors to error_log.txt
     with open("load_test_errors.log", "a") as error_log:
-        for completion in all_completions:
+        for completion in all_results:
             if isinstance(completion, str):
                 error_log.write(completion + "\n")
 
@@ -115,15 +125,15 @@
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", type=str, default="azure-gpt-3.5")
-    parser.add_argument("--server-address", type=str, default="http://0.0.0.0:9094")
+    parser.add_argument("--api-base", type=str, default=None)
+    parser.add_argument("--api-key", type=str, default=None)
     parser.add_argument("--num-total-responses", type=int, default=50)
     parser.add_argument("--req-per-sec", type=int, default=5)
     parser.add_argument("--include-image", action="store_true")
     parser.add_argument("--randomize-image-dimensions", action="store_true")
     args = parser.parse_args()
 
-    litellm_client = AsyncOpenAI(base_url=args.server_address, api_key="sk-1234")
     # Blank out contents of error_log.txt
     open("load_test_errors.log", "w").close()
 
-    asyncio.run(main(args))
+    main(args)
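
The load-test rewrite drops asyncio entirely: litellm's synchronous completion call blocks, so main now fans requests out to a ThreadPoolExecutor, submitting up to req_per_sec calls per batch, draining that batch's futures, then sleeping one second before the next batch. The same pattern in isolation, with a dummy task standing in for litellm_completion (all names in this sketch are placeholders, not part of the change):

import time
from concurrent.futures import ThreadPoolExecutor

def fake_request(i):
    # Stand-in for a blocking litellm_completion call.
    time.sleep(0.1)
    return i

def run(n=10, batch_size=5):
    results = []
    with ThreadPoolExecutor(max_workers=batch_size) as executor:
        for start in range(0, n, batch_size):
            # Submit one batch of at most batch_size requests.
            futures = [
                executor.submit(fake_request, j)
                for j in range(start, min(start + batch_size, n))
            ]
            # Block until the whole batch has returned.
            results.extend(f.result() for f in futures)
            if start + batch_size < n:
                time.sleep(1)  # throttle to roughly batch_size requests/sec
    return results

print(run())  # [0, 1, ..., 9]

Because each batch is drained before the sleep, the effective rate can fall below req-per-sec when completions are slow. An invocation equivalent to the old hard-coded client would be: python tests/load_test.py --api-base http://0.0.0.0:9094 --api-key sk-1234 (both values taken from the removed defaults).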
