
Commit 05b9305

Add Style Control to Chatbot Arena Leaderboard 🔥 (#3495)

Authored by CodingWithTim
Co-authored-by: CodingWithTim <tim@inst-builder-debian-12-build-build-4zqb5.us-central1-a.c.gce-image-builder.internal>

1 parent 282534b commit 05b9305

File tree

2 files changed: +219 additions, -4 deletions

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
import pandas as pd
import re
import argparse

from tqdm import tqdm

tqdm.pandas()


def count_markdown_elements(markdown_text, suffix):
    counters = {
        f"header_count{suffix}": {
            "h1": len(re.findall(r"^#{1}\s", markdown_text, re.MULTILINE)),
            "h2": len(re.findall(r"^#{2}\s", markdown_text, re.MULTILINE)),
            "h3": len(re.findall(r"^#{3}\s", markdown_text, re.MULTILINE)),
            "h4": len(re.findall(r"^#{4}\s", markdown_text, re.MULTILINE)),
            "h5": len(re.findall(r"^#{5}\s", markdown_text, re.MULTILINE)),
            "h6": len(re.findall(r"^#{6}\s", markdown_text, re.MULTILINE)),
        },
        f"list_count{suffix}": {
            "ordered": len(re.findall(r"^\s*\d+\.\s", markdown_text, re.MULTILINE)),
            "unordered": len(re.findall(r"^\s*[-*+]\s", markdown_text, re.MULTILINE)),
        },
        f"bold_count{suffix}": {
            "**": len(re.findall(r"\*\*[^*\n]+\*\*", markdown_text)),
            "__": len(re.findall(r"__[^_\n]+__", markdown_text)),
        },
    }
    return counters


def remove_pattern(answer, pattern):
    blocks = pattern.findall(answer)
    for block in blocks:
        answer = answer.replace(block, "")
    return answer


def get_element_counts(df, column):
    pattern = re.compile("```([^`]*)```")
    answers = df[column].map(
        lambda convo: "\n".join(
            [turn["content"] for turn in convo if turn["role"] == "assistant"]
        )
    )
    results = answers.progress_map(
        lambda answer: count_markdown_elements(
            remove_pattern(answer, pattern),  # strip code blocks before counting
            suffix=column[-2:],  # "_a" or "_b"
        )
    )

    return results.tolist()


def add_markdown_meta(row):
    conv_meta = {k: v for k, v in row["conv_metadata"].items()}
    return conv_meta | row["markdown_meta_a"] | row["markdown_meta_b"]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", type=str, required=True)
    parser.add_argument("--output-file", type=str, required=True)
    args = parser.parse_args()

    print("loading file...")
    data = pd.read_json(args.input_file)

    assert "conv_metadata" in data.columns

    temp = data[["question_id", "conv_metadata"]].copy()

    print("Processing conversation_a")
    temp["markdown_meta_a"] = get_element_counts(data, column="conversation_a")

    print("Processing conversation_b")
    temp["markdown_meta_b"] = get_element_counts(data, column="conversation_b")

    print("Post-processing...")
    data["conv_metadata"] = temp.apply(add_markdown_meta, axis=1)

    print("Saving to file...")
    data.to_json(args.output_file, orient="records", indent=4, force_ascii=False)
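
For a quick sanity check, count_markdown_elements can be exercised on a made-up snippet (illustrative only; the sample text below is not part of the commit, and this capture does not show the new file's path):

    sample = "# Title\n\n- point one\n- point two\n\n**bold** and __also bold__\n"
    print(count_markdown_elements(sample, suffix="_a"))
    # {'header_count_a': {'h1': 1, 'h2': 0, 'h3': 0, 'h4': 0, 'h5': 0, 'h6': 0},
    #  'list_count_a': {'ordered': 0, 'unordered': 2},
    #  'bold_count_a': {'**': 1, '__': 1}}

The suffix "_a" mirrors what get_element_counts derives from column="conversation_a".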

fastchat/serve/monitor/elo_analysis.py

Lines changed: 135 additions & 4 deletions
@@ -21,6 +21,18 @@
 pd.options.display.float_format = "{:.2f}".format
 
 
+STYLE_CONTROL_ELEMENTS_V1 = [
+    "sum_assistant_a_tokens",
+    "header_count_a",
+    "list_count_a",
+    "bold_count_a",
+    "sum_assistant_b_tokens",
+    "header_count_b",
+    "list_count_b",
+    "bold_count_b",
+]
+
+
 def compute_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000):
     rating = defaultdict(lambda: INIT_RATING)
 
@@ -399,6 +411,109 @@ def outlier_detect(
     return battles
 
 
+def fit_mle_elo(X, Y, models, indices=None, SCALE=400, INIT_RATING=1000):
+    from sklearn.linear_model import LogisticRegression
+
+    p = len(models.index)
+
+    lr = LogisticRegression(fit_intercept=False)
+    if indices:
+        lr.fit(X[indices], Y[indices])
+    else:
+        lr.fit(X, Y)
+
+    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
+    # calibrate mixtral-8x7b-instruct-v0.1 to 1114 if applicable
+    if "mixtral-8x7b-instruct-v0.1" in models.index:
+        elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
+    return (
+        pd.Series(elo_scores[:p], index=models.index).sort_values(ascending=False),
+        lr.coef_[0][p:],
+    )
+
+
+def construct_style_matrices(
+    df,
+    BASE=10,
+    apply_ratio=[1, 1, 1, 1],
+    style_elements=STYLE_CONTROL_ELEMENTS_V1,
+    add_one=True,
+):
+    models = pd.concat([df["model_a"], df["model_b"]]).unique()
+    models = pd.Series(np.arange(len(models)), index=models)
+
+    # duplicate battles
+    df = pd.concat([df, df], ignore_index=True)
+    p = len(models.index)
+    n = df.shape[0]
+    assert len(style_elements) % 2 == 0
+    k = int(len(style_elements) / 2)
+
+    X = np.zeros([n, p + k])
+    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
+    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)
+
+    # turn each of the specified "conv_metadata" entries into a vector;
+    # dict-valued counters (headers, lists, bold) are summed into one count
+    style_vector = np.array(
+        [
+            df.conv_metadata.map(
+                lambda x: x[element]
+                if type(x[element]) is int
+                else sum(x[element].values())
+            ).tolist()
+            for element in style_elements
+        ]
+    )
+
+    style_diff = (style_vector[:k] - style_vector[k:]).astype(float)
+    style_sum = (style_vector[:k] + style_vector[k:]).astype(float)
+
+    if add_one:
+        style_sum = style_sum + np.ones(style_diff.shape)
+
+    apply_ratio = np.flatnonzero(apply_ratio)
+
+    # normalize to a ratio where necessary (length, etc.)
+    style_diff[apply_ratio] /= style_sum[apply_ratio]
+
+    style_mean = np.mean(style_diff, axis=1)
+    style_std = np.std(style_diff, axis=1)
+
+    X[:, -k:] = ((style_diff - style_mean[:, np.newaxis]) / style_std[:, np.newaxis]).T
+
+    # one A win => two A wins (the battles were duplicated above)
+    Y = np.zeros(n)
+    Y[df["winner"] == "model_a"] = 1.0
+
+    # one tie => one A win + one B win
+    # find tie + tie (both bad) index
+    tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
+    tie_idx[len(tie_idx) // 2 :] = False
+    Y[tie_idx] = 1.0
+
+    return X, Y, models
+
+
+def get_bootstrap_result_style_control(X, Y, models, func_compute_elo, num_round=1000):
+    elos = []
+    coefs = []
+    # X holds each battle twice (see construct_style_matrices), so resample
+    # battle indices and keep both copies of each sampled battle together
+    n_battles = X.shape[0] // 2
+    for _ in tqdm(range(num_round), desc="bootstrap"):
+        indices = np.random.choice(n_battles, size=n_battles, replace=True)
+        indices = np.concatenate([indices, indices + n_battles])
+        _X = X[indices]
+        _Y = Y[indices]
+        # drop models that never appear in this resample
+        states = ~_X[:, : len(models)].any(axis=0)
+
+        elo, coef = func_compute_elo(_X, _Y, models=models[~states])
+        elos.append(elo)
+        coefs.append(coef)
+
+    df = pd.DataFrame(elos)
+    return df[df.median().sort_values(ascending=False).index], coefs
+
+
 def filter_long_conv(row):
     threshold = 768
     for conversation_type in ["conversation_a", "conversation_b"]:
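
To make the new pieces concrete: construct_style_matrices sets up a Bradley-Terry-style logistic regression in which each battle contributes +log(BASE) / -log(BASE) indicator entries for the two models plus k standardized style-difference features, and fit_mle_elo maps the fitted coefficients back to ratings via SCALE * coef + INIT_RATING; roughly, P(A wins) = sigmoid(log(BASE) * (b_A - b_B) + sum_s g_s * z_s). Below is a minimal sketch, not part of the commit, of how the two helpers compose; the model names and counters are synthetic:

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)

    def fake_side(suffix, rng):
        # synthetic counters shaped like those the markdown-counting script produces
        return {
            f"sum_assistant_{suffix}_tokens": int(rng.integers(50, 500)),
            f"header_count_{suffix}": {h: int(rng.integers(0, 3)) for h in ["h1", "h2", "h3", "h4", "h5", "h6"]},
            f"list_count_{suffix}": {"ordered": int(rng.integers(0, 5)), "unordered": int(rng.integers(0, 5))},
            f"bold_count_{suffix}": {"**": int(rng.integers(0, 5)), "__": int(rng.integers(0, 2))},
        }

    n = 200
    toy_battles = pd.DataFrame(
        {
            "model_a": rng.choice(["model-x", "model-y"], size=n),
            "model_b": rng.choice(["model-z", "model-w"], size=n),
            "winner": rng.choice(["model_a", "model_b", "tie"], size=n),
            "conv_metadata": [fake_side("a", rng) | fake_side("b", rng) for _ in range(n)],
        }
    )

    X, Y, models = construct_style_matrices(toy_battles)
    ratings, style_coef = fit_mle_elo(X, Y, models)
    print(ratings)     # style-controlled Elo per model
    print(style_coef)  # one coefficient per style feature (tokens, headers, lists, bold)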
@@ -421,6 +536,7 @@ def report_elo_analysis_results(
     run_outlier_detect=False,
     scale=1,
     filter_func=lambda x: True,
+    style_control=False,
 ):
     battles = pd.DataFrame(battles_json)
 
@@ -461,10 +577,17 @@
     elo_rating_online = compute_elo(battles)
 
     if rating_system == "bt":
-        bootstrap_df = get_bootstrap_result(
-            battles, compute_elo_mle_with_tie, num_round=num_bootstrap
-        )
-        elo_rating_final = compute_elo_mle_with_tie(battles)
+        if style_control:
+            X, Y, models = construct_style_matrices(battles)
+            bootstrap_df, bootstrap_coef = get_bootstrap_result_style_control(
+                X, Y, models, fit_mle_elo, num_round=num_bootstrap
+            )
+            elo_rating_final, coef_final = fit_mle_elo(X, Y, models)
+        else:
+            bootstrap_df = get_bootstrap_result(
+                battles, compute_elo_mle_with_tie, num_round=num_bootstrap
+            )
+            elo_rating_final = compute_elo_mle_with_tie(battles)
     elif rating_system == "elo":
         bootstrap_df = get_bootstrap_result(
             battles, compute_elo, num_round=num_bootstrap
@@ -538,6 +661,12 @@ def report_elo_analysis_results(
         "last_updated_tstamp": last_updated_tstamp,
         "bootstrap_df": bootstrap_df,
         "leaderboard_table_df": leaderboard_table_df,
+        "style_coefficients": {
+            "bootstrap": np.vstack(bootstrap_coef),
+            "final": coef_final,
+        }
+        if rating_system == "bt" and style_control
+        else {},
     }
 
 
@@ -565,6 +694,7 @@ def pretty_print_elo_rating(rating):
     parser.add_argument("--run-outlier-detect", action="store_true", default=False)
     parser.add_argument("--category", nargs="+", default=["full"])
    parser.add_argument("--scale", type=float, default=1)
+    parser.add_argument("--style-control", action="store_true")
     args = parser.parse_args()
 
     np.random.seed(42)
@@ -602,6 +732,7 @@ def pretty_print_elo_rating(rating):
         run_outlier_detect=args.run_outlier_detect,
         scale=args.scale,
         filter_func=filter_func,
+        style_control=args.style_control,
     )
 
     for cat in args.category:
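
With this wiring in place, style-controlled ratings are produced by running the analysis with the new flag, e.g.

    python3 -m fastchat.serve.monitor.elo_analysis --style-control

plus the script's existing input arguments (the battle-file flag is defined outside these hunks, so its exact name is not shown here and is left unspecified). When rating_system is "bt" and --style-control is set, the "bt" branch switches to construct_style_matrices + fit_mle_elo, and the fitted style coefficients are attached to the results dict under "style_coefficients".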
