Commit e208d56

Accelerate Bradley Terry MLE model fitting (#3523)

1 parent 3773213 commit e208d56

2 files changed: +404 -242 lines changed


fastchat/serve/monitor/elo_analysis.py

Lines changed: 19 additions & 242 deletions
@@ -17,120 +17,18 @@
 from fastchat.model.model_registry import get_model_info
 from fastchat.serve.monitor.basic_stats import get_log_files
 from fastchat.serve.monitor.clean_battle_data import clean_battle_data
+from fastchat.serve.monitor.rating_systems import (
+    compute_elo,
+    compute_bt,
+    compute_style_control,
+    compute_bootstrap_elo,
+    compute_bootstrap_bt,
+    compute_bootstrap_style_control,
+)
 
 pd.options.display.float_format = "{:.2f}".format
 
 
-STYLE_CONTROL_ELEMENTS_V1 = [
-    "sum_assistant_a_tokens",
-    "header_count_a",
-    "list_count_a",
-    "bold_count_a",
-    "sum_assistant_b_tokens",
-    "header_count_b",
-    "list_count_b",
-    "bold_count_b",
-]
-
-
-def compute_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000):
-    rating = defaultdict(lambda: INIT_RATING)
-
-    for rd, model_a, model_b, winner in battles[
-        ["model_a", "model_b", "winner"]
-    ].itertuples():
-        ra = rating[model_a]
-        rb = rating[model_b]
-        ea = 1 / (1 + BASE ** ((rb - ra) / SCALE))
-        eb = 1 / (1 + BASE ** ((ra - rb) / SCALE))
-        if winner == "model_a":
-            sa = 1
-        elif winner == "model_b":
-            sa = 0
-        elif winner == "tie" or winner == "tie (bothbad)":
-            sa = 0.5
-        else:
-            raise Exception(f"unexpected vote {winner}")
-        rating[model_a] += K * (sa - ea)
-        rating[model_b] += K * (1 - sa - eb)
-
-    return dict(rating)
-
-
-def get_bootstrap_result(battles, func_compute_elo, num_round=1000):
-    rows = []
-    for i in tqdm(range(num_round), desc="bootstrap"):
-        tmp_battles = battles.sample(frac=1.0, replace=True)
-        rows.append(func_compute_elo(tmp_battles))
-    df = pd.DataFrame(rows)
-    return df[df.median().sort_values(ascending=False).index]
-
-
-def compute_elo_mle_with_tie(
-    df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None
-):
-    from sklearn.linear_model import LogisticRegression
-
-    ptbl_a_win = pd.pivot_table(
-        df[df["winner"] == "model_a"],
-        index="model_a",
-        columns="model_b",
-        aggfunc="size",
-        fill_value=0,
-    )
-    ptbl_tie = pd.pivot_table(
-        df[df["winner"].isin(["tie", "tie (bothbad)"])],
-        index="model_a",
-        columns="model_b",
-        aggfunc="size",
-        fill_value=0,
-    )
-    ptbl_tie = ptbl_tie + ptbl_tie.T
-    ptbl_b_win = pd.pivot_table(
-        df[df["winner"] == "model_b"],
-        index="model_a",
-        columns="model_b",
-        aggfunc="size",
-        fill_value=0,
-    )
-    ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie
-
-    models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)
-
-    p = len(models)
-    X = np.zeros([p * (p - 1) * 2, p])
-    Y = np.zeros(p * (p - 1) * 2)
-
-    cur_row = 0
-    sample_weights = []
-    for m_a in ptbl_win.index:
-        for m_b in ptbl_win.columns:
-            if m_a == m_b:
-                continue
-            # if nan skip
-            if math.isnan(ptbl_win.loc[m_a, m_b]) or math.isnan(ptbl_win.loc[m_b, m_a]):
-                continue
-            X[cur_row, models[m_a]] = +math.log(BASE)
-            X[cur_row, models[m_b]] = -math.log(BASE)
-            Y[cur_row] = 1.0
-            sample_weights.append(ptbl_win.loc[m_a, m_b])
-
-            X[cur_row + 1, models[m_a]] = math.log(BASE)
-            X[cur_row + 1, models[m_b]] = -math.log(BASE)
-            Y[cur_row + 1] = 0.0
-            sample_weights.append(ptbl_win.loc[m_b, m_a])
-            cur_row += 2
-    X = X[:cur_row]
-    Y = Y[:cur_row]
-
-    lr = LogisticRegression(fit_intercept=False, penalty=None)
-    lr.fit(X, Y, sample_weight=sample_weights)
-    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
-    if "mixtral-8x7b-instruct-v0.1" in models.index:
-        elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
-    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
-
-
 def get_median_elo_from_bootstrap(bootstrap_df):
     median = dict(bootstrap_df.quantile(0.5))
     median = {k: int(v + 0.5) for k, v in median.items()}
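
For reference, the removed compute_elo above is the standard sequential Elo update (its replacement now lives in rating_systems). One update step with the old defaults works out as follows, a minimal worked example derived directly from the deleted code:

    # One step of the removed online Elo update (defaults K=4, SCALE=400, BASE=10).
    K, SCALE, BASE = 4, 400, 10
    ra, rb = 1000.0, 1000.0                     # both models at INIT_RATING
    ea = 1 / (1 + BASE ** ((rb - ra) / SCALE))  # expected score of A: 0.5
    eb = 1 / (1 + BASE ** ((ra - rb) / SCALE))  # expected score of B: 0.5
    sa = 1                                      # suppose model_a wins
    ra += K * (sa - ea)                         # 1000 + 4 * (1 - 0.5) = 1002.0
    rb += K * (1 - sa - eb)                     # 1000 + 4 * (0 - 0.5) =  998.0
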
@@ -411,129 +309,6 @@ def outlier_detect(
     return battles
 
 
-def fit_mle_elo(X, Y, models, indices=None, SCALE=400, INIT_RATING=1000):
-    from sklearn.linear_model import LogisticRegression
-
-    p = len(models.index)
-
-    lr = LogisticRegression(fit_intercept=False)
-    if indices:
-        lr.fit(X[indices], Y[indices])
-    else:
-        lr.fit(X, Y)
-
-    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
-    # calibrate mixtral-8x7b-instruct-v0.1 to 1114 if applicable
-    if "mixtral-8x7b-instruct-v0.1" in models.index:
-        elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
-    return (
-        pd.Series(elo_scores[:p], index=models.index).sort_values(ascending=False),
-        lr.coef_[0][p:],
-    )
-
-
-def construct_style_matrices(
-    df,
-    BASE=10,
-    apply_ratio=[1, 1, 1, 1],
-    style_elements=STYLE_CONTROL_ELEMENTS_V1,
-    add_one=True,
-):
-    models = pd.concat([df["model_a"], df["model_b"]]).unique()
-    models = pd.Series(np.arange(len(models)), index=models)
-
-    # duplicate battles
-    df = pd.concat([df, df], ignore_index=True)
-    p = len(models.index)
-    n = df.shape[0]
-    assert len(style_elements) % 2 == 0
-    k = int(len(style_elements) / 2)
-
-    X = np.zeros([n, p + k])
-    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
-    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)
-
-    # turn each of the specified columns in "conv_metadata" into a vector
-    style_vector = np.array(
-        [
-            df.conv_metadata.map(
-                lambda x: x[element]
-                if type(x[element]) is int
-                else sum(x[element].values())
-            ).tolist()
-            for element in style_elements
-        ]
-    )
-
-    style_diff = (style_vector[:k] - style_vector[k:]).astype(float)
-    style_sum = (style_vector[:k] + style_vector[k:]).astype(float)
-
-    if add_one:
-        style_sum = style_sum + np.ones(style_diff.shape)
-
-    apply_ratio = np.flatnonzero(apply_ratio)
-
-    style_diff[apply_ratio] /= style_sum[
-        apply_ratio
-    ]  # Apply ratio where necessary (length, etc)
-
-    style_mean = np.mean(style_diff, axis=1)
-    style_std = np.std(style_diff, axis=1)
-
-    X[:, -k:] = ((style_diff - style_mean[:, np.newaxis]) / style_std[:, np.newaxis]).T
-
-    # one A win => two A wins (battles are duplicated above)
-    Y = np.zeros(n)
-    Y[df["winner"] == "model_a"] = 1.0
-
-    # one tie => one A win + one B win
-    # find the tie + tie (bothbad) indices
-    tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
-    tie_idx[len(tie_idx) // 2 :] = False
-    Y[tie_idx] = 1.0
-
-    return X, Y, models
-
-
-def get_bootstrap_result_style_control(
-    X, Y, battles, models, func_compute_elo, num_round=1000
-):
-    elos = []
-    coefs = []
-    assert X.shape[0] % 2 == 0 and X.shape[0] == Y.shape[0]
-    k = int(
-        X.shape[0] / 2
-    )  # battles are duplicated when constructing X and Y, so don't sample the duplicates
-
-    battles_tie_idx = (battles["winner"] == "tie") | (
-        battles["winner"] == "tie (bothbad)"
-    )
-    for _ in tqdm(range(num_round), desc="bootstrap"):
-        indices = np.random.choice(list(range(k)), size=(k), replace=True)
-
-        index2tie = np.zeros(k, dtype=bool)
-        index2tie[battles_tie_idx] = True
-
-        nontie_indices = indices[~index2tie[indices]]
-        tie_indices = np.concatenate(
-            [indices[index2tie[indices]], indices[index2tie[indices]] + k]
-        )
-
-        _X = np.concatenate([X[nontie_indices], X[nontie_indices], X[tie_indices]])
-        _Y = np.concatenate([Y[nontie_indices], Y[nontie_indices], Y[tie_indices]])
-
-        assert _X.shape == X.shape and _Y.shape == Y.shape
-
-        states = ~_X[:, : len(models)].any(axis=0)
-
-        elo, coef = func_compute_elo(_X, _Y, models=models[~states])
-        elos.append(elo)
-        coefs.append(coef)
-
-    df = pd.DataFrame(elos)
-    return df[df.median().sort_values(ascending=False).index], coefs
-
-
 def filter_long_conv(row):
     threshold = 768
     for conversation_type in ["conversation_a", "conversation_b"]:
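
The tie handling in the removed construct_style_matrices is easy to miss: battles are duplicated, every copy of an A win is labeled 1, and only the first copy of each tie is relabeled 1, so one tie contributes one A win plus one B win. A minimal sketch of just that labeling step, on hypothetical two-battle data:

    import numpy as np
    import pandas as pd

    # Hypothetical two-battle log: one A win and one tie.
    df = pd.DataFrame({"winner": ["model_a", "tie"]})
    df = pd.concat([df, df], ignore_index=True)  # duplicate battles -> 4 rows

    Y = np.zeros(len(df))
    Y[df["winner"] == "model_a"] = 1.0           # both copies of the A win get 1

    tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
    tie_idx[len(tie_idx) // 2 :] = False         # keep only the first copy of each tie
    Y[tie_idx] = 1.0

    print(Y)  # [1. 1. 1. 0.] -> the tie became one A win plus one B win
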
@@ -557,6 +332,7 @@ def report_elo_analysis_results(
     scale=1,
     filter_func=lambda x: True,
     style_control=False,
+    num_cpu=None,
 ):
     battles = pd.DataFrame(battles_json)
 
@@ -598,19 +374,18 @@ def report_elo_analysis_results(
 
     if rating_system == "bt":
         if style_control:
-            X, Y, models = construct_style_matrices(battles)
-            bootstrap_df, boostrap_coef = get_bootstrap_result_style_control(
-                X, Y, battles, models, fit_mle_elo, num_round=num_bootstrap
+            bootstrap_df, boostrap_coef = compute_bootstrap_style_control(
+                battles, num_round=num_bootstrap
             )
-            elo_rating_final, coef_final = fit_mle_elo(X, Y, models)
+            elo_rating_final, coef_final = compute_style_control(battles)
         else:
-            bootstrap_df = get_bootstrap_result(
-                battles, compute_elo_mle_with_tie, num_round=num_bootstrap
+            bootstrap_df = compute_bootstrap_bt(
+                battles, num_round=num_bootstrap, num_cpu=num_cpu
             )
-            elo_rating_final = compute_elo_mle_with_tie(battles)
+            elo_rating_final = compute_bt(battles)
     elif rating_system == "elo":
-        bootstrap_df = get_bootstrap_result(
-            battles, compute_elo, num_round=num_bootstrap
+        bootstrap_df = compute_bootstrap_elo(
+            battles, num_round=num_bootstrap, num_cpu=num_cpu
         )
         elo_rating_median = get_median_elo_from_bootstrap(bootstrap_df)
         elo_rating_final = elo_rating_median
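
The call sites above pin down the new public surface of fastchat.serve.monitor.rating_systems as used by this file. A minimal usage sketch (battles is the cleaned battles DataFrame built earlier in report_elo_analysis_results; the num_round and num_cpu values are placeholders):

    from fastchat.serve.monitor.rating_systems import (
        compute_bt,
        compute_bootstrap_bt,
        compute_style_control,
        compute_bootstrap_style_control,
    )

    # Point estimate, then bootstrap replicates fanned out over num_cpu workers.
    elo_rating_final = compute_bt(battles)
    bootstrap_df = compute_bootstrap_bt(battles, num_round=100, num_cpu=12)

    # Style-controlled variants also return the style coefficients.
    elo_rating_final, coef_final = compute_style_control(battles)
    bootstrap_df, bootstrap_coef = compute_bootstrap_style_control(
        battles, num_round=100
    )

Note that, as called in this diff, only the plain Elo and Bradley-Terry bootstraps take the new num_cpu argument; the style-control bootstrap is invoked without it.
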
@@ -715,6 +490,7 @@ def pretty_print_elo_rating(rating):
     parser.add_argument("--category", nargs="+", default=["full"])
     parser.add_argument("--scale", type=float, default=1)
     parser.add_argument("--style-control", action="store_true")
+    parser.add_argument("--num-cpu", type=int, default=12)
     args = parser.parse_args()
 
     np.random.seed(42)
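
With the new flag (default 12 workers), a run might look like this; the input-file flags are omitted and the module-style invocation is an assumption, since only --category, --scale, --style-control, and --num-cpu are visible in this diff:

    python3 -m fastchat.serve.monitor.elo_analysis --category full --style-control --num-cpu 16
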
@@ -753,6 +529,7 @@ def pretty_print_elo_rating(rating):
         scale=args.scale,
         filter_func=filter_func,
         style_control=args.style_control,
+        num_cpu=args.num_cpu,
     )
 
     for cat in args.category:
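
The rating_systems module itself is the second changed file and is not reproduced on this page, so the exact speedup mechanism is not visible here. Bootstrap resampling is embarrassingly parallel, though, and a num_cpu argument can be honored with a process pool roughly as follows. This is an illustrative sketch only, not the committed implementation; bootstrap_parallel and _one_round are hypothetical names:

    import multiprocessing as mp

    import numpy as np
    import pandas as pd


    def _one_round(args):
        # One bootstrap replicate: resample battles with replacement, refit.
        battles, seed, func = args
        resampled = battles.sample(frac=1.0, replace=True, random_state=seed)
        return func(resampled)


    def bootstrap_parallel(battles, func, num_round=1000, num_cpu=None):
        # Illustrative only. func must be a picklable top-level function
        # (e.g. a rating fit); num_cpu=None falls back to os.cpu_count().
        seeds = np.random.randint(0, 2**31 - 1, size=num_round)
        with mp.Pool(num_cpu) as pool:
            rows = pool.map(_one_round, [(battles, int(s), func) for s in seeds])
        df = pd.DataFrame(rows)
        # Same column-ordering convention as the removed get_bootstrap_result.
        return df[df.median().sort_values(ascending=False).index]
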
