 from fastchat.model.model_registry import get_model_info
 from fastchat.serve.monitor.basic_stats import get_log_files
 from fastchat.serve.monitor.clean_battle_data import clean_battle_data
+from fastchat.serve.monitor.rating_systems import (
+    compute_elo,
+    compute_bt,
+    compute_style_control,
+    compute_bootstrap_elo,
+    compute_bootstrap_bt,
+    compute_bootstrap_style_control,
+)
 
 pd.options.display.float_format = "{:.2f}".format
 
 
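-# Style features compared between side A and side B of each battle:
-# response length in tokens, plus markdown header, list, and bold counts.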
-STYLE_CONTROL_ELEMENTS_V1 = [
-    "sum_assistant_a_tokens",
-    "header_count_a",
-    "list_count_a",
-    "bold_count_a",
-    "sum_assistant_b_tokens",
-    "header_count_b",
-    "list_count_b",
-    "bold_count_b",
-]
-
-
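-# Online (sequential) Elo: walk through the battles in order and nudge each
-# model's rating toward the observed outcome with step size K.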
-def compute_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000):
-    rating = defaultdict(lambda: INIT_RATING)
-
-    for rd, model_a, model_b, winner in battles[
-        ["model_a", "model_b", "winner"]
-    ].itertuples():
-        ra = rating[model_a]
-        rb = rating[model_b]
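-        # expected scores under the Elo logistic curve; ea + eb == 1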
-        ea = 1 / (1 + BASE ** ((rb - ra) / SCALE))
-        eb = 1 / (1 + BASE ** ((ra - rb) / SCALE))
-        if winner == "model_a":
-            sa = 1
-        elif winner == "model_b":
-            sa = 0
-        elif winner == "tie" or winner == "tie (bothbad)":
-            sa = 0.5
-        else:
-            raise Exception(f"unexpected vote {winner}")
-        rating[model_a] += K * (sa - ea)
-        rating[model_b] += K * (1 - sa - eb)
-
-    return dict(rating)
-
-
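-# Bootstrap: resample the battles with replacement num_round times and
-# recompute ratings on each sample, yielding a rating distribution per model.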
-def get_bootstrap_result(battles, func_compute_elo, num_round=1000):
-    rows = []
-    for i in tqdm(range(num_round), desc="bootstrap"):
-        tmp_battles = battles.sample(frac=1.0, replace=True)
-        rows.append(func_compute_elo(tmp_battles))
-    df = pd.DataFrame(rows)
-    return df[df.median().sort_values(ascending=False).index]
-
-
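-# Bradley-Terry MLE with ties: fit ratings by logistic regression on pairwise
-# outcome counts, weighting each direction of each pair by its win count.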
-def compute_elo_mle_with_tie(
-    df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None
-):
-    from sklearn.linear_model import LogisticRegression
-
-    ptbl_a_win = pd.pivot_table(
-        df[df["winner"] == "model_a"],
-        index="model_a",
-        columns="model_b",
-        aggfunc="size",
-        fill_value=0,
-    )
-    ptbl_tie = pd.pivot_table(
-        df[df["winner"].isin(["tie", "tie (bothbad)"])],
-        index="model_a",
-        columns="model_b",
-        aggfunc="size",
-        fill_value=0,
-    )
-    ptbl_tie = ptbl_tie + ptbl_tie.T
-    ptbl_b_win = pd.pivot_table(
-        df[df["winner"] == "model_b"],
-        index="model_a",
-        columns="model_b",
-        aggfunc="size",
-        fill_value=0,
-    )
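-    # aggregate counts: each win counts twice, each tie once for each side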
-    ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie
-
-    models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)
-
-    p = len(models)
-    X = np.zeros([p * (p - 1) * 2, p])
-    Y = np.zeros(p * (p - 1) * 2)
-
-    cur_row = 0
-    sample_weights = []
-    for m_a in ptbl_win.index:
-        for m_b in ptbl_win.columns:
-            if m_a == m_b:
-                continue
-            # skip pairs with no recorded battles (NaN counts)
-            if math.isnan(ptbl_win.loc[m_a, m_b]) or math.isnan(ptbl_win.loc[m_b, m_a]):
-                continue
-            X[cur_row, models[m_a]] = +math.log(BASE)
-            X[cur_row, models[m_b]] = -math.log(BASE)
-            Y[cur_row] = 1.0
-            sample_weights.append(ptbl_win.loc[m_a, m_b])
-
-            X[cur_row + 1, models[m_a]] = math.log(BASE)
-            X[cur_row + 1, models[m_b]] = -math.log(BASE)
-            Y[cur_row + 1] = 0.0
-            sample_weights.append(ptbl_win.loc[m_b, m_a])
-            cur_row += 2
-    X = X[:cur_row]
-    Y = Y[:cur_row]
-
-    lr = LogisticRegression(fit_intercept=False, penalty=None)
-    lr.fit(X, Y, sample_weight=sample_weights)
-    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
-    if "mixtral-8x7b-instruct-v0.1" in models.index:
-        elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
-    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
-
-
 def get_median_elo_from_bootstrap(bootstrap_df):
     median = dict(bootstrap_df.quantile(0.5))
     median = {k: int(v + 0.5) for k, v in median.items()}
@@ -411,129 +309,6 @@ def outlier_detect(
     return battles
 
 
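-# Fit Bradley-Terry ratings by logistic regression on a precomputed design
-# matrix; returns the model ratings plus the remaining (style) coefficients.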
-def fit_mle_elo(X, Y, models, indices=None, SCALE=400, INIT_RATING=1000):
-    from sklearn.linear_model import LogisticRegression
-
-    p = len(models.index)
-
-    lr = LogisticRegression(fit_intercept=False)
-    if indices:
-        lr.fit(X[indices], Y[indices])
-    else:
-        lr.fit(X, Y)
-
-    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
-    # calibrate mixtral-8x7b-instruct-v0.1 to 1114 if applicable
-    if "mixtral-8x7b-instruct-v0.1" in models.index:
-        elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
-    return (
-        pd.Series(elo_scores[:p], index=models.index).sort_values(ascending=False),
-        lr.coef_[0][p:],
-    )
-
-
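-# Build the style-control design matrix: one column per model plus one
-# standardized column per style feature (A-minus-B difference, optionally
-# normalized by the A-plus-B sum).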
-def construct_style_matrices(
-    df,
-    BASE=10,
-    apply_ratio=[1, 1, 1, 1],
-    style_elements=STYLE_CONTROL_ELEMENTS_V1,
-    add_one=True,
-):
-    models = pd.concat([df["model_a"], df["model_b"]]).unique()
-    models = pd.Series(np.arange(len(models)), index=models)
-
-    # duplicate battles so that a tie can be encoded as one A win + one B win
-    df = pd.concat([df, df], ignore_index=True)
-    p = len(models.index)
-    n = df.shape[0]
-    assert len(style_elements) % 2 == 0
-    k = int(len(style_elements) / 2)
-
-    X = np.zeros([n, p + k])
-    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
-    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)
-
-    # turn each of the specified columns in "conv_metadata" into a vector
-    style_vector = np.array(
-        [
-            df.conv_metadata.map(
-                lambda x: x[element]
-                if type(x[element]) is int
-                else sum(x[element].values())
-            ).tolist()
-            for element in style_elements
-        ]
-    )
-
-    style_diff = (style_vector[:k] - style_vector[k:]).astype(float)
-    style_sum = (style_vector[:k] + style_vector[k:]).astype(float)
-
-    if add_one:
-        style_sum = style_sum + np.ones(style_diff.shape)
-
-    apply_ratio = np.flatnonzero(apply_ratio)
-
-    style_diff[apply_ratio] /= style_sum[
-        apply_ratio
-    ]  # apply ratio where necessary (length, etc.)
-
-    style_mean = np.mean(style_diff, axis=1)
-    style_std = np.std(style_diff, axis=1)
-
-    X[:, -k:] = ((style_diff - style_mean[:, np.newaxis]) / style_std[:, np.newaxis]).T
-
-    # one A win => two A wins (battles are duplicated)
-    Y = np.zeros(n)
-    Y[df["winner"] == "model_a"] = 1.0
-
-    # one tie => one A win + one B win
-    # find tie + tie (bothbad) indices
-    tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
-    tie_idx[len(tie_idx) // 2 :] = False
-    Y[tie_idx] = 1.0
-
-    return X, Y, models
-
-
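-# Bootstrap for style control: resample battle indices rather than raw rows,
-# so each resampled tie keeps both of its duplicated rows (offset by k).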
-def get_bootstrap_result_style_control(
-    X, Y, battles, models, func_compute_elo, num_round=1000
-):
-    elos = []
-    coefs = []
-    assert X.shape[0] % 2 == 0 and X.shape[0] == Y.shape[0]
-    k = int(
-        X.shape[0] / 2
-    )  # Since we duplicate the battles when constructing X and Y, we don't want to sample the duplicates
-
-    battles_tie_idx = (battles["winner"] == "tie") | (
-        battles["winner"] == "tie (bothbad)"
-    )
-    for _ in tqdm(range(num_round), desc="bootstrap"):
-        indices = np.random.choice(list(range(k)), size=(k), replace=True)
-
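-        # mark which of the k unique battles are ties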
-        index2tie = np.zeros(k, dtype=bool)
-        index2tie[battles_tie_idx] = True
-
-        nontie_indices = indices[~index2tie[indices]]
-        tie_indices = np.concatenate(
-            [indices[index2tie[indices]], indices[index2tie[indices]] + k]
-        )
-
-        _X = np.concatenate([X[nontie_indices], X[nontie_indices], X[tie_indices]])
-        _Y = np.concatenate([Y[nontie_indices], Y[nontie_indices], Y[tie_indices]])
-
-        assert _X.shape == X.shape and _Y.shape == Y.shape
-
-        states = ~_X[:, : len(models)].any(axis=0)
-
-        elo, coef = func_compute_elo(_X, _Y, models=models[~states])
-        elos.append(elo)
-        coefs.append(coef)
-
-    df = pd.DataFrame(elos)
-    return df[df.median().sort_values(ascending=False).index], coefs
-
-
 def filter_long_conv(row):
     threshold = 768
     for conversation_type in ["conversation_a", "conversation_b"]:
@@ -557,6 +332,7 @@ def report_elo_analysis_results(
     scale=1,
     filter_func=lambda x: True,
     style_control=False,
+    num_cpu=None,
 ):
     battles = pd.DataFrame(battles_json)
 
@@ -598,19 +374,18 @@ def report_elo_analysis_results(
 
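+    # dispatch to the requested rating system: Bradley-Terry ("bt",
+    # optionally with style control) or online Elo ("elo")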
     if rating_system == "bt":
         if style_control:
-            X, Y, models = construct_style_matrices(battles)
-            bootstrap_df, boostrap_coef = get_bootstrap_result_style_control(
-                X, Y, battles, models, fit_mle_elo, num_round=num_bootstrap
+            bootstrap_df, boostrap_coef = compute_bootstrap_style_control(
+                battles, num_round=num_bootstrap
             )
-            elo_rating_final, coef_final = fit_mle_elo(X, Y, models)
+            elo_rating_final, coef_final = compute_style_control(battles)
         else:
-            bootstrap_df = get_bootstrap_result(
-                battles, compute_elo_mle_with_tie, num_round=num_bootstrap
+            bootstrap_df = compute_bootstrap_bt(
+                battles, num_round=num_bootstrap, num_cpu=num_cpu
             )
-            elo_rating_final = compute_elo_mle_with_tie(battles)
+            elo_rating_final = compute_bt(battles)
     elif rating_system == "elo":
-        bootstrap_df = get_bootstrap_result(
-            battles, compute_elo, num_round=num_bootstrap
+        bootstrap_df = compute_bootstrap_elo(
+            battles, num_round=num_bootstrap, num_cpu=num_cpu
         )
         elo_rating_median = get_median_elo_from_bootstrap(bootstrap_df)
         elo_rating_final = elo_rating_median
@@ -715,6 +490,7 @@ def pretty_print_elo_rating(rating):
     parser.add_argument("--category", nargs="+", default=["full"])
     parser.add_argument("--scale", type=float, default=1)
     parser.add_argument("--style-control", action="store_true")
+    parser.add_argument("--num-cpu", type=int, default=12)
     args = parser.parse_args()
 
     np.random.seed(42)
@@ -753,6 +529,7 @@ def pretty_print_elo_rating(rating):
         scale=args.scale,
         filter_func=filter_func,
         style_control=args.style_control,
+        num_cpu=args.num_cpu,
     )
 
     for cat in args.category: