 import pandas as pd
 import gradio as gr
 import numpy as np
-import requests

 from fastchat.constants import SURVEY_LINK
 from fastchat.serve.monitor.basic_stats import report_basic_stats, get_log_files
@@ -857,119 +856,6 @@ def build_category_leaderboard_tab(
     )


-def compute_ub_ranking(arena_df):
-    # Sort models based on their scores
-    sorted_models = arena_df.sort_values("score", ascending=False).index.tolist()
-
-    ub_ranking = {}
-    current_rank = 1
-    i = 0
-
-    while i < len(sorted_models):
-        current_model = sorted_models[i]
-        current_lower = arena_df.loc[current_model]["lower"]
-        tied_models = [current_model]
-
-        # Find ties
-        j = i + 1
-        while j < len(sorted_models):
-            next_model = sorted_models[j]
-            if arena_df.loc[next_model]["upper"] >= current_lower:
-                tied_models.append(next_model)
-                j += 1
-            else:
-                break
-
-        # Assign ranks to tied models
-        for model in tied_models:
-            ub_ranking[model] = current_rank
-
-        # Move to the next unprocessed model
-        i = j
-        # Next rank is at least the position in the sorted list
-        current_rank = max(current_rank + 1, i + 1)
-
-    return ub_ranking
-
-
-def process_copilot_arena_leaderboard(leaderboard):
-    leaderboard["score"] = leaderboard["score"].round().astype(int)
-    leaderboard["upper"] = leaderboard["upper"].round().astype(int)
-    leaderboard["lower"] = leaderboard["lower"].round().astype(int)
-
-    leaderboard["upper_diff"] = leaderboard["upper"] - leaderboard["score"]
-    leaderboard["lower_diff"] = leaderboard["score"] - leaderboard["lower"]
-
-    leaderboard["confidence_interval"] = (
-        "+"
-        + leaderboard["upper_diff"].astype(str)
-        + " / -"
-        + leaderboard["lower_diff"].astype(str)
-    )
-
-    rankings_ub = compute_ub_ranking(leaderboard)
-    leaderboard.insert(loc=0, column="Rank* (UB)", value=rankings_ub)
-    leaderboard["Rank"] = leaderboard["score"].rank(ascending=False).astype(int)
-
-    leaderboard = leaderboard.sort_values(by=["Rank"], ascending=[True])
-
-    return leaderboard
-
-
-def build_copilot_arena_tab():
-    copilot_arena_leaderboard_url = "https://leaderboard-server.fly.dev/elo"
-    response = requests.get(copilot_arena_leaderboard_url)
-    if response.status_code == 200:
-        leaderboard = pd.DataFrame(response.json()["elo_data"])
-        leaderboard = process_copilot_arena_leaderboard(leaderboard)
-        leaderboard = leaderboard.rename(
-            columns={
-                "name": "Model",
-                "confidence_interval": "Confidence Interval",
-                "score": "Arena Score",
-                "organization": "Organization",
-                "votes": "Votes",
-            }
-        )
-
-        column_order = [
-            "Rank* (UB)",
-            "Model",
-            "Arena Score",
-            "Confidence Interval",
-            "Votes",
-            "Organization",
-        ]
-        leaderboard = leaderboard[column_order]
-        num_models = len(leaderboard)
-        total_battles = int(leaderboard["Votes"].sum()) // 2
-        md = f"""
-[Copilot Arena](https://blog.lmarena.ai/blog/2024/copilot-arena/) is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs. This leaderboard contains the relative performance and ranking of {num_models} models over {total_battles} battles.
-"""
-
-        gr.Markdown(md, elem_id="leaderboard_markdown")
-        gr.DataFrame(
-            leaderboard,
-            datatype=["str" for _ in leaderboard.columns],
-            elem_id="arena_hard_leaderboard",
-            height=600,
-            wrap=True,
-            interactive=False,
-            column_widths=[70, 130, 60, 80, 50, 80],
-        )
-
-        gr.Markdown(
-            """
-***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model.
-Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval). \n
-**Confidence Interval**: represents the range of uncertainty around the Arena Score. It's displayed as +X / -Y, where X is the difference between the upper bound and the score, and Y is the difference between the score and the lower bound.
-""",
-            elem_id="leaderboard_markdown",
-        )
-    else:
-        gr.Markdown("Error with fetching Copilot Arena data. Check back in later.")
-
-
 selected_categories = [
     "full",
     "full_style_control",
@@ -1167,6 +1053,8 @@ def build_leaderboard_tab(
                 elo_results_text, model_table_df, model_to_score
             )
         with gr.Tab("Copilot Arena Leaderboard", id=5):
+            from fastchat.serve.monitor.copilot_arena import build_copilot_arena_tab
+
             build_copilot_arena_tab()

     if not show_plot:
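The markdown removed above defines "Rank* (UB)" as one plus the number of models that are statistically better than the target model, where model A is statistically better than model B when A's lower-bound score exceeds B's upper-bound score. A minimal sketch of that definition for illustration only: the helper name upper_bound_ranks is made up here, it assumes a DataFrame indexed by model name with numeric "lower" and "upper" columns (as in the removed code), and it is not the tie-grouping compute_ub_ranking walk shown in the diff.

import pandas as pd

def upper_bound_ranks(df: pd.DataFrame) -> pd.Series:
    # One plus the number of models whose lower CI bound exceeds this
    # model's upper CI bound ("statistically better" per the removed note).
    ranks = {
        model: 1 + int((df["lower"] > row["upper"]).sum())
        for model, row in df.iterrows()
    }
    return pd.Series(ranks, name="Rank* (UB)")

Under this definition, models that are each beaten by the same set of stronger models end up sharing a rank, which is why the leaderboard can show ties in "Rank* (UB)" even when Arena Scores differ.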
|