 import pandas as pd
 import gradio as gr
 import numpy as np
-import requests

 from fastchat.constants import SURVEY_LINK
 from fastchat.serve.monitor.basic_stats import report_basic_stats, get_log_files
@@ -857,119 +856,6 @@ def build_category_leaderboard_tab(
     )


-def compute_ub_ranking(arena_df):
-    # Sort models based on their scores
-    sorted_models = arena_df.sort_values("score", ascending=False).index.tolist()
-
-    ub_ranking = {}
-    current_rank = 1
-    i = 0
-
-    while i < len(sorted_models):
-        current_model = sorted_models[i]
-        current_lower = arena_df.loc[current_model]["lower"]
-        tied_models = [current_model]
-
-        # Find ties
-        j = i + 1
-        while j < len(sorted_models):
-            next_model = sorted_models[j]
-            if arena_df.loc[next_model]["upper"] >= current_lower:
-                tied_models.append(next_model)
-                j += 1
-            else:
-                break
-
-        # Assign ranks to tied models
-        for model in tied_models:
-            ub_ranking[model] = current_rank
-
-        # Move to the next unprocessed model
-        i = j
-        # Next rank is at least the position in the sorted list
-        current_rank = max(current_rank + 1, i + 1)
-
-    return ub_ranking
-
-
-def process_copilot_arena_leaderboard(leaderboard):
-    leaderboard["score"] = leaderboard["score"].round().astype(int)
-    leaderboard["upper"] = leaderboard["upper"].round().astype(int)
-    leaderboard["lower"] = leaderboard["lower"].round().astype(int)
-
-    leaderboard["upper_diff"] = leaderboard["upper"] - leaderboard["score"]
-    leaderboard["lower_diff"] = leaderboard["score"] - leaderboard["lower"]
-
-    leaderboard["confidence_interval"] = (
-        "+"
-        + leaderboard["upper_diff"].astype(str)
-        + " / -"
-        + leaderboard["lower_diff"].astype(str)
-    )
-
-    rankings_ub = compute_ub_ranking(leaderboard)
-    leaderboard.insert(loc=0, column="Rank* (UB)", value=rankings_ub)
-    leaderboard["Rank"] = leaderboard["score"].rank(ascending=False).astype(int)
-
-    leaderboard = leaderboard.sort_values(by=["Rank"], ascending=[True])
-
-    return leaderboard
-
-
-def build_copilot_arena_tab():
-    copilot_arena_leaderboard_url = "https://leaderboard-server.fly.dev/elo"
-    response = requests.get(copilot_arena_leaderboard_url)
-    if response.status_code == 200:
-        leaderboard = pd.DataFrame(response.json()["elo_data"])
-        leaderboard = process_copilot_arena_leaderboard(leaderboard)
-        leaderboard = leaderboard.rename(
-            columns={
-                "name": "Model",
-                "confidence_interval": "Confidence Interval",
-                "score": "Arena Score",
-                "organization": "Organization",
-                "votes": "Votes",
-            }
-        )
-
-        column_order = [
-            "Rank* (UB)",
-            "Model",
-            "Arena Score",
-            "Confidence Interval",
-            "Votes",
-            "Organization",
-        ]
-        leaderboard = leaderboard[column_order]
-        num_models = len(leaderboard)
-        total_battles = int(leaderboard["Votes"].sum()) // 2
-        md = f"""
-[Copilot Arena](https://blog.lmarena.ai/blog/2024/copilot-arena/) is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs. This leaderboard contains the relative performance and ranking of {num_models} models over {total_battles} battles.
-"""
-
-        gr.Markdown(md, elem_id="leaderboard_markdown")
-        gr.DataFrame(
-            leaderboard,
-            datatype=["str" for _ in leaderboard.columns],
-            elem_id="arena_hard_leaderboard",
-            height=600,
-            wrap=True,
-            interactive=False,
-            column_widths=[70, 130, 60, 80, 50, 80],
-        )
-
-        gr.Markdown(
-            """
-***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model.
-Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval). \n
-**Confidence Interval**: represents the range of uncertainty around the Arena Score. It's displayed as +X / -Y, where X is the difference between the upper bound and the score, and Y is the difference between the score and the lower bound.
-""",
-            elem_id="leaderboard_markdown",
-        )
-    else:
-        gr.Markdown("Error with fetching Copilot Arena data. Check back in later.")
-
-
 selected_categories = [
     "full",
     "full_style_control",
@@ -1167,6 +1053,8 @@ def build_leaderboard_tab(
                 elo_results_text, model_table_df, model_to_score
             )
         with gr.Tab("Copilot Arena Leaderboard", id=5):
+            from fastchat.serve.monitor.copilot_arena import build_copilot_arena_tab
+
             build_copilot_arena_tab()

     if not show_plot:
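The markdown removed above defines "Rank* (UB)" as one plus the number of models that are statistically better than the target model, where model A is statistically better than model B when A's lower-bound score exceeds B's upper-bound score. A minimal sketch of that definition for illustration only: the helper name upper_bound_ranks is made up here, it assumes a DataFrame indexed by model name with numeric "lower" and "upper" columns (as in the removed code), and it is not the tie-grouping compute_ub_ranking walk shown in the diff.

import pandas as pd

def upper_bound_ranks(df: pd.DataFrame) -> pd.Series:
    # One plus the number of models whose lower CI bound exceeds this
    # model's upper CI bound ("statistically better" per the removed note).
    ranks = {
        model: 1 + int((df["lower"] > row["upper"]).sum())
        for model, row in df.iterrows()
    }
    return pd.Series(ranks, name="Rank* (UB)")

Under this definition, models that are each beaten by the same set of stronger models end up sharing a rank, which is why the leaderboard can show ties in "Rank* (UB)" even when Arena Scores differ.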
|