 from fastchat.model.model_registry import get_model_info
 from fastchat.serve.monitor.basic_stats import get_log_files
 from fastchat.serve.monitor.clean_battle_data import clean_battle_data
+from fastchat.serve.monitor.rating_systems import (
+    compute_elo,
+    compute_bt,
+    compute_style_control,
+    compute_bootstrap_elo,
+    compute_bootstrap_bt,
+    compute_bootstrap_style_control,
+)
 
 pd.options.display.float_format = "{:.2f}".format
 
 
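-# Style features compared between side A and side B of each battle:
-# response length in tokens, plus markdown header, list, and bold counts.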
-STYLE_CONTROL_ELEMENTS_V1 = [
-    "sum_assistant_a_tokens",
-    "header_count_a",
-    "list_count_a",
-    "bold_count_a",
-    "sum_assistant_b_tokens",
-    "header_count_b",
-    "list_count_b",
-    "bold_count_b",
-]
-
-
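-# Online (sequential) Elo: walk through the battles in order and nudge each
-# model's rating toward the observed outcome with step size K.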
-def compute_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000):
-    rating = defaultdict(lambda: INIT_RATING)
-
-    for rd, model_a, model_b, winner in battles[
-        ["model_a", "model_b", "winner"]
-    ].itertuples():
-        ra = rating[model_a]
-        rb = rating[model_b]
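-        # expected scores under the Elo logistic curve; ea + eb == 1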
-        ea = 1 / (1 + BASE ** ((rb - ra) / SCALE))
-        eb = 1 / (1 + BASE ** ((ra - rb) / SCALE))
-        if winner == "model_a":
-            sa = 1
-        elif winner == "model_b":
-            sa = 0
-        elif winner == "tie" or winner == "tie (bothbad)":
-            sa = 0.5
-        else:
-            raise Exception(f"unexpected vote {winner}")
-        rating[model_a] += K * (sa - ea)
-        rating[model_b] += K * (1 - sa - eb)
-
-    return dict(rating)
-
-
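-# Bootstrap: resample the battles with replacement num_round times and
-# recompute ratings on each sample, yielding a rating distribution per model.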
-def get_bootstrap_result(battles, func_compute_elo, num_round=1000):
-    rows = []
-    for i in tqdm(range(num_round), desc="bootstrap"):
-        tmp_battles = battles.sample(frac=1.0, replace=True)
-        rows.append(func_compute_elo(tmp_battles))
-    df = pd.DataFrame(rows)
-    return df[df.median().sort_values(ascending=False).index]
-
-
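-# Bradley-Terry MLE with ties: fit ratings by logistic regression on pairwise
-# outcome counts, weighting each direction of each pair by its win count.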
-def compute_elo_mle_with_tie(
-    df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None
-):
-    from sklearn.linear_model import LogisticRegression
-
-    ptbl_a_win = pd.pivot_table(
-        df[df["winner"] == "model_a"],
-        index="model_a",
-        columns="model_b",
-        aggfunc="size",
-        fill_value=0,
-    )
-    ptbl_tie = pd.pivot_table(
-        df[df["winner"].isin(["tie", "tie (bothbad)"])],
-        index="model_a",
-        columns="model_b",
-        aggfunc="size",
-        fill_value=0,
-    )
-    ptbl_tie = ptbl_tie + ptbl_tie.T
-    ptbl_b_win = pd.pivot_table(
-        df[df["winner"] == "model_b"],
-        index="model_a",
-        columns="model_b",
-        aggfunc="size",
-        fill_value=0,
-    )
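-    # aggregate counts: each win counts twice, each tie once for each side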
-    ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie
-
-    models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)
-
-    p = len(models)
-    X = np.zeros([p * (p - 1) * 2, p])
-    Y = np.zeros(p * (p - 1) * 2)
-
-    cur_row = 0
-    sample_weights = []
-    for m_a in ptbl_win.index:
-        for m_b in ptbl_win.columns:
-            if m_a == m_b:
-                continue
-            # skip pairs with no recorded battles (NaN counts)
-            if math.isnan(ptbl_win.loc[m_a, m_b]) or math.isnan(ptbl_win.loc[m_b, m_a]):
-                continue
-            X[cur_row, models[m_a]] = +math.log(BASE)
-            X[cur_row, models[m_b]] = -math.log(BASE)
-            Y[cur_row] = 1.0
-            sample_weights.append(ptbl_win.loc[m_a, m_b])
-
-            X[cur_row + 1, models[m_a]] = math.log(BASE)
-            X[cur_row + 1, models[m_b]] = -math.log(BASE)
-            Y[cur_row + 1] = 0.0
-            sample_weights.append(ptbl_win.loc[m_b, m_a])
-            cur_row += 2
-    X = X[:cur_row]
-    Y = Y[:cur_row]
-
-    lr = LogisticRegression(fit_intercept=False, penalty=None)
-    lr.fit(X, Y, sample_weight=sample_weights)
-    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
-    if "mixtral-8x7b-instruct-v0.1" in models.index:
-        elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
-    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
-
-
 def get_median_elo_from_bootstrap(bootstrap_df):
     median = dict(bootstrap_df.quantile(0.5))
     median = {k: int(v + 0.5) for k, v in median.items()}
@@ -411,129 +309,6 @@ def outlier_detect(
     return battles
 
 
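-# Fit Bradley-Terry ratings by logistic regression on a precomputed design
-# matrix; returns the model ratings plus the remaining (style) coefficients.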
-def fit_mle_elo(X, Y, models, indices=None, SCALE=400, INIT_RATING=1000):
-    from sklearn.linear_model import LogisticRegression
-
-    p = len(models.index)
-
-    lr = LogisticRegression(fit_intercept=False)
-    if indices:
-        lr.fit(X[indices], Y[indices])
-    else:
-        lr.fit(X, Y)
-
-    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
-    # calibrate mixtral-8x7b-instruct-v0.1 to 1114 if applicable
-    if "mixtral-8x7b-instruct-v0.1" in models.index:
-        elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
-    return (
-        pd.Series(elo_scores[:p], index=models.index).sort_values(ascending=False),
-        lr.coef_[0][p:],
-    )
-
-
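-# Build the style-control design matrix: one column per model plus one
-# standardized column per style feature (A-minus-B difference, optionally
-# normalized by the A-plus-B sum).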
-def construct_style_matrices(
-    df,
-    BASE=10,
-    apply_ratio=[1, 1, 1, 1],
-    style_elements=STYLE_CONTROL_ELEMENTS_V1,
-    add_one=True,
-):
-    models = pd.concat([df["model_a"], df["model_b"]]).unique()
-    models = pd.Series(np.arange(len(models)), index=models)
-
-    # duplicate battles so that a tie can be encoded as one A win + one B win
-    df = pd.concat([df, df], ignore_index=True)
-    p = len(models.index)
-    n = df.shape[0]
-    assert len(style_elements) % 2 == 0
-    k = int(len(style_elements) / 2)
-
-    X = np.zeros([n, p + k])
-    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
-    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)
-
-    # turn each of the specified columns in "conv_metadata" into a vector
-    style_vector = np.array(
-        [
-            df.conv_metadata.map(
-                lambda x: x[element]
-                if type(x[element]) is int
-                else sum(x[element].values())
-            ).tolist()
-            for element in style_elements
-        ]
-    )
-
-    style_diff = (style_vector[:k] - style_vector[k:]).astype(float)
-    style_sum = (style_vector[:k] + style_vector[k:]).astype(float)
-
-    if add_one:
-        style_sum = style_sum + np.ones(style_diff.shape)
-
-    apply_ratio = np.flatnonzero(apply_ratio)
-
-    style_diff[apply_ratio] /= style_sum[
-        apply_ratio
-    ]  # apply ratio where necessary (length, etc.)
-
-    style_mean = np.mean(style_diff, axis=1)
-    style_std = np.std(style_diff, axis=1)
-
-    X[:, -k:] = ((style_diff - style_mean[:, np.newaxis]) / style_std[:, np.newaxis]).T
-
-    # one A win => two A wins (battles are duplicated)
-    Y = np.zeros(n)
-    Y[df["winner"] == "model_a"] = 1.0
-
-    # one tie => one A win + one B win
-    # find tie + tie (bothbad) indices
-    tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
-    tie_idx[len(tie_idx) // 2 :] = False
-    Y[tie_idx] = 1.0
-
-    return X, Y, models
-
-
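-# Bootstrap for style control: resample battle indices rather than raw rows,
-# so each resampled tie keeps both of its duplicated rows (offset by k).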
-def get_bootstrap_result_style_control(
-    X, Y, battles, models, func_compute_elo, num_round=1000
-):
-    elos = []
-    coefs = []
-    assert X.shape[0] % 2 == 0 and X.shape[0] == Y.shape[0]
-    k = int(
-        X.shape[0] / 2
-    )  # Since we duplicate the battles when constructing X and Y, we don't want to sample the duplicates
-
-    battles_tie_idx = (battles["winner"] == "tie") | (
-        battles["winner"] == "tie (bothbad)"
-    )
-    for _ in tqdm(range(num_round), desc="bootstrap"):
-        indices = np.random.choice(list(range(k)), size=(k), replace=True)
-
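-        # mark which of the k unique battles are ties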
-        index2tie = np.zeros(k, dtype=bool)
-        index2tie[battles_tie_idx] = True
-
-        nontie_indices = indices[~index2tie[indices]]
-        tie_indices = np.concatenate(
-            [indices[index2tie[indices]], indices[index2tie[indices]] + k]
-        )
-
-        _X = np.concatenate([X[nontie_indices], X[nontie_indices], X[tie_indices]])
-        _Y = np.concatenate([Y[nontie_indices], Y[nontie_indices], Y[tie_indices]])
-
-        assert _X.shape == X.shape and _Y.shape == Y.shape
-
-        states = ~_X[:, : len(models)].any(axis=0)
-
-        elo, coef = func_compute_elo(_X, _Y, models=models[~states])
-        elos.append(elo)
-        coefs.append(coef)
-
-    df = pd.DataFrame(elos)
-    return df[df.median().sort_values(ascending=False).index], coefs
-
-
 def filter_long_conv(row):
     threshold = 768
     for conversation_type in ["conversation_a", "conversation_b"]:
@@ -557,6 +332,7 @@ def report_elo_analysis_results(
     scale=1,
     filter_func=lambda x: True,
     style_control=False,
+    num_cpu=None,
 ):
     battles = pd.DataFrame(battles_json)
 
@@ -598,19 +374,18 @@ def report_elo_analysis_results(
 
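+    # dispatch to the requested rating system: Bradley-Terry ("bt",
+    # optionally with style control) or online Elo ("elo")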
     if rating_system == "bt":
         if style_control:
-            X, Y, models = construct_style_matrices(battles)
-            bootstrap_df, boostrap_coef = get_bootstrap_result_style_control(
-                X, Y, battles, models, fit_mle_elo, num_round=num_bootstrap
+            bootstrap_df, boostrap_coef = compute_bootstrap_style_control(
+                battles, num_round=num_bootstrap
             )
-            elo_rating_final, coef_final = fit_mle_elo(X, Y, models)
+            elo_rating_final, coef_final = compute_style_control(battles)
         else:
-            bootstrap_df = get_bootstrap_result(
-                battles, compute_elo_mle_with_tie, num_round=num_bootstrap
+            bootstrap_df = compute_bootstrap_bt(
+                battles, num_round=num_bootstrap, num_cpu=num_cpu
             )
-            elo_rating_final = compute_elo_mle_with_tie(battles)
+            elo_rating_final = compute_bt(battles)
     elif rating_system == "elo":
-        bootstrap_df = get_bootstrap_result(
-            battles, compute_elo, num_round=num_bootstrap
+        bootstrap_df = compute_bootstrap_elo(
+            battles, num_round=num_bootstrap, num_cpu=num_cpu
         )
         elo_rating_median = get_median_elo_from_bootstrap(bootstrap_df)
         elo_rating_final = elo_rating_median
@@ -715,6 +490,7 @@ def pretty_print_elo_rating(rating):
     parser.add_argument("--category", nargs="+", default=["full"])
     parser.add_argument("--scale", type=float, default=1)
     parser.add_argument("--style-control", action="store_true")
+    parser.add_argument("--num-cpu", type=int, default=12)
     args = parser.parse_args()
 
     np.random.seed(42)
@@ -753,6 +529,7 @@ def pretty_print_elo_rating(rating):
         scale=args.scale,
         filter_func=filter_func,
         style_control=args.style_control,
+        num_cpu=args.num_cpu,
     )
 
     for cat in args.category: