2121pd .options .display .float_format = "{:.2f}" .format
2222
2323
# Keys looked up in each battle's "conv_metadata" when building style features.
# Layout matters: the first half lists the features of response A and the
# second half lists the matching features of response B in the same order,
# because construct_style_matrices pairs entry i with entry i + len/2.
STYLE_CONTROL_ELEMENTS_V1 = [
    "sum_assistant_a_tokens",
    "header_count_a",
    "list_count_a",
    "bold_count_a",
    "sum_assistant_b_tokens",
    "header_count_b",
    "list_count_b",
    "bold_count_b",
]
34+
35+
2436def compute_elo (battles , K = 4 , SCALE = 400 , BASE = 10 , INIT_RATING = 1000 ):
2537 rating = defaultdict (lambda : INIT_RATING )
2638
@@ -399,6 +411,109 @@ def outlier_detect(
399411 return battles
400412
401413
def fit_mle_elo(X, Y, models, indices=None, SCALE=400, INIT_RATING=1000):
    """Fit ratings and extra feature coefficients with a logistic regression.

    Args:
        X: 2-D feature array. The first ``len(models)`` columns are treated as
            the per-model columns; every remaining column is treated as an
            extra (style) feature.
        Y: 1-D array of binary outcomes, one per row of ``X``.
        models: ``pd.Series`` whose index holds the model names and whose
            values are the positions of those models among the columns of
            ``X``.
        indices: Optional row selector (list or numpy array). When given and
            non-empty, the regression is fitted on ``X[indices]`` and
            ``Y[indices]``; otherwise all rows are used.
        SCALE: Multiplier applied to the fitted coefficients.
        INIT_RATING: Offset added to the scaled coefficients.

    Returns:
        A tuple ``(ratings, extra_coefficients)``: ``ratings`` is a
        ``pd.Series`` indexed by model name and sorted from highest to lowest,
        ``extra_coefficients`` is the raw (unscaled) coefficient array of the
        columns after the first ``len(models)``.
    """
    from sklearn.linear_model import LogisticRegression

    num_models = len(models.index)

    lr = LogisticRegression(fit_intercept=False)
    # Compare against None explicitly: the truth value of a multi-element
    # numpy index array is ambiguous and `if indices:` would raise ValueError.
    # An empty selector keeps its historical meaning of "use every row".
    if indices is not None and len(indices) > 0:
        lr.fit(X[indices], Y[indices])
    else:
        lr.fit(X, Y)

    coefficients = lr.coef_[0]
    elo_scores = SCALE * coefficients + INIT_RATING

    # Anchor the scale: shift all scores so that mixtral-8x7b-instruct-v0.1
    # sits at 1114 whenever that model is part of the fit.
    anchor_model = "mixtral-8x7b-instruct-v0.1"
    if anchor_model in models.index:
        elo_scores += 1114 - elo_scores[models[anchor_model]]

    ratings = pd.Series(elo_scores[:num_models], index=models.index)
    return (
        ratings.sort_values(ascending=False),
        coefficients[num_models:],
    )
433+
434+
def _style_feature_total(metadata, element):
    """Return ``metadata[element]`` as a number, summing the values of a dict."""
    value = metadata[element]
    return sum(value.values()) if isinstance(value, dict) else value


def construct_style_matrices(
    df,
    BASE=10,
    apply_ratio=None,
    style_elements=None,
    add_one=True,
):
    """Build the design matrix and labels for a style-controlled rating fit.

    Every battle is duplicated, so the result has ``2 * len(df)`` rows: a
    decisive battle contributes two identical rows, and a tie contributes one
    "A wins" row (first copy) and one "B wins" row (second copy).

    Args:
        df: Battles with columns ``"model_a"``, ``"model_b"``, ``"winner"``
            and ``"conv_metadata"`` (a mapping per row holding the style
            fields named in ``style_elements``; a field may be a number or a
            dict of numbers, which is summed).
        BASE: Base whose natural logarithm is written into the model columns
            (``+log(BASE)`` for model A, ``-log(BASE)`` for model B).
        apply_ratio: Per-feature flags; for every non-zero flag the A-minus-B
            difference is divided by the A-plus-B sum. Defaults to
            ``[1, 1, 1, 1]``.
        style_elements: Metadata field names; the first half belongs to
            response A and the second half to response B, in matching order.
            Defaults to ``STYLE_CONTROL_ELEMENTS_V1``.
        add_one: Add 1 to every A-plus-B sum before it is used as a divisor.

    Returns:
        ``(X, Y, models)`` where ``X`` has shape ``(2 * len(df), p + k)`` with
        ``p`` model columns followed by ``k`` standardized style columns,
        ``Y`` holds 1.0 where the row counts as a win for model A and 0.0
        otherwise, and ``models`` is a ``pd.Series`` mapping each model name
        to its column position in ``X``.

    Raises:
        ValueError: If ``style_elements`` has an odd number of entries.
    """
    # Resolve defaults at call time instead of sharing mutable default objects.
    if apply_ratio is None:
        apply_ratio = [1, 1, 1, 1]
    if style_elements is None:
        style_elements = STYLE_CONTROL_ELEMENTS_V1
    if len(style_elements) % 2 != 0:
        raise ValueError("style_elements must list A features then B features")

    # Derive the model index from the battles passed in, not from a global.
    model_names = pd.concat([df["model_a"], df["model_b"]]).unique()
    models = pd.Series(np.arange(len(model_names)), index=model_names)

    # duplicate battles
    df = pd.concat([df, df], ignore_index=True)
    p = len(models.index)
    n = df.shape[0]
    k = len(style_elements) // 2

    X = np.zeros([n, p + k])
    rows = np.arange(n)
    X[rows, df["model_a"].map(models).to_numpy(dtype=int)] = +math.log(BASE)
    X[rows, df["model_b"].map(models).to_numpy(dtype=int)] = -math.log(BASE)

    # One row per style element, one column per (duplicated) battle.
    style_vector = np.array(
        [
            [_style_feature_total(meta, element) for meta in df["conv_metadata"]]
            for element in style_elements
        ],
        dtype=float,
    ).reshape(len(style_elements), n)

    style_diff = style_vector[:k] - style_vector[k:]
    style_sum = style_vector[:k] + style_vector[k:]

    if add_one:
        style_sum = style_sum + 1.0

    # Apply ratio where necessary (length, etc)
    ratio_rows = np.flatnonzero(apply_ratio)
    style_diff[ratio_rows] /= style_sum[ratio_rows]

    style_mean = style_diff.mean(axis=1, keepdims=True)
    style_std = style_diff.std(axis=1, keepdims=True)
    # A constant feature has zero spread; dividing by 1 keeps its column at 0
    # instead of filling it with NaN.
    style_std[style_std == 0] = 1.0

    # Index from p rather than -k so that k == 0 selects no columns.
    X[:, p:] = ((style_diff - style_mean) / style_std).T

    # one A win => two A win
    Y = np.zeros(n)
    Y[(df["winner"] == "model_a").to_numpy()] = 1.0

    # one tie => one A win + one B win: only the first copy of a tie
    # ("tie" or "tie (bothbad)") is labelled as a win for A.
    is_tie = df["winner"].isin(["tie", "tie (bothbad)"]).to_numpy()
    in_first_copy = rows < n // 2
    Y[is_tie & in_first_copy] = 1.0

    return X, Y, models
496+
497+
def get_bootstrap_result_style_control(X, Y, models, func_compute_elo, num_round=1000):
    """Bootstrap ratings and style coefficients from a duplicated design matrix.

    ``X`` and ``Y`` are expected in the layout where the second half of the
    rows mirrors the first half battle by battle (row ``i`` and row
    ``i + len(X) // 2`` describe the same battle). Battles are resampled with
    replacement and both rows of each drawn battle are kept together, so a
    tie keeps contributing one win for each side.

    Args:
        X: 2-D feature array whose first ``len(models)`` columns are the
            per-model columns and whose remaining columns are style features.
        Y: 1-D label array with one entry per row of ``X``.
        models: ``pd.Series`` indexed by model name, ordered like the model
            columns of ``X``.
        func_compute_elo: Callable invoked as
            ``func_compute_elo(X_sample, Y_sample, models=...)`` returning
            ``(ratings, coefficients)``.
        num_round: Number of bootstrap rounds.

    Returns:
        ``(ratings_df, coefs)``: a DataFrame with one row per round whose
        columns are ordered by descending median rating, and the list of
        coefficient results, one per round.

    Raises:
        ValueError: If ``X`` has an odd number of rows or ``Y`` has a
            different length than ``X``.
    """
    num_rows, num_cols = X.shape
    if num_rows % 2 != 0 or num_rows != len(Y):
        raise ValueError(
            "X and Y must have the same, even number of rows (duplicated battles)"
        )
    # Size the resample from X itself instead of relying on a global `battles`.
    num_battles = num_rows // 2
    num_models = len(models)
    style_columns = np.arange(num_models, num_cols)

    elos = []
    coefs = []
    for _ in tqdm(range(num_round), desc="bootstrap"):
        drawn = np.random.choice(num_battles, size=num_battles, replace=True)
        # Take each drawn battle together with its mirrored duplicate.
        sample_rows = np.concatenate([drawn, drawn + num_battles])
        _X = X[sample_rows]
        _Y = Y[sample_rows]

        # Models with no battle in this resample have an all-zero column.
        # Drop those columns and renumber the remaining models so that model
        # names, column positions and fitted coefficients stay aligned.
        present = _X[:, :num_models].any(axis=0)
        kept_columns = np.concatenate([np.flatnonzero(present), style_columns])
        sampled_models = pd.Series(
            np.arange(int(present.sum())), index=models.index[present]
        )

        elo, coef = func_compute_elo(
            _X[:, kept_columns], _Y, models=sampled_models
        )
        elos.append(elo)
        coefs.append(coef)

    df = pd.DataFrame(elos)
    return df[df.median().sort_values(ascending=False).index], coefs
515+
516+
402517def filter_long_conv (row ):
403518 threshold = 768
404519 for conversation_type in ["conversation_a" , "conversation_b" ]:
@@ -421,6 +536,7 @@ def report_elo_analysis_results(
421536 run_outlier_detect = False ,
422537 scale = 1 ,
423538 filter_func = lambda x : True ,
539+ style_control = False ,
424540):
425541 battles = pd .DataFrame (battles_json )
426542
@@ -461,10 +577,17 @@ def report_elo_analysis_results(
461577 elo_rating_online = compute_elo (battles )
462578
463579 if rating_system == "bt" :
464- bootstrap_df = get_bootstrap_result (
465- battles , compute_elo_mle_with_tie , num_round = num_bootstrap
466- )
467- elo_rating_final = compute_elo_mle_with_tie (battles )
580+ if style_control :
581+ X , Y , models = construct_style_matrices (battles )
582+ bootstrap_df , boostrap_coef = get_bootstrap_result_style_control (
583+ X , Y , models , fit_mle_elo , num_round = num_bootstrap
584+ )
585+ elo_rating_final , coef_final = fit_mle_elo (X , Y , models )
586+ else :
587+ bootstrap_df = get_bootstrap_result (
588+ battles , compute_elo_mle_with_tie , num_round = num_bootstrap
589+ )
590+ elo_rating_final = compute_elo_mle_with_tie (battles )
468591 elif rating_system == "elo" :
469592 bootstrap_df = get_bootstrap_result (
470593 battles , compute_elo , num_round = num_bootstrap
@@ -538,6 +661,12 @@ def report_elo_analysis_results(
538661 "last_updated_tstamp" : last_updated_tstamp ,
539662 "bootstrap_df" : bootstrap_df ,
540663 "leaderboard_table_df" : leaderboard_table_df ,
664+ "style_coefficients" : {
665+ "bootstrap" : np .vstack (boostrap_coef ),
666+ "final" : coef_final ,
667+ }
668+ if rating_system == "bt" and style_control
669+ else {},
541670 }
542671
543672
@@ -565,6 +694,7 @@ def pretty_print_elo_rating(rating):
565694 parser .add_argument ("--run-outlier-detect" , action = "store_true" , default = False )
566695 parser .add_argument ("--category" , nargs = "+" , default = ["full" ])
567696 parser .add_argument ("--scale" , type = float , default = 1 )
697+ parser .add_argument ("--style-control" , action = "store_true" )
568698 args = parser .parse_args ()
569699
570700 np .random .seed (42 )
@@ -602,6 +732,7 @@ def pretty_print_elo_rating(rating):
602732 run_outlier_detect = args .run_outlier_detect ,
603733 scale = args .scale ,
604734 filter_func = filter_func ,
735+ style_control = args .style_control ,
605736 )
606737
607738 for cat in args .category :
0 commit comments