11import fnmatch
2- import io
32import json
43import random
54import re
65import traceback
76import warnings
87from collections import defaultdict
9- from datetime import datetime
108from logging import warn
119from pathlib import Path
1210
1614from IPython .display import display
1715from tqdm import tqdm
1816
19- from agentlab .analyze .error_categorization import (
20- ERR_CLASS_MAP ,
21- is_critical_server_error ,
22- is_minor_server_error ,
23- )
2417from agentlab .experiments .exp_utils import RESULTS_DIR
25- from agentlab .utils .bootstrap import bootstrap_matrix , convert_df_to_array
2618
2719# TODO find a more portable way to code set_task_category_as_index at least
2820# handle dynamic imports. We don't want to always import workarena
2921# from browsergym.workarena import TASK_CATEGORY_MAP
3022
3123warnings .filterwarnings ("ignore" , category = pd .errors .PerformanceWarning )
3224
33- try :
34- import pyperclip
35- except ImportError :
36- pyperclip = None
37-
3825pd .set_option ("display.multi_sparse" , False )
3926
4027AGENT_NAME_KEY = "agent.agent_name"
@@ -224,17 +211,6 @@ def report_constant_and_variables(df, show_stack_traces=True):
224211 print (f" ...\n " )
225212
226213
def get_bootstrap(df, metric, reduce_fn=np.nanmean, n_bootstrap=100, group_by=TASK_KEY, prior=0.5):
    """Return the mean and standard deviation of bootstrapped values of ``metric``.

    The frame's index is reset (on a copy) and the rows are grouped by
    ``group_by``. The groups are turned into a 2-D array by
    ``convert_df_to_array`` and resampled by ``bootstrap_matrix``.

    Args:
        df: DataFrame holding the ``metric`` column and the ``group_by`` key
            (as a column or as an index level).
        metric: Name of the column to bootstrap.
        reduce_fn: Reduction forwarded to ``bootstrap_matrix``.
        n_bootstrap: Number of bootstrap draws forwarded to ``bootstrap_matrix``.
        group_by: Key(s) used to stratify the rows.
        prior: When not None, one extra column filled with this value is
            appended to the array before bootstrapping.

    Returns:
        Tuple ``(mean, std)`` computed with ``np.nanmean`` / ``np.nanstd`` over
        the bootstrapped values.
    """
    groups = df.reset_index(inplace=False).groupby(group_by)
    # NOTE(review): the meaning of threshold=0.7 is defined by
    # convert_df_to_array, which is not visible here -- confirm before changing.
    values = convert_df_to_array(groups, metric=metric, threshold=0.7)

    if prior is not None:
        # One prior entry per row of the array, added as a trailing column.
        prior_column = prior * np.ones((len(values), 1))
        values = np.concatenate([values, prior_column], axis=1)

    samples = bootstrap_matrix(values, n_bootstrap=n_bootstrap, reduce_fn=reduce_fn)
    return np.nanmean(samples), np.nanstd(samples)
237-
238214
239215def get_std_err (df , metric ):
240216 """Get the standard error for a binary metric."""
@@ -262,7 +238,7 @@ def get_sample_std_err(df, metric):
262238 return mean , std_err
263239
264240
265- def summarize (sub_df , use_bootstrap = False ):
241+ def summarize (sub_df ):
266242 if not "cum_reward" in sub_df :
267243 record = dict (
268244 avg_reward = np .nan ,
@@ -279,10 +255,7 @@ def summarize(sub_df, use_bootstrap=False):
279255 if n_completed == 0 :
280256 return None
281257
282- if use_bootstrap :
283- _mean_reward , std_reward = get_bootstrap (sub_df , "cum_reward" )
284- else :
285- _mean_reward , std_reward = get_std_err (sub_df , "cum_reward" )
258+ _mean_reward , std_reward = get_std_err (sub_df , "cum_reward" )
286259
287260 # sanity check, if there is an error the reward should be zero
288261 assert sub_df [sub_df ["err_msg" ].notnull ()]["cum_reward" ].sum () == 0
@@ -466,20 +439,6 @@ def _rename_bool_flags(report: pd.DataFrame, true_str="✓", false_str="-"):
466439 return report
467440
468441
def to_clipboard(df: pd.DataFrame) -> None:
    """Copy the dataframe to the clipboard as tab-separated CSV.

    The frame is serialised with its index, using a single tab character as
    the field separator. Copying is best effort: when the module-level
    ``pyperclip`` is None nothing is copied, and any exception raised by
    ``pyperclip.copy`` is logged as a warning instead of being propagated.

    Args:
        df: DataFrame to serialise and copy.
    """
    import logging

    buffer = io.StringIO()
    # The separator must be exactly one character; the csv writer used by
    # pandas rejects multi-character delimiters.
    df.to_csv(buffer, sep="\t", index=True)
    csv_string = buffer.getvalue()

    # pyperclip is an optional dependency; without it this is a silent no-op.
    if pyperclip is None:
        return

    try:
        pyperclip.copy(csv_string)
    except Exception as e:
        # Deliberately broad: clipboard backends fail in many ways and a
        # failed copy should never abort the calling analysis.
        logging.warning("Failed to copy to clipboard: %s", e)
482-
483442
484443def flag_report (report : pd .DataFrame , metric : str = "avg_reward" , round_digits : int = 2 ):
485444 # for all index in the multi-index with boolean value, get the average for
0 commit comments