2 | 2 | Console launcher for the Human-in-the-Loop Generic Agent UI.
3 | 3 |
4 | 4 | Usage (installed entry point):
5 | | - agentlab-mentor --benchmark miniwob --task-name miniwob.book-flight --seed 123 --seed 456 --no-headless
| 5 | + agentlab-mentor --benchmark miniwob --task-name miniwob.book-flight --seed 123 --no-headless
6 | 6 |
7 | 7 | This will run a Study with the MultipleProposalGenericAgent and the selected task.
8 | 8 | """
11 | 11 |
12 | 12 | import argparse |
13 | 13 | import logging |
14 | | -import copy |
15 | | -from typing import Optional |
16 | 14 |
17 | 15 | import bgym |
18 | 16 |
19 | 17 | from agentlab.agents.hitl_agent.generic_human_guided_agent import ( |
20 | 18 | HUMAN_GUIDED_GENERIC_AGENT, |
21 | 19 | ) |
| 20 | +from agentlab.experiments.exp_utils import RESULTS_DIR |
22 | 21 | from agentlab.experiments.study import Study |
| 22 | +from pathlib import Path |
23 | 23 |
24 | | -logger = logging.getLogger(__name__) |
25 | | - |
26 | | -def build_benchmark( |
27 | | - benchmark_name: str, task_name: Optional[str], seeds: Optional[list[int]], headless: bool |
28 | | -): |
| 24 | +def build_benchmark(benchmark_name: str, task_name: str, seed: int, headless: bool): |
29 | 25 | # Instantiate benchmark by name using BrowserGym registry |
30 | 26 | try: |
31 | 27 | benchmark = bgym.DEFAULT_BENCHMARKS[benchmark_name.lower()]() |
32 | 28 | except KeyError as e: |
33 | 29 | choices = ", ".join(sorted(bgym.DEFAULT_BENCHMARKS.keys())) |
34 | 30 | raise SystemExit(f"Unknown benchmark '{benchmark_name}'. Choose one of: {choices}") from e |
35 | 31 |
36 | | - if task_name: |
37 | | - try: |
38 | | - benchmark = benchmark.subset_from_glob("task_name", task_name) |
39 | | - tasks = sorted({e.task_name for e in benchmark.env_args_list}) |
40 | | - if not tasks: |
41 | | - msg = f"No tasks found matching pattern '{task_name}'." |
42 | | - logger.error(msg) |
43 | | - raise SystemExit(msg) |
44 | | - if len(tasks) > 1: |
45 | | - logger.warning( |
46 | | - "Found %d tasks matching '%s'. Using only the first: %s", |
47 | | - len(tasks), |
48 | | - task_name, |
49 | | - tasks[0], |
50 | | - ) |
51 | | - task = tasks[0] |
52 | | - except SystemExit: |
53 | | - raise |
54 | | - except Exception as e: |
55 | | - logger.error(f"Error occurred while filtering tasks: {e}") |
56 | | - raise SystemExit(str(e)) |
57 | | - |
58 | | - # If specific seeds are provided, duplicate envs for each seed |
59 | | - if seeds is not None: |
60 | | - new_env_args_list = [] |
61 | | - # If a specific task was selected above, duplicate that; otherwise, ensure there is exactly one task |
62 | | - if 'task' in locals(): |
63 | | - task_env = next((x for x in benchmark.env_args_list if x.task_name == task), None) |
64 | | - if task_env is None: |
65 | | - msg = f"Internal error: selected task '{task}' not found in env list." |
66 | | - logger.error(msg) |
67 | | - raise SystemExit(msg) |
68 | | - else: |
69 | | - unique_tasks = sorted({e.task_name for e in benchmark.env_args_list}) |
70 | | - if not unique_tasks: |
71 | | - raise SystemExit("No tasks available in the selected benchmark.") |
72 | | - if len(unique_tasks) > 1: |
73 | | - raise SystemExit( |
74 | | - "Multiple tasks present in benchmark. Please specify --task-name to apply seeds to a single task." |
75 | | - ) |
76 | | - task = unique_tasks[0] |
77 | | - task_env = next((x for x in benchmark.env_args_list if x.task_name == task), None) |
78 | | - if task_env is None: |
79 | | - raise SystemExit(f"Task '{task}' not found in env list.") |
80 | | - |
81 | | - for seed in seeds: |
82 | | - ea = copy.deepcopy(task_env) |
83 | | - ea.task_seed = seed |
84 | | - new_env_args_list.append(ea) |
85 | | - benchmark.env_args_list = new_env_args_list |
| 32 | + filtered_env_args = [ |
| 33 | + env_args for env_args in benchmark.env_args_list if env_args.task_name == task_name |
| 34 | + ] |
| 35 | + if not filtered_env_args: |
| 36 | + raise SystemExit(f'No tasks found matching "{task_name}"') |
| 37 | + filtered_env_args = filtered_env_args[:1] # take the first one |
| 38 | + benchmark.env_args_list = filtered_env_args |
86 | 39 |
87 | 40 | # Reasonable defaults for interactive UI |
88 | 41 | for env_args in benchmark.env_args_list: |
89 | | - env_args.max_steps = env_args.max_steps or 100 |
| 42 | + env_args.task_seed = seed |
| 43 | + env_args.max_steps = env_args.max_steps or 200 |
90 | 44 | env_args.headless = headless |
91 | 45 |
92 | 46 | return benchmark |
93 | 47 |
94 | 48 |
| 49 | +def extract_hints_from_experiment_trace(exp_dir): |
| 50 | + """Extracts hints from every step of each episode in an experiment directory and returns a DataFrame with one row per hint.
| 51 | +
| 52 | + Args: |
| 53 | + exp_dir: Path-like pointing to a study/experiment directory whose results should be scanned.
| 54 | +
| 55 | + Returns: |
| 56 | + pandas.DataFrame: One row per hint with metadata columns. |
| 57 | + """ |
| 58 | + import pandas as pd |
| 59 | + |
| 60 | + from agentlab.analyze import inspect_results |
| 61 | + from agentlab.experiments.exp_utils import RESULTS_DIR |
| 62 | + from agentlab.experiments.loop import ExpResult |
| 63 | + |
| 64 | + output = [] |
| 65 | + # Use provided exp_dir if set; otherwise default to <$AGENTLAB_EXP_ROOT>/agentlab_mentor |
| 66 | + result_df = inspect_results.load_result_df(exp_dir or (RESULTS_DIR / "agentlab_mentor")) |
| 67 | + if result_df is None: |
| 68 | + # No results to parse; return empty dataframe with expected columns |
| 69 | + return pd.DataFrame( |
| 70 | + columns=[ |
| 71 | + "exp_id", |
| 72 | + "agent_name", |
| 73 | + "benchmark", |
| 74 | + "task_name", |
| 75 | + "episode_reward", |
| 76 | + "hint", |
| 77 | + ] |
| 78 | + ) |
| 79 | + result_df = result_df.reset_index() |
| 80 | + for _, row in result_df.iterrows(): |
| 81 | + result = ExpResult(row.exp_dir) |
| 82 | + episode = result.steps_info |
| 83 | + episode_reward = max([step.reward for step in episode]) |
| 84 | + for step_info in episode: |
| 85 | + step_hints = step_info.agent_info.get("extra_info", {}).get("step_hints", None) |
| 86 | + if step_hints: |
| 87 | + for hint in step_hints: |
| 88 | + output.append( |
| 89 | + { |
| 90 | + "exp_id": row["exp_id"], |
| 91 | + "agent_name": row["agent.agent_name"], |
| 92 | + "benchmark": row["env.task_name"].split(".")[0], |
| 93 | + "task_name": row["env.task_name"], |
| 94 | + "episode_reward": episode_reward, |
| 95 | + "hint": hint, |
| 96 | + } |
| 97 | + ) |
| 98 | + output = pd.DataFrame(output) |
| 99 | + output = output.dropna() |
| 100 | + return output |
| 101 | + |
| 102 | + |
95 | 103 | def parse_args(): |
96 | 104 | p = argparse.ArgumentParser(description="Run HITL Generic Agent UI on a benchmark task") |
97 | 105 | p.add_argument( |
98 | 106 | "--benchmark", |
99 | | - required=True, |
| 107 | + required=False, |
100 | 108 | help="Benchmark name as registered in BrowserGym, e.g., miniwob, workarena_l1, webarena, visualwebarena", |
101 | 109 | ) |
102 | 110 | p.add_argument( |
103 | 111 | "--task-name", |
104 | 112 | dest="task_name", |
105 | | - default=None, |
106 | | - help="Task name or glob to filter tasks within the benchmark (e.g., 'miniwob.*book*')", |
| 113 | + required=False, |
| 114 | + help="Exact task name within the benchmark (e.g., 'miniwob.book-flight')", |
107 | 115 | ) |
108 | 116 | p.add_argument( |
109 | 117 | "--seed", |
110 | | - action="append", |
111 | | - type=int, |
112 | | - default=None, |
113 | | - help="Task seed. Repeat flag for multiple seeds (e.g., --seed 1 --seed 2). If omitted, tasks keep their configured/random seed.", |
114 | | - ) |
115 | | - p.add_argument( |
116 | | - "--jobs", |
117 | 118 | type=int, |
118 | | - default=1, |
119 | | - help="Number of parallel jobs (UI agent typically runs sequentially)", |
120 | | - ) |
121 | | - p.add_argument( |
122 | | - "--parallel-backend", |
123 | | - default="sequential", |
124 | | - choices=["sequential", "ray", "joblib"], |
125 | | - help="Parallel backend to use", |
126 | | - ) |
127 | | - p.add_argument( |
128 | | - "--retries", |
129 | | - type=int, |
130 | | - default=1, |
131 | | - help="Number of relaunch attempts for incomplete experiments", |
132 | | - ) |
133 | | - p.add_argument( |
134 | | - "--log-level", |
135 | | - default="WARNING", |
136 | | - choices=["DEBUG", "INFO", "WARNING", "ERROR"], |
137 | | - help="Logging level", |
| 119 | + required=False, |
| 120 | + help="Task seed to use for the selected task.", |
138 | 121 | ) |
139 | 122 | p.add_argument( |
140 | 123 | "--headless", |
141 | 124 | action=argparse.BooleanOptionalAction, |
142 | 125 | default=True, |
143 | 126 | help="Run the browser headless (default: True). Use --no-headless to show the browser.", |
144 | 127 | ) |
| 128 | + p.add_argument( |
| 129 | + "--download-hints", |
| 130 | + nargs="?", |
| 131 | + const="extracted_hints.csv", |
| 132 | + required=False, |
| 133 | + default=None, |
| 134 | + metavar="[OUTPUT_CSV]", |
| 135 | + help=( |
| 136 | + "Extract hints from the default study directory and save to OUTPUT_CSV. " |
| 137 | + "If OUTPUT_CSV is omitted, saves to 'extracted_hints.csv'. When this flag is given, all other arguments are ignored."
| 138 | + ), |
| 139 | + ) |
145 | 140 | return p.parse_args() |
146 | 141 |
147 | 142 |
148 | 143 | def main(): |
149 | 144 | args = parse_args() |
150 | | - |
151 | | - logging_level = getattr(logging, args.log_level) |
152 | | - |
| 145 | + save_dir = RESULTS_DIR / "agentlab_mentor" |
| 146 | + if args.download_hints: |
| 147 | + df = extract_hints_from_experiment_trace(save_dir) |
| 148 | + out_path = Path(args.download_hints) |
| 149 | + out_path.parent.mkdir(parents=True, exist_ok=True) |
| 150 | + df.to_csv(out_path, index=False) |
| 151 | + print(str(out_path)) |
| 152 | + return |
| 153 | + # Validate required args only when not downloading hints |
| 154 | + if not args.benchmark or not args.task_name or args.seed is None: |
| 155 | + raise SystemExit( |
| 156 | + "--benchmark, --task-name, and --seed are required unless using --download-hints" |
| 157 | + ) |
153 | 158 | benchmark = build_benchmark(args.benchmark, args.task_name, args.seed, args.headless) |
154 | 159 | agent_configs = [HUMAN_GUIDED_GENERIC_AGENT] |
155 | | - |
156 | | - study = Study( |
157 | | - agent_configs, |
158 | | - benchmark, |
159 | | - logging_level=logging_level, |
160 | | - logging_level_stdout=logging_level, |
161 | | - ) |
162 | | - |
| 160 | + # A Study is used here because it calls 'set_benchmark', which sets the appropriate agent parameters.
| 161 | + study = Study(agent_args=agent_configs, benchmark=benchmark, logging_level=logging.WARNING) |
163 | 162 | study.run( |
164 | | - n_jobs=args.jobs, |
165 | | - parallel_backend=args.parallel_backend, |
166 | | - n_relaunch=args.retries, |
| 163 | + n_jobs=1, |
| 164 | + parallel_backend="sequential", |
| 165 | + n_relaunch=1, |
| 166 | + exp_root=save_dir, |
167 | 167 | ) |
168 | 168 |
169 | 169 |
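A quick usage sketch of the two modes this change supports (illustrative only; the agentlab-mentor entry point, flag names, and defaults are taken from the module docstring and argparse definitions above, and my_hints.csv is just a placeholder file name):

    # Run the HITL UI on a single task and seed, with a visible browser window
    agentlab-mentor --benchmark miniwob --task-name miniwob.book-flight --seed 123 --no-headless

    # Afterwards, export the hints recorded under the default results directory
    # (RESULTS_DIR / "agentlab_mentor"); omitting the value writes to extracted_hints.csv,
    # and all other arguments are ignored when this flag is given
    agentlab-mentor --download-hints my_hints.csv

The exported CSV carries the columns built in extract_hints_from_experiment_trace above: exp_id, agent_name, benchmark, task_name, episode_reward, and hint.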