
Commit 8addffb

simplify CLI args and add the ability to download hints via the CLI.
1 parent 7d988a8 commit 8addffb
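
After this change the launcher has two modes: run a single task with the simplified flags, or dump previously collected hints to a CSV. A rough sketch of the corresponding invocations, based on the module docstring and the new --download-hints flag in the diff below (the path hints/my_hints.csv is only an illustrative example):

    agentlab-mentor --benchmark miniwob --task-name miniwob.book-flight --seed 123 --no-headless
    agentlab-mentor --download-hints                       # writes extracted_hints.csv in the current directory
    agentlab-mentor --download-hints hints/my_hints.csv    # writes to an explicit path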

1 file changed (+103, -103 lines)

src/agentlab/agents/hitl_agent/launch_hint_ui.py

Lines changed: 103 additions & 103 deletions
@@ -2,7 +2,7 @@
 Console launcher for the Human-in-the-Loop Generic Agent UI.
 
 Usage (installed entry point):
-    agentlab-mentor --benchmark miniwob --task-name miniwob.book-flight --seed 123 --seed 456 --no-headless
+    agentlab-mentor --benchmark miniwob --task-name miniwob.book-flight --seed 123 --no-headless
 
 This will run a Study with the MultipleProposalGenericAgent and the selected task.
 """
@@ -11,159 +11,159 @@
 
 import argparse
 import logging
-import copy
-from typing import Optional
 
 import bgym
 
 from agentlab.agents.hitl_agent.generic_human_guided_agent import (
     HUMAN_GUIDED_GENERIC_AGENT,
 )
+from agentlab.experiments.exp_utils import RESULTS_DIR
 from agentlab.experiments.study import Study
+from pathlib import Path
 
-logger = logging.getLogger(__name__)
-
-def build_benchmark(
-    benchmark_name: str, task_name: Optional[str], seeds: Optional[list[int]], headless: bool
-):
+def build_benchmark(benchmark_name: str, task_name: str, seed: int, headless: bool):
     # Instantiate benchmark by name using BrowserGym registry
     try:
         benchmark = bgym.DEFAULT_BENCHMARKS[benchmark_name.lower()]()
     except KeyError as e:
         choices = ", ".join(sorted(bgym.DEFAULT_BENCHMARKS.keys()))
         raise SystemExit(f"Unknown benchmark '{benchmark_name}'. Choose one of: {choices}") from e
 
-    if task_name:
-        try:
-            benchmark = benchmark.subset_from_glob("task_name", task_name)
-            tasks = sorted({e.task_name for e in benchmark.env_args_list})
-            if not tasks:
-                msg = f"No tasks found matching pattern '{task_name}'."
-                logger.error(msg)
-                raise SystemExit(msg)
-            if len(tasks) > 1:
-                logger.warning(
-                    "Found %d tasks matching '%s'. Using only the first: %s",
-                    len(tasks),
-                    task_name,
-                    tasks[0],
-                )
-            task = tasks[0]
-        except SystemExit:
-            raise
-        except Exception as e:
-            logger.error(f"Error occurred while filtering tasks: {e}")
-            raise SystemExit(str(e))
-
-    # If specific seeds are provided, duplicate envs for each seed
-    if seeds is not None:
-        new_env_args_list = []
-        # If a specific task was selected above, duplicate that; otherwise, ensure there is exactly one task
-        if 'task' in locals():
-            task_env = next((x for x in benchmark.env_args_list if x.task_name == task), None)
-            if task_env is None:
-                msg = f"Internal error: selected task '{task}' not found in env list."
-                logger.error(msg)
-                raise SystemExit(msg)
-        else:
-            unique_tasks = sorted({e.task_name for e in benchmark.env_args_list})
-            if not unique_tasks:
-                raise SystemExit("No tasks available in the selected benchmark.")
-            if len(unique_tasks) > 1:
-                raise SystemExit(
-                    "Multiple tasks present in benchmark. Please specify --task-name to apply seeds to a single task."
-                )
-            task = unique_tasks[0]
-            task_env = next((x for x in benchmark.env_args_list if x.task_name == task), None)
-            if task_env is None:
-                raise SystemExit(f"Task '{task}' not found in env list.")
-
-        for seed in seeds:
-            ea = copy.deepcopy(task_env)
-            ea.task_seed = seed
-            new_env_args_list.append(ea)
-        benchmark.env_args_list = new_env_args_list
+    filtered_env_args = [
+        env_args for env_args in benchmark.env_args_list if env_args.task_name == task_name
+    ]
+    if not filtered_env_args:
+        raise SystemExit(f'No tasks found matching "{task_name}"')
+    filtered_env_args = filtered_env_args[:1]  # take the first one
+    benchmark.env_args_list = filtered_env_args
 
     # Reasonable defaults for interactive UI
     for env_args in benchmark.env_args_list:
-        env_args.max_steps = env_args.max_steps or 100
+        env_args.task_seed = seed
+        env_args.max_steps = env_args.max_steps or 200
         env_args.headless = headless
 
     return benchmark
 
 

49+
def extract_hints_from_experiment_trace(exp_dir):
50+
"""Extracts hints from every step of each episode in a exp_dir and returns a df with each row containing a hint.
51+
52+
Args:
53+
exp_dir: Path-like to a study/experiment directory whose results should be scanned.
54+
55+
Returns:
56+
pandas.DataFrame: One row per hint with metadata columns.
57+
"""
58+
import pandas as pd
59+
60+
from agentlab.analyze import inspect_results
61+
from agentlab.experiments.exp_utils import RESULTS_DIR
62+
from agentlab.experiments.loop import ExpResult
63+
64+
output = []
65+
# Use provided exp_dir if set; otherwise default to <$AGENTLAB_EXP_ROOT>/agentlab_mentor
66+
result_df = inspect_results.load_result_df(exp_dir or (RESULTS_DIR / "agentlab_mentor"))
67+
if result_df is None:
68+
# No results to parse; return empty dataframe with expected columns
69+
return pd.DataFrame(
70+
columns=[
71+
"exp_id",
72+
"agent_name",
73+
"benchmark",
74+
"task_name",
75+
"episode_reward",
76+
"hint",
77+
]
78+
)
79+
result_df = result_df.reset_index()
80+
for _, row in result_df.iterrows():
81+
result = ExpResult(row.exp_dir)
82+
episode = result.steps_info
83+
episode_reward = max([step.reward for step in episode])
84+
for step_info in episode:
85+
step_hints = step_info.agent_info.get("extra_info", {}).get("step_hints", None)
86+
if step_hints:
87+
for hint in step_hints:
88+
output.append(
89+
{
90+
"exp_id": row["exp_id"],
91+
"agent_name": row["agent.agent_name"],
92+
"benchmark": row["env.task_name"].split(".")[0],
93+
"task_name": row["env.task_name"],
94+
"episode_reward": episode_reward,
95+
"hint": hint,
96+
}
97+
)
98+
output = pd.DataFrame(output)
99+
output = output.dropna()
100+
return output
101+
102+
 def parse_args():
     p = argparse.ArgumentParser(description="Run HITL Generic Agent UI on a benchmark task")
     p.add_argument(
         "--benchmark",
-        required=True,
+        required=False,
         help="Benchmark name as registered in BrowserGym, e.g., miniwob, workarena_l1, webarena, visualwebarena",
     )
     p.add_argument(
         "--task-name",
         dest="task_name",
-        default=None,
-        help="Task name or glob to filter tasks within the benchmark (e.g., 'miniwob.*book*')",
+        required=False,
+        help="Exact task name within the benchmark (e.g., 'miniwob.book-flight')",
     )
     p.add_argument(
         "--seed",
-        action="append",
-        type=int,
-        default=None,
-        help="Task seed. Repeat flag for multiple seeds (e.g., --seed 1 --seed 2). If omitted, tasks keep their configured/random seed.",
-    )
-    p.add_argument(
-        "--jobs",
         type=int,
-        default=1,
-        help="Number of parallel jobs (UI agent typically runs sequentially)",
-    )
-    p.add_argument(
-        "--parallel-backend",
-        default="sequential",
-        choices=["sequential", "ray", "joblib"],
-        help="Parallel backend to use",
-    )
-    p.add_argument(
-        "--retries",
-        type=int,
-        default=1,
-        help="Number of relaunch attempts for incomplete experiments",
-    )
-    p.add_argument(
-        "--log-level",
-        default="WARNING",
-        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
-        help="Logging level",
+        required=False,
+        help="Task seed to use for the selected task.",
     )
     p.add_argument(
         "--headless",
         action=argparse.BooleanOptionalAction,
         default=True,
         help="Run the browser headless (default: True). Use --no-headless to show the browser.",
     )
+    p.add_argument(
+        "--download-hints",
+        nargs="?",
+        const="extracted_hints.csv",
+        required=False,
+        default=None,
+        metavar="[OUTPUT_CSV]",
+        help=(
+            "Extract hints from the default study directory and save to OUTPUT_CSV. "
+            "If OUTPUT_CSV is omitted, saves to 'extracted_hints.csv'. When provided, other args are ignored."
+        ),
+    )
     return p.parse_args()
 
 
 def main():
     args = parse_args()
-
-    logging_level = getattr(logging, args.log_level)
-
+    save_dir = RESULTS_DIR / "agentlab_mentor"
+    if args.download_hints:
+        df = extract_hints_from_experiment_trace(save_dir)
+        out_path = Path(args.download_hints)
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+        df.to_csv(out_path, index=False)
+        print(str(out_path))
+        return
+    # Validate required args only when not downloading hints
+    if not args.benchmark or not args.task_name or args.seed is None:
+        raise SystemExit(
+            "--benchmark, --task-name, and --seed are required unless using --download-hints"
+        )
     benchmark = build_benchmark(args.benchmark, args.task_name, args.seed, args.headless)
     agent_configs = [HUMAN_GUIDED_GENERIC_AGENT]
-
-    study = Study(
-        agent_configs,
-        benchmark,
-        logging_level=logging_level,
-        logging_level_stdout=logging_level,
-    )
-
+    # study is needed to run the 'set_benchmark' method which sets appropriate agent parameters.
+    study = Study(agent_args=agent_configs, benchmark=benchmark, logging_level=logging.WARNING)
     study.run(
-        n_jobs=args.jobs,
-        parallel_backend=args.parallel_backend,
-        n_relaunch=args.retries,
+        n_jobs=1,
+        parallel_backend="sequential",
+        n_relaunch=1,
+        exp_root=save_dir,
     )
 
 
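For context, a minimal sketch (not part of this commit) of how the CSV produced by --download-hints could be inspected. The column names come from extract_hints_from_experiment_trace above; the file name assumes the default output path.

import pandas as pd

# Columns written by extract_hints_from_experiment_trace:
# exp_id, agent_name, benchmark, task_name, episode_reward, hint
hints = pd.read_csv("extracted_hints.csv")

# Keep hints from successful episodes and count them per task.
successful = hints[hints["episode_reward"] > 0]
print(successful.groupby("task_name")["hint"].count())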
