|
| 1 | +"""Minimal Human Trace Agent (<200 lines) |
| 2 | +
|
| 3 | +Per step we capture ONLY: |
| 4 | + - axtree_txt, pruned_html, actions.json, screenshot.png, after.html |
| 5 | + - Auto-resume after detecting user action |
| 6 | + - Visible recording indicator |
| 7 | +""" |
| 8 | + |
1 | 9 | from __future__ import annotations |
2 | 10 |
|
3 | | -import logging |
4 | | -import textwrap |
| 11 | +import json |
| 12 | +import time |
| 13 | +import zipfile |
5 | 14 | from dataclasses import dataclass |
| 15 | +from pathlib import Path |
6 | 16 |
|
7 | 17 | import bgym |
8 | 18 | from playwright.sync_api import Page |
9 | 19 |
|
10 | 20 | from agentlab.agents.agent_args import AgentArgs |
11 | | - |
12 | | -logger = logging.getLogger(__name__) |
13 | | - |
14 | | - |
15 | | -# --------------------------------------------------------------------------- |
16 | | -# Simplified variant: capture human step (trace + screenshot + html) only |
17 | | -# --------------------------------------------------------------------------- |
| 21 | +from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html |
18 | 22 |
|
19 | 23 |
|
@dataclass
class HumanTraceAgentArgs(AgentArgs):
    """Declarative configuration from which a ``HumanTraceAgent`` is built.

    Attributes are plain dataclass fields:

    * ``agent_name`` -- label for this agent configuration.
    * ``trace_dir`` -- directory handed to ``HumanTraceAgent`` as its trace root.
    * ``use_raw_page_output`` -- defaults to True; the agent's own error message
      asks for ``use_raw_page_output=True`` when no Playwright page reaches it.
    """

    agent_name: str = "HumanTraceAgent"
    trace_dir: str = "human_traces"
    use_raw_page_output: bool = True

    def make_agent(self) -> bgym.Agent:  # type: ignore[override]
        """Instantiate the recorder agent using the configured ``trace_dir``."""
        recorder = HumanTraceAgent(self.trace_dir)
        return recorder

    def set_reproducibility_mode(self):
        """Intentionally a no-op: this configuration has nothing to adjust."""
        return None
47 | 35 |
|
48 | 36 |
|
49 | | -class SimpleHumanTraceCaptureAgent(bgym.Agent): |
50 | | - """Minimal human-in-the-loop recorder. |
51 | | -
|
52 | | - On each get_action: |
53 | | - 1. Start a Playwright tracing capture (if not already running for this step). |
54 | | - 2. Call page.pause() to open Inspector; user performs EXACTLY one logical action. |
55 | | - 3. Stop tracing, save trace zip, screenshot (after action), and HTML snapshot. |
56 | | - 4. Return noop() so the environment advances. |
57 | | -
|
58 | | - Artifacts are stored under trace_dir/step_<n>/ |
59 | | - """ |
60 | | - |
61 | | - def __init__(self, trace_dir: str, screenshots: bool, snapshots: bool, sources: bool): |
class HumanTraceAgent(bgym.Agent):
    """Record what a human does in the browser, one action per environment step.

    For every ``get_action`` call the agent:

    1. shows a small "REC" badge in the page,
    2. opens a Playwright trace chunk,
    3. polls the page until a click / keydown / input / change event is seen
       (or the document is replaced, or the timeout expires),
    4. closes the chunk and dumps every trace event to ``actions.json``,
    5. saves ``screenshot.png`` and ``after.html``,
    6. returns ``"noop()"`` so the environment advances.

    Artifacts are written to ``<trace_dir>/<task_name>/<seed>/step_XXXX/``.
    All capture work is best-effort: failures are reported with a
    ``[HumanTrace][WARN]`` line on stdout instead of aborting the step.
    """

    # JavaScript snippets are passed to page.evaluate() as function strings.
    _SHOW_INDICATOR_JS = """
    () => {
        // Drop a stale badge first so a failed cleanup never leaves duplicates.
        document.getElementById('__rec')?.remove();
        const div = document.createElement('div');
        div.id = '__rec';
        div.innerHTML = '🔴 REC';
        div.style.cssText = 'position:fixed;top:5px;right:5px;background:#f44;color:#fff;padding:5px 8px;border-radius:4px;font:bold 12px monospace;z-index:99999';
        (document.body || document.documentElement).appendChild(div);
    }
    """
    _HIDE_INDICATOR_JS = "() => { document.getElementById('__rec')?.remove(); }"
    _SAVING_INDICATOR_JS = """
    () => {
        const el = document.getElementById('__rec');
        if (el) el.innerHTML = '💾 SAVING';
    }
    """
    _ARM_LISTENERS_JS = """
    () => {
        window.__acted = false;
        // Register the capture-phase listeners only once per document.
        if (window.__actedHooked) return;
        window.__actedHooked = true;
        ['click','keydown','input','change'].forEach(e =>
            document.addEventListener(e, () => { window.__acted = true; }, true)
        );
    }
    """

    def __init__(self, trace_dir: str):
        """Create the agent and make sure ``trace_dir`` exists.

        Args:
            trace_dir: Root directory under which all step artifacts are stored.
        """
        self.action_set = bgym.HighLevelActionSet(["bid"], multiaction=False)
        self._root = Path(trace_dir)
        self._root.mkdir(parents=True, exist_ok=True)
        self._page: Page | None = None
        self._step = 0
        self._task_name = None
        self._seed = None
        # tracing.start() may only be called once per context; later steps
        # must use start_chunk()/stop_chunk() only.
        self._tracing_started = False

    @staticmethod
    def _warn(message: str) -> None:
        """Print a best-effort failure notice without interrupting the step."""
        print(f"[HumanTrace][WARN] {message}")

    @staticmethod
    def _lookup(obs: dict, key: str, default: str):
        """Return ``obs[key]``, else ``obs["task"][key]``, else ``default``.

        Only ``None`` and the empty string count as missing, so a legitimate
        falsy value such as seed ``0`` is preserved.
        """

        def missing(value) -> bool:
            return value is None or (isinstance(value, str) and not value)

        value = obs.get(key)
        if missing(value):
            task = obs.get("task")
            value = task.get(key) if isinstance(task, dict) else None
        return default if missing(value) else value

    def obs_preprocessor(self, obs: dict):  # type: ignore[override]
        """Cache the Playwright page and replace raw objects with text views.

        The first ``page`` seen is remembered for ``get_action``. Task name and
        seed are read once from the observation. ``axtree_object`` and
        ``dom_object`` are converted to ``axtree_txt`` / ``pruned_html`` and
        then removed from ``obs`` together with ``page``.

        Args:
            obs: Observation from the environment; mutated in place when it is
                a dict, returned untouched otherwise.

        Returns:
            The same ``obs`` object.
        """
        if not isinstance(obs, dict):
            return obs

        if self._page is None and "page" in obs:
            self._page = obs["page"]

        if self._task_name is None:
            self._task_name = self._lookup(obs, "task_name", "unknown_task")
        if self._seed is None:
            self._seed = self._lookup(obs, "seed", "unknown_seed")

        axt = obs.get("axtree_object")
        if axt is not None:
            try:
                obs["axtree_txt"] = flatten_axtree_to_str(axt)
            except Exception as exc:  # best-effort: keep the step alive
                self._warn(f"axtree flattening failed: {exc}")

        dom = obs.get("dom_object")
        if dom is not None:
            try:
                obs["pruned_html"] = prune_html(flatten_dom_to_str(dom))
            except Exception as exc:  # best-effort: keep the step alive
                self._warn(f"DOM pruning failed: {exc}")

        for key in ("dom_object", "axtree_object", "page"):
            obs.pop(key, None)
        return obs

    def get_action(self, obs: dict):  # type: ignore[override]
        """Record one human action and return ``"noop()"`` plus step metadata.

        Args:
            obs: Current (pre-processed) observation; not read here because the
                page was cached by ``obs_preprocessor``.

        Returns:
            A tuple ``("noop()", {"extra_info": {...}})`` where ``extra_info``
            holds the step index, task name, seed and the step directory.

        Raises:
            RuntimeError: If no Playwright page has been seen yet.
        """
        if self._page is None:
            raise RuntimeError("Playwright Page missing; ensure use_raw_page_output=True")

        page = self._page
        step = self._step
        step_dir = self._step_dir(step)
        step_dir.mkdir(parents=True, exist_ok=True)

        print(
            f"[HumanTrace] Task: {self._task_name}, Seed: {self._seed}, "
            f"Step {step}: Perform ONE action"
        )

        self._run_js(page, self._SHOW_INDICATOR_JS, "showing recording indicator")
        self._start_trace_chunk(page)

        if not self._wait_for_action(page):
            self._warn(f"no user action detected during step {step}; saving anyway")

        self._save_actions(page, step_dir)
        # Remove the badge before the screenshot so it is not captured.
        self._run_js(page, self._HIDE_INDICATOR_JS, "removing recording indicator")
        self._save_page_state(page, step_dir)

        self._step += 1
        return "noop()", {
            "extra_info": {
                "step": step,
                "task_name": self._task_name,
                "seed": self._seed,
                "trace_dir": str(step_dir),
            }
        }

    def _step_dir(self, step: int) -> Path:
        """Return ``trace_dir/task_name/seed/step_XXXX`` for ``step``."""
        task = self._task_name if self._task_name is not None else "unknown_task"
        # Explicit None check: seed 0 is a valid seed and must keep its own folder.
        seed = self._seed if self._seed is not None else "unknown_seed"
        return self._root / str(task) / str(seed) / f"step_{step:04d}"

    def _run_js(self, page, script: str, what: str) -> bool:
        """Evaluate cosmetic JavaScript; warn and return False when it fails."""
        try:
            page.evaluate(script)
        except Exception as exc:  # e.g. evaluated while the page is navigating
            self._warn(f"{what} failed: {exc}")
            return False
        return True

    def _start_trace_chunk(self, page) -> None:
        """Start tracing once for the context, then open a chunk for this step."""
        tracing = page.context.tracing
        if not self._tracing_started:
            try:
                tracing.start(screenshots=True, snapshots=True)
                self._tracing_started = True
            except Exception as exc:
                self._warn(f"tracing.start failed: {exc}")
        # Kept in its own try so a start() failure never skips the chunk.
        try:
            tracing.start_chunk()
        except Exception as exc:
            self._warn(f"tracing.start_chunk failed: {exc}")

    def _save_actions(self, page, step_dir: Path) -> None:
        """Close the trace chunk and write all of its events to ``actions.json``."""
        trace_path = step_dir / "temp_trace.zip"
        try:
            page.context.tracing.stop_chunk(path=str(trace_path))
            events = self._extract_trace(str(trace_path))
            (step_dir / "actions.json").write_text(
                json.dumps(events, indent=2), encoding="utf-8"
            )
        except Exception as exc:
            self._warn(f"saving trace actions failed: {exc}")
        finally:
            # The zip is only an intermediate; remove it even when parsing failed.
            try:
                trace_path.unlink(missing_ok=True)
            except OSError as exc:
                self._warn(f"could not remove {trace_path}: {exc}")

    def _save_page_state(self, page, step_dir: Path) -> None:
        """Write ``screenshot.png`` and ``after.html`` for the post-action page."""
        try:
            page.screenshot(path=str(step_dir / "screenshot.png"))
        except Exception as exc:
            self._warn(f"screenshot failed: {exc}")
        try:
            # Explicit UTF-8 so non-ASCII pages do not depend on the locale.
            (step_dir / "after.html").write_text(page.content(), encoding="utf-8")
        except Exception as exc:
            self._warn(f"html capture failed: {exc}")

    def _wait_for_action(self, page, timeout_s: float = 300.0, poll_s: float = 0.1) -> bool:
        """Block until the user acts on the page, then resume automatically.

        A flag ``window.__acted`` is set by capture-phase listeners for click,
        keydown, input and change events. If the flag is gone (evaluates to
        ``None``), the document was replaced after arming, which is treated as
        an action too, since the listeners cannot fire on the new document.

        Args:
            page: Playwright page to watch.
            timeout_s: Maximum time to wait, in seconds (default 5 minutes).
            poll_s: Delay between two polls of the flag, in seconds.

        Returns:
            True if an action (or document replacement) was detected, False if
            the timeout expired first.
        """
        page.evaluate(self._ARM_LISTENERS_JS)

        deadline = time.monotonic() + timeout_s
        while time.monotonic() < deadline:
            try:
                acted = page.evaluate("window.__acted")
            except Exception:
                # Typically raised mid-navigation; the next poll will succeed.
                acted = False
            if acted or acted is None:
                self._run_js(page, self._SAVING_INDICATOR_JS, "updating recording indicator")
                time.sleep(0.3)  # let the triggered action settle before saving
                return True
            time.sleep(poll_s)
        return False

    def _extract_trace(self, trace_file: str) -> list:
        """Return every JSON event found in the ``*.trace`` members of a trace zip.

        Events are not filtered. Lines that are not valid UTF-8 JSON are
        skipped; an unreadable or corrupt archive yields the events collected
        so far (possibly an empty list) and a warning.

        Args:
            trace_file: Path to the zip produced by ``tracing.stop_chunk``.
        """
        events: list = []
        try:
            with zipfile.ZipFile(trace_file, "r") as zf:
                for name in zf.namelist():
                    if not name.endswith(".trace"):
                        continue
                    with zf.open(name) as fh:
                        for raw in fh:
                            try:
                                events.append(json.loads(raw.decode("utf-8")))
                            except ValueError:
                                # Covers JSONDecodeError and UnicodeDecodeError.
                                continue
        except (OSError, zipfile.BadZipFile) as exc:
            self._warn(f"could not read trace {trace_file}: {exc}")
        return events
| 190 | + |
| 191 | + |
# Module-level args instance built with the dataclass defaults.
HUMAN_TRACE_AGENT = HumanTraceAgentArgs()
0 commit comments