|
| 1 | +import argparse |
1 | 2 | import logging |
2 | 3 | import os |
| 4 | +import sys |
3 | 5 |
|
4 | 6 | from bgym import DEFAULT_BENCHMARKS |
5 | 7 | from dotenv import load_dotenv |
|
18 | 20 | logger = logging.getLogger(__name__) |
19 | 21 | load_dotenv() |
20 | 22 |
|
21 | | -if __name__ == "__main__": |
22 | | - config = load_config("miniwob") |
23 | 23 |
|
24 | | - # benchmark = DEFAULT_BENCHMARKS["miniwob"](n_repeats=1) |
25 | | - # benchmark = MiniWobBenchmark(backend=MCPPlaywright()) |
26 | | - benchmark = MiniWobBenchmark(backend=AsyncPlaywright()) |
27 | 24 |
|
28 | | - # agent_args = GenericAgentArgs( |
29 | | - # chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-5-mini-2025-08-07"], |
30 | | - # flags=GPT5_MINI_FLAGS, |
31 | | - # ) |
32 | | - # agent_args.flags.obs.use_ax_tree = False |
33 | | - # agent_args.flags.obs.use_html = True |
34 | | - # agent_args.flags.obs.use_focused_element = False |
35 | | - agent_args = TapeAgentArgs(agent_name=config.name, config=config) |
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options for a MiniWob benchmark run.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. When ``None`` (the default), argparse
            falls back to ``sys.argv[1:]``, so existing ``parse_args()``
            callers behave exactly as before.

    Returns:
        An ``argparse.Namespace`` with three string attributes:
        ``backend`` (one of ``"playwright"``, ``"mcp"``, ``"bgym"``;
        default ``"playwright"``), ``agent`` (one of ``"tape"``,
        ``"generic"``; default ``"tape"``) and ``config`` (default
        ``"miniwob"``).

    Raises:
        SystemExit: Raised by argparse on an unknown option, an invalid
            choice, or ``--help``.
    """
    parser = argparse.ArgumentParser(description="Run MiniWob benchmark experiments")
    parser.add_argument(
        "--backend",
        choices=["playwright", "mcp", "bgym"],
        default="playwright",
        help="Browser backend to use (default: playwright)",
    )
    parser.add_argument(
        "--agent",
        choices=["tape", "generic"],
        default="tape",
        help="Agent type to use (default: tape)",
    )
    parser.add_argument(
        "--config",
        type=str,
        default="miniwob",
        help="Hydra config name to load (default: miniwob)",
    )
    # Passing argv through lets callers and tests supply arguments
    # explicitly without touching the global sys.argv.
    return parser.parse_args(argv)
| 46 | + |
| 47 | + |
| 48 | +if __name__ == "__main__": |
| 49 | + args = parse_args() |
| 50 | + config = load_config(args.config) |
36 | 51 |
|
| 52 | + if args.backend == "bgym": |
| 53 | + benchmark = DEFAULT_BENCHMARKS["miniwob"](n_repeats=1) |
| 54 | + elif args.backend == "playwright": |
| 55 | + benchmark = MiniWobBenchmark(backend_cls=AsyncPlaywright) |
| 56 | + elif args.backend == "mcp": |
| 57 | + benchmark = MiniWobBenchmark(backend_cls=MCPPlaywright) |
| 58 | + else: |
| 59 | + raise ValueError(f"Unknown backend: {args.backend}") |
| 60 | + |
| 61 | + if args.agent == "generic": |
| 62 | + agent_args = GenericAgentArgs( |
| 63 | + chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-5-mini-2025-08-07"], |
| 64 | + flags=GPT5_MINI_FLAGS, |
| 65 | + ) |
| 66 | + else: |
| 67 | + agent_args = TapeAgentArgs(agent_name=config.name, config=config) |
37 | 68 |
|
38 | 69 | study = make_study( |
39 | 70 | benchmark=benchmark, |
|
0 commit comments