Skip to content

Commit 66b9692

Browse files
add scripts to run generic and hinter agents, update tmlr config for hinter
1 parent 7e55cd7 commit 66b9692

File tree

5 files changed

+189
-0
lines changed

5 files changed

+189
-0
lines changed
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import argparse
2+
3+
from dotenv import load_dotenv
4+
5+
load_dotenv()
6+
7+
import argparse
8+
import logging
9+
10+
from agentlab.agents.generic_agent.tmlr_config import get_base_agent
11+
from agentlab.experiments.study import Study
12+
from bgym import DEFAULT_BENCHMARKS
13+
14+
logging.getLogger().setLevel(logging.WARNING)
15+
16+
17+
def main():
    """Run the generic agent on a benchmark, or relaunch the most recent study.

    Command-line arguments:
        --benchmark: key looked up in ``DEFAULT_BENCHMARKS`` (required).
        --llm-config: value passed to ``get_base_agent`` (required).
        --relaunch: load the most recent study and call
            ``find_incomplete(include_errors=True)`` on it instead of
            creating a new ``Study``.
        --n-jobs: forwarded to ``Study.run`` as ``n_jobs`` (default 5).
        --n-relaunch: forwarded to ``Study.run`` as ``n_relaunch`` (default 3).
        --parallel-backend: forwarded to ``Study.run`` as
            ``parallel_backend`` (default "ray").
        --reproducibility-mode: forwarded to ``Study.run`` as
            ``strict_reproducibility``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--benchmark", required=True)
    parser.add_argument("--llm-config", required=True)
    parser.add_argument("--relaunch", action="store_true")
    parser.add_argument("--n-jobs", type=int, default=5)
    parser.add_argument("--n-relaunch", type=int, default=3)
    parser.add_argument("--parallel-backend", type=str, default="ray")
    parser.add_argument("--reproducibility-mode", action="store_true")

    args = parser.parse_args()

    # instantiate agent
    # NOTE: both objects are built even with --relaunch, where only the
    # loaded study is used; an unknown --benchmark key still fails early.
    agent_args = [get_base_agent(args.llm_config)]
    benchmark = DEFAULT_BENCHMARKS[args.benchmark]()

    if args.relaunch:
        # relaunch an existing study
        study = Study.load_most_recent(contains=None)
        study.find_incomplete(include_errors=True)
    else:
        study = Study(
            agent_args,
            benchmark,
            logging_level=logging.WARNING,
            logging_level_stdout=logging.WARNING,
        )

    study.run(
        n_jobs=args.n_jobs,
        # Honor the CLI option; it was previously parsed but ignored in
        # favor of a hard-coded "ray".
        parallel_backend=args.parallel_backend,
        strict_reproducibility=args.reproducibility_mode,
        n_relaunch=args.n_relaunch,
    )


if __name__ == "__main__":
    main()
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/bin/bash
# Launch experiments/generic/run_generic_agent.py with the settings below.
# Edit the variables to change the benchmark, model config, or parallelism.

BENCHMARK="workarena_l1"

LLM_CONFIG="azure/gpt-5-mini-2025-08-07"
# Alternative backend:
# PARALLEL_BACKEND="sequential"
PARALLEL_BACKEND="ray"

N_JOBS=5
N_RELAUNCH=3

# Expansions are quoted so a value containing spaces or glob characters is
# passed as a single argument.
python experiments/generic/run_generic_agent.py \
    --benchmark "$BENCHMARK" \
    --llm-config "$LLM_CONFIG" \
    --parallel-backend "$PARALLEL_BACKEND" \
    --n-jobs "$N_JOBS" \
    --n-relaunch "$N_RELAUNCH"
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
2+
from dotenv import load_dotenv
3+
import argparse
4+
5+
load_dotenv()
6+
7+
import logging
8+
import argparse
9+
10+
from agentlab.agents.generic_agent_hinter.generic_agent import GenericAgentArgs
11+
from agentlab.agents.generic_agent_hinter.agent_configs import CHAT_MODEL_ARGS_DICT, FLAGS_GPT_4o
12+
from bgym import DEFAULT_BENCHMARKS
13+
from agentlab.experiments.study import Study
14+
15+
logging.getLogger().setLevel(logging.WARNING)
16+
17+
18+
def main():
    """Run the hinter generic agent on a benchmark, or relaunch the most recent study.

    Command-line arguments:
        --benchmark: key looked up in ``DEFAULT_BENCHMARKS`` (required).
        --llm-config: key looked up in ``CHAT_MODEL_ARGS_DICT`` (required).
        --relaunch: load the most recent study and call
            ``find_incomplete(include_errors=True)`` on it instead of
            creating a new ``Study``; the agent built from the hint options
            is not used in that case.
        --n-jobs: forwarded to ``Study.run`` as ``n_jobs`` (default 6).
        --n-relaunch: forwarded to ``Study.run`` as ``n_relaunch`` (default 1).
        --parallel-backend: forwarded to ``Study.run`` as
            ``parallel_backend`` (default "ray").
        --reproducibility-mode: forwarded to ``Study.run`` as
            ``strict_reproducibility``.
        --hint-type, --hint-index-type, --hint-query-type, --hint-index-path,
        --hint-retriever-path, --hint-num-results: copied onto the
            same-named attributes of the agent flags.
    """
    # Function-scope import keeps this block self-contained.
    import copy

    parser = argparse.ArgumentParser()
    parser.add_argument("--benchmark", required=True)
    parser.add_argument("--llm-config", required=True)
    parser.add_argument("--relaunch", action="store_true")
    parser.add_argument("--n-jobs", type=int, default=6)
    # Default of 1 matches the value that used to be hard-coded in study.run().
    parser.add_argument("--n-relaunch", type=int, default=1)
    parser.add_argument("--parallel-backend", type=str, default="ray")
    parser.add_argument("--reproducibility-mode", action="store_true")
    # hint flags
    parser.add_argument("--hint-type", type=str, default="docs")
    parser.add_argument("--hint-index-type", type=str, default="sparse")
    parser.add_argument("--hint-query-type", type=str, default="direct")
    parser.add_argument("--hint-index-path", type=str, default="indexes/servicenow-docs-bm25")
    parser.add_argument("--hint-retriever-path", type=str, default="google/embeddinggemma-300m")
    parser.add_argument("--hint-num-results", type=int, default=5)
    args = parser.parse_args()

    # Work on a private copy so the module-level FLAGS_GPT_4o object shared
    # with other importers is not mutated in place.
    flags = copy.deepcopy(FLAGS_GPT_4o)
    flags.use_task_hint = True
    flags.hint_type = args.hint_type
    flags.hint_index_type = args.hint_index_type
    flags.hint_query_type = args.hint_query_type
    flags.hint_index_path = args.hint_index_path
    flags.hint_retriever_path = args.hint_retriever_path
    flags.hint_num_results = args.hint_num_results

    # instantiate agent
    agent_args = [
        GenericAgentArgs(
            chat_model_args=CHAT_MODEL_ARGS_DICT[args.llm_config],
            flags=flags,
        )
    ]

    benchmark = DEFAULT_BENCHMARKS[args.benchmark]()

    if args.relaunch:
        # relaunch an existing study
        study = Study.load_most_recent(contains=None)
        study.find_incomplete(include_errors=True)
    else:
        study = Study(
            agent_args,
            benchmark,
            logging_level=logging.WARNING,
            logging_level_stdout=logging.WARNING,
        )

    study.run(
        n_jobs=args.n_jobs,
        parallel_backend=args.parallel_backend,
        strict_reproducibility=args.reproducibility_mode,
        n_relaunch=args.n_relaunch,
    )


if __name__ == "__main__":
    main()
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#!/bin/bash
# Launch experiments/hint/run_hinter_agent.py with the settings below.
# Edit the variables to change the benchmark, model config, or hint options.

BENCHMARK="workarena_l1"

LLM_CONFIG="azure/gpt-5-mini-2025-08-07"
# Alternative backend:
# PARALLEL_BACKEND="sequential"
PARALLEL_BACKEND="ray"

HINT_TYPE="docs" # human, llm, docs
HINT_INDEX_TYPE="sparse" # sparse, dense
HINT_QUERY_TYPE="goal" # goal, llm
HINT_NUM_RESULTS=5

HINT_INDEX_PATH="indexes/servicenow-docs-bm25"
# Alternative index:
# HINT_INDEX_PATH="indexes/servicenow-docs-embeddinggemma-300m"
HINT_RETRIEVER_PATH="google/embeddinggemma-300m"

N_JOBS=6

# Expansions are quoted so a value containing spaces or glob characters is
# passed as a single argument.
# NOTE: --relaunch makes the runner load the most recent study instead of
# creating a new one from the options above; drop it to start a fresh study.
python experiments/hint/run_hinter_agent.py \
    --benchmark "$BENCHMARK" \
    --llm-config "$LLM_CONFIG" \
    --parallel-backend "$PARALLEL_BACKEND" \
    --n-jobs "$N_JOBS" \
    --hint-type "$HINT_TYPE" \
    --hint-index-type "$HINT_INDEX_TYPE" \
    --hint-query-type "$HINT_QUERY_TYPE" \
    --hint-index-path "$HINT_INDEX_PATH" \
    --hint-retriever-path "$HINT_RETRIEVER_PATH" \
    --hint-num-results "$HINT_NUM_RESULTS" \
    --relaunch

src/agentlab/agents/generic_agent_hinter/tmlr_config.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,16 @@
4747
max_prompt_tokens=40_000,
4848
be_cautious=True,
4949
extra_instructions=None,
50+
51+
# hint flags
52+
hint_type="human",
53+
hint_index_type="sparse",
54+
hint_query_type="direct",
55+
hint_index_path=None,
56+
hint_retriever_path=None,
57+
hint_num_results=5,
58+
n_retrieval_queries=3,
59+
hint_level="episode",
5060
)
5161

5262

0 commit comments

Comments
 (0)