Improve generic agent hinter (#309)

amanjaiswal73892 · web-flow · commit 87e2510fc3a4 · 2025-10-20T19:04:53.000-04:00
* Make LLM retrieval topic index selection more robust
diff --git a/src/agentlab/agents/generic_agent_hinter/generic_agent.py b/src/agentlab/agents/generic_agent_hinter/generic_agent.py
@@ -93,7 +93,9 @@ def __init__(
 
         self.flags = flags
         if self.flags.hint_db_path is not None:
-            assert os.path.exists(self.flags.hint_db_path), f"Hint database path {self.flags.hint_db_path} does not exist."
+            assert os.path.exists(
+                self.flags.hint_db_path
+            ), f"Hint database path {self.flags.hint_db_path} does not exist."
         self.action_set = self.flags.action.action_set.make_action_set()
         self._obs_preprocessor = dp.make_obs_preprocessor(flags.obs)
 
@@ -118,7 +120,9 @@ def get_action(self, obs):
 
         # use those queries to retrieve from the database and pass to prompt if step-level
         self.queries = (
-            self._get_queries()[0] if getattr(self.flags, "hint_level", "episode") == "step" else None
+            self._get_queries()[0]
+            if getattr(self.flags, "hint_level", "episode") == "step"
+            else None
         )
 
         # get hints
@@ -204,6 +208,7 @@ def _get_queries(self):
         )
 
         chat_messages = Discussion([system_prompt, query_prompt.prompt])
+        # BUG: Parsing fails multiple times.
         ans_dict = retry(
             self.chat_llm,
             chat_messages,
diff --git a/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py b/src/agentlab/agents/generic_agent_hinter/generic_agent_prompt.py
@@ -19,6 +19,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 @dataclass
 class GenericPromptFlags(dp.Flags):
     """
@@ -403,6 +404,8 @@ def _parse_answer(self, text_answer):
             ans_dict["queries"] = json.loads(raw_queries)
         except Exception as e:
             t = text_answer.replace("\n", "\\n")
-            logger.warning(f"Failed to parse queries: {e}. Queries block content: '{ans_dict['queries']}'. RAW llm answer: '{t}'. Will retry")
+            logger.warning(
+                f"Failed to parse queries: {e}. Queries block content: '{ans_dict['queries']}'. RAW llm answer: '{t}'. Will retry"
+            )
             raise e
         return ans_dict
diff --git a/src/agentlab/agents/generic_agent_hinter/tmlr_config.py b/src/agentlab/agents/generic_agent_hinter/tmlr_config.py
@@ -23,7 +23,7 @@
         use_think_history=True,  # gpt-4o config except for this line
         use_diff=False,
         html_type="pruned_html",
-        use_screenshot=True,
+        use_screenshot=False,
         use_som=False,
         extract_visible_tag=True,
         extract_clickable_tag=True,
diff --git a/src/agentlab/utils/hinting.py b/src/agentlab/utils/hinting.py
@@ -12,11 +12,14 @@
 import pandas as pd
 import requests
 from agentlab.llm.chat_api import ChatModel
+import re
+from agentlab.llm.response_api import APIPayload
 
 logger = logging.getLogger(__name__)
 
 
 class HintsSource:
+
     def __init__(
         self,
         hint_db_path: str,
@@ -27,7 +30,8 @@ def __init__(
         embedder_server: str = "http://localhost:5000",
         llm_prompt: str = """We're choosing hints to help solve the following task:\n{goal}.\n
 You need to choose the most relevant hints topic from the following list:\n\nHint topics:\n{topics}\n
-Choose hint topic for the task and return only its number, e.g. 1. If you don't know the answer, return -1.""",
+Choose hint topic for the task and return only its number. Use the following output format: 
+<choice>index</choice> for e.g. <choice>0</choice> for the topic with index 0. If you don't know the answer, return <choice>-1</choice>""",
     ) -> None:
         self.hint_db_path = hint_db_path
         self.hint_retrieval_mode = hint_retrieval_mode
@@ -96,7 +100,15 @@ def choose_hints_llm(self, llm, goal: str, task_name: str) -> list[str]:
         else:
             response: str = llm(APIPayload(messages=[llm.msg.user().add_text(prompt)])).think
         try:
-            topic_number = json.loads(response)
+            matches = re.findall(r"<choice>(-?\d+)</choice>", response)
+            if not matches:
+                logger.error(f"No choice tags found in LLM response: {response}")
+                return []
+            if len(matches) > 1:
+                logger.warning(
+                    f"LLM selected multiple topics for retrieval using only the first one."
+                )
+            topic_number = int(matches[0])
             if topic_number < 0 or topic_number >= len(hint_topics):
                 logger.error(f"Wrong LLM hint id response: {response}, no hints")
                 return []