Skip to content

Commit b54baa6

Browse files
committed
Merge remote-tracking branch 'origin/corellm_evals' into wa_verified
2 parents 121453f + 2a025fe commit b54baa6

File tree

8 files changed

+872
-20
lines changed

8 files changed

+872
-20
lines changed

src/agentlab/agents/dynamic_prompting.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -573,9 +573,9 @@ class SystemPrompt(PromptElement):
573573
class ActionPrompt(PromptElement):
574574

575575
_concrete_ex = """
576-
<action>
576+
[BEGIN FINAL RESPONSE]
577577
click('a324')
578-
</action>
578+
[END FINAL RESPONSE]
579579
"""
580580

581581
def __init__(self, action_set: AbstractActionSet, action_flags: ActionFlags) -> None:
@@ -596,9 +596,9 @@ def __init__(self, action_set: AbstractActionSet, action_flags: ActionFlags) ->
596596
f"# Action space:\n{action_set_generic_info}{action_description}{MacNote().prompt}\n"
597597
)
598598
self._abstract_ex = f"""
599-
<action>
599+
[BEGIN FINAL RESPONSE]
600600
{self.action_set.example_action(abstract=True)}
601-
</action>
601+
[END FINAL RESPONSE]
602602
"""
603603

604604
# self._concrete_ex = f"""
@@ -789,7 +789,7 @@ def _prompt(self) -> str:
789789
prompt += f"\n<think>\n{self.thought}\n</think>\n"
790790

791791
if self.flags.use_action_history:
792-
prompt += f"\n<action>\n{self.action}\n</action>\n"
792+
prompt += f"\n[BEGIN FINAL RESPONSE]\n{self.action}\n[END FINAL RESPONSE]\n"
793793

794794
# prompt += f"{self.error.prompt}{self.html_diff.prompt}{self.ax_tree_diff.prompt}"
795795
prompt += f"{self.error.prompt}"

src/agentlab/agents/generic_agent/generic_agent_prompt.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -153,12 +153,12 @@ def shrink(self):
153153
self.history.shrink()
154154
self.obs.shrink()
155155

156-
def _parse_answer(self, text_answer):
156+
def _parse_answer(self, text_think, text_answer):
157157
ans_dict = {}
158-
ans_dict.update(self.think.parse_answer(text_answer))
159-
ans_dict.update(self.plan.parse_answer(text_answer))
160-
ans_dict.update(self.memory.parse_answer(text_answer))
161-
ans_dict.update(self.criticise.parse_answer(text_answer))
158+
ans_dict.update(self.think.parse_answer(text_think))
159+
ans_dict.update(self.plan.parse_answer(text_think))
160+
ans_dict.update(self.memory.parse_answer(text_think))
161+
ans_dict.update(self.criticise.parse_answer(text_think))
162162
ans_dict.update(self.action_prompt.parse_answer(text_answer))
163163
return ans_dict
164164

src/agentlab/llm/chat_api.py

Lines changed: 127 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -324,12 +324,86 @@ def __call__(self, messages: list[dict], n_samples: int = 1, temperature: float
324324
tracking.TRACKER.instance(input_tokens, output_tokens, cost)
325325

326326
if n_samples == 1:
327-
res = AIMessage(completion.choices[0].message.content)
327+
think, action = self._extract_thinking_content_from_response(completion)
328+
res_think = AIMessage(think or "")
329+
res_action = AIMessage(action or "")
328330
if self.log_probs:
329-
res["log_probs"] = completion.choices[0].log_probs
330-
return res
331+
res_think["log_probs"] = completion.choices[0].logprobs
332+
return res_think, res_action
331333
else:
332-
return [AIMessage(c.message.content) for c in completion.choices]
334+
return [
335+
self._build_think_action_pair(choice)
336+
for choice in completion.choices
337+
]
338+
339+
def _extract_thinking_content_from_response(self, response, wrap_tag="think") -> tuple[str, str]:
340+
"""Extract reasoning and action content from an API response.
341+
342+
Handles multiple formats:
343+
1. OpenAI/DeepSeek: reasoning in 'reasoning_content' or 'reasoning' field
344+
2. Apriel: reasoning before [BEGIN FINAL RESPONSE]...[END FINAL RESPONSE] tags
345+
3. Standard: content as-is
346+
347+
Args:
348+
response: The API response object.
349+
wrap_tag: Tag name to wrap reasoning content (default: "think").
350+
351+
Returns:
352+
tuple: (reasoning_wrapped, action_wrapped)
353+
"""
354+
message = response.choices[0].message
355+
msg_dict = message.to_dict() if hasattr(message, 'to_dict') else dict(message)
356+
357+
reasoning = msg_dict.get("reasoning_content") or msg_dict.get("reasoning")
358+
content = msg_dict.get("content", "") or msg_dict.get("text", "")
359+
360+
# Case 1: Explicit reasoning field from API
361+
if reasoning:
362+
reasoning_wrapped = f"<{wrap_tag}>{reasoning}</{wrap_tag}>\n"
363+
if "[BEGIN FINAL RESPONSE]" in content and "[END FINAL RESPONSE]" in content:
364+
action = self._extract_last_action_from_tags(content)
365+
action_wrapped = f"<action>\n{action}\n</action>"
366+
else:
367+
action_wrapped = content
368+
return reasoning_wrapped, action_wrapped
369+
370+
# Case 2: Apriel-style format in content
371+
if "[BEGIN FINAL RESPONSE]" in content:
372+
reasoning_text, action_text = self._parse_apriel_format(content)
373+
reasoning_wrapped = f"<{wrap_tag}>\n{reasoning_text}\n</{wrap_tag}>" if reasoning_text else ""
374+
action_wrapped = f"<action>\n{action_text}\n</action>" if action_text else ""
375+
return reasoning_wrapped, action_wrapped
376+
377+
# Case 3: No special format
378+
return "", content
379+
380+
def _extract_last_action_from_tags(self, content: str) -> str:
381+
"""Extract content from the LAST [BEGIN FINAL RESPONSE]...[END FINAL RESPONSE] block."""
382+
pattern = r'\[BEGIN FINAL RESPONSE\](.*?)\[END FINAL RESPONSE\]'
383+
matches = re.findall(pattern, content, re.DOTALL)
384+
return matches[-1].strip() if matches else ""
385+
386+
def _parse_apriel_format(self, content: str) -> tuple[str, str]:
387+
"""Parse Apriel format: reasoning before [BEGIN FINAL RESPONSE] tags."""
388+
last_begin = content.rfind("[BEGIN FINAL RESPONSE]")
389+
if last_begin == -1:
390+
return "", content
391+
392+
reasoning = content[:last_begin].strip()
393+
if reasoning.startswith("Here are my reasoning steps:"):
394+
reasoning = reasoning[len("Here are my reasoning steps:"):].strip()
395+
396+
action = self._extract_last_action_from_tags(content)
397+
return reasoning, action
398+
399+
def _build_think_action_pair(self, choice) -> tuple[AIMessage, AIMessage]:
    """Build (think, action) pair from a single choice."""

    # The extraction helper expects a response-like object exposing
    # ``.choices``; wrap the lone choice in a throwaway container.
    class _SingleChoiceResponse:
        pass

    wrapper = _SingleChoiceResponse()
    wrapper.choices = [choice]

    think, action = self._extract_thinking_content_from_response(wrapper)
    return AIMessage(think or ""), AIMessage(action or "")
333407

334408
def get_stats(self):
335409
return {
@@ -484,6 +558,55 @@ def __init__(
484558
)
485559

486560

561+
class AprielChatModel(ChatModel):
    """Chat model for Apriel models hosted on DGX Cloud.

    Endpoint and credentials fall back to the APRIEL_API_URL and
    APRIEL_API_KEY environment variables when not passed explicitly.
    """

    def __init__(
        self,
        model_name="Slam-15B",
        api_key=None,
        base_url=None,
        temperature=0.5,
        max_tokens=15000,
        max_retry=4,
        min_retry_wait_time=60,
    ):
        # NOTE(review): when APRIEL_API_URL is unset this defaults to "",
        # which will presumably fail at request time — confirm whether a
        # real default endpoint should be supplied here.
        base_url = base_url or os.getenv(
            "APRIEL_API_URL",
            ""
        )
        api_key = api_key or os.getenv("APRIEL_API_KEY")

        super().__init__(
            model_name=model_name,
            api_key=api_key,
            temperature=temperature,
            max_tokens=max_tokens,
            max_retry=max_retry,
            min_retry_wait_time=min_retry_wait_time,
            client_class=OpenAI,  # OpenAI-compatible client pointed at base_url
            client_args={"base_url": base_url},
            pricing_func=None,  # no per-token cost tracking for this endpoint
        )
591+
592+
593+
@dataclass
class AprielModelArgs(BaseModelArgs):
    """Serializable args for Apriel models."""

    # Endpoint and credentials; when None, AprielChatModel falls back to the
    # APRIEL_API_URL / APRIEL_API_KEY environment variables.
    base_url: str | None = None
    api_key: str | None = None

    def make_model(self):
        """Instantiate an AprielChatModel from these serialized args."""
        return AprielChatModel(
            model_name=self.model_name,
            base_url=self.base_url,
            api_key=self.api_key,
            temperature=self.temperature,
            max_tokens=self.max_new_tokens,
        )
608+
609+
487610
class AnthropicChatModel(AbstractChatModel):
488611
def __init__(
489612
self,

src/agentlab/llm/llm_configs.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
OpenAIModelArgs,
88
OpenRouterModelArgs,
99
SelfHostedModelArgs,
10+
AprielModelArgs
1011
)
1112

1213
default_oss_llms_args = {
@@ -375,4 +376,13 @@
375376
max_new_tokens=4_000,
376377
temperature=1e-1,
377378
),
379+
380+
"apriel/slam-15b": AprielModelArgs(
381+
model_name="openai/Slam-15B",
382+
base_url="",
383+
api_key="",
384+
max_total_tokens=40_000,
385+
max_new_tokens=15_000,
386+
temperature=0.6,
387+
),
378388
}

src/agentlab/llm/llm_utils.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -83,17 +83,19 @@ def retry(
8383
"""
8484
tries = 0
8585
while tries < n_retry:
86-
answer = chat(messages)
86+
think, action = chat(messages)
87+
think_content, action_content = think["content"], action["content"]
88+
8789
# TODO: could we change this to not use inplace modifications ?
88-
messages.append(answer)
90+
messages.append({"role": "assistant", "content": think_content + action_content})
91+
8992
try:
90-
return parser(answer["content"])
93+
return parser(think_content, action_content)
9194
except ParseError as parsing_error:
9295
tries += 1
9396
if log:
94-
msg = f"Query failed. Retrying {tries}/{n_retry}.\n[LLM]:\n{answer['content']}\n[User]:\n{str(parsing_error)}"
95-
logging.info(msg)
96-
messages.append(dict(role="user", content=str(parsing_error)))
97+
logging.info(f"Query failed. Retrying {tries}/{n_retry}.\n[LLM]:\n{action_content}\n[User]:\n{parsing_error}")
98+
messages.append({"role": "user", "content": str(parsing_error)})
9799

98100
raise ParseError(f"Could not parse a valid value after {n_retry} retries.")
99101

src/agentlab/llm/logging_config.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import logging
import sys
from functools import lru_cache


@lru_cache(maxsize=None)
def setup_logging(level=logging.INFO):
    """Configure root logging once and cache the result.

    Using lru_cache ensures this only runs once per process (per distinct
    ``level``), even if imported and called multiple times.

    Args:
        level: Logging level for the root logger (default: logging.INFO).

    Returns:
        The configured root logger.
    """
    root = logging.getLogger()

    # Remove any existing handlers to avoid duplicates. Iterate over a COPY:
    # removing from the live list while iterating it skips every other
    # handler, leaving stale handlers attached.
    for handler in list(root.handlers):
        root.removeHandler(handler)

    # Single stdout handler with a compact format.
    formatter = logging.Formatter("%(name)s - %(levelname)s - %(message)s")
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(formatter)

    # Set up root logger
    root.addHandler(console_handler)
    root.setLevel(level)

    return root


# Call it once when module is imported
logger = setup_logging()

0 commit comments

Comments
 (0)