From 2a1c67ce6736cba7ea30f99af558ea402ed16075 Mon Sep 17 00:00:00 2001
From: Toolkit User
Date: Mon, 27 Oct 2025 17:08:41 +0000
Subject: [PATCH 1/2] qwen3_workarena_changes

---
 src/agentlab/agents/dynamic_prompting.py      | 38 ++++++++++
 .../agents/generic_agent/agent_configs.py     | 42 +++++++++++
 src/agentlab/llm/chat_api.py                  | 71 ++++++++++++++++++-
 3 files changed, 150 insertions(+), 1 deletion(-)

diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py
index 92ad25b9..6a585dd8 100644
--- a/src/agentlab/agents/dynamic_prompting.py
+++ b/src/agentlab/agents/dynamic_prompting.py
@@ -412,6 +412,7 @@ def __init__(self, obs, flags: ObsFlags) -> None:
             visible=lambda: flags.use_html,
             prefix="## ",
         )
+        obs["axtree_txt"] = remove_ui_patterns(obs["axtree_txt"])
         self.ax_tree = AXTree(
             obs["axtree_txt"],
             visible_elements_only=flags.filter_visible_elements_only,
@@ -874,3 +875,40 @@ def obs_mapping(obs: dict):
             return obs

     return obs_mapping
+
+
+
+
+import re
+
+def remove_ui_patterns(text):
+    """
+    Remove lines containing specific UI patterns for ServiceNow accessibility tree text.
+
+    Args:
+        text (str): The input string containing the accessibility tree
+
+    Returns:
+        str: The cleaned string with lines containing UI patterns removed
+    """
+
+    # Words to look for
+    words_to_remove = ["Edit Widget", "Edit Widget Preferences", "Close Widget", "Add content"]
+
+    # Split text into lines
+    lines = text.split('\n')
+
+    # Keep lines that don't contain any of the words
+    filtered_lines = []
+    for line in lines:
+        should_keep = True
+        for word in words_to_remove:
+            if word in line:
+                should_keep = False
+                break
+        if should_keep:
+            filtered_lines.append(line)
+
+    # Join the remaining lines back together
+    return '\n'.join(filtered_lines)
+
diff --git a/src/agentlab/agents/generic_agent/agent_configs.py b/src/agentlab/agents/generic_agent/agent_configs.py
index ce8914a4..c24e91a2 100644
--- a/src/agentlab/agents/generic_agent/agent_configs.py
+++ b/src/agentlab/agents/generic_agent/agent_configs.py
@@ -149,6 +149,48 @@
     add_missparsed_messages=True,
 )

+# qwen3 default config - same as llama3-70b but with use_think_history=False
+FLAGS_QWEN3 = GenericPromptFlags(
+    obs=dp.ObsFlags(
+        use_html=False,
+        use_ax_tree=True,
+        use_focused_element=True,
+        use_error_logs=False,
+        use_history=True,
+        use_past_error_logs=False,
+        use_action_history=True,
+        use_think_history=False,  # Qwen3 doesn't include thinking history
+        use_diff=False,
+        html_type="pruned_html",
+        use_screenshot=False,
+        use_som=False,
+        extract_visible_tag=True,
+        extract_clickable_tag=False,
+        extract_coords="False",
+        filter_visible_elements_only=False,
+    ),
+    action=dp.ActionFlags(
+        action_set=HighLevelActionSetArgs(
+            subsets=["bid"],
+            multiaction=False,
+        ),
+        long_description=False,
+        individual_examples=True,
+    ),
+    use_plan=False,
+    use_criticise=False,
+    use_thinking=True,
+    use_memory=False,
+    use_concrete_example=True,
+    use_abstract_example=True,
+    use_hints=True,
+    enable_chat=False,
+    max_prompt_tokens=40_000,
+    be_cautious=True,
+    extra_instructions=None,
+    add_missparsed_messages=True,
+)
+
 AGENT_LLAMA3_70B = GenericAgentArgs(
     chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/meta-llama/llama-3-70b-instruct"],
     flags=FLAGS_LLAMA3_70B,
diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py
index 188747ac..47a49e2b 100644
--- a/src/agentlab/llm/chat_api.py
+++ b/src/agentlab/llm/chat_api.py
@@ -324,7 +324,7 @@ def __call__(self, messages: list[dict], n_samples: int = 1, temperature: float
         tracking.TRACKER.instance(input_tokens, output_tokens, cost)

         if n_samples == 1:
-            res = AIMessage(completion.choices[0].message.content)
+            res = AIMessage(_extract_thinking_content_from_response(completion))
             if self.log_probs:
                 res["log_probs"] = completion.choices[0].log_probs
             return res
@@ -593,3 +593,72 @@ def make_model(self):
             temperature=self.temperature,
             max_tokens=self.max_new_tokens,
         )
+
+
+
+
+import logging
+
+def _extract_thinking_content_from_response(response, wrap_tag: str = "think"):
+    """
+    Extracts the content from the message, including reasoning if available.
+    It wraps the reasoning in <think>...</think> tags for easy identification of reasoning content,
+    when the LLM produces 'text' and 'reasoning' in the same message.
+
+    Args:
+        response: The message object or dict containing content and reasoning.
+        wrap_tag: The tag name to wrap reasoning content (default: "think").
+
+    Returns:
+        str: The extracted content with reasoning wrapped in specified tags.
+    """
+    # Normalize to dict
+    message = response.choices[0].message
+    if not isinstance(message, dict):
+        message = message.to_dict()
+
+    # --- Extract reasoning from either `reasoning` or `reasoning_content` (and optional metadata) ---
+    reasoning_text = (
+        message.get("reasoning")
+        or message.get("reasoning_content")
+    )
+
+    # --- Extract surface text/content (keeps the original behavior, but handles list-style `content`) ---
+    msg_text = message.get("text", "")  # works for OpenRouter
+    raw_content = message.get("content", "")
+
+    if isinstance(raw_content, list):
+        # Concatenate text-like parts if provider returns content blocks
+        parts = []
+        for part in raw_content:
+            if isinstance(part, str):
+                parts.append(part)
+            elif isinstance(part, dict):
+                # Common shapes: {"type":"text","text":"..."} or {"type":"text","text":{"value":"..."}}
+                if part.get("type") == "text":
+                    txt = part.get("text")
+                    if isinstance(txt, dict) and isinstance(txt.get("value"), str):
+                        parts.append(txt["value"])
+                    elif isinstance(txt, str):
+                        parts.append(txt)
+                else:
+                    # Fallback: try a few likely keys
+                    for k in ("content", "text", "value"):
+                        v = part.get(k)
+                        if isinstance(v, str):
+                            parts.append(v)
+                            break
+                    else:
+                        parts.append(str(part))
+        msg_content = "\n".join(p for p in parts if p)
+    else:
+        msg_content = raw_content if isinstance(raw_content, str) else str(raw_content or "")
+
+    # --- Wrap reasoning if present ---
+    if reasoning_text:
+        reasoning_wrapped = f"<{wrap_tag}>{reasoning_text}</{wrap_tag}>\n" if wrap_tag else (reasoning_text + "\n")
+        logging.debug("Extracting content from response.choices[i].message.(reasoning|reasoning_content)")
+    else:
+        reasoning_wrapped = ""
+
+    return f"{reasoning_wrapped}{msg_text}{msg_content}"
\ No newline at end of file

From 0701c52c4d7be06b20ad4287a3f9330a1866733e Mon Sep 17 00:00:00 2001
From: optimass
Date: Fri, 31 Oct 2025 16:05:37 +0000
Subject: [PATCH 2/2] hardcoded max_retry to 1

---
 src/agentlab/agents/generic_agent/generic_agent.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py
index d1f48f76..0fa8b529 100644
--- a/src/agentlab/agents/generic_agent/generic_agent.py
+++ b/src/agentlab/agents/generic_agent/generic_agent.py
@@ -30,7 +30,7 @@ class GenericAgentArgs(AgentArgs):
     chat_model_args: BaseModelArgs = None
     flags: GenericPromptFlags = None
-    max_retry: int = 4
+    max_retry: int = 1

     def __post_init__(self):
         try:  # some attributes might be temporarily args.CrossProd for hyperparameter generation
@@ -77,7 +77,7 @@ def __init__(
         self,
         chat_model_args: BaseModelArgs,
         flags: GenericPromptFlags,
-        max_retry: int = 4,
+        max_retry: int = 1,
     ):
         self.chat_llm = chat_model_args.make_model()
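
Reviewer note (illustrative, not part of the patch): a minimal sketch of how the new FLAGS_QWEN3 could be wired into an agent definition in agent_configs.py, mirroring AGENT_LLAMA3_70B above. The Qwen3 key in CHAT_MODEL_ARGS_DICT is a placeholder assumption, since this patch does not add one; substitute whatever Qwen3 entry exists locally.

    # Hypothetical addition to src/agentlab/agents/generic_agent/agent_configs.py
    AGENT_QWEN3 = GenericAgentArgs(
        chat_model_args=CHAT_MODEL_ARGS_DICT["<qwen3-model-key>"],  # placeholder key, not defined in this patch
        flags=FLAGS_QWEN3,
        max_retry=1,  # matches the default hardcoded in PATCH 2/2
    )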