|
| 1 | +""" |
| 2 | +Prompt builder for GenericAgent |
| 3 | +
|
| 4 | +It is based on the dynamic_prompting module from the agentlab package. |
| 5 | +""" |
| 6 | + |
| 7 | +import logging |
| 8 | +from dataclasses import dataclass |
| 9 | +import bgym |
| 10 | + |
| 11 | +from browsergym.core.action.base import AbstractActionSet |
| 12 | + |
| 13 | +from agentlab.agents import dynamic_prompting as dp |
| 14 | +from agentlab.llm.llm_utils import BaseMessage, HumanMessage, image_to_jpg_base64_url |
| 15 | + |
| 16 | + |
@dataclass
class PromptFlags(dp.Flags):
    """Flags selecting which sections ``MainPrompt`` includes in the prompt.

    Attributes:
        obs: Observation flags handed to ``Observation``. Defaults to ``None``,
            but ``Observation`` reads attributes such as ``use_screenshot`` from
            it while rendering, so it should be set before a prompt is built.
        action: Action flags handed to ``dp.ActionPrompt``. Defaults to ``None``.
            NOTE(review): ``dp.ActionPrompt`` presumably needs a real value as
            well -- confirm against ``dynamic_prompting``.
        use_thinking: Controls the visibility of the ``dp.Think`` element.
        use_concrete_example: Append the "# Concrete Example" section.
        use_abstract_example: Append the "# Abstract Example" section.
        enable_chat: Build the instructions from ``obs["chat_messages"]`` instead
            of ``obs["goal_object"]``.
        extra_instructions: Optional text forwarded to the instruction element.
    """

    # Both default to None, so the annotations must admit None as well.
    obs: dp.ObsFlags | None = None
    action: dp.ActionFlags | None = None
    use_thinking: bool = True
    use_concrete_example: bool = False
    use_abstract_example: bool = True
    enable_chat: bool = False
    extra_instructions: str | None = None
| 30 | + |
| 31 | + |
class SystemPrompt(dp.PromptElement):
    """Fixed prompt text that describes the agent's role and interaction loop."""

    # Adjacent literals are merged at compile time into one three-line string.
    _prompt = (
        "You are an agent trying to solve a web task based on the content of the page and\n"
        "user instructions. You can interact with the page and explore, and send messages to the user. Each time you\n"
        "submit an action it will be sent to the browser and you will receive a new page."
    )
| 37 | + |
| 38 | + |
def make_instructions(obs: dict, from_chat: bool, extra_instructions: str | None):
    """Build the instruction element from the chat log or from the goal.

    Args:
        obs: Observation dict. ``obs["chat_messages"]`` is read in chat mode and
            ``obs["goal_object"]`` in goal mode.
        from_chat: When true, return ``dp.ChatInstructions``; otherwise return
            ``dp.GoalInstructions``.
        extra_instructions: Optional text forwarded unchanged to the element.

    Returns:
        The instruction prompt element for the selected mode.
    """
    if from_chat:
        return dp.ChatInstructions(obs["chat_messages"], extra_instructions=extra_instructions)

    # Several user messages hint at a conversation that goal mode will not show.
    user_message_count = sum(
        1 for message in obs.get("chat_messages", []) if message["role"] == "user"
    )
    if user_message_count > 1:
        logging.warning(
            "Agent is in goal mode, but multiple user messages are present in the chat. Consider switching to `enable_chat=True`."
        )

    return dp.GoalInstructions(obs["goal_object"], extra_instructions=extra_instructions)
| 54 | + |
| 55 | + |
class History(dp.PromptElement):
    """Prompt section listing the thought and action of each previous step."""

    def __init__(self, actions, thoughts) -> None:
        """Render one "## Step i" block per (action, thought) pair.

        Args:
            actions: Actions of the previous steps, oldest first.
            thoughts: Thoughts of the previous steps, paired with ``actions`` by
                position. ``zip`` stops at the shorter of the two sequences, so
                unmatched trailing entries are left out.
        """
        super().__init__()
        step_blocks = [
            f"\n## Step {index}\n### Thoughts:\n{thought}\n### Action:\n{action}\n"
            for index, (action, thought) in enumerate(zip(actions, thoughts))
        ]
        # With no previous step the section is a single newline.
        self._prompt = "\n".join(step_blocks) + "\n"
| 74 | + |
| 75 | + |
class Observation(dp.PromptElement):
    """Observation section of the prompt for the current step.

    The text part is made of the tabs element and the last action error. A page
    screenshot can be appended to a message separately with ``add_screenshot``.
    """

    def __init__(self, obs, flags: dp.ObsFlags) -> None:
        """Create the tabs and error sub-elements.

        Args:
            obs: Observation dict; ``obs["last_action_error"]`` is read here and
                the screenshot entries are read later by ``add_screenshot``.
            flags: Observation flags; they are read lazily each time visibility
                is evaluated and when the screenshot is added.
        """
        super().__init__()
        self.obs = obs
        self.flags = flags

        # Tabs of the browser, shown only when flags.use_tabs is set.
        self.tabs = dp.Tabs(obs, visible=lambda: flags.use_tabs, prefix="## ")

        # Error of the last action, shown only when enabled and non-empty.
        self.error = dp.Error(
            obs["last_action_error"],
            visible=lambda: flags.use_error_logs and obs["last_action_error"],
            prefix="## ",
        )

    @property
    def _prompt(self) -> str:
        """Return the header followed by the tabs and error sub-prompts."""
        return f"\n# Observation of current step:\n{self.tabs.prompt}{self.error.prompt}\n\n"

    def add_screenshot(self, prompt: BaseMessage) -> BaseMessage:
        """Append the screenshot caption and image to ``prompt`` when enabled.

        Args:
            prompt: Message to extend; it is modified in place.

        Returns:
            The same ``prompt`` object, unchanged when ``use_screenshot`` is off.
        """
        if not self.flags.use_screenshot:
            return prompt

        if self.flags.use_som:
            image_key = "screenshot_som"
            caption = "\n## Screenshot:\nHere is a screenshot of the page, it is annotated with bounding boxes and corresponding bids:"
        else:
            image_key = "screenshot"
            caption = "\n## Screenshot:\nHere is a screenshot of the page:"

        # Read the image before touching the message, so a missing key adds nothing.
        screenshot = self.obs[image_key]
        prompt.add_text(caption)
        img_url = image_to_jpg_base64_url(screenshot)
        prompt.add_image(img_url, detail=self.flags.openai_vision_detail)
        return prompt
| 122 | + |
| 123 | + |
class MainPrompt(dp.PromptElement):
    """Top-level prompt combining instructions, observation, history and actions."""

    def __init__(
        self,
        action_set: AbstractActionSet,
        obs: dict,
        actions: list[str],
        thoughts: list[str],
        flags: PromptFlags,
    ) -> None:
        """Create every sub-element of the prompt.

        Args:
            action_set: Action set passed to ``dp.ActionPrompt``.
            obs: Observation dict of the current step.
            actions: Actions of the previous steps.
            thoughts: Thoughts of the previous steps.
            flags: Flags selecting the sections to include.
        """
        super().__init__()
        self.flags = flags
        self.history = History(actions, thoughts)
        self.instructions = make_instructions(obs, flags.enable_chat, flags.extra_instructions)
        self.obs = Observation(obs, self.flags.obs)

        self.action_prompt = dp.ActionPrompt(action_set, action_flags=flags.action)
        self.think = dp.Think(visible=lambda: flags.use_thinking)

    @property
    def _prompt(self) -> HumanMessage:
        """Assemble the message: text sections, optional examples, then screenshot."""
        message = HumanMessage(self.instructions.prompt)
        # The sections are concatenated with no separator between them.
        message.add_text(
            f"{self.obs.prompt}{self.history.prompt}{self.action_prompt.prompt}{self.think.prompt}"
        )

        if self.flags.use_abstract_example:
            message.add_text(
                "\n# Abstract Example\n\n"
                "Here is an abstract version of the answer with description of the content of\n"
                "each tag. Make sure you follow this structure, but replace the content with your\n"
                "answer:\n"
                f"{self.think.abstract_ex}{self.action_prompt.abstract_ex}"
            )

        if self.flags.use_concrete_example:
            message.add_text(
                "\n# Concrete Example\n\n"
                "Here is a concrete example of how to format your answer.\n"
                "Make sure to follow the template with proper tags:\n"
                f"{self.think.concrete_ex}{self.action_prompt.concrete_ex}"
            )

        return self.obs.add_screenshot(message)

    def _parse_answer(self, text_answer):
        """Merge the parsed thinking and action parts of the answer.

        Keys from the action parser override equal keys from the think parser.
        """
        parsed = dict(self.think.parse_answer(text_answer))
        parsed.update(self.action_prompt.parse_answer(text_answer))
        return parsed
0 commit comments