diff --git a/README.md b/README.md index 1623d6f25..3f67d7de3 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,7 @@ Unlike many agent frameworks that only track the chat history with LLMs in text, ## 🆕 News +- 📅2025-03-13: TaskWeaver now supports vision input for the Planner role. Please check the [vision input](https://microsoft.github.io/TaskWeaver/blog/vision) for more details.👀 - 📅2025-01-16: TaskWeaver has been enhanced with an experimental role called [Recepta](https://microsoft.github.io/TaskWeaver/blog/reasoning) for its reasoning power.🧠 - 📅2024-12-23: TaskWeaver has been integrated with the [AgentOps](https://microsoft.github.io/TaskWeaver/docs/observability) for better observability and monitoring.🔍 - 📅2024-09-13: We introduce the shared memory to store information that is shared between the roles in TaskWeaver. Please check the [memory](https://microsoft.github.io/TaskWeaver/docs/memory) for more details.🧠 @@ -31,7 +32,7 @@ Unlike many agent frameworks that only track the chat history with LLMs in text, - 📅2024-05-07: We have added two blog posts on [Evaluating a LLM agent](https://microsoft.github.io/TaskWeaver/blog/evaluation) and [Adding new roles to TaskWeaver](https://microsoft.github.io/TaskWeaver/blog/role) in the documentation.📝 - 📅2024-03-28: TaskWeaver now offers all-in-one Docker image, providing a convenient one-stop experience for users. Please check the [docker](https://microsoft.github.io/TaskWeaver/docs/usage/docker) for more details.🐳 - 📅2024-03-27: TaskWeaver now switches to `container` mode by default for code execution. Please check the [code execution](https://microsoft.github.io/TaskWeaver/docs/code_execution) for more details.🐳 -- 📅2024-03-07: TaskWeaver now supports configuration of different LLMs for various components, such as the Planner and CodeInterpreter.
Please check the [multi-llm](https://microsoft.github.io/TaskWeaver/docs/llms/multi-llm) for more details.🔗 + @@ -43,7 +44,8 @@ Unlike many agent frameworks that only track the chat history with LLMs in text, - +- ...... +- 📅2023-11-30: TaskWeaver is released on GitHub🎈. ## 💥 Highlights @@ -68,7 +70,6 @@ We are looking forward to your contributions to make TaskWeaver better. - [ ] Support for prompt template management - [ ] Better plugin experiences, such as displaying updates or stopping in the middle of running the plugin and user confirmation before running the plugin - [ ] Async interaction with LLMs -- [ ] Support for vision input for Roles such as the Planner and CodeInterpreter - [ ] Support for remote code execution diff --git a/project/examples/code_generator_examples/example1-codeinterpreter.yaml b/project/examples/code_generator_examples/example-codeinterpreter-default-1.yaml similarity index 100% rename from project/examples/code_generator_examples/example1-codeinterpreter.yaml rename to project/examples/code_generator_examples/example-codeinterpreter-default-1.yaml diff --git a/project/examples/code_generator_examples/example2-codeinterpreter.yaml b/project/examples/code_generator_examples/example-codeinterpreter-default-2.yaml similarity index 100% rename from project/examples/code_generator_examples/example2-codeinterpreter.yaml rename to project/examples/code_generator_examples/example-codeinterpreter-default-2.yaml diff --git a/project/examples/planner_examples/example-planner.yaml b/project/examples/planner_examples/example-planner-default-1.yaml similarity index 100% rename from project/examples/planner_examples/example-planner.yaml rename to project/examples/planner_examples/example-planner-default-1.yaml diff --git a/project/examples/planner_examples/example-planner-2.yaml b/project/examples/planner_examples/example-planner-default-2.yaml similarity index 100% rename from project/examples/planner_examples/example-planner-2.yaml rename
to project/examples/planner_examples/example-planner-default-2.yaml diff --git a/taskweaver/chat/console/chat.py b/taskweaver/chat/console/chat.py index 98dd99c54..b7125425b 100644 --- a/taskweaver/chat/console/chat.py +++ b/taskweaver/chat/console/chat.py @@ -498,7 +498,7 @@ def _reset_session(self, first_session: bool = False): self.session.stop() self.session = self.app.get_session() - self._system_message("--- new session starts ---") + self._system_message("--- new session started ---") self._assistant_message( "I am TaskWeaver, an AI assistant. To get started, could you please enter your request?", ) diff --git a/taskweaver/code_interpreter/code_interpreter/code_generator.py b/taskweaver/code_interpreter/code_interpreter/code_generator.py index 795b2f15e..5800fb1c0 100644 --- a/taskweaver/code_interpreter/code_interpreter/code_generator.py +++ b/taskweaver/code_interpreter/code_interpreter/code_generator.py @@ -251,7 +251,7 @@ def compose_conversation( # for code correction user_message += self.user_message_head_template.format( FEEDBACK=format_code_feedback(post), - MESSAGE=f"{post.get_attachment(AttachmentType.revise_message)[0]}", + MESSAGE=f"{post.get_attachment(AttachmentType.revise_message)[0].content}", ) assistant_message = self.post_translator.post_to_raw_text( diff --git a/taskweaver/code_interpreter/code_interpreter_cli_only/code_interpreter_cli_only.py b/taskweaver/code_interpreter/code_interpreter_cli_only/code_interpreter_cli_only.py index 910b48ecc..22b636d00 100644 --- a/taskweaver/code_interpreter/code_interpreter_cli_only/code_interpreter_cli_only.py +++ b/taskweaver/code_interpreter/code_interpreter_cli_only/code_interpreter_cli_only.py @@ -60,9 +60,12 @@ def reply( prompt_log_path=prompt_log_path, ) - code = post_proxy.post.get_attachment(type=AttachmentType.reply_content)[0] + code = post_proxy.post.get_attachment(type=AttachmentType.reply_content)[0].content if len(code) == 0: - 
post_proxy.update_message(post_proxy.post.get_attachment(type=AttachmentType.thought)[0], is_end=True) + post_proxy.update_message( + post_proxy.post.get_attachment(type=AttachmentType.thought)[0].content, + is_end=True, + ) return post_proxy.end() code_to_exec = "! " + code diff --git a/taskweaver/code_interpreter/code_interpreter_plugin_only/code_interpreter_plugin_only.py b/taskweaver/code_interpreter/code_interpreter_plugin_only/code_interpreter_plugin_only.py index b1600be83..41d09abf0 100644 --- a/taskweaver/code_interpreter/code_interpreter_plugin_only/code_interpreter_plugin_only.py +++ b/taskweaver/code_interpreter/code_interpreter_plugin_only/code_interpreter_plugin_only.py @@ -78,7 +78,7 @@ def reply( return post_proxy.end() functions = json.loads( - post_proxy.post.get_attachment(type=AttachmentType.function)[0], + post_proxy.post.get_attachment(type=AttachmentType.function)[0].content, ) if len(functions) > 0: code: List[str] = [] diff --git a/taskweaver/ext_role/image_reader/__init__.py b/taskweaver/ext_role/image_reader/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/taskweaver/ext_role/image_reader/image_reader.py b/taskweaver/ext_role/image_reader/image_reader.py new file mode 100644 index 000000000..6185c543e --- /dev/null +++ b/taskweaver/ext_role/image_reader/image_reader.py @@ -0,0 +1,119 @@ +import base64 +import json +import os.path +from mimetypes import guess_type + +from injector import inject + +from taskweaver.llm import LLMApi, format_chat_message +from taskweaver.logging import TelemetryLogger +from taskweaver.memory import Memory, Post +from taskweaver.memory.attachment import AttachmentType +from taskweaver.module.event_emitter import SessionEventEmitter +from taskweaver.module.tracing import Tracing +from taskweaver.role import Role +from taskweaver.role.role import RoleConfig, RoleEntry +from taskweaver.session import SessionMetadata + + +# Function to encode a local image into data URL +def 
local_image_to_data_url(image_path): + # Guess the MIME type of the image based on the file extension + mime_type, _ = guess_type(image_path) + if mime_type is None: + mime_type = "application/octet-stream" # Default MIME type if none is found + + try: + # Read and encode the image file + with open(image_path, "rb") as image_file: + base64_encoded_data = base64.b64encode(image_file.read()).decode("utf-8") + except FileNotFoundError: + logger.error(f"Error: The file {image_path} does not exist.") + return None + except IOError: + logger.error(f"Error: The file {image_path} could not be read.") + return None + # Construct the data URL + return f"data:{mime_type};base64,{base64_encoded_data}" + + +class ImageReaderConfig(RoleConfig): + def _configure(self): + pass + + +class ImageReader(Role): + @inject + def __init__( + self, + config: ImageReaderConfig, + logger: TelemetryLogger, + tracing: Tracing, + event_emitter: SessionEventEmitter, + role_entry: RoleEntry, + llm_api: LLMApi, + session_metadata: SessionMetadata, + ): + super().__init__(config, logger, tracing, event_emitter, role_entry) + + self.llm_api = llm_api + self.session_metadata = session_metadata + + def reply(self, memory: Memory, **kwargs: ...) -> Post: + rounds = memory.get_role_rounds( + role=self.alias, + include_failure_rounds=False, + ) + + # obtain the query from the last round + last_post = rounds[-1].post_list[-1] + + post_proxy = self.event_emitter.create_post_proxy(self.alias) + + post_proxy.update_send_to(last_post.send_from) + + input_message = last_post.message + prompt = ( + f"Input message: {input_message}.\n" + "\n" + "Your response should be a JSON object with the key 'image_url' and the value as the image path. " + "For example, {'image_url': 'c:/images/image.jpg'} or {'image_url': 'http://example.com/image.jpg'}. " + "Do not add any additional information in the response or wrap the JSON with ```json and ```." 
+ ) + + response = self.llm_api.chat_completion( + messages=[ + format_chat_message( + role="system", + message="Your task is to read the image path from the message.", + ), + format_chat_message( + role="user", + message=prompt, + ), + ], + ) + + image_url = json.loads(response["content"])["image_url"] + if image_url.startswith("http"): + image_content = image_url + attachment_message = f"Image from {image_url}." + else: + if os.path.isabs(image_url): + image_content = local_image_to_data_url(image_url) + else: + image_content = local_image_to_data_url(os.path.join(self.session_metadata.execution_cwd, image_url)) + attachment_message = f"Image from {image_url} encoded as a Base64 data URL." + + post_proxy.update_attachment( + message=attachment_message, + type=AttachmentType.image_url, + extra={"image_url": image_content}, + is_end=True, + ) + + post_proxy.update_message( + "I have read the image path from the message. The image is attached below.", + ) + + return post_proxy.end() diff --git a/taskweaver/ext_role/image_reader/image_reader.role.yaml b/taskweaver/ext_role/image_reader/image_reader.role.yaml new file mode 100644 index 000000000..6bcfd0ace --- /dev/null +++ b/taskweaver/ext_role/image_reader/image_reader.role.yaml @@ -0,0 +1,5 @@ +alias: ImageReader +module: taskweaver.ext_role.image_reader.image_reader.ImageReader +intro : |- + - ImageReader is responsible for helping the Planner to read images. + - The input message must contain the image path, either local or remote. 
diff --git a/taskweaver/llm/util.py b/taskweaver/llm/util.py index 9c50a9b27..17ec25fe1 100644 --- a/taskweaver/llm/util.py +++ b/taskweaver/llm/util.py @@ -1,7 +1,9 @@ from typing import Any, Dict, List, Literal, Optional, TypedDict, Union ChatMessageRoleType = Literal["system", "user", "assistant", "function"] -ChatMessageType = Dict[Literal["role", "name", "content"], str] +ChatContentType = Dict[Literal["type", "text", "image_url"], str | Dict[Literal["url"], str]] +ChatMessageType = Dict[Literal["role", "name", "content"], str | List[ChatContentType]] + PromptTypeSimple = List[ChatMessageType] @@ -21,15 +23,43 @@ class PromptTypeWithTools(TypedDict): tools: Optional[List[PromptToolType]] +def format_chat_message_content( + content_type: Literal["text", "image_url"], + content_value: str, +) -> ChatContentType: + if content_type == "image_url": + return { + "type": content_type, + content_type: { + "url": content_value, + }, + } + else: + return { + "type": content_type, + content_type: content_value, + } + + def format_chat_message( role: ChatMessageRoleType, message: str, + image_urls: Optional[List[str]] = None, name: Optional[str] = None, ) -> ChatMessageType: - msg: ChatMessageType = { - "role": role, - "content": message, - } + if not image_urls: + msg: ChatMessageType = { + "role": role, + "content": message, + } + else: + msg: ChatMessageType = { + "role": role, + "content": [ + format_chat_message_content("text", message), + ] + + [format_chat_message_content("image_url", image) for image in image_urls], + } if name is not None: msg["name"] = name return msg diff --git a/taskweaver/memory/attachment.py b/taskweaver/memory/attachment.py index afc1346d4..dc9bcd334 100644 --- a/taskweaver/memory/attachment.py +++ b/taskweaver/memory/attachment.py @@ -47,6 +47,9 @@ class AttachmentType(Enum): # shared memory entry shared_memory_entry = "shared_memory_entry" + # vision input + image_url = "image_url" + @dataclass class Attachment: diff --git 
a/taskweaver/memory/post.py b/taskweaver/memory/post.py index ee77c2677..4b053691c 100644 --- a/taskweaver/memory/post.py +++ b/taskweaver/memory/post.py @@ -87,9 +87,9 @@ def add_attachment(self, attachment: Attachment) -> None: """Add an attachment to the post.""" self.attachment_list.append(attachment) - def get_attachment(self, type: AttachmentType) -> List[Any]: + def get_attachment(self, type: AttachmentType) -> List[Attachment]: """Get all the attachments of the given type.""" - return [attachment.content for attachment in self.attachment_list if attachment.type == type] + return [attachment for attachment in self.attachment_list if attachment.type == type] def del_attachment(self, type_list: List[AttachmentType]) -> None: """Delete all the attachments of the given type.""" diff --git a/taskweaver/planner/planner.py b/taskweaver/planner/planner.py index 5e7ac13aa..31de46201 100644 --- a/taskweaver/planner/planner.py +++ b/taskweaver/planner/planner.py @@ -133,6 +133,7 @@ def compose_conversation_for_prompt( for post in chat_round.post_list: if post.send_from == self.alias: if post.send_to == "User" or post.send_to in self.recipient_alias_set: + # planner responses planner_message = self.planner_post_translator.post_to_raw_text( post=post, ) @@ -144,47 +145,45 @@ def compose_conversation_for_prompt( ) elif post.send_to == self.alias: # self correction for planner response, e.g., format error/field check error + # append the invalid response to chat history conversation.append( format_chat_message( role="assistant", message=post.get_attachment( type=AttachmentType.invalid_response, - )[0], + )[0].content, ), ) - # append the invalid response to chat history + # append the self correction instruction message to chat history conversation.append( format_chat_message( role="user", message=self.format_message( role="User", - message=post.get_attachment(type=AttachmentType.revise_message)[0], + 
message=post.get_attachment(type=AttachmentType.revise_message)[0].content, ), ), ) - # append the self correction instruction message to chat history - else: - if conv_init_message is not None: - message = self.format_message( - role=post.send_from, - message=conv_init_message + "\n" + post.message, - ) - conversation.append( - format_chat_message(role="user", message=message), - ) - conv_init_message = None - else: - conversation.append( - format_chat_message( - role="user", - message=self.format_message( - role=post.send_from, - message=post.message, - ), + # messages from user or workers + conversation.append( + format_chat_message( + role="user", + message=self.format_message( + role=post.send_from, + message=post.message + if conv_init_message is None + else conv_init_message + "\n" + post.message, ), - ) + image_urls=[ + attachment.extra["image_url"] + for attachment in post.get_attachment(type=AttachmentType.image_url) + ], + ), + ) + + conv_init_message = None return conversation diff --git a/website/blog/authors.yml b/website/blog/authors.yml new file mode 100644 index 000000000..947312677 --- /dev/null +++ b/website/blog/authors.yml @@ -0,0 +1,11 @@ +liqli: + name: Liqun Li + url: https://liqul.github.io + title: Principal Researcher + image_url: https://liqul.github.io/assets/logo_small_bw.png + +xu: + name: Xu Zhang + url: https://scholar.google.com/citations?user=bqXdMMMAAAAJ&hl=zh-CN + title: Senior Researcher + image_url: https://scholar.googleusercontent.com/citations?view_op=view_photo&user=bqXdMMMAAAAJ&citpid=3 diff --git a/website/blog/evaluation.md b/website/blog/evaluation.md index 98eb1b506..71dfd176f 100644 --- a/website/blog/evaluation.md +++ b/website/blog/evaluation.md @@ -1,4 +1,8 @@ -# How to evaluate a LLM agent? +--- +title: How to evaluate a LLM agent? +authors: [liqli, xu] +date: 2024-05-07 +--- ## The challenges It is nontrivial to evaluate the performance of a LLM agent. 
diff --git a/website/blog/experience.md b/website/blog/experience.md index 5c36e2b57..8e2a2b5cc 100644 --- a/website/blog/experience.md +++ b/website/blog/experience.md @@ -1,4 +1,8 @@ -# Experience selection +--- +title: Experience Selection in TaskWeaver +authors: liqli +date: 2024-09-14 +--- We have introduced the motivation of the `experience` module in [Experience](/docs/customization/experience) and how to create a handcrafted experience in [Handcrafted Experience](/docs/customization/experience/handcrafted_experience). diff --git a/website/blog/local_llm.md b/website/blog/local_llm.md index bd67ed9b0..2093e8736 100644 --- a/website/blog/local_llm.md +++ b/website/blog/local_llm.md @@ -1,4 +1,8 @@ -# Run TaskWeaver with Locally Deployed Not-that-Large Language Models +--- +title: Run TaskWeaver with Locally Deployed Not-that-Large Language Models +authors: liqli +date: 2024-07-08 +--- :::info The feature introduced in this blog post can cause incompatibility issue with the previous version of TaskWeaver diff --git a/website/blog/plugin.md b/website/blog/plugin.md index 2a3442493..e01fc5a6c 100644 --- a/website/blog/plugin.md +++ b/website/blog/plugin.md @@ -1,4 +1,8 @@ -# Plugins In-Depth +--- +title: Plugins In-Depth +authors: liqli +date: 2024-05-23 +--- _**Pre-requisites**: Please refer to the [Introduction](/docs/plugin/plugin_intro) and the [Plugin Development](/docs/plugin/how_to_develop_a_new_plugin) pages for a better understanding of the plugin concept and its development process._ diff --git a/website/blog/reasoning.md b/website/blog/reasoning.md index 4ceeb6b0f..5c5be73a9 100644 --- a/website/blog/reasoning.md +++ b/website/blog/reasoning.md @@ -1,4 +1,8 @@ -# What makes a good agent reasoning framework? +--- +title: What makes a good agent reasoning framework? +authors: liqli +date: 2025-01-20 +--- An agent can listen to the user's request, understand the context, make plans, take actions, observe the results, and respond to the user. 
Its behavior is driven by the reasoning process, which is the core of the agent's intelligence. diff --git a/website/blog/role.md b/website/blog/role.md index dd1de91c9..86a01e989 100644 --- a/website/blog/role.md +++ b/website/blog/role.md @@ -1,4 +1,8 @@ -# Roles in TaskWeaver +--- +title: Roles in TaskWeaver +authors: [liqli, xu] +date: 2024-05-07 +--- We frame TaskWeaver as a **code-first** agent framework. The term "code-first" means that the agent is designed to diff --git a/website/blog/vision.md b/website/blog/vision.md new file mode 100644 index 000000000..935bcfa14 --- /dev/null +++ b/website/blog/vision.md @@ -0,0 +1,91 @@ +--- +title: Vision input for the Planner +authors: liqli +date: 2025-03-13 +--- + +## Introduction + +We have added support for vision input for the Planner role in TaskWeaver. +The Planner role is responsible for generating the high-level plan for the task. +The vision input is a new type of input that contains images. +This feature is useful when the task requires visual understanding. + + +## How vision input is supported in TaskWeaver + +In TaskWeaver, we added a new role called `ImageReader` to read images and provide the image url (for remote images) or +the image encoded in base64 (for local images) to the Planner role. +To enable this new role, you need to include it in the project configuration file as follows: + +```json +{ + "session.roles": [ + "planner", + "code_interpreter", + "image_reader" + ] +} +``` + +The ImageReader role takes the path or the url of the image as input and prepares a response Post for the Planner role. As described [here](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/gpt-with-vision?tabs=rest) for the Azure OpenAI API, if the image is local, ImageReader needs to encode the image in base64 and pass it to the API. If the image is remote, ImageReader needs to provide the url of the image. +The Planner role can then use the image information for various tasks.
+ +## An example + +Let's ask the agent to describe any uploaded image. + + +![image_reader](../static/img/image_reader.PNG) + +The flow of the conversation is as follows: +```mermaid +graph TD + User --image path--> Planner + Planner --image path--> ImageReader + ImageReader --image encoded in Base64--> Planner + Planner --response--> User +``` + +In the example above, the User talks to the agent in the Web UI and uploads an image. +TaskWeaver also supports providing the image path in console mode, either by using the `/load` command or by simply including +the image path in the input message. + +## Extension + +If you look into the implementation of the ImageReader role, you will find that it is quite simple. +The key logic is shown in the following code snippet: + +```python +if image_url.startswith("http"): + image_content = image_url + attachment_message = f"Image from {image_url}." +else: + if os.path.isabs(image_url): + image_content = local_image_to_data_url(image_url) + else: + image_content = local_image_to_data_url(os.path.join(self.session_metadata.execution_cwd, image_url)) + attachment_message = f"Image from {image_url} encoded as a Base64 data URL." + +post_proxy.update_attachment( + message=attachment_message, + type=AttachmentType.image_url, + extra={"image_url": image_content}, + is_end=True, +) +``` + +After the image url is obtained, the ImageReader role will encode the image in base64 if the image is local. +Then, it will create an attachment in the response Post and pass the image content to the Planner role. +To achieve this, the attachment is created with the type `AttachmentType.image_url` and the image content is +passed as extra data with the key `image_url`. + +Therefore, if we want to support other scenarios with vision input, we can extend the ImageReader role by adding more logic +to handle different types of content. One example is to support reading a document with text and images.
+We can add an attachment for each image in the document and pass the list of attachments to the Planner role. + + + + + + diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index 31a8898b9..11c557fc6 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -58,6 +58,8 @@ const config = { // Remove this to remove the "edit this page" links. editUrl: 'https://github.com/microsoft/TaskWeaver/tree/main/website', + blogSidebarTitle: 'All posts', + blogSidebarCount: 'ALL', }, theme: { customCss: './src/css/custom.css', diff --git a/website/static/img/image_reader.PNG b/website/static/img/image_reader.PNG new file mode 100644 index 000000000..a7c4867b4 Binary files /dev/null and b/website/static/img/image_reader.PNG differ