Example Code

```python
import os
import base64
from mimetypes import guess_type
from langchain_openai import AzureChatOpenAI
from langchain.prompts import (
    ChatPromptTemplate,
    MessagesPlaceholder,
)
from langchain.schema import HumanMessage
from langchain.agents import AgentExecutor
from langchain.memory import ConversationBufferMemory
from dotenv import load_dotenv
from langchain.agents import AgentExecutor, create_openai_functions_agent
from langchain_core.tools import tool
from langchain_community.chat_message_histories import ChatMessageHistory

os.environ["no_proxy"] = "localhost,127.0.0.1,::1"


def local_image_to_data_url(image_path):
    # Guess the MIME type of the image based on the file extension
    mime_type, _ = guess_type(image_path)
    if mime_type is None:
        mime_type = "application/octet-stream"  # Default MIME type if none is found
    # Read and encode the image file
    with open(image_path, "rb") as image_file:
        base64_encoded_data = base64.b64encode(image_file.read()).decode("utf-8")
    # Construct the data URL
    return f"data:{mime_type};base64,{base64_encoded_data}"


def img_path2url(path):
    img_encoded = local_image_to_data_url(path)
    img_url_dict = {"type": "image_url", "image_url": {"url": f"{img_encoded}"}}
    return img_url_dict


llm = AzureChatOpenAI(
    azure_deployment="gpt-4o", api_version="2024-02-15-preview", temperature=0
)

prompt = ChatPromptTemplate(
    messages=[
        (
            "system",
            "you are a general assistant",
        ),
        MessagesPlaceholder(variable_name="chat_history", optional=True),
        ("human", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)


@tool
def say_hi(a):
    """Says hi"""
    return "hi"


memory = ChatMessageHistory(session_id="test-session")
tools = [say_hi]
agent = create_openai_functions_agent(llm, tools, prompt)
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True, memory=memory)

result1 = agent_executor.invoke(input={"input": "hi how are you ? "})  # works great

img_path = "./out0001.png"
img_msg = img_path2url(img_path)
message = HumanMessage(
    content=[
        {"type": "text", "text": "what is in the image ? "},
        img_msg,
    ],
)

llm.invoke([message])  # works
agent_executor.invoke(input={"input": message})  # crashes
pass
```
Description

I am trying to have an agent interact with an image and execute tools based on the content of that image, but I have not been able to achieve this. My goal is for the user to provide both an input image and text, and for the agent to execute actions based on the image's content. What is the proper way to communicate with an agent using images? I searched online, but most of the working examples I found invoke the LLM directly rather than going through an agent. When I execute the code above, I encounter the following error: `Invalid 'messages[3].content': string too long`.
-
Hello @sbugdary! I'm here to assist you with any issues you're experiencing. I found some similar discussions and issues that might help with the problem you're facing.
To address the error "Invalid 'messages[3].content': string too long", you need to ensure that the content of your messages is within the acceptable length limits. Here is an example of how to format messages with images and handle the error:

```python
from typing import List, Dict, Any
import base64
import httpx
# Example message with image URL
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
image_data = base64.b64encode(httpx.get(image_url).content).decode("utf-8")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "describe the weather in this image"},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}},
        ],
    }
]
def _convert_messages_to_ollama_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    ollama_messages = []
    for message in messages:
        role = message.get("role")
        content = message.get("content", "")
        if isinstance(content, list):
            content_str = str(content)
            if len(content_str) > 1000:  # Example length check
                raise ValueError("Message content is too long")
        msg = {
            "role": role,
            "content": content,
        }
        ollama_messages.append(msg)
    return ollama_messages
# Convert messages
ollama_messages = _convert_messages_to_ollama_messages(messages)
# Example function to send messages (simplified)
def send_messages(messages: List[Dict[str, Any]]):
    for message in messages:
        print(f"Sending message: {message}")
# Send the converted messages
send_messages(ollama_messages)
```

In this example, the `_convert_messages_to_ollama_messages` function checks the length of each message's content and raises a `ValueError` if it is too long. Additionally, LangChain supports multimodal inputs, which can be useful for applications involving images. For example, you can resize a base64-encoded image and pass it to a model directly. This allows the model to reason over the two-dimensional content on the page, instead of a "one-dimensional" string representation. Here is a utility function to resize a base64-encoded image:

```python
import base64
import io
from PIL import Image
def resize_base64_image(base64_string, size=(128, 128)):
    """
    Resize an image encoded as a Base64 string.

    :param base64_string: A Base64 encoded string of the image to be resized.
    :param size: A tuple representing the new size (width, height) for the image.
    :return: A Base64 encoded string of the resized image.
    """
    img_data = base64.b64decode(base64_string)
    img = Image.open(io.BytesIO(img_data))
    resized_img = img.resize(size, Image.LANCZOS)
    buffered = io.BytesIO()
    resized_img.save(buffered, format=img.format)
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
# `Document` is needed for the isinstance check below
from langchain_core.documents import Document


def get_resized_images(docs):
    """
    Resize images from base64-encoded strings.

    :param docs: A list of base64-encoded images (or Documents) to be resized.
    :return: Dict containing a list of resized base64-encoded strings.
    """
    b64_images = []
    for doc in docs:
        if isinstance(doc, Document):
            doc = doc.page_content
        resized_image = resize_base64_image(doc, size=(1280, 720))
        b64_images.append(resized_image)
    return {"images": b64_images}
```

You can then use this function to resize a base64-encoded image and include it in your messages [6][7][8]. For more detailed information on using LangChain with multimodal inputs, you can refer to the LangChain documentation, which provides comprehensive guidance on setting up and using models, including examples of passing images as URLs or base64-encoded strings.
-
I solved this issue using langgraph.
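The comment above doesn't include code; the following is only a sketch of one way a LangGraph-based setup could look, assuming the `llm`, `tools`, and `img_path2url` objects from the question and a model deployment that accepts image input:

```python
from langchain_core.messages import HumanMessage
from langgraph.prebuilt import create_react_agent

# Prebuilt ReAct-style agent: it operates on a list of messages, so a multimodal
# HumanMessage (text part + image_url part) can be passed through unchanged.
graph = create_react_agent(llm, tools)

result = graph.invoke(
    {
        "messages": [
            HumanMessage(
                content=[
                    {"type": "text", "text": "what is in the image?"},
                    img_path2url("./out0001.png"),  # helper from the question's snippet
                ]
            )
        ]
    }
)
print(result["messages"][-1].content)
```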