From 945f882d3b9c5a80d3e9e9206ec1757c57f6a5cd Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Sun, 3 Aug 2025 21:25:14 +0800 Subject: [PATCH 01/32] update multimodal part --- core/agent_core/events/ingestors.py | 22 +++ core/agent_core/framework/inbox_processor.py | 65 +++++++- core/agent_core/llm/call_llm.py | 47 ++++++ core/api/message_handlers.py | 156 ++++++++++++++++- frontend/app/chat/components/ChatInput.tsx | 157 +++++++++++++++++- frontend/app/chat/components/ProjectPage.tsx | 29 ++-- .../app/chat/components/WelcomeScreen.tsx | 32 ++-- frontend/app/chat/page.tsx | 6 +- frontend/app/stores/sessionStore.ts | 37 ++++- 9 files changed, 496 insertions(+), 55 deletions(-) diff --git a/core/agent_core/events/ingestors.py b/core/agent_core/events/ingestors.py index 8828e68..8bf520f 100644 --- a/core/agent_core/events/ingestors.py +++ b/core/agent_core/events/ingestors.py @@ -337,6 +337,28 @@ def user_prompt_ingestor(payload: Any, params: Dict, context: Dict) -> str: return payload.get("prompt", "") return str(payload) +@register_ingestor("multimodal_user_prompt_ingestor") +def multimodal_user_prompt_ingestor(payload: Any, params: Dict, context: Dict) -> str: + """处理包含图像的用户输入,返回适合LLM的格式""" + if not isinstance(payload, dict): + return str(payload) + + prompt = payload.get("prompt", "") + images = payload.get("images", []) + + # 如果没有图像,返回普通文本 + if not images: + return prompt + + # 如果有图像,需要特殊处理 + # 这里我们将图像信息标记在文本中,实际的图像数据会在消息构建时处理 + image_info = f"[用户上传了{len(images)}张图像]" + if prompt: + return f"{prompt}\n\n{image_info}" + else: + return image_info + return str(payload) + def _recursive_markdown_formatter(data: Any, schema: Dict, level: int = 0) -> List[str]: """ Intelligently formats data recursively into LLM-friendly Markdown. diff --git a/core/agent_core/framework/inbox_processor.py b/core/agent_core/framework/inbox_processor.py index e94eeeb..fc36bdd 100644 --- a/core/agent_core/framework/inbox_processor.py +++ b/core/agent_core/framework/inbox_processor.py @@ -34,7 +34,8 @@ def _create_user_turn_from_inbox_item(self, item: Dict) -> Optional[str]: team_state = self.team_state prompt_content = item.get("payload", {}).get("prompt") - if not prompt_content: + images_content = item.get("payload", {}).get("images", []) + if not prompt_content and not images_content: return None user_turn_id = f"turn_user_{uuid.uuid4().hex[:8]}" @@ -62,7 +63,7 @@ def _create_user_turn_from_inbox_item(self, item: Dict) -> Optional[str]: "end_time": item.get("metadata", {}).get("created_at", datetime.now(timezone.utc).isoformat()), "source_turn_ids": [last_agent_turn_id] if last_agent_turn_id else [], "source_tool_call_id": None, - "inputs": {"prompt": prompt_content}, + "inputs": {"prompt": prompt_content, "images": images_content} if images_content else {"prompt": prompt_content}, "outputs": {}, "llm_interaction": None, "tool_interactions": [], @@ -249,7 +250,65 @@ async def process(self) -> Dict[str, Any]: role = params.get("role", "user") is_persistent = params.get("is_persistent_in_memory", False) - new_message = {"role": role, "content": injected_content} + # 处理多模态内容(图像) + has_image_content = False + content_parts = [] + + # 检查是否有图像内容(支持两种格式) + if source in ["USER_PROMPT", "USER_PROMPT_WITH_IMAGE"] and isinstance(dehydrated_payload, dict): + # 新格式:image_info(来自 send_image_message) + if dehydrated_payload.get("image_info"): + has_image_content = True + # 添加文本内容 + if injected_content: + content_parts.append({ + "type": "text", + "text": injected_content + }) + + # 添加图像内容 + image_info = dehydrated_payload["image_info"] + content_parts.append({ + "type": "image_url", + "image_url": { + "url": image_info.get("url", ""), + "detail": "high" # Can be "low", "high", or "auto" + } + }) + logger.debug("multimodal_message_processed_from_image_info", extra={ + "agent_id": self.agent_id, + "image_url": image_info.get("url", ""), + "text_content_length": len(injected_content) if injected_content else 0 + }) + + # 旧格式:images(向后兼容) + elif dehydrated_payload.get("images"): + has_image_content = True + # 添加文本内容 + if injected_content: + content_parts.append({ + "type": "text", + "text": injected_content + }) + + # 添加图像内容 + for image_data in dehydrated_payload["images"]: + content_parts.append({ + "type": "image_url", + "image_url": { + "url": f"data:{image_data['mimeType']};base64,{image_data['data']}" + } + }) + logger.debug("multimodal_message_processed_from_images", extra={ + "agent_id": self.agent_id, + "image_count": len(dehydrated_payload["images"]), + "text_content_length": len(injected_content) if injected_content else 0 + }) + + if has_image_content: + new_message = {"role": role, "content": content_parts} + else: + new_message = {"role": role, "content": injected_content} # If this message comes from the startup briefing, add an internal flag # to prevent it from being handed over again in the future. diff --git a/core/agent_core/llm/call_llm.py b/core/agent_core/llm/call_llm.py index 21bab9e..2c6edd1 100644 --- a/core/agent_core/llm/call_llm.py +++ b/core/agent_core/llm/call_llm.py @@ -239,6 +239,7 @@ async def call_litellm_acompletion( run_id_for_event: Optional[str] = None, contextual_data_for_event: Optional[Dict] = None, run_context: Optional[Dict] = None, + image_info: Optional[Dict[str, Any]] = None, # New parameter for image data **kwargs ) -> Dict[str, Any]: """ @@ -254,6 +255,52 @@ async def call_litellm_acompletion( final_messages[0]["content"] = system_prompt_content else: final_messages.insert(0, {"role": "system", "content": system_prompt_content}) + + # Process image information if provided + if image_info and final_messages: + # Find the last user message and add image to it + for i in range(len(final_messages) - 1, -1, -1): + if final_messages[i].get("role") == "user": + current_content = final_messages[i].get("content", "") + + # Check if content is already in multimodal format (list) + if isinstance(current_content, list): + # Already multimodal, append image to existing content + current_content.append({ + "type": "image_url", + "image_url": { + "url": image_info.get("url", ""), + "detail": "high" # Can be "low", "high", or "auto" + } + }) + logger.debug("image_added_to_multimodal_message", extra={ + "agent_id": agent_id_for_event, + "image_url": image_info.get("url", ""), + "existing_parts_count": len(current_content) - 1 + }) + else: + # Convert text-only message to multimodal format + multimodal_content = [ + { + "type": "text", + "text": current_content + }, + { + "type": "image_url", + "image_url": { + "url": image_info.get("url", ""), + "detail": "high" # Can be "low", "high", or "auto" + } + } + ] + + final_messages[i]["content"] = multimodal_content + logger.debug("text_message_converted_to_multimodal", extra={ + "agent_id": agent_id_for_event, + "image_url": image_info.get("url", ""), + "text_content_length": len(current_content) + }) + break for attempt in range(app_level_max_retries + 1): # --- KEY CHANGE: Generate a NEW stream_id for EVERY attempt --- diff --git a/core/api/message_handlers.py b/core/api/message_handlers.py index 1aed78a..e09d95c 100644 --- a/core/api/message_handlers.py +++ b/core/api/message_handlers.py @@ -862,6 +862,7 @@ async def handle_send_to_run_message(ws_state: Dict, data: Dict): run_status = run_context['meta'].get('status') run_type = run_context['meta'].get('run_type') prompt_content = message_payload.get("prompt") + images_content = message_payload.get("images", []) try: # --- Branch 1: Activate a pending run --- @@ -879,10 +880,15 @@ async def handle_send_to_run_message(ws_state: Dict, data: Dict): team_state = run_context['team_state'] partner_state = partner_context['state'] + # 构建payload,包含文本和图像 + payload = {"prompt": prompt_content} + if images_content: + payload["images"] = images_content + inbox_item = { "item_id": f"inbox_{uuid.uuid4().hex[:8]}", "source": "USER_PROMPT", # Use standardized event source - "payload": {"prompt": prompt_content}, + "payload": payload, "consumption_policy": "consume_on_read", "metadata": {"created_at": datetime.now(timezone.utc).isoformat()} } @@ -921,10 +927,15 @@ async def handle_send_to_run_message(ws_state: Dict, data: Dict): team_state = run_context['team_state'] # --- Core modification: Similarly, only create an InboxItem --- + # 构建payload,包含文本和图像 + payload = {"prompt": prompt_content} + if images_content: + payload["images"] = images_content + inbox_item = { "item_id": f"inbox_{uuid.uuid4().hex[:8]}", "source": "USER_PROMPT", - "payload": {"prompt": prompt_content}, + "payload": payload, "consumption_policy": "consume_on_read", "metadata": {"created_at": datetime.now(timezone.utc).isoformat()} } @@ -949,12 +960,153 @@ async def handle_send_to_run_message(ws_state: Dict, data: Dict): logger.error("send_to_run_processing_error", extra={"session_id": session_id_for_log, "target_run_id": target_run_id, "run_type": run_type, "error_message": str(e)}, exc_info=True) await event_manager.emit_error(run_id=target_run_id, agent_id="System", error_message=f"Error processing message for run {target_run_id}: {str(e)}") +async def handle_send_image_message(ws_state: Dict, data: Dict): + """ + Handles 'send_image_message' messages, routing client messages with image data to the specified active business run. + This is similar to handle_send_to_run_message but specifically designed for multimodal messages. + """ + event_manager = ws_state.event_manager + session_id_for_log = event_manager.session_id + + target_run_id = data.get("run_id") + run_id_var.set(target_run_id) # Set context variable + message_payload = data.get("message_payload") + image_info = data.get("image_info") + + logger.info("send_image_message_received", extra={ + "session_id": session_id_for_log, + "target_run_id": target_run_id, + "message_preview": str(message_payload)[:100], + "has_image_info": bool(image_info) + }) + + if not target_run_id or message_payload is None or image_info is None: + err_msg = "'send_image_message' requires 'run_id', 'message_payload', and 'image_info'." + logger.warning("send_image_message_missing_params", extra={ + "session_id": session_id_for_log, + "data": data, + "has_run_id": bool(target_run_id), + "has_message_payload": message_payload is not None, + "has_image_info": image_info is not None + }) + await event_manager.emit_error(run_id=target_run_id, agent_id="System", error_message=err_msg) + return + + run_context = active_runs_store.get(target_run_id) + if not run_context: + err_msg = f"Target run {target_run_id} not found or not active." + logger.warning("send_image_message_target_not_found", extra={"session_id": session_id_for_log, "target_run_id": target_run_id}) + await event_manager.emit_error(run_id=target_run_id, agent_id="System", error_message=err_msg) + return + + run_status = run_context['meta'].get('status') + run_type = run_context['meta'].get('run_type') + prompt_content = message_payload.get("prompt") + + try: + # --- Branch 1: Activate a pending run with image --- + if run_status == 'CREATED': + logger.debug("run_activation_with_image_started", extra={"run_id": target_run_id, "run_type": run_type}) + + if prompt_content is None: + raise ValueError("First message to a new run must contain a 'prompt'.") + + run_context['team_state']['question'] = prompt_content + + task = None + if run_type == "partner_interaction": + partner_context = run_context['sub_context_refs']['_partner_context_ref'] + team_state = run_context['team_state'] + partner_state = partner_context['state'] + + # 构建payload,包含文本和图像信息 + payload = { + "prompt": prompt_content, + "image_info": image_info # Add image info to payload + } + + inbox_item = { + "item_id": f"inbox_{uuid.uuid4().hex[:8]}", + "source": "USER_PROMPT_WITH_IMAGE", # Use specialized event source + "payload": payload, + "consumption_policy": "consume_on_read", + "metadata": {"created_at": datetime.now(timezone.utc).isoformat()} + } + partner_state.setdefault("inbox", []).append(inbox_item) + + # 2. Start the task + task = asyncio.create_task(run_partner_interaction_async(partner_context=partner_context)) + else: + raise ValueError(f"Run type '{run_type}' does not support activation via 'send_image_message'.") + + ws_state.active_run_tasks[target_run_id] = task + task.add_done_callback( + lambda t: logger.info("run_task_finished", extra={"run_id": target_run_id, "run_type": run_type, "session_id": session_id_for_log}) + if not t.cancelled() else + logger.info("run_task_cancelled", extra={"run_id": target_run_id, "run_type": run_type, "session_id": session_id_for_log}) + ) + + run_context['meta']['status'] = 'AWAITING_INPUT' + logger.debug("run_activation_with_image_completed", extra={"run_id": target_run_id, "status": "AWAITING_INPUT"}) + + # 3. Wake up the task + if run_type == "partner_interaction": + new_input_event = run_context['sub_context_refs']['_partner_context_ref']['runtime_objects'].get("new_user_input_event") + if new_input_event: + new_input_event.set() + return # Critical: Return immediately after handling activation + + # --- Branch 2: Send an image message to a running session --- + elif run_status in ['RUNNING', 'AWAITING_INPUT']: + if prompt_content is None: + raise ValueError("Message payload must contain a 'prompt'.") + + if run_type == "partner_interaction": + partner_context = run_context['sub_context_refs']['_partner_context_ref'] + partner_state = partner_context['state'] + team_state = run_context['team_state'] + + # --- Core modification: Create an InboxItem with image info --- + payload = { + "prompt": prompt_content, + "image_info": image_info # Add image info to payload + } + + inbox_item = { + "item_id": f"inbox_{uuid.uuid4().hex[:8]}", + "source": "USER_PROMPT_WITH_IMAGE", + "payload": payload, + "consumption_policy": "consume_on_read", + "metadata": {"created_at": datetime.now(timezone.utc).isoformat()} + } + partner_state.setdefault("inbox", []).append(inbox_item) + + # Wake up the task + new_input_event = partner_context['runtime_objects'].get("new_user_input_event") + if new_input_event: + new_input_event.set() + logger.info("partner_task_notified_with_image", extra={"run_id": target_run_id, "notification_method": "inbox"}) + else: + logger.error("partner_notification_failed", extra={"run_id": target_run_id, "reason": "new_user_input_event_not_found"}, exc_info=True) + + # --- Branch 3: Handle invalid states --- + else: + err_msg = f"Cannot send image message to run {target_run_id} because its status is '{run_status}'." + logger.warning("send_image_message_invalid_status", extra={"session_id": session_id_for_log, "run_id": target_run_id, "run_status": run_status}) + await event_manager.emit_error(run_id=target_run_id, agent_id="System", error_message=err_msg) + return + + except Exception as e: + logger.error("send_image_message_processing_error", extra={"session_id": session_id_for_log, "target_run_id": target_run_id, "run_type": run_type, "error_message": str(e)}, exc_info=True) + await event_manager.emit_error(run_id=target_run_id, agent_id="System", error_message=f"Error processing image message for run {target_run_id}: {str(e)}") + # --- MESSAGE_HANDLERS registry (Dango's version, with adapted function names) --- MESSAGE_HANDLERS: Dict[str, callable] = { "start_run": handle_start_run_message, "stop_run": handle_stop_run_message, "request_available_toolsets": handle_request_available_toolsets, "send_to_run": handle_send_to_run_message, # Added by Dango, adapted + "send_image_message": handle_send_image_message, # New handler for multimodal messages "stop_managed_principal": handle_stop_managed_principal_message, # Added by Dango, adapted "request_run_profiles": handle_request_run_profiles_message, # Added by Dango, adapted "request_run_context": handle_request_run_context_message, # Added by Dango, adapted diff --git a/frontend/app/chat/components/ChatInput.tsx b/frontend/app/chat/components/ChatInput.tsx index b0747e3..d5b77f9 100644 --- a/frontend/app/chat/components/ChatInput.tsx +++ b/frontend/app/chat/components/ChatInput.tsx @@ -1,12 +1,20 @@ -import React from 'react'; +import React, { useState, useRef, useCallback } from 'react'; import { Button } from '@/components/ui/button'; import { Input } from '@/components/ui/input'; +import { X, Image as ImageIcon, Paperclip } from 'lucide-react'; + +interface ImageAttachment { + id: string; + file: File; + dataUrl: string; + name: string; +} interface ChatInputProps { currentInput: string; onInputChange: (value: string) => void; onKeyPress: (e: React.KeyboardEvent) => void; - onSendMessage: () => void; + onSendMessage: (images?: ImageAttachment[]) => void; isStreaming: boolean; isLoading: boolean; onStopExecution: () => void; @@ -21,35 +29,170 @@ export function ChatInput({ isLoading, onStopExecution, }: ChatInputProps) { + const [images, setImages] = useState([]); + const fileInputRef = useRef(null); + const inputRef = useRef(null); + + // 处理粘贴事件 + const handlePaste = useCallback((e: React.ClipboardEvent) => { + const items = e.clipboardData?.items; + if (!items) return; + + for (let i = 0; i < items.length; i++) { + const item = items[i]; + if (item.type.startsWith('image/')) { + e.preventDefault(); + const file = item.getAsFile(); + if (file) { + addImageFile(file); + } + break; + } + } + }, []); + + // 添加图像文件 + const addImageFile = useCallback((file: File) => { + if (!file.type.startsWith('image/')) { + alert('Please select an image file'); + return; + } + + // 限制文件大小为10MB + if (file.size > 10 * 1024 * 1024) { + alert('Image size should be less than 10MB'); + return; + } + + const reader = new FileReader(); + reader.onload = (e) => { + const dataUrl = e.target?.result as string; + const newImage: ImageAttachment = { + id: Date.now().toString(), + file, + dataUrl, + name: file.name + }; + setImages(prev => [...prev, newImage]); + }; + reader.readAsDataURL(file); + }, []); + + // 处理文件选择 + const handleFileSelect = useCallback((e: React.ChangeEvent) => { + const files = e.target.files; + if (files) { + Array.from(files).forEach(addImageFile); + } + // 清空input值以允许重复选择同一文件 + if (fileInputRef.current) { + fileInputRef.current.value = ''; + } + }, [addImageFile]); + + // 移除图像 + const removeImage = useCallback((id: string) => { + setImages(prev => prev.filter(img => img.id !== id)); + }, []); + + // 处理发送消息 + const handleSendMessage = useCallback(() => { + onSendMessage(images.length > 0 ? images : undefined); + setImages([]); // 发送后清空图像 + }, [onSendMessage, images]); + + // 处理键盘事件 + const handleKeyPress = useCallback((e: React.KeyboardEvent) => { + if (e.key === 'Enter' && !e.shiftKey) { + e.preventDefault(); + if ((currentInput.trim() || images.length > 0) && !isStreaming && !isLoading) { + handleSendMessage(); + } + } else { + onKeyPress(e); + } + }, [currentInput, images, isStreaming, isLoading, handleSendMessage, onKeyPress]); + return (
+ {/* 图像预览区域 */} + {images.length > 0 && ( +
+
+ {images.map((image) => ( +
+ {image.name} + +
+ {image.name} +
+
+ ))} +
+
+ )} +
+ {/* 文件上传按钮 */} + + onInputChange(e.target.value)} - onKeyPress={onKeyPress} - placeholder="Enter message..." + onKeyPress={handleKeyPress} + onPaste={handlePaste} + placeholder="Enter message or paste image..." disabled={isStreaming || isLoading} className="flex-1 border-0 focus-visible:ring-0 focus-visible:ring-offset-0 shadow-none px-2" /> + {isStreaming ? ( ) : ( )}
+ + {/* 隐藏的文件输入 */} +
); } diff --git a/frontend/app/chat/components/ProjectPage.tsx b/frontend/app/chat/components/ProjectPage.tsx index effa802..037cf78 100644 --- a/frontend/app/chat/components/ProjectPage.tsx +++ b/frontend/app/chat/components/ProjectPage.tsx @@ -1,7 +1,6 @@ import React, { useState, useEffect } from 'react'; import { observer } from 'mobx-react-lite'; import { Button } from '@/components/ui/button'; -import { Textarea } from '@/components/ui/textarea'; import { Input } from '@/components/ui/input'; import { SidebarTrigger } from '@/components/ui/sidebar'; import { @@ -22,11 +21,12 @@ import { projectStore } from '@/app/stores/projectStore'; import { selectionStore } from '@/app/stores/selectionStore'; import LoadingSpinner from '@/components/layout/LoadingSpinner'; import { ProjectWithRuns } from '@/lib/types'; +import { ChatInput } from './ChatInput'; interface ProjectPageProps { currentInput: string; onInputChange: (value: string) => void; - onSendMessage: () => void; + onSendMessage: (images?: any[]) => void; onKeyPress: (e: React.KeyboardEvent) => void; isLoading: boolean; } @@ -285,22 +285,15 @@ export const ProjectPage = observer(function ProjectPage({ {/* Chat Input Section */}

What can I help you?

-
-