From 945f882d3b9c5a80d3e9e9206ec1757c57f6a5cd Mon Sep 17 00:00:00 2001
From: sdbds <865105819@qq.com>
Date: Sun, 3 Aug 2025 21:25:14 +0800
Subject: [PATCH 01/32] update multimodal part

---
 core/agent_core/events/ingestors.py           |  22 +++
 core/agent_core/framework/inbox_processor.py  |  65 +++++++-
 core/agent_core/llm/call_llm.py               |  47 ++++++
 core/api/message_handlers.py                  | 156 ++++++++++++++++-
 frontend/app/chat/components/ChatInput.tsx    | 157 +++++++++++++++++-
 frontend/app/chat/components/ProjectPage.tsx  |  29 ++--
 .../app/chat/components/WelcomeScreen.tsx     |  32 ++--
 frontend/app/chat/page.tsx                    |   6 +-
 frontend/app/stores/sessionStore.ts           |  37 ++++-
 9 files changed, 496 insertions(+), 55 deletions(-)

diff --git a/core/agent_core/events/ingestors.py b/core/agent_core/events/ingestors.py
index 8828e68..8bf520f 100644
--- a/core/agent_core/events/ingestors.py
+++ b/core/agent_core/events/ingestors.py
@@ -337,6 +337,28 @@ def user_prompt_ingestor(payload: Any, params: Dict, context: Dict) -> str:
         return payload.get("prompt", "")
     return str(payload)
 
+@register_ingestor("multimodal_user_prompt_ingestor")
+def multimodal_user_prompt_ingestor(payload: Any, params: Dict, context: Dict) -> str:
+    """Render a user prompt that may carry images as plain text for the LLM.
+
+    Non-dict payloads are stringified. When ``payload["images"]`` is non-empty,
+    a marker with the image count is appended to the prompt; this function
+    never embeds the image data itself. ``params`` and ``context`` are unused.
+    """
+    if not isinstance(payload, dict):
+        return str(payload)
+
+    # ``or`` also covers an explicit ``"prompt": None`` (image-only messages),
+    # so the declared ``str`` return type always holds.
+    prompt = payload.get("prompt") or ""
+    images = payload.get("images") or []
+    if not images:
+        return prompt
+
+    # Only announce the images in the text; the marker string is runtime text.
+    image_info = f"[用户上传了{len(images)}张图像]"
+    return f"{prompt}\n\n{image_info}" if prompt else image_info
+
 def _recursive_markdown_formatter(data: Any, schema: Dict, level: int = 0) -> List[str]:
     """
     Intelligently formats data recursively into LLM-friendly Markdown.
diff --git a/core/agent_core/framework/inbox_processor.py b/core/agent_core/framework/inbox_processor.py
index e94eeeb..fc36bdd 100644
--- a/core/agent_core/framework/inbox_processor.py
+++ b/core/agent_core/framework/inbox_processor.py
@@ -34,7 +34,8 @@ def _create_user_turn_from_inbox_item(self, item: Dict) -> Optional[str]:
         team_state = self.team_state
         
         prompt_content = item.get("payload", {}).get("prompt")
-        if not prompt_content:
+        images_content = item.get("payload", {}).get("images", [])
+        if not prompt_content and not images_content:
             return None
 
         user_turn_id = f"turn_user_{uuid.uuid4().hex[:8]}"
@@ -62,7 +63,7 @@ def _create_user_turn_from_inbox_item(self, item: Dict) -> Optional[str]:
             "end_time": item.get("metadata", {}).get("created_at", datetime.now(timezone.utc).isoformat()),
             "source_turn_ids": [last_agent_turn_id] if last_agent_turn_id else [],
             "source_tool_call_id": None,
-            "inputs": {"prompt": prompt_content},
+            "inputs": {"prompt": prompt_content, "images": images_content} if images_content else {"prompt": prompt_content},
             "outputs": {},
             "llm_interaction": None,
             "tool_interactions": [],
@@ -249,7 +250,65 @@ async def process(self) -> Dict[str, Any]:
                 role = params.get("role", "user")
                 is_persistent = params.get("is_persistent_in_memory", False)
                 
-                new_message = {"role": role, "content": injected_content}
+                # 处理多模态内容（图像）
+                has_image_content = False
+                content_parts = []
+                
+                # 检查是否有图像内容（支持两种格式）
+                if source in ["USER_PROMPT", "USER_PROMPT_WITH_IMAGE"] and isinstance(dehydrated_payload, dict):
+                    # 新格式：image_info（来自 send_image_message）
+                    if dehydrated_payload.get("image_info"):
+                        has_image_content = True
+                        # 添加文本内容
+                        if injected_content:
+                            content_parts.append({
+                                "type": "text",
+                                "text": injected_content
+                            })
+                        
+                        # 添加图像内容
+                        image_info = dehydrated_payload["image_info"]
+                        content_parts.append({
+                            "type": "image_url",
+                            "image_url": {
+                                "url": image_info.get("url", ""),
+                                "detail": "high"  # Can be "low", "high", or "auto"
+                            }
+                        })
+                        logger.debug("multimodal_message_processed_from_image_info", extra={
+                            "agent_id": self.agent_id,
+                            "image_url": image_info.get("url", ""),
+                            "text_content_length": len(injected_content) if injected_content else 0
+                        })
+                    
+                    # 旧格式：images（向后兼容）
+                    elif dehydrated_payload.get("images"):
+                        has_image_content = True
+                        # 添加文本内容
+                        if injected_content:
+                            content_parts.append({
+                                "type": "text",
+                                "text": injected_content
+                            })
+                        
+                        # 添加图像内容
+                        for image_data in dehydrated_payload["images"]:
+                            content_parts.append({
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:{image_data['mimeType']};base64,{image_data['data']}"
+                                }
+                            })
+                        logger.debug("multimodal_message_processed_from_images", extra={
+                            "agent_id": self.agent_id,
+                            "image_count": len(dehydrated_payload["images"]),
+                            "text_content_length": len(injected_content) if injected_content else 0
+                        })
+                
+                if has_image_content:
+                    new_message = {"role": role, "content": content_parts}
+                else:
+                    new_message = {"role": role, "content": injected_content}
 
                 # If this message comes from the startup briefing, add an internal flag
                 # to prevent it from being handed over again in the future.
diff --git a/core/agent_core/llm/call_llm.py b/core/agent_core/llm/call_llm.py
index 21bab9e..2c6edd1 100644
--- a/core/agent_core/llm/call_llm.py
+++ b/core/agent_core/llm/call_llm.py
@@ -239,6 +239,7 @@ async def call_litellm_acompletion(
     run_id_for_event: Optional[str] = None,
     contextual_data_for_event: Optional[Dict] = None,
     run_context: Optional[Dict] = None,
+    image_info: Optional[Dict[str, Any]] = None,  # New parameter for image data
     **kwargs
 ) -> Dict[str, Any]:
     """
@@ -254,6 +255,52 @@ async def call_litellm_acompletion(
             final_messages[0]["content"] = system_prompt_content
         else:
             final_messages.insert(0, {"role": "system", "content": system_prompt_content})
+    
+    # Process image information if provided
+    if image_info and final_messages:
+        # Find the last user message and add image to it
+        for i in range(len(final_messages) - 1, -1, -1):
+            if final_messages[i].get("role") == "user":
+                current_content = final_messages[i].get("content", "")
+                
+                # Check if content is already in multimodal format (list)
+                if isinstance(current_content, list):
+                    # Already multimodal, append image to existing content
+                    current_content.append({
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_info.get("url", ""),
+                            "detail": "high"  # Can be "low", "high", or "auto"
+                        }
+                    })
+                    logger.debug("image_added_to_multimodal_message", extra={
+                        "agent_id": agent_id_for_event,
+                        "image_url": image_info.get("url", ""),
+                        "existing_parts_count": len(current_content) - 1
+                    })
+                else:
+                    # Convert text-only message to multimodal format
+                    multimodal_content = [
+                        {
+                            "type": "text",
+                            "text": current_content
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": image_info.get("url", ""),
+                                "detail": "high"  # Can be "low", "high", or "auto"
+                            }
+                        }
+                    ]
+                    
+                    final_messages[i]["content"] = multimodal_content
+                    logger.debug("text_message_converted_to_multimodal", extra={
+                        "agent_id": agent_id_for_event,
+                        "image_url": image_info.get("url", ""),
+                        "text_content_length": len(current_content)
+                    })
+                break
 
     for attempt in range(app_level_max_retries + 1):
         # --- KEY CHANGE: Generate a NEW stream_id for EVERY attempt ---
diff --git a/core/api/message_handlers.py b/core/api/message_handlers.py
index 1aed78a..e09d95c 100644
--- a/core/api/message_handlers.py
+++ b/core/api/message_handlers.py
@@ -862,6 +862,7 @@ async def handle_send_to_run_message(ws_state: Dict, data: Dict):
     run_status = run_context['meta'].get('status')
     run_type = run_context['meta'].get('run_type')
     prompt_content = message_payload.get("prompt")
+    images_content = message_payload.get("images", [])
 
     try:
         # --- Branch 1: Activate a pending run ---
@@ -879,10 +880,15 @@ async def handle_send_to_run_message(ws_state: Dict, data: Dict):
                 team_state = run_context['team_state']
                 partner_state = partner_context['state']
 
+                # 构建payload，包含文本和图像
+                payload = {"prompt": prompt_content}
+                if images_content:
+                    payload["images"] = images_content
+                
                 inbox_item = {
                     "item_id": f"inbox_{uuid.uuid4().hex[:8]}",
                     "source": "USER_PROMPT", # Use standardized event source
-                    "payload": {"prompt": prompt_content},
+                    "payload": payload,
                     "consumption_policy": "consume_on_read",
                     "metadata": {"created_at": datetime.now(timezone.utc).isoformat()}
                 }
@@ -921,10 +927,15 @@ async def handle_send_to_run_message(ws_state: Dict, data: Dict):
                 team_state = run_context['team_state']
 
                 # --- Core modification: Similarly, only create an InboxItem ---
+                # 构建payload，包含文本和图像
+                payload = {"prompt": prompt_content}
+                if images_content:
+                    payload["images"] = images_content
+                
                 inbox_item = {
                     "item_id": f"inbox_{uuid.uuid4().hex[:8]}",
                     "source": "USER_PROMPT",
-                    "payload": {"prompt": prompt_content},
+                    "payload": payload,
                     "consumption_policy": "consume_on_read",
                     "metadata": {"created_at": datetime.now(timezone.utc).isoformat()}
                 }
@@ -949,12 +960,153 @@ async def handle_send_to_run_message(ws_state: Dict, data: Dict):
         logger.error("send_to_run_processing_error", extra={"session_id": session_id_for_log, "target_run_id": target_run_id, "run_type": run_type, "error_message": str(e)}, exc_info=True)
         await event_manager.emit_error(run_id=target_run_id, agent_id="System", error_message=f"Error processing message for run {target_run_id}: {str(e)}")
 
+def _build_image_inbox_item(prompt_content: str, image_info: Dict) -> Dict:
+    """Build the inbox item that carries a user prompt together with one image.
+
+    The item uses the 'USER_PROMPT_WITH_IMAGE' source and the 'consume_on_read' policy.
+    """
+    return {
+        "item_id": f"inbox_{uuid.uuid4().hex[:8]}",
+        "source": "USER_PROMPT_WITH_IMAGE",  # Specialized event source for image messages
+        "payload": {"prompt": prompt_content, "image_info": image_info},
+        "consumption_policy": "consume_on_read",
+        "metadata": {"created_at": datetime.now(timezone.utc).isoformat()}
+    }
+
+
+async def handle_send_image_message(ws_state: Dict, data: Dict):
+    """
+    Handles 'send_image_message' messages, routing client messages with image data to the specified active business run.
+    This is similar to handle_send_to_run_message but specifically designed for multimodal messages.
+
+    Args:
+        ws_state: Per-connection state; this handler reads `event_manager` and writes `active_run_tasks`.
+        data: Client message; must provide 'run_id', a dict 'message_payload' (with 'prompt') and 'image_info'.
+
+    A run in status 'CREATED' is activated (only run type 'partner_interaction' is supported); a run in
+    'RUNNING' or 'AWAITING_INPUT' receives the message through the partner inbox. Validation failures and
+    errors raised while processing are reported to the client via `event_manager.emit_error`.
+    """
+    event_manager = ws_state.event_manager
+    session_id_for_log = event_manager.session_id
+
+    target_run_id = data.get("run_id")
+    run_id_var.set(target_run_id)  # Set context variable
+    message_payload = data.get("message_payload")
+    image_info = data.get("image_info")
+
+    logger.info("send_image_message_received", extra={
+        "session_id": session_id_for_log,
+        "target_run_id": target_run_id,
+        "message_preview": str(message_payload)[:100],
+        "has_image_info": bool(image_info)
+    })
+
+    # A non-dict payload is rejected here as well: `.get("prompt")` below runs
+    # outside the try block and would otherwise raise an unhandled AttributeError.
+    if not target_run_id or not isinstance(message_payload, dict) or image_info is None:
+        err_msg = "'send_image_message' requires 'run_id', 'message_payload', and 'image_info'."
+        logger.warning("send_image_message_missing_params", extra={
+            "session_id": session_id_for_log,
+            "data": data,
+            "has_run_id": bool(target_run_id),
+            "has_message_payload": message_payload is not None,
+            "has_image_info": image_info is not None
+        })
+        await event_manager.emit_error(run_id=target_run_id, agent_id="System", error_message=err_msg)
+        return
+
+    run_context = active_runs_store.get(target_run_id)
+    if not run_context:
+        err_msg = f"Target run {target_run_id} not found or not active."
+        logger.warning("send_image_message_target_not_found", extra={"session_id": session_id_for_log, "target_run_id": target_run_id})
+        await event_manager.emit_error(run_id=target_run_id, agent_id="System", error_message=err_msg)
+        return
+
+    run_status = run_context['meta'].get('status')
+    run_type = run_context['meta'].get('run_type')
+    prompt_content = message_payload.get("prompt")
+
+    try:
+        # --- Branch 1: Activate a pending run with image ---
+        if run_status == 'CREATED':
+            logger.debug("run_activation_with_image_started", extra={"run_id": target_run_id, "run_type": run_type})
+
+            if prompt_content is None:
+                raise ValueError("First message to a new run must contain a 'prompt'.")
+
+            run_context['team_state']['question'] = prompt_content
+
+            if run_type != "partner_interaction":
+                raise ValueError(f"Run type '{run_type}' does not support activation via 'send_image_message'.")
+
+            partner_context = run_context['sub_context_refs']['_partner_context_ref']
+            partner_state = partner_context['state']
+
+            # 1. Queue the prompt and image info in the partner inbox
+            partner_state.setdefault("inbox", []).append(_build_image_inbox_item(prompt_content, image_info))
+
+            # 2. Start the task
+            task = asyncio.create_task(run_partner_interaction_async(partner_context=partner_context))
+            ws_state.active_run_tasks[target_run_id] = task
+            # Log whether the run task finished or was cancelled
+            task.add_done_callback(
+                lambda t: logger.info("run_task_finished", extra={"run_id": target_run_id, "run_type": run_type, "session_id": session_id_for_log})
+                if not t.cancelled() else
+                logger.info("run_task_cancelled", extra={"run_id": target_run_id, "run_type": run_type, "session_id": session_id_for_log})
+            )
+
+            run_context['meta']['status'] = 'AWAITING_INPUT'
+            logger.debug("run_activation_with_image_completed", extra={"run_id": target_run_id, "status": "AWAITING_INPUT"})
+
+            # 3. Wake up the task
+            new_input_event = partner_context['runtime_objects'].get("new_user_input_event")
+            if new_input_event:
+                new_input_event.set()
+            return  # Critical: Return immediately after handling activation
+
+        # --- Branch 2: Send an image message to a running session ---
+        elif run_status in ['RUNNING', 'AWAITING_INPUT']:
+            if prompt_content is None:
+                raise ValueError("Message payload must contain a 'prompt'.")
+
+            # Fail loudly instead of silently dropping the message for unsupported run types
+            if run_type != "partner_interaction":
+                raise ValueError(f"Run type '{run_type}' does not support 'send_image_message'.")
+
+            partner_context = run_context['sub_context_refs']['_partner_context_ref']
+            partner_state = partner_context['state']
+
+            # --- Core modification: Create an InboxItem with image info ---
+            partner_state.setdefault("inbox", []).append(_build_image_inbox_item(prompt_content, image_info))
+
+            # Wake up the task
+            new_input_event = partner_context['runtime_objects'].get("new_user_input_event")
+            if new_input_event:
+                new_input_event.set()
+                logger.info("partner_task_notified_with_image", extra={"run_id": target_run_id, "notification_method": "inbox"})
+            else:
+                # No exception is active here, so `exc_info` is deliberately not passed
+                logger.error("partner_notification_failed", extra={"run_id": target_run_id, "reason": "new_user_input_event_not_found"})
+
+        # --- Branch 3: Handle invalid states ---
+        else:
+            err_msg = f"Cannot send image message to run {target_run_id} because its status is '{run_status}'."
+            logger.warning("send_image_message_invalid_status", extra={"session_id": session_id_for_log, "run_id": target_run_id, "run_status": run_status})
+            await event_manager.emit_error(run_id=target_run_id, agent_id="System", error_message=err_msg)
+            return
+
+    except Exception as e:
+        logger.error("send_image_message_processing_error", extra={"session_id": session_id_for_log, "target_run_id": target_run_id, "run_type": run_type, "error_message": str(e)}, exc_info=True)
+        await event_manager.emit_error(run_id=target_run_id, agent_id="System", error_message=f"Error processing image message for run {target_run_id}: {str(e)}")
+
 # --- MESSAGE_HANDLERS registry (Dango's version, with adapted function names) ---
 MESSAGE_HANDLERS: Dict[str, callable] = {
     "start_run": handle_start_run_message,
     "stop_run": handle_stop_run_message,
     "request_available_toolsets": handle_request_available_toolsets,
     "send_to_run": handle_send_to_run_message, # Added by Dango, adapted
+    "send_image_message": handle_send_image_message, # New handler for multimodal messages
     "stop_managed_principal": handle_stop_managed_principal_message, # Added by Dango, adapted
     "request_run_profiles": handle_request_run_profiles_message, # Added by Dango, adapted
     "request_run_context": handle_request_run_context_message, # Added by Dango, adapted
diff --git a/frontend/app/chat/components/ChatInput.tsx b/frontend/app/chat/components/ChatInput.tsx
index b0747e3..d5b77f9 100644
--- a/frontend/app/chat/components/ChatInput.tsx
+++ b/frontend/app/chat/components/ChatInput.tsx
@@ -1,12 +1,20 @@
-import React from 'react';
+import React, { useState, useRef, useCallback } from 'react';
 import { Button } from '@/components/ui/button';
 import { Input } from '@/components/ui/input';
+import { X, Image as ImageIcon, Paperclip } from 'lucide-react';
+
+interface ImageAttachment { // An image queued in the chat input that has not been sent yet
+  id: string; // Client-side key used for the preview list and for removal
+  file: File; // The pasted or selected image file
+  dataUrl: string; // Result of FileReader.readAsDataURL, used as the preview <img> src
+  name: string; // file.name, shown as the preview caption and alt text
+}
 
 interface ChatInputProps {
   currentInput: string;
   onInputChange: (value: string) => void;
   onKeyPress: (e: React.KeyboardEvent) => void;
-  onSendMessage: () => void;
+  onSendMessage: (images?: ImageAttachment[]) => void;
   isStreaming: boolean;
   isLoading: boolean;
   onStopExecution: () => void;
@@ -21,35 +29,170 @@ export function ChatInput({
   isLoading,
   onStopExecution,
 }: ChatInputProps) {
+  const [images, setImages] = useState<ImageAttachment[]>([]);
+  const fileInputRef = useRef<HTMLInputElement>(null);
+  const inputRef = useRef<HTMLInputElement>(null);
+
+  // Add an image file to the pending attachments
+  const addImageFile = useCallback((file: File) => {
+    if (!file.type.startsWith('image/')) {
+      alert('Please select an image file');
+      return;
+    }
+
+    // Limit file size to 10MB
+    if (file.size > 10 * 1024 * 1024) {
+      alert('Image size should be less than 10MB');
+      return;
+    }
+
+    const reader = new FileReader();
+    reader.onload = (e) => {
+      const dataUrl = e.target?.result as string;
+      const newImage: ImageAttachment = {
+        // Date.now() alone collides when several files finish loading in the same
+        // millisecond, which breaks React keys and makes removeImage drop them together.
+        id: `${Date.now()}-${Math.random().toString(36).slice(2, 10)}`,
+        file,
+        dataUrl,
+        name: file.name
+      };
+      setImages(prev => [...prev, newImage]);
+    };
+    reader.readAsDataURL(file);
+  }, []);
+
+  // Handle paste: attach the first clipboard image (declared after addImageFile so it can be a dependency)
+  const handlePaste = useCallback((e: React.ClipboardEvent) => {
+    const items = e.clipboardData?.items;
+    if (!items) return;
+
+    for (let i = 0; i < items.length; i++) {
+      const item = items[i];
+      if (item.type.startsWith('image/')) {
+        e.preventDefault();
+        const file = item.getAsFile();
+        if (file) addImageFile(file);
+        break;
+      }
+    }
+  }, [addImageFile]);
+
+  // Handle file selection from the hidden file input
+  const handleFileSelect = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
+    // Snapshot the FileList first; an absent list means nothing to add
+    const selected = e.target.files ? Array.from(e.target.files) : [];
+    selected.forEach((picked) => addImageFile(picked));
+    // Reset the input value so the same file can be selected again
+    const inputEl = fileInputRef.current;
+    if (inputEl) {
+      inputEl.value = '';
+    }
+  }, [addImageFile]);
+
+  // Remove the pending attachment whose id matches
+  const removeImage = useCallback((id: string) => {
+    setImages((current) => current.filter((attachment) => attachment.id !== id));
+  }, []);
+
+  // Hand pending attachments (or undefined when there are none) to the parent send handler
+  const handleSendMessage = useCallback(() => {
+    onSendMessage(images.length === 0 ? undefined : images);
+    setImages([]); // Clear attachments once they have been handed off
+  }, [onSendMessage, images]);
+
+  // Enter (without Shift) sends locally; every other key is delegated to the parent handler
+  const handleKeyPress = useCallback((e: React.KeyboardEvent) => {
+    const isPlainEnter = e.key === 'Enter' && !e.shiftKey;
+    if (!isPlainEnter) {
+      onKeyPress(e);
+      return;
+    }
+    e.preventDefault();
+    const hasContent = Boolean(currentInput.trim()) || images.length > 0;
+    if (hasContent && !isStreaming && !isLoading) handleSendMessage();
+  }, [currentInput, images, isStreaming, isLoading, handleSendMessage, onKeyPress]);
+
   return (
     <div className="p-3">
+      {/* 图像预览区域 */}
+      {images.length > 0 && (
+        <div className="mb-3 p-3 bg-gray-50 rounded-lg">
+          <div className="flex flex-wrap gap-2">
+            {images.map((image) => (
+              <div key={image.id} className="relative group">
+                <img
+                  src={image.dataUrl}
+                  alt={image.name}
+                  className="w-20 h-20 object-cover rounded border"
+                />
+                <button
+                  onClick={() => removeImage(image.id)}
+                  className="absolute -top-1 -right-1 w-5 h-5 bg-red-500 text-white rounded-full flex items-center justify-center opacity-0 group-hover:opacity-100 transition-opacity"
+                >
+                  <X size={12} />
+                </button>
+                <div className="absolute bottom-0 left-0 right-0 bg-black bg-opacity-50 text-white text-xs p-1 rounded-b truncate">
+                  {image.name}
+                </div>
+              </div>
+            ))}
+          </div>
+        </div>
+      )}
+
       <div className="flex gap-2 bg-white rounded-lg border overflow-hidden p-2 focus-within:border-black transition-colors">
+        {/* 文件上传按钮 */}
+        <Button
+          variant="ghost"
+          size="icon"
+          onClick={() => fileInputRef.current?.click()}
+          disabled={isStreaming || isLoading}
+          className="flex-shrink-0"
+        >
+          <Paperclip size={16} />
+        </Button>
+        
         <Input
+          ref={inputRef}
           value={currentInput}
           onChange={(e) => onInputChange(e.target.value)}
-          onKeyPress={onKeyPress}
-          placeholder="Enter message..."
+          onKeyPress={handleKeyPress}
+          onPaste={handlePaste}
+          placeholder="Enter message or paste image..."
           disabled={isStreaming || isLoading}
           className="flex-1 border-0 focus-visible:ring-0 focus-visible:ring-offset-0 shadow-none px-2"
         />
+        
         {isStreaming ? (
           <Button
             variant="ghost"
             size="icon"
-            className="rounded-full bg-black hover:bg-black/90 !px-2 !py-1"
+            className="rounded-full bg-black hover:bg-black/90 !px-2 !py-1 flex-shrink-0"
             onClick={onStopExecution}
           >
             <div className="w-3 h-3 bg-white" />
           </Button>
         ) : (
           <Button 
-            onClick={onSendMessage}
-            disabled={!currentInput.trim() || isLoading}
+            onClick={handleSendMessage}
+            disabled={(!currentInput.trim() && images.length === 0) || isLoading}
+            className="flex-shrink-0"
           >
             Send
           </Button>
         )}
       </div>
+      
+      {/* 隐藏的文件输入 */}
+      <input
+        ref={fileInputRef}
+        type="file"
+        accept="image/*"
+        multiple
+        onChange={handleFileSelect}
+        className="hidden"
+      />
     </div>
   );
 }
diff --git a/frontend/app/chat/components/ProjectPage.tsx b/frontend/app/chat/components/ProjectPage.tsx
index effa802..037cf78 100644
--- a/frontend/app/chat/components/ProjectPage.tsx
+++ b/frontend/app/chat/components/ProjectPage.tsx
@@ -1,7 +1,6 @@
 import React, { useState, useEffect } from 'react';
 import { observer } from 'mobx-react-lite';
 import { Button } from '@/components/ui/button';
-import { Textarea } from '@/components/ui/textarea';
 import { Input } from '@/components/ui/input';
 import { SidebarTrigger } from '@/components/ui/sidebar';
 import {
@@ -22,11 +21,12 @@ import { projectStore } from '@/app/stores/projectStore';
 import { selectionStore } from '@/app/stores/selectionStore';
 import LoadingSpinner from '@/components/layout/LoadingSpinner';
 import { ProjectWithRuns } from '@/lib/types';
+import { ChatInput } from './ChatInput';
 
 interface ProjectPageProps {
   currentInput: string;
   onInputChange: (value: string) => void;
-  onSendMessage: () => void;
+  onSendMessage: (images?: any[]) => void;
   onKeyPress: (e: React.KeyboardEvent) => void;
   isLoading: boolean;
 }
@@ -285,22 +285,15 @@ export const ProjectPage = observer(function ProjectPage({
             {/* Chat Input Section */}
             <div>
               <h2 className="text-lg font-semibold mb-3">What can I help you?</h2>
-              <div className="relative">
-                <Textarea
-                  value={currentInput}
-                  onChange={(e) => onInputChange(e.target.value)}
-                  onKeyPress={onKeyPress}
-                  placeholder="Enter message..."
-                  className="min-h-[120px] resize-none w-full rounded-lg border p-4 pr-24 focus-visible:ring-1 focus-visible:ring-black"
-                />
-                <Button
-                  onClick={onSendMessage}
-                  disabled={!currentInput.trim() || isLoading}
-                  className="absolute right-3 bottom-3 bg-black hover:bg-black/90"
-                >
-                  Send
-                </Button>
-              </div>
+              <ChatInput
+                currentInput={currentInput}
+                onInputChange={onInputChange}
+                onKeyPress={onKeyPress}
+                onSendMessage={onSendMessage}
+                isStreaming={false}
+                isLoading={isLoading}
+                onStopExecution={() => {}}
+              />
             </div>
 
             {/* Project Instructions Section */}
diff --git a/frontend/app/chat/components/WelcomeScreen.tsx b/frontend/app/chat/components/WelcomeScreen.tsx
index ebad82b..e0d4715 100644
--- a/frontend/app/chat/components/WelcomeScreen.tsx
+++ b/frontend/app/chat/components/WelcomeScreen.tsx
@@ -1,14 +1,13 @@
 import React from 'react';
 import { observer } from 'mobx-react-lite';
-import { Button } from '@/components/ui/button';
-import { Textarea } from '@/components/ui/textarea';
 import { SidebarTrigger } from '@/components/ui/sidebar';
 import { selectionStore } from '@/app/stores/selectionStore';
+import { ChatInput } from './ChatInput';
 
 interface WelcomeScreenProps {
   currentInput: string;
   onInputChange: (value: string) => void;
-  onSendMessage: () => void;
+  onSendMessage: (images?: any[]) => void;
   onKeyPress: (e: React.KeyboardEvent) => void;
   isLoading: boolean;
 }
@@ -28,23 +27,16 @@ export const WelcomeScreen = observer(function WelcomeScreen({ currentInput, onI
       </div>
       <div className="flex-1 flex flex-col items-center justify-center">
         <h1 className="text-[32px] font-medium mb-12">What can I help you with?</h1>
-        <div className="w-[600px] relative mb-12">
-          <div className="rounded-lg border focus-within:border-black overflow-hidden transition-colors">
-            <Textarea
-              value={currentInput}
-              onChange={(e) => onInputChange(e.target.value)}
-              onKeyPress={onKeyPress}
-              placeholder="Enter message..."
-              className="min-h-[150px] resize-none w-full border-0 shadow-none focus-visible:ring-0 focus-visible:ring-offset-0"
-            />
-          </div>
-          <Button 
-            onClick={onSendMessage}
-            disabled={!currentInput.trim() || isLoading}
-            className="absolute right-4 bottom-4 bg-black hover:bg-black/90 "
-          >
-            Send
-          </Button>
+        <div className="w-[600px] mb-12">
+          <ChatInput
+            currentInput={currentInput}
+            onInputChange={onInputChange}
+            onKeyPress={onKeyPress}
+            onSendMessage={onSendMessage}
+            isStreaming={false}
+            isLoading={isLoading}
+            onStopExecution={() => {}}
+          />
         </div>
         {/* <div className="flex flex-col items-start justify-center">
           <p className="font-medium text-lg">From the Community</p>
diff --git a/frontend/app/chat/page.tsx b/frontend/app/chat/page.tsx
index 6bf7aa1..653d4e3 100644
--- a/frontend/app/chat/page.tsx
+++ b/frontend/app/chat/page.tsx
@@ -30,8 +30,8 @@ export default observer(function ChatPage() {
     selectionStore.clearSelection();
   }, []);
 
-  const sendMessage = useCallback(async () => {
-    if (!store.currentInput.trim() || store.isLoading || store.isCreatingRun) return;
+  const sendMessage = useCallback(async (images?: any[]) => {
+    if ((!store.currentInput.trim() && !images?.length) || store.isLoading || store.isCreatingRun) return;
     
     // Close the sidebar (if on mobile or first message)
     if (sidebarContext) {
@@ -55,7 +55,7 @@ export default observer(function ChatPage() {
         
         // Send message
         store.setIsLoading(true);
-        await sessionStore.sendMessage(messageToSend, newRunId);
+        await sessionStore.sendMessage(messageToSend, newRunId, images);
 
         // Redirect to the newly created run page
         router.push(`/r?id=${newRunId}`);
diff --git a/frontend/app/stores/sessionStore.ts b/frontend/app/stores/sessionStore.ts
index 9d402d3..0bb1364 100644
--- a/frontend/app/stores/sessionStore.ts
+++ b/frontend/app/stores/sessionStore.ts
@@ -888,22 +888,55 @@ class SessionStore {
     }
   }
 
-  async sendMessage(message: string, runId: string) {
+  async sendMessage(message: string, runId: string, images?: any[]) {
     if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
         console.error("Cannot send message, WebSocket is not open.");
         this.error = "Connection failed. Cannot send message.";
         return;
     }
 
+    // 处理图像数据
+    const imageData = images ? await Promise.all(
+      images.map(async (img) => {
+        // 将文件转换为base64
+        const base64 = await this.fileToBase64(img.file);
+        return {
+          type: 'image',
+          data: base64,
+          name: img.name,
+          mimeType: img.file.type
+        };
+      })
+    ) : [];
+
+    const messagePayload: any = { prompt: message };
+    if (imageData.length > 0) {
+      messagePayload.images = imageData;
+    }
+
     this.ws.send(JSON.stringify({
         type: 'send_to_run',
         data: {
             run_id: runId,
-            message_payload: { prompt: message }
+            message_payload: messagePayload
         }
     }));
   }
 
+  private fileToBase64(file: File): Promise<string> {
+    return new Promise((resolve, reject) => {
+      const reader = new FileReader();
+      reader.onload = () => {
+        const result = reader.result as string;
+        // 移除data:image/xxx;base64,前缀，只保留base64数据
+        const base64Data = result.split(',')[1];
+        resolve(base64Data);
+      };
+      reader.onerror = reject;
+      reader.readAsDataURL(file);
+    });
+  }
+
   async stopExecution(runId: string) {
     if (this.ws?.readyState === WebSocket.OPEN) {
       this.ws.send(JSON.stringify({

From 834d1a8236bd70f1175233398a36a38d2c464f05 Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Wed, 6 Aug 2025 09:24:12 +0800
Subject: [PATCH 02/32] Remove unreachable return in multimodal_user_prompt_ingestor

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 core/agent_core/events/ingestors.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/core/agent_core/events/ingestors.py b/core/agent_core/events/ingestors.py
index 8bf520f..c746894 100644
--- a/core/agent_core/events/ingestors.py
+++ b/core/agent_core/events/ingestors.py
@@ -357,7 +357,6 @@ def multimodal_user_prompt_ingestor(payload: Any, params: Dict, context: Dict) -
         return f"{prompt}\n\n{image_info}"
     else:
         return image_info
-    return str(payload)
 
 def _recursive_markdown_formatter(data: Any, schema: Dict, level: int = 0) -> List[str]:
     """

From 241d234aa02c03e87c3f7f1e5e11ec5a6a160403 Mon Sep 17 00:00:00 2001
From: sdbds <865105819@qq.com>
Date: Wed, 6 Aug 2025 09:32:04 +0800
Subject: [PATCH 03/32] ChatInput: generate image attachment ids with randomUUID()

---
 frontend/app/chat/components/ChatInput.tsx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/frontend/app/chat/components/ChatInput.tsx b/frontend/app/chat/components/ChatInput.tsx
index d5b77f9..f535eba 100644
--- a/frontend/app/chat/components/ChatInput.tsx
+++ b/frontend/app/chat/components/ChatInput.tsx
@@ -2,6 +2,7 @@ import React, { useState, useRef, useCallback } from 'react';
 import { Button } from '@/components/ui/button';
 import { Input } from '@/components/ui/input';
 import { X, Image as ImageIcon, Paperclip } from 'lucide-react';
+import { randomUUID } from 'node:crypto';
 
 interface ImageAttachment {
   id: string;
@@ -68,7 +69,7 @@ export function ChatInput({
     reader.onload = (e) => {
       const dataUrl = e.target?.result as string;
       const newImage: ImageAttachment = {
-        id: Date.now().toString(),
+        id: randomUUID(),
         file,
         dataUrl,
         name: file.name

From 6f3fd21516805619c4d5866e6d099165b174877f Mon Sep 17 00:00:00 2001
From: sdbds <865105819@qq.com>
Date: Wed, 6 Aug 2025 09:53:08 +0800
Subject: [PATCH 04/32] ChatInput: use global crypto.randomUUID() instead of node:crypto import

---
 frontend/app/chat/components/ChatInput.tsx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/frontend/app/chat/components/ChatInput.tsx b/frontend/app/chat/components/ChatInput.tsx
index f535eba..1af78da 100644
--- a/frontend/app/chat/components/ChatInput.tsx
+++ b/frontend/app/chat/components/ChatInput.tsx
@@ -2,7 +2,7 @@ import React, { useState, useRef, useCallback } from 'react';
 import { Button } from '@/components/ui/button';
 import { Input } from '@/components/ui/input';
 import { X, Image as ImageIcon, Paperclip } from 'lucide-react';
-import { randomUUID } from 'node:crypto';
+
 
 interface ImageAttachment {
   id: string;
@@ -69,7 +69,7 @@ export function ChatInput({
     reader.onload = (e) => {
       const dataUrl = e.target?.result as string;
       const newImage: ImageAttachment = {
-        id: randomUUID(),
+        id: crypto.randomUUID(),
         file,
         dataUrl,
         name: file.name

From fc32a018c36bb285ef44ebfecc74db9abf42186b Mon Sep 17 00:00:00 2001
From: sdbds <865105819@qq.com>
Date: Mon, 11 Aug 2025 12:03:15 +0800
Subject: [PATCH 05/32] Switch multimodal handling from images to generic file uploads

---
 core/agent_core/events/ingestors.py           |  38 ++-
 core/agent_core/framework/inbox_processor.py  | 180 ++++++++++----
 core/agent_core/llm/call_llm.py               |  47 ----
 core/api/message_handlers.py                  | 190 +++------------
 core/pyproject.toml                           |   1 +
 frontend/app/chat/components/ChatInput.tsx    | 222 +++++++++++++-----
 frontend/app/chat/components/ProjectPage.tsx  |   2 +-
 .../app/chat/components/WelcomeScreen.tsx     |   2 +-
 frontend/app/chat/page.tsx                    |   6 +-
 frontend/app/stores/sessionStore.ts           |  23 +-
 10 files changed, 362 insertions(+), 349 deletions(-)

diff --git a/core/agent_core/events/ingestors.py b/core/agent_core/events/ingestors.py
index c746894..19c2fda 100644
--- a/core/agent_core/events/ingestors.py
+++ b/core/agent_core/events/ingestors.py
@@ -339,24 +339,36 @@ def user_prompt_ingestor(payload: Any, params: Dict, context: Dict) -> str:
 
 @register_ingestor("multimodal_user_prompt_ingestor")
 def multimodal_user_prompt_ingestor(payload: Any, params: Dict, context: Dict) -> str:
-    """处理包含图像的用户输入，返回适合LLM的格式"""
+    """处理包含图像/文件的用户输入，返回适合LLM的简要文本描述（实际数据在消息构建时处理）。"""
     if not isinstance(payload, dict):
         return str(payload)
-    
+
     prompt = payload.get("prompt", "")
     images = payload.get("images", [])
-    
-    # 如果没有图像，返回普通文本
-    if not images:
+    files = payload.get("files", [])
+
+    # 没有图像和文件时，直接返回文本
+    if not images and not files:
         return prompt
-    
-    # 如果有图像，需要特殊处理
-    # 这里我们将图像信息标记在文本中，实际的图像数据会在消息构建时处理
-    image_info = f"[用户上传了{len(images)}张图像]"
-    if prompt:
-        return f"{prompt}\n\n{image_info}"
-    else:
-        return image_info
+
+    # 构造一个简短的附件说明
+    parts = []
+    if images:
+        parts.append(f"用户上传了{len(images)}张图像")
+    if files:
+        # 可选：列出最多前3个文件名
+        names = []
+        for f in files[:3]:
+            name = f.get("name") or f.get("filename")
+            if name:
+                names.append(name)
+        if names:
+            parts.append(f"并附带{len(files)}个文件（示例：{', '.join(names)}{'' if len(files) <= 3 else ' 等'}）")
+        else:
+            parts.append(f"并附带{len(files)}个文件")
+
+    note = "[" + "，".join(parts) + "]"
+    return f"{prompt}\n\n{note}" if prompt else note
 
 def _recursive_markdown_formatter(data: Any, schema: Dict, level: int = 0) -> List[str]:
     """
diff --git a/core/agent_core/framework/inbox_processor.py b/core/agent_core/framework/inbox_processor.py
index fc36bdd..1359087 100644
--- a/core/agent_core/framework/inbox_processor.py
+++ b/core/agent_core/framework/inbox_processor.py
@@ -1,7 +1,13 @@
 import logging
 import uuid
+import os
+import base64
 from typing import Any, Dict, List, Optional
 from datetime import datetime, timezone
+import asyncio
+import time
+import httpx
+from litellm import create_file
 
 from ..events.event_strategies import EVENT_STRATEGY_REGISTRY
 from ..events.ingestors import INGESTOR_REGISTRY, markdown_formatter_ingestor
@@ -34,8 +40,8 @@ def _create_user_turn_from_inbox_item(self, item: Dict) -> Optional[str]:
         team_state = self.team_state
         
         prompt_content = item.get("payload", {}).get("prompt")
-        images_content = item.get("payload", {}).get("images", [])
-        if not prompt_content and not images_content:
+        files_content = item.get("payload", {}).get("files", [])
+        if not prompt_content and not files_content:
             return None
 
         user_turn_id = f"turn_user_{uuid.uuid4().hex[:8]}"
@@ -63,7 +69,10 @@ def _create_user_turn_from_inbox_item(self, item: Dict) -> Optional[str]:
             "end_time": item.get("metadata", {}).get("created_at", datetime.now(timezone.utc).isoformat()),
             "source_turn_ids": [last_agent_turn_id] if last_agent_turn_id else [],
             "source_tool_call_id": None,
-            "inputs": {"prompt": prompt_content, "images": images_content} if images_content else {"prompt": prompt_content},
+            "inputs": (
+                {"prompt": prompt_content, "files": files_content}
+                if files_content else {"prompt": prompt_content}
+            ),
             "outputs": {},
             "llm_interaction": None,
             "tool_interactions": [],
@@ -250,62 +259,129 @@ async def process(self) -> Dict[str, Any]:
                 role = params.get("role", "user")
                 is_persistent = params.get("is_persistent_in_memory", False)
                 
-                # 处理多模态内容（图像）
-                has_image_content = False
+                # 处理多模态内容（仅文件）
+                has_multimodal_content = False
                 content_parts = []
                 
-                # 检查是否有图像内容（支持两种格式）
-                if source in ["USER_PROMPT", "USER_PROMPT_WITH_IMAGE"] and isinstance(dehydrated_payload, dict):
-                    # 新格式：image_info（来自 send_image_message）
-                    if dehydrated_payload.get("image_info"):
-                        has_image_content = True
-                        # 添加文本内容
-                        if injected_content:
+                # 检查是否有文件内容
+                if source in ["USER_PROMPT", "USER_PROMPT_WITH_FILES"] and isinstance(dehydrated_payload, dict):
+                    # 处理文件内容：将附件上传到 Gemini 并构造成 file 引用
+                    files = dehydrated_payload.get("files", [])
+                    if files:
+                        has_multimodal_content = True
+                        # 添加文本内容（若尚未添加）
+                        if injected_content and not any(part.get("type") == "text" for part in content_parts):
                             content_parts.append({
                                 "type": "text",
                                 "text": injected_content
                             })
-                        
-                        # 添加图像内容
-                        image_info = dehydrated_payload["image_info"]
-                        content_parts.append({
-                            "type": "image_url",
-                            "image_url": {
-                                "url": image_info.get("url", ""),
-                                "detail": "high"  # Can be "low", "high", or "auto"
-                            }
-                        })
-                        logger.debug("multimodal_message_processed_from_image_info", extra={
-                            "agent_id": self.agent_id,
-                            "image_url": image_info.get("url", ""),
-                            "text_content_length": len(injected_content) if injected_content else 0
-                        })
-                    
-                    # 旧格式：images（向后兼容）
-                    elif dehydrated_payload.get("images"):
-                        has_image_content = True
-                        # 添加文本内容
-                        if injected_content:
-                            content_parts.append({
-                                "type": "text",
-                                "text": injected_content
-                            })
-                        
-                        # 添加图像内容
-                        for image_data in dehydrated_payload["images"]:
-                            content_parts.append({
-                                "type": "image_url",
-                                "image_url": {
-                                    "url": f"data:{image_data['mimeType']};base64,{image_data['data']}"
-                                }
-                            })
-                        logger.debug("multimodal_message_processed_from_images", extra={
-                            "agent_id": self.agent_id,
-                            "image_count": len(dehydrated_payload["images"]),
-                            "text_content_length": len(injected_content) if injected_content else 0
-                        })
+
+                        for f in files:
+                            try:
+                                file_bytes = None
+                                filename = f.get("name") or f.get("filename") or f"file_{uuid.uuid4().hex[:6]}"
+                                mime_type = f.get("mimeType") or f.get("mime_type") or "application/octet-stream"
+
+                                if f.get("file_id"):
+                                    # Already uploaded
+                                    file_id = f["file_id"]
+                                    logger.info("gemini_file_upload_skipped_existing", extra={
+                                        "agent_id": self.agent_id,
+                                        "filename": filename,
+                                        "mime_type": mime_type,
+                                        "file_id": file_id,
+                                    })
+                                else:
+                                    size_bytes = None
+                                    if f.get("url"):
+                                        # Async fetch
+                                        async with httpx.AsyncClient(timeout=20) as client:
+                                            resp = await client.get(f["url"])
+                                            resp.raise_for_status()
+                                            file_bytes = resp.content
+                                        size_bytes = len(file_bytes) if file_bytes is not None else None
+                                    elif f.get("data"):
+                                        # base64 string possibly without header
+                                        data_str = f["data"]
+                                        # Strip data URL prefix if present
+                                        if data_str.startswith("data:"):
+                                            data_str = data_str.split(",", 1)[1]
+                                        file_bytes = base64.b64decode(data_str)
+                                        size_bytes = len(file_bytes)
+                                    else:
+                                        # Unsupported entry, skip
+                                        logger.warning("file_entry_missing_data", extra={"agent_id": self.agent_id, "filename": filename})
+                                        continue
+
+                                    # Prefer API key from project LLM config; fallback to env var
+                                    try:
+                                        resolver = LLMConfigResolver(shared_llm_configs=self.run_context.get("config", {}).get("shared_llm_configs_ref", {}))
+                                        llm_config = resolver.resolve(self.profile)
+                                    except Exception:
+                                        llm_config = {}
+                                    gemini_key = (
+                                        (llm_config.get("api_key") if isinstance(llm_config, dict) else None)
+                                        or os.getenv("GEMINI_API_KEY")
+                                    )
+                                    if not gemini_key:
+                                        logger.error(
+                                            "gemini_api_key_missing",
+                                            extra={
+                                                "agent_id": self.agent_id,
+                                                "hint": "Provide api_key in active LLM config or set GEMINI_API_KEY env var"
+                                            }
+                                        )
+                                        continue
+
+                                    # Structured start log
+                                    logger.info("gemini_file_upload_start", extra={
+                                        "agent_id": self.agent_id,
+                                        "filename": filename,
+                                        "mime_type": mime_type,
+                                        "size_bytes": size_bytes,
+                                    })
+                                    t0 = time.perf_counter()
+
+                                    # Offload blocking create_file to a thread
+                                    created = await asyncio.to_thread(
+                                        create_file,
+                                        file=file_bytes,
+                                        purpose="user_data",
+                                        extra_body={"custom_llm_provider": "gemini"},
+                                        api_key=gemini_key,
+                                    )
+                                    file_id = getattr(created, "id", None) if created is not None else None
+                                    if not file_id:
+                                        logger.error("gemini_file_upload_failed", extra={
+                                            "filename": filename,
+                                            "mime_type": mime_type,
+                                            "size_bytes": size_bytes,
+                                            "duration_ms": int((time.perf_counter() - t0) * 1000),
+                                        })
+                                        continue
+                                    else:
+                                        logger.info("gemini_file_upload_success", extra={
+                                            "agent_id": self.agent_id,
+                                            "filename": filename,
+                                            "mime_type": mime_type,
+                                            "size_bytes": size_bytes,
+                                            "file_id": file_id,
+                                            "duration_ms": int((time.perf_counter() - t0) * 1000),
+                                        })
+
+                                # Append file reference content part
+                                content_parts.append({
+                                    "type": "file",
+                                    "file": {
+                                        "file_id": file_id,
+                                        "filename": filename,
+                                        "format": mime_type
+                                    }
+                                })
+                            except Exception as ex:
+                                logger.error("file_processing_failed", extra={"error": str(ex)}, exc_info=True)
                 
-                if has_image_content:
+                if has_multimodal_content:
                     new_message = {"role": role, "content": content_parts}
                 else:
                     new_message = {"role": role, "content": injected_content}
diff --git a/core/agent_core/llm/call_llm.py b/core/agent_core/llm/call_llm.py
index 2c6edd1..21bab9e 100644
--- a/core/agent_core/llm/call_llm.py
+++ b/core/agent_core/llm/call_llm.py
@@ -239,7 +239,6 @@ async def call_litellm_acompletion(
     run_id_for_event: Optional[str] = None,
     contextual_data_for_event: Optional[Dict] = None,
     run_context: Optional[Dict] = None,
-    image_info: Optional[Dict[str, Any]] = None,  # New parameter for image data
     **kwargs
 ) -> Dict[str, Any]:
     """
@@ -255,52 +254,6 @@ async def call_litellm_acompletion(
             final_messages[0]["content"] = system_prompt_content
         else:
             final_messages.insert(0, {"role": "system", "content": system_prompt_content})
-    
-    # Process image information if provided
-    if image_info and final_messages:
-        # Find the last user message and add image to it
-        for i in range(len(final_messages) - 1, -1, -1):
-            if final_messages[i].get("role") == "user":
-                current_content = final_messages[i].get("content", "")
-                
-                # Check if content is already in multimodal format (list)
-                if isinstance(current_content, list):
-                    # Already multimodal, append image to existing content
-                    current_content.append({
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_info.get("url", ""),
-                            "detail": "high"  # Can be "low", "high", or "auto"
-                        }
-                    })
-                    logger.debug("image_added_to_multimodal_message", extra={
-                        "agent_id": agent_id_for_event,
-                        "image_url": image_info.get("url", ""),
-                        "existing_parts_count": len(current_content) - 1
-                    })
-                else:
-                    # Convert text-only message to multimodal format
-                    multimodal_content = [
-                        {
-                            "type": "text",
-                            "text": current_content
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": image_info.get("url", ""),
-                                "detail": "high"  # Can be "low", "high", or "auto"
-                            }
-                        }
-                    ]
-                    
-                    final_messages[i]["content"] = multimodal_content
-                    logger.debug("text_message_converted_to_multimodal", extra={
-                        "agent_id": agent_id_for_event,
-                        "image_url": image_info.get("url", ""),
-                        "text_content_length": len(current_content)
-                    })
-                break
 
     for attempt in range(app_level_max_retries + 1):
         # --- KEY CHANGE: Generate a NEW stream_id for EVERY attempt ---
diff --git a/core/api/message_handlers.py b/core/api/message_handlers.py
index e09d95c..b82cd83 100644
--- a/core/api/message_handlers.py
+++ b/core/api/message_handlers.py
@@ -834,7 +834,7 @@ async def handle_manage_work_modules_request(ws_state: Dict, data: Dict):
 async def handle_send_to_run_message(ws_state: Dict, data: Dict):
     """
     Handles 'send_to_run' messages, routing client messages to the specified active business run.
-    This function is now also responsible for "activating" runs that are in the CREATED state.
+    Also handles multimodal payloads (files), and is responsible for activating runs in CREATED state.
     """
     event_manager = ws_state.event_manager
     session_id_for_log = event_manager.session_id
@@ -843,8 +843,17 @@ async def handle_send_to_run_message(ws_state: Dict, data: Dict):
     run_id_var.set(target_run_id)  # Set context variable
     message_payload = data.get("message_payload")
     extra_payload = data.get("extra_payload")
-
-    logger.info("send_to_run_received", extra={"session_id": session_id_for_log, "target_run_id": target_run_id, "message_preview": str(message_payload)[:100]})
+    files_content = (message_payload or {}).get("files", [])
+
+    logger.info(
+        "send_to_run_received",
+        extra={
+            "session_id": session_id_for_log,
+            "target_run_id": target_run_id,
+            "message_preview": str(message_payload)[:100],
+            "has_files": bool(files_content),
+        },
+    )
 
     if not target_run_id or message_payload is None:
         err_msg = "'send_to_run' requires 'run_id' and 'message_payload'."
@@ -862,17 +871,18 @@ async def handle_send_to_run_message(ws_state: Dict, data: Dict):
     run_status = run_context['meta'].get('status')
     run_type = run_context['meta'].get('run_type')
     prompt_content = message_payload.get("prompt")
-    images_content = message_payload.get("images", [])
 
     try:
         # --- Branch 1: Activate a pending run ---
         if run_status == 'CREATED':
             logger.debug("run_activation_started", extra={"run_id": target_run_id, "run_type": run_type})
             
-            if prompt_content is None:
-                raise ValueError("First message to a new run must contain a 'prompt'.")
+            # Allow activation if there is either text or multimodal content
+            if (prompt_content is None) and (not files_content):
+                raise ValueError("First message must contain either 'prompt' text or attachments (files).")
             
-            run_context['team_state']['question'] = prompt_content
+            # If no text but attachments exist, use empty string to initialize question
+            run_context['team_state']['question'] = prompt_content or ""
             
             task = None
             if run_type == "partner_interaction":
@@ -880,14 +890,16 @@ async def handle_send_to_run_message(ws_state: Dict, data: Dict):
                 team_state = run_context['team_state']
                 partner_state = partner_context['state']
 
-                # 构建payload，包含文本和图像
+                # 构建payload，仅使用统一的 files 多模态路径
                 payload = {"prompt": prompt_content}
-                if images_content:
-                    payload["images"] = images_content
+                if files_content:
+                    payload["files"] = files_content
                 
                 inbox_item = {
                     "item_id": f"inbox_{uuid.uuid4().hex[:8]}",
-                    "source": "USER_PROMPT", # Use standardized event source
+                    "source": (
+                        "USER_PROMPT_WITH_FILES" if files_content else "USER_PROMPT"
+                    ), # Standardized source with multimodal marker when applicable
                     "payload": payload,
                     "consumption_policy": "consume_on_read",
                     "metadata": {"created_at": datetime.now(timezone.utc).isoformat()}
@@ -918,23 +930,23 @@ async def handle_send_to_run_message(ws_state: Dict, data: Dict):
 
         # --- Branch 2: Send a message to a running session ---
         elif run_status in ['RUNNING', 'AWAITING_INPUT']:
-            if prompt_content is None:
-                raise ValueError("Message payload must contain a 'prompt'.")
+            # Allow sending if text or attachments exist
+            if (prompt_content is None) and (not files_content):
+                raise ValueError("Message must contain either 'prompt' text or attachments (files).")
 
             if run_type == "partner_interaction":
                 partner_context = run_context['sub_context_refs']['_partner_context_ref']
                 partner_state = partner_context['state']
                 team_state = run_context['team_state']
 
-                # --- Core modification: Similarly, only create an InboxItem ---
-                # 构建payload，包含文本和图像
+                # --- Core modification: Create an InboxItem with multimodal info (files only) ---
                 payload = {"prompt": prompt_content}
-                if images_content:
-                    payload["images"] = images_content
+                if files_content:
+                    payload["files"] = files_content
                 
                 inbox_item = {
                     "item_id": f"inbox_{uuid.uuid4().hex[:8]}",
-                    "source": "USER_PROMPT",
+                    "source": ("USER_PROMPT_WITH_FILES" if files_content else "USER_PROMPT"),
                     "payload": payload,
                     "consumption_policy": "consume_on_read",
                     "metadata": {"created_at": datetime.now(timezone.utc).isoformat()}
@@ -960,153 +972,13 @@ async def handle_send_to_run_message(ws_state: Dict, data: Dict):
         logger.error("send_to_run_processing_error", extra={"session_id": session_id_for_log, "target_run_id": target_run_id, "run_type": run_type, "error_message": str(e)}, exc_info=True)
         await event_manager.emit_error(run_id=target_run_id, agent_id="System", error_message=f"Error processing message for run {target_run_id}: {str(e)}")
 
-async def handle_send_image_message(ws_state: Dict, data: Dict):
-    """
-    Handles 'send_image_message' messages, routing client messages with image data to the specified active business run.
-    This is similar to handle_send_to_run_message but specifically designed for multimodal messages.
-    """
-    event_manager = ws_state.event_manager
-    session_id_for_log = event_manager.session_id
-
-    target_run_id = data.get("run_id")
-    run_id_var.set(target_run_id)  # Set context variable
-    message_payload = data.get("message_payload")
-    image_info = data.get("image_info")
-
-    logger.info("send_image_message_received", extra={
-        "session_id": session_id_for_log, 
-        "target_run_id": target_run_id, 
-        "message_preview": str(message_payload)[:100],
-        "has_image_info": bool(image_info)
-    })
-
-    if not target_run_id or message_payload is None or image_info is None:
-        err_msg = "'send_image_message' requires 'run_id', 'message_payload', and 'image_info'."
-        logger.warning("send_image_message_missing_params", extra={
-            "session_id": session_id_for_log, 
-            "data": data, 
-            "has_run_id": bool(target_run_id), 
-            "has_message_payload": message_payload is not None,
-            "has_image_info": image_info is not None
-        })
-        await event_manager.emit_error(run_id=target_run_id, agent_id="System", error_message=err_msg)
-        return
-
-    run_context = active_runs_store.get(target_run_id)
-    if not run_context:
-        err_msg = f"Target run {target_run_id} not found or not active."
-        logger.warning("send_image_message_target_not_found", extra={"session_id": session_id_for_log, "target_run_id": target_run_id})
-        await event_manager.emit_error(run_id=target_run_id, agent_id="System", error_message=err_msg)
-        return
-
-    run_status = run_context['meta'].get('status')
-    run_type = run_context['meta'].get('run_type')
-    prompt_content = message_payload.get("prompt")
-
-    try:
-        # --- Branch 1: Activate a pending run with image ---
-        if run_status == 'CREATED':
-            logger.debug("run_activation_with_image_started", extra={"run_id": target_run_id, "run_type": run_type})
-            
-            if prompt_content is None:
-                raise ValueError("First message to a new run must contain a 'prompt'.")
-            
-            run_context['team_state']['question'] = prompt_content
-            
-            task = None
-            if run_type == "partner_interaction":
-                partner_context = run_context['sub_context_refs']['_partner_context_ref']
-                team_state = run_context['team_state']
-                partner_state = partner_context['state']
-
-                # 构建payload，包含文本和图像信息
-                payload = {
-                    "prompt": prompt_content,
-                    "image_info": image_info  # Add image info to payload
-                }
-                
-                inbox_item = {
-                    "item_id": f"inbox_{uuid.uuid4().hex[:8]}",
-                    "source": "USER_PROMPT_WITH_IMAGE", # Use specialized event source
-                    "payload": payload,
-                    "consumption_policy": "consume_on_read",
-                    "metadata": {"created_at": datetime.now(timezone.utc).isoformat()}
-                }
-                partner_state.setdefault("inbox", []).append(inbox_item)
-                
-                # 2. Start the task
-                task = asyncio.create_task(run_partner_interaction_async(partner_context=partner_context))
-            else:
-                raise ValueError(f"Run type '{run_type}' does not support activation via 'send_image_message'.")
-
-            ws_state.active_run_tasks[target_run_id] = task
-            task.add_done_callback(
-                lambda t: logger.info("run_task_finished", extra={"run_id": target_run_id, "run_type": run_type, "session_id": session_id_for_log})
-                if not t.cancelled() else
-                logger.info("run_task_cancelled", extra={"run_id": target_run_id, "run_type": run_type, "session_id": session_id_for_log})
-            )
-            
-            run_context['meta']['status'] = 'AWAITING_INPUT'
-            logger.debug("run_activation_with_image_completed", extra={"run_id": target_run_id, "status": "AWAITING_INPUT"})
-
-            # 3. Wake up the task
-            if run_type == "partner_interaction":
-                new_input_event = run_context['sub_context_refs']['_partner_context_ref']['runtime_objects'].get("new_user_input_event")
-                if new_input_event:
-                    new_input_event.set()
-            return  # Critical: Return immediately after handling activation
-
-        # --- Branch 2: Send an image message to a running session ---
-        elif run_status in ['RUNNING', 'AWAITING_INPUT']:
-            if prompt_content is None:
-                raise ValueError("Message payload must contain a 'prompt'.")
-
-            if run_type == "partner_interaction":
-                partner_context = run_context['sub_context_refs']['_partner_context_ref']
-                partner_state = partner_context['state']
-                team_state = run_context['team_state']
-
-                # --- Core modification: Create an InboxItem with image info ---
-                payload = {
-                    "prompt": prompt_content,
-                    "image_info": image_info  # Add image info to payload
-                }
-                
-                inbox_item = {
-                    "item_id": f"inbox_{uuid.uuid4().hex[:8]}",
-                    "source": "USER_PROMPT_WITH_IMAGE",
-                    "payload": payload,
-                    "consumption_policy": "consume_on_read",
-                    "metadata": {"created_at": datetime.now(timezone.utc).isoformat()}
-                }
-                partner_state.setdefault("inbox", []).append(inbox_item)
-
-                # Wake up the task
-                new_input_event = partner_context['runtime_objects'].get("new_user_input_event")
-                if new_input_event:
-                    new_input_event.set()
-                    logger.info("partner_task_notified_with_image", extra={"run_id": target_run_id, "notification_method": "inbox"})
-                else:
-                    logger.error("partner_notification_failed", extra={"run_id": target_run_id, "reason": "new_user_input_event_not_found"}, exc_info=True)
-            
-        # --- Branch 3: Handle invalid states ---
-        else:
-            err_msg = f"Cannot send image message to run {target_run_id} because its status is '{run_status}'."
-            logger.warning("send_image_message_invalid_status", extra={"session_id": session_id_for_log, "run_id": target_run_id, "run_status": run_status})
-            await event_manager.emit_error(run_id=target_run_id, agent_id="System", error_message=err_msg)
-            return
-
-    except Exception as e:
-        logger.error("send_image_message_processing_error", extra={"session_id": session_id_for_log, "target_run_id": target_run_id, "run_type": run_type, "error_message": str(e)}, exc_info=True)
-        await event_manager.emit_error(run_id=target_run_id, agent_id="System", error_message=f"Error processing image message for run {target_run_id}: {str(e)}")
 
 # --- MESSAGE_HANDLERS registry (Dango's version, with adapted function names) ---
 MESSAGE_HANDLERS: Dict[str, callable] = {
     "start_run": handle_start_run_message,
     "stop_run": handle_stop_run_message,
     "request_available_toolsets": handle_request_available_toolsets,
-    "send_to_run": handle_send_to_run_message, # Added by Dango, adapted
-    "send_image_message": handle_send_image_message, # New handler for multimodal messages
+    "send_to_run": handle_send_to_run_message, # Unified handler
     "stop_managed_principal": handle_stop_managed_principal_message, # Added by Dango, adapted
     "request_run_profiles": handle_request_run_profiles_message, # Added by Dango, adapted
     "request_run_context": handle_request_run_context_message, # Added by Dango, adapted
diff --git a/core/pyproject.toml b/core/pyproject.toml
index bce457b..fbdad9c 100644
--- a/core/pyproject.toml
+++ b/core/pyproject.toml
@@ -14,6 +14,7 @@ dependencies = [
     "python-dotenv>=1.0.0",
     "python-json-logger>=2.0.7",
     "requests>=2.28.0",
+    "httpx>=0.24.0",
     "markdown>=3.4.0",
     "coolname>=1.1.0",
     "numpy>=2",
diff --git a/frontend/app/chat/components/ChatInput.tsx b/frontend/app/chat/components/ChatInput.tsx
index 1af78da..0e9bb34 100644
--- a/frontend/app/chat/components/ChatInput.tsx
+++ b/frontend/app/chat/components/ChatInput.tsx
@@ -1,21 +1,23 @@
-import React, { useState, useRef, useCallback } from 'react';
+import React, { useState, useRef, useCallback, useEffect } from 'react';
 import { Button } from '@/components/ui/button';
 import { Input } from '@/components/ui/input';
-import { X, Image as ImageIcon, Paperclip } from 'lucide-react';
+import { X, Paperclip, Music, Video, FileText } from 'lucide-react';
 
 
-interface ImageAttachment {
+interface FileAttachment {
   id: string;
   file: File;
   dataUrl: string;
   name: string;
+  mimeType?: string;
+  kind?: 'image' | 'audio' | 'video' | 'document';
 }
 
 interface ChatInputProps {
   currentInput: string;
   onInputChange: (value: string) => void;
   onKeyPress: (e: React.KeyboardEvent) => void;
-  onSendMessage: (images?: ImageAttachment[]) => void;
+  onSendMessage: (files?: FileAttachment[]) => void;
   isStreaming: boolean;
   isLoading: boolean;
   onStopExecution: () => void;
@@ -30,11 +32,85 @@ export function ChatInput({
   isLoading,
   onStopExecution,
 }: ChatInputProps) {
-  const [images, setImages] = useState<ImageAttachment[]>([]);
+  const [files, setFiles] = useState<FileAttachment[]>([]);
   const fileInputRef = useRef<HTMLInputElement>(null);
   const inputRef = useRef<HTMLInputElement>(null);
+  const [errorMsg, setErrorMsg] = useState<string | null>(null);
+  const errorTimerRef = useRef<number | null>(null);
 
-  // 处理粘贴事件
+  const showError = useCallback((msg: string) => { // show `msg` in the inline error banner
+    setErrorMsg(msg);
+    if (errorTimerRef.current) {
+      window.clearTimeout(errorTimerRef.current); // a newer error restarts the countdown
+    }
+    errorTimerRef.current = window.setTimeout(() => {
+      setErrorMsg(null);
+      errorTimerRef.current = null;
+    }, 3000); // auto-dismiss after 3 seconds
+  }, []);
+
+  // Read a File into a base64 data URL (kept for backward compatibility on send); resolves with the FileReader result, rejects on read error
+  const fileToDataUrl = useCallback((file: File) => {
+    return new Promise<string>((resolve, reject) => {
+      const reader = new FileReader();
+      reader.onload = () => resolve(reader.result as string);
+      reader.onerror = reject;
+      reader.readAsDataURL(file);
+    });
+  }, []);
+
+  // Mirror the latest attachments in a ref so the unmount cleanup can reach them.
+  const filesRef = useRef<FileAttachment[]>([]);
+  useEffect(() => { filesRef.current = files; }, [files]);
+
+  // Unmount-only cleanup: depending on `files` here would re-run the cleanup on
+  // every attachment change, revoking preview URLs that are still displayed and
+  // cancelling the error auto-dismiss timer while the message is still visible.
+  useEffect(() => {
+    return () => {
+      if (errorTimerRef.current) window.clearTimeout(errorTimerRef.current);
+      filesRef.current.forEach(file => {
+        if (file.kind === 'image' && file.dataUrl) URL.revokeObjectURL(file.dataUrl);
+      });
+    };
+  }, []);
+
+  // The paste handler is defined further down so that addFile, which it depends on, is declared first
+
+  // Add an attachment (image / audio / video / document); unsupported types are rejected via showError
+  const addFile = useCallback((file: File) => {
+    const isImage = file.type.startsWith('image/');
+    const isAudio = file.type.startsWith('audio/');
+    const isVideo = file.type.startsWith('video/');
+    // Document: any application/* or text/* MIME type, with a file-extension fallback when the MIME type does not match
+    const ext = file.name.split('.').pop()?.toLowerCase() || '';
+    const docExts = new Set(['pdf','doc','docx','xls','xlsx','ppt','pptx','txt','rtf','md','csv']);
+    const isDocByMime = file.type.startsWith('application/') || file.type.startsWith('text/');
+    const isDocument = (!isImage && !isAudio && !isVideo) && (isDocByMime || docExts.has(ext));
+
+    if (!isImage && !isAudio && !isVideo && !isDocument) {
+      showError('Unsupported file type. Allowed: images, audio, video, documents.');
+      return;
+    }
+
+    // No size limit is enforced when adding; files over 20MB are accepted too.
+    // At send time every file under 20MB (image/audio/video/text/document) is converted to a base64 dataUrl;
+    // files of 20MB or more keep an empty dataUrl and travel as the raw `file` (intended for a backend upload path - not visible here).
+
+    // Images get a Blob URL for the preview; audio/video/documents keep only the File and its name
+    const objectUrl = isImage ? URL.createObjectURL(file) : '';
+    const newAttachment: FileAttachment = {
+      id: crypto.randomUUID(),
+      file,
+      dataUrl: objectUrl,
+      name: file.name,
+      mimeType: file.type,
+      kind: isImage ? 'image' : isAudio ? 'audio' : isVideo ? 'video' : 'document',
+    };
+    setFiles(prev => [...prev, newAttachment]);
+  }, [showError]);
+
+  // Handle paste events (images only)
   const handlePaste = useCallback((e: React.ClipboardEvent) => {
     const items = e.clipboardData?.items;
     if (!items) return;
@@ -45,96 +121,104 @@ export function ChatInput({
         e.preventDefault();
         const file = item.getAsFile();
         if (file) {
-          addImageFile(file);
+          addFile(file);
         }
         break;
       }
     }
-  }, []);
-
-  // 添加图像文件
-  const addImageFile = useCallback((file: File) => {
-    if (!file.type.startsWith('image/')) {
-      alert('Please select an image file');
-      return;
-    }
-
-    // 限制文件大小为10MB
-    if (file.size > 10 * 1024 * 1024) {
-      alert('Image size should be less than 10MB');
-      return;
-    }
-
-    const reader = new FileReader();
-    reader.onload = (e) => {
-      const dataUrl = e.target?.result as string;
-      const newImage: ImageAttachment = {
-        id: crypto.randomUUID(),
-        file,
-        dataUrl,
-        name: file.name
-      };
-      setImages(prev => [...prev, newImage]);
-    };
-    reader.readAsDataURL(file);
-  }, []);
+  }, [addFile]);
 
   // 处理文件选择
   const handleFileSelect = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
     const files = e.target.files;
     if (files) {
-      Array.from(files).forEach(addImageFile);
+      Array.from(files).forEach(addFile);
     }
     // 清空input值以允许重复选择同一文件
     if (fileInputRef.current) {
       fileInputRef.current.value = '';
     }
-  }, [addImageFile]);
+  }, [addFile]);
 
   // 移除图像
-  const removeImage = useCallback((id: string) => {
-    setImages(prev => prev.filter(img => img.id !== id));
+  const removeFile = useCallback((id: string) => {
+    setFiles(prev => {
+      const target = prev.find(img => img.id === id);
+      if (target) {
+        try { if (target.kind === 'image' && target.dataUrl) { URL.revokeObjectURL(target.dataUrl); } } catch {}
+      }
+      return prev.filter(img => img.id !== id);
+    });
   }, []);
 
   // 处理发送消息
-  const handleSendMessage = useCallback(() => {
-    onSendMessage(images.length > 0 ? images : undefined);
-    setImages([]); // 发送后清空图像
-  }, [onSendMessage, images]);
+  const handleSendMessage = useCallback(async () => {
+    if (files.length > 0) {
+      // Swap each preview URL for a base64 data URL when the file is under 20MB; larger files are sent with an empty dataUrl and the raw `file`
+      const converted = await Promise.all(files.map(async (img) => {
+        const kind = img.kind ?? 'image'; // entries without a kind are treated as images
+        const isImage = kind === 'image';
+        const isAudio = kind === 'audio';
+        const isVideo = kind === 'video';
+        const isDocument = kind === 'document';
+        const isTextMime = (img.mimeType ?? '').startsWith('text/');
+        const shouldConvert = (isImage || isAudio || isVideo || isTextMime || isDocument) && img.file.size < 20 * 1024 * 1024; // every kind qualifies, so only the size threshold decides
+        return {
+          ...img,
+          dataUrl: shouldConvert ? await fileToDataUrl(img.file) : '',
+        };
+      }));
+      onSendMessage(converted);
+    } else {
+      onSendMessage(undefined);
+    }
+    // Revoke the preview object URLs (the converted copies carry their own data URLs) and clear the list after sending
+    files.forEach(file => {
+      try { URL.revokeObjectURL(file.dataUrl); } catch {}
+    });
+    setFiles([]);
+  }, [onSendMessage, files, fileToDataUrl]);
 
   // 处理键盘事件
   const handleKeyPress = useCallback((e: React.KeyboardEvent) => {
     if (e.key === 'Enter' && !e.shiftKey) {
       e.preventDefault();
-      if ((currentInput.trim() || images.length > 0) && !isStreaming && !isLoading) {
+      if ((currentInput.trim() || files.length > 0) && !isStreaming && !isLoading) {
         handleSendMessage();
       }
     } else {
       onKeyPress(e);
     }
-  }, [currentInput, images, isStreaming, isLoading, handleSendMessage, onKeyPress]);
+  }, [currentInput, files, isStreaming, isLoading, handleSendMessage, onKeyPress]);
 
   return (
     <div className="p-3">
-      {/* 图像预览区域 */}
-      {images.length > 0 && (
+      {/* Attachment preview area */}
+      {files.length > 0 && (
         <div className="mb-3 p-3 bg-gray-50 rounded-lg">
           <div className="flex flex-wrap gap-2">
-            {images.map((image) => (
-              <div key={image.id} className="relative group">
-                <img
-                  src={image.dataUrl}
-                  alt={image.name}
-                  className="w-20 h-20 object-cover rounded border"
-                />
+            {files.map((file) => (
+              <div key={file.id} className="relative group">
+                {file.kind === 'image' && file.dataUrl ? (
+                  <img
+                    src={file.dataUrl}
+                    alt={file.name}
+                    className="w-20 h-20 object-cover rounded border"
+                  />
+                ) : (
+                  <div className="w-20 h-20 rounded border bg-white flex items-center justify-center">
+                    {file.kind === 'audio' ? <Music size={18} /> : file.kind === 'video' ? <Video size={18} /> : <FileText size={18} />}
+                  </div>
+                )}
                 <button
-                  onClick={() => removeImage(image.id)}
+                  onClick={() => removeFile(file.id)}
                   className="absolute -top-1 -right-1 w-5 h-5 bg-red-500 text-white rounded-full flex items-center justify-center opacity-0 group-hover:opacity-100 transition-opacity"
+                  aria-label="Remove attachment"
                 >
                   <X size={12} />
                 </button>
                 <div className="absolute bottom-0 left-0 right-0 bg-black bg-opacity-50 text-white text-xs p-1 rounded-b truncate">
-                  {image.name}
+                  {file.name}
                 </div>
               </div>
             ))}
@@ -142,7 +226,22 @@ export function ChatInput({
         </div>
       )}
 
-      <div className="flex gap-2 bg-white rounded-lg border overflow-hidden p-2 focus-within:border-black transition-colors">
+      {errorMsg && (
+        <div className="mb-2 text-sm text-red-600 bg-red-50 border border-red-200 rounded px-2 py-1">
+          {errorMsg}
+        </div>
+      )}
+
+      <div
+        className="flex gap-2 bg-white rounded-lg border overflow-hidden p-2 focus-within:border-black transition-colors"
+        onDragOver={(e) => { e.preventDefault(); e.stopPropagation(); }}
+        onDrop={(e) => {
+          e.preventDefault();
+          e.stopPropagation();
+          const files = Array.from(e.dataTransfer.files || []);
+          files.forEach((f) => addFile(f));
+        }}
+      >
         {/* 文件上传按钮 */}
         <Button
           variant="ghost"
@@ -150,6 +249,7 @@ export function ChatInput({
           onClick={() => fileInputRef.current?.click()}
           disabled={isStreaming || isLoading}
           className="flex-shrink-0"
+          aria-label="Attach files"
         >
           <Paperclip size={16} />
         </Button>
@@ -160,7 +260,7 @@ export function ChatInput({
           onChange={(e) => onInputChange(e.target.value)}
           onKeyPress={handleKeyPress}
           onPaste={handlePaste}
-          placeholder="Enter message or paste image..."
+          placeholder="Enter message, paste or drag-and-drop files (images/audio/video/docs)..."
           disabled={isStreaming || isLoading}
           className="flex-1 border-0 focus-visible:ring-0 focus-visible:ring-offset-0 shadow-none px-2"
         />
@@ -171,14 +271,16 @@ export function ChatInput({
             size="icon"
             className="rounded-full bg-black hover:bg-black/90 !px-2 !py-1 flex-shrink-0"
             onClick={onStopExecution}
+            aria-label="Stop streaming"
           >
             <div className="w-3 h-3 bg-white" />
           </Button>
         ) : (
           <Button 
             onClick={handleSendMessage}
-            disabled={(!currentInput.trim() && images.length === 0) || isLoading}
+            disabled={(!currentInput.trim() && files.length === 0) || isLoading}
             className="flex-shrink-0"
+            aria-label="Send message"
           >
             Send
           </Button>
@@ -189,7 +291,7 @@ export function ChatInput({
       <input
         ref={fileInputRef}
         type="file"
-        accept="image/*"
+        accept="image/*,audio/*,video/*,.pdf,.doc,.docx,.xls,.xlsx,.ppt,.pptx,.txt,.rtf,.md,.csv"
         multiple
         onChange={handleFileSelect}
         className="hidden"
diff --git a/frontend/app/chat/components/ProjectPage.tsx b/frontend/app/chat/components/ProjectPage.tsx
index 037cf78..c12e7d8 100644
--- a/frontend/app/chat/components/ProjectPage.tsx
+++ b/frontend/app/chat/components/ProjectPage.tsx
@@ -26,7 +26,7 @@ import { ChatInput } from './ChatInput';
 interface ProjectPageProps {
   currentInput: string;
   onInputChange: (value: string) => void;
-  onSendMessage: (images?: any[]) => void;
+  onSendMessage: (files?: any[]) => void;
   onKeyPress: (e: React.KeyboardEvent) => void;
   isLoading: boolean;
 }
diff --git a/frontend/app/chat/components/WelcomeScreen.tsx b/frontend/app/chat/components/WelcomeScreen.tsx
index e0d4715..d1fce58 100644
--- a/frontend/app/chat/components/WelcomeScreen.tsx
+++ b/frontend/app/chat/components/WelcomeScreen.tsx
@@ -7,7 +7,7 @@ import { ChatInput } from './ChatInput';
 interface WelcomeScreenProps {
   currentInput: string;
   onInputChange: (value: string) => void;
-  onSendMessage: (images?: any[]) => void;
+  onSendMessage: (files?: any[]) => void;
   onKeyPress: (e: React.KeyboardEvent) => void;
   isLoading: boolean;
 }
diff --git a/frontend/app/chat/page.tsx b/frontend/app/chat/page.tsx
index 653d4e3..84f779c 100644
--- a/frontend/app/chat/page.tsx
+++ b/frontend/app/chat/page.tsx
@@ -30,8 +30,8 @@ export default observer(function ChatPage() {
     selectionStore.clearSelection();
   }, []);
 
-  const sendMessage = useCallback(async (images?: any[]) => {
-    if ((!store.currentInput.trim() && !images?.length) || store.isLoading || store.isCreatingRun) return;
+  const sendMessage = useCallback(async (files?: any[]) => {
+    if ((!store.currentInput.trim() && !files?.length) || store.isLoading || store.isCreatingRun) return;
     
     // Close the sidebar (if on mobile or first message)
     if (sidebarContext) {
@@ -55,7 +55,7 @@ export default observer(function ChatPage() {
         
         // Send message
         store.setIsLoading(true);
-        await sessionStore.sendMessage(messageToSend, newRunId, images);
+        await sessionStore.sendMessage(messageToSend, newRunId, files);
 
         // Redirect to the newly created run page
         router.push(`/r?id=${newRunId}`);
diff --git a/frontend/app/stores/sessionStore.ts b/frontend/app/stores/sessionStore.ts
index 0bb1364..54acc80 100644
--- a/frontend/app/stores/sessionStore.ts
+++ b/frontend/app/stores/sessionStore.ts
@@ -888,30 +888,28 @@ class SessionStore {
     }
   }
 
-  async sendMessage(message: string, runId: string, images?: any[]) {
+  async sendMessage(message: string, runId: string, files?: any[]) {
     if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
         console.error("Cannot send message, WebSocket is not open.");
         this.error = "Connection failed. Cannot send message.";
         return;
     }
 
-    // 处理图像数据
-    const imageData = images ? await Promise.all(
-      images.map(async (img) => {
-        // 将文件转换为base64
-        const base64 = await this.fileToBase64(img.file);
+    // Normalize attachments into a unified `files` array (base64 data URL or raw base64)
+    const filesData = files ? await Promise.all(
+      files.map(async (f) => {
+        const base64 = await this.fileToBase64(f.file);
         return {
-          type: 'image',
           data: base64,
-          name: img.name,
-          mimeType: img.file.type
-        };
+          name: f.name,
+          mimeType: f.file.type,
+        } as { data: string; name: string; mimeType: string };
       })
     ) : [];
 
     const messagePayload: any = { prompt: message };
-    if (imageData.length > 0) {
-      messagePayload.images = imageData;
+    if (filesData.length > 0) {
+      messagePayload.files = filesData;
     }
 
     this.ws.send(JSON.stringify({
@@ -928,7 +926,6 @@ class SessionStore {
       const reader = new FileReader();
       reader.onload = () => {
         const result = reader.result as string;
-        // 移除data:image/xxx;base64,前缀，只保留base64数据
         const base64Data = result.split(',')[1];
         resolve(base64Data);
       };

From 810164a38e41bfd2100e930293572b7e777b44ee Mon Sep 17 00:00:00 2001
From: sdbds <865105819@qq.com>
Date: Wed, 13 Aug 2025 11:09:53 +0800
Subject: [PATCH 06/32] update image part

---
 core/agent_core/framework/inbox_processor.py  | 230 +++++++++++-------
 core/agent_core/nodes/base_agent_node.py      |  54 +++-
 core/agent_core/nodes/mcp_proxy_node.py       |  33 +++
 .../chat/components/details/TurnBubble.tsx    |  33 +++
 frontend/app/chat/types/conversation.ts       |   5 +
 5 files changed, 270 insertions(+), 85 deletions(-)

diff --git a/core/agent_core/framework/inbox_processor.py b/core/agent_core/framework/inbox_processor.py
index 1359087..3fe99bf 100644
--- a/core/agent_core/framework/inbox_processor.py
+++ b/core/agent_core/framework/inbox_processor.py
@@ -217,7 +217,7 @@ async def process(self) -> Dict[str, Any]:
             try:
                 payload = item["payload"]
                 
-                if item.get("source") == "USER_PROMPT":
+                if item.get("source") in ["USER_PROMPT", "USER_PROMPT_WITH_FILES"]:
                     new_user_turn_id = self._create_user_turn_from_inbox_item(item)
                     if new_user_turn_id:
                         # Pass the "baton" so the next agent_turn can correctly link to this user_turn.
@@ -278,106 +278,170 @@ async def process(self) -> Dict[str, Any]:
 
                         for f in files:
                             try:
-                                file_bytes = None
                                 filename = f.get("name") or f.get("filename") or f"file_{uuid.uuid4().hex[:6]}"
                                 mime_type = f.get("mimeType") or f.get("mime_type") or "application/octet-stream"
 
                                 if f.get("file_id"):
-                                    # Already uploaded
+                                    # Already uploaded, use file reference
                                     file_id = f["file_id"]
                                     logger.info("gemini_file_upload_skipped_existing", extra={
                                         "agent_id": self.agent_id,
-                                        "filename": filename,
+                                        "file_name": filename,
                                         "mime_type": mime_type,
                                         "file_id": file_id,
                                     })
+                                    # Use file reference
+                                    content_parts.append({
+                                        "type": "file",
+                                        "file": {
+                                            "file_id": file_id,
+                                            "filename": filename,
+                                            "format": mime_type
+                                        }
+                                    })
                                 else:
-                                    size_bytes = None
-                                    if f.get("url"):
+                                    # Check if we have direct base64 data from frontend
+                                    if f.get("data"):
+                                        # Frontend sent base64 data - use directly without file upload
+                                        data_str = f["data"]
+                                        
+                                        if isinstance(mime_type, str) and mime_type.startswith("image/"):
+                                            # Ensure proper data URL format for images
+                                            if not data_str.startswith("data:"):
+                                                image_url = f"data:{mime_type};base64,{data_str}"
+                                            else:
+                                                image_url = data_str
+                                            
+                                            content_parts.append({
+                                                "type": "image_url",
+                                                "image_url": {
+                                                    "url": image_url,
+                                                    "detail": "high"
+                                                }
+                                            })
+                                            
+                                            logger.info("image_processed_as_base64", extra={
+                                                "agent_id": self.agent_id,
+                                                "file_name": filename,
+                                                "mime_type": mime_type,
+                                                "method": "direct_base64"
+                                            })
+                                        else:
+                                            # Non-image files with base64 data
+                                            logger.info("non_image_file_with_base64", extra={
+                                                "agent_id": self.agent_id,
+                                                "file_name": filename,
+                                                "mime_type": mime_type,
+                                                "note": "Non-image files may not be fully supported"
+                                            })
+                                    
+                                    elif f.get("url"):
+                                        # URL-based file - need to fetch and potentially upload
+                                        file_bytes = None
+                                        
                                         # Async fetch
                                         async with httpx.AsyncClient(timeout=20) as client:
                                             resp = await client.get(f["url"])
                                             resp.raise_for_status()
                                             file_bytes = resp.content
+                                        
                                         size_bytes = len(file_bytes) if file_bytes is not None else None
-                                    elif f.get("data"):
-                                        # base64 string possibly without header
-                                        data_str = f["data"]
-                                        # Strip data URL prefix if present
-                                        if data_str.startswith("data:"):
-                                            data_str = data_str.split(",", 1)[1]
-                                        file_bytes = base64.b64decode(data_str)
-                                        size_bytes = len(file_bytes)
-                                    else:
-                                        # Unsupported entry, skip
-                                        logger.warning("file_entry_missing_data", extra={"agent_id": self.agent_id, "filename": filename})
-                                        continue
-
-                                    # Prefer API key from project LLM config; fallback to env var
-                                    try:
-                                        resolver = LLMConfigResolver(shared_llm_configs=self.run_context.get("config", {}).get("shared_llm_configs_ref", {}))
-                                        llm_config = resolver.resolve(self.profile)
-                                    except Exception:
-                                        llm_config = {}
-                                    gemini_key = (
-                                        (llm_config.get("api_key") if isinstance(llm_config, dict) else None)
-                                        or os.getenv("GEMINI_API_KEY")
-                                    )
-                                    if not gemini_key:
-                                        logger.error(
-                                            "gemini_api_key_missing",
-                                            extra={
+                                        max_base64_size = 20 * 1024 * 1024  # 20MB
+                                        
+                                        if size_bytes and size_bytes < max_base64_size:
+                                            # Small file from URL - convert to base64
+                                            if isinstance(mime_type, str) and mime_type.startswith("image/"):
+                                                base64_data = base64.b64encode(file_bytes).decode()
+                                                image_url = f"data:{mime_type};base64,{base64_data}"
+                                                
+                                                content_parts.append({
+                                                    "type": "image_url",
+                                                    "image_url": {
+                                                        "url": image_url,
+                                                        "detail": "high"
+                                                    }
+                                                })
+                                                
+                                                logger.info("url_file_converted_to_base64", extra={
+                                                    "agent_id": self.agent_id,
+                                                    "file_name": filename,
+                                                    "mime_type": mime_type,
+                                                    "size_bytes": size_bytes
+                                                })
+                                            else:
+                                                logger.info("non_image_url_file_skipped", extra={
+                                                    "agent_id": self.agent_id,
+                                                    "file_name": filename,
+                                                    "mime_type": mime_type
+                                                })
+                                        else:
+                                            # Large file from URL - use Gemini file upload
+                                            # Prefer API key from project LLM config; fallback to env var
+                                            try:
+                                                resolver = LLMConfigResolver(shared_llm_configs=self.run_context.get("config", {}).get("shared_llm_configs_ref", {}))
+                                                llm_config = resolver.resolve(self.profile)
+                                            except Exception:
+                                                llm_config = {}
+                                            gemini_key = (
+                                                (llm_config.get("api_key") if isinstance(llm_config, dict) else None)
+                                                or os.getenv("GEMINI_API_KEY")
+                                            )
+                                            if not gemini_key:
+                                                logger.error(
+                                                    "gemini_api_key_missing",
+                                                    extra={
+                                                        "agent_id": self.agent_id,
+                                                        "hint": "Provide api_key in active LLM config or set GEMINI_API_KEY env var"
+                                                    }
+                                                )
+                                                continue
+
+                                            # Structured start log
+                                            logger.info("gemini_file_upload_start", extra={
                                                 "agent_id": self.agent_id,
-                                                "hint": "Provide api_key in active LLM config or set GEMINI_API_KEY env var"
-                                            }
-                                        )
-                                        continue
-
-                                    # Structured start log
-                                    logger.info("gemini_file_upload_start", extra={
-                                        "agent_id": self.agent_id,
-                                        "filename": filename,
-                                        "mime_type": mime_type,
-                                        "size_bytes": size_bytes,
-                                    })
-                                    t0 = time.perf_counter()
-
-                                    # Offload blocking create_file to a thread
-                                    created = await asyncio.to_thread(
-                                        create_file,
-                                        file=file_bytes,
-                                        purpose="user_data",
-                                        extra_body={"custom_llm_provider": "gemini"},
-                                        api_key=gemini_key,
-                                    )
-                                    file_id = getattr(created, "id", None) if created is not None else None
-                                    if not file_id:
-                                        logger.error("gemini_file_upload_failed", extra={
-                                            "filename": filename,
-                                            "mime_type": mime_type,
-                                            "size_bytes": size_bytes,
-                                            "duration_ms": int((time.perf_counter() - t0) * 1000),
-                                        })
-                                        continue
-                                    else:
-                                        logger.info("gemini_file_upload_success", extra={
-                                            "agent_id": self.agent_id,
-                                            "filename": filename,
-                                            "mime_type": mime_type,
-                                            "size_bytes": size_bytes,
-                                            "file_id": file_id,
-                                            "duration_ms": int((time.perf_counter() - t0) * 1000),
-                                        })
-
-                                # Append file reference content part
-                                content_parts.append({
-                                    "type": "file",
-                                    "file": {
-                                        "file_id": file_id,
-                                        "filename": filename,
-                                        "format": mime_type
-                                    }
-                                })
+                                                "file_name": filename,
+                                                "mime_type": mime_type,
+                                                "size_bytes": size_bytes,
+                                                "reason": "file_too_large_for_base64"
+                                            })
+                                            t0 = time.perf_counter()
+
+                                            # Offload blocking create_file to a thread
+                                            created = await asyncio.to_thread(
+                                                create_file,
+                                                file=file_bytes,
+                                                purpose="user_data",
+                                                custom_llm_provider="gemini",
+                                                api_key=gemini_key,
+                                            )
+                                            file_id = getattr(created, "id", None) if created is not None else None
+                                            if not file_id:
+                                                logger.error("gemini_file_upload_failed", extra={
+                                                    "file_name": filename,
+                                                    "mime_type": mime_type,
+                                                    "size_bytes": size_bytes,
+                                                    "duration_ms": int((time.perf_counter() - t0) * 1000),
+                                                })
+                                                continue
+                                            else:
+                                                logger.info("gemini_file_upload_success", extra={
+                                                    "agent_id": self.agent_id,
+                                                    "file_name": filename,
+                                                    "mime_type": mime_type,
+                                                    "size_bytes": size_bytes,
+                                                    "file_id": file_id,
+                                                    "duration_ms": int((time.perf_counter() - t0) * 1000),
+                                                })
+
+                                            # Append file reference content part for large files
+                                            content_parts.append({
+                                                "type": "file",
+                                                "file": {
+                                                    "file_id": file_id,
+                                                    "filename": filename,
+                                                    "format": mime_type
+                                                }
+                                            })
                             except Exception as ex:
                                 logger.error("file_processing_failed", extra={"error": str(ex)}, exc_info=True)
                 
diff --git a/core/agent_core/nodes/base_agent_node.py b/core/agent_core/nodes/base_agent_node.py
index fab2101..138cf07 100644
--- a/core/agent_core/nodes/base_agent_node.py
+++ b/core/agent_core/nodes/base_agent_node.py
@@ -931,9 +931,13 @@ def _clean_messages_for_llm(self, messages: List[Dict]) -> List[Dict]:
                 if key in msg:
                     value = msg[key]
                     
-                    # Ensure content is a string
+                    # Handle content based on type - preserve multimodal structure
                     if key == "content":
-                        if isinstance(value, dict):
+                        if isinstance(value, list):
+                            # Multimodal content (list of parts) - preserve structure for LLM
+                            cleaned_msg[key] = value
+                            logger.debug("multimodal_content_preserved", extra={"message_role": msg.get('role'), "parts_count": len(value)})
+                        elif isinstance(value, dict):
                             # If content is a dictionary, convert it to a JSON string
                             import json
                             cleaned_msg[key] = json.dumps(value, ensure_ascii=False)
@@ -952,7 +956,53 @@ def _clean_messages_for_llm(self, messages: List[Dict]) -> List[Dict]:
             
             cleaned_messages.append(cleaned_msg)
         
+        # Validate message sequence for OpenAI API compliance
+        cleaned_messages = self._validate_message_sequence(cleaned_messages)
+        
         return cleaned_messages
+    
+    def _validate_message_sequence(self, messages: List[Dict]) -> List[Dict]:
+        """Validate and fix message sequence to ensure compliance with OpenAI API requirements."""
+        if not messages:
+            return messages
+            
+        # Check for invalid sequences and log warnings
+        for i in range(1, len(messages)):
+            prev_msg = messages[i-1]
+            curr_msg = messages[i]
+            
+            prev_role = prev_msg.get("role")
+            curr_role = curr_msg.get("role")
+            has_tool_calls = bool(curr_msg.get("tool_calls"))
+            
+            # Check for invalid function call sequences
+            if has_tool_calls and curr_role == "assistant":
+                # Function calls should come after user messages or tool responses
+                if prev_role not in ["user", "tool"]:
+                    logger.warning("invalid_function_call_sequence", extra={
+                        "agent_id": self.agent_id,
+                        "prev_role": prev_role,
+                        "curr_role": curr_role,
+                        "has_tool_calls": has_tool_calls,
+                        "position": i,
+                        "fix": "This may cause OpenAI API 400 errors"
+                    })
+            
+            # Check for consecutive assistant messages without tool interaction
+            if prev_role == "assistant" and curr_role == "assistant":
+                prev_has_tools = bool(prev_msg.get("tool_calls"))
+                curr_is_tool_response = bool(curr_msg.get("tool_call_id"))
+                
+                if not prev_has_tools and not curr_is_tool_response:
+                    logger.warning("consecutive_assistant_messages", extra={
+                        "agent_id": self.agent_id,
+                        "position": i,
+                        "prev_has_tools": prev_has_tools,
+                        "curr_is_tool_response": curr_is_tool_response,
+                        "fix": "This may cause message sequence issues"
+                    })
+        
+        return messages
 
     def _finalize_dangling_tool_in_turn(self, context: Dict):
         """
diff --git a/core/agent_core/nodes/mcp_proxy_node.py b/core/agent_core/nodes/mcp_proxy_node.py
index 8c1487a..bc64e7a 100644
--- a/core/agent_core/nodes/mcp_proxy_node.py
+++ b/core/agent_core/nodes/mcp_proxy_node.py
@@ -2,6 +2,7 @@
 import asyncio
 import anyio
 from typing import Dict, Any
+from pathlib import Path
 
 # Import the new base class
 from .base_tool_node import BaseToolNode
@@ -46,6 +47,38 @@ async def exec_async(self, prep_res: Dict) -> Dict[str, Any]:
             logger.error("mcp_proxy_session_group_not_found", extra={"unique_tool_name": self.unique_tool_name})
             return {"status": "error", "error_message": error_msg}
 
+        # 2.5 Normalize Windows paths in common parameters to avoid relative-path issues
+        def _normalize_paths_in_params(params: Dict[str, Any], tool_info: Dict[str, Any]) -> None:
+            if not isinstance(params, dict) or not isinstance(tool_info, dict):
+                return
+
+            properties = tool_info.get("parameters", {}).get("properties", {})
+            for param_name, schema in properties.items():
+                # Heuristic: Find string params that look like paths
+                is_path_like = (
+                    schema.get("type") == "string" and 
+                    ("path" in param_name.lower() or schema.get("format") == "uri-reference")
+                )
+
+                if is_path_like:
+                    value = params.get(param_name)
+                    if isinstance(value, str) and value.strip():
+                        p = Path(value)
+                        if not p.is_absolute():
+                            abs_p = (Path.cwd() / p).resolve()
+                            params[param_name] = str(abs_p)
+                            try:
+                                logger.debug("mcp_param_path_normalized", extra={
+                                    "unique_tool_name": self.unique_tool_name, 
+                                    "param": param_name, 
+                                    "original_path": value,
+                                    "abs_path": str(abs_p)
+                                })
+                            except Exception:
+                                pass
+
+        _normalize_paths_in_params(tool_params, self._tool_info)
+
         # 3. Execute the original business logic
         logger.info("mcp_proxy_tool_call_begin", extra={"unique_tool_name": self.unique_tool_name})
         
diff --git a/frontend/app/chat/components/details/TurnBubble.tsx b/frontend/app/chat/components/details/TurnBubble.tsx
index 5dc83dd..6646adc 100644
--- a/frontend/app/chat/components/details/TurnBubble.tsx
+++ b/frontend/app/chat/components/details/TurnBubble.tsx
@@ -177,6 +177,39 @@ export const TurnBubble = observer(({ turn, isHighlighted = false, onNodeIdClick
               {displayContent && (
                 <ReactMarkdown remarkPlugins={[remarkGfm]}>{displayContent}</ReactMarkdown>
               )}
+              
+              {/* Display uploaded images for user messages */}
+              {isUserTurn && turn.inputs?.files && turn.inputs.files.length > 0 && (
+                <div className="mt-3 space-y-2">
+                  <div className="grid grid-cols-1 sm:grid-cols-2 gap-2">
+                    {turn.inputs.files.map((file, index) => (
+                      <div key={index} className="relative">
+                        {file.mimeType.startsWith('image/') ? (
+                          <div className="relative">
+                            <img
+                              src={file.data.startsWith('data:') ? file.data : `data:${file.mimeType};base64,${file.data}`}
+                              alt={file.name}
+                              className="max-w-full h-auto rounded-lg border border-gray-200 shadow-sm hover:shadow-md transition-shadow"
+                              style={{ maxHeight: '200px' }}
+                            />
+                            <div className="absolute bottom-0 left-0 right-0 bg-black bg-opacity-50 text-white text-xs px-2 py-1 rounded-b-lg truncate">
+                              {file.name}
+                            </div>
+                          </div>
+                        ) : (
+                          <div className="flex items-center p-3 border border-gray-200 rounded-lg bg-gray-50">
+                            <div className="flex-1">
+                              <div className="text-sm font-medium text-gray-900 truncate">{file.name}</div>
+                              <div className="text-xs text-gray-500">{file.mimeType}</div>
+                            </div>
+                          </div>
+                        )}
+                      </div>
+                    ))}
+                  </div>
+                </div>
+              )}
+              
               {turn.tool_interactions?.map((interaction) => (
                 <ToolInteraction key={interaction.tool_call_id} interaction={interaction} />
               ))}
diff --git a/frontend/app/chat/types/conversation.ts b/frontend/app/chat/types/conversation.ts
index 6106f9c..6ad00ab 100644
--- a/frontend/app/chat/types/conversation.ts
+++ b/frontend/app/chat/types/conversation.ts
@@ -11,6 +11,11 @@ export interface Turn {
   end_time?: string | null;
   inputs?: {
     prompt?: string;
+    files?: Array<{
+      data: string;
+      name: string;
+      mimeType: string;
+    }>;
   } | null;
   llm_interaction?: {
     status?: 'running' | 'completed' | 'error';

From f4b6bf2766182a748da15b937b9bae6721ddf89e Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:43:03 +0800
Subject: [PATCH 07/32] Update core/agent_core/events/ingestors.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 core/agent_core/events/ingestors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/agent_core/events/ingestors.py b/core/agent_core/events/ingestors.py
index 19c2fda..624f329 100644
--- a/core/agent_core/events/ingestors.py
+++ b/core/agent_core/events/ingestors.py
@@ -365,7 +365,7 @@ def multimodal_user_prompt_ingestor(payload: Any, params: Dict, context: Dict) -
         if names:
             parts.append(f"并附带{len(files)}个文件（示例：{', '.join(names)}{'' if len(files) <= 3 else ' 等'}）")
         else:
-            parts.append(f"并附带{len(files)}个文件")
+            parts.append(f"and attached {len(files)} files")
 
     note = "[" + "，".join(parts) + "]"
     return f"{prompt}\n\n{note}" if prompt else note

From 6c404745a12a875ed46c351d6232225ff6fd01c1 Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:43:12 +0800
Subject: [PATCH 08/32] Update frontend/app/chat/components/ChatInput.tsx

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 frontend/app/chat/components/ChatInput.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/app/chat/components/ChatInput.tsx b/frontend/app/chat/components/ChatInput.tsx
index 0e9bb34..0e0e65d 100644
--- a/frontend/app/chat/components/ChatInput.tsx
+++ b/frontend/app/chat/components/ChatInput.tsx
@@ -97,7 +97,7 @@ export function ChatInput({
     // 小于 20MB 的所有文件（图片/音频/视频/文本/文档）在发送时统一转换为 dataUrl，
     // 大于等于 20MB 的文件在发送时通过 file 交给后端上传。
 
-    // 对图片生成 Blob URL 预览；音频/视频/文档仅保留文件与名称
+    // Generate Blob URL preview for images; for audio, video, and documents, only keep the file and name
     const objectUrl = isImage ? URL.createObjectURL(file) : '';
     const newAttachment: FileAttachment = {
       id: crypto.randomUUID(),

From b9403719a721717951551dfe108f35472101d57e Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:43:24 +0800
Subject: [PATCH 09/32] Update frontend/app/chat/components/ChatInput.tsx

Translate the document-type comment in addFile to English.

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 frontend/app/chat/components/ChatInput.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/app/chat/components/ChatInput.tsx b/frontend/app/chat/components/ChatInput.tsx
index 0e0e65d..4b17896 100644
--- a/frontend/app/chat/components/ChatInput.tsx
+++ b/frontend/app/chat/components/ChatInput.tsx
@@ -82,7 +82,7 @@ export function ChatInput({
     const isImage = file.type.startsWith('image/');
     const isAudio = file.type.startsWith('audio/');
     const isVideo = file.type.startsWith('video/');
-    // 文档类型：application/* 或 text/*，或通过拓展名兜底
+    // Document type: application/* or text/*, or fallback by extension
     const ext = file.name.split('.').pop()?.toLowerCase() || '';
     const docExts = new Set(['pdf','doc','docx','xls','xlsx','ppt','pptx','txt','rtf','md','csv']);
     const isDocByMime = file.type.startsWith('application/') || file.type.startsWith('text/');

From a8e0f5f3c2420902a9728d6c29520c91d2b9ac2a Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:44:44 +0800
Subject: [PATCH 10/32] Update frontend/app/chat/components/ChatInput.tsx

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 frontend/app/chat/components/ChatInput.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/app/chat/components/ChatInput.tsx b/frontend/app/chat/components/ChatInput.tsx
index 4b17896..790db73 100644
--- a/frontend/app/chat/components/ChatInput.tsx
+++ b/frontend/app/chat/components/ChatInput.tsx
@@ -77,7 +77,7 @@ export function ChatInput({
 
   // 粘贴事件处理在下方定义，以确保依赖的 addFile 已声明
 
-  // 添加文件（图片/音频/视频/文档）
+  // Add file (image/audio/video/document)
   const addFile = useCallback((file: File) => {
     const isImage = file.type.startsWith('image/');
     const isAudio = file.type.startsWith('audio/');

From c10b95f74f693687f390097a282fdcc4a8b679df Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:44:54 +0800
Subject: [PATCH 11/32] Update frontend/app/chat/components/ChatInput.tsx

Replace the Chinese file-size comments in addFile with their English
translation.

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 frontend/app/chat/components/ChatInput.tsx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/frontend/app/chat/components/ChatInput.tsx b/frontend/app/chat/components/ChatInput.tsx
index 790db73..530f799 100644
--- a/frontend/app/chat/components/ChatInput.tsx
+++ b/frontend/app/chat/components/ChatInput.tsx
@@ -102,7 +102,7 @@ export function ChatInput({
 
-    // 允许超过 20MB 的文件添加；
-    // 小于 20MB 的所有文件（图片/音频/视频/文本/文档）在发送时统一转换为 dataUrl，
-    // 大于等于 20MB 的文件在发送时通过 file 交给后端上传。
+    // Allow files larger than 20MB to be added;
+    // All files smaller than 20MB (images/audio/video/text/documents) will be converted to dataUrl when sending,
+    // Files 20MB or larger will be sent to the backend for upload as files.
 
     // Generate Blob URL preview for images; for audio, video, and documents, only keep the file and name
     const objectUrl = isImage ? URL.createObjectURL(file) : '';

From aaafd89b7c330733b58d985e0f6258bed25a88aa Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:45:01 +0800
Subject: [PATCH 12/32] Update core/agent_core/events/ingestors.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 core/agent_core/events/ingestors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/agent_core/events/ingestors.py b/core/agent_core/events/ingestors.py
index 624f329..17b1d2c 100644
--- a/core/agent_core/events/ingestors.py
+++ b/core/agent_core/events/ingestors.py
@@ -354,7 +354,7 @@ def multimodal_user_prompt_ingestor(payload: Any, params: Dict, context: Dict) -
     # 构造一个简短的附件说明
     parts = []
     if images:
-        parts.append(f"用户上传了{len(images)}张图像")
+        parts.append(f"User uploaded {len(images)} image(s)")
     if files:
         # 可选：列出最多前3个文件名
         names = []

From c6e4b8a0c5d9e6f627a98ba6d56ac981f0ddd26d Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:45:12 +0800
Subject: [PATCH 13/32] Update core/agent_core/events/ingestors.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 core/agent_core/events/ingestors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/agent_core/events/ingestors.py b/core/agent_core/events/ingestors.py
index 17b1d2c..38145ee 100644
--- a/core/agent_core/events/ingestors.py
+++ b/core/agent_core/events/ingestors.py
@@ -356,7 +356,7 @@ def multimodal_user_prompt_ingestor(payload: Any, params: Dict, context: Dict) -
     if images:
         parts.append(f"User uploaded {len(images)} image(s)")
     if files:
-        # 可选：列出最多前3个文件名
+        # Optional: list up to the first 3 file names
         names = []
         for f in files[:3]:
             name = f.get("name") or f.get("filename")

From 1789d935fa232425f1db9a2e768832616102491e Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:45:29 +0800
Subject: [PATCH 14/32] Update core/agent_core/events/ingestors.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 core/agent_core/events/ingestors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/agent_core/events/ingestors.py b/core/agent_core/events/ingestors.py
index 38145ee..afa42ad 100644
--- a/core/agent_core/events/ingestors.py
+++ b/core/agent_core/events/ingestors.py
@@ -363,7 +363,7 @@ def multimodal_user_prompt_ingestor(payload: Any, params: Dict, context: Dict) -
             if name:
                 names.append(name)
         if names:
-            parts.append(f"并附带{len(files)}个文件（示例：{', '.join(names)}{'' if len(files) <= 3 else ' 等'}）")
+            parts.append(f"and attached {len(files)} files (e.g., {', '.join(names)}{'' if len(files) <= 3 else ' etc.'})")
         else:
             parts.append(f"and attached {len(files)} files")
 

From de3d887f2d8c039b8bffdd88082b80bc7950752d Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:45:49 +0800
Subject: [PATCH 15/32] Update frontend/app/chat/components/ChatInput.tsx

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 frontend/app/chat/components/ChatInput.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/app/chat/components/ChatInput.tsx b/frontend/app/chat/components/ChatInput.tsx
index 530f799..2293d53 100644
--- a/frontend/app/chat/components/ChatInput.tsx
+++ b/frontend/app/chat/components/ChatInput.tsx
@@ -143,7 +143,7 @@ export function ChatInput({
     if (files) {
       Array.from(files).forEach(addFile);
     }
-    // 清空input值以允许重复选择同一文件
+    // Clear the input value to allow selecting the same file again
     if (fileInputRef.current) {
       fileInputRef.current.value = '';
     }

From 5cef08df6d1fff130bde2cdcd27c592a1c8e9379 Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:46:02 +0800
Subject: [PATCH 16/32] Update core/agent_core/events/ingestors.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 core/agent_core/events/ingestors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/agent_core/events/ingestors.py b/core/agent_core/events/ingestors.py
index afa42ad..aef8fe6 100644
--- a/core/agent_core/events/ingestors.py
+++ b/core/agent_core/events/ingestors.py
@@ -367,7 +367,7 @@ def multimodal_user_prompt_ingestor(payload: Any, params: Dict, context: Dict) -
         else:
             parts.append(f"and attached {len(files)} files")
 
-    note = "[" + "，".join(parts) + "]"
+    note = "[" + ", ".join(parts) + "]"
     return f"{prompt}\n\n{note}" if prompt else note
 
 def _recursive_markdown_formatter(data: Any, schema: Dict, level: int = 0) -> List[str]:

From b6dfbaa71fac07517d62296a2e7e5aa2a603c457 Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:46:20 +0800
Subject: [PATCH 17/32] Update frontend/app/chat/components/ChatInput.tsx

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 frontend/app/chat/components/ChatInput.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/app/chat/components/ChatInput.tsx b/frontend/app/chat/components/ChatInput.tsx
index 2293d53..c6f2c5e 100644
--- a/frontend/app/chat/components/ChatInput.tsx
+++ b/frontend/app/chat/components/ChatInput.tsx
@@ -119,7 +119,7 @@ export function ChatInput({
     setFiles(prev => [...prev, newAttachment]);
   }, [showError]);
 
-  // 处理粘贴事件（仅支持图片）
+  // Handle paste event (only supports images)
   const handlePaste = useCallback((e: React.ClipboardEvent) => {
     const items = e.clipboardData?.items;
     if (!items) return;

From 27033b7dd666394349d3e9e44d82debf48fa2ce0 Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:46:53 +0800
Subject: [PATCH 18/32] Update frontend/app/chat/components/ChatInput.tsx

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 frontend/app/chat/components/ChatInput.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/app/chat/components/ChatInput.tsx b/frontend/app/chat/components/ChatInput.tsx
index c6f2c5e..d9ed088 100644
--- a/frontend/app/chat/components/ChatInput.tsx
+++ b/frontend/app/chat/components/ChatInput.tsx
@@ -137,7 +137,7 @@ export function ChatInput({
     }
   }, [addFile]);
 
-  // 处理文件选择
+  // Handle file selection
   const handleFileSelect = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
     const files = e.target.files;
     if (files) {

From 28cc20f7b96b746edba956c02c4492c5890aed12 Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:47:17 +0800
Subject: [PATCH 19/32] Update frontend/app/chat/components/ChatInput.tsx

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 frontend/app/chat/components/ChatInput.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/app/chat/components/ChatInput.tsx b/frontend/app/chat/components/ChatInput.tsx
index d9ed088..0aaf489 100644
--- a/frontend/app/chat/components/ChatInput.tsx
+++ b/frontend/app/chat/components/ChatInput.tsx
@@ -149,7 +149,7 @@ export function ChatInput({
     }
   }, [addFile]);
 
-  // 移除图像
+  // Remove an attachment (any file type) by id
   const removeFile = useCallback((id: string) => {
     setFiles(prev => {
       const target = prev.find(img => img.id === id);

From b51e7929748e38f32cba8d918395b59df76e296e Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:47:46 +0800
Subject: [PATCH 20/32] Update frontend/app/chat/components/ChatInput.tsx

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 frontend/app/chat/components/ChatInput.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/app/chat/components/ChatInput.tsx b/frontend/app/chat/components/ChatInput.tsx
index 0aaf489..22d4262 100644
--- a/frontend/app/chat/components/ChatInput.tsx
+++ b/frontend/app/chat/components/ChatInput.tsx
@@ -160,7 +160,7 @@ export function ChatInput({
     });
   }, []);
 
-  // 处理发送消息
+  // Handle sending message
   const handleSendMessage = useCallback(async () => {
     if (files.length > 0) {
       // Prefer file; convert to base64 dataUrl for all files <20MB (image/audio/video/text/document)

From c537f09e5f86c2f5578ef9427359b393fac6c07a Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:48:04 +0800
Subject: [PATCH 21/32] Update frontend/app/chat/components/ChatInput.tsx

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 frontend/app/chat/components/ChatInput.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/app/chat/components/ChatInput.tsx b/frontend/app/chat/components/ChatInput.tsx
index 22d4262..9e87ef5 100644
--- a/frontend/app/chat/components/ChatInput.tsx
+++ b/frontend/app/chat/components/ChatInput.tsx
@@ -188,7 +188,7 @@ export function ChatInput({
     setFiles([]);
   }, [onSendMessage, files, fileToDataUrl]);
 
-  // 处理键盘事件
+  // Handle keyboard events
   const handleKeyPress = useCallback((e: React.KeyboardEvent) => {
     if (e.key === 'Enter' && !e.shiftKey) {
       e.preventDefault();

From a05befd58a2fc7c0fec6d7e3e074aeec05a2c914 Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:48:43 +0800
Subject: [PATCH 22/32] Update frontend/app/chat/components/ChatInput.tsx

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 frontend/app/chat/components/ChatInput.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/app/chat/components/ChatInput.tsx b/frontend/app/chat/components/ChatInput.tsx
index 9e87ef5..fb565ca 100644
--- a/frontend/app/chat/components/ChatInput.tsx
+++ b/frontend/app/chat/components/ChatInput.tsx
@@ -202,7 +202,7 @@ export function ChatInput({
 
   return (
     <div className="p-3">
-      {/* 附件预览区域 */}
+      {/* Attachment preview area */}
       {files.length > 0 && (
         <div className="mb-3 p-3 bg-gray-50 rounded-lg">
           <div className="flex flex-wrap gap-2">

From d903c234467c718e02fa3036cc3860bc59d2b004 Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:49:01 +0800
Subject: [PATCH 23/32] Update frontend/app/chat/components/ChatInput.tsx

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 frontend/app/chat/components/ChatInput.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/app/chat/components/ChatInput.tsx b/frontend/app/chat/components/ChatInput.tsx
index fb565ca..4ffddf8 100644
--- a/frontend/app/chat/components/ChatInput.tsx
+++ b/frontend/app/chat/components/ChatInput.tsx
@@ -251,7 +251,7 @@ export function ChatInput({
           files.forEach((f) => addFile(f));
         }}
       >
-        {/* 文件上传按钮 */}
+        {/* File upload button */}
         <Button
           variant="ghost"
           size="icon"

From 0b1872010dd06ed9daf2edbd108686d9193aeabf Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:49:26 +0800
Subject: [PATCH 24/32] Update frontend/app/chat/components/ChatInput.tsx

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 frontend/app/chat/components/ChatInput.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/app/chat/components/ChatInput.tsx b/frontend/app/chat/components/ChatInput.tsx
index 4ffddf8..7b308cd 100644
--- a/frontend/app/chat/components/ChatInput.tsx
+++ b/frontend/app/chat/components/ChatInput.tsx
@@ -296,7 +296,7 @@ export function ChatInput({
         )}
       </div>
       
-      {/* 隐藏的文件输入 */}
+      {/* Hidden file input */}
       <input
         ref={fileInputRef}
         type="file"

From cf742a2f0c0663ebd5c296936eac3354d93383ce Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:49:45 +0800
Subject: [PATCH 25/32] Update core/api/message_handlers.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 core/api/message_handlers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/api/message_handlers.py b/core/api/message_handlers.py
index b82cd83..f510eef 100644
--- a/core/api/message_handlers.py
+++ b/core/api/message_handlers.py
@@ -890,7 +890,7 @@ async def handle_send_to_run_message(ws_state: Dict, data: Dict):
                 team_state = run_context['team_state']
                 partner_state = partner_context['state']
 
-                # 构建payload，仅使用统一的 files 多模态路径
+                # Construct payload, using only the unified 'files' multimodal path
                 payload = {"prompt": prompt_content}
                 if files_content:
                     payload["files"] = files_content

From 394aab4cccec27ef754bb59aa0cffd040d372b1c Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:50:12 +0800
Subject: [PATCH 26/32] Update core/api/message_handlers.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 core/api/message_handlers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/api/message_handlers.py b/core/api/message_handlers.py
index f510eef..81ce1c9 100644
--- a/core/api/message_handlers.py
+++ b/core/api/message_handlers.py
@@ -939,7 +939,7 @@ async def handle_send_to_run_message(ws_state: Dict, data: Dict):
                 partner_state = partner_context['state']
                 team_state = run_context['team_state']
 
-                # --- Core modification: Create an InboxItem with multimodal info (files only) ---
+                # --- Core modification: Create an InboxItem with multimodal info (text and/or files) ---
                 payload = {"prompt": prompt_content}
                 if files_content:
                     payload["files"] = files_content

From 6e3c67b6cb26d799e5dedf399135048012b8fb0d Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:50:30 +0800
Subject: [PATCH 27/32] Update core/agent_core/framework/inbox_processor.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 core/agent_core/framework/inbox_processor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/agent_core/framework/inbox_processor.py b/core/agent_core/framework/inbox_processor.py
index 3fe99bf..056bb41 100644
--- a/core/agent_core/framework/inbox_processor.py
+++ b/core/agent_core/framework/inbox_processor.py
@@ -259,7 +259,7 @@ async def process(self) -> Dict[str, Any]:
                 role = params.get("role", "user")
                 is_persistent = params.get("is_persistent_in_memory", False)
                 
-                # 处理多模态内容（仅文件）
+                # Handle multimodal content (files only)
                 has_multimodal_content = False
                 content_parts = []
                 

From ca41fc9768e4dce4df99ca56e1b7384e40d5efbb Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:50:55 +0800
Subject: [PATCH 28/32] Update core/agent_core/framework/inbox_processor.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 core/agent_core/framework/inbox_processor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/agent_core/framework/inbox_processor.py b/core/agent_core/framework/inbox_processor.py
index 056bb41..b554991 100644
--- a/core/agent_core/framework/inbox_processor.py
+++ b/core/agent_core/framework/inbox_processor.py
@@ -263,7 +263,7 @@ async def process(self) -> Dict[str, Any]:
                 has_multimodal_content = False
                 content_parts = []
                 
-                # 检查是否有文件内容
+                # Check if there is file content
                 if source in ["USER_PROMPT", "USER_PROMPT_WITH_FILES"] and isinstance(dehydrated_payload, dict):
                     # 处理文件内容：将附件上传到 Gemini 并构造成 file 引用
                     files = dehydrated_payload.get("files", [])

From d9fb426500e8a81345e1b163aa723b559eeb3ffb Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:51:11 +0800
Subject: [PATCH 29/32] Update core/agent_core/framework/inbox_processor.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 core/agent_core/framework/inbox_processor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/agent_core/framework/inbox_processor.py b/core/agent_core/framework/inbox_processor.py
index b554991..3d7bf56 100644
--- a/core/agent_core/framework/inbox_processor.py
+++ b/core/agent_core/framework/inbox_processor.py
@@ -265,7 +265,7 @@ async def process(self) -> Dict[str, Any]:
                 
                 # Check if there is file content
                 if source in ["USER_PROMPT", "USER_PROMPT_WITH_FILES"] and isinstance(dehydrated_payload, dict):
-                    # 处理文件内容：将附件上传到 Gemini 并构造成 file 引用
+                    # Process file content: upload attachments to Gemini and build file references to them
                     files = dehydrated_payload.get("files", [])
                     if files:
                         has_multimodal_content = True

From b616b9f9f77ee55f85d28a0ff97c67f3bb1227d9 Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:51:27 +0800
Subject: [PATCH 30/32] Update core/agent_core/events/ingestors.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 core/agent_core/events/ingestors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/agent_core/events/ingestors.py b/core/agent_core/events/ingestors.py
index aef8fe6..a204531 100644
--- a/core/agent_core/events/ingestors.py
+++ b/core/agent_core/events/ingestors.py
@@ -339,7 +339,7 @@ def user_prompt_ingestor(payload: Any, params: Dict, context: Dict) -> str:
 
 @register_ingestor("multimodal_user_prompt_ingestor")
 def multimodal_user_prompt_ingestor(payload: Any, params: Dict, context: Dict) -> str:
-    """处理包含图像/文件的用户输入，返回适合LLM的简要文本描述（实际数据在消息构建时处理）。"""
+    """Processes user input containing images/files and returns a concise text description suitable for LLMs (actual data is handled during message construction)."""
     if not isinstance(payload, dict):
         return str(payload)
 

From 8fa6b0defc70651fee5f9d17ab7e14f2367f118b Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:51:42 +0800
Subject: [PATCH 31/32] Update core/agent_core/events/ingestors.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 core/agent_core/events/ingestors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/agent_core/events/ingestors.py b/core/agent_core/events/ingestors.py
index a204531..e2fe581 100644
--- a/core/agent_core/events/ingestors.py
+++ b/core/agent_core/events/ingestors.py
@@ -347,7 +347,7 @@ def multimodal_user_prompt_ingestor(payload: Any, params: Dict, context: Dict) -
     images = payload.get("images", [])
     files = payload.get("files", [])
 
-    # 没有图像和文件时，直接返回文本
+    # If there are no images or files, return the text directly
     if not images and not files:
         return prompt
 

From 445c7b977de34d60d9508f621842d70e340f5771 Mon Sep 17 00:00:00 2001
From: Qing Long <qinglongshengzhe@gmail.com>
Date: Thu, 14 Aug 2025 09:51:55 +0800
Subject: [PATCH 32/32] Update core/agent_core/events/ingestors.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 core/agent_core/events/ingestors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/agent_core/events/ingestors.py b/core/agent_core/events/ingestors.py
index e2fe581..94d55f5 100644
--- a/core/agent_core/events/ingestors.py
+++ b/core/agent_core/events/ingestors.py
@@ -351,7 +351,7 @@ def multimodal_user_prompt_ingestor(payload: Any, params: Dict, context: Dict) -
     if not images and not files:
         return prompt
 
-    # 构造一个简短的附件说明
+    # Construct a brief attachment description
     parts = []
     if images:
         parts.append(f"User uploaded {len(images)} image(s)")