✨ Multimodal agent.

Zhi-a · Zhi-a · commit 4787eb3d03fb · 2025-11-28T00:09:40.000+08:00
Pass the URL of the multimodal file as the query to the agent.
diff --git a/backend/agents/create_agent_info.py b/backend/agents/create_agent_info.py
@@ -239,17 +239,17 @@ async def create_tool_config_list(agent_id, tenant_id, user_id):
                 "vdb_core": get_vector_db_core(),
                 "embedding_model": get_embedding_model(tenant_id=tenant_id),
             }
-        elif tool_config.class_name == "AnalyzeImageTool":
-            tool_config.metadata = {
-                "vlm_model": get_vlm_model(tenant_id=tenant_id),
-                "storage_client": minio_client,
-            }
         elif tool_config.class_name == "AnalyzeTextFileTool":
             tool_config.metadata = {
                 "llm_model": get_llm_model(tenant_id=tenant_id),
                 "storage_client": minio_client,
                 "data_process_service_url": DATA_PROCESS_SERVICE
             }
+        elif tool_config.class_name == "AnalyzeImageTool":
+            tool_config.metadata = {
+                "vlm_model": get_vlm_model(tenant_id=tenant_id),
+                "storage_client": minio_client,
+            }
 
         tool_config_list.append(tool_config)
 
diff --git a/backend/services/tool_configuration_service.py b/backend/services/tool_configuration_service.py
@@ -616,25 +616,25 @@ def _validate_local_tool(
                 'embedding_model': embedding_model,
             }
             tool_instance = tool_class(**params)
-        elif tool_name == "analyze_text_file":
+        elif tool_name == "analyze_image":
             if not tenant_id or not user_id:
                 raise ToolExecutionException(f"Tenant ID and User ID are required for {tool_name} validation")
-            long_text_to_text_model = get_llm_model(tenant_id=tenant_id)
+            image_to_text_model = get_vlm_model(tenant_id=tenant_id)
             params = {
                 **instantiation_params,
-                'llm_model': long_text_to_text_model,
-                'storage_client': minio_client,
-                "data_process_service_url": DATA_PROCESS_SERVICE
+                'vlm_model': image_to_text_model,
+                'storage_client': minio_client
             }
             tool_instance = tool_class(**params)
-        elif tool_name == "analyze_image":
+        elif tool_name == "analyze_text_file":
             if not tenant_id or not user_id:
                 raise ToolExecutionException(f"Tenant ID and User ID are required for {tool_name} validation")
-            image_to_text_model = get_vlm_model(tenant_id=tenant_id)
+            long_text_to_text_model = get_llm_model(tenant_id=tenant_id)
             params = {
                 **instantiation_params,
-                'vlm_model': image_to_text_model,
-                'storage_client': minio_client
+                'llm_model': long_text_to_text_model,
+                'storage_client': minio_client,
+                "data_process_service_url": DATA_PROCESS_SERVICE
             }
             tool_instance = tool_class(**params)
         else:
diff --git a/frontend/app/[locale]/agents/components/PromptManager.tsx b/frontend/app/[locale]/agents/components/PromptManager.tsx
@@ -615,7 +615,7 @@ export default function PromptManager({
                   overflowY: "auto",
                 }}
                 autoSize={false}
-                disabled={!isEditingMode}
+                disabled={!isEditingMode || isGeneratingAgent}
               />
             </div>
             
diff --git a/sdk/nexent/core/agents/nexent_agent.py b/sdk/nexent/core/agents/nexent_agent.py
@@ -83,17 +83,17 @@ def create_local_tool(self, tool_config: ToolConfig):
                     "vdb_core", None) if tool_config.metadata else None
                 tools_obj.embedding_model = tool_config.metadata.get(
                     "embedding_model", None) if tool_config.metadata else None
-            elif class_name == "AnalyzeImageTool":
-                tools_obj = tool_class(observer=self.observer,
-                                       vlm_model=tool_config.metadata.get("vlm_model", []),
-                                       storage_client=tool_config.metadata.get("storage_client", []),
-                                       **params)
             elif class_name == "AnalyzeTextFileTool":
                 tools_obj = tool_class(observer=self.observer,
                                        llm_model=tool_config.metadata.get("llm_model", []),
                                        storage_client=tool_config.metadata.get("storage_client", []),
                                        data_process_service_url=tool_config.metadata.get("data_process_service_url", []),
                                        **params)
+            elif class_name == "AnalyzeImageTool":
+                tools_obj = tool_class(observer=self.observer,
+                                       vlm_model=tool_config.metadata.get("vlm_model", []),
+                                       storage_client=tool_config.metadata.get("storage_client", []),
+                                       **params)
             else:
                 tools_obj = tool_class(**params)
                 if hasattr(tools_obj, 'observer'):
diff --git a/sdk/nexent/core/tools/__init__.py b/sdk/nexent/core/tools/__init__.py
@@ -12,8 +12,8 @@
 from .move_item_tool import MoveItemTool
 from .list_directory_tool import ListDirectoryTool
 from .terminal_tool import TerminalTool
-from .analyze_image_tool import AnalyzeImageTool
 from .analyze_text_file_tool import AnalyzeTextFileTool
+from .analyze_image_tool import AnalyzeImageTool
 
 __all__ = [
     "ExaSearchTool", 
@@ -30,6 +30,6 @@
     "MoveItemTool",
     "ListDirectoryTool",
     "TerminalTool",
-    "AnalyzeImageTool",
-    "AnalyzeTextFileTool"
+    "AnalyzeTextFileTool",
+    "AnalyzeImageTool"
 ]
diff --git a/sdk/nexent/core/utils/prompt_template_utils.py b/sdk/nexent/core/utils/prompt_template_utils.py
@@ -26,12 +26,14 @@
 def get_prompt_template(template_type: str, language: str = LANGUAGE["ZH"], **kwargs) -> Dict[str, Any]:
     """
     Get prompt template
+
     Args:
         template_type: Template type, supports the following values:
             - 'analyze_image': Analyze image template
             - 'analyze_file': Analyze file template (for text files)
         language: Language code ('zh' or 'en')
         **kwargs: Additional parameters, for agent type need to pass is_manager parameter
+
     Returns:
         dict: Loaded prompt template
     """
diff --git a/sdk/nexent/core/utils/tools_common_message.py b/sdk/nexent/core/utils/tools_common_message.py
@@ -11,7 +11,7 @@ class ToolSign(Enum):
     TAVILY_SEARCH = "d"  # Tavily search tool identifier
     FILE_OPERATION = "f"      # File operation tool identifier
     TERMINAL_OPERATION = "t"  # Terminal operation tool identifier
-    MULTIMODAL_OPERATION = "m"  # Multimodal operation tool identifier
+    MULTIMODAL_OPERATION = "m" # Multimodal operation tool identifier
 
 
 # Tool sign mapping for backward compatibility
diff --git a/test/backend/agents/test_create_agent_info.py b/test/backend/agents/test_create_agent_info.py
@@ -5,22 +5,17 @@
 from pathlib import Path
 from unittest.mock import AsyncMock, MagicMock, patch, Mock, PropertyMock
 
+from test.common.env_test_utils import bootstrap_env
+
+env_state = bootstrap_env()
+consts_const = env_state["mock_const"]
 TEST_ROOT = Path(__file__).resolve().parents[2]
 PROJECT_ROOT = TEST_ROOT.parent
 
 # Ensure project backend package is found before test/backend
 for _path in (str(PROJECT_ROOT), str(TEST_ROOT)):
     if _path not in sys.path:
         sys.path.insert(0, _path)
-from test.common.env_test_utils import bootstrap_env
-
-env_state = bootstrap_env()
-consts_const = env_state["mock_const"]
-
-from test.common.env_test_utils import bootstrap_env
-
-env_state = bootstrap_env()
-consts_const = env_state["mock_const"]
 
 # Utilities ---------------------------------------------------------------
 def _create_stub_module(name: str, **attrs):
@@ -47,30 +42,6 @@ def _create_stub_module(name: str, **attrs):
 consts_const.MODEL_CONFIG_MAPPING = {"llm": "llm_config"}
 consts_const.LANGUAGE = {"ZH": "zh"}
 consts_const.DATA_PROCESS_SERVICE = "https://example.com/data-process"
-# Utilities ---------------------------------------------------------------
-def _create_stub_module(name: str, **attrs):
-    """Return a lightweight module stub with the provided attributes."""
-    module = types.ModuleType(name)
-    for attr_name, attr_value in attrs.items():
-        setattr(module, attr_name, attr_value)
-    return module
-
-
-# Configure required constants via shared bootstrap env
-consts_const.MINIO_ENDPOINT = "http://localhost:9000"
-consts_const.MINIO_ACCESS_KEY = "test_access_key"
-consts_const.MINIO_SECRET_KEY = "test_secret_key"
-consts_const.MINIO_REGION = "us-east-1"
-consts_const.MINIO_DEFAULT_BUCKET = "test-bucket"
-consts_const.POSTGRES_HOST = "localhost"
-consts_const.POSTGRES_USER = "test_user"
-consts_const.NEXENT_POSTGRES_PASSWORD = "test_password"
-consts_const.POSTGRES_DB = "test_db"
-consts_const.POSTGRES_PORT = 5432
-consts_const.DEFAULT_TENANT_ID = "default_tenant"
-consts_const.LOCAL_MCP_SERVER = "http://localhost:5011"
-consts_const.MODEL_CONFIG_MAPPING = {"llm": "llm_config"}
-consts_const.LANGUAGE = {"ZH": "zh"}
 
 # Mock utils module
 utils_mock = MagicMock()
@@ -125,13 +96,14 @@ def _create_stub_module(name: str, **attrs):
 sys.modules['utils.langchain_utils'] = MagicMock()
 sys.modules['utils.model_name_utils'] = MagicMock()
 sys.modules['langchain_core.tools'] = MagicMock()
-sys.modules['services.memory_config_service'] = MagicMock()
 # Build services module hierarchy with minimal functionality
 services_module = _create_stub_module("services")
 sys.modules['services'] = services_module
 sys.modules['services.image_service'] = _create_stub_module(
     "services.image_service", get_vlm_model=MagicMock(return_value="stub_vlm")
 )
+sys.modules['services.memory_config_service'] = MagicMock()
+# Extend services hierarchy with additional stubs
 sys.modules['services.file_management_service'] = _create_stub_module(
     "services.file_management_service",
     get_llm_model=MagicMock(return_value="stub_llm_model"),
@@ -140,18 +112,16 @@ def _create_stub_module(name: str, **attrs):
     "services.tool_configuration_service",
     initialize_tools_on_startup=AsyncMock(),
 )
+sys.modules['nexent.memory.memory_service'] = MagicMock()
+
 # Build top-level nexent module to avoid importing the real package
-nexent_module = _create_stub_module(
-    "nexent",
-    MessageObserver=mock_message_observer,
-)
+nexent_module = _create_stub_module("nexent", MessageObserver=mock_message_observer)
 sys.modules['nexent'] = nexent_module
 
 # Create nested modules for nexent.core to satisfy imports safely
 sys.modules['nexent.core'] = _create_stub_module("nexent.core")
 sys.modules['nexent.core.agents'] = _create_stub_module("nexent.core.agents")
 sys.modules['nexent.core.utils'] = _create_stub_module("nexent.core.utils")
-sys.modules['nexent.memory.memory_service'] = MagicMock()
 
 # Create mock classes that might be imported
 mock_agent_config = MagicMock()
@@ -397,76 +367,76 @@ async def test_create_tool_config_list_with_knowledge_base_tool(self):
             assert last_call[1]['class_name'] == "KnowledgeBaseSearchTool"
 
     @pytest.mark.asyncio
-    async def test_create_tool_config_list_with_analyze_text_file_tool(self):
-        """Ensure AnalyzeTextFileTool receives text-specific metadata."""
+    async def test_create_tool_config_list_with_analyze_image_tool(self):
+        """Ensure AnalyzeImageTool receives VLM model metadata."""
         mock_tool_instance = MagicMock()
-        mock_tool_instance.class_name = "AnalyzeTextFileTool"
+        mock_tool_instance.class_name = "AnalyzeImageTool"
         mock_tool_config.return_value = mock_tool_instance
 
         with patch('backend.agents.create_agent_info.discover_langchain_tools', return_value=[]), \
                 patch('backend.agents.create_agent_info.search_tools_for_sub_agent') as mock_search_tools, \
-                patch('backend.agents.create_agent_info.get_llm_model') as mock_get_llm_model, \
+                patch('backend.agents.create_agent_info.get_vlm_model') as mock_get_vlm_model, \
                 patch('backend.agents.create_agent_info.minio_client', new_callable=MagicMock) as mock_minio_client:
 
             mock_search_tools.return_value = [
                 {
-                    "class_name": "AnalyzeTextFileTool",
-                    "name": "analyze_text_file",
-                    "description": "Analyze text file tool",
+                    "class_name": "AnalyzeImageTool",
+                    "name": "analyze_image",
+                    "description": "Analyze image tool",
                     "inputs": "string",
-                    "output_type": "array",
+                    "output_type": "string",
                     "params": [{"name": "prompt", "default": "describe"}],
                     "source": "local",
                     "usage": None
                 }
             ]
-            mock_get_llm_model.return_value = "mock_llm_model"
+            mock_get_vlm_model.return_value = "mock_vlm_model"
 
             result = await create_tool_config_list("agent_1", "tenant_1", "user_1")
 
             assert len(result) == 1
             assert result[0] is mock_tool_instance
-            mock_get_llm_model.assert_called_once_with(tenant_id="tenant_1")
+            mock_get_vlm_model.assert_called_once_with(tenant_id="tenant_1")
             assert mock_tool_instance.metadata == {
-                "llm_model": "mock_llm_model",
-                "storage_client": mock_minio_client,
-                "data_process_service_url": consts_const.DATA_PROCESS_SERVICE,
+                "vlm_model": "mock_vlm_model",
+                "storage_client": mock_minio_client
             }
 
     @pytest.mark.asyncio
-    async def test_create_tool_config_list_with_analyze_image_tool(self):
-        """Ensure AnalyzeImageTool receives VLM model metadata."""
+    async def test_create_tool_config_list_with_analyze_text_file_tool(self):
+        """Ensure AnalyzeTextFileTool receives text-specific metadata."""
         mock_tool_instance = MagicMock()
-        mock_tool_instance.class_name = "AnalyzeImageTool"
+        mock_tool_instance.class_name = "AnalyzeTextFileTool"
         mock_tool_config.return_value = mock_tool_instance
 
         with patch('backend.agents.create_agent_info.discover_langchain_tools', return_value=[]), \
                 patch('backend.agents.create_agent_info.search_tools_for_sub_agent') as mock_search_tools, \
-                patch('backend.agents.create_agent_info.get_vlm_model') as mock_get_vlm_model, \
+                patch('backend.agents.create_agent_info.get_llm_model') as mock_get_llm_model, \
                 patch('backend.agents.create_agent_info.minio_client', new_callable=MagicMock) as mock_minio_client:
 
             mock_search_tools.return_value = [
                 {
-                    "class_name": "AnalyzeImageTool",
-                    "name": "analyze_image",
-                    "description": "Analyze image tool",
+                    "class_name": "AnalyzeTextFileTool",
+                    "name": "analyze_text_file",
+                    "description": "Analyze text file tool",
                     "inputs": "string",
-                    "output_type": "string",
+                    "output_type": "array",
                     "params": [{"name": "prompt", "default": "describe"}],
                     "source": "local",
                     "usage": None
                 }
             ]
-            mock_get_vlm_model.return_value = "mock_vlm_model"
+            mock_get_llm_model.return_value = "mock_llm_model"
 
             result = await create_tool_config_list("agent_1", "tenant_1", "user_1")
 
             assert len(result) == 1
             assert result[0] is mock_tool_instance
-            mock_get_vlm_model.assert_called_once_with(tenant_id="tenant_1")
+            mock_get_llm_model.assert_called_once_with(tenant_id="tenant_1")
             assert mock_tool_instance.metadata == {
-                "vlm_model": "mock_vlm_model",
-                "storage_client": mock_minio_client
+                "llm_model": "mock_llm_model",
+                "storage_client": mock_minio_client,
+                "data_process_service_url": consts_const.DATA_PROCESS_SERVICE,
             }
 
 
@@ -1278,16 +1248,13 @@ async def test_join_minio_file_description_to_query_with_files(self):
         minio_files = [
             {"url": "/nexent/1.pdf", "name": "1.pdf"},
             {"url": "/nexent/2.pdf", "name": "2.pdf"},
-            {"url": "/nexent/3.pdf", "name": "3.pdf"},
+            {"no_description": "should be ignored"}
         ]
         query = "test query"
 
         result = await join_minio_file_description_to_query(minio_files, query)
 
-        expected = ("User provided some reference files:\nFile S3 URL: s3://nexent/1.pdf, file name:1.pdf\n"
-                    "File S3 URL: s3://nexent/2.pdf, file name:2.pdf\n"
-                    "File S3 URL: s3://nexent/3.pdf, file name:3.pdf\n\n"
-                    'User wants to answer questions based on the above information: test query')
+        expected = "User provided some reference files:\nFile S3 URL: s3://nexent/1.pdf, file name:1.pdf\nFile S3 URL: s3://nexent/2.pdf, file name:2.pdf\n\nUser wants to answer questions based on the above information: test query"
         assert result == expected
 
     @pytest.mark.asyncio
diff --git a/test/backend/services/test_file_management_service.py b/test/backend/services/test_file_management_service.py
diff --git a/test/backend/services/test_tool_configuration_service.py b/test/backend/services/test_tool_configuration_service.py
diff --git a/test/sdk/core/agents/test_nexent_agent.py b/test/sdk/core/agents/test_nexent_agent.py
diff --git a/test/sdk/core/utils/test_prompt_template_utils.py b/test/sdk/core/utils/test_prompt_template_utils.py