Skip to content

Commit 32fd9d9

Browse files
mushenL杨堃luyan
authored
add streamllm_agent (#635)
Co-authored-by: 杨堃 <yk01645326@alibaba-inc.com> Co-authored-by: luyan <luyan@U-V61TJ94D-2208.local>
1 parent d82e1c8 commit 32fd9d9

File tree

4 files changed

+50
-17
lines changed

4 files changed

+50
-17
lines changed

ms_agent/agent/llm_agent.py

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from ms_agent.tools import ToolManager
1616
from ms_agent.utils import async_retry
1717
from ms_agent.utils.logger import logger
18-
from omegaconf import DictConfig
18+
from omegaconf import DictConfig, OmegaConf
1919

2020
from ..utils.utils import read_history, save_history
2121
from .base import Agent
@@ -308,8 +308,9 @@ def _log_output(content: str, tag: str):
308308
for _line in line.split('\n'):
309309
logger.info(f'[{tag}] {_line}')
310310

311-
@async_retry(max_attempts=2)
312-
async def _step(self, messages: List[Message], tag: str) -> List[Message]:
311+
@async_retry(max_attempts=2, delay=1.0)
312+
async def _step(self, messages: List[Message],
313+
tag: str) -> List[Message]: # type: ignore
313314
"""
314315
Execute a single step in the agent's interaction loop.
315316
@@ -345,12 +346,18 @@ async def _step(self, messages: List[Message], tag: str) -> List[Message]:
345346
self.config.generation_config, 'stream', False):
346347
self._log_output('[assistant]:', tag=tag)
347348
_content = ''
349+
is_first = True
348350
for _response_message in self._handle_stream_message(
349351
messages, tools=tools):
352+
if is_first:
353+
messages.append(_response_message)
354+
is_first = False
350355
new_content = _response_message.content[len(_content):]
351356
sys.stdout.write(new_content)
352357
sys.stdout.flush()
353358
_content = _response_message.content
359+
messages[-1] = _response_message
360+
yield messages
354361
sys.stdout.write('\n')
355362
else:
356363
_response_message = self.llm.generate(messages, tools=tools)
@@ -381,7 +388,7 @@ async def _step(self, messages: List[Message], tag: str) -> List[Message]:
381388
f'[usage] prompt_tokens: {_response_message.prompt_tokens}, '
382389
f'completion_tokens: {_response_message.completion_tokens}',
383390
tag=tag)
384-
return messages
391+
yield messages
385392

386393
def _prepare_llm(self):
387394
"""Initialize the LLM model from the configuration."""
@@ -440,13 +447,8 @@ def _save_history(self, messages: List[Message], **kwargs):
440447
config=config,
441448
messages=messages)
442449

443-
async def run(self, messages: Union[List[Message], str],
444-
**kwargs) -> List[Message]:
445-
"""
446-
Main method to execute the agent.
447-
448-
Runs the agent loop, which includes generating responses,
449-
calling tools, and managing memory and planning.
450+
async def _run(self, messages: Union[List[Message], str], **kwargs):
451+
"""Run the agent, mainly contains a llm calling and tool calling loop.
450452
451453
Args:
452454
messages (Union[List[Message], str]): Input data for the agent. Can be a raw string prompt,
@@ -483,7 +485,9 @@ async def run(self, messages: Union[List[Message], str],
483485
self._log_output('[' + message.role + ']:', tag=self.tag)
484486
self._log_output(message.content, tag=self.tag)
485487
while not self.runtime.should_stop:
486-
messages = await self._step(messages, self.tag)
488+
yield_step = self._step(messages, self.tag)
489+
async for messages in yield_step:
490+
yield messages
487491
self.runtime.round += 1
488492
# +1 means the next round the assistant may give a conclusion
489493
if self.runtime.round >= self.max_chat_round + 1:
@@ -495,15 +499,35 @@ async def run(self, messages: Union[List[Message], str],
495499
f'Task {messages[1].content} failed, max round({self.max_chat_round}) exceeded.'
496500
))
497501
self.runtime.should_stop = True
502+
yield messages
498503
# save history
499504
self._save_history(messages, **kwargs)
500505

501506
await self._loop_callback('on_task_end', messages)
502507
await self._cleanup_tools()
503-
return messages
504508
except Exception as e:
505509
if hasattr(self.config, 'help'):
506510
logger.error(
507511
f'[{self.tag}] Runtime error, please follow the instructions:\n\n {self.config.help}'
508512
)
509513
raise e
514+
515+
async def run(self, messages: Union[List[Message], str],
516+
**kwargs) -> List[Message]:
517+
stream = kwargs.get('stream', False)
518+
if stream:
519+
OmegaConf.update(
520+
self.config, 'generation_config.stream', True, merge=True)
521+
522+
if stream:
523+
524+
async def stream_generator():
525+
async for chunk in self._run(messages=messages, **kwargs):
526+
yield chunk
527+
528+
return stream_generator()
529+
else:
530+
res = None
531+
async for chunk in self._run(messages=messages, **kwargs):
532+
res = chunk
533+
return res

ms_agent/llm/openai_llm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def format_tools(self,
7272
tools = None
7373
return tools
7474

75-
@retry(max_attempts=12, delay=1.0)
75+
@retry(max_attempts=3, delay=1.0)
7676
def generate(self,
7777
messages: List[Message],
7878
tools: Optional[List[Tool]] = None,

ms_agent/tools/mcp_client.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,13 @@ async def call_tool(self, server_name: str, tool_name: str,
6161
texts = []
6262
if response.isError:
6363
sep = '\n\n'
64-
return f'execute error: {sep.join(response.content)}'
64+
if all(isinstance(item, str) for item in response.content):
65+
return f'execute error: {sep.join(response.content)}'
66+
else:
67+
item_list = []
68+
for item in response.content:
69+
item_list.append(item.text)
70+
return f'execute error: {sep.join(item_list)}'
6571
for content in response.content:
6672
if content.type == 'text':
6773
texts.append(content.text)

ms_agent/utils/llm_utils.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# Copyright (c) Alibaba, Inc. and its affiliates.
2+
import asyncio
23
import functools
34
import time
45
from typing import Callable, Tuple, Type, TypeVar, Union
@@ -63,15 +64,17 @@ async def wrapper(*args, **kwargs) -> T:
6364

6465
for attempt in range(1, max_attempts + 1):
6566
try:
66-
return await func(*args, **kwargs)
67+
async for item in func(*args, **kwargs):
68+
yield item
69+
return
6770
except exceptions as e:
6871
last_exception = e
6972
if attempt < max_attempts:
7073
logger.warning(
7174
f'Attempt {attempt}/{max_attempts} fails: {func.__name__}. '
7275
f'Exception message: {e}. Will retry in {current_delay:.2f} seconds.'
7376
)
74-
time.sleep(current_delay)
77+
await asyncio.sleep(current_delay)
7578
current_delay *= backoff_factor
7679
else:
7780
logger.error(

0 commit comments

Comments (0)