diff --git a/examples/basic_modules/llm.py b/examples/basic_modules/llm.py
index d5fb16082..d33fc9544 100644
--- a/examples/basic_modules/llm.py
+++ b/examples/basic_modules/llm.py
@@ -69,6 +69,11 @@
print("Scenario 3:", response)
print("==" * 20)
+print("Scenario 3:\n")
+for chunk in llm.generate_stream(messages):
+ print(chunk, end="")
+print("==" * 20)
+
# Scenario 4: Using LLMFactory with Huggingface Models
@@ -91,3 +96,89 @@
response = llm.generate(messages)
print("Scenario 4:", response)
print("==" * 20)
+
+
+# Scenario 5: Using LLMFactory with Qwen (DashScope Compatible API)
+# Note:
+# This example works for any model that supports the OpenAI-compatible Chat Completion API,
+# including but not limited to:
+# - Qwen models: qwen-plus, qwen-max-2025-01-25
+# - DeepSeek models: deepseek-chat, deepseek-coder, deepseek-v3
+# - Other compatible providers: MiniMax, Fireworks, Groq, OpenRouter, etc.
+#
+# Just set the correct `api_key`, `api_base`, and `model_name_or_path`.
+
+config = LLMConfigFactory.model_validate(
+ {
+ "backend": "qwen",
+ "config": {
+ "model_name_or_path": "qwen-plus", # or qwen-max-2025-01-25
+ "temperature": 0.7,
+ "max_tokens": 1024,
+ "top_p": 0.9,
+ "top_k": 50,
+ "api_key": "sk-xxx",
+ "api_base": "https://dashscope.aliyuncs.com/compatible-mode/v1",
+ },
+ }
+)
+llm = LLMFactory.from_config(config)
+messages = [
+ {"role": "user", "content": "Hello, who are you"},
+]
+response = llm.generate(messages)
+print("Scenario 5:", response)
+print("==" * 20)
+
+print("Scenario 5:\n")
+for chunk in llm.generate_stream(messages):
+ print(chunk, end="")
+print("==" * 20)
+
+# Scenario 6: Using LLMFactory with Deepseek-chat
+
+cfg = LLMConfigFactory.model_validate(
+ {
+ "backend": "deepseek",
+ "config": {
+ "model_name_or_path": "deepseek-chat",
+ "api_key": "sk-xxx",
+ "api_base": "https://api.deepseek.com",
+ "temperature": 0.6,
+ "max_tokens": 512,
+ "remove_think_prefix": False,
+ },
+ }
+)
+llm = LLMFactory.from_config(cfg)
+messages = [{"role": "user", "content": "Hello, who are you"}]
+resp = llm.generate(messages)
+print("Scenario 6:", resp)
+
+
+# Scenario 7: Using LLMFactory with DeepSeek-reasoner (reasoning + CoT + streaming)
+
+cfg2 = LLMConfigFactory.model_validate(
+ {
+ "backend": "deepseek",
+ "config": {
+ "model_name_or_path": "deepseek-reasoner",
+ "api_key": "sk-xxx",
+ "api_base": "https://api.deepseek.com",
+ "temperature": 0.2,
+ "max_tokens": 1024,
+ "remove_think_prefix": False,
+ },
+ }
+)
+llm = LLMFactory.from_config(cfg2)
+messages = [
+ {
+ "role": "user",
+ "content": "Explain how to solve this problem step-by-step. Be explicit in your thinking process. Question: If a train travels from city A to city B at 60 mph and returns at 40 mph, what is its average speed for the entire trip? Let's think step by step.",
+ },
+]
+print("Scenario 7:\n")
+for chunk in llm.generate_stream(messages):
+ print(chunk, end="")
+print("==" * 20)
diff --git a/src/memos/configs/llm.py b/src/memos/configs/llm.py
index 6ff99e569..d69a0a0fc 100644
--- a/src/memos/configs/llm.py
+++ b/src/memos/configs/llm.py
@@ -27,6 +27,28 @@ class OpenAILLMConfig(BaseLLMConfig):
extra_body: Any = Field(default=None, description="extra body")
+class QwenLLMConfig(BaseLLMConfig):
+ api_key: str = Field(..., description="API key for DashScope (Qwen)")
+ api_base: str = Field(
+ default="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
+ description="Base URL for Qwen OpenAI-compatible API",
+ )
+ extra_body: Any = Field(default=None, description="extra body")
+ model_name_or_path: str = Field(..., description="Model name for Qwen, e.g., 'qwen-plus'")
+
+
+class DeepSeekLLMConfig(BaseLLMConfig):
+ api_key: str = Field(..., description="API key for DeepSeek")
+ api_base: str = Field(
+ default="https://api.deepseek.com",
+ description="Base URL for DeepSeek OpenAI-compatible API",
+ )
+ extra_body: Any = Field(default=None, description="Extra options for API")
+ model_name_or_path: str = Field(
+ ..., description="Model name: 'deepseek-chat' or 'deepseek-reasoner'"
+ )
+
+
class AzureLLMConfig(BaseLLMConfig):
base_url: str = Field(
default="https://api.openai.azure.com/",
@@ -78,6 +100,8 @@ class LLMConfigFactory(BaseConfig):
"huggingface": HFLLMConfig,
"vllm": VLLMLLMConfig,
"huggingface_singleton": HFLLMConfig, # Add singleton support
+ "qwen": QwenLLMConfig,
+ "deepseek": DeepSeekLLMConfig,
}
@field_validator("backend")
diff --git a/src/memos/llms/base.py b/src/memos/llms/base.py
index 312d19f1b..8c3681e16 100644
--- a/src/memos/llms/base.py
+++ b/src/memos/llms/base.py
@@ -1,4 +1,5 @@
from abc import ABC, abstractmethod
+from collections.abc import Generator
from memos.configs.llm import BaseLLMConfig
from memos.types import MessageList
@@ -14,3 +15,11 @@ def __init__(self, config: BaseLLMConfig):
@abstractmethod
def generate(self, messages: MessageList, **kwargs) -> str:
"""Generate a response from the LLM."""
+
+ @abstractmethod
+ def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
+ """
+ Generate a streaming response from the LLM.
+ Subclasses that support streaming should override this;
+ backends without streaming support may implement it by raising NotImplementedError.
+ """
diff --git a/src/memos/llms/deepseek.py b/src/memos/llms/deepseek.py
new file mode 100644
index 000000000..f5ee4842b
--- /dev/null
+++ b/src/memos/llms/deepseek.py
@@ -0,0 +1,54 @@
+from collections.abc import Generator
+
+from memos.configs.llm import DeepSeekLLMConfig
+from memos.llms.openai import OpenAILLM
+from memos.llms.utils import remove_thinking_tags
+from memos.log import get_logger
+from memos.types import MessageList
+
+
+logger = get_logger(__name__)
+
+
+class DeepSeekLLM(OpenAILLM):
+ """DeepSeek LLM via OpenAI-compatible API."""
+
+ def __init__(self, config: DeepSeekLLMConfig):
+ super().__init__(config)
+
+ def generate(self, messages: MessageList) -> str:
+ """Generate a response from DeepSeek."""
+ response = self.client.chat.completions.create(
+ model=self.config.model_name_or_path,
+ messages=messages,
+ temperature=self.config.temperature,
+ max_tokens=self.config.max_tokens,
+ top_p=self.config.top_p,
+ extra_body=self.config.extra_body,
+ )
+ logger.info(f"Response from DeepSeek: {response.model_dump_json()}")
+ response_content = response.choices[0].message.content
+ if self.config.remove_think_prefix:
+ return remove_thinking_tags(response_content)
+ else:
+ return response_content
+
+ def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
+ """Stream response from DeepSeek."""
+ response = self.client.chat.completions.create(
+ model=self.config.model_name_or_path,
+ messages=messages,
+ stream=True,
+ temperature=self.config.temperature,
+ max_tokens=self.config.max_tokens,
+ top_p=self.config.top_p,
+ extra_body=self.config.extra_body,
+ )
+ # Streaming chunks of text
+ for chunk in response:
+ delta = chunk.choices[0].delta
+ if hasattr(delta, "reasoning_content") and delta.reasoning_content:
+ yield delta.reasoning_content
+
+ if hasattr(delta, "content") and delta.content:
+ yield delta.content
diff --git a/src/memos/llms/factory.py b/src/memos/llms/factory.py
index c0f6d0a82..0c12a667a 100644
--- a/src/memos/llms/factory.py
+++ b/src/memos/llms/factory.py
@@ -2,10 +2,12 @@
from memos.configs.llm import LLMConfigFactory
from memos.llms.base import BaseLLM
+from memos.llms.deepseek import DeepSeekLLM
from memos.llms.hf import HFLLM
from memos.llms.hf_singleton import HFSingletonLLM
from memos.llms.ollama import OllamaLLM
from memos.llms.openai import AzureLLM, OpenAILLM
+from memos.llms.qwen import QwenLLM
from memos.llms.vllm import VLLMLLM
@@ -19,6 +21,8 @@ class LLMFactory(BaseLLM):
"huggingface": HFLLM,
"huggingface_singleton": HFSingletonLLM, # Add singleton version
"vllm": VLLMLLM,
+ "qwen": QwenLLM,
+ "deepseek": DeepSeekLLM,
}
@classmethod
diff --git a/src/memos/llms/ollama.py b/src/memos/llms/ollama.py
index 00e230789..050b7a253 100644
--- a/src/memos/llms/ollama.py
+++ b/src/memos/llms/ollama.py
@@ -1,3 +1,4 @@
+from collections.abc import Generator
from typing import Any
from ollama import Client
@@ -80,3 +81,6 @@ def generate(self, messages: MessageList) -> Any:
return remove_thinking_tags(str_response)
else:
return str_response
+
+ def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
+ raise NotImplementedError
diff --git a/src/memos/llms/openai.py b/src/memos/llms/openai.py
index ef8a005b2..148f6a2ca 100644
--- a/src/memos/llms/openai.py
+++ b/src/memos/llms/openai.py
@@ -1,3 +1,5 @@
+from collections.abc import Generator
+
import openai
from memos.configs.llm import AzureLLMConfig, OpenAILLMConfig
@@ -34,6 +36,39 @@ def generate(self, messages: MessageList) -> str:
else:
return response_content
+ def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
+ """Stream response from OpenAI LLM with optional reasoning support."""
+ response = self.client.chat.completions.create(
+ model=self.config.model_name_or_path,
+ messages=messages,
+ stream=True,
+ temperature=self.config.temperature,
+ max_tokens=self.config.max_tokens,
+ top_p=self.config.top_p,
+ extra_body=self.config.extra_body,
+ )
+
+ reasoning_started = False
+
+ for chunk in response:
+ delta = chunk.choices[0].delta
+
+ # Support for custom 'reasoning_content' (if present in OpenAI-compatible models like Qwen)
+ if hasattr(delta, "reasoning_content") and delta.reasoning_content:
+ if not reasoning_started and not self.config.remove_think_prefix:
+ yield ""
+ reasoning_started = True
+ yield delta.reasoning_content
+ elif hasattr(delta, "content") and delta.content:
+ if reasoning_started and not self.config.remove_think_prefix:
+ yield ""
+ reasoning_started = False
+ yield delta.content
+
+ # Ensure we close the <think> block if it was not already closed
+ if reasoning_started and not self.config.remove_think_prefix:
+ yield "</think>"
+
class AzureLLM(BaseLLM):
"""Azure OpenAI LLM class."""
@@ -61,3 +96,6 @@ def generate(self, messages: MessageList) -> str:
return remove_thinking_tags(response_content)
else:
return response_content
+
+ def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
+ raise NotImplementedError
diff --git a/src/memos/llms/qwen.py b/src/memos/llms/qwen.py
new file mode 100644
index 000000000..a47fcdf36
--- /dev/null
+++ b/src/memos/llms/qwen.py
@@ -0,0 +1,63 @@
+from collections.abc import Generator
+
+from memos.configs.llm import QwenLLMConfig
+from memos.llms.openai import OpenAILLM
+from memos.llms.utils import remove_thinking_tags
+from memos.log import get_logger
+from memos.types import MessageList
+
+
+logger = get_logger(__name__)
+
+
+class QwenLLM(OpenAILLM):
+ """Qwen (DashScope) LLM class via OpenAI-compatible API."""
+
+ def __init__(self, config: QwenLLMConfig):
+ super().__init__(config)
+
+ def generate(self, messages: MessageList) -> str:
+ """Generate a response from Qwen LLM."""
+ response = self.client.chat.completions.create(
+ model=self.config.model_name_or_path,
+ messages=messages,
+ extra_body=self.config.extra_body,
+ temperature=self.config.temperature,
+ max_tokens=self.config.max_tokens,
+ top_p=self.config.top_p,
+ )
+ logger.info(f"Response from Qwen: {response.model_dump_json()}")
+ response_content = response.choices[0].message.content
+ if self.config.remove_think_prefix:
+ return remove_thinking_tags(response_content)
+ else:
+ return response_content
+
+ def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
+ """Stream response from Qwen LLM."""
+ response = self.client.chat.completions.create(
+ model=self.config.model_name_or_path,
+ messages=messages,
+ stream=True,
+ temperature=self.config.temperature,
+ max_tokens=self.config.max_tokens,
+ top_p=self.config.top_p,
+ extra_body=self.config.extra_body,
+ )
+
+ reasoning_started = False
+ for chunk in response:
+ delta = chunk.choices[0].delta
+
+ # Some models may have separate `reasoning_content` vs `content`
+ # For Qwen (DashScope), likely only `content` is used
+ if hasattr(delta, "reasoning_content") and delta.reasoning_content:
+ if not reasoning_started and not self.config.remove_think_prefix:
+ yield ""
+ reasoning_started = True
+ yield delta.reasoning_content
+ elif hasattr(delta, "content") and delta.content:
+ if reasoning_started and not self.config.remove_think_prefix:
+ yield ""
+ reasoning_started = False
+ yield delta.content
diff --git a/src/memos/templates/mem_reader_prompts.py b/src/memos/templates/mem_reader_prompts.py
index d1b9abb8e..89298e8de 100644
--- a/src/memos/templates/mem_reader_prompts.py
+++ b/src/memos/templates/mem_reader_prompts.py
@@ -71,7 +71,7 @@
"summary": "Tom is currently focused on managing a new project with a tight schedule. After a team meeting on June 25, 2025, he realized the original deadline of December 15 might not be feasible due to backend delays. Concerned about insufficient testing time, he welcomed Jerry’s suggestion of proposing an extension. Tom plans to raise the idea of shifting the deadline to January 5, 2026 in the next morning’s meeting. His actions reflect both stress about timelines and a proactive, team-oriented problem-solving approach."
}
-Another Example in Chinese (注意: 你的输出必须和输入的user语言一致):
+Another Example in Chinese (注意: 当user的语言为中文时,你就需要也输出中文):
{
"memory list": [
{
@@ -85,6 +85,8 @@
"summary": "Tom 目前专注于管理一个进度紧张的新项目..."
}
+Always respond in the same language as the conversation.
+
Conversation:
${conversation}
diff --git a/tests/llms/test_deepseek.py b/tests/llms/test_deepseek.py
new file mode 100644
index 000000000..75c1ead5f
--- /dev/null
+++ b/tests/llms/test_deepseek.py
@@ -0,0 +1,88 @@
+import unittest
+
+from types import SimpleNamespace
+from unittest.mock import MagicMock
+
+from memos.configs.llm import DeepSeekLLMConfig
+from memos.llms.deepseek import DeepSeekLLM
+
+
+class TestDeepSeekLLM(unittest.TestCase):
+ def test_deepseek_llm_generate_with_and_without_think_prefix(self):
+ """Test DeepSeekLLM generate method with and without tag removal."""
+
+ # Simulated full content including the <think> tag
+ full_content = "<think>Thinking in progress...</think>Hello from DeepSeek!"
+
+ # Mock response object
+ mock_response = MagicMock()
+ mock_response.model_dump_json.return_value = '{"mock": "true"}'
+ mock_response.choices[0].message.content = full_content
+
+ # Config with think prefix preserved
+ config_with_think = DeepSeekLLMConfig.model_validate(
+ {
+ "model_name_or_path": "deepseek-chat",
+ "temperature": 0.7,
+ "max_tokens": 512,
+ "top_p": 0.9,
+ "api_key": "sk-test",
+ "api_base": "https://api.deepseek.com/v1",
+ "remove_think_prefix": False,
+ }
+ )
+ llm_with_think = DeepSeekLLM(config_with_think)
+ llm_with_think.client.chat.completions.create = MagicMock(return_value=mock_response)
+
+ output_with_think = llm_with_think.generate([{"role": "user", "content": "Hello"}])
+ self.assertEqual(output_with_think, full_content)
+
+ # Config with think tag removed
+ config_without_think = config_with_think.model_copy(update={"remove_think_prefix": True})
+ llm_without_think = DeepSeekLLM(config_without_think)
+ llm_without_think.client.chat.completions.create = MagicMock(return_value=mock_response)
+
+ output_without_think = llm_without_think.generate([{"role": "user", "content": "Hello"}])
+ self.assertEqual(output_without_think, "Hello from DeepSeek!")
+
+ def test_deepseek_llm_generate_stream(self):
+ """Test DeepSeekLLM generate_stream with reasoning_content and content chunks."""
+
+ def make_chunk(delta_dict):
+ # Create a simulated stream chunk with delta fields
+ delta = SimpleNamespace(**delta_dict)
+ choice = SimpleNamespace(delta=delta)
+ return SimpleNamespace(choices=[choice])
+
+ # Simulate chunks: reasoning + answer
+ mock_stream_chunks = [
+ make_chunk({"reasoning_content": "Analyzing..."}),
+ make_chunk({"content": "Hello"}),
+ make_chunk({"content": ", "}),
+ make_chunk({"content": "DeepSeek!"}),
+ ]
+
+ mock_chat_completions_create = MagicMock(return_value=iter(mock_stream_chunks))
+
+ config = DeepSeekLLMConfig.model_validate(
+ {
+ "model_name_or_path": "deepseek-chat",
+ "temperature": 0.7,
+ "max_tokens": 512,
+ "top_p": 0.9,
+ "api_key": "sk-test",
+ "api_base": "https://api.deepseek.com/v1",
+ "remove_think_prefix": False,
+ }
+ )
+ llm = DeepSeekLLM(config)
+ llm.client.chat.completions.create = mock_chat_completions_create
+
+ messages = [{"role": "user", "content": "Say hello"}]
+ streamed = list(llm.generate_stream(messages))
+ full_output = "".join(streamed)
+
+ self.assertIn("Analyzing...", full_output)
+ self.assertIn("Hello, DeepSeek!", full_output)
+ self.assertTrue(full_output.startswith("Analyzing..."))
+ self.assertTrue(full_output.endswith("DeepSeek!"))
diff --git a/tests/llms/test_openai.py b/tests/llms/test_openai.py
index 90bc7ab2d..dff57c058 100644
--- a/tests/llms/test_openai.py
+++ b/tests/llms/test_openai.py
@@ -1,5 +1,6 @@
import unittest
+from types import SimpleNamespace
from unittest.mock import MagicMock
from memos.configs.llm import LLMConfigFactory
@@ -40,3 +41,62 @@ def test_llm_factory_with_mocked_openai_backend(self):
response,
"Hello! I'm an AI language model created by OpenAI. I'm here to help answer questions, provide information, and assist with a wide range of topics. How can I assist you today?",
)
+
+ def test_llm_factory_with_stream_openai_backend(self):
+ """Test LLMFactory stream generation with mocked OpenAI backend."""
+
+ def make_chunk(delta_dict):
+ # Create a mock response chunk with a simulated delta dictionary
+ delta = SimpleNamespace(**delta_dict)
+ choice = SimpleNamespace(delta=delta, finish_reason="stop", index=0)
+ return SimpleNamespace(choices=[choice])
+
+ # Simulate a stream response with both reasoning_content and content
+ mock_stream_chunks = [
+ make_chunk({"reasoning_content": "I am thinking"}),
+ make_chunk({"content": "Hello"}),
+ make_chunk({"content": ", "}),
+ make_chunk({"content": "world!"}),
+ ]
+
+ # Mock the streaming chat completion call
+ mock_chat_completions_create = MagicMock(return_value=iter(mock_stream_chunks))
+
+ # Create the LLM config with think prefix enabled
+ config = LLMConfigFactory.model_validate(
+ {
+ "backend": "openai",
+ "config": {
+ "model_name_or_path": "gpt-4.1-nano",
+ "temperature": 0.8,
+ "max_tokens": 1024,
+ "top_p": 0.9,
+ "top_k": 50,
+ "api_key": "sk-xxxx",
+ "api_base": "https://api.openai.com/v1",
+ "remove_think_prefix": False,
+ # Ensure <think> tag is emitted
+ },
+ }
+ )
+
+ # Instantiate the LLM and inject the mocked stream method
+ llm = LLMFactory.from_config(config)
+ llm.client.chat.completions.create = mock_chat_completions_create
+
+ # Input message to the model
+ messages = [{"role": "user", "content": "Think and say hello"}]
+
+ # Collect streamed output as a list of chunks
+ response_parts = list(llm.generate_stream(messages))
+ response = "".join(response_parts)
+
+ # Assert the presence of the <think> tag and expected content
+ self.assertIn("<think>", response)
+ self.assertIn("I am thinking", response)
+ self.assertIn("Hello, world!", response)
+
+ # Optional: check structure of stream response
+ self.assertEqual(response_parts[0], "<think>")
+ self.assertTrue(response.startswith("<think>I am thinking"))
+ self.assertTrue(response.endswith("Hello, world!"))
diff --git a/tests/llms/test_qwen.py b/tests/llms/test_qwen.py
new file mode 100644
index 000000000..90f31e47f
--- /dev/null
+++ b/tests/llms/test_qwen.py
@@ -0,0 +1,101 @@
+import unittest
+
+from types import SimpleNamespace
+from unittest.mock import MagicMock
+
+from memos.configs.llm import QwenLLMConfig
+from memos.llms.qwen import QwenLLM
+
+
+class TestQwenLLM(unittest.TestCase):
+ def test_qwen_llm_generate_with_and_without_think_prefix(self):
+ """Test QwenLLM non-streaming response generation with and without prefix removal."""
+
+ # Simulated full response content with <think> tag
+ full_content = "<think>Analyzing your request...</think>Hello, world!"
+
+ # Prepare the mock response object with expected structure
+ mock_response = MagicMock()
+ mock_response.model_dump_json.return_value = '{"mocked": "true"}'
+ mock_response.choices[0].message.content = full_content
+
+ # Create config with remove_think_prefix = False
+ config_with_think = QwenLLMConfig.model_validate(
+ {
+ "model_name_or_path": "qwen-test",
+ "temperature": 0.7,
+ "max_tokens": 100,
+ "top_p": 0.9,
+ "api_key": "sk-test",
+ "api_base": "https://dashscope.aliyuncs.com/api/v1",
+ "remove_think_prefix": False,
+ }
+ )
+
+ # Instance with think tag enabled
+ llm_with_think = QwenLLM(config_with_think)
+ llm_with_think.client.chat.completions.create = MagicMock(return_value=mock_response)
+
+ response_with_think = llm_with_think.generate([{"role": "user", "content": "Hi"}])
+ self.assertEqual(response_with_think, full_content)
+
+ # Create config with remove_think_prefix = True
+ config_without_think = config_with_think.model_copy(update={"remove_think_prefix": True})
+
+ # Instance with think tag removed
+ llm_without_think = QwenLLM(config_without_think)
+ llm_without_think.client.chat.completions.create = MagicMock(return_value=mock_response)
+
+ response_without_think = llm_without_think.generate([{"role": "user", "content": "Hi"}])
+ self.assertEqual(response_without_think, "Hello, world!")
+ self.assertNotIn("<think>", response_without_think)
+
+ def test_qwen_llm_generate_stream(self):
+ """Test QwenLLM stream generation with both reasoning_content and content."""
+
+ def make_chunk(delta_dict):
+ # Construct a mock chunk with delta fields
+ delta = SimpleNamespace(**delta_dict)
+ choice = SimpleNamespace(delta=delta)
+ return SimpleNamespace(choices=[choice])
+
+ # Simulate a sequence of streamed chunks
+ mock_stream_chunks = [
+ make_chunk({"reasoning_content": "Analyzing input..."}),
+ make_chunk({"content": "Hello"}),
+ make_chunk({"content": ", "}),
+ make_chunk({"content": "world!"}),
+ ]
+
+ # Mock the client's streaming response
+ mock_chat_completions_create = MagicMock(return_value=iter(mock_stream_chunks))
+
+ # Build QwenLLM config with think prefix enabled
+ config = QwenLLMConfig.model_validate(
+ {
+ "model_name_or_path": "qwen-test",
+ "temperature": 0.7,
+ "max_tokens": 100,
+ "top_p": 0.9,
+ "api_key": "sk-test",
+ "api_base": "https://dashscope.aliyuncs.com/api/v1",
+ "remove_think_prefix": False,
+ }
+ )
+
+ # Create QwenLLM instance and inject mock client
+ llm = QwenLLM(config)
+ llm.client.chat.completions.create = mock_chat_completions_create
+
+ messages = [{"role": "user", "content": "Say hello"}]
+
+ # Collect the streamed output
+ response_parts = list(llm.generate_stream(messages))
+ response = "".join(response_parts)
+
+ # Assertions for structure and content
+ self.assertIn("<think>", response)
+ self.assertIn("Analyzing input...", response)
+ self.assertIn("Hello, world!", response)
+ self.assertTrue(response.startswith("<think>Analyzing input..."))
+ self.assertTrue(response.endswith("Hello, world!"))