From b0a511b16b132086d0d77cd80835fe3e6f798ab3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Thu, 17 Jul 2025 16:18:26 +0800 Subject: [PATCH 01/10] fix: language bug --- src/memos/templates/mem_reader_prompts.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/memos/templates/mem_reader_prompts.py b/src/memos/templates/mem_reader_prompts.py index d1b9abb8e..89298e8de 100644 --- a/src/memos/templates/mem_reader_prompts.py +++ b/src/memos/templates/mem_reader_prompts.py @@ -71,7 +71,7 @@ "summary": "Tom is currently focused on managing a new project with a tight schedule. After a team meeting on June 25, 2025, he realized the original deadline of December 15 might not be feasible due to backend delays. Concerned about insufficient testing time, he welcomed Jerry’s suggestion of proposing an extension. Tom plans to raise the idea of shifting the deadline to January 5, 2026 in the next morning’s meeting. His actions reflect both stress about timelines and a proactive, team-oriented problem-solving approach." } -Another Example in Chinese (注意: 你的输出必须和输入的user语言一致): +Another Example in Chinese (注意: 当user的语言为中文时,你就需要也输出中文): { "memory list": [ { @@ -85,6 +85,8 @@ "summary": "Tom 目前专注于管理一个进度紧张的新项目..." } +Always respond in the same language as the conversation. + Conversation: ${conversation} From d126f5dbe169e8c39107933597a878744bf86b0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Thu, 17 Jul 2025 16:45:28 +0800 Subject: [PATCH 02/10] feat: add qwen api --- examples/basic_modules/llm.py | 25 +++++++++++++++++++++++++ src/memos/configs/llm.py | 11 +++++++++++ src/memos/llms/factory.py | 2 ++ src/memos/llms/qwen.py | 35 +++++++++++++++++++++++++++++++++++ 4 files changed, 73 insertions(+) create mode 100644 src/memos/llms/qwen.py diff --git a/examples/basic_modules/llm.py b/examples/basic_modules/llm.py index d5fb16082..36359d7ad 100644 --- a/examples/basic_modules/llm.py +++ b/examples/basic_modules/llm.py @@ -91,3 +91,28 @@ response = llm.generate(messages) print("Scenario 4:", response) print("==" * 20) + + +# Scenario 5: Using LLMFactory with Qwen (DashScope Compatible API) + +config = LLMConfigFactory.model_validate( + { + "backend": "qwen", + "config": { + "model_name_or_path": "qwen-plus", # or qwen-max-2025-01-25 + "temperature": 0.7, + "max_tokens": 1024, + "top_p": 0.9, + "top_k": 50, + "api_key": "sk-xxx", + "api_base": "https://dashscope.aliyuncs.com/compatible-mode/v1", + }, + } +) +llm = LLMFactory.from_config(config) +messages = [ + {"role": "user", "content": "Can you speak Chinese?"}, +] +response = llm.generate(messages) +print("Scenario 5:", response) +print("==" * 20) diff --git a/src/memos/configs/llm.py b/src/memos/configs/llm.py index 6ff99e569..8b8ed7b76 100644 --- a/src/memos/configs/llm.py +++ b/src/memos/configs/llm.py @@ -27,6 +27,16 @@ class OpenAILLMConfig(BaseLLMConfig): extra_body: Any = Field(default=None, description="extra body") +class QwenLLMConfig(BaseLLMConfig): + api_key: str = Field(..., description="API key for DashScope (Qwen)") + api_base: str = Field( + default="https://dashscope-intl.aliyuncs.com/compatible-mode/v1", + description="Base URL for Qwen OpenAI-compatible API", + ) + extra_body: Any = Field(default=None, description="extra body") + model_name_or_path: str = Field(..., description="Model name for Qwen, e.g., 'qwen-plus'") + + class AzureLLMConfig(BaseLLMConfig): base_url: str = Field( default="https://api.openai.azure.com/", @@ -78,6 +88,7 @@ class 
LLMConfigFactory(BaseConfig): "huggingface": HFLLMConfig, "vllm": VLLMLLMConfig, "huggingface_singleton": HFLLMConfig, # Add singleton support + "qwen": QwenLLMConfig, } @field_validator("backend") diff --git a/src/memos/llms/factory.py b/src/memos/llms/factory.py index c0f6d0a82..e449bbe85 100644 --- a/src/memos/llms/factory.py +++ b/src/memos/llms/factory.py @@ -6,6 +6,7 @@ from memos.llms.hf_singleton import HFSingletonLLM from memos.llms.ollama import OllamaLLM from memos.llms.openai import AzureLLM, OpenAILLM +from memos.llms.qwen import QwenLLM from memos.llms.vllm import VLLMLLM @@ -19,6 +20,7 @@ class LLMFactory(BaseLLM): "huggingface": HFLLM, "huggingface_singleton": HFSingletonLLM, # Add singleton version "vllm": VLLMLLM, + "qwen": QwenLLM, } @classmethod diff --git a/src/memos/llms/qwen.py b/src/memos/llms/qwen.py new file mode 100644 index 000000000..b51ab37fd --- /dev/null +++ b/src/memos/llms/qwen.py @@ -0,0 +1,35 @@ +import openai + +from memos.configs.llm import QwenLLMConfig +from memos.llms.base import BaseLLM +from memos.llms.utils import remove_thinking_tags +from memos.log import get_logger +from memos.types import MessageList + + +logger = get_logger(__name__) + + +class QwenLLM(BaseLLM): + """Qwen (DashScope) LLM class via OpenAI-compatible API.""" + + def __init__(self, config: QwenLLMConfig): + self.config = config + self.client = openai.Client(api_key=config.api_key, base_url=config.api_base) + + def generate(self, messages: MessageList) -> str: + """Generate a response from Qwen LLM.""" + response = self.client.chat.completions.create( + model=self.config.model_name_or_path, + messages=messages, + extra_body=self.config.extra_body, + temperature=self.config.temperature, + max_tokens=self.config.max_tokens, + top_p=self.config.top_p, + ) + logger.info(f"Response from Qwen: {response.model_dump_json()}") + response_content = response.choices[0].message.content + if self.config.remove_think_prefix: + return remove_thinking_tags(response_content) + else: + return response_content From faa4c520e24f0744751dc11ae4bfc93c0e0cb818 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Thu, 17 Jul 2025 17:48:50 +0800 Subject: [PATCH 03/10] feat: modify qwen llm --- src/memos/llms/qwen.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/memos/llms/qwen.py b/src/memos/llms/qwen.py index b51ab37fd..0d32cdea3 100644 --- a/src/memos/llms/qwen.py +++ b/src/memos/llms/qwen.py @@ -1,7 +1,5 @@ -import openai - from memos.configs.llm import QwenLLMConfig -from memos.llms.base import BaseLLM +from memos.llms.openai import OpenAILLM from memos.llms.utils import remove_thinking_tags from memos.log import get_logger from memos.types import MessageList @@ -10,12 +8,11 @@ logger = get_logger(__name__) -class QwenLLM(BaseLLM): +class QwenLLM(OpenAILLM): """Qwen (DashScope) LLM class via OpenAI-compatible API.""" def __init__(self, config: QwenLLMConfig): - self.config = config - self.client = openai.Client(api_key=config.api_key, base_url=config.api_base) + super().__init__(config) def generate(self, messages: MessageList) -> str: """Generate a response from Qwen LLM.""" From 3c4fcd4f0133f5022dc4f2df2f5193ccbaa39e00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Thu, 17 Jul 2025 18:55:33 +0800 Subject: [PATCH 04/10] feat: add deepseek llm --- examples/basic_modules/llm.py | 63 ++++++++++++++++++++++++++++++++++- src/memos/configs/llm.py | 13 ++++++++ src/memos/llms/base.py | 9 +++++ src/memos/llms/deepseek.py 
| 56 +++++++++++++++++++++++++++++++ src/memos/llms/factory.py | 2 ++ src/memos/llms/ollama.py | 4 +++ src/memos/llms/openai.py | 5 +++ src/memos/llms/qwen.py | 31 +++++++++++++++++ 8 files changed, 182 insertions(+), 1 deletion(-) create mode 100644 src/memos/llms/deepseek.py diff --git a/examples/basic_modules/llm.py b/examples/basic_modules/llm.py index 36359d7ad..c37c86f9a 100644 --- a/examples/basic_modules/llm.py +++ b/examples/basic_modules/llm.py @@ -94,6 +94,14 @@ # Scenario 5: Using LLMFactory with Qwen (DashScope Compatible API) +# Note: +# This example works for any model that supports the OpenAI-compatible Chat Completion API, +# including but not limited to: +# - Qwen models: qwen-plus, qwen-max-2025-01-25 +# - DeepSeek models: deepseek-chat, deepseek-coder, deepseek-v3 +# - Other compatible providers: MiniMax, Fireworks, Groq, OpenRouter, etc. +# +# Just set the correct `api_key`, `api_base`, and `model_name_or_path`. config = LLMConfigFactory.model_validate( { @@ -111,8 +119,61 @@ ) llm = LLMFactory.from_config(config) messages = [ - {"role": "user", "content": "Can you speak Chinese?"}, + {"role": "user", "content": "Hello, who are you"}, ] response = llm.generate(messages) print("Scenario 5:", response) print("==" * 20) + +print("Scenario 5:\n") +for chunk in llm.generate_stream(messages): + print(chunk, end="") +print("==" * 20) + +# Scenario 6: Using LLMFactory with Deepseek-chat + +cfg = LLMConfigFactory.model_validate( + { + "backend": "deepseek", + "config": { + "model_name_or_path": "deepseek-chat", + "api_key": "sk-xxx", + "api_base": "https://api.deepseek.com", + "temperature": 0.6, + "max_tokens": 512, + "remove_think_prefix": False, + }, + } +) +llm = LLMFactory.from_config(cfg) +messages = [{"role": "user", "content": "Hello, who are you"}] +resp = llm.generate(messages) +print("Scenario 6:", resp) + + +# Scenario 7: Using LLMFactory with Deepseek-chat + reasoning + CoT + streaming + +cfg2 = LLMConfigFactory.model_validate( + { + "backend": "deepseek", + "config": { + "model_name_or_path": "deepseek-reasoner", + "api_key": "sk-xxx", + "api_base": "https://api.deepseek.com", + "temperature": 0.2, + "max_tokens": 1024, + "remove_think_prefix": False, + }, + } +) +llm = LLMFactory.from_config(cfg2) +messages = [ + { + "role": "user", + "content": "Explain how to solve this problem step-by-step. Be explicit in your thinking process. Question: If a train travels from city A to city B at 60 mph and returns at 40 mph, what is its average speed for the entire trip? 
Let's think step by step.", + }, +] +print("Scenario 7:\n") +for chunk in llm.generate_stream(messages): + print(chunk, end="") +print("==" * 20) diff --git a/src/memos/configs/llm.py b/src/memos/configs/llm.py index 8b8ed7b76..d69a0a0fc 100644 --- a/src/memos/configs/llm.py +++ b/src/memos/configs/llm.py @@ -37,6 +37,18 @@ class QwenLLMConfig(BaseLLMConfig): model_name_or_path: str = Field(..., description="Model name for Qwen, e.g., 'qwen-plus'") +class DeepSeekLLMConfig(BaseLLMConfig): + api_key: str = Field(..., description="API key for DeepSeek") + api_base: str = Field( + default="https://api.deepseek.com", + description="Base URL for DeepSeek OpenAI-compatible API", + ) + extra_body: Any = Field(default=None, description="Extra options for API") + model_name_or_path: str = Field( + ..., description="Model name: 'deepseek-chat' or 'deepseek-reasoner'" + ) + + class AzureLLMConfig(BaseLLMConfig): base_url: str = Field( default="https://api.openai.azure.com/", @@ -89,6 +101,7 @@ class LLMConfigFactory(BaseConfig): "vllm": VLLMLLMConfig, "huggingface_singleton": HFLLMConfig, # Add singleton support "qwen": QwenLLMConfig, + "deepseek": DeepSeekLLMConfig, } @field_validator("backend") diff --git a/src/memos/llms/base.py b/src/memos/llms/base.py index 312d19f1b..8c3681e16 100644 --- a/src/memos/llms/base.py +++ b/src/memos/llms/base.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +from collections.abc import Generator from memos.configs.llm import BaseLLMConfig from memos.types import MessageList @@ -14,3 +15,11 @@ def __init__(self, config: BaseLLMConfig): @abstractmethod def generate(self, messages: MessageList, **kwargs) -> str: """Generate a response from the LLM.""" + + @abstractmethod + def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]: + """ + (Optional) Generate a streaming response from the LLM. + Subclasses should override this if they support streaming. + By default, this raises NotImplementedError. 
+ """ diff --git a/src/memos/llms/deepseek.py b/src/memos/llms/deepseek.py new file mode 100644 index 000000000..fb7565866 --- /dev/null +++ b/src/memos/llms/deepseek.py @@ -0,0 +1,56 @@ +from memos.configs.llm import DeepSeekLLMConfig +from memos.llms.openai import OpenAILLM +from memos.llms.utils import remove_thinking_tags +from memos.log import get_logger +from memos.types import MessageList + + +logger = get_logger(__name__) + + +class DeepSeekLLM(OpenAILLM): + """DeepSeek LLM via OpenAI-compatible API.""" + + def __init__(self, config: DeepSeekLLMConfig): + super().__init__(config) + + def generate(self, messages: MessageList) -> str: + """Generate a response from DeepSeek.""" + response = self.client.chat.completions.create( + model=self.config.model_name_or_path, + messages=messages, + temperature=self.config.temperature, + max_tokens=self.config.max_tokens, + top_p=self.config.top_p, + extra_body=self.config.extra_body, + ) + logger.info(f"Response from DeepSeek: {response.model_dump_json()}") + response_content = response.choices[0].message.content + if self.config.remove_think_prefix: + return remove_thinking_tags(response_content) + else: + return response_content + + def generate_stream(self, messages: MessageList, **kwargs): + """Stream response from DeepSeek.""" + response = self.client.chat.completions.create( + model=self.config.model_name_or_path, + messages=messages, + stream=True, + temperature=self.config.temperature, + max_tokens=self.config.max_tokens, + top_p=self.config.top_p, + extra_body=self.config.extra_body, + ) + # Streaming chunks of text + reasoning_parts = "" + answer_parts = "" + for chunk in response: + delta = chunk.choices[0].delta + if hasattr(delta, "reasoning_content") and delta.reasoning_content: + reasoning_parts += delta.reasoning_content + yield delta.reasoning_content + + if hasattr(delta, "content") and delta.content: + answer_parts += delta.content + yield delta.content diff --git a/src/memos/llms/factory.py b/src/memos/llms/factory.py index e449bbe85..0c12a667a 100644 --- a/src/memos/llms/factory.py +++ b/src/memos/llms/factory.py @@ -2,6 +2,7 @@ from memos.configs.llm import LLMConfigFactory from memos.llms.base import BaseLLM +from memos.llms.deepseek import DeepSeekLLM from memos.llms.hf import HFLLM from memos.llms.hf_singleton import HFSingletonLLM from memos.llms.ollama import OllamaLLM @@ -21,6 +22,7 @@ class LLMFactory(BaseLLM): "huggingface_singleton": HFSingletonLLM, # Add singleton version "vllm": VLLMLLM, "qwen": QwenLLM, + "deepseek": DeepSeekLLM, } @classmethod diff --git a/src/memos/llms/ollama.py b/src/memos/llms/ollama.py index 00e230789..050b7a253 100644 --- a/src/memos/llms/ollama.py +++ b/src/memos/llms/ollama.py @@ -1,3 +1,4 @@ +from collections.abc import Generator from typing import Any from ollama import Client @@ -80,3 +81,6 @@ def generate(self, messages: MessageList) -> Any: return remove_thinking_tags(str_response) else: return str_response + + def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]: + raise NotImplementedError diff --git a/src/memos/llms/openai.py b/src/memos/llms/openai.py index ef8a005b2..21c143d96 100644 --- a/src/memos/llms/openai.py +++ b/src/memos/llms/openai.py @@ -1,3 +1,5 @@ +from collections.abc import Generator + import openai from memos.configs.llm import AzureLLMConfig, OpenAILLMConfig @@ -61,3 +63,6 @@ def generate(self, messages: MessageList) -> str: return remove_thinking_tags(response_content) else: return response_content + + def 
generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
+        raise NotImplementedError
diff --git a/src/memos/llms/qwen.py b/src/memos/llms/qwen.py
index 0d32cdea3..a47fcdf36 100644
--- a/src/memos/llms/qwen.py
+++ b/src/memos/llms/qwen.py
@@ -1,3 +1,5 @@
+from collections.abc import Generator
+
 from memos.configs.llm import QwenLLMConfig
 from memos.llms.openai import OpenAILLM
 from memos.llms.utils import remove_thinking_tags
@@ -30,3 +32,32 @@ def generate(self, messages: MessageList) -> str:
             return remove_thinking_tags(response_content)
         else:
             return response_content
+
+    def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
+        """Stream response from Qwen LLM."""
+        response = self.client.chat.completions.create(
+            model=self.config.model_name_or_path,
+            messages=messages,
+            stream=True,
+            temperature=self.config.temperature,
+            max_tokens=self.config.max_tokens,
+            top_p=self.config.top_p,
+            extra_body=self.config.extra_body,
+        )
+
+        reasoning_started = False
+        for chunk in response:
+            delta = chunk.choices[0].delta
+
+            # Some models may have separate `reasoning_content` vs `content`
+            # For Qwen (DashScope), likely only `content` is used
+            if hasattr(delta, "reasoning_content") and delta.reasoning_content:
+                if not reasoning_started and not self.config.remove_think_prefix:
+                    yield "<think>"
+                    reasoning_started = True
+                yield delta.reasoning_content
+            elif hasattr(delta, "content") and delta.content:
+                if reasoning_started and not self.config.remove_think_prefix:
+                    yield "</think>"
+                    reasoning_started = False
+                yield delta.content

From 18f3bfce05e90dfb8071c99402bd031ed486b41a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?=
Date: Thu, 17 Jul 2025 19:02:47 +0800
Subject: [PATCH 05/10] feat: add stream output for openai

---
 examples/basic_modules/llm.py |  5 +++++
 src/memos/llms/openai.py      | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/examples/basic_modules/llm.py b/examples/basic_modules/llm.py
index c37c86f9a..d33fc9544 100644
--- a/examples/basic_modules/llm.py
+++ b/examples/basic_modules/llm.py
@@ -69,6 +69,11 @@
 print("Scenario 3:", response)
 print("==" * 20)
 
+print("Scenario 3:\n")
+for chunk in llm.generate_stream(messages):
+    print(chunk, end="")
+print("==" * 20)
+
 
 # Scenario 4: Using LLMFactory with Huggingface Models
 
diff --git a/src/memos/llms/openai.py b/src/memos/llms/openai.py
index 21c143d96..148f6a2ca 100644
--- a/src/memos/llms/openai.py
+++ b/src/memos/llms/openai.py
@@ -36,6 +36,39 @@ def generate(self, messages: MessageList) -> str:
         else:
             return response_content
 
+    def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
+        """Stream response from OpenAI LLM with optional reasoning support."""
+        response = self.client.chat.completions.create(
+            model=self.config.model_name_or_path,
+            messages=messages,
+            stream=True,
+            temperature=self.config.temperature,
+            max_tokens=self.config.max_tokens,
+            top_p=self.config.top_p,
+            extra_body=self.config.extra_body,
+        )
+
+        reasoning_started = False
+
+        for chunk in response:
+            delta = chunk.choices[0].delta
+
+            # Support for custom 'reasoning_content' (if present in OpenAI-compatible models like Qwen)
+            if hasattr(delta, "reasoning_content") and delta.reasoning_content:
+                if not reasoning_started and not self.config.remove_think_prefix:
+                    yield "<think>"
+                    reasoning_started = True
+                yield delta.reasoning_content
+            elif hasattr(delta, "content") and delta.content:
+                if reasoning_started and not self.config.remove_think_prefix:
+                    yield "</think>"
+                    reasoning_started = False
+                yield delta.content
+
+        # Ensure we close the <think> block if not already done
+        if reasoning_started and not self.config.remove_think_prefix:
+            yield "</think>"
+
 
 class AzureLLM(BaseLLM):
     """Azure OpenAI LLM class."""

From e268b8233b28cdc0f393d10f177c7c90ba33a8e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?=
Date: Thu, 17 Jul 2025 19:28:05 +0800
Subject: [PATCH 06/10] test: add unit test for llms

---
 tests/llms/test_deepseek.py |  88 +++++++++++++++++++++++++++
 tests/llms/test_openai.py   |  60 +++++++++++++++++++++
 tests/llms/test_qwen.py     | 101 ++++++++++++++++++++++++++++++++++++
 3 files changed, 249 insertions(+)
 create mode 100644 tests/llms/test_deepseek.py
 create mode 100644 tests/llms/test_qwen.py

diff --git a/tests/llms/test_deepseek.py b/tests/llms/test_deepseek.py
new file mode 100644
index 000000000..3eb9fae37
--- /dev/null
+++ b/tests/llms/test_deepseek.py
@@ -0,0 +1,88 @@
+import unittest
+
+from types import SimpleNamespace
+from unittest.mock import MagicMock
+
+from memos.configs.llm import DeepSeekLLMConfig
+from memos.llms.deepseek import DeepSeekLLM
+
+
+class TestDeepSeekLLM(unittest.TestCase):
+    def test_deepseek_llm_generate_with_and_without_think_prefix(self):
+        """Test DeepSeekLLM generate method with and without <think> tag removal."""
+
+        # Simulated full content including <think> tag
+        full_content = "<think>Thinking in progress...</think>Hello from DeepSeek!"
+
+        # Mock response object
+        mock_response = MagicMock()
+        mock_response.model_dump_json.return_value = '{"mock": "true"}'
+        mock_response.choices[0].message.content = full_content
+
+        # Config with think prefix preserved
+        config_with_think = DeepSeekLLMConfig.model_validate(
+            {
+                "model_name_or_path": "deepseek-chat",
+                "temperature": 0.7,
+                "max_tokens": 512,
+                "top_p": 0.9,
+                "api_key": "sk-test",
+                "api_base": "https://api.deepseek.com/v1",
+                "remove_think_prefix": False,
+            }
+        )
+        llm_with_think = DeepSeekLLM(config_with_think)
+        llm_with_think.client.chat.completions.create = MagicMock(return_value=mock_response)
+
+        output_with_think = llm_with_think.generate([{"role": "user", "content": "Hello"}])
+        self.assertEqual(output_with_think, full_content)
+
+        # Config with think tag removed
+        config_without_think = config_with_think.copy(update={"remove_think_prefix": True})
+        llm_without_think = DeepSeekLLM(config_without_think)
+        llm_without_think.client.chat.completions.create = MagicMock(return_value=mock_response)
+
+        output_without_think = llm_without_think.generate([{"role": "user", "content": "Hello"}])
+        self.assertEqual(output_without_think, "Hello from DeepSeek!")
+
+    def test_deepseek_llm_generate_stream(self):
+        """Test DeepSeekLLM generate_stream with reasoning_content and content chunks."""
+
+        def make_chunk(delta_dict):
+            # Create a simulated stream chunk with delta fields
+            delta = SimpleNamespace(**delta_dict)
+            choice = SimpleNamespace(delta=delta)
+            return SimpleNamespace(choices=[choice])
+
+        # Simulate chunks: reasoning + answer
+        mock_stream_chunks = [
+            make_chunk({"reasoning_content": "Analyzing..."}),
+            make_chunk({"content": "Hello"}),
+            make_chunk({"content": ", "}),
+            make_chunk({"content": "DeepSeek!"}),
+        ]
+
+        mock_chat_completions_create = MagicMock(return_value=iter(mock_stream_chunks))
+
+        config = DeepSeekLLMConfig.model_validate(
+            {
+                "model_name_or_path": "deepseek-chat",
+                "temperature": 0.7,
+                "max_tokens": 512,
+                "top_p": 0.9,
+                "api_key": "sk-test",
+                "api_base": "https://api.deepseek.com/v1",
+                "remove_think_prefix": False,
+            }
+        )
+        llm = DeepSeekLLM(config)
+        llm.client.chat.completions.create = mock_chat_completions_create
+
+        messages = [{"role": "user", "content": "Say hello"}]
+        streamed = list(llm.generate_stream(messages))
+        full_output = "".join(streamed)
+
+        self.assertIn("Analyzing...", full_output)
+        self.assertIn("Hello, DeepSeek!", full_output)
+        self.assertTrue(full_output.startswith("Analyzing..."))
+        self.assertTrue(full_output.endswith("DeepSeek!"))
diff --git a/tests/llms/test_openai.py b/tests/llms/test_openai.py
index 90bc7ab2d..dff57c058 100644
--- a/tests/llms/test_openai.py
+++ b/tests/llms/test_openai.py
@@ -1,5 +1,6 @@
 import unittest
 
+from types import SimpleNamespace
 from unittest.mock import MagicMock
 
 from memos.configs.llm import LLMConfigFactory
@@ -40,3 +41,62 @@ def test_llm_factory_with_mocked_openai_backend(self):
             response,
             "Hello! I'm an AI language model created by OpenAI. I'm here to help answer questions, provide information, and assist with a wide range of topics. How can I assist you today?",
         )
+
+    def test_llm_factory_with_stream_openai_backend(self):
+        """Test LLMFactory stream generation with mocked OpenAI backend."""
+
+        def make_chunk(delta_dict):
+            # Create a mock response chunk with a simulated delta dictionary
+            delta = SimpleNamespace(**delta_dict)
+            choice = SimpleNamespace(delta=delta, finish_reason="stop", index=0)
+            return SimpleNamespace(choices=[choice])
+
+        # Simulate a stream response with both reasoning_content and content
+        mock_stream_chunks = [
+            make_chunk({"reasoning_content": "I am thinking"}),
+            make_chunk({"content": "Hello"}),
+            make_chunk({"content": ", "}),
+            make_chunk({"content": "world!"}),
+        ]
+
+        # Mock the streaming chat completion call
+        mock_chat_completions_create = MagicMock(return_value=iter(mock_stream_chunks))
+
+        # Create the LLM config with think prefix enabled
+        config = LLMConfigFactory.model_validate(
+            {
+                "backend": "openai",
+                "config": {
+                    "model_name_or_path": "gpt-4.1-nano",
+                    "temperature": 0.8,
+                    "max_tokens": 1024,
+                    "top_p": 0.9,
+                    "top_k": 50,
+                    "api_key": "sk-xxxx",
+                    "api_base": "https://api.openai.com/v1",
+                    "remove_think_prefix": False,
+                    # Ensure <think> tag is emitted
+                },
+            }
+        )
+
+        # Instantiate the LLM and inject the mocked stream method
+        llm = LLMFactory.from_config(config)
+        llm.client.chat.completions.create = mock_chat_completions_create
+
+        # Input message to the model
+        messages = [{"role": "user", "content": "Think and say hello"}]
+
+        # Collect streamed output as a list of chunks
+        response_parts = list(llm.generate_stream(messages))
+        response = "".join(response_parts)
+
+        # Assert the presence of the <think> tag and expected content
+        self.assertIn("<think>", response)
+        self.assertIn("I am thinking", response)
+        self.assertIn("Hello, world!", response)
+
+        # Optional: check structure of stream response
+        self.assertEqual(response_parts[0], "<think>")
+        self.assertTrue(response.startswith("<think>I am thinking"))
+        self.assertTrue(response.endswith("Hello, world!"))
diff --git a/tests/llms/test_qwen.py b/tests/llms/test_qwen.py
new file mode 100644
index 000000000..035b0dccf
--- /dev/null
+++ b/tests/llms/test_qwen.py
@@ -0,0 +1,101 @@
+import unittest
+
+from types import SimpleNamespace
+from unittest.mock import MagicMock
+
+from memos.configs.llm import QwenLLMConfig
+from memos.llms.qwen import QwenLLM
+
+
+class TestQwenLLM(unittest.TestCase):
+    def test_qwen_llm_generate_with_and_without_think_prefix(self):
+        """Test QwenLLM non-streaming response generation with and without <think> prefix removal."""
+
+        # Simulated full response content with <think> tag
+        full_content = "<think>Analyzing your request...</think>Hello, world!"
+
+        # Prepare the mock response object with expected structure
+        mock_response = MagicMock()
+        mock_response.model_dump_json.return_value = '{"mocked": "true"}'
+        mock_response.choices[0].message.content = full_content
+
+        # Create config with remove_think_prefix = False
+        config_with_think = QwenLLMConfig.model_validate(
+            {
+                "model_name_or_path": "qwen-test",
+                "temperature": 0.7,
+                "max_tokens": 100,
+                "top_p": 0.9,
+                "api_key": "sk-test",
+                "api_base": "https://dashscope.aliyuncs.com/api/v1",
+                "remove_think_prefix": False,
+            }
+        )
+
+        # Instance with think tag enabled
+        llm_with_think = QwenLLM(config_with_think)
+        llm_with_think.client.chat.completions.create = MagicMock(return_value=mock_response)
+
+        response_with_think = llm_with_think.generate([{"role": "user", "content": "Hi"}])
+        self.assertEqual(response_with_think, full_content)
+
+        # Create config with remove_think_prefix = True
+        config_without_think = config_with_think.copy(update={"remove_think_prefix": True})
+
+        # Instance with think tag removed
+        llm_without_think = QwenLLM(config_without_think)
+        llm_without_think.client.chat.completions.create = MagicMock(return_value=mock_response)
+
+        response_without_think = llm_without_think.generate([{"role": "user", "content": "Hi"}])
+        self.assertEqual(response_without_think, "Hello, world!")
+        self.assertNotIn("<think>", response_without_think)
+
+    def test_qwen_llm_generate_stream(self):
+        """Test QwenLLM stream generation with both reasoning_content and content."""
+
+        def make_chunk(delta_dict):
+            # Construct a mock chunk with delta fields
+            delta = SimpleNamespace(**delta_dict)
+            choice = SimpleNamespace(delta=delta)
+            return SimpleNamespace(choices=[choice])
+
+        # Simulate a sequence of streamed chunks
+        mock_stream_chunks = [
+            make_chunk({"reasoning_content": "Analyzing input..."}),
+            make_chunk({"content": "Hello"}),
+            make_chunk({"content": ", "}),
+            make_chunk({"content": "world!"}),
+        ]
+
+        # Mock the client's streaming response
+        mock_chat_completions_create = MagicMock(return_value=iter(mock_stream_chunks))
+
+        # Build QwenLLM config with think prefix enabled
+        config = QwenLLMConfig.model_validate(
+            {
+                "model_name_or_path": "qwen-test",
+                "temperature": 0.7,
+                "max_tokens": 100,
+                "top_p": 0.9,
+                "api_key": "sk-test",
+                "api_base": "https://dashscope.aliyuncs.com/api/v1",
+                "remove_think_prefix": False,
+            }
+        )
+
+        # Create QwenLLM instance and inject mock client
+        llm = QwenLLM(config)
+        llm.client.chat.completions.create = mock_chat_completions_create
+
+        messages = [{"role": "user", "content": "Say hello"}]
+
+        # Collect the streamed output
+        response_parts = list(llm.generate_stream(messages))
+        response = "".join(response_parts)
+
+        # Assertions for structure and content
+        self.assertIn("<think>", response)
+        self.assertIn("Analyzing input...", response)
+        self.assertIn("Hello, world!", response)
+        self.assertTrue(response.startswith("<think>Analyzing input..."))
+        self.assertTrue(response.endswith("Hello, world!"))

From 6b8583db47f40aa227e6b6e7acb90befb6a1b4ec Mon Sep 17 00:00:00 2001
From: CaralHsi
Date: Fri, 18 Jul 2025 16:49:34 +0800
Subject: [PATCH 07/10] Update src/memos/llms/deepseek.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/memos/llms/deepseek.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/memos/llms/deepseek.py b/src/memos/llms/deepseek.py
index fb7565866..2071bbcdc 100644
--- 
a/src/memos/llms/deepseek.py +++ b/src/memos/llms/deepseek.py @@ -31,7 +31,7 @@ def generate(self, messages: MessageList) -> str: else: return response_content - def generate_stream(self, messages: MessageList, **kwargs): + def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]: """Stream response from DeepSeek.""" response = self.client.chat.completions.create( model=self.config.model_name_or_path, From 195ca6470744645f6e96f8f8f4818ce73986f711 Mon Sep 17 00:00:00 2001 From: CaralHsi Date: Fri, 18 Jul 2025 16:50:01 +0800 Subject: [PATCH 08/10] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/memos/llms/deepseek.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/memos/llms/deepseek.py b/src/memos/llms/deepseek.py index 2071bbcdc..e87a27b5e 100644 --- a/src/memos/llms/deepseek.py +++ b/src/memos/llms/deepseek.py @@ -43,12 +43,10 @@ def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, Non extra_body=self.config.extra_body, ) # Streaming chunks of text - reasoning_parts = "" answer_parts = "" for chunk in response: delta = chunk.choices[0].delta if hasattr(delta, "reasoning_content") and delta.reasoning_content: - reasoning_parts += delta.reasoning_content yield delta.reasoning_content if hasattr(delta, "content") and delta.content: From bc0f25e480ee3af5e18401dd0f3d21115ed4c4f5 Mon Sep 17 00:00:00 2001 From: CaralHsi Date: Fri, 18 Jul 2025 16:50:18 +0800 Subject: [PATCH 09/10] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/memos/llms/deepseek.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/memos/llms/deepseek.py b/src/memos/llms/deepseek.py index e87a27b5e..7c126a7a9 100644 --- a/src/memos/llms/deepseek.py +++ b/src/memos/llms/deepseek.py @@ -50,5 +50,4 @@ def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, Non yield delta.reasoning_content if hasattr(delta, "content") and delta.content: - answer_parts += delta.content yield delta.content From f621f80d656b7e08079bb74facec0ce466073689 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Sat, 19 Jul 2025 11:27:06 +0800 Subject: [PATCH 10/10] fix: multi llm test bug --- src/memos/llms/deepseek.py | 3 ++- tests/llms/test_deepseek.py | 2 +- tests/llms/test_qwen.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/memos/llms/deepseek.py b/src/memos/llms/deepseek.py index 7c126a7a9..f5ee4842b 100644 --- a/src/memos/llms/deepseek.py +++ b/src/memos/llms/deepseek.py @@ -1,3 +1,5 @@ +from collections.abc import Generator + from memos.configs.llm import DeepSeekLLMConfig from memos.llms.openai import OpenAILLM from memos.llms.utils import remove_thinking_tags @@ -43,7 +45,6 @@ def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, Non extra_body=self.config.extra_body, ) # Streaming chunks of text - answer_parts = "" for chunk in response: delta = chunk.choices[0].delta if hasattr(delta, "reasoning_content") and delta.reasoning_content: diff --git a/tests/llms/test_deepseek.py b/tests/llms/test_deepseek.py index 3eb9fae37..75c1ead5f 100644 --- a/tests/llms/test_deepseek.py +++ b/tests/llms/test_deepseek.py @@ -38,7 +38,7 @@ def test_deepseek_llm_generate_with_and_without_think_prefix(self): self.assertEqual(output_with_think, full_content) # Config with think tag removed - config_without_think = config_with_think.copy(update={"remove_think_prefix": True}) + 
config_without_think = config_with_think.model_copy(update={"remove_think_prefix": True}) llm_without_think = DeepSeekLLM(config_without_think) llm_without_think.client.chat.completions.create = MagicMock(return_value=mock_response) diff --git a/tests/llms/test_qwen.py b/tests/llms/test_qwen.py index 035b0dccf..90f31e47f 100644 --- a/tests/llms/test_qwen.py +++ b/tests/llms/test_qwen.py @@ -40,7 +40,7 @@ def test_qwen_llm_generate_with_and_without_think_prefix(self): self.assertEqual(response_with_think, full_content) # Create config with remove_think_prefix = True - config_without_think = config_with_think.copy(update={"remove_think_prefix": True}) + config_without_think = config_with_think.model_copy(update={"remove_think_prefix": True}) # Instance with think tag removed llm_without_think = QwenLLM(config_without_think)