91 changes: 91 additions & 0 deletions examples/basic_modules/llm.py
@@ -69,6 +69,11 @@
print("Scenario 3:", response)
print("==" * 20)

print("Scenario 3:\n")
for chunk in llm.generate_stream(messages):
print(chunk, end="")
print("==" * 20)


# Scenario 4: Using LLMFactory with Huggingface Models

@@ -91,3 +96,89 @@
response = llm.generate(messages)
print("Scenario 4:", response)
print("==" * 20)


# Scenario 5: Using LLMFactory with Qwen (DashScope Compatible API)
# Note:
# This example works for any model that supports the OpenAI-compatible Chat Completion API,
# including but not limited to:
# - Qwen models: qwen-plus, qwen-max-2025-01-25
# - DeepSeek models: deepseek-chat, deepseek-coder, deepseek-v3
# - Other compatible providers: MiniMax, Fireworks, Groq, OpenRouter, etc.
#
# Just set the correct `api_key`, `api_base`, and `model_name_or_path`.

config = LLMConfigFactory.model_validate(
{
"backend": "qwen",
"config": {
"model_name_or_path": "qwen-plus", # or qwen-max-2025-01-25
"temperature": 0.7,
"max_tokens": 1024,
"top_p": 0.9,
"top_k": 50,
"api_key": "sk-xxx",
"api_base": "https://dashscope.aliyuncs.com/compatible-mode/v1",
},
}
)
llm = LLMFactory.from_config(config)
messages = [
{"role": "user", "content": "Hello, who are you"},
]
response = llm.generate(messages)
print("Scenario 5:", response)
print("==" * 20)

print("Scenario 5:\n")
for chunk in llm.generate_stream(messages):
print(chunk, end="")
print("==" * 20)

# Scenario 6: Using LLMFactory with DeepSeek (deepseek-chat)

cfg = LLMConfigFactory.model_validate(
{
"backend": "deepseek",
"config": {
"model_name_or_path": "deepseek-chat",
"api_key": "sk-xxx",
"api_base": "https://api.deepseek.com",
"temperature": 0.6,
"max_tokens": 512,
"remove_think_prefix": False,
},
}
)
llm = LLMFactory.from_config(cfg)
messages = [{"role": "user", "content": "Hello, who are you"}]
resp = llm.generate(messages)
print("Scenario 6:", resp)


# Scenario 7: Using LLMFactory with DeepSeek (deepseek-reasoner) + reasoning + CoT + streaming

cfg2 = LLMConfigFactory.model_validate(
{
"backend": "deepseek",
"config": {
"model_name_or_path": "deepseek-reasoner",
"api_key": "sk-xxx",
"api_base": "https://api.deepseek.com",
"temperature": 0.2,
"max_tokens": 1024,
"remove_think_prefix": False,
},
}
)
llm = LLMFactory.from_config(cfg2)
messages = [
{
"role": "user",
"content": "Explain how to solve this problem step-by-step. Be explicit in your thinking process. Question: If a train travels from city A to city B at 60 mph and returns at 40 mph, what is its average speed for the entire trip? Let's think step by step.",
},
]
print("Scenario 7:\n")
for chunk in llm.generate_stream(messages):
print(chunk, end="")
print("==" * 20)
24 changes: 24 additions & 0 deletions src/memos/configs/llm.py
@@ -27,6 +27,28 @@ class OpenAILLMConfig(BaseLLMConfig):
extra_body: Any = Field(default=None, description="extra body")


class QwenLLMConfig(BaseLLMConfig):
api_key: str = Field(..., description="API key for DashScope (Qwen)")
api_base: str = Field(
default="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
description="Base URL for Qwen OpenAI-compatible API",
)
extra_body: Any = Field(default=None, description="extra body")
model_name_or_path: str = Field(..., description="Model name for Qwen, e.g., 'qwen-plus'")


class DeepSeekLLMConfig(BaseLLMConfig):
api_key: str = Field(..., description="API key for DeepSeek")
api_base: str = Field(
default="https://api.deepseek.com",
description="Base URL for DeepSeek OpenAI-compatible API",
)
extra_body: Any = Field(default=None, description="Extra options for API")
model_name_or_path: str = Field(
..., description="Model name: 'deepseek-chat' or 'deepseek-reasoner'"
)


class AzureLLMConfig(BaseLLMConfig):
base_url: str = Field(
default="https://api.openai.azure.com/",
@@ -78,6 +100,8 @@ class LLMConfigFactory(BaseConfig):
"huggingface": HFLLMConfig,
"vllm": VLLMLLMConfig,
"huggingface_singleton": HFLLMConfig, # Add singleton support
"qwen": QwenLLMConfig,
"deepseek": DeepSeekLLMConfig,
}

@field_validator("backend")
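For reference, the two new config classes can presumably also be instantiated directly, without going through LLMConfigFactory — a minimal sketch, assuming the inherited BaseLLMConfig fields (temperature, max_tokens, top_p, remove_think_prefix) all have defaults, as the examples above suggest:

from memos.configs.llm import DeepSeekLLMConfig, QwenLLMConfig

# Direct instantiation, bypassing LLMConfigFactory; key values are placeholders.
qwen_cfg = QwenLLMConfig(
    model_name_or_path="qwen-plus",
    api_key="sk-xxx",
    # api_base defaults to the DashScope international compatible-mode endpoint.
)
deepseek_cfg = DeepSeekLLMConfig(
    model_name_or_path="deepseek-chat",
    api_key="sk-xxx",
    api_base="https://api.deepseek.com",
)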
9 changes: 9 additions & 0 deletions src/memos/llms/base.py
@@ -1,4 +1,5 @@
from abc import ABC, abstractmethod
from collections.abc import Generator

from memos.configs.llm import BaseLLMConfig
from memos.types import MessageList
@@ -14,3 +15,11 @@ def __init__(self, config: BaseLLMConfig):
@abstractmethod
def generate(self, messages: MessageList, **kwargs) -> str:
"""Generate a response from the LLM."""

@abstractmethod
def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
"""
(Optional) Generate a streaming response from the LLM.
Subclasses should override this if they support streaming.
By default, this raises NotImplementedError.
"""
56 changes: 56 additions & 0 deletions src/memos/llms/deepseek.py
@@ -0,0 +1,56 @@
from collections.abc import Generator

from memos.configs.llm import DeepSeekLLMConfig
from memos.llms.openai import OpenAILLM
from memos.llms.utils import remove_thinking_tags
from memos.log import get_logger
from memos.types import MessageList


logger = get_logger(__name__)


class DeepSeekLLM(OpenAILLM):
"""DeepSeek LLM via OpenAI-compatible API."""

def __init__(self, config: DeepSeekLLMConfig):
super().__init__(config)

def generate(self, messages: MessageList) -> str:
"""Generate a response from DeepSeek."""
response = self.client.chat.completions.create(
model=self.config.model_name_or_path,
messages=messages,
temperature=self.config.temperature,
max_tokens=self.config.max_tokens,
top_p=self.config.top_p,
extra_body=self.config.extra_body,
)
logger.info(f"Response from DeepSeek: {response.model_dump_json()}")
response_content = response.choices[0].message.content
if self.config.remove_think_prefix:
return remove_thinking_tags(response_content)
else:
return response_content

    def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
"""Stream response from DeepSeek."""
response = self.client.chat.completions.create(
model=self.config.model_name_or_path,
messages=messages,
stream=True,
temperature=self.config.temperature,
max_tokens=self.config.max_tokens,
top_p=self.config.top_p,
extra_body=self.config.extra_body,
)
        # Stream chunks of text; note that, unlike OpenAILLM.generate_stream,
        # reasoning is yielded as-is, without <think>/</think> markers.
        for chunk in response:
            delta = chunk.choices[0].delta
            if hasattr(delta, "reasoning_content") and delta.reasoning_content:
                yield delta.reasoning_content

            if hasattr(delta, "content") and delta.content:
                yield delta.content
4 changes: 4 additions & 0 deletions src/memos/llms/factory.py
@@ -2,10 +2,12 @@

from memos.configs.llm import LLMConfigFactory
from memos.llms.base import BaseLLM
from memos.llms.deepseek import DeepSeekLLM
from memos.llms.hf import HFLLM
from memos.llms.hf_singleton import HFSingletonLLM
from memos.llms.ollama import OllamaLLM
from memos.llms.openai import AzureLLM, OpenAILLM
from memos.llms.qwen import QwenLLM
from memos.llms.vllm import VLLMLLM


@@ -19,6 +21,8 @@ class LLMFactory(BaseLLM):
"huggingface": HFLLM,
"huggingface_singleton": HFSingletonLLM, # Add singleton version
"vllm": VLLMLLM,
"qwen": QwenLLM,
"deepseek": DeepSeekLLM,
}

@classmethod
4 changes: 4 additions & 0 deletions src/memos/llms/ollama.py
@@ -1,3 +1,4 @@
from collections.abc import Generator
from typing import Any

from ollama import Client
@@ -80,3 +81,6 @@ def generate(self, messages: MessageList) -> Any:
return remove_thinking_tags(str_response)
else:
return str_response

def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
raise NotImplementedError
38 changes: 38 additions & 0 deletions src/memos/llms/openai.py
@@ -1,3 +1,5 @@
from collections.abc import Generator

import openai

from memos.configs.llm import AzureLLMConfig, OpenAILLMConfig
@@ -34,6 +36,39 @@ def generate(self, messages: MessageList) -> str:
else:
return response_content

def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
"""Stream response from OpenAI LLM with optional reasoning support."""
response = self.client.chat.completions.create(
model=self.config.model_name_or_path,
messages=messages,
stream=True,
temperature=self.config.temperature,
max_tokens=self.config.max_tokens,
top_p=self.config.top_p,
extra_body=self.config.extra_body,
)

reasoning_started = False

for chunk in response:
delta = chunk.choices[0].delta

# Support for custom 'reasoning_content' (if present in OpenAI-compatible models like Qwen)
if hasattr(delta, "reasoning_content") and delta.reasoning_content:
if not reasoning_started and not self.config.remove_think_prefix:
yield "<think>"
reasoning_started = True
yield delta.reasoning_content
elif hasattr(delta, "content") and delta.content:
if reasoning_started and not self.config.remove_think_prefix:
yield "</think>"
reasoning_started = False
yield delta.content

# Ensure we close the <think> block if not already done
if reasoning_started and not self.config.remove_think_prefix:
yield "</think>"


class AzureLLM(BaseLLM):
"""Azure OpenAI LLM class."""
@@ -61,3 +96,6 @@ def generate(self, messages: MessageList) -> str:
return remove_thinking_tags(response_content)
else:
return response_content

def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
raise NotImplementedError
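As a rough usage sketch for the streaming path above (assuming llm and messages are built as in examples/basic_modules/llm.py, and that remove_thinking_tags strips <think>...</think> blocks, as its pairing with remove_think_prefix suggests), a caller can reassemble the chunks and optionally drop the reasoning block:

from memos.llms.utils import remove_thinking_tags

# Collect the streamed chunks; reasoning arrives wrapped in <think>...</think>.
chunks = []
for chunk in llm.generate_stream(messages):
    print(chunk, end="")
    chunks.append(chunk)

full_output = "".join(chunks)
# Keep only the final answer if the reasoning trace is not needed.
answer_only = remove_thinking_tags(full_output)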
63 changes: 63 additions & 0 deletions src/memos/llms/qwen.py
@@ -0,0 +1,63 @@
from collections.abc import Generator

from memos.configs.llm import QwenLLMConfig
from memos.llms.openai import OpenAILLM
from memos.llms.utils import remove_thinking_tags
from memos.log import get_logger
from memos.types import MessageList


logger = get_logger(__name__)


class QwenLLM(OpenAILLM):
"""Qwen (DashScope) LLM class via OpenAI-compatible API."""

def __init__(self, config: QwenLLMConfig):
super().__init__(config)

def generate(self, messages: MessageList) -> str:
"""Generate a response from Qwen LLM."""
response = self.client.chat.completions.create(
model=self.config.model_name_or_path,
messages=messages,
extra_body=self.config.extra_body,
temperature=self.config.temperature,
max_tokens=self.config.max_tokens,
top_p=self.config.top_p,
)
logger.info(f"Response from Qwen: {response.model_dump_json()}")
response_content = response.choices[0].message.content
if self.config.remove_think_prefix:
return remove_thinking_tags(response_content)
else:
return response_content

def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
"""Stream response from Qwen LLM."""
response = self.client.chat.completions.create(
model=self.config.model_name_or_path,
messages=messages,
stream=True,
temperature=self.config.temperature,
max_tokens=self.config.max_tokens,
top_p=self.config.top_p,
extra_body=self.config.extra_body,
)

reasoning_started = False
for chunk in response:
delta = chunk.choices[0].delta

            # Some models stream a separate `reasoning_content` field alongside `content`;
            # for Qwen (DashScope), typically only `content` is populated.
if hasattr(delta, "reasoning_content") and delta.reasoning_content:
if not reasoning_started and not self.config.remove_think_prefix:
yield "<think>"
reasoning_started = True
yield delta.reasoning_content
elif hasattr(delta, "content") and delta.content:
if reasoning_started and not self.config.remove_think_prefix:
yield "</think>"
reasoning_started = False
yield delta.content
4 changes: 3 additions & 1 deletion src/memos/templates/mem_reader_prompts.py
@@ -71,7 +71,7 @@
"summary": "Tom is currently focused on managing a new project with a tight schedule. After a team meeting on June 25, 2025, he realized the original deadline of December 15 might not be feasible due to backend delays. Concerned about insufficient testing time, he welcomed Jerry’s suggestion of proposing an extension. Tom plans to raise the idea of shifting the deadline to January 5, 2026 in the next morning’s meeting. His actions reflect both stress about timelines and a proactive, team-oriented problem-solving approach."
}

Another Example in Chinese (注意: 你的输出必须和输入的user语言一致):
Another Example in Chinese (注意: 当user的语言为中文时,你就需要也输出中文):
{
"memory list": [
{
@@ -85,6 +85,8 @@
"summary": "Tom 目前专注于管理一个进度紧张的新项目..."
}

Always respond in the same language as the conversation.

Conversation:
${conversation}
