Commit 3c4fcd4

feat: add deepseek llm
1 parent faa4c52 commit 3c4fcd4

8 files changed: 182 additions, 1 deletion

examples/basic_modules/llm.py

Lines changed: 62 additions & 1 deletion
@@ -94,6 +94,14 @@
 
 
 # Scenario 5: Using LLMFactory with Qwen (DashScope Compatible API)
+# Note:
+# This example works for any model that supports the OpenAI-compatible Chat Completion API,
+# including but not limited to:
+# - Qwen models: qwen-plus, qwen-max-2025-01-25
+# - DeepSeek models: deepseek-chat, deepseek-coder, deepseek-v3
+# - Other compatible providers: MiniMax, Fireworks, Groq, OpenRouter, etc.
+#
+# Just set the correct `api_key`, `api_base`, and `model_name_or_path`.
 
 config = LLMConfigFactory.model_validate(
     {
@@ -111,8 +119,61 @@
 )
 llm = LLMFactory.from_config(config)
 messages = [
-    {"role": "user", "content": "Can you speak Chinese?"},
+    {"role": "user", "content": "Hello, who are you"},
 ]
 response = llm.generate(messages)
 print("Scenario 5:", response)
 print("==" * 20)
+
+print("Scenario 5:\n")
+for chunk in llm.generate_stream(messages):
+    print(chunk, end="")
+print("==" * 20)
+
+# Scenario 6: Using LLMFactory with Deepseek-chat
+
+cfg = LLMConfigFactory.model_validate(
+    {
+        "backend": "deepseek",
+        "config": {
+            "model_name_or_path": "deepseek-chat",
+            "api_key": "sk-xxx",
+            "api_base": "https://api.deepseek.com",
+            "temperature": 0.6,
+            "max_tokens": 512,
+            "remove_think_prefix": False,
+        },
+    }
+)
+llm = LLMFactory.from_config(cfg)
+messages = [{"role": "user", "content": "Hello, who are you"}]
+resp = llm.generate(messages)
+print("Scenario 6:", resp)
+
+
+# Scenario 7: Using LLMFactory with Deepseek-chat + reasoning + CoT + streaming
+
+cfg2 = LLMConfigFactory.model_validate(
+    {
+        "backend": "deepseek",
+        "config": {
+            "model_name_or_path": "deepseek-reasoner",
+            "api_key": "sk-xxx",
+            "api_base": "https://api.deepseek.com",
+            "temperature": 0.2,
+            "max_tokens": 1024,
+            "remove_think_prefix": False,
+        },
+    }
+)
+llm = LLMFactory.from_config(cfg2)
+messages = [
+    {
+        "role": "user",
+        "content": "Explain how to solve this problem step-by-step. Be explicit in your thinking process. Question: If a train travels from city A to city B at 60 mph and returns at 40 mph, what is its average speed for the entire trip? Let's think step by step.",
+    },
+]
+print("Scenario 7:\n")
+for chunk in llm.generate_stream(messages):
+    print(chunk, end="")
+print("==" * 20)

src/memos/configs/llm.py

Lines changed: 13 additions & 0 deletions
@@ -37,6 +37,18 @@ class QwenLLMConfig(BaseLLMConfig):
     model_name_or_path: str = Field(..., description="Model name for Qwen, e.g., 'qwen-plus'")
 
 
+class DeepSeekLLMConfig(BaseLLMConfig):
+    api_key: str = Field(..., description="API key for DeepSeek")
+    api_base: str = Field(
+        default="https://api.deepseek.com",
+        description="Base URL for DeepSeek OpenAI-compatible API",
+    )
+    extra_body: Any = Field(default=None, description="Extra options for API")
+    model_name_or_path: str = Field(
+        ..., description="Model name: 'deepseek-chat' or 'deepseek-reasoner'"
+    )
+
+
 class AzureLLMConfig(BaseLLMConfig):
     base_url: str = Field(
         default="https://api.openai.azure.com/",
@@ -89,6 +101,7 @@ class LLMConfigFactory(BaseConfig):
         "vllm": VLLMLLMConfig,
         "huggingface_singleton": HFLLMConfig,  # Add singleton support
         "qwen": QwenLLMConfig,
+        "deepseek": DeepSeekLLMConfig,
     }
 
     @field_validator("backend")
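
Editor's note (not part of the commit): a minimal validation sketch for the new backend entry, assuming the inherited BaseLLMConfig fields (temperature, max_tokens, top_p, remove_think_prefix) declare defaults:

from memos.configs.llm import DeepSeekLLMConfig, LLMConfigFactory

# Direct construction: api_key and model_name_or_path are the only DeepSeek-specific
# required fields; api_base falls back to the declared default.
cfg = DeepSeekLLMConfig(api_key="sk-xxx", model_name_or_path="deepseek-chat")
assert cfg.api_base == "https://api.deepseek.com"

# Through the factory: the "deepseek" backend string now resolves to DeepSeekLLMConfig.
factory_cfg = LLMConfigFactory.model_validate(
    {"backend": "deepseek", "config": {"api_key": "sk-xxx", "model_name_or_path": "deepseek-chat"}}
)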

src/memos/llms/base.py

Lines changed: 9 additions & 0 deletions
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from collections.abc import Generator
 
 from memos.configs.llm import BaseLLMConfig
 from memos.types import MessageList
@@ -14,3 +15,11 @@ def __init__(self, config: BaseLLMConfig):
     @abstractmethod
     def generate(self, messages: MessageList, **kwargs) -> str:
         """Generate a response from the LLM."""
+
+    @abstractmethod
+    def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
+        """
+        (Optional) Generate a streaming response from the LLM.
+        Subclasses should override this if they support streaming.
+        By default, this raises NotImplementedError.
+        """

src/memos/llms/deepseek.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+from memos.configs.llm import DeepSeekLLMConfig
+from memos.llms.openai import OpenAILLM
+from memos.llms.utils import remove_thinking_tags
+from memos.log import get_logger
+from memos.types import MessageList
+
+
+logger = get_logger(__name__)
+
+
+class DeepSeekLLM(OpenAILLM):
+    """DeepSeek LLM via OpenAI-compatible API."""
+
+    def __init__(self, config: DeepSeekLLMConfig):
+        super().__init__(config)
+
+    def generate(self, messages: MessageList) -> str:
+        """Generate a response from DeepSeek."""
+        response = self.client.chat.completions.create(
+            model=self.config.model_name_or_path,
+            messages=messages,
+            temperature=self.config.temperature,
+            max_tokens=self.config.max_tokens,
+            top_p=self.config.top_p,
+            extra_body=self.config.extra_body,
+        )
+        logger.info(f"Response from DeepSeek: {response.model_dump_json()}")
+        response_content = response.choices[0].message.content
+        if self.config.remove_think_prefix:
+            return remove_thinking_tags(response_content)
+        else:
+            return response_content
+
+    def generate_stream(self, messages: MessageList, **kwargs):
+        """Stream response from DeepSeek."""
+        response = self.client.chat.completions.create(
+            model=self.config.model_name_or_path,
+            messages=messages,
+            stream=True,
+            temperature=self.config.temperature,
+            max_tokens=self.config.max_tokens,
+            top_p=self.config.top_p,
+            extra_body=self.config.extra_body,
+        )
+        # Streaming chunks of text
+        reasoning_parts = ""
+        answer_parts = ""
+        for chunk in response:
+            delta = chunk.choices[0].delta
+            if hasattr(delta, "reasoning_content") and delta.reasoning_content:
+                reasoning_parts += delta.reasoning_content
+                yield delta.reasoning_content
+
+            if hasattr(delta, "content") and delta.content:
+                answer_parts += delta.content
+                yield delta.content
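
Editor's note (not part of the commit): unlike the Qwen streaming path further down, DeepSeekLLM.generate_stream yields reasoning_content and content as plain text without <think> markers, so reasoning and answer arrive concatenated in order. A minimal consumption sketch, assuming a configured DeepSeekLLM instance named llm built as in the example file:

# Print the stream as it arrives and keep the full text for later use.
pieces = []
for chunk in llm.generate_stream([{"role": "user", "content": "Hello, who are you"}]):
    print(chunk, end="", flush=True)
    pieces.append(chunk)
full_text = "".join(pieces)  # reasoning (if any) plus answer, in arrival order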

src/memos/llms/factory.py

Lines changed: 2 additions & 0 deletions
@@ -2,6 +2,7 @@
 
 from memos.configs.llm import LLMConfigFactory
 from memos.llms.base import BaseLLM
+from memos.llms.deepseek import DeepSeekLLM
 from memos.llms.hf import HFLLM
 from memos.llms.hf_singleton import HFSingletonLLM
 from memos.llms.ollama import OllamaLLM
@@ -21,6 +22,7 @@ class LLMFactory(BaseLLM):
         "huggingface_singleton": HFSingletonLLM,  # Add singleton version
         "vllm": VLLMLLM,
         "qwen": QwenLLM,
+        "deepseek": DeepSeekLLM,
     }
 
     @classmethod

src/memos/llms/ollama.py

Lines changed: 4 additions & 0 deletions
@@ -1,3 +1,4 @@
+from collections.abc import Generator
 from typing import Any
 
 from ollama import Client
@@ -80,3 +81,6 @@ def generate(self, messages: MessageList) -> Any:
             return remove_thinking_tags(str_response)
         else:
             return str_response
+
+    def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
+        raise NotImplementedError

src/memos/llms/openai.py

Lines changed: 5 additions & 0 deletions
@@ -1,3 +1,5 @@
+from collections.abc import Generator
+
 import openai
 
 from memos.configs.llm import AzureLLMConfig, OpenAILLMConfig
@@ -61,3 +63,6 @@ def generate(self, messages: MessageList) -> str:
             return remove_thinking_tags(response_content)
         else:
             return response_content
+
+    def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
+        raise NotImplementedError

src/memos/llms/qwen.py

Lines changed: 31 additions & 0 deletions
@@ -1,3 +1,5 @@
+from collections.abc import Generator
+
 from memos.configs.llm import QwenLLMConfig
 from memos.llms.openai import OpenAILLM
 from memos.llms.utils import remove_thinking_tags
@@ -30,3 +32,32 @@ def generate(self, messages: MessageList) -> str:
             return remove_thinking_tags(response_content)
         else:
             return response_content
+
+    def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
+        """Stream response from Qwen LLM."""
+        response = self.client.chat.completions.create(
+            model=self.config.model_name_or_path,
+            messages=messages,
+            stream=True,
+            temperature=self.config.temperature,
+            max_tokens=self.config.max_tokens,
+            top_p=self.config.top_p,
+            extra_body=self.config.extra_body,
+        )
+
+        reasoning_started = False
+        for chunk in response:
+            delta = chunk.choices[0].delta
+
+            # Some models may have separate `reasoning_content` vs `content`
+            # For Qwen (DashScope), likely only `content` is used
+            if hasattr(delta, "reasoning_content") and delta.reasoning_content:
+                if not reasoning_started and not self.config.remove_think_prefix:
+                    yield "<think>"
+                    reasoning_started = True
+                yield delta.reasoning_content
+            elif hasattr(delta, "content") and delta.content:
+                if reasoning_started and not self.config.remove_think_prefix:
+                    yield "</think>"
+                    reasoning_started = False
+                yield delta.content
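
Editor's note (not part of the commit): the Qwen streaming path wraps any reasoning_content in <think>...</think> unless remove_think_prefix is set, so streamed output for a reasoning-capable model roughly looks like "<think>...reasoning...</think>answer". A client-side sketch, assuming a QwenLLM instance llm and a messages list as in the example file, that drops the reasoning block after the fact:

# Join the streamed chunks and strip a leading <think>...</think> block if present.
text = "".join(llm.generate_stream(messages))
if text.startswith("<think>") and "</think>" in text:
    text = text.split("</think>", 1)[1]
print(text)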
