91 changes: 91 additions & 0 deletions examples/basic_modules/llm.py
@@ -69,6 +69,11 @@
print("Scenario 3:", response)
print("==" * 20)

print("Scenario 3:\n")
for chunk in llm.generate_stream(messages):
print(chunk, end="")
print("==" * 20)


# Scenario 4: Using LLMFactory with Huggingface Models

@@ -91,3 +96,89 @@
response = llm.generate(messages)
print("Scenario 4:", response)
print("==" * 20)


# Scenario 5: Using LLMFactory with Qwen (DashScope Compatible API)
# Note:
# This example works for any model that supports the OpenAI-compatible Chat Completion API,
# including but not limited to:
# - Qwen models: qwen-plus, qwen-max-2025-01-25
# - DeepSeek models: deepseek-chat, deepseek-coder, deepseek-v3
# - Other compatible providers: MiniMax, Fireworks, Groq, OpenRouter, etc.
#
# Just set the correct `api_key`, `api_base`, and `model_name_or_path`.

config = LLMConfigFactory.model_validate(
{
"backend": "qwen",
"config": {
"model_name_or_path": "qwen-plus", # or qwen-max-2025-01-25
"temperature": 0.7,
"max_tokens": 1024,
"top_p": 0.9,
"top_k": 50,
"api_key": "sk-xxx",
"api_base": "https://dashscope.aliyuncs.com/compatible-mode/v1",
},
}
)
llm = LLMFactory.from_config(config)
messages = [
{"role": "user", "content": "Hello, who are you"},
]
response = llm.generate(messages)
print("Scenario 5:", response)
print("==" * 20)

print("Scenario 5:\n")
for chunk in llm.generate_stream(messages):
print(chunk, end="")
print("==" * 20)

# Scenario 6: Using LLMFactory with DeepSeek (deepseek-chat)

cfg = LLMConfigFactory.model_validate(
{
"backend": "deepseek",
"config": {
"model_name_or_path": "deepseek-chat",
"api_key": "sk-xxx",
"api_base": "https://api.deepseek.com",
"temperature": 0.6,
"max_tokens": 512,
"remove_think_prefix": False,
},
}
)
llm = LLMFactory.from_config(cfg)
messages = [{"role": "user", "content": "Hello, who are you"}]
resp = llm.generate(messages)
print("Scenario 6:", resp)


# Scenario 7: Using LLMFactory with DeepSeek (deepseek-reasoner) + reasoning + CoT + streaming

cfg2 = LLMConfigFactory.model_validate(
{
"backend": "deepseek",
"config": {
"model_name_or_path": "deepseek-reasoner",
"api_key": "sk-xxx",
"api_base": "https://api.deepseek.com",
"temperature": 0.2,
"max_tokens": 1024,
"remove_think_prefix": False,
},
}
)
llm = LLMFactory.from_config(cfg2)
messages = [
{
"role": "user",
"content": "Explain how to solve this problem step-by-step. Be explicit in your thinking process. Question: If a train travels from city A to city B at 60 mph and returns at 40 mph, what is its average speed for the entire trip? Let's think step by step.",
},
]
print("Scenario 7:\n")
for chunk in llm.generate_stream(messages):
print(chunk, end="")
print("==" * 20)
24 changes: 24 additions & 0 deletions src/memos/configs/llm.py
@@ -27,6 +27,28 @@ class OpenAILLMConfig(BaseLLMConfig):
extra_body: Any = Field(default=None, description="extra body")


class QwenLLMConfig(BaseLLMConfig):
api_key: str = Field(..., description="API key for DashScope (Qwen)")
api_base: str = Field(
default="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
description="Base URL for Qwen OpenAI-compatible API",
)
extra_body: Any = Field(default=None, description="extra body")
model_name_or_path: str = Field(..., description="Model name for Qwen, e.g., 'qwen-plus'")


class DeepSeekLLMConfig(BaseLLMConfig):
api_key: str = Field(..., description="API key for DeepSeek")
api_base: str = Field(
default="https://api.deepseek.com",
description="Base URL for DeepSeek OpenAI-compatible API",
)
extra_body: Any = Field(default=None, description="Extra options for API")
model_name_or_path: str = Field(
..., description="Model name: 'deepseek-chat' or 'deepseek-reasoner'"
)


class AzureLLMConfig(BaseLLMConfig):
base_url: str = Field(
default="https://api.openai.azure.com/",
@@ -78,6 +100,8 @@ class LLMConfigFactory(BaseConfig):
"huggingface": HFLLMConfig,
"vllm": VLLMLLMConfig,
"huggingface_singleton": HFLLMConfig, # Add singleton support
"qwen": QwenLLMConfig,
"deepseek": DeepSeekLLMConfig,
}

@field_validator("backend")
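For reference, the two new config classes can presumably also be instantiated directly, without going through LLMConfigFactory — a minimal sketch, assuming the inherited BaseLLMConfig fields (temperature, max_tokens, top_p, remove_think_prefix) all have defaults, as the examples above suggest:

from memos.configs.llm import DeepSeekLLMConfig, QwenLLMConfig

# Direct instantiation, bypassing LLMConfigFactory; key values are placeholders.
qwen_cfg = QwenLLMConfig(
    model_name_or_path="qwen-plus",
    api_key="sk-xxx",
    # api_base defaults to the DashScope international compatible-mode endpoint.
)
deepseek_cfg = DeepSeekLLMConfig(
    model_name_or_path="deepseek-chat",
    api_key="sk-xxx",
    api_base="https://api.deepseek.com",
)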
9 changes: 9 additions & 0 deletions src/memos/llms/base.py
@@ -1,4 +1,5 @@
from abc import ABC, abstractmethod
from collections.abc import Generator

from memos.configs.llm import BaseLLMConfig
from memos.types import MessageList
@@ -14,3 +15,11 @@ def __init__(self, config: BaseLLMConfig):
@abstractmethod
def generate(self, messages: MessageList, **kwargs) -> str:
"""Generate a response from the LLM."""

@abstractmethod
def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
"""
(Optional) Generate a streaming response from the LLM.
Subclasses should override this if they support streaming.
By default, this raises NotImplementedError.
"""
56 changes: 56 additions & 0 deletions src/memos/llms/deepseek.py
@@ -0,0 +1,56 @@
from collections.abc import Generator

from memos.configs.llm import DeepSeekLLMConfig
from memos.llms.openai import OpenAILLM
from memos.llms.utils import remove_thinking_tags
from memos.log import get_logger
from memos.types import MessageList


logger = get_logger(__name__)


class DeepSeekLLM(OpenAILLM):
"""DeepSeek LLM via OpenAI-compatible API."""

def __init__(self, config: DeepSeekLLMConfig):
super().__init__(config)

def generate(self, messages: MessageList) -> str:
"""Generate a response from DeepSeek."""
response = self.client.chat.completions.create(
model=self.config.model_name_or_path,
messages=messages,
temperature=self.config.temperature,
max_tokens=self.config.max_tokens,
top_p=self.config.top_p,
extra_body=self.config.extra_body,
)
logger.info(f"Response from DeepSeek: {response.model_dump_json()}")
response_content = response.choices[0].message.content
if self.config.remove_think_prefix:
return remove_thinking_tags(response_content)
else:
return response_content

    def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
"""Stream response from DeepSeek."""
response = self.client.chat.completions.create(
model=self.config.model_name_or_path,
messages=messages,
stream=True,
temperature=self.config.temperature,
max_tokens=self.config.max_tokens,
top_p=self.config.top_p,
extra_body=self.config.extra_body,
)
        # Stream chunks of text; note that, unlike OpenAILLM.generate_stream,
        # reasoning is yielded as-is, without <think>/</think> markers.
        for chunk in response:
            delta = chunk.choices[0].delta
            if hasattr(delta, "reasoning_content") and delta.reasoning_content:
                yield delta.reasoning_content

            if hasattr(delta, "content") and delta.content:
                yield delta.content
4 changes: 4 additions & 0 deletions src/memos/llms/factory.py
@@ -2,10 +2,12 @@

from memos.configs.llm import LLMConfigFactory
from memos.llms.base import BaseLLM
from memos.llms.deepseek import DeepSeekLLM
from memos.llms.hf import HFLLM
from memos.llms.hf_singleton import HFSingletonLLM
from memos.llms.ollama import OllamaLLM
from memos.llms.openai import AzureLLM, OpenAILLM
from memos.llms.qwen import QwenLLM
from memos.llms.vllm import VLLMLLM


@@ -19,6 +21,8 @@ class LLMFactory(BaseLLM):
"huggingface": HFLLM,
"huggingface_singleton": HFSingletonLLM, # Add singleton version
"vllm": VLLMLLM,
"qwen": QwenLLM,
"deepseek": DeepSeekLLM,
}

@classmethod
4 changes: 4 additions & 0 deletions src/memos/llms/ollama.py
@@ -1,3 +1,4 @@
from collections.abc import Generator
from typing import Any

from ollama import Client
@@ -80,3 +81,6 @@ def generate(self, messages: MessageList) -> Any:
return remove_thinking_tags(str_response)
else:
return str_response

def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
raise NotImplementedError
38 changes: 38 additions & 0 deletions src/memos/llms/openai.py
@@ -1,3 +1,5 @@
from collections.abc import Generator

import openai

from memos.configs.llm import AzureLLMConfig, OpenAILLMConfig
@@ -34,6 +36,39 @@ def generate(self, messages: MessageList) -> str:
else:
return response_content

def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
"""Stream response from OpenAI LLM with optional reasoning support."""
response = self.client.chat.completions.create(
model=self.config.model_name_or_path,
messages=messages,
stream=True,
temperature=self.config.temperature,
max_tokens=self.config.max_tokens,
top_p=self.config.top_p,
extra_body=self.config.extra_body,
)

reasoning_started = False

for chunk in response:
delta = chunk.choices[0].delta

# Support for custom 'reasoning_content' (if present in OpenAI-compatible models like Qwen)
if hasattr(delta, "reasoning_content") and delta.reasoning_content:
if not reasoning_started and not self.config.remove_think_prefix:
yield "<think>"
reasoning_started = True
yield delta.reasoning_content
elif hasattr(delta, "content") and delta.content:
if reasoning_started and not self.config.remove_think_prefix:
yield "</think>"
reasoning_started = False
yield delta.content

# Ensure we close the <think> block if not already done
if reasoning_started and not self.config.remove_think_prefix:
yield "</think>"


class AzureLLM(BaseLLM):
"""Azure OpenAI LLM class."""
@@ -61,3 +96,6 @@ def generate(self, messages: MessageList) -> str:
return remove_thinking_tags(response_content)
else:
return response_content

def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
raise NotImplementedError
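As a rough usage sketch for the streaming path above (assuming llm and messages are built as in examples/basic_modules/llm.py, and that remove_thinking_tags strips <think>...</think> blocks, as its pairing with remove_think_prefix suggests), a caller can reassemble the chunks and optionally drop the reasoning block:

from memos.llms.utils import remove_thinking_tags

# Collect the streamed chunks; reasoning arrives wrapped in <think>...</think>.
chunks = []
for chunk in llm.generate_stream(messages):
    print(chunk, end="")
    chunks.append(chunk)

full_output = "".join(chunks)
# Keep only the final answer if the reasoning trace is not needed.
answer_only = remove_thinking_tags(full_output)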
63 changes: 63 additions & 0 deletions src/memos/llms/qwen.py
@@ -0,0 +1,63 @@
from collections.abc import Generator

from memos.configs.llm import QwenLLMConfig
from memos.llms.openai import OpenAILLM
from memos.llms.utils import remove_thinking_tags
from memos.log import get_logger
from memos.types import MessageList


logger = get_logger(__name__)


class QwenLLM(OpenAILLM):
"""Qwen (DashScope) LLM class via OpenAI-compatible API."""

def __init__(self, config: QwenLLMConfig):
super().__init__(config)

def generate(self, messages: MessageList) -> str:
"""Generate a response from Qwen LLM."""
response = self.client.chat.completions.create(
model=self.config.model_name_or_path,
messages=messages,
extra_body=self.config.extra_body,
temperature=self.config.temperature,
max_tokens=self.config.max_tokens,
top_p=self.config.top_p,
)
logger.info(f"Response from Qwen: {response.model_dump_json()}")
response_content = response.choices[0].message.content
if self.config.remove_think_prefix:
return remove_thinking_tags(response_content)
else:
return response_content

def generate_stream(self, messages: MessageList, **kwargs) -> Generator[str, None, None]:
"""Stream response from Qwen LLM."""
response = self.client.chat.completions.create(
model=self.config.model_name_or_path,
messages=messages,
stream=True,
temperature=self.config.temperature,
max_tokens=self.config.max_tokens,
top_p=self.config.top_p,
extra_body=self.config.extra_body,
)

reasoning_started = False
for chunk in response:
delta = chunk.choices[0].delta

            # Some models stream a separate `reasoning_content` field alongside `content`;
            # for Qwen (DashScope), typically only `content` is populated.
if hasattr(delta, "reasoning_content") and delta.reasoning_content:
if not reasoning_started and not self.config.remove_think_prefix:
yield "<think>"
reasoning_started = True
yield delta.reasoning_content
elif hasattr(delta, "content") and delta.content:
if reasoning_started and not self.config.remove_think_prefix:
yield "</think>"
reasoning_started = False
yield delta.content
4 changes: 3 additions & 1 deletion src/memos/templates/mem_reader_prompts.py
@@ -71,7 +71,7 @@
"summary": "Tom is currently focused on managing a new project with a tight schedule. After a team meeting on June 25, 2025, he realized the original deadline of December 15 might not be feasible due to backend delays. Concerned about insufficient testing time, he welcomed Jerry’s suggestion of proposing an extension. Tom plans to raise the idea of shifting the deadline to January 5, 2026 in the next morning’s meeting. His actions reflect both stress about timelines and a proactive, team-oriented problem-solving approach."
}

Another Example in Chinese (注意: 你的输出必须和输入的user语言一致):
Another Example in Chinese (注意: 当user的语言为中文时,你就需要也输出中文):
{
"memory list": [
{
@@ -85,6 +85,8 @@
"summary": "Tom 目前专注于管理一个进度紧张的新项目..."
}

Always respond in the same language as the conversation.

Conversation:
${conversation}
