Commit 84624d4
fix:merge error (#75)
## Description

Summary: (summary)

Fix: #(issue)

Reviewer: @(reviewer)

## Checklist:

- [ ] I have performed a self-review of my own code
- [ ] I have commented my code in hard-to-understand areas
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] I have added necessary documentation (if applicable)
- [ ] I have linked the issue to this PR (if applicable)
- [ ] I have mentioned the person who will review this PR
2 parents 0dfe4cc + 0f6c286 commit 84624d4

File tree: 2 files changed (+0, −34 lines)


examples/mem_os/simple_vllm_memos.py

Lines changed: 0 additions & 5 deletions
@@ -2,8 +2,6 @@
 """
 Simple example demonstrating how to use VLLMLLM with an existing vLLM server.
 Requires a vLLM server to be running.
-Simple example demonstrating how to use VLLMLLM with an existing vLLM server.
-Requires a vLLM server to be running.
 """
 
 from memos.configs.llm import VLLMLLMConfig
@@ -15,11 +13,9 @@ def main():
 
     # Configuration for connecting to existing vLLM server
     config = VLLMLLMConfig(
-        model_name_or_path="/mnt/afs/models/hf_models/Qwen2.5-7B",  # MUST MATCH the --model arg of vLLM server
         model_name_or_path="/mnt/afs/models/hf_models/Qwen2.5-7B",  # MUST MATCH the --model arg of vLLM server
         api_key="",  # Not needed for local server
         api_base="http://localhost:8088/v1",  # vLLM server address with /v1
-        api_base="http://localhost:8088/v1",  # vLLM server address with /v1
         temperature=0.7,
         max_tokens=512,
         top_p=0.9,
@@ -39,7 +35,6 @@ def main():
     try:
         prompt = llm.build_vllm_kv_cache(system_messages)
         print(f"✓ KV cache built successfully for prompt: '{prompt[:100]}...'")
-        print(f"✓ KV cache built successfully for prompt: '{prompt[:100]}...'")
     except Exception as e:
         print(f"✗ Failed to build KV cache: {e}")
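For context, once the duplicates above are removed the example reduces to roughly the flow below. This is a sketch rather than the file itself: the `VLLMLLM` import path, the direct `VLLMLLM(config)` construction, and the `system_messages` content are assumptions inferred from the diff, not lines shown in it.

```python
# Sketch of the de-duplicated example flow (assumed import path and messages).
from memos.configs.llm import VLLMLLMConfig
from memos.llms.vllm import VLLMLLM  # assumed path, mirroring src/memos/llms/vllm.py

config = VLLMLLMConfig(
    model_name_or_path="/mnt/afs/models/hf_models/Qwen2.5-7B",  # must match the vLLM server's --model
    api_key="",  # not needed for a local server
    api_base="http://localhost:8088/v1",  # vLLM server address, including /v1
    temperature=0.7,
    max_tokens=512,
    top_p=0.9,
)

llm = VLLMLLM(config)  # __init__(self, config: VLLMLLMConfig) per the diff below
system_messages = [{"role": "system", "content": "You are a helpful assistant."}]  # hypothetical content

try:
    prompt = llm.build_vllm_kv_cache(system_messages)
    print(f"✓ KV cache built successfully for prompt: '{prompt[:100]}...'")
except Exception as e:
    print(f"✗ Failed to build KV cache: {e}")
```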

src/memos/llms/vllm.py

Lines changed: 0 additions & 29 deletions
@@ -33,17 +33,7 @@ def __init__(self, config: VLLMLLMConfig):
             api_key=api_key,
             base_url=getattr(self.config, "api_base", "http://localhost:8088/v1")
         )
-        api_key = getattr(self.config, "api_key", "dummy")
-        if not api_key:
-            api_key = "dummy"
-
-        import openai
-        self.client = openai.Client(
-            api_key=api_key,
-            base_url=getattr(self.config, "api_base", "http://localhost:8088/v1")
-        )
 
-    def build_vllm_kv_cache(self, messages: Any) -> str:
     def build_vllm_kv_cache(self, messages: Any) -> str:
         """
         Build a KV cache from chat messages via one vLLM request.
@@ -67,21 +57,12 @@ def build_vllm_kv_cache(self, messages: Any) -> str:
 
         if not prompt.strip():
             raise ValueError("Prompt is empty, cannot build KV cache.")
-            raise ValueError("Prompt is empty, cannot build KV cache.")
 
         # 3. Send request to vLLM server to preload the KV cache
-        if self.client:
-            try:
-                # Use the processed messages for the API call
-        # 3. Send request to vLLM server to preload the KV cache
         if self.client:
             try:
                 # Use the processed messages for the API call
                 prefill_kwargs = {
-                    "model": self.config.model_name_or_path,
-                    "messages": processed_messages,
-                    "max_tokens": 2,
-                    "temperature": 0.0,
                     "model": self.config.model_name_or_path,
                     "messages": processed_messages,
                     "max_tokens": 2,
@@ -90,8 +71,6 @@ def build_vllm_kv_cache(self, messages: Any) -> str:
                 }
                 self.client.chat.completions.create(**prefill_kwargs)
                 logger.info(f"vLLM KV cache prefill completed for prompt: '{prompt[:100]}...'")
-                self.client.chat.completions.create(**prefill_kwargs)
-                logger.info(f"vLLM KV cache prefill completed for prompt: '{prompt[:100]}...'")
             except Exception as e:
                 logger.warning(f"Failed to prefill vLLM KV cache: {e}")
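The hunks above leave a single copy of the prefill request that `build_vllm_kv_cache` sends. As a standalone illustration of that pattern, here is a minimal sketch against the plain `openai` client; the concrete values are hypothetical, and actual reuse of the cached prefix depends on how the vLLM server is configured (e.g. prefix caching).

```python
# Minimal sketch of the KV-cache "prefill" pattern kept by this commit: issue a
# near-zero-output chat completion so the server runs the prefill pass for the
# prompt. All concrete values below are hypothetical stand-ins.
import openai

api_key = ""  # the real __init__ falls back to "dummy" when the configured key is empty
client = openai.Client(
    api_key=api_key or "dummy",
    base_url="http://localhost:8088/v1",
)

prefill_kwargs = {
    "model": "/mnt/afs/models/hf_models/Qwen2.5-7B",  # must match the server's --model
    "messages": [{"role": "system", "content": "You are a helpful assistant."}],
    "max_tokens": 2,  # generate almost nothing; the point is the prefill pass
    "temperature": 0.0,
}
client.chat.completions.create(**prefill_kwargs)
```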

@@ -101,7 +80,6 @@ def generate(self, messages: list[MessageDict]) -> str:
         """
         Generate a response from the model.
         """
-        if self.client:
         if self.client:
             return self._generate_with_api_client(messages)
         else:
@@ -111,11 +89,8 @@ def _generate_with_api_client(self, messages: list[MessageDict]) -> str:
         """
         Generate response using vLLM API client.
         """
-        if self.client:
         if self.client:
             completion_kwargs = {
-                "model": self.config.model_name_or_path,
-                "messages": messages,
                 "model": self.config.model_name_or_path,
                 "messages": messages,
                 "temperature": float(getattr(self.config, "temperature", 0.8)),
@@ -127,9 +102,6 @@ def _generate_with_api_client(self, messages: list[MessageDict]) -> str:
             response_text = response.choices[0].message.content or ""
             logger.info(f"VLLM API response: {response_text}")
             return remove_thinking_tags(response_text) if getattr(self.config, "remove_think_prefix", False) else response_text
-            response_text = response.choices[0].message.content or ""
-            logger.info(f"VLLM API response: {response_text}")
-            return remove_thinking_tags(response_text) if getattr(self.config, "remove_think_prefix", False) else response_text
         else:
             raise RuntimeError("API client is not available")
 
@@ -142,7 +114,6 @@ def _messages_to_prompt(self, messages: list[MessageDict]) -> str:
             role = msg["role"]
             content = msg["content"]
             prompt_parts.append(f"{role.capitalize()}: {content}")
-            prompt_parts.append(f"{role.capitalize()}: {content}")
         return "\n".join(prompt_parts)
 
     def generate_stream(self, messages: list[MessageDict]):
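After the cleanup, the generation path is a single OpenAI-compatible chat call. The sketch below shows what `_generate_with_api_client` is left doing; the client setup and message content are hypothetical, and only the kwargs and response handling mirror the diff above.

```python
# Sketch of the surviving generation path: build OpenAI-compatible kwargs, make
# one chat call, and read choices[0].message.content. Client/config values are
# hypothetical; the real code reads them from self.config via getattr.
import openai

client = openai.Client(api_key="dummy", base_url="http://localhost:8088/v1")

completion_kwargs = {
    "model": "/mnt/afs/models/hf_models/Qwen2.5-7B",
    "messages": [{"role": "user", "content": "Hello!"}],  # hypothetical messages
    "temperature": 0.8,  # vllm.py uses float(getattr(self.config, "temperature", 0.8))
}
response = client.chat.completions.create(**completion_kwargs)

response_text = response.choices[0].message.content or ""
# vllm.py additionally strips thinking tags via remove_thinking_tags() when
# config.remove_think_prefix is set; that helper is not reproduced here.
print(response_text)
```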
