@@ -33,17 +33,7 @@ def __init__(self, config: VLLMLLMConfig):
             api_key=api_key,
             base_url=getattr(self.config, "api_base", "http://localhost:8088/v1")
         )
-        api_key = getattr(self.config, "api_key", "dummy")
-        if not api_key:
-            api_key = "dummy"
-
-        import openai
-        self.client = openai.Client(
-            api_key=api_key,
-            base_url=getattr(self.config, "api_base", "http://localhost:8088/v1")
-        )
 
-    def build_vllm_kv_cache(self, messages: Any) -> str:
     def build_vllm_kv_cache(self, messages: Any) -> str:
         """
         Build a KV cache from chat messages via one vLLM request.
@@ -67,21 +57,12 @@ def build_vllm_kv_cache(self, messages: Any) -> str:
 
         if not prompt.strip():
             raise ValueError("Prompt is empty, cannot build KV cache.")
-            raise ValueError("Prompt is empty, cannot build KV cache.")
 
         # 3. Send request to vLLM server to preload the KV cache
-        if self.client:
-            try:
-                # Use the processed messages for the API call
-        # 3. Send request to vLLM server to preload the KV cache
         if self.client:
             try:
                 # Use the processed messages for the API call
                 prefill_kwargs = {
-                    "model": self.config.model_name_or_path,
-                    "messages": processed_messages,
-                    "max_tokens": 2,
-                    "temperature": 0.0,
                     "model": self.config.model_name_or_path,
                     "messages": processed_messages,
                     "max_tokens": 2,
@@ -90,8 +71,6 @@ def build_vllm_kv_cache(self, messages: Any) -> str:
                 }
                 self.client.chat.completions.create(**prefill_kwargs)
                 logger.info(f"vLLM KV cache prefill completed for prompt: '{prompt[:100]}...'")
-                self.client.chat.completions.create(**prefill_kwargs)
-                logger.info(f"vLLM KV cache prefill completed for prompt: '{prompt[:100]}...'")
             except Exception as e:
                 logger.warning(f"Failed to prefill vLLM KV cache: {e}")
 
@@ -101,7 +80,6 @@ def generate(self, messages: list[MessageDict]) -> str:
10180 """
10281 Generate a response from the model.
10382 """
104- if self .client :
10583 if self .client :
10684 return self ._generate_with_api_client (messages )
10785 else :
@@ -111,11 +89,8 @@ def _generate_with_api_client(self, messages: list[MessageDict]) -> str:
11189 """
11290 Generate response using vLLM API client.
11391 """
114- if self .client :
11592 if self .client :
11693 completion_kwargs = {
117- "model" : self .config .model_name_or_path ,
118- "messages" : messages ,
11994 "model" : self .config .model_name_or_path ,
12095 "messages" : messages ,
12196 "temperature" : float (getattr (self .config , "temperature" , 0.8 )),
@@ -127,9 +102,6 @@ def _generate_with_api_client(self, messages: list[MessageDict]) -> str:
             response_text = response.choices[0].message.content or ""
             logger.info(f"VLLM API response: {response_text}")
             return remove_thinking_tags(response_text) if getattr(self.config, "remove_think_prefix", False) else response_text
-            response_text = response.choices[0].message.content or ""
-            logger.info(f"VLLM API response: {response_text}")
-            return remove_thinking_tags(response_text) if getattr(self.config, "remove_think_prefix", False) else response_text
         else:
             raise RuntimeError("API client is not available")
 
@@ -142,7 +114,6 @@ def _messages_to_prompt(self, messages: list[MessageDict]) -> str:
             role = msg["role"]
             content = msg["content"]
             prompt_parts.append(f"{role.capitalize()}: {content}")
-            prompt_parts.append(f"{role.capitalize()}: {content}")
         return "\n".join(prompt_parts)
 
     def generate_stream(self, messages: list[MessageDict]):
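
For reference, the prefill that the retained build_vllm_kv_cache code performs amounts to one cheap, deterministic chat-completion call against the OpenAI-compatible endpoint, so the vLLM server computes and caches the prompt's KV state for later requests. The sketch below is a standalone illustration of that pattern, not code from this commit; the base URL, API key, and model name are placeholders, and it assumes a vLLM server launched with prefix caching enabled.

# Minimal sketch of the KV-cache prefill pattern (placeholder model name and URL).
import openai

client = openai.Client(api_key="dummy", base_url="http://localhost:8088/v1")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Long shared context that should be cached..."},
]

# max_tokens=2 keeps generation negligible; temperature=0.0 keeps the request deterministic.
client.chat.completions.create(
    model="my-model",  # placeholder; use the model the vLLM server was started with
    messages=messages,
    max_tokens=2,
    temperature=0.0,
)

Subsequent calls that share the same leading messages can then reuse the cached prefix instead of recomputing it.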