Commit 54c99fb

feat:vllm llm support version0.5 (#68)
## Description

Summary: (summary)

Fix: #(issue)

Reviewer: @(reviewer)

## Checklist:

- [ ] I have performed a self-review of my own code
- [ ] I have commented my code in hard-to-understand areas
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] I have added necessary documentation (if applicable)
- [ ] I have linked the issue to this PR (if applicable)
- [ ] I have mentioned the person who will review this PR
2 parents a51e1a3 + 370623b commit 54c99fb

3 files changed: +54 −163 lines
Lines changed: 8 additions & 34 deletions
@@ -1,29 +1,24 @@
 #!/usr/bin/env python3
 """
-Simple example demonstrating how to use VLLMLLM with existing vLLM server.
-Requires a vLLM server to be running on localhost:8088.
+Simple example demonstrating how to use VLLMLLM with an existing vLLM server.
+Requires a vLLM server to be running.
 """

-import asyncio
-import sys
-
 from memos.configs.llm import VLLMLLMConfig
 from memos.llms.vllm import VLLMLLM
 from memos.types import MessageList

-
 def main():
     """Main function demonstrating VLLMLLM usage."""

     # Configuration for connecting to existing vLLM server
     config = VLLMLLMConfig(
-        model_name_or_path="Qwen/Qwen3-1.7B",  # Model name (for reference)
+        model_name_or_path="/mnt/afs/models/hf_models/Qwen2.5-7B",  # MUST MATCH the --model arg of vLLM server
         api_key="",  # Not needed for local server
-        api_base="http://localhost:8088",  # vLLM server address
+        api_base="http://localhost:8088/v1",  # vLLM server address with /v1
         temperature=0.7,
         max_tokens=512,
         top_p=0.9,
-        top_k=50,
         model_schema="memos.configs.llm.VLLMLLMConfig",
     )

@@ -32,49 +27,28 @@ def main():
     llm = VLLMLLM(config)

     # Test messages for KV cache building
+    print("\nBuilding KV cache for system messages...")
     system_messages: MessageList = [
         {"role": "system", "content": "You are a helpful AI assistant."},
         {"role": "user", "content": "Hello! Can you tell me about vLLM?"}
     ]
-
-    # Build KV cache for system messages
-    print("Building KV cache for system messages...")
     try:
         prompt = llm.build_vllm_kv_cache(system_messages)
-        print(f"✓ KV cache built successfully. Prompt length: {len(prompt)}")
+        print(f"✓ KV cache built successfully for prompt: '{prompt[:100]}...'")
     except Exception as e:
         print(f"✗ Failed to build KV cache: {e}")

-    # Test with different messages
+    # Test with different messages for generation
+    print("\nGenerating response...")
     user_messages: MessageList = [
         {"role": "system", "content": "You are a helpful AI assistant."},
         {"role": "user", "content": "What are the benefits of using vLLM?"}
     ]
-
-    # Generate response
-    print("\nGenerating response...")
     try:
         response = llm.generate(user_messages)
         print(f"Response: {response}")
     except Exception as e:
         print(f"Error generating response: {e}")
-
-    # Test with string input for KV cache
-    print("\nTesting KV cache with string input...")
-    try:
-        string_prompt = llm.build_vllm_kv_cache("You are a helpful assistant.")
-        print(f"✓ String KV cache built successfully. Prompt length: {len(string_prompt)}")
-    except Exception as e:
-        print(f"✗ Failed to build string KV cache: {e}")
-
-    # Test with list of strings input for KV cache
-    print("\nTesting KV cache with list of strings input...")
-    try:
-        list_prompt = llm.build_vllm_kv_cache(["You are helpful.", "You are knowledgeable."])
-        print(f"✓ List KV cache built successfully. Prompt length: {len(list_prompt)}")
-    except Exception as e:
-        print(f"✗ Failed to build list KV cache: {e}")
-

 if __name__ == "__main__":
     main()
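The slimmed-down example only exercises MessageList input, but build_vllm_kv_cache in src/memos/llms/vllm.py (diffed below) still accepts plain strings and lists of strings. A minimal usage sketch, reusing the `llm` object from the example above; the first string is illustrative content, not part of this commit:

# Sketch only: `llm` is the VLLMLLM instance configured in the example above.
# str and list[str] inputs are wrapped into a single system message prefixed with
# "Below is some information about the user."; a MessageList is passed through as-is.
prompt_from_str = llm.build_vllm_kv_cache("The user prefers concise answers.")
prompt_from_list = llm.build_vllm_kv_cache(["You are helpful.", "You are knowledgeable."])
prompt_from_messages = llm.build_vllm_kv_cache(
    [{"role": "system", "content": "You are a helpful AI assistant."}]
)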

src/memos/configs/llm.py

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ class HFLLMConfig(BaseLLMConfig):
 class VLLMLLMConfig(BaseLLMConfig):
     api_key: str = Field(default="", description="API key for vLLM (optional for local server)")
     api_base: str = Field(
-        default="http://localhost:8088",
+        default="http://localhost:8088/v1",
         description="Base URL for vLLM API",
     )
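For context on why the default gained the /v1 suffix: vLLM's OpenAI-compatible server exposes its chat route under /v1, and the OpenAI client appends /chat/completions to whatever base_url it is given. A minimal sketch of the client construction this default is meant to support (port and dummy key mirror the defaults in this commit):

import openai

# Sketch: mirrors how VLLMLLM builds its client (see src/memos/llms/vllm.py below).
# Without the /v1 prefix the request would miss vLLM's /v1/chat/completions route.
client = openai.Client(
    api_key="dummy",  # a local vLLM server does not validate the key
    base_url="http://localhost:8088/v1",
)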

src/memos/llms/vllm.py

Lines changed: 45 additions & 128 deletions
@@ -1,5 +1,5 @@
 import asyncio
-from typing import Optional, Dict, Any
+from typing import Optional, Any, cast

 import torch
 from transformers.cache_utils import DynamicCache
@@ -27,117 +27,63 @@ def __init__(self, config: VLLMLLMConfig):

         # Initialize OpenAI client for API calls
         self.client = None
-        if hasattr(self.config, "api_key") and self.config.api_key:
-            import openai
-            self.client = openai.Client(
-                api_key=self.config.api_key,
-                base_url=getattr(self.config, "api_base", "http://localhost:8088")
-            )
-        else:
-            # Create client without API key for local servers
-            import openai
-            self.client = openai.Client(
-                api_key="dummy",  # vLLM local server doesn't require real API key
-                base_url=getattr(self.config, "api_base", "http://localhost:8088")
-            )
+        api_key = getattr(self.config, "api_key", "dummy")
+        if not api_key:
+            api_key = "dummy"
+
+        import openai
+        self.client = openai.Client(
+            api_key=api_key,
+            base_url=getattr(self.config, "api_base", "http://localhost:8088/v1")
+        )

-    def build_vllm_kv_cache(self, messages) -> str:
+    def build_vllm_kv_cache(self, messages: Any) -> str:
         """
         Build a KV cache from chat messages via one vLLM request.
-        Supports the following input types:
-        - str: Used as a system prompt.
-        - list[str]: Concatenated and used as a system prompt.
-        - list[dict]: Used directly as chat messages.
-        The messages are always converted to a standard chat template.
-        Raises:
-            ValueError: If the resulting prompt is empty after template processing.
-        Returns:
-            str: The constructed prompt string for vLLM KV cache building.
+        Handles str, list[str], and MessageList formats.
         """
-        # Accept multiple input types and convert to standard chat messages
-        if isinstance(messages, str):
-            messages = [
-                {
-                    "role": "system",
-                    "content": f"Below is some information about the user.\n{messages}",
-                }
-            ]
-        elif isinstance(messages, list) and messages and isinstance(messages[0], str):
-            # Handle list of strings
-            str_messages = [str(msg) for msg in messages]
-            messages = [
-                {
-                    "role": "system",
-                    "content": f"Below is some information about the user.\n{' '.join(str_messages)}",
-                }
-            ]
-
-        # Convert messages to prompt string using the same logic as HFLLM
-        # Convert to MessageList format for _messages_to_prompt
+        # 1. Normalize input to a MessageList
+        processed_messages: MessageList = []
         if isinstance(messages, str):
-            message_list = [{"role": "system", "content": messages}]
-        elif isinstance(messages, list) and messages and isinstance(messages[0], str):
-            str_messages = [str(msg) for msg in messages]
-            message_list = [{"role": "system", "content": " ".join(str_messages)}]
-        else:
-            message_list = messages  # Assume it's already in MessageList format
-
-        # Convert to proper MessageList type
-        from memos.types import MessageList
-        typed_message_list: MessageList = []
-        for msg in message_list:
-            if isinstance(msg, dict) and "role" in msg and "content" in msg:
-                typed_message_list.append({
-                    "role": str(msg["role"]),
-                    "content": str(msg["content"])
-                })
-
-        prompt = self._messages_to_prompt(typed_message_list)
+            processed_messages = [{"role": "system", "content": f"Below is some information about the user.\n{messages}"}]
+        elif isinstance(messages, list):
+            if not messages:
+                pass  # Empty list
+            elif isinstance(messages[0], str):
+                str_content = " ".join(str(msg) for msg in messages)
+                processed_messages = [{"role": "system", "content": f"Below is some information about the user.\n{str_content}"}]
+            elif isinstance(messages[0], dict):
+                processed_messages = cast(MessageList, messages)
+
+        # 2. Convert to prompt for logging/return value.
+        prompt = self._messages_to_prompt(processed_messages)

         if not prompt.strip():
-            raise ValueError(
-                "Prompt after chat template is empty, cannot build KV cache. Check your messages input."
-            )
+            raise ValueError("Prompt is empty, cannot build KV cache.")

-        # Send a request to vLLM server to preload the KV cache
-        # This is done by sending a completion request with max_tokens=0
-        # which will cause vLLM to process the input but not generate any output
-        if self.client is not None:
-            # Convert messages to OpenAI format
-            openai_messages = []
-            for msg in messages:
-                openai_messages.append({
-                    "role": msg["role"],
-                    "content": msg["content"]
-                })
-
-            # Send prefill request to vLLM
+        # 3. Send request to vLLM server to preload the KV cache
+        if self.client:
             try:
+                # Use the processed messages for the API call
                 prefill_kwargs = {
-                    "model": "default",  # vLLM uses "default" as model name
-                    "messages": openai_messages,
-                    "max_tokens": 2,  # Don't generate any tokens, just prefill
-                    "temperature": 0.0,  # Use deterministic sampling for prefill
+                    "model": self.config.model_name_or_path,
+                    "messages": processed_messages,
+                    "max_tokens": 2,
+                    "temperature": 0.0,
                     "top_p": 1.0,
-                    "top_k": 1,
                 }
-                prefill_response = self.client.chat.completions.create(**prefill_kwargs)
-                logger.info(f"vLLM KV cache prefill completed for prompt length: {len(prompt)}")
+                self.client.chat.completions.create(**prefill_kwargs)
+                logger.info(f"vLLM KV cache prefill completed for prompt: '{prompt[:100]}...'")
             except Exception as e:
                 logger.warning(f"Failed to prefill vLLM KV cache: {e}")
-                # Continue anyway, as this is not critical for functionality

         return prompt

-    def generate(self, messages: MessageList, past_key_values: Optional[DynamicCache] = None) -> str:
+    def generate(self, messages: MessageList) -> str:
         """
         Generate a response from the model.
-        Args:
-            messages (MessageList): Chat messages for prompt construction.
-        Returns:
-            str: Model response.
         """
-        if self.client is not None:
+        if self.client:
             return self._generate_with_api_client(messages)
         else:
             raise RuntimeError("API client is not available")
@@ -146,60 +92,31 @@ def _generate_with_api_client(self, messages: MessageList) -> str:
         """
         Generate response using vLLM API client.
         """
-        # Convert messages to OpenAI format
-        openai_messages = []
-        for msg in messages:
-            openai_messages.append({
-                "role": msg["role"],
-                "content": msg["content"]
-            })
-
-        # Generate response
-        if self.client is not None:
-            # Create completion request with proper parameter types
+        if self.client:
             completion_kwargs = {
-                "model": "default",  # vLLM uses "default" as model name
-                "messages": openai_messages,
+                "model": self.config.model_name_or_path,
+                "messages": messages,
                 "temperature": float(getattr(self.config, "temperature", 0.8)),
                 "max_tokens": int(getattr(self.config, "max_tokens", 1024)),
                 "top_p": float(getattr(self.config, "top_p", 0.9)),
             }

-            # Add top_k only if it's greater than 0
-            top_k = getattr(self.config, "top_k", 50)
-            if top_k > 0:
-                completion_kwargs["top_k"] = int(top_k)
-
             response = self.client.chat.completions.create(**completion_kwargs)
+            response_text = response.choices[0].message.content or ""
+            logger.info(f"VLLM API response: {response_text}")
+            return remove_thinking_tags(response_text) if getattr(self.config, "remove_think_prefix", False) else response_text
         else:
             raise RuntimeError("API client is not available")
-
-        response_text = response.choices[0].message.content or ""
-        logger.info(f"VLLM API response: {response_text}")
-
-        return (
-            remove_thinking_tags(response_text)
-            if getattr(self.config, "remove_think_prefix", False)
-            else response_text
-        )

     def _messages_to_prompt(self, messages: MessageList) -> str:
         """
         Convert messages to prompt string.
         """
-        # Simple conversion - can be enhanced with proper chat template
         prompt_parts = []
         for msg in messages:
             role = msg["role"]
             content = msg["content"]
-
-            if role == "system":
-                prompt_parts.append(f"System: {content}")
-            elif role == "user":
-                prompt_parts.append(f"User: {content}")
-            elif role == "assistant":
-                prompt_parts.append(f"Assistant: {content}")
-
+            prompt_parts.append(f"{role.capitalize()}: {content}")
         return "\n".join(prompt_parts)
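As the diff above shows, build_vllm_kv_cache does not return a cache object; it issues one chat-completion call with max_tokens=2 so the server processes the prompt ahead of the real generation request, then returns the prompt string. A standalone sketch of that request shape, with the model path, port, and message content taken as placeholders from the example file in this commit:

import openai

# Sketch of the warm-up ("prefill") request issued inside build_vllm_kv_cache.
# The model value must match the --model the vLLM server was launched with.
client = openai.Client(api_key="dummy", base_url="http://localhost:8088/v1")
client.chat.completions.create(
    model="/mnt/afs/models/hf_models/Qwen2.5-7B",  # placeholder, from the example above
    messages=[{"role": "system", "content": "You are a helpful AI assistant."}],
    max_tokens=2,      # generate next to nothing; the point is processing the prompt
    temperature=0.0,
    top_p=1.0,
)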