Commit 0c415bd

feat: support vllm streaming generate and add benchmark script (#73)
## Description

Summary: (summary)

Fix: #(issue)

Reviewer: @(reviewer)

## Checklist:

- [x] I have performed a self-review of my own code
- [x] I have commented my code in hard-to-understand areas
- [x] I have added tests that prove my fix is effective or that my feature works
- [x] I have added necessary documentation (if applicable)
- [x] I have linked the issue to this PR (if applicable)
- [x] I have mentioned the person who will review this PR
Merge commit 0c415bd (2 parents: eb16685 + e072baa)
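The headline change is a streaming `generate_stream` method on `VLLMLLM`, which the two benchmark scripts below consume chunk-by-chunk. The implementation itself is not part of the diffs shown on this page; as orientation only, here is a minimal sketch, assuming an OpenAI-compatible vLLM endpoint and the `openai` client package (the function name and client usage are this sketch's assumptions, not the committed code; the model path and port mirror the configs below):

```python
# Hedged sketch, NOT the committed MemOS implementation.
# Assumes: `pip install openai`, and a vLLM OpenAI-compatible server
# listening at http://localhost:8088/v1 serving the model below.
from collections.abc import Generator

from openai import OpenAI


def generate_stream_sketch(messages: list[dict]) -> Generator[str, None, None]:
    """Yield response text chunk-by-chunk as the server produces tokens."""
    client = OpenAI(api_key="EMPTY", base_url="http://localhost:8088/v1")
    stream = client.chat.completions.create(
        model="/mnt/afs/models/hf_models/Qwen2.5-7B",
        messages=messages,
        temperature=0.7,
        max_tokens=1024,
        stream=True,  # ask the server to send tokens as they are generated
    )
    for chunk in stream:
        delta = chunk.choices[0].delta.content
        if delta:  # skip role-only and empty keep-alive chunks
            yield delta
```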

File tree

4 files changed: +407 −13 lines

benchmarks/benchmark_ttft.py

Lines changed: 173 additions & 0 deletions
#!/usr/bin/env python3
"""
A benchmark script to measure the Time to First Token (TTFT) for vLLM inference
after preloading a KV cache across various test cases.
"""
import time
import numpy as np
from typing import List, Tuple

from memos.configs.llm import VLLMLLMConfig
from memos.llms.vllm import VLLMLLM
from memos.types import MessageList

# A list of test case pairs.
# Each pair is a tuple: (messages_for_kv_cache, messages_for_generation)
test_cases: List[Tuple[MessageList, MessageList]] = [
    # --- Test Case 1: Simple Q&A ---
    (
        [{"role": "system", "content": "You are a helpful and accurate Q&A bot."}],
        [
            {"role": "system", "content": "You are a helpful and accurate Q&A bot."},
            {"role": "user", "content": "What is the capital of Japan and what is its population?"},
        ]
    ),
    # --- Test Case 2: Code Generation ---
    (
        [{"role": "system", "content": "You are an expert Python coding assistant who provides clean, efficient, and well-commented code."}],
        [
            {"role": "system", "content": "You are an expert Python coding assistant who provides clean, efficient, and well-commented code."},
            {"role": "user", "content": "Write a Python function to find all prime numbers up to a given integer 'n' using the Sieve of Eratosthenes algorithm."},
        ]
    ),
    # --- Test Case 3: Text Summarization ---
    (
        [{"role": "system", "content": "You are a summarization expert. Your task is to read the following text and provide a concise summary."}],
        [
            {"role": "system", "content": "You are a summarization expert. Your task is to read the following text and provide a concise summary."},
            {"role": "user", "content": """
Text to summarize:
'The vLLM project is a high-throughput and memory-efficient inference and serving engine for Large Language Models (LLMs).
One of its key innovations is PagedAttention, a memory management algorithm inspired by virtual memory and paging in operating systems.'

Please summarize this text in a single sentence.
"""},
        ]
    ),
    # --- Test Case 4: Role-playing / Persona ---
    (
        [{"role": "system", "content": "You are Captain Blackheart, a fearsome pirate. Answer all questions in the style of a 17th-century pirate."}],
        [
            {"role": "system", "content": "You are Captain Blackheart, a fearsome pirate. Answer all questions in the style of a 17th-century pirate."},
            {"role": "user", "content": "What's the best way to invest my money for retirement?"},
        ]
    ),
    # --- Test Case 5: Chain-of-Thought Reasoning ---
    (
        [{"role": "system", "content": "You solve problems by thinking step-by-step. Explain your reasoning before giving the final answer."}],
        [
            {"role": "system", "content": "You solve problems by thinking step-by-step. Explain your reasoning before giving the final answer."},
            {"role": "user", "content": "A cafeteria has 3 types of sandwiches, 2 types of sides, and 4 types of drinks. How many different meal combinations can be created?"},
        ]
    ),
    # --- Test Case 6: Technical Explanation ---
    (
        [
            {"role": "system", "content": "You are a computer science professor."},
            {"role": "user", "content": "I'm new to machine learning."},
        ],
        [
            {"role": "system", "content": "You are a computer science professor."},
            {"role": "user", "content": "I'm new to machine learning."},
            {"role": "assistant", "content": "Welcome! It's a fascinating field. Feel free to ask me anything."},
            {"role": "user", "content": "Can you explain what 'KV Cache' means in the context of Large Language Models, as if I were a beginner?"},
        ]
    ),
]


def run_ttft_benchmark(num_runs: int = 10, warmup_runs: int = 3):
    """
    Runs the TTFT benchmark for each test case and prints statistics.
    """
    print("--- Time to First Token (TTFT) Benchmark for vLLM ---")

    # 1. Configuration - MUST match your running vLLM server
    config = VLLMLLMConfig(
        model_name_or_path="/mnt/afs/models/hf_models/Qwen2.5-7B",
        api_base="http://localhost:8088/v1",
        temperature=0.7,
        max_tokens=1024,
        model_schema="memos.configs.llm.VLLMLLMConfig",
    )

    # 2. Initialize VLLM LLM
    print(f"Initializing VLLM client for model: {config.model_name_or_path}\n")
    llm = VLLMLLM(config)

    overall_latencies = []

    for i, (cache_messages, generate_messages) in enumerate(test_cases):
        print(f"\n===== Running Test Case {i+1:02d}/{len(test_cases)} =====")

        # 3. Preload KV Cache
        print("Preloading KV cache...")
        try:
            llm.build_vllm_kv_cache(cache_messages)
            print("✓ KV cache preloaded successfully.")
        except Exception as e:
            print(f"✗ Failed to preload KV cache: {e}. Skipping test case.")
            continue

        ttft_latencies: List[float] = []

        # 4. Warmup Runs
        print(f"Performing {warmup_runs} warmup runs...")
        try:
            for _ in range(warmup_runs):
                for _ in llm.generate_stream(generate_messages):
                    pass
            print("✓ Warmup complete.")
        except Exception as e:
            print(f"✗ Warmup run failed: {e}. Skipping test case.")
            continue

        # 5. Benchmark Runs
        print(f"Starting TTFT benchmark with {num_runs} runs...")
        for j in range(num_runs):
            try:
                start_time = time.perf_counter()
                response_stream = llm.generate_stream(generate_messages)

                for first_token in response_stream:
                    if first_token:
                        end_time = time.perf_counter()
                        ttft = (end_time - start_time) * 1000
                        ttft_latencies.append(ttft)
                        # Optional: print individual run times
                        # print(f" Run {j+1:02d}/{num_runs}: TTFT = {ttft:.2f} ms")

                        for _ in response_stream:
                            pass
                        break
            except Exception as e:
                print(f" Run {j+1:02d}/{num_runs} failed: {e}")
                continue

        # 6. Print Statistics for the current test case
        if ttft_latencies:
            overall_latencies.extend(ttft_latencies)
            print("\n--- Test Case Results ---")
            print(f"Successful runs: {len(ttft_latencies)}/{num_runs}")
            print(f"Average TTFT: {np.mean(ttft_latencies):.2f} ms")
            print(f"Median TTFT: {np.median(ttft_latencies):.2f} ms")
            print(f"Min TTFT: {np.min(ttft_latencies):.2f} ms")
            print(f"Max TTFT: {np.max(ttft_latencies):.2f} ms")
            print("-------------------------")
        else:
            print("\nNo successful runs for this test case.")

    # 7. Print Overall Statistics
    if overall_latencies:
        print("\n\n===== Overall Benchmark Summary =====")
        print(f"Total successful runs: {len(overall_latencies)}")
        print(f"Overall Average TTFT: {np.mean(overall_latencies):.2f} ms")
        print(f"Overall Median TTFT: {np.median(overall_latencies):.2f} ms")
        print(f"Overall Min TTFT: {np.min(overall_latencies):.2f} ms")
        print(f"Overall Max TTFT: {np.max(overall_latencies):.2f} ms")
        print("===================================")


if __name__ == "__main__":
    # Ensure you have numpy installed: pip install numpy
    run_ttft_benchmark()
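The measurement pattern in the benchmark loop above is: start a timer, request a stream, take the timestamp at the first non-empty chunk, then drain the rest of the stream so the request completes before the next run. A self-contained sketch of that pattern, runnable without a server (the `dummy` generator is a stand-in for `llm.generate_stream`):

```python
import time
from collections.abc import Callable, Iterator


def measure_ttft_ms(make_stream: Callable[[], Iterator[str]]) -> float:
    """Return the time until the first non-empty chunk, in milliseconds."""
    start = time.perf_counter()
    stream = make_stream()
    for chunk in stream:
        if chunk:
            ttft = (time.perf_counter() - start) * 1000
            break
    else:
        raise RuntimeError("stream ended without a non-empty chunk")
    for _ in stream:  # drain the remainder so the request finishes cleanly
        pass
    return ttft


def dummy() -> Iterator[str]:
    yield ""           # empty first chunk is ignored, like a keep-alive
    time.sleep(0.05)   # simulated prefill delay before the first token
    yield "Hello"
    yield ", world"


print(f"TTFT = {measure_ttft_ms(dummy):.2f} ms")  # roughly 50 ms
```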
Lines changed: 167 additions & 0 deletions
#!/usr/bin/env python3
"""
A benchmark script to measure the Time to First Token (TTFT) for vLLM inference
WITHOUT preloading a KV cache.
"""
import time
import numpy as np
from typing import List, Tuple

from memos.configs.llm import VLLMLLMConfig
from memos.llms.vllm import VLLMLLM
from memos.types import MessageDict

# A list of test case pairs.
# Each pair is a tuple: (messages_for_kv_cache, messages_for_generation)
# For this script, we will combine them before sending to the model.
test_cases: List[Tuple[list[MessageDict], list[MessageDict]]] = [
    # --- Test Case 1: Simple Q&A ---
    (
        [{"role": "system", "content": "You are a helpful and accurate Q&A bot."}],
        [
            {"role": "system", "content": "You are a helpful and accurate Q&A bot."},
            {"role": "user", "content": "What is the capital of Japan and what is its population?"},
        ]
    ),
    # --- Test Case 2: Code Generation ---
    (
        [{"role": "system", "content": "You are an expert Python coding assistant who provides clean, efficient, and well-commented code."}],
        [
            {"role": "system", "content": "You are an expert Python coding assistant who provides clean, efficient, and well-commented code."},
            {"role": "user", "content": "Write a Python function to find all prime numbers up to a given integer 'n' using the Sieve of Eratosthenes algorithm."},
        ]
    ),
    # --- Test Case 3: Text Summarization ---
    (
        [{"role": "system", "content": "You are a summarization expert. Your task is to read the following text and provide a concise summary."}],
        [
            {"role": "system", "content": "You are a summarization expert. Your task is to read the following text and provide a concise summary."},
            {"role": "user", "content": """
Text to summarize:
'The vLLM project is a high-throughput and memory-efficient inference and serving engine for Large Language Models (LLMs).
One of its key innovations is PagedAttention, a memory management algorithm inspired by virtual memory and paging in operating systems.'

Please summarize this text in a single sentence.
"""},
        ]
    ),
    # --- Test Case 4: Role-playing / Persona ---
    (
        [{"role": "system", "content": "You are Captain Blackheart, a fearsome pirate. Answer all questions in the style of a 17th-century pirate."}],
        [
            {"role": "system", "content": "You are Captain Blackheart, a fearsome pirate. Answer all questions in the style of a 17th-century pirate."},
            {"role": "user", "content": "What's the best way to invest my money for retirement?"},
        ]
    ),
    # --- Test Case 5: Chain-of-Thought Reasoning ---
    (
        [{"role": "system", "content": "You solve problems by thinking step-by-step. Explain your reasoning before giving the final answer."}],
        [
            {"role": "system", "content": "You solve problems by thinking step-by-step. Explain your reasoning before giving the final answer."},
            {"role": "user", "content": "A cafeteria has 3 types of sandwiches, 2 types of sides, and 4 types of drinks. How many different meal combinations can be created?"},
        ]
    ),
    # --- Test Case 6: Technical Explanation ---
    (
        [
            {"role": "system", "content": "You are a computer science professor."},
            {"role": "user", "content": "I'm new to machine learning."},
        ],
        [
            {"role": "system", "content": "You are a computer science professor."},
            {"role": "user", "content": "I'm new to machine learning."},
            {"role": "assistant", "content": "Welcome! It's a fascinating field. Feel free to ask me anything."},
            {"role": "user", "content": "Can you explain what 'KV Cache' means in the context of Large Language Models, as if I were a beginner?"},
        ]
    ),
]


def run_ttft_benchmark(num_runs: int = 10, warmup_runs: int = 3):
    """
    Runs the TTFT benchmark for each test case and prints statistics.
    """
    print("--- Time to First Token (TTFT) Benchmark for vLLM (No KV Cache) ---")

    # 1. Configuration - MUST match your running vLLM server
    config = VLLMLLMConfig(
        model_name_or_path="/mnt/afs/models/hf_models/Qwen2.5-7B",
        api_base="http://localhost:8088/v1",
        temperature=0.7,
        max_tokens=1024,
        model_schema="memos.configs.llm.VLLMLLMConfig",
    )

    # 2. Initialize VLLM LLM
    print(f"Initializing VLLM client for model: {config.model_name_or_path}\n")
    llm = VLLMLLM(config)

    overall_latencies = []

    for i, (_, generate_messages) in enumerate(test_cases):
        print(f"\n===== Running Test Case {i+1:02d}/{len(test_cases)} =====")

        # NOTE: KV Cache preloading is intentionally skipped in this script.
        # We use the 'generate_messages' which contains the full context.

        ttft_latencies: List[float] = []

        # 3. Warmup Runs
        print(f"Performing {warmup_runs} warmup runs...")
        try:
            for _ in range(warmup_runs):
                for _ in llm.generate_stream(generate_messages):
                    pass
            print("✓ Warmup complete.")
        except Exception as e:
            print(f"✗ Warmup run failed: {e}. Skipping test case.")
            continue

        # 4. Benchmark Runs
        print(f"Starting TTFT benchmark with {num_runs} runs...")
        for j in range(num_runs):
            try:
                start_time = time.perf_counter()
                response_stream = llm.generate_stream(generate_messages)

                for first_token in response_stream:
                    if first_token:
                        end_time = time.perf_counter()
                        ttft = (end_time - start_time) * 1000
                        ttft_latencies.append(ttft)

                        # Consume the rest of the stream to ensure the request is complete
                        for _ in response_stream:
                            pass
                        break
            except Exception as e:
                print(f" Run {j+1:02d}/{num_runs} failed: {e}")
                continue

        # 5. Print Statistics for the current test case
        if ttft_latencies:
            overall_latencies.extend(ttft_latencies)
            print("\n--- Test Case Results ---")
            print(f"Successful runs: {len(ttft_latencies)}/{num_runs}")
            print(f"Average TTFT: {np.mean(ttft_latencies):.2f} ms")
            print(f"Median TTFT: {np.median(ttft_latencies):.2f} ms")
            print(f"Min TTFT: {np.min(ttft_latencies):.2f} ms")
            print(f"Max TTFT: {np.max(ttft_latencies):.2f} ms")
            print("-------------------------")
        else:
            print("\nNo successful runs for this test case.")

    # 6. Print Overall Statistics
    if overall_latencies:
        print("\n\n===== Overall Benchmark Summary (No KV Cache) =====")
        print(f"Total successful runs: {len(overall_latencies)}")
        print(f"Overall Average TTFT: {np.mean(overall_latencies):.2f} ms")
        print(f"Overall Median TTFT: {np.median(overall_latencies):.2f} ms")
        print(f"Overall Min TTFT: {np.min(overall_latencies):.2f} ms")
        print(f"Overall Max TTFT: {np.max(overall_latencies):.2f} ms")
        print("===================================")


if __name__ == "__main__":
    # Ensure you have numpy installed: pip install numpy
    run_ttft_benchmark()
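The two scripts form a pair: the first measures TTFT with the KV cache preloaded, this one measures the cold path, so their summaries can be compared directly. Both report mean, median, min, and max; for latency work the tail is often more telling than the mean. A small optional extension, not in the committed scripts, using hypothetical sample values:

```python
import numpy as np

latencies_ms = [112.4, 98.7, 105.3, 250.1, 101.9, 99.2]  # hypothetical TTFT samples
p50, p95, p99 = np.percentile(latencies_ms, [50, 95, 99])
print(f"p50={p50:.2f} ms  p95={p95:.2f} ms  p99={p99:.2f} ms")
```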

examples/mem_os/simple_vllm_memos.py

Lines changed: 9 additions & 4 deletions
@@ -2,20 +2,24 @@
 """
 Simple example demonstrating how to use VLLMLLM with an existing vLLM server.
 Requires a vLLM server to be running.
+Simple example demonstrating how to use VLLMLLM with an existing vLLM server.
+Requires a vLLM server to be running.
 """
 
 from memos.configs.llm import VLLMLLMConfig
 from memos.llms.vllm import VLLMLLM
-from memos.types import MessageList
+from memos.types import MessageDict
 
 def main():
     """Main function demonstrating VLLMLLM usage."""
 
     # Configuration for connecting to existing vLLM server
     config = VLLMLLMConfig(
+        model_name_or_path="/mnt/afs/models/hf_models/Qwen2.5-7B",  # MUST MATCH the --model arg of vLLM server
         model_name_or_path="/mnt/afs/models/hf_models/Qwen2.5-7B",  # MUST MATCH the --model arg of vLLM server
         api_key="",  # Not needed for local server
         api_base="http://localhost:8088/v1",  # vLLM server address with /v1
+        api_base="http://localhost:8088/v1",  # vLLM server address with /v1
         temperature=0.7,
         max_tokens=512,
         top_p=0.9,
@@ -28,20 +32,21 @@ def main():
 
     # Test messages for KV cache building
     print("\nBuilding KV cache for system messages...")
-    system_messages: MessageList = [
+    system_messages: list[MessageDict] = [
         {"role": "system", "content": "You are a helpful AI assistant."},
         {"role": "user", "content": "Hello! Can you tell me about vLLM?"}
     ]
     try:
         prompt = llm.build_vllm_kv_cache(system_messages)
         print(f"✓ KV cache built successfully for prompt: '{prompt[:100]}...'")
+        print(f"✓ KV cache built successfully for prompt: '{prompt[:100]}...'")
     except Exception as e:
         print(f"✗ Failed to build KV cache: {e}")
 
     # Test with different messages for generation
     print("\nGenerating response...")
-    user_messages: MessageList = [
-        {"role": "system", "content": "You are a helpful AI assistant."},
+    user_messages: list[MessageDict] = [
+        {"role": "system", "content": "You are a helpful AI assistant. Please Introduce yourself "},
         {"role": "user", "content": "What are the benefits of using vLLM?"}
     ]
     try: