Commit 0c415bd

feat: support vllm streaming generate and add benchmark script (#73)
## Description

Summary: (summary)

Fix: #(issue)

Reviewer: @(reviewer)

## Checklist:

- [x] I have performed a self-review of my own code
- [x] I have commented my code in hard-to-understand areas
- [x] I have added tests that prove my fix is effective or that my feature works
- [x] I have added necessary documentation (if applicable)
- [x] I have linked the issue to this PR (if applicable)
- [x] I have mentioned the person who will review this PR
Merge commit 0c415bd (2 parents: eb16685 + e072baa)
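The headline change is a streaming `generate_stream` method on `VLLMLLM`, which the two benchmark scripts below consume chunk-by-chunk. The implementation itself is not part of the diffs shown on this page; as orientation only, here is a minimal sketch, assuming an OpenAI-compatible vLLM endpoint and the `openai` client package (the function name and client usage are this sketch's assumptions, not the committed code; the model path and port mirror the configs below):

```python
# Hedged sketch, NOT the committed MemOS implementation.
# Assumes: `pip install openai`, and a vLLM OpenAI-compatible server
# listening at http://localhost:8088/v1 serving the model below.
from collections.abc import Generator

from openai import OpenAI


def generate_stream_sketch(messages: list[dict]) -> Generator[str, None, None]:
    """Yield response text chunk-by-chunk as the server produces tokens."""
    client = OpenAI(api_key="EMPTY", base_url="http://localhost:8088/v1")
    stream = client.chat.completions.create(
        model="/mnt/afs/models/hf_models/Qwen2.5-7B",
        messages=messages,
        temperature=0.7,
        max_tokens=1024,
        stream=True,  # ask the server to send tokens as they are generated
    )
    for chunk in stream:
        delta = chunk.choices[0].delta.content
        if delta:  # skip role-only and empty keep-alive chunks
            yield delta
```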

File tree

4 files changed: +407 −13 lines

benchmarks/benchmark_ttft.py

Lines changed: 173 additions & 0 deletions
#!/usr/bin/env python3
"""
A benchmark script to measure the Time to First Token (TTFT) for vLLM inference
after preloading a KV cache across various test cases.
"""
import time
import numpy as np
from typing import List, Tuple

from memos.configs.llm import VLLMLLMConfig
from memos.llms.vllm import VLLMLLM
from memos.types import MessageList

# A list of test case pairs.
# Each pair is a tuple: (messages_for_kv_cache, messages_for_generation)
test_cases: List[Tuple[MessageList, MessageList]] = [
    # --- Test Case 1: Simple Q&A ---
    (
        [{"role": "system", "content": "You are a helpful and accurate Q&A bot."}],
        [
            {"role": "system", "content": "You are a helpful and accurate Q&A bot."},
            {"role": "user", "content": "What is the capital of Japan and what is its population?"},
        ]
    ),
    # --- Test Case 2: Code Generation ---
    (
        [{"role": "system", "content": "You are an expert Python coding assistant who provides clean, efficient, and well-commented code."}],
        [
            {"role": "system", "content": "You are an expert Python coding assistant who provides clean, efficient, and well-commented code."},
            {"role": "user", "content": "Write a Python function to find all prime numbers up to a given integer 'n' using the Sieve of Eratosthenes algorithm."},
        ]
    ),
    # --- Test Case 3: Text Summarization ---
    (
        [{"role": "system", "content": "You are a summarization expert. Your task is to read the following text and provide a concise summary."}],
        [
            {"role": "system", "content": "You are a summarization expert. Your task is to read the following text and provide a concise summary."},
            {"role": "user", "content": """
Text to summarize:
'The vLLM project is a high-throughput and memory-efficient inference and serving engine for Large Language Models (LLMs).
One of its key innovations is PagedAttention, a memory management algorithm inspired by virtual memory and paging in operating systems.'

Please summarize this text in a single sentence.
"""},
        ]
    ),
    # --- Test Case 4: Role-playing / Persona ---
    (
        [{"role": "system", "content": "You are Captain Blackheart, a fearsome pirate. Answer all questions in the style of a 17th-century pirate."}],
        [
            {"role": "system", "content": "You are Captain Blackheart, a fearsome pirate. Answer all questions in the style of a 17th-century pirate."},
            {"role": "user", "content": "What's the best way to invest my money for retirement?"},
        ]
    ),
    # --- Test Case 5: Chain-of-Thought Reasoning ---
    (
        [{"role": "system", "content": "You solve problems by thinking step-by-step. Explain your reasoning before giving the final answer."}],
        [
            {"role": "system", "content": "You solve problems by thinking step-by-step. Explain your reasoning before giving the final answer."},
            {"role": "user", "content": "A cafeteria has 3 types of sandwiches, 2 types of sides, and 4 types of drinks. How many different meal combinations can be created?"},
        ]
    ),
    # --- Test Case 6: Technical Explanation ---
    (
        [
            {"role": "system", "content": "You are a computer science professor."},
            {"role": "user", "content": "I'm new to machine learning."},
        ],
        [
            {"role": "system", "content": "You are a computer science professor."},
            {"role": "user", "content": "I'm new to machine learning."},
            {"role": "assistant", "content": "Welcome! It's a fascinating field. Feel free to ask me anything."},
            {"role": "user", "content": "Can you explain what 'KV Cache' means in the context of Large Language Models, as if I were a beginner?"},
        ]
    ),
]


def run_ttft_benchmark(num_runs: int = 10, warmup_runs: int = 3):
    """
    Runs the TTFT benchmark for each test case and prints statistics.
    """
    print("--- Time to First Token (TTFT) Benchmark for vLLM ---")

    # 1. Configuration - MUST match your running vLLM server
    config = VLLMLLMConfig(
        model_name_or_path="/mnt/afs/models/hf_models/Qwen2.5-7B",
        api_base="http://localhost:8088/v1",
        temperature=0.7,
        max_tokens=1024,
        model_schema="memos.configs.llm.VLLMLLMConfig",
    )

    # 2. Initialize VLLM LLM
    print(f"Initializing VLLM client for model: {config.model_name_or_path}\n")
    llm = VLLMLLM(config)

    overall_latencies = []

    for i, (cache_messages, generate_messages) in enumerate(test_cases):
        print(f"\n===== Running Test Case {i+1:02d}/{len(test_cases)} =====")

        # 3. Preload KV Cache
        print("Preloading KV cache...")
        try:
            llm.build_vllm_kv_cache(cache_messages)
            print("✓ KV cache preloaded successfully.")
        except Exception as e:
            print(f"✗ Failed to preload KV cache: {e}. Skipping test case.")
            continue

        ttft_latencies: List[float] = []

        # 4. Warmup Runs
        print(f"Performing {warmup_runs} warmup runs...")
        try:
            for _ in range(warmup_runs):
                for _ in llm.generate_stream(generate_messages):
                    pass
            print("✓ Warmup complete.")
        except Exception as e:
            print(f"✗ Warmup run failed: {e}. Skipping test case.")
            continue

        # 5. Benchmark Runs
        print(f"Starting TTFT benchmark with {num_runs} runs...")
        for j in range(num_runs):
            try:
                start_time = time.perf_counter()
                response_stream = llm.generate_stream(generate_messages)

                for first_token in response_stream:
                    if first_token:
                        end_time = time.perf_counter()
                        ttft = (end_time - start_time) * 1000
                        ttft_latencies.append(ttft)
                        # Optional: print individual run times
                        # print(f" Run {j+1:02d}/{num_runs}: TTFT = {ttft:.2f} ms")

                        for _ in response_stream:
                            pass
                        break
            except Exception as e:
                print(f" Run {j+1:02d}/{num_runs} failed: {e}")
                continue

        # 6. Print Statistics for the current test case
        if ttft_latencies:
            overall_latencies.extend(ttft_latencies)
            print("\n--- Test Case Results ---")
            print(f"Successful runs: {len(ttft_latencies)}/{num_runs}")
            print(f"Average TTFT: {np.mean(ttft_latencies):.2f} ms")
            print(f"Median TTFT: {np.median(ttft_latencies):.2f} ms")
            print(f"Min TTFT: {np.min(ttft_latencies):.2f} ms")
            print(f"Max TTFT: {np.max(ttft_latencies):.2f} ms")
            print("-------------------------")
        else:
            print("\nNo successful runs for this test case.")

    # 7. Print Overall Statistics
    if overall_latencies:
        print("\n\n===== Overall Benchmark Summary =====")
        print(f"Total successful runs: {len(overall_latencies)}")
        print(f"Overall Average TTFT: {np.mean(overall_latencies):.2f} ms")
        print(f"Overall Median TTFT: {np.median(overall_latencies):.2f} ms")
        print(f"Overall Min TTFT: {np.min(overall_latencies):.2f} ms")
        print(f"Overall Max TTFT: {np.max(overall_latencies):.2f} ms")
        print("===================================")


if __name__ == "__main__":
    # Ensure you have numpy installed: pip install numpy
    run_ttft_benchmark()
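The measurement pattern in the benchmark loop above is: start a timer, request a stream, take the timestamp at the first non-empty chunk, then drain the rest of the stream so the request completes before the next run. A self-contained sketch of that pattern, runnable without a server (the `dummy` generator is a stand-in for `llm.generate_stream`):

```python
import time
from collections.abc import Callable, Iterator


def measure_ttft_ms(make_stream: Callable[[], Iterator[str]]) -> float:
    """Return the time until the first non-empty chunk, in milliseconds."""
    start = time.perf_counter()
    stream = make_stream()
    for chunk in stream:
        if chunk:
            ttft = (time.perf_counter() - start) * 1000
            break
    else:
        raise RuntimeError("stream ended without a non-empty chunk")
    for _ in stream:  # drain the remainder so the request finishes cleanly
        pass
    return ttft


def dummy() -> Iterator[str]:
    yield ""           # empty first chunk is ignored, like a keep-alive
    time.sleep(0.05)   # simulated prefill delay before the first token
    yield "Hello"
    yield ", world"


print(f"TTFT = {measure_ttft_ms(dummy):.2f} ms")  # roughly 50 ms
```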
Lines changed: 167 additions & 0 deletions
#!/usr/bin/env python3
"""
A benchmark script to measure the Time to First Token (TTFT) for vLLM inference
WITHOUT preloading a KV cache.
"""
import time
import numpy as np
from typing import List, Tuple

from memos.configs.llm import VLLMLLMConfig
from memos.llms.vllm import VLLMLLM
from memos.types import MessageDict

# A list of test case pairs.
# Each pair is a tuple: (messages_for_kv_cache, messages_for_generation)
# For this script, we will combine them before sending to the model.
test_cases: List[Tuple[list[MessageDict], list[MessageDict]]] = [
    # --- Test Case 1: Simple Q&A ---
    (
        [{"role": "system", "content": "You are a helpful and accurate Q&A bot."}],
        [
            {"role": "system", "content": "You are a helpful and accurate Q&A bot."},
            {"role": "user", "content": "What is the capital of Japan and what is its population?"},
        ]
    ),
    # --- Test Case 2: Code Generation ---
    (
        [{"role": "system", "content": "You are an expert Python coding assistant who provides clean, efficient, and well-commented code."}],
        [
            {"role": "system", "content": "You are an expert Python coding assistant who provides clean, efficient, and well-commented code."},
            {"role": "user", "content": "Write a Python function to find all prime numbers up to a given integer 'n' using the Sieve of Eratosthenes algorithm."},
        ]
    ),
    # --- Test Case 3: Text Summarization ---
    (
        [{"role": "system", "content": "You are a summarization expert. Your task is to read the following text and provide a concise summary."}],
        [
            {"role": "system", "content": "You are a summarization expert. Your task is to read the following text and provide a concise summary."},
            {"role": "user", "content": """
Text to summarize:
'The vLLM project is a high-throughput and memory-efficient inference and serving engine for Large Language Models (LLMs).
One of its key innovations is PagedAttention, a memory management algorithm inspired by virtual memory and paging in operating systems.'

Please summarize this text in a single sentence.
"""},
        ]
    ),
    # --- Test Case 4: Role-playing / Persona ---
    (
        [{"role": "system", "content": "You are Captain Blackheart, a fearsome pirate. Answer all questions in the style of a 17th-century pirate."}],
        [
            {"role": "system", "content": "You are Captain Blackheart, a fearsome pirate. Answer all questions in the style of a 17th-century pirate."},
            {"role": "user", "content": "What's the best way to invest my money for retirement?"},
        ]
    ),
    # --- Test Case 5: Chain-of-Thought Reasoning ---
    (
        [{"role": "system", "content": "You solve problems by thinking step-by-step. Explain your reasoning before giving the final answer."}],
        [
            {"role": "system", "content": "You solve problems by thinking step-by-step. Explain your reasoning before giving the final answer."},
            {"role": "user", "content": "A cafeteria has 3 types of sandwiches, 2 types of sides, and 4 types of drinks. How many different meal combinations can be created?"},
        ]
    ),
    # --- Test Case 6: Technical Explanation ---
    (
        [
            {"role": "system", "content": "You are a computer science professor."},
            {"role": "user", "content": "I'm new to machine learning."},
        ],
        [
            {"role": "system", "content": "You are a computer science professor."},
            {"role": "user", "content": "I'm new to machine learning."},
            {"role": "assistant", "content": "Welcome! It's a fascinating field. Feel free to ask me anything."},
            {"role": "user", "content": "Can you explain what 'KV Cache' means in the context of Large Language Models, as if I were a beginner?"},
        ]
    ),
]


def run_ttft_benchmark(num_runs: int = 10, warmup_runs: int = 3):
    """
    Runs the TTFT benchmark for each test case and prints statistics.
    """
    print("--- Time to First Token (TTFT) Benchmark for vLLM (No KV Cache) ---")

    # 1. Configuration - MUST match your running vLLM server
    config = VLLMLLMConfig(
        model_name_or_path="/mnt/afs/models/hf_models/Qwen2.5-7B",
        api_base="http://localhost:8088/v1",
        temperature=0.7,
        max_tokens=1024,
        model_schema="memos.configs.llm.VLLMLLMConfig",
    )

    # 2. Initialize VLLM LLM
    print(f"Initializing VLLM client for model: {config.model_name_or_path}\n")
    llm = VLLMLLM(config)

    overall_latencies = []

    for i, (_, generate_messages) in enumerate(test_cases):
        print(f"\n===== Running Test Case {i+1:02d}/{len(test_cases)} =====")

        # NOTE: KV Cache preloading is intentionally skipped in this script.
        # We use the 'generate_messages' which contains the full context.

        ttft_latencies: List[float] = []

        # 3. Warmup Runs
        print(f"Performing {warmup_runs} warmup runs...")
        try:
            for _ in range(warmup_runs):
                for _ in llm.generate_stream(generate_messages):
                    pass
            print("✓ Warmup complete.")
        except Exception as e:
            print(f"✗ Warmup run failed: {e}. Skipping test case.")
            continue

        # 4. Benchmark Runs
        print(f"Starting TTFT benchmark with {num_runs} runs...")
        for j in range(num_runs):
            try:
                start_time = time.perf_counter()
                response_stream = llm.generate_stream(generate_messages)

                for first_token in response_stream:
                    if first_token:
                        end_time = time.perf_counter()
                        ttft = (end_time - start_time) * 1000
                        ttft_latencies.append(ttft)

                        # Consume the rest of the stream to ensure the request is complete
                        for _ in response_stream:
                            pass
                        break
            except Exception as e:
                print(f" Run {j+1:02d}/{num_runs} failed: {e}")
                continue

        # 5. Print Statistics for the current test case
        if ttft_latencies:
            overall_latencies.extend(ttft_latencies)
            print("\n--- Test Case Results ---")
            print(f"Successful runs: {len(ttft_latencies)}/{num_runs}")
            print(f"Average TTFT: {np.mean(ttft_latencies):.2f} ms")
            print(f"Median TTFT: {np.median(ttft_latencies):.2f} ms")
            print(f"Min TTFT: {np.min(ttft_latencies):.2f} ms")
            print(f"Max TTFT: {np.max(ttft_latencies):.2f} ms")
            print("-------------------------")
        else:
            print("\nNo successful runs for this test case.")

    # 6. Print Overall Statistics
    if overall_latencies:
        print("\n\n===== Overall Benchmark Summary (No KV Cache) =====")
        print(f"Total successful runs: {len(overall_latencies)}")
        print(f"Overall Average TTFT: {np.mean(overall_latencies):.2f} ms")
        print(f"Overall Median TTFT: {np.median(overall_latencies):.2f} ms")
        print(f"Overall Min TTFT: {np.min(overall_latencies):.2f} ms")
        print(f"Overall Max TTFT: {np.max(overall_latencies):.2f} ms")
        print("===================================")


if __name__ == "__main__":
    # Ensure you have numpy installed: pip install numpy
    run_ttft_benchmark()
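The two scripts form a pair: the first measures TTFT with the KV cache preloaded, this one measures the cold path, so their summaries can be compared directly. Both report mean, median, min, and max; for latency work the tail is often more telling than the mean. A small optional extension, not in the committed scripts, using hypothetical sample values:

```python
import numpy as np

latencies_ms = [112.4, 98.7, 105.3, 250.1, 101.9, 99.2]  # hypothetical TTFT samples
p50, p95, p99 = np.percentile(latencies_ms, [50, 95, 99])
print(f"p50={p50:.2f} ms  p95={p95:.2f} ms  p99={p99:.2f} ms")
```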

examples/mem_os/simple_vllm_memos.py

Lines changed: 9 additions & 4 deletions
@@ -2,20 +2,24 @@
 """
 Simple example demonstrating how to use VLLMLLM with an existing vLLM server.
 Requires a vLLM server to be running.
+Simple example demonstrating how to use VLLMLLM with an existing vLLM server.
+Requires a vLLM server to be running.
 """
 
 from memos.configs.llm import VLLMLLMConfig
 from memos.llms.vllm import VLLMLLM
-from memos.types import MessageList
+from memos.types import MessageDict
 
 def main():
     """Main function demonstrating VLLMLLM usage."""
 
     # Configuration for connecting to existing vLLM server
     config = VLLMLLMConfig(
+        model_name_or_path="/mnt/afs/models/hf_models/Qwen2.5-7B",  # MUST MATCH the --model arg of vLLM server
         model_name_or_path="/mnt/afs/models/hf_models/Qwen2.5-7B",  # MUST MATCH the --model arg of vLLM server
         api_key="",  # Not needed for local server
         api_base="http://localhost:8088/v1",  # vLLM server address with /v1
+        api_base="http://localhost:8088/v1",  # vLLM server address with /v1
         temperature=0.7,
         max_tokens=512,
         top_p=0.9,
@@ -28,20 +32,21 @@ def main():
 
     # Test messages for KV cache building
     print("\nBuilding KV cache for system messages...")
-    system_messages: MessageList = [
+    system_messages: list[MessageDict] = [
         {"role": "system", "content": "You are a helpful AI assistant."},
         {"role": "user", "content": "Hello! Can you tell me about vLLM?"}
     ]
     try:
         prompt = llm.build_vllm_kv_cache(system_messages)
         print(f"✓ KV cache built successfully for prompt: '{prompt[:100]}...'")
+        print(f"✓ KV cache built successfully for prompt: '{prompt[:100]}...'")
     except Exception as e:
         print(f"✗ Failed to build KV cache: {e}")
 
     # Test with different messages for generation
     print("\nGenerating response...")
-    user_messages: MessageList = [
-        {"role": "system", "content": "You are a helpful AI assistant."},
+    user_messages: list[MessageDict] = [
+        {"role": "system", "content": "You are a helpful AI assistant. Please Introduce yourself "},
         {"role": "user", "content": "What are the benefits of using vLLM?"}
     ]
     try: