Skip to content

Commit ed07e70

Browse files
committed
feat: update feat
1 parent 75ef1b9 commit ed07e70

33 files changed

+780
-797
lines changed

benchmarks/benchmark_ttft.py

Lines changed: 85 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -3,62 +3,105 @@
33
A benchmark script to measure the Time to First Token (TTFT) for vLLM inference
44
after preloading a KV cache across various test cases.
55
"""
6+
67
import time
8+
79
import numpy as np
8-
from typing import List, Tuple
910

1011
from memos.configs.llm import VLLMLLMConfig
1112
from memos.llms.vllm import VLLMLLM
1213
from memos.types import MessageList
1314

15+
1416
# A list of test case pairs.
1517
# Each pair is a tuple: (messages_for_kv_cache, messages_for_generation)
16-
test_cases: List[Tuple[MessageList, MessageList]] = [
18+
test_cases: list[tuple[MessageList, MessageList]] = [
1719
# --- Test Case 1: Simple Q&A ---
1820
(
1921
[{"role": "system", "content": "You are a helpful and accurate Q&A bot."}],
2022
[
2123
{"role": "system", "content": "You are a helpful and accurate Q&A bot."},
2224
{"role": "user", "content": "What is the capital of Japan and what is its population?"},
23-
]
25+
],
2426
),
2527
# --- Test Case 2: Code Generation ---
2628
(
27-
[{"role": "system", "content": "You are an expert Python coding assistant who provides clean, efficient, and well-commented code."}],
2829
[
29-
{"role": "system", "content": "You are an expert Python coding assistant who provides clean, efficient, and well-commented code."},
30-
{"role": "user", "content": "Write a Python function to find all prime numbers up to a given integer 'n' using the Sieve of Eratosthenes algorithm."},
31-
]
30+
{
31+
"role": "system",
32+
"content": "You are an expert Python coding assistant who provides clean, efficient, and well-commented code.",
33+
}
34+
],
35+
[
36+
{
37+
"role": "system",
38+
"content": "You are an expert Python coding assistant who provides clean, efficient, and well-commented code.",
39+
},
40+
{
41+
"role": "user",
42+
"content": "Write a Python function to find all prime numbers up to a given integer 'n' using the Sieve of Eratosthenes algorithm.",
43+
},
44+
],
3245
),
3346
# --- Test Case 3: Text Summarization ---
3447
(
35-
[{"role": "system", "content": "You are a summarization expert. Your task is to read the following text and provide a concise summary."}],
3648
[
37-
{"role": "system", "content": "You are a summarization expert. Your task is to read the following text and provide a concise summary."},
38-
{"role": "user", "content": """
49+
{
50+
"role": "system",
51+
"content": "You are a summarization expert. Your task is to read the following text and provide a concise summary.",
52+
}
53+
],
54+
[
55+
{
56+
"role": "system",
57+
"content": "You are a summarization expert. Your task is to read the following text and provide a concise summary.",
58+
},
59+
{
60+
"role": "user",
61+
"content": """
3962
Text to summarize:
40-
'The vLLM project is a high-throughput and memory-efficient inference and serving engine for Large Language Models (LLMs).
63+
'The vLLM project is a high-throughput and memory-efficient inference and serving engine for Large Language Models (LLMs).
4164
One of its key innovations is PagedAttention, a memory management algorithm inspired by virtual memory and paging in operating systems.'
42-
65+
4366
Please summarize this text in a single sentence.
44-
"""},
45-
]
67+
""",
68+
},
69+
],
4670
),
4771
# --- Test Case 4: Role-playing / Persona ---
4872
(
49-
[{"role": "system", "content": "You are Captain Blackheart, a fearsome pirate. Answer all questions in the style of a 17th-century pirate."}],
5073
[
51-
{"role": "system", "content": "You are Captain Blackheart, a fearsome pirate. Answer all questions in the style of a 17th-century pirate."},
74+
{
75+
"role": "system",
76+
"content": "You are Captain Blackheart, a fearsome pirate. Answer all questions in the style of a 17th-century pirate.",
77+
}
78+
],
79+
[
80+
{
81+
"role": "system",
82+
"content": "You are Captain Blackheart, a fearsome pirate. Answer all questions in the style of a 17th-century pirate.",
83+
},
5284
{"role": "user", "content": "What's the best way to invest my money for retirement?"},
53-
]
85+
],
5486
),
5587
# --- Test Case 5: Chain-of-Thought Reasoning ---
5688
(
57-
[{"role": "system", "content": "You solve problems by thinking step-by-step. Explain your reasoning before giving the final answer."}],
5889
[
59-
{"role": "system", "content": "You solve problems by thinking step-by-step. Explain your reasoning before giving the final answer."},
60-
{"role": "user", "content": "A cafeteria has 3 types of sandwiches, 2 types of sides, and 4 types of drinks. How many different meal combinations can be created?"},
61-
]
90+
{
91+
"role": "system",
92+
"content": "You solve problems by thinking step-by-step. Explain your reasoning before giving the final answer.",
93+
}
94+
],
95+
[
96+
{
97+
"role": "system",
98+
"content": "You solve problems by thinking step-by-step. Explain your reasoning before giving the final answer.",
99+
},
100+
{
101+
"role": "user",
102+
"content": "A cafeteria has 3 types of sandwiches, 2 types of sides, and 4 types of drinks. How many different meal combinations can be created?",
103+
},
104+
],
62105
),
63106
# --- Test Case 6: Technical Explanation ---
64107
(
@@ -69,9 +112,15 @@
69112
[
70113
{"role": "system", "content": "You are a computer science professor."},
71114
{"role": "user", "content": "I'm new to machine learning."},
72-
{"role": "assistant", "content": "Welcome! It's a fascinating field. Feel free to ask me anything."},
73-
{"role": "user", "content": "Can you explain what 'KV Cache' means in the context of Large Language Models, as if I were a beginner?"},
74-
]
115+
{
116+
"role": "assistant",
117+
"content": "Welcome! It's a fascinating field. Feel free to ask me anything.",
118+
},
119+
{
120+
"role": "user",
121+
"content": "Can you explain what 'KV Cache' means in the context of Large Language Models, as if I were a beginner?",
122+
},
123+
],
75124
),
76125
]
77126

@@ -81,7 +130,7 @@ def run_ttft_benchmark(num_runs: int = 10, warmup_runs: int = 3):
81130
Runs the TTFT benchmark for each test case and prints statistics.
82131
"""
83132
print("--- Time to First Token (TTFT) Benchmark for vLLM ---")
84-
133+
85134
# 1. Configuration - MUST match your running vLLM server
86135
config = VLLMLLMConfig(
87136
model_name_or_path="/mnt/afs/models/hf_models/Qwen2.5-7B",
@@ -90,16 +139,16 @@ def run_ttft_benchmark(num_runs: int = 10, warmup_runs: int = 3):
90139
max_tokens=1024,
91140
model_schema="memos.configs.llm.VLLMLLMConfig",
92141
)
93-
142+
94143
# 2. Initialize VLLM LLM
95144
print(f"Initializing VLLM client for model: {config.model_name_or_path}\n")
96145
llm = VLLMLLM(config)
97-
146+
98147
overall_latencies = []
99148

100149
for i, (cache_messages, generate_messages) in enumerate(test_cases):
101-
print(f"\n===== Running Test Case {i+1:02d}/{len(test_cases)} =====")
102-
150+
print(f"\n===== Running Test Case {i + 1:02d}/{len(test_cases)} =====")
151+
103152
# 3. Preload KV Cache
104153
print("Preloading KV cache...")
105154
try:
@@ -109,7 +158,7 @@ def run_ttft_benchmark(num_runs: int = 10, warmup_runs: int = 3):
109158
print(f"✗ Failed to preload KV cache: {e}. Skipping test case.")
110159
continue
111160

112-
ttft_latencies: List[float] = []
161+
ttft_latencies: list[float] = []
113162

114163
# 4. Warmup Runs
115164
print(f"Performing {warmup_runs} warmup runs...")
@@ -121,29 +170,27 @@ def run_ttft_benchmark(num_runs: int = 10, warmup_runs: int = 3):
121170
except Exception as e:
122171
print(f"✗ Warmup run failed: {e}. Skipping test case.")
123172
continue
124-
173+
125174
# 5. Benchmark Runs
126175
print(f"Starting TTFT benchmark with {num_runs} runs...")
127176
for j in range(num_runs):
128177
try:
129178
start_time = time.perf_counter()
130179
response_stream = llm.generate_stream(generate_messages)
131-
180+
132181
for first_token in response_stream:
133182
if first_token:
134183
end_time = time.perf_counter()
135184
ttft = (end_time - start_time) * 1000
136185
ttft_latencies.append(ttft)
137-
# Optional: print individual run times
138-
# print(f" Run {j+1:02d}/{num_runs}: TTFT = {ttft:.2f} ms")
139-
186+
140187
for _ in response_stream:
141188
pass
142189
break
143190
except Exception as e:
144-
print(f" Run {j+1:02d}/{num_runs} failed: {e}")
191+
print(f" Run {j + 1:02d}/{num_runs} failed: {e}")
145192
continue
146-
193+
147194
# 6. Print Statistics for the current test case
148195
if ttft_latencies:
149196
overall_latencies.extend(ttft_latencies)
@@ -170,4 +217,4 @@ def run_ttft_benchmark(num_runs: int = 10, warmup_runs: int = 3):
170217

171218
if __name__ == "__main__":
172219
# Ensure you have numpy installed: pip install numpy
173-
run_ttft_benchmark()
220+
run_ttft_benchmark()

0 commit comments

Comments
 (0)