Skip to content

Commit 7ff1006

Browse files
refactor: break profile_sla into different files; feat: support vllm_v1 (ai-dynamo#1588)
Signed-off-by: Hongkuan Zhou <tedzhouhk@gmail.com> Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
1 parent d2bec6f commit 7ff1006

File tree

10 files changed

+918
-547
lines changed

10 files changed

+918
-547
lines changed

benchmarks/profiler/profile_sla.py

Lines changed: 47 additions & 544 deletions
Large diffs are not rendered by default.
Lines changed: 242 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
import copy
import logging
from typing import Literal

from dynamo.planner.defaults import WORKER_COMPONENT_NAMES
20+
21+
# Module logger: INFO-level messages are mirrored to the console with a
# timestamped format so profiling progress is visible when run as a script.
# NOTE(review): adding a handler at import time means a process that imports
# this module twice under different names could log duplicate lines — confirm
# this module is only imported once.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
30+
31+
32+
class VllmV0ConfigModifier:
    """Rewrites a vllm_v0 deployment config so a single worker can be
    profiled in isolation (either the prefill or the decode phase)."""

    @classmethod
    def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> dict:
        """Return a copy of ``config`` reduced to one standalone worker.

        Args:
            config: parsed deployment config (component name -> settings dict).
            target: which phase to profile, ``"prefill"`` or ``"decode"``.

        Returns:
            A new config dict; the caller's ``config`` is left untouched.
        """
        # Deep copy: the previous shallow .copy() still shared the nested
        # component dicts with the caller, so the mutations below leaked
        # back into the caller's config.
        config = copy.deepcopy(config)

        decode_worker = WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker
        prefill_worker = WORKER_COMPONENT_NAMES["vllm_v0"].prefill_worker

        # disable planner so it cannot rescale workers mid-profile
        if "Planner" in config:
            config["Planner"]["no-operation"] = True

        if target == "prefill":
            if prefill_worker in config:
                # make PrefillWorker into VllmWorker (take over the decode slot)
                del config[decode_worker]
                config[decode_worker] = config[prefill_worker]
                del config[prefill_worker]

            # to profile prefill, disable prefix caching so every request
            # actually exercises the prefill path
            config[decode_worker]["enable-prefix-caching"] = False
        elif target == "decode":
            if prefill_worker in config:
                del config[prefill_worker]

            # to profile decode, enable prefix caching so the prefill stage
            # is passed quickly via cache hits
            config[decode_worker]["enable-prefix-caching"] = True

        # profile exactly one worker instance
        config[decode_worker]["ServiceArgs"]["workers"] = 1

        # set PP to 1 — the profiler only sweeps TP
        if config[decode_worker].get("pipeline-parallel-size", 0) > 1:
            logger.warning("Currently we only support TP, setting PP to 1")
            config[decode_worker]["pipeline-parallel-size"] = 1

        # always local prefill: no disaggregated remote prefill while profiling
        config[decode_worker]["remote-prefill"] = False
        config[decode_worker]["conditional-disagg"] = False

        return config

    @classmethod
    def set_config_tp_size(cls, config: dict, tp_size: int):
        """Set tensor-parallel size and the matching GPU count in place."""
        worker = config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker]
        worker["tensor-parallel-size"] = tp_size
        worker["ServiceArgs"]["resources"]["gpu"] = tp_size
        return config

    @classmethod
    def get_model_name(cls, config: dict) -> str:
        """Return the served model name, preferring the Common section."""
        if "Common" in config and "served_model_name" in config["Common"]:
            return config["Common"]["served_model_name"]
        else:
            return config["Frontend"]["served_model_name"]

    @classmethod
    def get_port(cls, config: dict) -> int:
        """Return the frontend port, preferring the Common section."""
        if "Common" in config and "port" in config["Common"]:
            return config["Common"]["port"]
        else:
            return config["Frontend"]["port"]

    @classmethod
    def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int:
        """Parse total KV-cache capacity (in tokens) from a dynamo log.

        Scans for the vLLM line
        ``Maximum concurrency for <N> tokens per request: <C>x`` and returns
        ``int(N * C)``. Returns 0 if the log cannot be read or contains no
        matching line.
        """
        line = ""  # pre-bind so the warning below is safe if open() fails
        try:
            with open(dynamo_log_fn, "r") as f:
                for line in f:
                    if "Maximum concurrency for" in line:
                        line = line.strip().split("Maximum concurrency for ")[1]
                        # token count may carry thousands separators
                        # (mirrors the vllm_v1 modifier)
                        token_count = int(
                            line.split(" tokens per request: ")[0].replace(",", "")
                        )
                        # strip the trailing "x" from the concurrency factor
                        concurrency = float(line.split(" tokens per request: ")[1][:-1])
                        logger.info(
                            f"Found KV cache info: {token_count} x {concurrency} = {int(token_count * concurrency)}"
                        )
                        return int(token_count * concurrency)
        except Exception as e:
            logger.warning(
                f"Failed to parse KV cache size from line: {line}. Error: {e}"
            )
        # no matching line found (previously fell through returning None)
        return 0
136+
137+
class VllmV1ConfigModifier:
    """Rewrites a vllm_v1 deployment config so a single worker can be
    profiled in isolation (either the prefill or the decode phase)."""

    @classmethod
    def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> dict:
        """Return a copy of ``config`` reduced to one standalone worker.

        Args:
            config: parsed deployment config (component name -> settings dict).
            target: which phase to profile, ``"prefill"`` or ``"decode"``.

        Returns:
            A new config dict; the caller's ``config`` is left untouched.
        """
        # Deep copy: the previous shallow .copy() still shared the nested
        # component dicts with the caller, so the mutations below leaked
        # back into the caller's config.
        config = copy.deepcopy(config)

        decode_worker = WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
        prefill_worker = WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker

        # disable planner so it cannot rescale workers mid-profile
        if "Planner" in config:
            config["Planner"]["no-operation"] = True

        # turn-off disagg; profiling runs a single aggregated worker
        config["SimpleLoadBalancer"]["enable_disagg"] = False

        if target == "prefill":
            if prefill_worker in config:
                # make VllmPrefillWorker into VllmDecodeWorker
                del config[decode_worker]
                config[decode_worker] = config[prefill_worker]
                del config[prefill_worker]

            # to profile prefill, disable prefix caching so every request
            # actually exercises the prefill path
            config[decode_worker]["enable-prefix-caching"] = False
        elif target == "decode":
            if prefill_worker in config:
                del config[prefill_worker]

            # to profile decode, enable prefix caching so the prefill stage
            # is passed quickly via cache hits
            config[decode_worker]["enable-prefix-caching"] = True

        # profile exactly one worker instance
        config[decode_worker]["ServiceArgs"]["workers"] = 1

        # set PP to 1 — the profiler only sweeps TP
        if config[decode_worker].get("pipeline-parallel-size", 0) > 1:
            logger.warning("Currently we only support TP, setting PP to 1")
            config[decode_worker]["pipeline-parallel-size"] = 1

        return config

    @classmethod
    def set_config_tp_size(cls, config: dict, tp_size: int):
        """Set tensor-parallel size and the matching GPU count in place."""
        worker = config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker]
        worker["tensor-parallel-size"] = tp_size
        worker["ServiceArgs"]["resources"]["gpu"] = tp_size
        return config

    @classmethod
    def get_model_name(cls, config: dict) -> str:
        """Return the served model name, preferring the Common section."""
        if "Common" in config and "served_model_name" in config["Common"]:
            return config["Common"]["served_model_name"]
        else:
            return config["Frontend"]["served_model_name"]

    @classmethod
    def get_port(cls, config: dict) -> int:
        """Return the frontend port, preferring the Common section."""
        if "Common" in config and "port" in config["Common"]:
            return config["Common"]["port"]
        else:
            return config["Frontend"]["port"]

    @classmethod
    def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int:
        """Parse total KV-cache capacity (in tokens) from a dynamo log.

        Scans for the vLLM line
        ``Maximum concurrency for <N> tokens per request: <C>x`` and returns
        ``int(N * C)``. Returns 0 if the log cannot be read or contains no
        matching line.
        """
        line = ""  # pre-bind so the warning below is safe if open() fails
        try:
            with open(dynamo_log_fn, "r") as f:
                for line in f:
                    if "Maximum concurrency for" in line:
                        line = line.strip().split("Maximum concurrency for ")[1]
                        # token count may carry thousands separators
                        token_count = int(
                            line.split(" tokens per request: ")[0].replace(",", "")
                        )
                        # strip the trailing "x" from the concurrency factor
                        concurrency = float(line.split(" tokens per request: ")[1][:-1])
                        logger.info(
                            f"Found KV cache info: {token_count} x {concurrency} = {int(token_count * concurrency)}"
                        )
                        return int(token_count * concurrency)
        except Exception as e:
            logger.warning(
                f"Failed to parse KV cache size from line: {line}. Error: {e}"
            )
        # no matching line found (previously fell through returning None)
        return 0
239+
# Dispatch table: maps a backend identifier (as used elsewhere in the
# profiler, e.g. a --backend CLI choice) to its config-modifier class.
CONFIG_MODIFIERS = {
    "vllm_v0": VllmV0ConfigModifier,
    "vllm_v1": VllmV1ConfigModifier,
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Sweep of concurrent-request counts used when profiling the decode worker:
# fine-grained at the low end (1, 5, 10, 25), then steps of 50 up to 500.
DECODE_NUM_REQUESTS_RANGE = [1, 5, 10, 25] + list(range(50, 501, 50))

0 commit comments

Comments
 (0)