Commit 961a7c3

Merge branch 'r25.10' of github.com:triton-inference-server/vllm_backend into yinggeh/tri-49-request-for-openai-compatible-api-endpoints-for-triton
2 parents: 2c3e148 + d50bda1

File tree

8 files changed: +94 -63 lines

ci/L0_backend_vllm/accuracy_test/accuracy_test.py

Lines changed: 0 additions & 2 deletions
@@ -190,7 +190,6 @@ def test_guided_decoding(self):
         sampling_params = SAMPLING_PARAMETERS
         guided_decoding_params = {
             "choice": ["Positive", "Negative"],
-            "backend": "outlines",
         }
         sampling_params["guided_decoding"] = json.dumps(guided_decoding_params)
         for i in range(len(GUIDED_PROMPTS)):
@@ -245,7 +244,6 @@ def tearDown(self):
         if FLAGS.generate_guided_baseline:
             guided_decoding_params = {
                 "choice": ["Positive", "Negative"],
-                "backend": "outlines",
             }
             guided_generation = GuidedDecodingParams(**guided_decoding_params)
             asyncio.run(
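
With the "backend" key dropped, guided decoding in these tests is driven by the choice list alone; per-request backend selection is no longer exercised. A minimal sketch of the two call paths touched above, assuming the vllm.sampling_params API this test already imports (the SamplingParams wrapper here is illustrative, not part of the diff):

import json

from vllm.sampling_params import GuidedDecodingParams, SamplingParams

guided_decoding_params = {"choice": ["Positive", "Negative"]}

# Baseline-generation path: build vLLM-native guided decoding parameters.
guided_generation = GuidedDecodingParams(**guided_decoding_params)
baseline_sampling_params = SamplingParams(temperature=0, guided_decoding=guided_generation)

# Triton-client path: the same dict is JSON-encoded into the "guided_decoding"
# sampling parameter sent with the inference request.
triton_sampling_params = {"guided_decoding": json.dumps(guided_decoding_params)}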

ci/L0_backend_vllm/accuracy_test/test.sh

Lines changed: 0 additions & 12 deletions
@@ -48,17 +48,11 @@ RET=0
 set +e
 # Need to generate baseline first, since running 2 vLLM engines causes
 # memory issues: https://github.com/vllm-project/vllm/issues/2248
-export VLLM_USE_V1=0
-export VLLM_WORKER_MULTIPROC_METHOD=spawn
 python3 $CLIENT_PY --generate-baseline >> $VLLM_ENGINE_LOG 2>&1 & BASELINE_PID=$!
 wait $BASELINE_PID

 python3 $CLIENT_PY --generate-guided-baseline > $VLLM_ENGINE_LOG 2>&1 & BASELINE_PID=$!
 wait $BASELINE_PID
-
-unset VLLM_USE_V1
-unset VLLM_WORKER_MULTIPROC_METHOD
-
 set -e

 run_server
@@ -88,12 +82,6 @@ set -e
 kill $SERVER_PID
 wait $SERVER_PID

-# Check that warning about V1 Engine appears in log - this warning is expected
-if ! grep -q "Engine in background thread is experimental on VLLM_USE_V1=1. Falling back to V0 Engine." $SERVER_LOG; then
-    echo -e "\n***\n*** ERROR: Expected warning about vLLM falling back to V0 Engine not found in logs.\n***"
-    RET=1
-fi
-
 rm -rf models/

 if [ $RET -eq 1 ]; then

ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py

Lines changed: 3 additions & 5 deletions
@@ -173,13 +173,11 @@ def test_vllm_metrics(self):
     # TODO: Revisit this test due to the removal of best_of
     def test_custom_sampling_params(self):
         # Adding sampling parameters for testing metrics.
-        # Definitions can be found here https://docs.vllm.ai/en/latest/dev/sampling_params.html
-        n, best_of = 2, 4
+        # Definitions can be found here https://docs.vllm.ai/en/latest/api/vllm/sampling_params.html
+        n, temperature = 2, 1
         custom_sampling_parameters = self.sampling_parameters.copy()
-        # Changing "temperature" because "best_of" must be 1 when using greedy
-        # sampling, i.e. "temperature": "0".
         custom_sampling_parameters.update(
-            {"n": str(n), "best_of": str(best_of), "temperature": "1"}
+            {"n": str(n), "temperature": str(temperature)}
         )

         # Test vLLM metrics

ci/L0_backend_vllm/test.sh

Lines changed: 3 additions & 0 deletions
@@ -28,6 +28,9 @@
 RET=0
 SUBTESTS="accuracy_test request_cancellation enabled_stream vllm_backend metrics_test"

+export C_INCLUDE_PATH=/usr/local/cuda/include:$C_INCLUDE_PATH
+export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
+
 python3 -m pip install tritonclient[grpc]

 for TEST in ${SUBTESTS}; do

ci/L0_check_health_vllm/test.sh

Lines changed: 17 additions & 11 deletions
@@ -31,11 +31,12 @@ source ../common/util.sh
 pip3 install pytest==8.1.1
 pip3 install tritonclient[grpc]

+rm -f *.log *.report.xml
 RET=0

 function setup_model_repository {
-    local sample_model_repo_path=${1:-"../../samples/model_repository"}
-    rm -rf models vllm_baseline_output.pkl && mkdir -p models
+    local sample_model_repo_path="../../samples/model_repository"
+    rm -rf models && mkdir -p models
     cp -r $sample_model_repo_path/vllm_model models/vllm_opt
 }

@@ -48,23 +49,24 @@ function enable_health_check {
 }

 VLLM_INSTALL_PATH="/usr/local/lib/python3.12/dist-packages/vllm"
+VLLM_V1_ENGINE_PATH="$VLLM_INSTALL_PATH/v1/engine"

 function mock_vllm_async_llm_engine {
     # backup original file
-    mv $VLLM_INSTALL_PATH/engine/multiprocessing/client.py $VLLM_INSTALL_PATH/engine/multiprocessing/client.py.backup
-    cp $VLLM_INSTALL_PATH/engine/multiprocessing/client.py.backup $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
+    mv $VLLM_V1_ENGINE_PATH/async_llm.py $VLLM_V1_ENGINE_PATH/async_llm.py.backup
+    cp $VLLM_V1_ENGINE_PATH/async_llm.py.backup $VLLM_V1_ENGINE_PATH/async_llm.py
     # overwrite the original check_health method
-    echo -e "" >> $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
-    echo -e "    async def check_health(self, check_count=[0]):" >> $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
-    echo -e "        check_count[0] += 1" >> $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
-    echo -e "        if check_count[0] > 1:" >> $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
-    echo -e "            raise RuntimeError(\"Simulated vLLM check_health() failure\")" >> $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
+    echo -e "" >> $VLLM_V1_ENGINE_PATH/async_llm.py
+    echo -e "    async def check_health(self, check_count=[0]):" >> $VLLM_V1_ENGINE_PATH/async_llm.py
+    echo -e "        check_count[0] += 1" >> $VLLM_V1_ENGINE_PATH/async_llm.py
+    echo -e "        if check_count[0] > 1:" >> $VLLM_V1_ENGINE_PATH/async_llm.py
+    echo -e "            raise RuntimeError(\"Simulated vLLM check_health() failure\")" >> $VLLM_V1_ENGINE_PATH/async_llm.py
 }

 function unmock_vllm_async_llm_engine {
     # restore from backup
-    rm -f $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
-    mv $VLLM_INSTALL_PATH/engine/multiprocessing/client.py.backup $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
+    rm -f $VLLM_V1_ENGINE_PATH/async_llm.py
+    mv $VLLM_V1_ENGINE_PATH/async_llm.py.backup $VLLM_V1_ENGINE_PATH/async_llm.py
 }

 function test_check_health {
@@ -93,8 +95,12 @@ function test_check_health {
 }

 # Test health check unspecified
+# Cold start on SBSA device can take longer than default 120 seconds
+PREV_SERVER_TIMEOUT=$SERVER_TIMEOUT
+SERVER_TIMEOUT=240
 setup_model_repository
 test_check_health "health_check_unspecified" "test_vllm_is_healthy"
+SERVER_TIMEOUT=$PREV_SERVER_TIMEOUT

 # Test health check disabled
 setup_model_repository
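
For readability, the snippet that mock_vllm_async_llm_engine appends to async_llm.py (reconstructed from the escaped echo lines above) amounts to the following method; it passes the first health check and fails every one after it:

# Appended at the end of vLLM V1's async_llm.py, inside the AsyncLLM class body.
# The mutable default argument keeps the call count across invocations.
async def check_health(self, check_count=[0]):
    check_count[0] += 1
    if check_count[0] > 1:
        raise RuntimeError("Simulated vLLM check_health() failure")
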
Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 {
     "model":"facebook/opt-125m",
-    "gpu_memory_utilization": 0.5,
+    "gpu_memory_utilization": 0.1,
     "enforce_eager": true
 }

src/model.py

Lines changed: 10 additions & 11 deletions
@@ -33,14 +33,13 @@
 from typing import Dict, List

 import numpy as np
-import torch
 import triton_python_backend_utils as pb_utils
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.openai.api_server import (
     build_async_engine_client_from_engine_args,
 )

-from utils.metrics import VllmStatLogger
+from utils.metrics import VllmStatLoggerFactory
 from utils.vllm_backend_utils import TritonSamplingParams

 _VLLM_ENGINE_ARGS_FILENAME = "model.json"
@@ -187,12 +186,12 @@ def initialize(self, args):
             and not self._aync_engine_args.disable_log_stats
         )

-        # Starting the vLLM engine and its event thread running the AsyncIO event loop.
-        self._init_engine()
-
         # Setup vLLM metrics
         self._setup_metrics()

+        # Starting the vLLM engine and its event thread running the AsyncIO event loop.
+        self._init_engine()
+
         # Starting the response thread. It allows vLLM to keep making progress while
         # response sender(s) are sending responses to server frontend.
         self._response_queue = queue.Queue()
@@ -261,6 +260,7 @@ async def _run_llm_engine(self):
         async with build_async_engine_client_from_engine_args(
             engine_args=self._aync_engine_args,
             disable_frontend_multiprocessing=self._enable_metrics,
+            stat_loggers=self._vllm_metrics,
         ) as engine:
             # Capture the engine event loop and make it visible to other threads.
             self._event_loop = asyncio.get_running_loop()
@@ -351,7 +351,7 @@ def _setup_lora(self):
         )

     def _setup_metrics(self):
-        self._vllm_metrics = None
+        self._vllm_metrics = []
         # TODO: Do not read metrics directly from the vLLM engine, read from prometheus
         # client to allow the use of ZMQ process when metrics are enabled. See
         # https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/entrypoints/openai/api_server.py#L222-L245
@@ -362,9 +362,8 @@ def _setup_metrics(self):
                 "version": self.args["model_version"],
             }
             # Add vLLM custom metrics
-            vllm_config = self._llm_engine.engine.vllm_config
-            self._vllm_metrics = VllmStatLogger(labels, vllm_config, self.logger)
-            self._llm_engine.add_logger("triton", self._vllm_metrics)
+            factory = VllmStatLoggerFactory(labels, self.logger)
+            self._vllm_metrics.append(factory)
         except pb_utils.TritonModelException as e:
             if "metrics not supported" in str(e):
                 # Metrics are disabled at the server
@@ -621,8 +620,8 @@ def finalize(self):
         self._response_thread = None

         # Shutdown the metrics thread.
-        if self._vllm_metrics is not None:
-            self._vllm_metrics.finalize()
+        for stat_logger_factory in self._vllm_metrics:
+            stat_logger_factory.finalize()

         # When using parallel tensors, the stub process may not shutdown due to
         # unreleased references, so manually run the garbage collector once.
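
For context on the wiring above: self._vllm_metrics is now a list of stat-logger factories, and the vLLM V1 engine calls each factory once per engine with (vllm_config, engine_index) to build the actual logger. A minimal standalone sketch of that contract, assuming the vLLM version targeted by this commit (where build_async_engine_client_from_engine_args accepts stat_loggers) and that the backend's utils.metrics module is importable; labels and engine args are illustrative, and log_logger would normally be the Triton pb_utils logger rather than None:

import asyncio

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args,
)

from utils.metrics import VllmStatLoggerFactory


async def main():
    # Illustrative labels; model.py builds these from the Triton model name and version.
    factory = VllmStatLoggerFactory({"model": "vllm_opt", "version": "1"}, log_logger=None)
    engine_args = AsyncEngineArgs(model="facebook/opt-125m", enforce_eager=True)

    async with build_async_engine_client_from_engine_args(
        engine_args=engine_args,
        stat_loggers=[factory],  # each factory is invoked as factory(vllm_config, engine_index)
    ) as engine:
        # ... submit requests through `engine` while metrics are recorded ...
        pass

    factory.finalize()  # stops the background metrics thread of every created logger


if __name__ == "__main__":
    asyncio.run(main())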

src/utils/metrics.py

Lines changed: 60 additions & 21 deletions
@@ -26,13 +26,12 @@

 import queue
 import threading
-from typing import Dict, List, Union
+from typing import Dict, List, Optional, Union

 import triton_python_backend_utils as pb_utils
 from vllm.config import VllmConfig
-from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase
-from vllm.engine.metrics import Stats as VllmStats
-from vllm.engine.metrics import SupportsMetricsInfo, build_1_2_5_buckets
+from vllm.v1.metrics.loggers import StatLoggerBase, build_1_2_5_buckets
+from vllm.v1.metrics.stats import IterationStats, SchedulerStats


 class TritonMetrics:
@@ -161,13 +160,35 @@ def __init__(self, labels: List[str], max_model_len: int):
         )


-class VllmStatLogger(VllmStatLoggerBase):
+# Create a partially initialized callable that adapts VllmStatLogger to StatLoggerFactory interface
+class VllmStatLoggerFactory:
+    def __init__(self, labels, log_logger):
+        self._labels = labels
+        self._log_logger = log_logger
+        self._instances_list = []
+
+    def __call__(self, vllm_config, engine_index):
+        stat_logger = VllmStatLogger(
+            self._labels, self._log_logger, vllm_config, engine_index
+        )
+        self._instances_list.append(stat_logger)
+        return stat_logger
+
+    def finalize(self):
+        for stat_logger in self._instances_list:
+            if stat_logger is not None:
+                stat_logger.finalize()
+
+
+class VllmStatLogger(StatLoggerBase):
     """StatLogger is used as an adapter between vLLM stats collector and Triton metrics provider."""

-    def __init__(self, labels: Dict, vllm_config: VllmConfig, log_logger) -> None:
+    def __init__(
+        self, labels: Dict, log_logger, vllm_config: VllmConfig, engine_index: int
+    ) -> None:
         # Tracked stats over current local logging interval.
         # local_interval not used here. It's for vLLM logs to stdout.
-        super().__init__(local_interval=0, vllm_config=vllm_config)
+        super().__init__(vllm_config=vllm_config, engine_index=engine_index)
         self.metrics = TritonMetrics(
             labels=labels, max_model_len=vllm_config.model_config.max_model_len
         )
@@ -176,12 +197,9 @@ def __init__(self, labels: Dict, vllm_config: VllmConfig, log_logger) -> None:
         # Starting the metrics thread. It allows vLLM to keep making progress
         # while reporting metrics to triton metrics service.
         self._logger_queue = queue.Queue()
-        self._logger_thread = threading.Thread(target=self.logger_loop)
+        self._logger_thread = threading.Thread(target=self._logger_loop)
         self._logger_thread.start()

-    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
-        pass
-
     def _log_counter(self, counter, data: Union[int, float]) -> None:
         """Convenience function for logging to counter.

@@ -208,7 +226,12 @@ def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None
         for datum in data:
             self._logger_queue.put_nowait((histogram, "observe", datum))

-    def log(self, stats: VllmStats) -> None:
+    def record(
+        self,
+        scheduler_stats: Optional[SchedulerStats],
+        iteration_stats: Optional[IterationStats],
+        engine_idx: int = 0,
+    ) -> None:
         """Report stats to Triton metrics server.

         Args:
@@ -217,38 +240,54 @@ def log(self, stats: VllmStats) -> None:
         Returns:
             None
         """
+
+        # Parse finished request stats into lists
+        e2e_latency: List[float] = []
+        num_prompt_tokens: List[int] = []
+        num_generation_tokens: List[int] = []
+        for finished_req in iteration_stats.finished_requests:
+            e2e_latency.append(finished_req.e2e_latency)
+            num_prompt_tokens.append(finished_req.num_prompt_tokens)
+            num_generation_tokens.append(finished_req.num_generation_tokens)
+
         # The list of vLLM metrics reporting to Triton is also documented here.
         # https://github.com/triton-inference-server/vllm_backend/blob/main/README.md#triton-metrics
         counter_metrics = [
-            (self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter),
-            (self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter),
+            (self.metrics.counter_prompt_tokens, iteration_stats.num_prompt_tokens),
+            (
+                self.metrics.counter_generation_tokens,
+                iteration_stats.num_generation_tokens,
+            ),
         ]
         histogram_metrics = [
             (
                 self.metrics.histogram_time_to_first_token,
-                stats.time_to_first_tokens_iter,
+                iteration_stats.time_to_first_tokens_iter,
             ),
             (
                 self.metrics.histogram_time_per_output_token,
-                stats.time_per_output_tokens_iter,
+                iteration_stats.inter_token_latencies_iter,
             ),
-            (self.metrics.histogram_e2e_time_request, stats.time_e2e_requests),
+            (self.metrics.histogram_e2e_time_request, e2e_latency),
             (
                 self.metrics.histogram_num_prompt_tokens_request,
-                stats.num_prompt_tokens_requests,
+                num_prompt_tokens,
             ),
             (
                 self.metrics.histogram_num_generation_tokens_request,
-                stats.num_generation_tokens_requests,
+                num_generation_tokens,
            ),
-            (self.metrics.histogram_n_request, stats.n_requests),
+            (self.metrics.histogram_n_request, iteration_stats.n_params_iter),
         ]
         for metric, data in counter_metrics:
             self._log_counter(metric, data)
         for metric, data in histogram_metrics:
             self._log_histogram(metric, data)

-    def logger_loop(self):
+    def log_engine_initialized(self) -> None:
+        pass
+
+    def _logger_loop(self):
         while True:
             item = self._logger_queue.get()
             # To signal shutdown a None item will be added to the queue.
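
The logger-thread pattern above is worth spelling out: record() never touches Triton metrics directly; _log_counter/_log_histogram enqueue (metric, method, value) work items, _logger_loop applies them on a dedicated thread, and finalize() (not shown in this hunk) pushes a None sentinel to stop the loop. A self-contained sketch of that pattern, with stand-in metric objects instead of Triton counters and histograms:

import queue
import threading


class QueueBackedLogger:
    """Stand-alone illustration of the queue + worker-thread pattern used by VllmStatLogger."""

    def __init__(self):
        self._logger_queue = queue.Queue()
        self._logger_thread = threading.Thread(target=self._logger_loop)
        self._logger_thread.start()

    def _logger_loop(self):
        while True:
            item = self._logger_queue.get()
            # To signal shutdown a None item will be added to the queue.
            if item is None:
                break
            metric, method, value = item
            getattr(metric, method)(value)  # e.g. counter.increment(n) or histogram.observe(v)

    def log(self, metric, method, value):
        # Called from the engine's thread; never blocks on the metrics backend.
        self._logger_queue.put_nowait((metric, method, value))

    def finalize(self):
        self._logger_queue.put(None)
        self._logger_thread.join()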
