feat: Add vLLM counter metrics access through Triton #53
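This PR adds vLLM counter metrics (vllm:prompt_tokens_total and vllm:generation_tokens_total) to Triton's metrics endpoint: the backend registers a custom VllmStatLogger with the vLLM engine when "disable_log_stats" is false, the sample model.json enables stats logging, and a new test suite covers the enabled, disabled, and default cases.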
Changes from 22 commits
New file — bash test driver for the vLLM metrics test (160 lines added):

@@ -0,0 +1,160 @@
#!/bin/bash
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

source ../../common/util.sh

TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"}
SERVER=${TRITON_DIR}/bin/tritonserver
BACKEND_DIR=${TRITON_DIR}/backends
SERVER_ARGS="--model-repository=$(pwd)/models --backend-directory=${BACKEND_DIR} --model-control-mode=explicit --load-model=vllm_opt --log-verbose=1"
SERVER_LOG="./vllm_metrics_server.log"
CLIENT_LOG="./vllm_metrics_client.log"
TEST_RESULT_FILE='test_results.txt'
CLIENT_PY="./vllm_metrics_test.py"
SAMPLE_MODELS_REPO="../../../samples/model_repository"
EXPECTED_NUM_TESTS=1

# Helpers =======================================
function assert_curl_success {
    message="${1}"
    if [ "$code" != "200" ]; then
        cat ./curl.out
        echo -e "\n***\n*** ${message} : line ${BASH_LINENO}\n***"
        RET=1
    fi
}

rm -rf models && mkdir -p models
cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt
# The `vllm_opt` model is loaded on server start and stays loaded throughout
# unit testing. To make sure vLLM's memory profiler does not error out when
# additional models such as `vllm_load_test` are loaded, we reduce
# "gpu_memory_utilization" for `vllm_opt` so that at least 60% of GPU memory
# remains available for other models.
sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/vllm_opt/1/model.json

RET=0

# Test vLLM metrics
run_server
if [ "$SERVER_PID" == "0" ]; then
    cat $SERVER_LOG
    echo -e "\n***\n*** Failed to start $SERVER\n***"
    exit 1
fi

set +e
python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics -v > $CLIENT_LOG 2>&1

if [ $? -ne 0 ]; then
    cat $CLIENT_LOG
    echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics FAILED. \n***"
    RET=1
else
    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
    if [ $? -ne 0 ]; then
        cat $CLIENT_LOG
        echo -e "\n***\n*** Test Result Verification FAILED.\n***"
        RET=1
    fi
fi
set -e

kill $SERVER_PID
wait $SERVER_PID

# Test disabling vLLM metrics with disable_log_stats set to true
sed -i 's/"disable_log_stats": false/"disable_log_stats": true/' models/vllm_opt/1/model.json

run_server
if [ "$SERVER_PID" == "0" ]; then
    cat $SERVER_LOG
    echo -e "\n***\n*** Failed to start $SERVER\n***"
    exit 1
fi

set +e
python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled -v > $CLIENT_LOG 2>&1

if [ $? -ne 0 ]; then
    cat $CLIENT_LOG
    echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled FAILED. \n***"
    RET=1
else
    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
    if [ $? -ne 0 ]; then
        cat $CLIENT_LOG
        echo -e "\n***\n*** Test Result Verification FAILED.\n***"
        RET=1
    fi
fi
set -e

kill $SERVER_PID
wait $SERVER_PID

# Test vLLM metrics when disable_log_stats is not set in model.json
jq 'del(.disable_log_stats)' models/vllm_opt/1/model.json > "temp.json"
mv temp.json models/vllm_opt/1/model.json

run_server
if [ "$SERVER_PID" == "0" ]; then
    cat $SERVER_LOG
    echo -e "\n***\n*** Failed to start $SERVER\n***"
    exit 1
fi

set +e
python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics -v > $CLIENT_LOG 2>&1

if [ $? -ne 0 ]; then
    cat $CLIENT_LOG
    echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics FAILED. \n***"
    RET=1
else
    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
    if [ $? -ne 0 ]; then
        cat $CLIENT_LOG
        echo -e "\n***\n*** Test Result Verification FAILED.\n***"
        RET=1
    fi
fi
set -e

kill $SERVER_PID
wait $SERVER_PID
rm -rf "./models" "temp.json"

if [ $RET -eq 1 ]; then
    cat $CLIENT_LOG
    cat $SERVER_LOG
    echo -e "\n***\n*** vLLM test FAILED. \n***"
else
    echo -e "\n***\n*** vLLM test PASSED. \n***"
fi

collect_artifacts_from_subdir
exit $RET
New file — vllm_metrics_test.py (145 lines added), the Python unit test invoked by the script above:

@@ -0,0 +1,145 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
||
import os | ||
import re | ||
import sys | ||
import unittest | ||
from functools import partial | ||
|
||
import requests | ||
import tritonclient.grpc as grpcclient | ||
from tritonclient.utils import * | ||
|
||
sys.path.append("../../common") | ||
from test_util import TestResultCollector, UserData, callback, create_vllm_request | ||
|
||
|
||
class VLLMTritonMetricsTest(TestResultCollector): | ||
def setUp(self): | ||
self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001") | ||
self.tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") | ||
self.vllm_model_name = "vllm_opt" | ||
self.prompts = [ | ||
"The most dangerous animal is", | ||
"The capital of France is", | ||
"The future of AI is", | ||
] | ||
self.sampling_parameters = {"temperature": "0", "top_p": "1"} | ||
|
||
def get_vllm_metrics(self): | ||
""" | ||
Store vllm metrics in a dictionary. | ||
""" | ||
r = requests.get(f"http://{self.tritonserver_ipaddr}:8002/metrics") | ||
r.raise_for_status() | ||
|
||
# Regular expression to match the pattern | ||
pattern = r"^(vllm:[^ {]+)(?:{.*})? ([0-9.-]+)$" | ||
vllm_dict = {} | ||
|
||
# Find all matches in the text | ||
matches = re.findall(pattern, r.text, re.MULTILINE) | ||
|
||
for match in matches: | ||
key, value = match | ||
vllm_dict[key] = float(value) if "." in value else int(value) | ||
|
||
return vllm_dict | ||
|
||
def vllm_infer( | ||
self, | ||
prompts, | ||
sampling_parameters, | ||
model_name, | ||
): | ||
""" | ||
Helper function to send async stream infer requests to vLLM. | ||
""" | ||
user_data = UserData() | ||
number_of_vllm_reqs = len(prompts) | ||
|
||
self.triton_client.start_stream(callback=partial(callback, user_data)) | ||
for i in range(number_of_vllm_reqs): | ||
request_data = create_vllm_request( | ||
prompts[i], | ||
i, | ||
False, | ||
sampling_parameters, | ||
model_name, | ||
True, | ||
) | ||
self.triton_client.async_stream_infer( | ||
model_name=model_name, | ||
inputs=request_data["inputs"], | ||
request_id=request_data["request_id"], | ||
outputs=request_data["outputs"], | ||
parameters=sampling_parameters, | ||
) | ||
|
||
for _ in range(number_of_vllm_reqs): | ||
result = user_data._completed_requests.get() | ||
if type(result) is InferenceServerException: | ||
print(result.message()) | ||
self.assertIsNot(type(result), InferenceServerException, str(result)) | ||
|
||
output = result.as_numpy("text_output") | ||
self.assertIsNotNone(output, "`text_output` should not be None") | ||
|
||
self.triton_client.stop_stream() | ||
|
||
def test_vllm_metrics(self): | ||
# Test vLLM metrics | ||
self.vllm_infer( | ||
prompts=self.prompts, | ||
sampling_parameters=self.sampling_parameters, | ||
model_name=self.vllm_model_name, | ||
) | ||
metrics_dict = self.get_vllm_metrics() | ||
|
||
# vllm:prompt_tokens_total | ||
self.assertEqual(metrics_dict["vllm:prompt_tokens_total"], 18) | ||
# vllm:generation_tokens_total | ||
self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 48) | ||
|
||
def test_vllm_metrics_disabled(self): | ||
# Test vLLM metrics | ||
self.vllm_infer( | ||
prompts=self.prompts, | ||
sampling_parameters=self.sampling_parameters, | ||
model_name=self.vllm_model_name, | ||
) | ||
metrics_dict = self.get_vllm_metrics() | ||
|
||
# No vLLM metric found | ||
self.assertEqual(len(metrics_dict), 0) | ||
|
||
def tearDown(self): | ||
self.triton_client.close() | ||
|
||
|
||
if __name__ == "__main__": | ||
unittest.main() |
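For reference, the regular expression in get_vllm_metrics() parses Prometheus-style text exposition lines. The following is a minimal, self-contained sketch of that parsing logic run against a hypothetical /metrics excerpt (the label values are illustrative, not taken from an actual run):

import re

# Hypothetical /metrics excerpt; actual label sets depend on the model config.
sample = """\
vllm:prompt_tokens_total{model="vllm_opt",version="1"} 18
vllm:generation_tokens_total{model="vllm_opt",version="1"} 48
"""

pattern = r"^(vllm:[^ {]+)(?:{.*})? ([0-9.-]+)$"
parsed = {
    key: float(value) if "." in value else int(value)
    for key, value in re.findall(pattern, sample, re.MULTILINE)
}
print(parsed)  # {'vllm:prompt_tokens_total': 18, 'vllm:generation_tokens_total': 48}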
Changes to the sample vLLM model.json:

@@ -1,6 +1,7 @@
 {
     "model":"facebook/opt-125m",
-    "disable_log_requests": "true",
+    "disable_log_requests": true,
     "gpu_memory_utilization": 0.5,
-    "enforce_eager": "true"
+    "enforce_eager": true,
+    "disable_log_stats": false
 }
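Setting "disable_log_stats": false keeps vLLM's stats logging enabled, which is what lets the Triton-side stat logger added in model.py below receive per-iteration counters; the test script above flips it to true (and also removes it entirely) to cover the disabled and default cases.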
Changes to the vLLM backend's model.py:

@@ -41,6 +41,8 @@
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid
 
+from utils.metrics import VllmStatLogger
+
 _VLLM_ENGINE_ARGS_FILENAME = "model.json"
 _MULTI_LORA_ARGS_FILENAME = "multi_lora.json"
 
@@ -155,9 +157,24 @@ def init_engine(self):
         self.setup_lora()
 
         # Create an AsyncLLMEngine from the config from JSON
-        self.llm_engine = AsyncLLMEngine.from_engine_args(
-            AsyncEngineArgs(**self.vllm_engine_config)
-        )
+        async_engine_args = AsyncEngineArgs(**self.vllm_engine_config)
+        self.llm_engine = AsyncLLMEngine.from_engine_args(async_engine_args)
+
+        # Create vLLM custom metrics
+        if not async_engine_args.disable_log_stats:
+            try:
+                labels = {
+                    "model": self.args["model_name"],
+                    "version": self.args["model_version"],
+                }
+                # Add vLLM custom metrics
+                self.llm_engine.add_logger("triton", VllmStatLogger(labels=labels))
+            except pb_utils.TritonModelException as e:
+                if "metrics not supported" in str(e):
+                    # Metrics are disabled at the server
+                    self.logger.log_info("[vllm] Metrics not supported")
+                else:
+                    raise e
 
     def setup_lora(self):
         self.enable_lora = False

Review thread on the `disable_log_stats` check:
- "We need to document this behavior"
- "Agree. I will update the doc in another PR after this gets merged due to time constraint."
- "Do we need the documentation in 24.08? It needs to go in today if needed."
- "I will follow up with another docs PR. Should be cherry-picked with histogram."
- "Docs added"
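The VllmStatLogger imported above is defined in utils/metrics.py, which is not part of this diff. Purely as a hedged sketch of how such a logger could bridge vLLM's per-iteration stats to Triton counters — assuming vLLM's StatLoggerBase/Stats interface and Triton's pb_utils metric API, and not necessarily matching the actual file:

# Hypothetical sketch only; the real utils/metrics.py may differ.
import triton_python_backend_utils as pb_utils
from vllm.engine.metrics import StatLoggerBase, Stats


class VllmStatLogger(StatLoggerBase):
    """Forward vLLM iteration stats to Triton counter metrics."""

    def __init__(self, labels: dict) -> None:
        # Counter families are registered once and exposed on Triton's
        # /metrics endpoint (port 8002 by default).
        self.prompt_tokens_family = pb_utils.MetricFamily(
            name="vllm:prompt_tokens_total",
            description="Number of prefill tokens processed.",
            kind=pb_utils.MetricFamily.COUNTER,
        )
        self.generation_tokens_family = pb_utils.MetricFamily(
            name="vllm:generation_tokens_total",
            description="Number of generation tokens processed.",
            kind=pb_utils.MetricFamily.COUNTER,
        )
        self.prompt_tokens = self.prompt_tokens_family.Metric(labels=labels)
        self.generation_tokens = self.generation_tokens_family.Metric(labels=labels)

    def info(self, type: str, obj) -> None:
        # Required by the StatLoggerBase interface; nothing to report here.
        pass

    def log(self, stats: Stats) -> None:
        # Called by the engine once per iteration with aggregated stats.
        self.prompt_tokens.increment(stats.num_prompt_tokens_iter)
        self.generation_tokens.increment(stats.num_generation_tokens_iter)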