Merged
Changes from 6 commits
1 change: 1 addition & 0 deletions .gitignore
@@ -50,6 +50,7 @@ coverage.xml
.hypothesis/
.pytest_cache/
cover/
*.out

# Translations
*.mo
1 change: 1 addition & 0 deletions README.md
@@ -99,6 +99,7 @@ export TRITON_CONTAINER_VERSION=<YY.MM>
--endpoint=vertex-ai
--upstream-container-version=${TRITON_CONTAINER_VERSION}
--backend=python:r${TRITON_CONTAINER_VERSION}
--backend=ensemble
--backend=vllm:r${TRITON_CONTAINER_VERSION}
```

59 changes: 59 additions & 0 deletions ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt
@@ -0,0 +1,59 @@
# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

name: "ensemble_model"
platform: "ensemble"
max_batch_size: 1
input [
{
name: "text_input"
data_type: TYPE_STRING
dims: [ -1 ]
}
]
output [
{
name: "text_output"
data_type: TYPE_STRING
dims: [ -1 ]
}
]
ensemble_scheduling {
step [
{
model_name: "vllm_opt"
model_version: -1
input_map {
key: "text_input"
value: "text_input"
}
output_map {
key: "text_output"
value: "text_output"
}
}
]
}
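
For context, a minimal sketch of how a client could query this ensemble once it is loaded. The server URL and prompt are illustrative assumptions, and stream inference is used because the backing vllm model is decoupled, mirroring the pattern used by the tests in this repo:

import queue

import numpy as np
import tritonclient.grpc as grpcclient

responses = queue.Queue()

def callback(result, error):
    # Collect each streamed response (or error) from the server.
    responses.put(error if error is not None else result)

client = grpcclient.InferenceServerClient(url="localhost:8001")

# max_batch_size is 1 and dims are [ -1 ], so the full tensor shape is [1, 1].
prompt = np.array([[b"Hello, my name is"]], dtype=np.object_)
text_input = grpcclient.InferInput("text_input", [1, 1], "BYTES")
text_input.set_data_from_numpy(prompt)

# The ensemble forwards text_input to the vllm step and relays its text_output.
client.start_stream(callback=callback)
client.async_stream_infer(model_name="ensemble_model", inputs=[text_input])
client.stop_stream()

print(responses.get().as_numpy("text_output"))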
7 changes: 6 additions & 1 deletion ci/L0_backend_vllm/vllm_backend/test.sh
@@ -70,6 +70,11 @@ sed -i 's/"disable_log_requests"/"invalid_attribute"/' models/vllm_invalid_1/1/model.json
cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_invalid_2/
sed -i 's/"facebook\/opt-125m"/"invalid_model"/' models/vllm_invalid_2/1/model.json


mkdir -p models/ensemble_model/1

cp -r ensemble_config.pbtxt models/ensemble_model/config.pbtxt

RET=0

run_server
@@ -166,4 +171,4 @@ fi

collect_artifacts_from_subdir

exit $RET
exit $RET
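
The mkdir and cp additions above should yield the following model-repository layout (a sketch; an ensemble ships no model files, so its version directory stays empty):

models/
└── ensemble_model/
    ├── 1/
    └── config.pbtxt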
12 changes: 12 additions & 0 deletions ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
@@ -48,6 +48,7 @@ def setUp(self):
self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
self.vllm_model_name = "vllm_opt"
self.python_model_name = "add_sub"
self.ensemble_model_name = "ensemble_model"
self.vllm_load_test = "vllm_load_test"

def test_vllm_triton_backend(self):
@@ -163,6 +164,17 @@ def test_exclude_input_in_output_true(self):
expected_output=expected_output,
)

def test_ensemble_model(self):
@rmccorm4 (Contributor) commented on Sep 23, 2024:

This is going to break EXPECTED_NUM_TESTS=6.

You could update it to EXPECTED_NUM_TESTS=7, but I think it's easier to just append the ensemble to the other test that exercises model loading here:

# Load both vllm and add_sub models
self.triton_client.load_model(self.vllm_load_test)
self.assertTrue(self.triton_client.is_model_ready(self.vllm_load_test))
self.triton_client.load_model(self.python_model_name)
self.assertTrue(self.triton_client.is_model_ready(self.python_model_name))

ex:

        # Load both vllm and add_sub models
        self.triton_client.load_model(self.vllm_load_test)
        self.assertTrue(self.triton_client.is_model_ready(self.vllm_load_test))
        self.triton_client.load_model(self.python_model_name)
        self.assertTrue(self.triton_client.is_model_ready(self.python_model_name))

        # Test to ensure that ensemble models are supported in vllm container.
        # If ensemble support is not enabled, triton will fail to load the ensemble.
        self.triton_client.load_model(self.ensemble_model_name)
        self.assertTrue(self.triton_client.is_model_ready(self.ensemble_model_name))

Also use vllm_load_test model inside the ensemble instead of vllm_opt for the same reason.
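
Concretely, that suggestion would amount to pointing the ensemble step at vllm_load_test; roughly this revision of the scheduling block in ensemble_config.pbtxt (a sketch, not a committed change):

ensemble_scheduling {
  step [
    {
      model_name: "vllm_load_test"
      model_version: -1
      input_map {
        key: "text_input"
        value: "text_input"
      }
      output_map {
        key: "text_output"
        value: "text_output"
      }
    }
  ]
}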

@rmccorm4 (Contributor) commented on Sep 23, 2024:

The test documents some expected behavior around vllm_opt staying alive for the duration of the test:

# `vllm_opt` model will be loaded on server start and stay loaded throughout
# unittesting. To test vllm model load/unload we use a dedicated
# `vllm_load_test`. To ensure that vllm's memory profiler will not error out
# on `vllm_load_test` load, we reduce "gpu_memory_utilization" for `vllm_opt`,
# so that at least 60% of GPU memory was available for other models.

So I think it's best not to mess with it in a new test and make use of vllm_load_test instead.

# Test to ensure that ensemble models are supported in vllm container.
# If ensemble support is not present, triton will error out at the model loading stage.

# Before loading the ensemble model, its dependency model is loaded.
self.triton_client.load_model(self.vllm_model_name)
self.assertTrue(self.triton_client.is_model_ready(self.vllm_model_name))

self.triton_client.load_model(self.ensemble_model_name)
self.assertTrue(self.triton_client.is_model_ready(self.ensemble_model_name))
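
For reference, gpu_memory_utilization lives in the model's model.json; a hypothetical sketch consistent with the quoted comment (the actual file contents are not shown in this diff):

{
    "model": "facebook/opt-125m",
    "disable_log_requests": true,
    "gpu_memory_utilization": 0.4
}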

def _test_vllm_model(
self,
prompts,