2 changes: 0 additions & 2 deletions ci/L0_backend_vllm/accuracy_test/accuracy_test.py
@@ -190,7 +190,6 @@ def test_guided_decoding(self):
        sampling_params = SAMPLING_PARAMETERS
        guided_decoding_params = {
            "choice": ["Positive", "Negative"],
            "backend": "outlines",
        }
        sampling_params["guided_decoding"] = json.dumps(guided_decoding_params)
        for i in range(len(GUIDED_PROMPTS)):
@@ -245,7 +244,6 @@ def tearDown(self):
    if FLAGS.generate_guided_baseline:
        guided_decoding_params = {
            "choice": ["Positive", "Negative"],
            "backend": "outlines",
        }
        guided_generation = GuidedDecodingParams(**guided_decoding_params)
        asyncio.run(
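With the "backend" key dropped, guided decoding is left to vLLM's default backend instead of being pinned to outlines. A minimal sketch of how these choice-constrained parameters are consumed, assuming a recent vLLM where GuidedDecodingParams is importable from vllm.sampling_params:

import json

from vllm.sampling_params import GuidedDecodingParams, SamplingParams

# No "backend" key: vLLM selects its default guided-decoding backend.
guided_decoding_params = {"choice": ["Positive", "Negative"]}

# Direct engine usage builds the params object explicitly ...
guided = GuidedDecodingParams(**guided_decoding_params)
sampling = SamplingParams(temperature=0, guided_decoding=guided)

# ... while the Triton client serializes the same dict to JSON, as the test does.
sampling_parameters = {"guided_decoding": json.dumps(guided_decoding_params)}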
12 changes: 0 additions & 12 deletions ci/L0_backend_vllm/accuracy_test/test.sh
@@ -48,17 +48,11 @@ RET=0
set +e
# Need to generate baseline first, since running 2 vLLM engines causes
# memory issues: https://github.com/vllm-project/vllm/issues/2248
export VLLM_USE_V1=0
export VLLM_WORKER_MULTIPROC_METHOD=spawn
python3 $CLIENT_PY --generate-baseline >> $VLLM_ENGINE_LOG 2>&1 & BASELINE_PID=$!
wait $BASELINE_PID

python3 $CLIENT_PY --generate-guided-baseline > $VLLM_ENGINE_LOG 2>&1 & BASELINE_PID=$!
wait $BASELINE_PID

unset VLLM_USE_V1
unset VLLM_WORKER_MULTIPROC_METHOD

set -e

run_server
@@ -88,12 +82,6 @@ set -e
kill $SERVER_PID
wait $SERVER_PID

# Check that warning about V1 Engine appears in log - this warning is expected
if ! grep -q "Engine in background thread is experimental on VLLM_USE_V1=1. Falling back to V0 Engine." $SERVER_LOG; then
echo -e "\n***\n*** ERROR: Expected warning about vLLM falling back to V0 Engine not found in logs.\n***"
RET=1
fi

rm -rf models/

if [ $RET -eq 1 ]; then
8 changes: 3 additions & 5 deletions ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
@@ -173,13 +173,11 @@ def test_vllm_metrics(self):
    # TODO: Revisit this test due to the removal of best_of
    def test_custom_sampling_params(self):
        # Adding sampling parameters for testing metrics.
        # Definitions can be found here https://docs.vllm.ai/en/latest/dev/sampling_params.html
        n, best_of = 2, 4
        # Definitions can be found here https://docs.vllm.ai/en/latest/api/vllm/sampling_params.html
        n, temperature = 2, 1
        custom_sampling_parameters = self.sampling_parameters.copy()
        # Changing "temperature" because "best_of" must be 1 when using greedy
        # sampling, i.e. "temperature": "0".
        custom_sampling_parameters.update(
            {"n": str(n), "best_of": str(best_of), "temperature": "1"}
            {"n": str(n), "temperature": str(temperature)}
        )

        # Test vLLM metrics
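The TODO above refers to the removal of best_of from vLLM's SamplingParams, so the test now varies only n and temperature. A hedged sketch of the equivalent engine-side parameters, assuming the standard vllm package import:

from vllm import SamplingParams

# Two sampled completions; temperature 1.0 keeps sampling non-greedy so the
# two outputs can differ (best_of is no longer set alongside n).
params = SamplingParams(n=2, temperature=1.0)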
3 changes: 3 additions & 0 deletions ci/L0_backend_vllm/test.sh
@@ -28,6 +28,9 @@
RET=0
SUBTESTS="accuracy_test request_cancellation enabled_stream vllm_backend metrics_test"

export C_INCLUDE_PATH=/usr/local/cuda/include:$C_INCLUDE_PATH
export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas

python3 -m pip install tritonclient[grpc]

for TEST in ${SUBTESTS}; do
28 changes: 17 additions & 11 deletions ci/L0_check_health_vllm/test.sh
@@ -31,11 +31,12 @@ source ../common/util.sh
pip3 install pytest==8.1.1
pip3 install tritonclient[grpc]

rm -f *.log *.report.xml
RET=0

function setup_model_repository {
    local sample_model_repo_path=${1:-"../../samples/model_repository"}
    rm -rf models vllm_baseline_output.pkl && mkdir -p models
    local sample_model_repo_path="../../samples/model_repository"
    rm -rf models && mkdir -p models
    cp -r $sample_model_repo_path/vllm_model models/vllm_opt
}

@@ -48,23 +49,24 @@ function enable_health_check {
}

VLLM_INSTALL_PATH="/usr/local/lib/python3.12/dist-packages/vllm"
VLLM_V1_ENGINE_PATH="$VLLM_INSTALL_PATH/v1/engine"

function mock_vllm_async_llm_engine {
    # backup original file
    mv $VLLM_INSTALL_PATH/engine/multiprocessing/client.py $VLLM_INSTALL_PATH/engine/multiprocessing/client.py.backup
    cp $VLLM_INSTALL_PATH/engine/multiprocessing/client.py.backup $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
    mv $VLLM_V1_ENGINE_PATH/async_llm.py $VLLM_V1_ENGINE_PATH/async_llm.py.backup
    cp $VLLM_V1_ENGINE_PATH/async_llm.py.backup $VLLM_V1_ENGINE_PATH/async_llm.py
    # overwrite the original check_health method
    echo -e "" >> $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
    echo -e "    async def check_health(self, check_count=[0]):" >> $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
    echo -e "        check_count[0] += 1" >> $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
    echo -e "        if check_count[0] > 1:" >> $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
    echo -e "            raise RuntimeError(\"Simulated vLLM check_health() failure\")" >> $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
    echo -e "" >> $VLLM_V1_ENGINE_PATH/async_llm.py
    echo -e "    async def check_health(self, check_count=[0]):" >> $VLLM_V1_ENGINE_PATH/async_llm.py
    echo -e "        check_count[0] += 1" >> $VLLM_V1_ENGINE_PATH/async_llm.py
    echo -e "        if check_count[0] > 1:" >> $VLLM_V1_ENGINE_PATH/async_llm.py
    echo -e "            raise RuntimeError(\"Simulated vLLM check_health() failure\")" >> $VLLM_V1_ENGINE_PATH/async_llm.py
}

function unmock_vllm_async_llm_engine {
    # restore from backup
    rm -f $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
    mv $VLLM_INSTALL_PATH/engine/multiprocessing/client.py.backup $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
    rm -f $VLLM_V1_ENGINE_PATH/async_llm.py
    mv $VLLM_V1_ENGINE_PATH/async_llm.py.backup $VLLM_V1_ENGINE_PATH/async_llm.py
}

function test_check_health {
@@ -93,8 +95,12 @@ function test_check_health {
}

# Test health check unspecified
# Cold start on SBSA device can take longer than default 120 seconds
PREV_SERVER_TIMEOUT=$SERVER_TIMEOUT
SERVER_TIMEOUT=240
setup_model_repository
test_check_health "health_check_unspecified" "test_vllm_is_healthy"
SERVER_TIMEOUT=$PREV_SERVER_TIMEOUT

# Test health check disabled
setup_model_repository
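For readability, this is approximately the method that the mock appends to vllm/v1/engine/async_llm.py once mock_vllm_async_llm_engine has run (indentation assumed so it lands inside the V1 engine class): the first health check succeeds and every later one raises.

    # Sketch of the Python appended by mock_vllm_async_llm_engine:
    async def check_health(self, check_count=[0]):
        # Mutable default acts as a call counter: the first check passes,
        # every subsequent check raises.
        check_count[0] += 1
        if check_count[0] > 1:
            raise RuntimeError("Simulated vLLM check_health() failure")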
2 changes: 1 addition & 1 deletion samples/model_repository/vllm_model/1/model.json
@@ -1,5 +1,5 @@
{
"model":"facebook/opt-125m",
"gpu_memory_utilization": 0.5,
"gpu_memory_utilization": 0.1,
"enforce_eager": true
}
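These model.json fields map onto vLLM engine arguments, where gpu_memory_utilization is the fraction of GPU memory the engine is allowed to claim for weights and KV cache. A sketch of the assumed mapping (not the backend's actual loading code):

from vllm.engine.arg_utils import AsyncEngineArgs

# Assumed equivalent of samples/model_repository/vllm_model/1/model.json.
engine_args = AsyncEngineArgs(
    model="facebook/opt-125m",
    gpu_memory_utilization=0.1,  # fraction of GPU memory the engine may claim
    enforce_eager=True,          # skip CUDA graph capture
)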