2 changes: 0 additions & 2 deletions ci/L0_backend_vllm/accuracy_test/accuracy_test.py
@@ -190,7 +190,6 @@ def test_guided_decoding(self):
        sampling_params = SAMPLING_PARAMETERS
        guided_decoding_params = {
            "choice": ["Positive", "Negative"],
            "backend": "outlines",
        }
        sampling_params["guided_decoding"] = json.dumps(guided_decoding_params)
        for i in range(len(GUIDED_PROMPTS)):
@@ -245,7 +244,6 @@ def tearDown(self):
    if FLAGS.generate_guided_baseline:
        guided_decoding_params = {
            "choice": ["Positive", "Negative"],
            "backend": "outlines",
        }
        guided_generation = GuidedDecodingParams(**guided_decoding_params)
        asyncio.run(
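With the "backend" key dropped, guided decoding is left to vLLM's default backend instead of being pinned to outlines. A minimal sketch of how these choice-constrained parameters are consumed, assuming a recent vLLM where GuidedDecodingParams is importable from vllm.sampling_params:

import json

from vllm.sampling_params import GuidedDecodingParams, SamplingParams

# No "backend" key: vLLM selects its default guided-decoding backend.
guided_decoding_params = {"choice": ["Positive", "Negative"]}

# Direct engine usage builds the params object explicitly ...
guided = GuidedDecodingParams(**guided_decoding_params)
sampling = SamplingParams(temperature=0, guided_decoding=guided)

# ... while the Triton client serializes the same dict to JSON, as the test does.
sampling_parameters = {"guided_decoding": json.dumps(guided_decoding_params)}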
12 changes: 0 additions & 12 deletions ci/L0_backend_vllm/accuracy_test/test.sh
@@ -48,17 +48,11 @@ RET=0
set +e
# Need to generate baseline first, since running 2 vLLM engines causes
# memory issues: https://github.com/vllm-project/vllm/issues/2248
export VLLM_USE_V1=0
export VLLM_WORKER_MULTIPROC_METHOD=spawn
python3 $CLIENT_PY --generate-baseline >> $VLLM_ENGINE_LOG 2>&1 & BASELINE_PID=$!
wait $BASELINE_PID

python3 $CLIENT_PY --generate-guided-baseline > $VLLM_ENGINE_LOG 2>&1 & BASELINE_PID=$!
wait $BASELINE_PID

unset VLLM_USE_V1
unset VLLM_WORKER_MULTIPROC_METHOD

set -e

run_server
@@ -88,12 +82,6 @@ set -e
kill $SERVER_PID
wait $SERVER_PID

# Check that warning about V1 Engine appears in log - this warning is expected
if ! grep -q "Engine in background thread is experimental on VLLM_USE_V1=1. Falling back to V0 Engine." $SERVER_LOG; then
echo -e "\n***\n*** ERROR: Expected warning about vLLM falling back to V0 Engine not found in logs.\n***"
RET=1
fi

rm -rf models/

if [ $RET -eq 1 ]; then
8 changes: 3 additions & 5 deletions ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
@@ -173,13 +173,11 @@ def test_vllm_metrics(self):
    # TODO: Revisit this test due to the removal of best_of
    def test_custom_sampling_params(self):
        # Adding sampling parameters for testing metrics.
        # Definitions can be found here https://docs.vllm.ai/en/latest/dev/sampling_params.html
        n, best_of = 2, 4
        # Definitions can be found here https://docs.vllm.ai/en/latest/api/vllm/sampling_params.html
        n, temperature = 2, 1
        custom_sampling_parameters = self.sampling_parameters.copy()
        # Changing "temperature" because "best_of" must be 1 when using greedy
        # sampling, i.e. "temperature": "0".
        custom_sampling_parameters.update(
            {"n": str(n), "best_of": str(best_of), "temperature": "1"}
            {"n": str(n), "temperature": str(temperature)}
        )

        # Test vLLM metrics
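The TODO above refers to the removal of best_of from vLLM's SamplingParams, so the test now varies only n and temperature. A hedged sketch of the equivalent engine-side parameters, assuming the standard vllm package import:

from vllm import SamplingParams

# Two sampled completions; temperature 1.0 keeps sampling non-greedy so the
# two outputs can differ (best_of is no longer set alongside n).
params = SamplingParams(n=2, temperature=1.0)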
3 changes: 3 additions & 0 deletions ci/L0_backend_vllm/test.sh
@@ -28,6 +28,9 @@
RET=0
SUBTESTS="accuracy_test request_cancellation enabled_stream vllm_backend metrics_test"

export C_INCLUDE_PATH=/usr/local/cuda/include:$C_INCLUDE_PATH
export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas

python3 -m pip install tritonclient[grpc]

for TEST in ${SUBTESTS}; do
28 changes: 17 additions & 11 deletions ci/L0_check_health_vllm/test.sh
@@ -31,11 +31,12 @@ source ../common/util.sh
pip3 install pytest==8.1.1
pip3 install tritonclient[grpc]

rm -f *.log *.report.xml
RET=0

function setup_model_repository {
    local sample_model_repo_path=${1:-"../../samples/model_repository"}
    rm -rf models vllm_baseline_output.pkl && mkdir -p models
    local sample_model_repo_path="../../samples/model_repository"
    rm -rf models && mkdir -p models
    cp -r $sample_model_repo_path/vllm_model models/vllm_opt
}

@@ -48,23 +49,24 @@ function enable_health_check {
}

VLLM_INSTALL_PATH="/usr/local/lib/python3.12/dist-packages/vllm"
VLLM_V1_ENGINE_PATH="$VLLM_INSTALL_PATH/v1/engine"

function mock_vllm_async_llm_engine {
    # backup original file
    mv $VLLM_INSTALL_PATH/engine/multiprocessing/client.py $VLLM_INSTALL_PATH/engine/multiprocessing/client.py.backup
    cp $VLLM_INSTALL_PATH/engine/multiprocessing/client.py.backup $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
    mv $VLLM_V1_ENGINE_PATH/async_llm.py $VLLM_V1_ENGINE_PATH/async_llm.py.backup
    cp $VLLM_V1_ENGINE_PATH/async_llm.py.backup $VLLM_V1_ENGINE_PATH/async_llm.py
    # overwrite the original check_health method
    echo -e "" >> $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
    echo -e "    async def check_health(self, check_count=[0]):" >> $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
    echo -e "        check_count[0] += 1" >> $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
    echo -e "        if check_count[0] > 1:" >> $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
    echo -e "            raise RuntimeError(\"Simulated vLLM check_health() failure\")" >> $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
    echo -e "" >> $VLLM_V1_ENGINE_PATH/async_llm.py
    echo -e "    async def check_health(self, check_count=[0]):" >> $VLLM_V1_ENGINE_PATH/async_llm.py
    echo -e "        check_count[0] += 1" >> $VLLM_V1_ENGINE_PATH/async_llm.py
    echo -e "        if check_count[0] > 1:" >> $VLLM_V1_ENGINE_PATH/async_llm.py
    echo -e "            raise RuntimeError(\"Simulated vLLM check_health() failure\")" >> $VLLM_V1_ENGINE_PATH/async_llm.py
}

function unmock_vllm_async_llm_engine {
    # restore from backup
    rm -f $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
    mv $VLLM_INSTALL_PATH/engine/multiprocessing/client.py.backup $VLLM_INSTALL_PATH/engine/multiprocessing/client.py
    rm -f $VLLM_V1_ENGINE_PATH/async_llm.py
    mv $VLLM_V1_ENGINE_PATH/async_llm.py.backup $VLLM_V1_ENGINE_PATH/async_llm.py
}

function test_check_health {
@@ -93,8 +95,12 @@ function test_check_health {
}

# Test health check unspecified
# Cold start on SBSA device can take longer than default 120 seconds
PREV_SERVER_TIMEOUT=$SERVER_TIMEOUT
SERVER_TIMEOUT=240
setup_model_repository
test_check_health "health_check_unspecified" "test_vllm_is_healthy"
SERVER_TIMEOUT=$PREV_SERVER_TIMEOUT

# Test health check disabled
setup_model_repository
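For readability, this is approximately the method that the mock appends to vllm/v1/engine/async_llm.py once mock_vllm_async_llm_engine has run (indentation assumed so it lands inside the V1 engine class): the first health check succeeds and every later one raises.

    # Sketch of the Python appended by mock_vllm_async_llm_engine:
    async def check_health(self, check_count=[0]):
        # Mutable default acts as a call counter: the first check passes,
        # every subsequent check raises.
        check_count[0] += 1
        if check_count[0] > 1:
            raise RuntimeError("Simulated vLLM check_health() failure")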
2 changes: 1 addition & 1 deletion samples/model_repository/vllm_model/1/model.json
@@ -1,5 +1,5 @@
{
"model":"facebook/opt-125m",
"gpu_memory_utilization": 0.5,
"gpu_memory_utilization": 0.1,
"enforce_eager": true
}
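These model.json fields map onto vLLM engine arguments, where gpu_memory_utilization is the fraction of GPU memory the engine is allowed to claim for weights and KV cache. A sketch of the assumed mapping (not the backend's actual loading code):

from vllm.engine.arg_utils import AsyncEngineArgs

# Assumed equivalent of samples/model_repository/vllm_model/1/model.json.
engine_args = AsyncEngineArgs(
    model="facebook/opt-125m",
    gpu_memory_utilization=0.1,  # fraction of GPU memory the engine may claim
    enforce_eager=True,          # skip CUDA graph capture
)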