Commit 7e4cd07

[V0 Deprecation] Remove VLLM_USE_V1 from docs and scripts (#26336)
Signed-off-by: DarkLight1337 <[email protected]>
1 parent: 46b0779

11 files changed (+17 additions, -26 deletions)

.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 0 additions & 5 deletions
```diff
@@ -454,11 +454,6 @@ main() {
     fi
     check_hf_token

-    # Set to v1 to run v1 benchmark
-    if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
-        export VLLM_USE_V1=1
-    fi
-
     # dependencies
     (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
     (which jq) || (apt-get update && apt-get -y install jq)
```
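Before this change, a benchmark run had to opt into the V1 engine through the `ENGINE_VERSION` variable read by the removed block; afterwards V1 is simply the default. A minimal sketch of the two invocations (the script path and variable name come from this diff; the invocation itself is illustrative):

```bash
# Before this commit (illustrative): opting into the V1 engine explicitly.
ENGINE_VERSION=v1 bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh

# After this commit: no engine-selection variable is needed.
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```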

.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh

Lines changed: 1 addition & 2 deletions
```diff
@@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
+
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"

 echo "--- Hardware Information ---"
 # tpu-info
```

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

Lines changed: 1 addition & 2 deletions
```diff
@@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
+
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"

 echo "--- Hardware Information ---"
 # tpu-info
```

.buildkite/scripts/tpu/run_bm.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -42,7 +42,7 @@ echo "lanching vllm..."
 echo "logging to $VLLM_LOG"
 echo

-VLLM_USE_V1=1 vllm serve $MODEL \
+vllm serve $MODEL \
     --seed 42 \
     --max-num-seqs $MAX_NUM_SEQS \
     --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
```

benchmarks/auto_tune/auto_tune.sh

Lines changed: 2 additions & 2 deletions
```diff
@@ -96,11 +96,11 @@ start_server() {
     # This correctly passes each element as a separate argument.
     if [[ -n "$profile_dir" ]]; then
         # Start server with profiling enabled
-        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
+        VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
             vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
     else
         # Start server without profiling
-        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
+        VLLM_SERVER_DEV_MODE=1 \
             vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
     fi
     local server_pid=$!
```
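The profiling branch of that function now relies only on the two variables that remain in the diff. A minimal standalone sketch of the same launch (the model name, profile directory, and port are illustrative placeholders, not values from auto_tune.sh):

```bash
# Hedged sketch: start a server with the torch profiler enabled, as the
# script's profiling branch now does, with no VLLM_USE_V1 required.
VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=/tmp/vllm_profile \
    vllm serve meta-llama/Llama-3.2-1B --port 8000 > vllm.log 2>&1 &
```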

docs/design/p2p_nccl_connector.md

Lines changed: 8 additions & 8 deletions
````diff
@@ -97,7 +97,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"

     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20001 \
         --tensor-parallel-size 1 \
@@ -118,7 +118,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"

     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20002 \
         --tensor-parallel-size 1 \
@@ -139,7 +139,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"

     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20003 \
         --tensor-parallel-size 1 \
@@ -160,7 +160,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"

     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20004 \
         --tensor-parallel-size 1 \
@@ -190,7 +190,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"

     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20001 \
         --tensor-parallel-size 1 \
@@ -211,7 +211,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"

     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20002 \
         --tensor-parallel-size 1 \
@@ -232,7 +232,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"

     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20003 \
         --tensor-parallel-size 1 \
@@ -253,7 +253,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"

     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20004 \
         --tensor-parallel-size 1 \
````

docs/design/torch_compile.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -2,7 +2,7 @@

 In vLLM's V1 architecture, `torch.compile` is enabled by default and is a critical part of the framework. This document gives a simple walk-through example to show how to understand the `torch.compile` usage.

-Throughout the example, we will run a common Llama model using v1, and turn on debug level logging to show all the details. The command to be used is `VLLM_USE_V1=1 VLLM_LOGGING_LEVEL=DEBUG vllm serve meta-llama/Llama-3.2-1B`.
+Throughout the example, we will run a common Llama model, and turn on debug level logging to show all the details. The command to be used is `VLLM_LOGGING_LEVEL=DEBUG vllm serve meta-llama/Llama-3.2-1B`.

 ## Compilation Cache

```
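To see the compilation logs the walk-through refers to, it is enough to start the server with that exact command and send one request. A hedged sketch; the `/v1/completions` endpoint and default port 8000 are standard vLLM OpenAI-compatible behavior, not part of this doc:

```bash
# Start the server with debug logging, as the walk-through describes.
VLLM_LOGGING_LEVEL=DEBUG vllm serve meta-llama/Llama-3.2-1B &

# Once the server is up, send one completion so the compiled path is exercised.
curl -s http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "meta-llama/Llama-3.2-1B", "prompt": "Hello", "max_tokens": 8}'
```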

examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh

Lines changed: 2 additions & 2 deletions
```diff
@@ -166,7 +166,7 @@ main() {
         local kv_port=$((21001 + i))

         echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
-        CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \
+        CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
             --enforce-eager \
             --host 0.0.0.0 \
             --port $port \
@@ -194,7 +194,7 @@ main() {
         local kv_port=$((22001 + i))

         echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
-        VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
+        CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
             --enforce-eager \
             --host 0.0.0.0 \
             --port $port \
```

examples/online_serving/elastic_ep/serve_deepseek_v2.sh

Lines changed: 0 additions & 1 deletion
```diff
@@ -55,7 +55,6 @@ done
 echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS"

 export RAY_DEDUP_LOGS=0
-export VLLM_USE_V1=1
 export VLLM_ALL2ALL_BACKEND="pplx"
 export VLLM_USE_DEEP_GEMM=1

```

examples/online_serving/openai_chat_completion_client_with_tools_required.py

Lines changed: 1 addition & 1 deletion
````diff
@@ -5,7 +5,7 @@
 without any specific flags:

 ```bash
-VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
+vllm serve unsloth/Llama-3.2-1B-Instruct \
     --structured-outputs-config.backend outlines
 ```

````
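Since this example demonstrates forcing tool calls, a hedged smoke test of the server started by that docstring command may help. The endpoint and `"tool_choice": "required"` follow the standard OpenAI-compatible API; the tool schema below is purely illustrative:

```bash
# Illustrative request: force the model to emit a tool call via "required".
curl -s http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "unsloth/Llama-3.2-1B-Instruct",
        "messages": [{"role": "user", "content": "What is the weather in Berlin?"}],
        "tools": [{
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"]
                }
            }
        }],
        "tool_choice": "required"
    }'
```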
