Commit 7e4cd07

[V0 Deprecation] Remove VLLM_USE_V1 from docs and scripts (#26336)
Signed-off-by: DarkLight1337 <[email protected]>
1 parent: 46b0779

11 files changed (+17 additions, -26 deletions)

.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 0 additions & 5 deletions
```diff
@@ -454,11 +454,6 @@ main() {
     fi
     check_hf_token

-    # Set to v1 to run v1 benchmark
-    if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
-        export VLLM_USE_V1=1
-    fi
-
     # dependencies
     (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
     (which jq) || (apt-get update && apt-get -y install jq)
```
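Before this change, a benchmark run had to opt into the V1 engine through the `ENGINE_VERSION` variable read by the removed block; afterwards V1 is simply the default. A minimal sketch of the two invocations (the script path and variable name come from this diff; the invocation itself is illustrative):

```bash
# Before this commit (illustrative): opting into the V1 engine explicitly.
ENGINE_VERSION=v1 bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh

# After this commit: no engine-selection variable is needed.
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```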

.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh

Lines changed: 1 addition & 2 deletions
```diff
@@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
+
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"

 echo "--- Hardware Information ---"
 # tpu-info
```

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

Lines changed: 1 addition & 2 deletions
```diff
@@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
+
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"

 echo "--- Hardware Information ---"
 # tpu-info
```

.buildkite/scripts/tpu/run_bm.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -42,7 +42,7 @@ echo "lanching vllm..."
 echo "logging to $VLLM_LOG"
 echo

-VLLM_USE_V1=1 vllm serve $MODEL \
+vllm serve $MODEL \
     --seed 42 \
     --max-num-seqs $MAX_NUM_SEQS \
     --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
```

benchmarks/auto_tune/auto_tune.sh

Lines changed: 2 additions & 2 deletions
```diff
@@ -96,11 +96,11 @@ start_server() {
     # This correctly passes each element as a separate argument.
     if [[ -n "$profile_dir" ]]; then
         # Start server with profiling enabled
-        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
+        VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
             vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
     else
         # Start server without profiling
-        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
+        VLLM_SERVER_DEV_MODE=1 \
             vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
     fi
     local server_pid=$!
```
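The profiling branch of that function now relies only on the two variables that remain in the diff. A minimal standalone sketch of the same launch (the model name, profile directory, and port are illustrative placeholders, not values from auto_tune.sh):

```bash
# Hedged sketch: start a server with the torch profiler enabled, as the
# script's profiling branch now does, with no VLLM_USE_V1 required.
VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=/tmp/vllm_profile \
    vllm serve meta-llama/Llama-3.2-1B --port 8000 > vllm.log 2>&1 &
```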

docs/design/p2p_nccl_connector.md

Lines changed: 8 additions & 8 deletions
````diff
@@ -97,7 +97,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"

     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20001 \
         --tensor-parallel-size 1 \
@@ -118,7 +118,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"

     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20002 \
         --tensor-parallel-size 1 \
@@ -139,7 +139,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"

     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20003 \
         --tensor-parallel-size 1 \
@@ -160,7 +160,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"

     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20004 \
         --tensor-parallel-size 1 \
@@ -190,7 +190,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"

     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20001 \
         --tensor-parallel-size 1 \
@@ -211,7 +211,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"

     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20002 \
         --tensor-parallel-size 1 \
@@ -232,7 +232,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"

     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20003 \
         --tensor-parallel-size 1 \
@@ -253,7 +253,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"

     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20004 \
         --tensor-parallel-size 1 \
````

docs/design/torch_compile.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -2,7 +2,7 @@

 In vLLM's V1 architecture, `torch.compile` is enabled by default and is a critical part of the framework. This document gives a simple walk-through example to show how to understand the `torch.compile` usage.

-Throughout the example, we will run a common Llama model using v1, and turn on debug level logging to show all the details. The command to be used is `VLLM_USE_V1=1 VLLM_LOGGING_LEVEL=DEBUG vllm serve meta-llama/Llama-3.2-1B`.
+Throughout the example, we will run a common Llama model, and turn on debug level logging to show all the details. The command to be used is `VLLM_LOGGING_LEVEL=DEBUG vllm serve meta-llama/Llama-3.2-1B`.

 ## Compilation Cache

```
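To see the compilation logs the walk-through refers to, it is enough to start the server with that exact command and send one request. A hedged sketch; the `/v1/completions` endpoint and default port 8000 are standard vLLM OpenAI-compatible behavior, not part of this doc:

```bash
# Start the server with debug logging, as the walk-through describes.
VLLM_LOGGING_LEVEL=DEBUG vllm serve meta-llama/Llama-3.2-1B &

# Once the server is up, send one completion so the compiled path is exercised.
curl -s http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "meta-llama/Llama-3.2-1B", "prompt": "Hello", "max_tokens": 8}'
```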

examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh

Lines changed: 2 additions & 2 deletions
```diff
@@ -166,7 +166,7 @@ main() {
         local kv_port=$((21001 + i))

         echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
-        CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \
+        CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
             --enforce-eager \
             --host 0.0.0.0 \
             --port $port \
@@ -194,7 +194,7 @@ main() {
         local kv_port=$((22001 + i))

         echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
-        VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
+        CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
             --enforce-eager \
             --host 0.0.0.0 \
             --port $port \
```

examples/online_serving/elastic_ep/serve_deepseek_v2.sh

Lines changed: 0 additions & 1 deletion
```diff
@@ -55,7 +55,6 @@ done
 echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS"

 export RAY_DEDUP_LOGS=0
-export VLLM_USE_V1=1
 export VLLM_ALL2ALL_BACKEND="pplx"
 export VLLM_USE_DEEP_GEMM=1

```

examples/online_serving/openai_chat_completion_client_with_tools_required.py

Lines changed: 1 addition & 1 deletion
````diff
@@ -5,7 +5,7 @@
 without any specific flags:

 ```bash
-VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
+vllm serve unsloth/Llama-3.2-1B-Instruct \
     --structured-outputs-config.backend outlines
 ```

````
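Since this example demonstrates forcing tool calls, a hedged smoke test of the server started by that docstring command may help. The endpoint and `"tool_choice": "required"` follow the standard OpenAI-compatible API; the tool schema below is purely illustrative:

```bash
# Illustrative request: force the model to emit a tool call via "required".
curl -s http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "unsloth/Llama-3.2-1B-Instruct",
        "messages": [{"role": "user", "content": "What is the weather in Berlin?"}],
        "tools": [{
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"]
                }
            }
        }],
        "tool_choice": "required"
    }'
```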
