Commit 5b8a7c1

[Misc] centralize all usage of environment variables (#4548)
1 parent 1ff0c73 commit 5b8a7c1

18 files changed: +220 −64 lines changed

vllm/attention/backends/rocm_flash_attn.py

Lines changed: 2 additions & 3 deletions
@@ -1,10 +1,10 @@
 """Attention layer ROCm GPUs."""
-import os
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple, Type

 import torch

+import vllm.envs as envs
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata,
                                               AttentionMetadataPerStage)
@@ -156,8 +156,7 @@ def __init__(

         self.use_naive_attn = False
         # NOTE: Allow for switching between Triton and CK. Defaulting to triton.
-        self.use_triton_flash_attn = (os.environ.get(
-            "VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1"))
+        self.use_triton_flash_attn = envs.VLLM_USE_TRITON_FLASH_ATTN
         if self.use_triton_flash_attn:
             from vllm.attention.ops.triton_flash_attention import (  # noqa: F401
                 triton_attention)
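The parsing removed here is re-registered unchanged in vllm/envs.py (shown later in this diff). A minimal standalone sketch of that truthiness rule, mirroring the VLLM_USE_TRITON_FLASH_ATTN lambda:

    import os

    def use_triton_flash_attn() -> bool:
        # Same rule as the envs.py lambda: "True"/"true"/"1" enable Triton,
        # anything else falls back to the CK path.
        return os.environ.get("VLLM_USE_TRITON_FLASH_ATTN",
                              "True").lower() in ("true", "1")

    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
    assert use_triton_flash_attn() is False

    del os.environ["VLLM_USE_TRITON_FLASH_ATTN"]
    assert use_triton_flash_attn() is True  # unset defaults to Triton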

vllm/attention/selector.py

Lines changed: 2 additions & 4 deletions
@@ -1,18 +1,16 @@
 import enum
-import os
 from functools import lru_cache
 from typing import Type

 import torch

+import vllm.envs as envs
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.logger import init_logger
 from vllm.utils import is_cpu, is_hip

 logger = init_logger(__name__)

-VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"
-

 class _Backend(enum.Enum):
     FLASH_ATTN = enum.auto()
@@ -79,7 +77,7 @@ def _which_attn_to_use(dtype: torch.dtype) -> _Backend:
             "package is not found. Please install it for better performance.")
         return _Backend.XFORMERS

-    backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
+    backend_by_env_var = envs.VLLM_ATTENTION_BACKEND
     if backend_by_env_var is not None:
         return _Backend[backend_by_env_var]

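The lookup that follows the new envs read is a plain name-based Enum access, so an unrecognized VLLM_ATTENTION_BACKEND value still raises KeyError. A standalone sketch; the backend names come from the vllm/envs.py comment, and the FLASH_ATTN fallback here is illustrative only:

    import enum

    class _Backend(enum.Enum):
        FLASH_ATTN = enum.auto()
        XFORMERS = enum.auto()
        ROCM_FLASH = enum.auto()
        TORCH_SDPA = enum.auto()

    def pick_backend(backend_by_env_var):
        if backend_by_env_var is not None:
            # Name-based lookup: "XFORMERS" -> _Backend.XFORMERS; typos raise KeyError.
            return _Backend[backend_by_env_var]
        return _Backend.FLASH_ATTN  # illustrative default

    assert pick_backend("XFORMERS") is _Backend.XFORMERS
    assert pick_backend(None) is _Backend.FLASH_ATTN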

vllm/config.py

Lines changed: 0 additions & 5 deletions
@@ -1,6 +1,5 @@
 import enum
 import json
-import os
 from dataclasses import dataclass, field, fields
 from typing import TYPE_CHECKING, ClassVar, List, Optional, Union

@@ -24,10 +23,6 @@

 logger = init_logger(__name__)

-# If true, will load models from ModelScope instead of Hugging Face Hub.
-VLLM_USE_MODELSCOPE = os.environ.get("VLLM_USE_MODELSCOPE",
-                                     "False").lower() == "true"
-
 _GB = 1 << 30

3328

vllm/distributed/device_communicators/custom_all_reduce.py

Lines changed: 4 additions & 4 deletions
@@ -1,10 +1,10 @@
-import os
 from contextlib import contextmanager
 from typing import Any, List, Optional

 import torch
 import torch.distributed as dist

+import vllm.envs as envs
 from vllm.logger import init_logger

 try:
@@ -54,9 +54,9 @@ def init_custom_ar() -> None:
         return
     # test nvlink first, this will filter out most of the cases
     # where custom allreduce is not supported
-    if "CUDA_VISIBLE_DEVICES" in os.environ:
-        device_ids = list(
-            map(int, os.environ["CUDA_VISIBLE_DEVICES"].split(",")))
+    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
+    if cuda_visible_devices:
+        device_ids = list(map(int, cuda_visible_devices.split(",")))
     else:
         device_ids = list(range(num_dev))
     # this checks hardware and driver support for NVLink
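Besides moving the read into envs, the new guard `if cuda_visible_devices:` treats both an unset and an empty CUDA_VISIBLE_DEVICES as "use all devices", whereas the old membership test would have attempted int("") on an empty string and raised ValueError. A standalone sketch of the resulting behavior:

    def derive_device_ids(cuda_visible_devices, num_dev):
        # Mirrors the new logic: a non-empty value wins; None or "" means all devices.
        if cuda_visible_devices:
            return list(map(int, cuda_visible_devices.split(",")))
        return list(range(num_dev))

    assert derive_device_ids("0,2", 4) == [0, 2]
    assert derive_device_ids(None, 4) == [0, 1, 2, 3]
    assert derive_device_ids("", 4) == [0, 1, 2, 3]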

vllm/distributed/parallel_state.py

Lines changed: 2 additions & 2 deletions
@@ -4,11 +4,11 @@
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 """Tensor and pipeline parallel groups."""
 import contextlib
-import os
 from typing import Optional

 import torch

+import vllm.envs as envs
 from vllm.logger import init_logger

 logger = init_logger(__name__)
@@ -80,7 +80,7 @@ def init_distributed_environment(
     # local_rank is not available in torch ProcessGroup,
     # see https://github.com/pytorch/pytorch/issues/122816
     if local_rank == -1 and distributed_init_method == "env://":
-        local_rank = int(os.environ['LOCAL_RANK'])
+        local_rank = envs.LOCAL_RANK
     global _LOCAL_RANK
     _LOCAL_RANK = local_rank

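One behavioral difference worth noting: the old inline read raised KeyError when LOCAL_RANK was unset, while envs.LOCAL_RANK (see vllm/envs.py) falls back to 0. A standalone sketch of the two paths:

    import os

    os.environ.pop("LOCAL_RANK", None)

    try:
        local_rank = int(os.environ["LOCAL_RANK"])  # old path: KeyError when unset
    except KeyError:
        local_rank = None

    assert local_rank is None
    assert int(os.environ.get("LOCAL_RANK", "0")) == 0  # new path via envs.LOCAL_RANK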

vllm/distributed/utils.py

Lines changed: 5 additions & 2 deletions
@@ -9,6 +9,7 @@
 import torch
 import torch.distributed as dist

+import vllm.envs as envs
 from vllm.logger import init_logger

 from .parallel_state import get_cpu_world_group, get_local_rank
@@ -102,11 +103,13 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
     is_distributed = dist.is_initialized()

     num_dev = torch.cuda.device_count()
-    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
     if cuda_visible_devices is None:
         cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
+    VLLM_CONFIG_ROOT = envs.VLLM_CONFIG_ROOT
     path = os.path.expanduser(
-        f"~/.config/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json")
+        f"{VLLM_CONFIG_ROOT}/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
+    )
     os.makedirs(os.path.dirname(path), exist_ok=True)
     if (not is_distributed or get_local_rank() == 0) \
             and (not os.path.exists(path)):
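The cache file moves from a hard-coded ~/.config to whatever VLLM_CONFIG_ROOT resolves to. A standalone sketch of the path construction, mirroring the fallback chain registered in vllm/envs.py:

    import os

    def p2p_cache_path(cuda_visible_devices: str) -> str:
        # VLLM_CONFIG_ROOT falls back to XDG_CONFIG_HOME, then to ~/.config.
        config_root = (os.environ.get("VLLM_CONFIG_ROOT")
                       or os.getenv("XDG_CONFIG_HOME")
                       or os.path.expanduser("~/.config"))
        return os.path.expanduser(
            f"{config_root}/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json")

    print(p2p_cache_path("0,1"))  # e.g. /home/user/.config/vllm/gpu_p2p_access_cache_for_0,1.json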

vllm/engine/async_llm_engine.py

Lines changed: 2 additions & 3 deletions
@@ -1,12 +1,12 @@
 import asyncio
-import os
 import time
 from functools import partial
 from typing import (Any, AsyncIterator, Callable, Dict, Iterable, List,
                     Optional, Set, Tuple, Type, Union)

 from transformers import PreTrainedTokenizer

+import vllm.envs as envs
 from vllm.config import DecodingConfig, ModelConfig
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.engine.arg_utils import AsyncEngineArgs
@@ -20,8 +20,7 @@
 from vllm.usage.usage_lib import UsageContext

 logger = init_logger(__name__)
-ENGINE_ITERATION_TIMEOUT_S = int(
-    os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60"))
+ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S


 class AsyncEngineDeadError(RuntimeError):
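The constant keeps its name and 60-second default; only the parsing moves into envs. Purely for illustration, this is how a per-iteration timeout like this is commonly enforced with asyncio.wait_for; the engine's real loop body is not part of this hunk, and engine_step here is a hypothetical coroutine:

    import asyncio

    ENGINE_ITERATION_TIMEOUT_S = 60  # the default of envs.VLLM_ENGINE_ITERATION_TIMEOUT_S

    async def engine_step() -> bool:
        await asyncio.sleep(0.01)  # stand-in for one engine iteration
        return True

    async def run_one_iteration() -> bool:
        # Raises asyncio.TimeoutError if a single iteration exceeds the budget.
        return await asyncio.wait_for(engine_step(), timeout=ENGINE_ITERATION_TIMEOUT_S)

    assert asyncio.run(run_one_iteration())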

vllm/entrypoints/openai/api_server.py

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,6 @@
 import asyncio
 import importlib
 import inspect
-import os
 import re
 from contextlib import asynccontextmanager
 from http import HTTPStatus
@@ -16,6 +15,7 @@
 from starlette.routing import Mount

 import vllm
+import vllm.envs as envs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.cli_args import make_arg_parser
@@ -129,7 +129,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
         allow_headers=args.allowed_headers,
     )

-    if token := os.environ.get("VLLM_API_KEY") or args.api_key:
+    if token := envs.VLLM_API_KEY or args.api_key:

        @app.middleware("http")
        async def authentication(request: Request, call_next):
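The middleware body is elided in this hunk; only the token source changes. For orientation, a minimal bearer-token guard in the same FastAPI shape (assuming fastapi is installed; this is an illustrative sketch, not vLLM's actual implementation):

    from fastapi import FastAPI, Request
    from fastapi.responses import JSONResponse

    app = FastAPI()
    token = "example-key"  # stands in for envs.VLLM_API_KEY or args.api_key

    @app.middleware("http")
    async def authentication(request: Request, call_next):
        # Reject any request that does not carry the expected bearer token.
        if request.headers.get("Authorization") != f"Bearer {token}":
            return JSONResponse(status_code=401, content={"error": "Unauthorized"})
        return await call_next(request)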

vllm/envs.py

Lines changed: 160 additions & 0 deletions
@@ -0,0 +1,160 @@
+import os
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional
+
+if TYPE_CHECKING:
+    VLLM_HOST_IP: str = ""
+    VLLM_USE_MODELSCOPE: bool = False
+    VLLM_INSTANCE_ID: Optional[str] = None
+    VLLM_NCCL_SO_PATH: Optional[str] = None
+    LD_LIBRARY_PATH: Optional[str] = None
+    VLLM_USE_TRITON_FLASH_ATTN: bool = False
+    LOCAL_RANK: int = 0
+    CUDA_VISIBLE_DEVICES: Optional[str] = None
+    VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
+    VLLM_API_KEY: Optional[str] = None
+    S3_ACCESS_KEY_ID: Optional[str] = None
+    S3_SECRET_ACCESS_KEY: Optional[str] = None
+    S3_ENDPOINT_URL: Optional[str] = None
+    VLLM_CONFIG_ROOT: str = ""
+    VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
+    VLLM_NO_USAGE_STATS: bool = False
+    VLLM_DO_NOT_TRACK: bool = False
+    VLLM_USAGE_SOURCE: str = ""
+    VLLM_CONFIGURE_LOGGING: int = 1
+    VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
+    VLLM_TRACE_FUNCTION: int = 0
+    VLLM_ATTENTION_BACKEND: Optional[str] = None
+    VLLM_CPU_KVCACHE_SPACE: int = 0
+    VLLM_USE_RAY_COMPILED_DAG: bool = False
+    VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
+
+environment_variables: Dict[str, Callable[[], Any]] = {
+    # used in distributed environment to determine the master address
+    'VLLM_HOST_IP':
+    lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""),
+
+    # If true, will load models from ModelScope instead of Hugging Face Hub.
+    # note that the value is true or false, not numbers
+    "VLLM_USE_MODELSCOPE":
+    lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true",
+
+    # Instance id represents an instance of the VLLM. All processes in the same
+    # instance should have the same instance id.
+    "VLLM_INSTANCE_ID":
+    lambda: os.environ.get("VLLM_INSTANCE_ID", None),
+
+    # path to cudatoolkit home directory, under which should be bin, include,
+    # and lib directories.
+    "CUDA_HOME":
+    lambda: os.environ.get("CUDA_HOME", None),
+
+    # Path to the NCCL library file. It is needed because nccl>=2.19 brought
+    # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234
+    "VLLM_NCCL_SO_PATH":
+    lambda: os.environ.get("VLLM_NCCL_SO_PATH", None),
+
+    # when `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl
+    # library file in the locations specified by `LD_LIBRARY_PATH`
+    "LD_LIBRARY_PATH":
+    lambda: os.environ.get("LD_LIBRARY_PATH", None),
+
+    # flag to control if vllm should use triton flash attention
+    "VLLM_USE_TRITON_FLASH_ATTN":
+    lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in
+             ("true", "1")),
+
+    # local rank of the process in the distributed setting, used to determine
+    # the GPU device id
+    "LOCAL_RANK":
+    lambda: int(os.environ.get("LOCAL_RANK", "0")),
+
+    # used to control the visible devices in the distributed setting
+    "CUDA_VISIBLE_DEVICES":
+    lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),
+
+    # timeout for each iteration in the engine
+    "VLLM_ENGINE_ITERATION_TIMEOUT_S":
+    lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),
+
+    # API key for VLLM API server
+    "VLLM_API_KEY":
+    lambda: os.environ.get("VLLM_API_KEY", None),
+
+    # S3 access information, used for tensorizer to load model from S3
+    "S3_ACCESS_KEY_ID":
+    lambda: os.environ.get("S3_ACCESS_KEY", None),
+    "S3_SECRET_ACCESS_KEY":
+    lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
+    "S3_ENDPOINT_URL":
+    lambda: os.environ.get("S3_ENDPOINT_URL", None),
+
+    # Root directory for VLLM configuration files
+    # Note that this not only affects how vllm finds its configuration files
+    # during runtime, but also affects how vllm installs its configuration
+    # files during **installation**.
+    "VLLM_CONFIG_ROOT":
+    lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv(
+        "XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"),
+
+    # Usage stats collection
+    "VLLM_USAGE_STATS_SERVER":
+    lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),
+    "VLLM_NO_USAGE_STATS":
+    lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
+    "VLLM_DO_NOT_TRACK":
+    lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get(
+        "DO_NOT_TRACK", None) or "0") == "1",
+    "VLLM_USAGE_SOURCE":
+    lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"),
+
+    # Logging configuration
+    # If set to 0, vllm will not configure logging
+    # If set to 1, vllm will configure logging using the default configuration
+    # or the configuration file specified by VLLM_LOGGING_CONFIG_PATH
+    "VLLM_CONFIGURE_LOGGING":
+    lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")),
+    "VLLM_LOGGING_CONFIG_PATH":
+    lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"),
+
+    # Trace function calls
+    # If set to 1, vllm will trace function calls
+    # Useful for debugging
+    "VLLM_TRACE_FUNCTION":
+    lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),
+
+    # Backend for attention computation
+    # Available options:
+    # - "TORCH_SDPA": use torch.nn.MultiheadAttention
+    # - "FLASH_ATTN": use FlashAttention
+    # - "XFORMERS": use XFormers
+    # - "ROCM_FLASH": use ROCmFlashAttention
+    "VLLM_ATTENTION_BACKEND":
+    lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),
+
+    # CPU key-value cache space
+    # default is 4GB
+    "VLLM_CPU_KVCACHE_SPACE":
+    lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),
+
+    # If the env var is set, it uses the Ray's compiled DAG API
+    # which optimizes the control plane overhead.
+    # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
+    "VLLM_USE_RAY_COMPILED_DAG":
+    lambda: bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)),
+
+    # Use dedicated multiprocess context for workers.
+    # Both spawn and fork work
+    "VLLM_WORKER_MULTIPROC_METHOD":
+    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),
+}
+
+
+def __getattr__(name):
+    # lazy evaluation of environment variables
+    if name in environment_variables:
+        return environment_variables[name]()
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+def __dir__():
+    return list(environment_variables.keys())
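The module relies on PEP 562's module-level __getattr__: every attribute access re-runs the registered lambda, so values track os.environ instead of being frozen at import time, and the `if TYPE_CHECKING:` block exists only to give type checkers and IDEs the names and types. Assuming vllm is installed, usage looks like this:

    import os
    import vllm.envs as envs

    os.environ["VLLM_TRACE_FUNCTION"] = "1"
    assert envs.VLLM_TRACE_FUNCTION == 1   # parsed to int at access time

    os.environ["VLLM_TRACE_FUNCTION"] = "0"
    assert envs.VLLM_TRACE_FUNCTION == 0   # re-evaluated on every access, not cached

    assert "VLLM_ATTENTION_BACKEND" in dir(envs)  # the custom __dir__ lists all known vars

Because nothing is cached, code that reads the same variable repeatedly may want to hoist the read into a local, as the custom_all_reduce.py and distributed/utils.py hunks above already do.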

vllm/executor/cpu_executor.py

Lines changed: 2 additions & 3 deletions
@@ -1,8 +1,8 @@
-import os
 from typing import Dict, List, Set, Tuple

 import torch

+import vllm.envs as envs
 from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
@@ -152,8 +152,7 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
        logger.warning("Prefix caching is not supported on CPU, disable it.")
        config.enable_prefix_caching = False

-    kv_cache_space_str = os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")
-    kv_cache_space = int(kv_cache_space_str)
+    kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE

     if kv_cache_space >= 0:
         if kv_cache_space == 0:
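envs.VLLM_CPU_KVCACHE_SPACE returns the GiB figure as an int, with 0 standing for "unset". The actual 4 GiB fallback is applied outside this hunk; the sketch below only illustrates the unit handling, with the default taken from the "default is 4GB" note in vllm/envs.py:

    _GB = 1 << 30  # same constant vllm/config.py defines

    def cpu_kvcache_bytes(kv_cache_space_gib: int) -> int:
        if kv_cache_space_gib == 0:
            kv_cache_space_gib = 4  # assumed default, per the envs.py comment
        return kv_cache_space_gib * _GB

    assert cpu_kvcache_bytes(0) == 4 * (1 << 30)
    assert cpu_kvcache_bytes(16) == 16 * (1 << 30)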
