vllm-project
diff --git a/‎vllm/attention/backends/rocm_flash_attn.py‎
Lines changed: 2 additions & 3 deletions b/‎vllm/attention/backends/rocm_flash_attn.py‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎vllm/attention/selector.py‎
Lines changed: 2 additions & 4 deletions b/‎vllm/attention/selector.py‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎vllm/config.py‎
Lines changed: 0 additions & 5 deletions b/‎vllm/config.py‎
Lines changed: 0 additions & 5 deletions
diff --git a/‎vllm/distributed/device_communicators/custom_all_reduce.py‎
Lines changed: 4 additions & 4 deletions b/‎vllm/distributed/device_communicators/custom_all_reduce.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎vllm/distributed/parallel_state.py‎
Lines changed: 2 additions & 2 deletions b/‎vllm/distributed/parallel_state.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎vllm/distributed/utils.py‎
Lines changed: 5 additions & 2 deletions b/‎vllm/distributed/utils.py‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎vllm/engine/async_llm_engine.py‎
Lines changed: 2 additions & 3 deletions b/‎vllm/engine/async_llm_engine.py‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎vllm/entrypoints/openai/api_server.py‎
Lines changed: 2 additions & 2 deletions b/‎vllm/entrypoints/openai/api_server.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎vllm/envs.py‎
Lines changed: 160 additions & 0 deletions b/‎vllm/envs.py‎
Lines changed: 160 additions & 0 deletions
diff --git a/‎vllm/executor/cpu_executor.py‎
Lines changed: 2 additions & 3 deletions b/‎vllm/executor/cpu_executor.py‎
Lines changed: 2 additions & 3 deletions
@@ -1,10 +1,10 @@
 """Attention layer ROCm GPUs."""
-import os
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple, Type
 
 import torch
 
+import vllm.envs as envs
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata,
                                               AttentionMetadataPerStage)
@@ -156,8 +156,7 @@ def __init__(
 
         self.use_naive_attn = False
         # NOTE: Allow for switching between Triton and CK. Defaulting to triton.
-        self.use_triton_flash_attn = (os.environ.get(
-            "VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1"))
+        self.use_triton_flash_attn = envs.VLLM_USE_TRITON_FLASH_ATTN
         if self.use_triton_flash_attn:
             from vllm.attention.ops.triton_flash_attention import (  # noqa: F401
                 triton_attention)
 
@@ -1,18 +1,16 @@
 import enum
-import os
 from functools import lru_cache
 from typing import Type
 
 import torch
 
+import vllm.envs as envs
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.logger import init_logger
 from vllm.utils import is_cpu, is_hip
 
 logger = init_logger(__name__)
 
-VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"
-
 
 class _Backend(enum.Enum):
     FLASH_ATTN = enum.auto()
@@ -79,7 +77,7 @@ def _which_attn_to_use(dtype: torch.dtype) -> _Backend:
             "package is not found. Please install it for better performance.")
         return _Backend.XFORMERS
 
-    backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
+    backend_by_env_var = envs.VLLM_ATTENTION_BACKEND
     if backend_by_env_var is not None:
         return _Backend[backend_by_env_var]
 
 
@@ -1,6 +1,5 @@
 import enum
 import json
-import os
 from dataclasses import dataclass, field, fields
 from typing import TYPE_CHECKING, ClassVar, List, Optional, Union
 
@@ -24,10 +23,6 @@
 
 logger = init_logger(__name__)
 
-# If true, will load models from ModelScope instead of Hugging Face Hub.
-VLLM_USE_MODELSCOPE = os.environ.get("VLLM_USE_MODELSCOPE",
-                                     "False").lower() == "true"
-
 _GB = 1 << 30
 
 
 
@@ -1,10 +1,10 @@
-import os
 from contextlib import contextmanager
 from typing import Any, List, Optional
 
 import torch
 import torch.distributed as dist
 
+import vllm.envs as envs
 from vllm.logger import init_logger
 
 try:
@@ -54,9 +54,9 @@ def init_custom_ar() -> None:
         return
     # test nvlink first, this will filter out most of the cases
     # where custom allreduce is not supported
-    if "CUDA_VISIBLE_DEVICES" in os.environ:
-        device_ids = list(
-            map(int, os.environ["CUDA_VISIBLE_DEVICES"].split(",")))
+    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
+    if cuda_visible_devices:
+        device_ids = list(map(int, cuda_visible_devices.split(",")))
     else:
         device_ids = list(range(num_dev))
     # this checks hardware and driver support for NVLink
 
@@ -4,11 +4,11 @@
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 """Tensor and pipeline parallel groups."""
 import contextlib
-import os
 from typing import Optional
 
 import torch
 
+import vllm.envs as envs
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -80,7 +80,7 @@ def init_distributed_environment(
         # local_rank is not available in torch ProcessGroup,
         # see https://github.com/pytorch/pytorch/issues/122816
         if local_rank == -1 and distributed_init_method == "env://":
-            local_rank = int(os.environ['LOCAL_RANK'])
+            local_rank = envs.LOCAL_RANK
         global _LOCAL_RANK
         _LOCAL_RANK = local_rank
 
 
@@ -9,6 +9,7 @@
 import torch
 import torch.distributed as dist
 
+import vllm.envs as envs
 from vllm.logger import init_logger
 
 from .parallel_state import get_cpu_world_group, get_local_rank
@@ -102,11 +103,13 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
     is_distributed = dist.is_initialized()
 
     num_dev = torch.cuda.device_count()
-    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
     if cuda_visible_devices is None:
         cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
+    VLLM_CONFIG_ROOT = envs.VLLM_CONFIG_ROOT
     path = os.path.expanduser(
-        f"~/.config/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json")
+        f"{VLLM_CONFIG_ROOT}/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
+    )
     os.makedirs(os.path.dirname(path), exist_ok=True)
     if (not is_distributed or get_local_rank() == 0) \
         and (not os.path.exists(path)):
 
@@ -1,12 +1,12 @@
 import asyncio
-import os
 import time
 from functools import partial
 from typing import (Any, AsyncIterator, Callable, Dict, Iterable, List,
                     Optional, Set, Tuple, Type, Union)
 
 from transformers import PreTrainedTokenizer
 
+import vllm.envs as envs
 from vllm.config import DecodingConfig, ModelConfig
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.engine.arg_utils import AsyncEngineArgs
@@ -20,8 +20,7 @@
 from vllm.usage.usage_lib import UsageContext
 
 logger = init_logger(__name__)
-ENGINE_ITERATION_TIMEOUT_S = int(
-    os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60"))
+ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
 
 
 class AsyncEngineDeadError(RuntimeError):
 
@@ -1,7 +1,6 @@
 import asyncio
 import importlib
 import inspect
-import os
 import re
 from contextlib import asynccontextmanager
 from http import HTTPStatus
@@ -16,6 +15,7 @@
 from starlette.routing import Mount
 
 import vllm
+import vllm.envs as envs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.cli_args import make_arg_parser
@@ -129,7 +129,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
         allow_headers=args.allowed_headers,
     )
 
-    if token := os.environ.get("VLLM_API_KEY") or args.api_key:
+    if token := envs.VLLM_API_KEY or args.api_key:
 
         @app.middleware("http")
         async def authentication(request: Request, call_next):
 
@@ -0,0 +1,160 @@
+import os
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional
+
+if TYPE_CHECKING:
+    VLLM_HOST_IP: str = ""
+    VLLM_USE_MODELSCOPE: bool = False
+    VLLM_INSTANCE_ID: Optional[str] = None
+    VLLM_NCCL_SO_PATH: Optional[str] = None
+    LD_LIBRARY_PATH: Optional[str] = None
+    VLLM_USE_TRITON_FLASH_ATTN: bool = False
+    LOCAL_RANK: int = 0
+    CUDA_VISIBLE_DEVICES: Optional[str] = None
+    VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
+    VLLM_API_KEY: Optional[str] = None
+    S3_ACCESS_KEY_ID: Optional[str] = None
+    S3_SECRET_ACCESS_KEY: Optional[str] = None
+    S3_ENDPOINT_URL: Optional[str] = None
+    VLLM_CONFIG_ROOT: str = ""
+    VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
+    VLLM_NO_USAGE_STATS: bool = False
+    VLLM_DO_NOT_TRACK: bool = False
+    VLLM_USAGE_SOURCE: str = ""
+    VLLM_CONFIGURE_LOGGING: int = 1
+    VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
+    VLLM_TRACE_FUNCTION: int = 0
+    VLLM_ATTENTION_BACKEND: Optional[str] = None
+    VLLM_CPU_KVCACHE_SPACE: int = 0
+    VLLM_USE_RAY_COMPILED_DAG: bool = False
+    VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
+
+environment_variables: Dict[str, Callable[[], Any]] = {
+    # used in distributed environment to determine the master address
+    'VLLM_HOST_IP':
+    lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""),
+
+    # If true, will load models from ModelScope instead of Hugging Face Hub.
+    # note that the value is true or false, not numbers
+    "VLLM_USE_MODELSCOPE":
+    lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true",
+
+    # Instance id represents an instance of the VLLM. All processes in the same
+    # instance should have the same instance id.
+    "VLLM_INSTANCE_ID":
+    lambda: os.environ.get("VLLM_INSTANCE_ID", None),
+
+    # path to cudatoolkit home directory, under which should be bin, include,
+    # and lib directories.
+    "CUDA_HOME":
+    lambda: os.environ.get("CUDA_HOME", None),
+
+    # Path to the NCCL library file. It is needed because nccl>=2.19 brought
+    # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234
+    "VLLM_NCCL_SO_PATH":
+    lambda: os.environ.get("VLLM_NCCL_SO_PATH", None),
+
+    # when `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl
+    # library file in the locations specified by `LD_LIBRARY_PATH`
+    "LD_LIBRARY_PATH":
+    lambda: os.environ.get("LD_LIBRARY_PATH", None),
+
+    # flag to control if vllm should use triton flash attention
+    "VLLM_USE_TRITON_FLASH_ATTN":
+    lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in
+             ("true", "1")),
+
+    # local rank of the process in the distributed setting, used to determine
+    # the GPU device id
+    "LOCAL_RANK":
+    lambda: int(os.environ.get("LOCAL_RANK", "0")),
+
+    # used to control the visible devices in the distributed setting
+    "CUDA_VISIBLE_DEVICES":
+    lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),
+
+    # timeout for each iteration in the engine
+    "VLLM_ENGINE_ITERATION_TIMEOUT_S":
+    lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),
+
+    # API key for VLLM API server
+    "VLLM_API_KEY":
+    lambda: os.environ.get("VLLM_API_KEY", None),
+
+    # S3 access information, used for tensorizer to load model from S3
+    "S3_ACCESS_KEY_ID":
+    lambda: os.environ.get("S3_ACCESS_KEY", None),
+    "S3_SECRET_ACCESS_KEY":
+    lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
+    "S3_ENDPOINT_URL":
+    lambda: os.environ.get("S3_ENDPOINT_URL", None),
+
+    # Root directory for VLLM configuration files
+    # Note that this not only affects how vllm finds its configuration files
+    # during runtime, but also affects how vllm installs its configuration
+    # files during **installation**.
+    "VLLM_CONFIG_ROOT":
+    lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv(
+        "XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"),
+
+    # Usage stats collection
+    "VLLM_USAGE_STATS_SERVER":
+    lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),
+    "VLLM_NO_USAGE_STATS":
+    lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
+    "VLLM_DO_NOT_TRACK":
+    lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get(
+        "DO_NOT_TRACK", None) or "0") == "1",
+    "VLLM_USAGE_SOURCE":
+    lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"),
+
+    # Logging configuration
+    # If set to 0, vllm will not configure logging
+    # If set to 1, vllm will configure logging using the default configuration
+    #    or the configuration file specified by VLLM_LOGGING_CONFIG_PATH
+    "VLLM_CONFIGURE_LOGGING":
+    lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")),
+    "VLLM_LOGGING_CONFIG_PATH":
+    lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"),
+
+    # Trace function calls
+    # If set to 1, vllm will trace function calls
+    # Useful for debugging
+    "VLLM_TRACE_FUNCTION":
+    lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),
+
+    # Backend for attention computation
+    # Available options:
+    # - "TORCH_SDPA": use torch.nn.MultiheadAttention
+    # - "FLASH_ATTN": use FlashAttention
+    # - "XFORMERS": use XFormers
+    # - "ROCM_FLASH": use ROCmFlashAttention
+    "VLLM_ATTENTION_BACKEND":
+    lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),
+
+    # CPU key-value cache space
+    # default is 4GB
+    "VLLM_CPU_KVCACHE_SPACE":
+    lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),
+
+    # If the env var is set, it uses the Ray's compiled DAG API
+    # which optimizes the control plane overhead.
+    # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
+    "VLLM_USE_RAY_COMPILED_DAG":
+    lambda: bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)),
+
+    # Use dedicated multiprocess context for workers.
+    # Both spawn and fork work
+    "VLLM_WORKER_MULTIPROC_METHOD":
+    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),
+}
+
+
+def __getattr__(name):
+    # lazy evaluation of environment variables
+    if name in environment_variables:
+        return environment_variables[name]()
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+def __dir__():
+    return list(environment_variables.keys())
@@ -1,8 +1,8 @@
-import os
 from typing import Dict, List, Set, Tuple
 
 import torch
 
+import vllm.envs as envs
 from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
@@ -152,8 +152,7 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
         logger.warning("Prefix caching is not supported on CPU, disable it.")
         config.enable_prefix_caching = False
 
-    kv_cache_space_str = os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")
-    kv_cache_space = int(kv_cache_space_str)
+    kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
 
     if kv_cache_space >= 0:
         if kv_cache_space == 0: