
Commit 352ef7c

zhaoyang-star authored and joerunde committed
Disable cuda version check in vllm-openai image (vllm-project#4530)
1 parent c7426c1 commit 352ef7c

2 files changed: 2 additions and 33 deletions

vllm/config.py

Lines changed: 1 addition & 10 deletions
@@ -4,15 +4,13 @@
 from typing import TYPE_CHECKING, ClassVar, List, Optional, Union
 
 import torch
-from packaging.version import Version
 from transformers import PretrainedConfig
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
                                                      get_quantization_config)
 from vllm.transformers_utils.config import get_config, get_hf_text_config
-from vllm.utils import (get_cpu_memory, get_nvcc_cuda_version, is_cpu, is_hip,
-                        is_neuron)
+from vllm.utils import get_cpu_memory, is_cpu, is_hip, is_neuron
 
 GPTQMarlinConfig = get_quantization_config("gptq_marlin")
 
@@ -369,13 +367,6 @@ def _verify_cache_dtype(self) -> None:
         if self.cache_dtype == "auto":
             pass
         elif self.cache_dtype == "fp8":
-            if not is_hip():
-                nvcc_cuda_version = get_nvcc_cuda_version()
-                if nvcc_cuda_version is not None \
-                        and nvcc_cuda_version < Version("11.8"):
-                    raise ValueError(
-                        "FP8 is not supported when cuda version is"
-                        "lower than 11.8.")
             logger.info(
                 "Using fp8 data type to store kv cache. It reduces the GPU "
                 "memory footprint and boosts the performance. "

vllm/utils.py

Lines changed: 1 addition & 23 deletions
@@ -19,7 +19,6 @@
 
 import psutil
 import torch
-from packaging.version import Version, parse
 
 import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
@@ -314,27 +313,6 @@ def cdiv(a: int, b: int) -> int:
     return -(a // -b)
 
 
-@lru_cache(maxsize=None)
-def get_nvcc_cuda_version() -> Optional[Version]:
-    cuda_home = envs.CUDA_HOME
-    if not cuda_home:
-        cuda_home = '/usr/local/cuda'
-        if os.path.isfile(cuda_home + '/bin/nvcc'):
-            logger.info(
-                'CUDA_HOME is not found in the environment. '
-                'Using %s as CUDA_HOME.', cuda_home)
-        else:
-            logger.warning('Not found nvcc in %s. Skip cuda version check!',
-                           cuda_home)
-            return None
-    nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"],
-                                          universal_newlines=True)
-    output = nvcc_output.split()
-    release_idx = output.index("release") + 1
-    nvcc_cuda_version = parse(output[release_idx].split(",")[0])
-    return nvcc_cuda_version
-
-
 def _generate_random_fp8(
     tensor: torch.tensor,
     low: float,
@@ -560,7 +538,7 @@ def maybe_expand_dim(tensor: torch.Tensor,
 def merge_dicts(dict1: Dict[Any, List[Any]],
                 dict2: Dict[Any, List[Any]]) -> Dict[Any, List[Any]]:
     """Merge 2 dicts that have key -> List of items.
-    
+
     When a key conflicts, the values in dict1 is prioritized.
     """
     merged_dict = defaultdict(list)
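
For reference, the deleted get_nvcc_cuda_version helper obtained the toolkit version by parsing the output of `nvcc -V`. A minimal standalone sketch of that parsing, for illustration only (nvcc_path is a hypothetical parameter; the helper itself no longer exists after this commit):

# Standalone sketch of the parsing performed by the removed helper.
import subprocess
from packaging.version import parse

def nvcc_version(nvcc_path: str = "/usr/local/cuda/bin/nvcc"):
    # `nvcc -V` output contains a line like:
    #   Cuda compilation tools, release 12.1, V12.1.105
    out = subprocess.check_output([nvcc_path, "-V"], universal_newlines=True)
    tokens = out.split()
    release_idx = tokens.index("release") + 1
    return parse(tokens[release_idx].split(",")[0])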

0 commit comments
