1 change: 1 addition & 0 deletions .buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -31,6 +31,7 @@ docker run \
set -e
echo $ZE_AFFINITY_MASK
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
cd tests
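For reference, a rough Python equivalent of the new CI invocation using vLLM's offline `LLM` API. This is a minimal sketch, not part of the change itself, and it assumes the `compilation_config` keyword accepts a dict mirroring the `-O3 -O.cudagraph_mode=NONE` CLI options.

```python
# Minimal sketch (not part of this PR): run the same model with compilation
# level 3 and cudagraphs disabled, mirroring `-O3 -O.cudagraph_mode=NONE`.
# Assumes vllm.LLM forwards `block_size` and a dict `compilation_config`.
from vllm import LLM, SamplingParams

llm = LLM(
    model="facebook/opt-125m",
    block_size=64,
    compilation_config={"level": 3, "cudagraph_mode": "NONE"},
)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```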
3 changes: 1 addition & 2 deletions vllm/attention/layer.py
@@ -190,8 +190,7 @@ def __init__(
# torch.compile works by registering the attention as one giant
# opaque custom op. For other platforms, we directly call them
# and let torch.compile handle them.
self.use_direct_call = not current_platform.is_cuda_alike(
) and not current_platform.is_cpu()
self.use_direct_call = not current_platform.opaque_attention_op()

self.use_output = self.attn_backend.accept_output_buffer
compilation_config = get_current_vllm_config().compilation_config
8 changes: 8 additions & 0 deletions vllm/compilation/fix_functionalization.py
@@ -9,6 +9,7 @@
from torch._higher_order_ops.auto_functionalize import auto_functionalized

from vllm.logger import init_logger
from vllm.platforms import current_platform

from .fx_utils import is_func
from .vllm_inductor_pass import VllmInductorPass
@@ -26,6 +27,13 @@ class FixFunctionalizationPass(VllmInductorPass):
"""

def __call__(self, graph: torch.fx.Graph):
# XPU does not support auto-functionalization yet.
# Will enable this when switching to vllm-xpu-kernels.
if current_platform.is_xpu():
logger.debug("XPU platform does not support fix functionalization "
"pass currently.")
return

self.begin()
self.dump_graph(graph, "before_fix_functionalization")

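For context, the guard added above can be sketched in isolation. This is a simplified illustration built only from names visible in this hunk (`current_platform.is_xpu()`, `auto_functionalized`), not the actual pass implementation.

```python
# Simplified illustration (not the real pass body): on XPU the pass leaves the
# graph untouched; on other platforms it would rewrite auto_functionalized
# wrapper nodes like the ones matched below.
import torch.fx
from torch._higher_order_ops.auto_functionalize import auto_functionalized

from vllm.platforms import current_platform


def fix_functionalization(graph: torch.fx.Graph) -> None:
    if current_platform.is_xpu():
        # Auto-functionalization is not supported on XPU yet; skip the pass.
        return
    for node in graph.nodes:
        if node.op == "call_function" and node.target is auto_functionalized:
            # The real FixFunctionalizationPass replaces these wrappers with
            # in-place custom-op calls; omitted here.
            pass
```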
4 changes: 4 additions & 0 deletions vllm/platforms/cpu.py
@@ -335,3 +335,7 @@ def default_v1(cls, model_config) -> bool:
return (cls.supports_v1(model_config)
and arch in (CpuArchEnum.X86, CpuArchEnum.POWERPC,
CpuArchEnum.ARM, CpuArchEnum.S390X))

@classmethod
def opaque_attention_op(cls) -> bool:
return True
4 changes: 4 additions & 0 deletions vllm/platforms/cuda.py
@@ -442,6 +442,10 @@ def supports_v1(cls, model_config: "ModelConfig") -> bool:
def use_custom_allreduce(cls) -> bool:
return True

@classmethod
def opaque_attention_op(cls) -> bool:
return True

@classmethod
def get_static_graph_wrapper_cls(cls) -> str:
return "vllm.compilation.cuda_graph.CUDAGraphWrapper"
8 changes: 8 additions & 0 deletions vllm/platforms/interface.py
@@ -509,6 +509,14 @@ def use_custom_allreduce(cls) -> bool:
"""
return False

@classmethod
def opaque_attention_op(cls) -> bool:
"""
Returns True if we register attention as one giant opaque custom op
on the current platform.
"""
return False

@classmethod
def validate_request(
cls,
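The new hook is the extension point that `vllm/attention/layer.py` now consults via `use_direct_call = not current_platform.opaque_attention_op()`. A hypothetical out-of-tree platform would opt into the custom-op path like this; the class name is illustrative only.

```python
# Illustrative only: `MyAcceleratorPlatform` is a hypothetical name, not part
# of vLLM. Returning True makes Attention take the opaque-custom-op path
# instead of calling the backend directly.
from vllm.platforms.interface import Platform


class MyAcceleratorPlatform(Platform):

    @classmethod
    def opaque_attention_op(cls) -> bool:
        # Register attention as one giant opaque custom op so torch.compile
        # treats it as a single node instead of tracing into it.
        return True
```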
4 changes: 4 additions & 0 deletions vllm/platforms/rocm.py
@@ -411,6 +411,10 @@ def use_custom_allreduce(cls) -> bool:
supported_archs = ['gfx94', 'gfx95']
return any(gfx in gcn_arch for gfx in supported_archs)

@classmethod
def opaque_attention_op(cls) -> bool:
return True

@classmethod
def get_cu_count(cls, device_id: int = 0) -> int:
return torch.cuda.get_device_properties(
15 changes: 6 additions & 9 deletions vllm/platforms/xpu.py
@@ -90,21 +90,14 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
if cache_config and cache_config.block_size is None:
cache_config.block_size = 64

# FIXME: Temporarily forcing eager mode
# remove after t.compile support stabilizes.
if (envs.VLLM_USE_V1 and model_config is not None
and not vllm_config.model_config.enforce_eager):
from vllm.config import CompilationLevel
vllm_config.compilation_config.level = CompilationLevel.NO_COMPILATION # noqa: E501

# lazy import to avoid circular import
from vllm.config import CUDAGraphMode
compilation_config = vllm_config.compilation_config
if compilation_config.cudagraph_mode is None or \
compilation_config.cudagraph_mode.max_cudagraph_mode() \
!= CUDAGraphMode.NONE:
logger.info("[XPU] CUDA graph is not supported on XPU, "
"disabling cudagraphs.")
logger.info("[XPU] CUDA graph is not supported on XPU, disabling "
"cudagraphs. Fallback to cudagraph_mode=NONE")
compilation_config.cudagraph_mode = CUDAGraphMode.NONE

# check and update parallel config
@@ -182,3 +175,7 @@ def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
"Intel Arc A770 have bfloat16 accuracy known issue. "
"You can use float16 instead by explicitly setting the "
"`dtype` flag in CLI, for example: --dtype=half.")

@classmethod
def opaque_attention_op(cls) -> bool:
return True
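Taken together with the CI change at the top of the diff, the intent is that `-O3` still compiles on XPU while cudagraphs are forced off. A minimal sketch of that fallback, assuming only the `CompilationConfig` and `CUDAGraphMode` names this file already imports:

```python
# Sketch of the fallback enforced by check_and_update_config on XPU.
from vllm.config import CompilationConfig, CUDAGraphMode

compilation_config = CompilationConfig(level=3)  # e.g. from -O3 on the CLI
if compilation_config.cudagraph_mode is None or \
        compilation_config.cudagraph_mode.max_cudagraph_mode() \
        != CUDAGraphMode.NONE:
    # XPU cannot capture CUDA graphs, so only cudagraphs are disabled; the
    # rest of the compilation config (level, passes) is left untouched.
    compilation_config.cudagraph_mode = CUDAGraphMode.NONE
assert compilation_config.cudagraph_mode == CUDAGraphMode.NONE
```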