Commit 4555143

[CPU] V1 support for the CPU backend (vllm-project#16441)
1 parent 52dceb1 commit 4555143

15 files changed, +465 -40 lines

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 5 additions & 8 deletions
@@ -6,6 +6,7 @@ set -ex
 
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
+OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
 export CMAKE_BUILD_PARALLEL_LEVEL=32
@@ -23,10 +24,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
 
 function cpu_tests() {
   set -e
@@ -56,7 +55,7 @@ function cpu_tests() {
   # Run AWQ test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -s -v \
+    VLLM_USE_V1=0 pytest -s -v \
     tests/quantization/test_ipex_quant.py"
 
   # Run chunked-prefill and prefix-cache test
@@ -68,8 +67,6 @@ function cpu_tests() {
   # online serving
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    export VLLM_CPU_KVCACHE_SPACE=10
-    export VLLM_CPU_OMP_THREADS_BIND=$1
     python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
     timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
     python3 benchmarks/benchmark_serving.py \
@@ -89,4 +86,4 @@ function cpu_tests() {
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
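For orientation, here is a minimal offline sketch (not part of this commit) of what the CI exercises on CPU. The model name and the two environment variables come from the script above; the core range mirrors the script's default OMP_CORE_RANGE and should be adapted to the local machine, and the prompt and sampling settings are illustrative.

# Hedged sketch: offline CPU smoke run mirroring the CI's docker run settings.
import os

os.environ.setdefault("VLLM_CPU_KVCACHE_SPACE", "4")         # GiB reserved for the CPU KV cache, as in the docker run lines
os.environ.setdefault("VLLM_CPU_OMP_THREADS_BIND", "48-95")  # OpenMP thread binding, per the script's OMP_CORE_RANGE default

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", dtype="half")
out = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(out[0].outputs[0].text)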

docs/usage/v1_guide.md

Lines changed: 2 additions & 0 deletions
@@ -40,6 +40,8 @@ This living user guide outlines a few known **important changes and limitations*
 | **NVIDIA** | <nobr>🚀 Natively Supported</nobr> |
 | **AMD** | <nobr>🚧 WIP</nobr> |
 | **TPU** | <nobr>🚧 WIP</nobr> |
+| **CPU** | <nobr>🚧 WIP</nobr> |
+
 #### Feature / Model
 
 | Feature / Model | Status |

requirements/cpu.txt

Lines changed: 3 additions & 0 deletions
@@ -1,6 +1,9 @@
 # Common dependencies
 -r common.txt
 
+numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
+numba == 0.61.2; python_version > '3.9'
+
 # Dependencies for CPUs
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0

tests/kernels/attention/test_attention_selector.py

Lines changed: 4 additions & 1 deletion
@@ -85,7 +85,10 @@ def test_env(
                    CpuPlatform()):
             backend = get_attn_backend(16, torch.float16, torch.float16,
                                        block_size, False)
-            assert backend.get_name() == "TORCH_SDPA"
+            if use_v1:
+                assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
+            else:
+                assert backend.get_name() == "TORCH_SDPA"
 
     elif device == "hip":
         with patch("vllm.attention.selector.current_platform",
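The new assertion reflects that the CPU platform resolves to a different backend class under V1. Below is a hedged sketch of that mapping; the class paths are the ones returned by CpuPlatform.get_attn_backend_cls later in this commit, while the helper function itself is hypothetical.

# Hedged sketch: resolve the CPU attention backend class path and read its name.
import importlib

def cpu_sdpa_backend_name(use_v1: bool) -> str:
    path = ("vllm.v1.attention.backends.cpu_attn.TorchSDPABackend" if use_v1
            else "vllm.attention.backends.torch_sdpa.TorchSDPABackend")
    module_name, cls_name = path.rsplit(".", 1)
    backend_cls = getattr(importlib.import_module(module_name), cls_name)
    return backend_cls.get_name()  # expected: "TORCH_SDPA_VLLM_V1" vs "TORCH_SDPA"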

tests/models/language/generation/test_common.py

Lines changed: 0 additions & 1 deletion
@@ -87,7 +87,6 @@
     pytest.param("bigcode/starcoder2-3b"), # starcoder2
     pytest.param(
         "TitanML/tiny-mixtral", # mixtral
-        marks=[pytest.mark.cpu_model],
     )
 ])
 @pytest.mark.parametrize("max_tokens", [32])

vllm/attention/backends/cpu_mla.py

Lines changed: 3 additions & 3 deletions
@@ -178,7 +178,7 @@ def build(self, seq_lens, query_lens, cuda_graph_pad_size, batch_size):
             seq_lens_tensor=seq_lens_tensor,
             max_query_len=max_query_len,
             max_kv_len=max_kv_len,
-            query_start_loc=query_start_loc,
+            prefill_query_start_loc=query_start_loc,
             kv_start_loc=kv_start_loc,
             max_decode_seq_len=input_data.max_decode_seq_len,
             num_prefills=input_data.num_prefills,
@@ -264,8 +264,8 @@ def _forward_prefill(
             key=k,
             value=v_padded,
             out=output,
-            seqlen_q=prefill_metadata.query_start_loc,
-            seqlen_k=prefill_metadata.query_start_loc,
+            seqlen_q=prefill_metadata.prefill_query_start_loc,
+            seqlen_k=prefill_metadata.prefill_query_start_loc,
             max_seqlen_q=prefill_metadata.max_query_len,
             max_seqlen_k=prefill_metadata.max_query_len,
             pdropout=0.0,

vllm/attention/backends/torch_sdpa.py

Lines changed: 12 additions & 4 deletions
@@ -87,10 +87,13 @@ class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata):
     # For chunked prefill only
     max_query_len: Optional[int] = None
     max_kv_len: Optional[int] = None
-    query_start_loc: Optional[torch.Tensor] = None
+    prefill_query_start_loc: Optional[torch.Tensor] = None
     kv_start_loc: Optional[torch.Tensor] = None
     prefill_block_tables: Optional[torch.Tensor] = None
 
+    # For V1 logits index only
+    query_start_loc: Optional[torch.Tensor] = None
+
     # Begin encoder attn & enc/dec cross-attn fields...
     # Encoder sequence lengths representation
     encoder_seq_lens: Optional[List[int]] = None
@@ -375,7 +378,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             seq_lens_tensor=seq_lens_tensor,
             max_query_len=max_query_len,
             max_kv_len=max_kv_len,
-            query_start_loc=query_start_loc,
+            prefill_query_start_loc=query_start_loc,
             kv_start_loc=kv_start_loc,
             max_decode_seq_len=input_data.max_decode_seq_len,
             num_prefills=input_data.num_prefills,
@@ -470,6 +473,11 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
+
+        # For warming-up
+        if attn_metadata is None:
+            return query
+
         attn_type = self.attn_type
         if (attn_type == AttentionType.ENCODER
                 and (not attn_metadata.is_all_encoder_attn_metadata_set)):
@@ -537,8 +545,8 @@ def forward(
 
         output = torch.empty_like(query)
         if prefill_meta := attn_metadata.prefill_metadata:
-            assert attn_metadata.seq_lens is not None
             if not prefill_meta.prefill_metadata.chunked_prefill:  # type: ignore
+                assert attn_metadata.seq_lens is not None
                 self._run_sdpa_forward(output,
                                        query,
                                        key,
@@ -555,7 +563,7 @@ def forward(
                     query[:prefill_meta.num_prefill_tokens, :, :],
                     key_cache,
                     value_cache,
-                    prefill_meta.query_start_loc,
+                    prefill_meta.prefill_query_start_loc,
                     prefill_meta.kv_start_loc,
                     prefill_meta.max_query_len,
                     prefill_meta.max_kv_len,
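Two things change here: query_start_loc is split into prefill_query_start_loc (fed to the chunked-prefill path) and a V1-only query_start_loc marked "For V1 logits index only", and forward() now returns early when attn_metadata is None so warm-up calls can run without metadata. A hedged sketch of the logits-indexing role follows; the tensors and the exact indexing formula are illustrative, not taken from the commit.

# Hedged sketch: using a cumulative query_start_loc to pick each sequence's last
# token for logits computation (one plausible use of the "V1 logits index" field).
import torch

# Two prefill sequences of 5 and 3 tokens packed into one batch of 8 tokens.
query_start_loc = torch.tensor([0, 5, 8], dtype=torch.int32)
last_token_indices = (query_start_loc[1:] - 1).long()   # tensor([4, 7])

hidden_states = torch.randn(8, 16)                      # [num_tokens, hidden_size]
logits_input = hidden_states[last_token_indices]        # [num_seqs, hidden_size]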

vllm/compilation/wrapper.py

Lines changed: 6 additions & 1 deletion
@@ -41,11 +41,16 @@ def __init__(self,
         # compiling the forward method
 
         backend = vllm_config.compilation_config.init_backend(vllm_config)
+        options = None
+        if isinstance(backend, str) and backend == "inductor":
+            options = get_current_vllm_config(
+            ).compilation_config.inductor_compile_config
 
         compiled_callable = torch.compile(
             self.forward,
             fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-            backend=backend)
+            backend=backend,
+            options=options)
 
         self.compiled_callable = compiled_callable
         self.original_code_object = self.__class__.forward.__code__
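The effect is that inductor-specific settings from compilation_config.inductor_compile_config reach torch.compile, while other string backends keep options=None. A standalone sketch of the same pattern; the option values are illustrative and borrowed from the inductor_compile_config set in vllm/platforms/cpu.py below.

# Hedged sketch: only forward an options dict to torch.compile for the "inductor" backend.
import torch

def compile_with_optional_options(fn, backend):
    options = None
    if isinstance(backend, str) and backend == "inductor":
        options = {"epilogue_fusion": True, "memory_planning": True}
    return torch.compile(fn, fullgraph=False, backend=backend, options=options)

compiled = compile_with_optional_options(lambda x: x * 2 + 1, "eager")
print(compiled(torch.ones(4)))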

vllm/engine/arg_utils.py

Lines changed: 3 additions & 1 deletion
@@ -1399,6 +1399,7 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
             "FLASHINFER",
             "FLASHINFER_VLLM_V1",
             "ROCM_AITER_MLA",
+            "TORCH_SDPA_VLLM_V1",
         ]
         if (envs.is_set("VLLM_ATTENTION_BACKEND")
                 and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):
@@ -1431,7 +1432,8 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
 
         # Non-[CUDA, TPU] may be supported on V1, but off by default for now.
         v0_hardware = not any(
-            (current_platform.is_cuda(), current_platform.is_tpu()))
+            (current_platform.is_cuda(), current_platform.is_tpu(),
+             current_platform.is_cpu()))
         if v0_hardware and _warn_or_fallback(  # noqa: SIM103
                 current_platform.device_name):
             return False
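Condensed, hedged sketch of what the oracle now allows for CPU: the attention backend allow-list gains TORCH_SDPA_VLLM_V1, and CPU is no longer treated as V0-only hardware. The standalone function is hypothetical and simplifies the warn-or-fallback behavior of the real check.

# Hedged sketch of the two checks this hunk touches.
from typing import Optional

V1_BACKENDS = [
    "FLASHINFER",
    "FLASHINFER_VLLM_V1",
    "ROCM_AITER_MLA",
    "TORCH_SDPA_VLLM_V1",
]

def v1_allowed(selected_backend: Optional[str], is_cuda: bool, is_tpu: bool,
               is_cpu: bool) -> bool:
    if selected_backend is not None and selected_backend not in V1_BACKENDS:
        return False
    # Platforms outside [CUDA, TPU, CPU] stay on V0 by default.
    return any((is_cuda, is_tpu, is_cpu))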

vllm/platforms/cpu.py

Lines changed: 57 additions & 10 deletions
@@ -57,7 +57,10 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
             logger.info("Using CPU MLA backend.")
             return "vllm.attention.backends.cpu_mla.CPUMLABackend"
         logger.info("Using Torch SDPA backend.")
-        return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"
+        if use_v1:
+            return "vllm.v1.attention.backends.cpu_attn.TorchSDPABackend"
+        else:
+            return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"
 
     @classmethod
     def get_device_total_memory(cls, device_id: int = 0) -> int:
@@ -81,6 +84,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         if not model_config.enforce_eager:
             model_config.enforce_eager = True
 
+        model_config.disable_cascade_attn = True
+
         cache_config = vllm_config.cache_config
 
         ipex_available = find_spec("intel_extension_for_pytorch") is not None
@@ -128,7 +133,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 f" {kv_cache_space}, expect a positive integer value.")
 
         parallel_config = vllm_config.parallel_config
-        if (parallel_config.distributed_executor_backend is not None
+        if (parallel_config.world_size > 1
+                and parallel_config.distributed_executor_backend is not None
                 and parallel_config.distributed_executor_backend != "mp"):
             logger.warning(("%s is not supported on CPU, fallback to mp "
                             "distributed executor backend."),
@@ -141,14 +147,51 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             parallel_config.sd_worker_cls = \
                 "vllm.worker.cpu_worker.CPUWorker"
         else:
-            parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker"
+            if envs.VLLM_USE_V1:
+                parallel_config.worker_cls = \
+                    "vllm.v1.worker.cpu_worker.CPUWorker"
+            else:
+                parallel_config.worker_cls = \
+                    "vllm.worker.cpu_worker.CPUWorker"
+
+        # Note: workaround for v1 gpu_model_runner
+        from vllm.config import CompilationLevel
+        vllm_config.compilation_config.cudagraph_capture_sizes = []
+
+        compilation_config = vllm_config.compilation_config
+        if (envs.VLLM_USE_V1 and vllm_config.compilation_config.level
+                == CompilationLevel.PIECEWISE):
+            compilation_config.level = CompilationLevel.DYNAMO_ONCE
+            compilation_config.backend = "eager"
+            compilation_config.custom_ops += ["none"]
+            compilation_config.inductor_compile_config.update({
+                "dce":
+                True,
+                "size_asserts":
+                False,
+                "nan_asserts":
+                False,
+                "memory_planning":
+                True,
+                "epilogue_fusion":
+                True,
+            })
+
+        if vllm_config.lora_config is not None:
+            compilation_config.level = CompilationLevel.NO_COMPILATION
 
         assert vllm_config.device_config.device_type == "cpu"
 
         #
         # Environment variables for CPU executor
         #
 
+        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+        # Note: to avoid the error 'nthreads cannot be larger than environment
+        # variable "NUMEXPR_MAX_THREADS" (64)'.
+        os.environ["NUMEXPR_MAX_THREADS"] = str(len(os.sched_getaffinity(0)))
+
         # Set default threads num for OpenMP parallel
         os.environ["OMP_NUM_THREADS"] = str(torch.get_num_threads())
 
@@ -171,13 +214,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             # To hint IPEX uses shared memory based AllReduce
             os.environ["LOCAL_WORLD_SIZE"] = str(
                 vllm_config.parallel_config.tensor_parallel_size)
-        if sys.platform == "darwin" and \
-                envs.VLLM_WORKER_MULTIPROC_METHOD == "fork":
-            if os.environ.get('VLLM_WORKER_MULTIPROC_METHOD', None) is None:
-                logger.warning(
-                    "Default to spawn method on MacOS. If this is not desired,"
-                    " set VLLM_WORKER_MULTIPROC_METHOD to fork explicitly.")
-            os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
 
         if vllm_config.model_config and vllm_config.model_config.use_mla:
             logger.info(
@@ -204,3 +240,14 @@ def get_device_communicator_cls(cls) -> str:
         Get device specific communicator class for distributed communication.
         """
         return "vllm.distributed.device_communicators.cpu_communicator.CpuCommunicator"  # noqa
+
+    @classmethod
+    def supports_structured_output(cls) -> bool:
+        return True
+
+    @classmethod
+    def supports_v1(cls, model_config) -> bool:
+        """Returns whether the current platform can support v1 for the supplied
+        model configuration.
+        """
+        return True
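Taken together, these platform hooks let the engine select V1 on CPU. A hedged sketch of a user-visible check follows; VLLM_USE_V1 is the standard vLLM switch, and since supports_v1 ignores its model_config argument in this commit, passing None is enough for the check.

# Hedged sketch, not part of the commit: exercise the CpuPlatform hooks added here.
import os
os.environ["VLLM_USE_V1"] = "1"

from vllm.platforms.cpu import CpuPlatform

assert CpuPlatform.supports_v1(model_config=None)
assert CpuPlatform.supports_structured_output()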
