@@ -57,7 +57,10 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
             logger.info("Using CPU MLA backend.")
             return "vllm.attention.backends.cpu_mla.CPUMLABackend"
         logger.info("Using Torch SDPA backend.")
-        return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"
+        if use_v1:
+            return "vllm.v1.attention.backends.cpu_attn.TorchSDPABackend"
+        else:
+            return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"
 
     @classmethod
     def get_device_total_memory(cls, device_id: int = 0) -> int:
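The hunk above keeps returning the attention backend as a dotted import path and only switches which path is picked based on `use_v1`. As a hedged illustration of how such a string can later be turned into a class (not vLLM's actual loading code; `resolve_backend_cls` is a hypothetical helper):

```python
# Hedged sketch: resolve a dotted backend path like the strings returned above.
import importlib

def resolve_backend_cls(qualname: str):
    # "pkg.module.ClassName" -> import "pkg.module", then fetch "ClassName".
    module_path, _, cls_name = qualname.rpartition(".")
    return getattr(importlib.import_module(module_path), cls_name)

# The two paths selected by the diff:
#   use_v1=True  -> "vllm.v1.attention.backends.cpu_attn.TorchSDPABackend"
#   use_v1=False -> "vllm.attention.backends.torch_sdpa.TorchSDPABackend"
```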
@@ -81,6 +84,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         if not model_config.enforce_eager:
             model_config.enforce_eager = True
 
+        model_config.disable_cascade_attn = True
+
         cache_config = vllm_config.cache_config
 
         ipex_available = find_spec("intel_extension_for_pytorch") is not None
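For context, the `find_spec` check at the end of this hunk detects whether `intel_extension_for_pytorch` is installed without actually importing it. A minimal standalone sketch of the same pattern (`has_ipex` is just an illustrative name):

```python
# Hedged sketch of optional-dependency detection via importlib.util.find_spec,
# mirroring the ipex_available check in the diff.
from importlib.util import find_spec

has_ipex = find_spec("intel_extension_for_pytorch") is not None
print("IPEX available" if has_ipex else "IPEX not installed")
```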
@@ -128,7 +133,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
128133 f" { kv_cache_space } , expect a positive integer value." )
129134
130135 parallel_config = vllm_config .parallel_config
131- if (parallel_config .distributed_executor_backend is not None
136+ if (parallel_config .world_size > 1
137+ and parallel_config .distributed_executor_backend is not None
132138 and parallel_config .distributed_executor_backend != "mp" ):
133139 logger .warning (("%s is not supported on CPU, fallback to mp "
134140 "distributed executor backend." ),
@@ -141,14 +147,51 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             parallel_config.sd_worker_cls = \
                 "vllm.worker.cpu_worker.CPUWorker"
         else:
-            parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker"
+            if envs.VLLM_USE_V1:
+                parallel_config.worker_cls = \
+                    "vllm.v1.worker.cpu_worker.CPUWorker"
+            else:
+                parallel_config.worker_cls = \
+                    "vllm.worker.cpu_worker.CPUWorker"
+
+        # Note: workaround for v1 gpu_model_runner
+        from vllm.config import CompilationLevel
+        vllm_config.compilation_config.cudagraph_capture_sizes = []
+
+        compilation_config = vllm_config.compilation_config
+        if (envs.VLLM_USE_V1 and vllm_config.compilation_config.level
+                == CompilationLevel.PIECEWISE):
+            compilation_config.level = CompilationLevel.DYNAMO_ONCE
+            compilation_config.backend = "eager"
+            compilation_config.custom_ops += ["none"]
+            compilation_config.inductor_compile_config.update({
+                "dce":
+                True,
+                "size_asserts":
+                False,
+                "nan_asserts":
+                False,
+                "memory_planning":
+                True,
+                "epilogue_fusion":
+                True,
+            })
+
+            if vllm_config.lora_config is not None:
+                compilation_config.level = CompilationLevel.NO_COMPILATION
 
         assert vllm_config.device_config.device_type == "cpu"
 
         #
         # Environment variables for CPU executor
         #
 
+        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+        # Note: to avoid the error 'nthreads cannot be larger than environment
+        # variable "NUMEXPR_MAX_THREADS" (64)'.
+        os.environ["NUMEXPR_MAX_THREADS"] = str(len(os.sched_getaffinity(0)))
+
         # Set default threads num for OpenMP parallel
         os.environ["OMP_NUM_THREADS"] = str(torch.get_num_threads())
 
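This hunk also pins several process-wide environment variables before workers start. A hedged standalone sketch of the same computations (Linux-only, since `os.sched_getaffinity` is not available on every platform):

```python
# Hedged sketch of the thread/env settings added in the diff; setting these
# outside vLLM is purely illustrative.
import os
import torch

# Workers are started with "spawn" instead of "fork".
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

# Cap numexpr's thread pool at the number of CPUs this process may run on.
os.environ["NUMEXPR_MAX_THREADS"] = str(len(os.sched_getaffinity(0)))

# Default OpenMP parallelism follows torch's intra-op thread count.
os.environ["OMP_NUM_THREADS"] = str(torch.get_num_threads())
```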
@@ -171,13 +214,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         # To hint IPEX uses shared memory based AllReduce
         os.environ["LOCAL_WORLD_SIZE"] = str(
             vllm_config.parallel_config.tensor_parallel_size)
-        if sys.platform == "darwin" and \
-                envs.VLLM_WORKER_MULTIPROC_METHOD == "fork":
-            if os.environ.get('VLLM_WORKER_MULTIPROC_METHOD', None) is None:
-                logger.warning(
-                    "Default to spawn method on MacOS. If this is not desired,"
-                    " set VLLM_WORKER_MULTIPROC_METHOD to fork explicitly.")
-                os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
 
         if vllm_config.model_config and vllm_config.model_config.use_mla:
             logger.info(
@@ -204,3 +240,14 @@ def get_device_communicator_cls(cls) -> str:
         Get device specific communicator class for distributed communication.
         """
         return "vllm.distributed.device_communicators.cpu_communicator.CpuCommunicator"  # noqa
+
+    @classmethod
+    def supports_structured_output(cls) -> bool:
+        return True
+
+    @classmethod
+    def supports_v1(cls, model_config) -> bool:
+        """Returns whether the current platform can support v1 for the supplied
+        model configuration.
+        """
+        return True
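The two classmethods added at the end advertise platform capabilities the engine can query. A hedged usage sketch (the `pick_engine` helper is hypothetical; the real engine-selection logic lives elsewhere in vLLM):

```python
# Hedged sketch: querying the capability hooks added in this diff.
from vllm.platforms import current_platform

def pick_engine(model_config) -> str:
    # CpuPlatform now reports V1 support for any model config.
    return "v1" if current_platform.supports_v1(model_config) else "v0"
```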