
Conversation

@wtripp180901 (Contributor) commented May 15, 2025

When testing the current default Phi model on Intel GPUs with the default Azimuth UI values, I encountered the following error in the API pod:

INFO 05-15 12:29:37 xpu_model_runner.py:425] Loading model weights took 7.1418 GB
ERROR 05-15 12:29:38 engine.py:389] IPEX varlen fwd do not support causal when head_dim * sizeof(dtype) not 128 byte aligned.
ERROR 05-15 12:29:38 engine.py:389] Traceback (most recent call last):
ERROR 05-15 12:29:38 engine.py:389]   File "/workspace/vllm/vllm/engine/multiprocessing/engine.py", line 380, in run_mp_engine
ERROR 05-15 12:29:38 engine.py:389]     engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
ERROR 05-15 12:29:38 engine.py:389]   File "/workspace/vllm/vllm/engine/multiprocessing/engine.py", line 123, in from_engine_args
ERROR 05-15 12:29:38 engine.py:389]     return cls(ipc_path=ipc_path,
ERROR 05-15 12:29:38 engine.py:389]   File "/workspace/vllm/vllm/engine/multiprocessing/engine.py", line 75, in __init__
ERROR 05-15 12:29:38 engine.py:389]     self.engine = LLMEngine(*args, **kwargs)
ERROR 05-15 12:29:38 engine.py:389]   File "/workspace/vllm/vllm/engine/llm_engine.py", line 276, in __init__
ERROR 05-15 12:29:38 engine.py:389]     self._initialize_kv_caches()
ERROR 05-15 12:29:38 engine.py:389]   File "/workspace/vllm/vllm/engine/llm_engine.py", line 416, in _initialize_kv_caches
ERROR 05-15 12:29:38 engine.py:389]     self.model_executor.determine_num_available_blocks())
ERROR 05-15 12:29:38 engine.py:389]   File "/workspace/vllm/vllm/executor/executor_base.py", line 101, in determine_num_available_blocks
ERROR 05-15 12:29:38 engine.py:389]     results = self.collective_rpc("determine_num_available_blocks")
ERROR 05-15 12:29:38 engine.py:389]   File "/workspace/vllm/vllm/executor/uniproc_executor.py", line 51, in collective_rpc
ERROR 05-15 12:29:38 engine.py:389]     answer = run_method(self.driver_worker, method, args, kwargs)
ERROR 05-15 12:29:38 engine.py:389]   File "/workspace/vllm/vllm/utils.py", line 2220, in run_method
ERROR 05-15 12:29:38 engine.py:389]     return func(*args, **kwargs)
ERROR 05-15 12:29:38 engine.py:389]   File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
ERROR 05-15 12:29:38 engine.py:389]     return func(*args, **kwargs)
ERROR 05-15 12:29:38 engine.py:389]   File "/workspace/vllm/vllm/worker/xpu_worker.py", line 106, in determine_num_available_blocks
ERROR 05-15 12:29:38 engine.py:389]     self.model_runner.profile_run()
ERROR 05-15 12:29:38 engine.py:389]   File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
ERROR 05-15 12:29:38 engine.py:389]     return func(*args, **kwargs)
ERROR 05-15 12:29:38 engine.py:389]   File "/workspace/vllm/vllm/worker/xpu_model_runner.py", line 505, in profile_run
ERROR 05-15 12:29:38 engine.py:389]     self.execute_model(model_input, kv_caches, intermediate_tensors)
ERROR 05-15 12:29:38 engine.py:389]   File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
ERROR 05-15 12:29:38 engine.py:389]     return func(*args, **kwargs)
ERROR 05-15 12:29:38 engine.py:389]   File "/workspace/vllm/vllm/worker/xpu_model_runner.py", line 581, in execute_model
ERROR 05-15 12:29:38 engine.py:389]     hidden_or_intermediate_states = model_executable(
ERROR 05-15 12:29:38 engine.py:389]   File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
ERROR 05-15 12:29:38 engine.py:389]     return self._call_impl(*args, **kwargs)
ERROR 05-15 12:29:38 engine.py:389]   File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
ERROR 05-15 12:29:38 engine.py:389]     return forward_call(*args, **kwargs)
ERROR 05-15 12:29:38 engine.py:389]   File "/workspace/vllm/vllm/model_executor/models/llama.py", line 541, in forward
ERROR 05-15 12:29:38 engine.py:389]     model_output = self.model(input_ids, positions, kv_caches,
ERROR 05-15 12:29:38 engine.py:389]   File "/workspace/vllm/vllm/compilation/decorators.py", line 172, in __call__
ERROR 05-15 12:29:38 engine.py:389]     return self.forward(*args, **kwargs)
ERROR 05-15 12:29:38 engine.py:389]   File "/workspace/vllm/vllm/model_executor/models/llama.py", line 365, in forward
ERROR 05-15 12:29:38 engine.py:389]     hidden_states, residual = layer(positions, hidden_states,
ERROR 05-15 12:29:38 engine.py:389]   File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
ERROR 05-15 12:29:38 engine.py:389]     return self._call_impl(*args, **kwargs)
ERROR 05-15 12:29:38 engine.py:389]   File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
ERROR 05-15 12:29:38 engine.py:389]     return forward_call(*args, **kwargs)
ERROR 05-15 12:29:38 engine.py:389]   File "/workspace/vllm/vllm/model_executor/models/llama.py", line 279, in forward
ERROR 05-15 12:29:38 engine.py:389]     hidden_states = self.self_attn(positions=positions,
ERROR 05-15 12:29:38 engine.py:389]   File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
ERROR 05-15 12:29:38 engine.py:389]     return self._call_impl(*args, **kwargs)
ERROR 05-15 12:29:38 engine.py:389]   File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
ERROR 05-15 12:29:38 engine.py:389]     return forward_call(*args, **kwargs)
ERROR 05-15 12:29:38 engine.py:389]   File "/workspace/vllm/vllm/model_executor/models/llama.py", line 203, in forward
ERROR 05-15 12:29:38 engine.py:389]     attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
ERROR 05-15 12:29:38 engine.py:389]   File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
ERROR 05-15 12:29:38 engine.py:389]     return self._call_impl(*args, **kwargs)
ERROR 05-15 12:29:38 engine.py:389]   File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
ERROR 05-15 12:29:38 engine.py:389]     return forward_call(*args, **kwargs)
ERROR 05-15 12:29:38 engine.py:389]   File "/workspace/vllm/vllm/attention/layer.py", line 198, in forward
ERROR 05-15 12:29:38 engine.py:389]     return self.impl.forward(self, query, key, value,
ERROR 05-15 12:29:38 engine.py:389]   File "/workspace/vllm/vllm/attention/backends/ipex_attn.py", line 245, in forward
ERROR 05-15 12:29:38 engine.py:389]     ipex_ops.varlen_attention(
ERROR 05-15 12:29:38 engine.py:389]   File "/workspace/vllm/vllm/_ipex_ops.py", line 190, in varlen_attention
ERROR 05-15 12:29:38 engine.py:389]     ipex.llm.functional.varlen_attention(query.contiguous(),
ERROR 05-15 12:29:38 engine.py:389]   File "/usr/local/lib/python3.10/dist-packages/intel_extension_for_pytorch/llm/functional/fusions.py", line 283, in varlen_attention
ERROR 05-15 12:29:38 engine.py:389]     return VarlenAttention.apply_function(
ERROR 05-15 12:29:38 engine.py:389]   File "/usr/local/lib/python3.10/dist-packages/intel_extension_for_pytorch/llm/modules/mha_fusion.py", line 379, in apply_function
ERROR 05-15 12:29:38 engine.py:389]     ).apply_function(
ERROR 05-15 12:29:38 engine.py:389]   File "/usr/local/lib/python3.10/dist-packages/intel_extension_for_pytorch/transformers/models/xpu/fusions/mha_fusion.py", line 237, in apply_function
ERROR 05-15 12:29:38 engine.py:389]     _IPEXVarlenScaledDotProductXPU.apply_function_flash_varlen(
ERROR 05-15 12:29:38 engine.py:389]   File "/usr/local/lib/python3.10/dist-packages/intel_extension_for_pytorch/transformers/models/xpu/fusions/mha_fusion.py", line 291, in apply_function_flash_varlen
ERROR 05-15 12:29:38 engine.py:389]     torch.ops.torch_ipex.varlen_fwd(
ERROR 05-15 12:29:38 engine.py:389]   File "/usr/local/lib/python3.10/dist-packages/torch/_ops.py", line 1116, in __call__
ERROR 05-15 12:29:38 engine.py:389]     return self._op(*args, **(kwargs or {}))
ERROR 05-15 12:29:38 engine.py:389] RuntimeError: IPEX varlen fwd do not support causal when head_dim * sizeof(dtype) not 128 byte aligned.
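
For context, the failing torch_ipex.varlen_fwd kernel requires head_dim * sizeof(dtype) to be a multiple of 128 bytes, so with a 2-byte dtype (float16/bfloat16) only head dimensions that are multiples of 64 are accepted. A minimal sketch of that check (the head_dim values below are illustrative, not read from any model config):

```python
# Sketch of the alignment rule the IPEX varlen kernel enforces:
# head_dim * sizeof(dtype) must be a multiple of 128 bytes.

def ipex_varlen_supported(head_dim: int, dtype_bytes: int = 2) -> bool:
    """Return True if head_dim * sizeof(dtype) is 128-byte aligned.

    dtype_bytes defaults to 2 (float16/bfloat16).
    """
    return (head_dim * dtype_bytes) % 128 == 0

print(ipex_varlen_supported(96))   # False -> triggers the RuntimeError above
print(ipex_varlen_supported(128))  # True  -> accepted by this backend
```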

It may therefore be worth changing the UI default to a model that is known to work in all environments.
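
One way to vet candidate defaults would be to read each model's Hugging Face config and apply the same alignment check before shipping it as the UI default. A hypothetical helper along those lines (the head_dim_ok name and the config-field fallback are assumptions; models that set head_dim explicitly in their config may differ):

```python
# Hypothetical helper: check a candidate default model's head_dim
# against the IPEX 128-byte alignment constraint before making it
# the UI default. Falls back to the common Hugging Face config
# fields hidden_size / num_attention_heads when head_dim is unset.
from transformers import AutoConfig

def head_dim_ok(model_id: str, dtype_bytes: int = 2) -> bool:
    cfg = AutoConfig.from_pretrained(model_id)
    head_dim = getattr(cfg, "head_dim", None)
    if head_dim is None:
        head_dim = cfg.hidden_size // cfg.num_attention_heads
    return (head_dim * dtype_bytes) % 128 == 0
```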

@wtripp180901 requested a review from sd109 on May 19, 2025 at 09:53
@sd109 (Member) left a comment:


LGTM

@sd109 merged commit 6528a53 into main on May 19, 2025 (7 checks passed)
@sd109 deleted the feat/deepseek-default branch on May 19, 2025 at 10:20