Commit 9718a9e

[NPU] Upgrade to v0.17.0 (vllm-project#1890)

Signed-off-by: gcanlin <canlinguosdu@gmail.com>

1 parent 88caaf1 commit 9718a9e

9 files changed: +357 −72 lines

docker/Dockerfile.npu

Lines changed: 6 additions & 6 deletions

```diff
@@ -1,17 +1,17 @@
 ARG VLLM_ASCEND_IMAGE=quay.io/ascend/vllm-ascend
-ARG VLLM_ASCEND_TAG=v0.14.0rc1
+ARG VLLM_ASCEND_TAG=v0.17.0rc1
 FROM ${VLLM_ASCEND_IMAGE}:${VLLM_ASCEND_TAG}
 
-WORKDIR /vllm-workspace/vllm-ascend
-RUN git checkout e2175d9c7e62b437391dfee996b1375674ba7c18
-RUN pip install -v -e .
-
 ARG APP_DIR=/vllm-workspace/vllm-omni
 WORKDIR ${APP_DIR}
 
 COPY . .
 
-RUN pip install -v -e .
+RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
+    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+    source /usr/local/Ascend/nnal/atb/set_env.sh && \
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
+    python3 -m pip install -v -e /vllm-workspace/vllm-omni/ --no-build-isolation
 
 ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
```
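The new RUN line builds the Ascend device-library path from the image's hardware platform before installing. A minimal sketch of how that path is composed (the toolkit prefix is taken from the Dockerfile above; `uname -i` expands to the platform name, e.g. `aarch64` or `x86_64`, though some kernels report `unknown`):

```shell
# Compose the devlib path the same way the Dockerfile's RUN line does,
# then append it to LD_LIBRARY_PATH so the Ascend device libraries resolve.
arch="$(uname -i)"
devlib="/usr/local/Ascend/ascend-toolkit/latest/${arch}-linux/devlib"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${devlib}"
echo "$devlib"
```

Sourcing `set_env.sh` for the toolkit and ATB beforehand, as the Dockerfile does, is what makes the `--no-build-isolation` install see the already-provisioned torch-npu toolchain.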
docker/Dockerfile.npu.a3

Lines changed: 6 additions & 6 deletions

```diff
@@ -1,17 +1,17 @@
 ARG VLLM_ASCEND_IMAGE=quay.io/ascend/vllm-ascend
-ARG VLLM_ASCEND_TAG=v0.14.0rc1-a3
+ARG VLLM_ASCEND_TAG=v0.17.0rc1-a3
 FROM ${VLLM_ASCEND_IMAGE}:${VLLM_ASCEND_TAG}
 
-WORKDIR /vllm-workspace/vllm-ascend
-RUN git checkout e2175d9c7e62b437391dfee996b1375674ba7c18
-RUN pip install -v -e .
-
 ARG APP_DIR=/vllm-workspace/vllm-omni
 WORKDIR ${APP_DIR}
 
 COPY . .
 
-RUN pip install -v -e .
+RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
+    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+    source /usr/local/Ascend/nnal/atb/set_env.sh && \
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
+    python3 -m pip install -v -e /vllm-workspace/vllm-omni/ --no-build-isolation
 
 ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
```
docs/getting_started/installation/npu/npu.inc.md

Lines changed: 14 additions & 7 deletions

````diff
@@ -33,18 +33,25 @@ docker run --rm \
   -p 8000:8000 \
   -it $IMAGE bash
 
+cd /vllm-workspace/vllm
+git pull origin main
+git fetch origin --tags
+git checkout v0.16.0
+
 # Because vllm-ascend will release v0.16.0rc1 after vllm-omni 0.16.0,
 # we have to pin vllm-ascend at the current commit.
 cd /vllm-workspace/vllm-ascend
+git pull origin main
 git checkout e2175d9c7e62b437391dfee996b1375674ba7c18
 pip install -v -e .
 
 # Inside the container, install vLLM-Omni from source
 cd /vllm-workspace
 git clone -b v0.16.0 https://github.com/vllm-project/vllm-omni.git
-
 cd vllm-omni
-pip install -v -e .
+pip install -v -e . --no-build-isolation
+# or VLLM_OMNI_TARGET_DEVICE=npu pip install -v -e .
+
 export VLLM_WORKER_MULTIPROC_METHOD=spawn
 ```
 
@@ -61,22 +68,22 @@ We are keeping [issue #886](https://github.com/vllm-project/vllm-omni/issues/886
 You can also build vLLM-Omni from the latest main branch if you want to use the latest features or bug fixes. (But sometimes it will break for a while. You can check [issue #886](https://github.com/vllm-project/vllm-omni/issues/886) for the status of the latest commit of vLLM-Omni main branch on NPU.)
 
 ```bash
-# Pin vLLM version to 0.16.0
+# Pin vLLM version to 0.17.0
 cd /vllm-workspace/vllm
 git pull origin main
 git fetch origin --tags
-git checkout v0.16.0
+git checkout v0.17.0
 VLLM_TARGET_DEVICE=empty pip install -v -e .
 
 # Because vllm-ascend has not yet entered continuous development and has not been officially released, we need to pin it to a specific commit. Please note that this commit may change over time.
-cd ../vllm-ascend
+cd /vllm-workspace/vllm-ascend
 git pull origin main
 git fetch origin --tags
-git checkout e2175d9c7e62b437391dfee996b1375674ba7c18
+git checkout v0.17.0
 pip install -v -e .
 
 # Install vLLM-Omni from the latest main branch
-cd ../vllm-omni
+cd /vllm-workspace/vllm-omni
 git clone https://github.com/vllm-project/vllm-omni.git
 pip install -v -e . --no-build-isolation
 # or VLLM_OMNI_TARGET_DEVICE=npu pip install -v -e .
````
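The docs above offer two equivalent installs: `--no-build-isolation` reuses the environment's already-installed torch/torch-npu at build time, while `VLLM_OMNI_TARGET_DEVICE=npu` pins the backend explicitly. A sketch of the env-var-driven selection (the `auto` fallback here is a hypothetical default for illustration, not necessarily vllm-omni's actual one):

```shell
# Read the backend override the way an env-var-driven setup script would;
# fall back to automatic detection when the variable is unset.
target="${VLLM_OMNI_TARGET_DEVICE:-auto}"
echo "vllm-omni build target: $target"
```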

vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py

Lines changed: 6 additions & 0 deletions

```diff
@@ -22,6 +22,8 @@
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 
+from vllm_omni.platforms import current_omni_platform
+
 logger = init_logger(__name__)
 
 
@@ -343,6 +345,10 @@ def _ensure_cached_refs(self) -> None:
     def _ensure_model_fwd(self) -> None:
         if self._model_fwd is not None:
             return
+        if not current_omni_platform.supports_torch_inductor():
+            logger.warning_once("code_predictor: torch.compile disabled")
+            self._model_fwd = self.model.forward
+            return
         self._model_fwd = torch.compile(
             self.model.forward,
             mode="default",
```
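The guard added here is a lazy-bind-with-fallback pattern: the forward callable is resolved once, and torch.compile is skipped on platforms without TorchInductor support. A reduced sketch that runs without torch — `supports_torch_inductor` stands in for `current_omni_platform.supports_torch_inductor()`, and the compile step is stubbed:

```python
def supports_torch_inductor() -> bool:
    # Assumption for this sketch: an NPU backend reports False.
    return False

class CodePredictor:
    def __init__(self) -> None:
        self._model_fwd = None  # resolved lazily on first use

    def _forward(self, x: int) -> int:
        return 2 * x  # placeholder for self.model.forward

    def _ensure_model_fwd(self) -> None:
        if self._model_fwd is not None:
            return  # already bound
        if not supports_torch_inductor():
            # Fall back to the eager forward instead of compiling.
            self._model_fwd = self._forward
            return
        # On Inductor-capable platforms the real code does:
        #   self._model_fwd = torch.compile(self.model.forward, mode="default")
        self._model_fwd = self._forward

predictor = CodePredictor()
predictor._ensure_model_fwd()
```

Binding `self.model.forward` directly keeps every call site unchanged; only the one-time resolution branches on platform capability.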

vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code_predictor_vllm.py

Lines changed: 6 additions & 0 deletions

```diff
@@ -21,6 +21,8 @@
 )
 from vllm.model_executor.models.utils import is_pp_missing_parameter
 
+from vllm_omni.platforms import current_omni_platform
+
 from .configuration_qwen3_tts import Qwen3TTSTalkerCodePredictorConfig, Qwen3TTSTalkerConfig
 
 logger = init_logger(__name__)
@@ -410,6 +412,10 @@ def _setup_compile(self) -> None:
         """
         if self._compiled_model_fwd is not None:
             return
+        if not current_omni_platform.supports_torch_inductor():
+            logger.warning_once("code_predictor: torch.compile disabled")
+            self._compiled_model_fwd = self.model.forward
+            return
         self._compiled_model_fwd = torch.compile(
             self.model.forward,
             mode="default",
```
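Both guards use `logger.warning_once`, which deduplicates the "torch.compile disabled" message so it is emitted a single time per process rather than on every request. A minimal stand-in for that behavior (illustrative only, not vLLM's implementation):

```python
import logging

_seen_messages: set[str] = set()

def warning_once(logger: logging.Logger, msg: str) -> None:
    """Log `msg` at WARNING level only the first time it is seen."""
    if msg in _seen_messages:
        return  # already warned; stay silent
    _seen_messages.add(msg)
    logger.warning(msg)
```

Keyed on the message text, every later call with the same string is a no-op, which matters here because `_setup_compile` can be reached on many inference paths.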
Lines changed: 101 additions & 0 deletions (new file)

```yaml
# Stage config for running Qwen3-Omni-MoE with 3-stage architecture
# Stage 0: Thinker (multimodal understanding + text generation)
# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes)
# Stage 2: Code2Wav (16-layer RVQ codes → audio waveform)

# The following config has been verified on 2x H100-80G GPUs.
async_chunk: true
stage_args:
- stage_id: 0
  stage_type: llm # Use llm stage type to launch OmniLLM
  runtime:
    devices: "0,1"
    max_batch_size: 10
  engine_args:
    model_stage: thinker
    model_arch: Qwen3OmniMoeForConditionalGeneration
    worker_type: ar
    scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
    gpu_memory_utilization: 0.9
    enforce_eager: false
    trust_remote_code: true
    engine_output_type: latent # Output hidden states for talker
    distributed_executor_backend: "mp"
    enable_prefix_caching: false
    max_num_batched_tokens: 32768
    hf_config_name: thinker_config
    tensor_parallel_size: 2
    custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk
    final_output: true
    final_output_type: text
    is_comprehension: true
  default_sampling_params:
    temperature: 0.4
    top_p: 0.9
    top_k: 1
    max_tokens: 2048
    seed: 42
    detokenize: True
    repetition_penalty: 1.05

- stage_id: 1
  stage_type: llm # Use llm stage type to launch OmniLLM
  runtime:
    devices: "2"
    max_batch_size: 10
  engine_args:
    model_stage: talker
    model_arch: Qwen3OmniMoeForConditionalGeneration
    worker_type: ar
    scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
    gpu_memory_utilization: 0.6
    enforce_eager: true
    trust_remote_code: true
    engine_output_type: latent # Output codec codes for code2wav
    enable_prefix_caching: false
    max_num_batched_tokens: 32768
    distributed_executor_backend: "mp"
    hf_config_name: talker_config
    custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav_async_chunk
    engine_input_source: [0]
    # final_output: true
    # final_output_type: text
  default_sampling_params:
    temperature: 0.9
    top_k: 50
    max_tokens: 4096
    seed: 42
    detokenize: False
    repetition_penalty: 1.0
    stop_token_ids: [2150]

- stage_id: 2
  stage_type: llm # Use llm stage type to launch OmniLLM
  runtime:
    devices: "2"
    max_batch_size: 10
  engine_args:
    model_stage: code2wav
    model_arch: Qwen3OmniMoeForConditionalGeneration
    worker_type: generation
    scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
    enforce_eager: true
    trust_remote_code: true
    async_scheduling: false
    enable_prefix_caching: false
    engine_output_type: audio # Final output: audio waveform
    gpu_memory_utilization: 0.3
    distributed_executor_backend: "mp"
    max_num_batched_tokens: 51200 # [TODO] if max_num_batch_tokens < max_batch_size * 800, there will be precision problem.
    hf_config_name: thinker_config
    engine_input_source: [1]
    final_output: true
    final_output_type: audio
  default_sampling_params:
    temperature: 0.0
    top_p: 1.0
    top_k: -1
    max_tokens: 65536
    seed: 42
    detokenize: True
    repetition_penalty: 1.1
```
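The stages above form a pipeline: each `engine_input_source` must reference an already-defined stage, and exactly one stage produces the final audio. A quick structural check of that wiring, written against a plain-dict rendering of the config (field placement is an assumption from the flattened diff, not a schema):

```python
# Minimal pipeline-wiring check: sources must point at earlier stages,
# and exactly one stage may declare the audio final output.
stages = [
    {"stage_id": 0, "engine_input_source": None, "final_output_type": "text"},
    {"stage_id": 1, "engine_input_source": [0], "final_output_type": None},
    {"stage_id": 2, "engine_input_source": [1], "final_output_type": "audio"},
]

def check_pipeline(stages: list[dict]) -> bool:
    seen: set[int] = set()
    for stage in stages:
        sources = stage["engine_input_source"] or []
        # Every input source must be a stage defined earlier in the list.
        assert all(src in seen for src in sources), (
            f"stage {stage['stage_id']} reads an undefined stage"
        )
        seen.add(stage["stage_id"])
    audio_stages = [s for s in stages if s["final_output_type"] == "audio"]
    assert len(audio_stages) == 1, "exactly one stage should emit audio"
    return True
```

Running `check_pipeline(stages)` on the three stages above passes: stage 1 consumes stage 0's thinker latents and stage 2 consumes stage 1's codec codes.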
