Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/_e2e_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ on:
continue_on_error:
required: false
type: boolean
default: false
default: true
env:
UV_INDEX_URL: http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
UV_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/bot_pr_create.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
steps:
- name: Get vLLM version
run: |
VLLM_COMMIT=ed359c497a728f08b5b41456c07a688ccd510fbc
VLLM_COMMIT=14acf429ac08b6d538ca6feb3e06b6d13895804d
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"

- name: Checkout repository
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/dockerfiles/Dockerfile.lint
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ RUN apt-get update -y && \

ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# For lint purposes, we actually need to do main-to-main matching here.
ARG VLLM_COMMIT=ed359c497a728f08b5b41456c07a688ccd510fbc
ARG VLLM_COMMIT=14acf429ac08b6d538ca6feb3e06b6d13895804d
RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
cd /vllm-workspace/vllm && \
git checkout $VLLM_COMMIT
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [ed359c497a728f08b5b41456c07a688ccd510fbc, v0.18.0]
vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: ed359c497a728f08b5b41456c07a688ccd510fbc
vllm: 14acf429ac08b6d538ca6feb3e06b6d13895804d
changes:
runs-on: linux-aarch64-a2b3-0
outputs:
Expand Down Expand Up @@ -90,7 +90,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy:
matrix:
vllm_version: [ed359c497a728f08b5b41456c07a688ccd510fbc, v0.18.0]
vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
Expand All @@ -102,7 +102,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [ed359c497a728f08b5b41456c07a688ccd510fbc, v0.18.0]
vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0]
# Note (yikun): If CI resources are limited, we can split this job into two chained jobs.
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/schedule_codecov_refresh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
name: refresh codecov
strategy:
matrix:
vllm_version: [ed359c497a728f08b5b41456c07a688ccd510fbc]
vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
Expand Down
2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------|
| main | ed359c497a728f08b5b41456c07a688ccd510fbc, v0.18.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
| main | 14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |

## Release cadence

Expand Down
2 changes: 1 addition & 1 deletion vllm_ascend/ascend_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def __init__(self, vllm_config: "VllmConfig"):
# When enable_async_exponential is True, AscendSampler behaves differently
# from the vLLM Sampler, which breaks batch_invariant mode.
# Therefore, disable async exponential whenever batch_invariant mode is enabled.
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
from vllm_ascend.batch_invariant import vllm_is_batch_invariant

self.enable_async_exponential = (
bool(additional_config.get("enable_async_exponential", False)) and not vllm_is_batch_invariant()
Expand Down
16 changes: 15 additions & 1 deletion vllm_ascend/batch_invariant.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,28 @@

import torch
import torch_npu
import vllm.envs as envs
from vllm.logger import logger
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
from vllm.triton_utils import HAS_TRITON

# in case recursive call in reduce_sum.
torch_sum = torch.sum


def vllm_is_batch_invariant() -> bool:
    """Check whether batch-invariant mode is enabled.

    Compatibility wrapper for ``vllm_is_batch_invariant``, which was removed
    from upstream vLLM (``vllm.model_executor.layers.batch_invariant``)
    during a recent refactoring.

    Returns:
        bool: True when the ``VLLM_BATCH_INVARIANT`` flag is set, read from
        vLLM's ``envs`` module when available, otherwise from the raw
        environment variable.
    """
    # Prefer the value parsed by vLLM's envs module when this vLLM version
    # still exposes the attribute.
    if hasattr(envs, "VLLM_BATCH_INVARIANT"):
        return bool(envs.VLLM_BATCH_INVARIANT)
    # Fallback for vLLM versions without the envs attribute.
    # Import `os` locally: it is not imported at module level in this file,
    # so the original fallback path raised NameError.
    import os

    return bool(int(os.getenv("VLLM_BATCH_INVARIANT", "0")))


if HAS_TRITON:
from vllm_ascend.ops.triton.batch_invariant.matmul import (
addmm_batch_invariant,
Expand Down
2 changes: 1 addition & 1 deletion vllm_ascend/sample/sampler.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import torch
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler
from vllm.v1.sample.sampler import Sampler

from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.batch_invariant import vllm_is_batch_invariant
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type, global_stream, npu_stream_switch

DEFAULT_LOGPROBS_MODE = "raw_logprobs"
Expand Down
3 changes: 3 additions & 0 deletions vllm_ascend/spec_decode/eagle_proposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ class SpecDecodeBaseProposer(EagleProposer):
def __init__(self, vllm_config: VllmConfig, device: torch.device, pass_hidden_states_to_model: bool, runner=None):
super().__init__(vllm_config, device, runner)

# Assign runner before it's used in the methods below
self.runner = runner

self.use_async_scheduling = self.vllm_config.scheduler_config.async_scheduling
self.pass_hidden_states_to_model = pass_hidden_states_to_model
self.decode_threshold = 1 + self.num_speculative_tokens
Expand Down
2 changes: 1 addition & 1 deletion vllm_ascend/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def enable_custom_op():
Enable lazy init for vllm_ascend_C to avoid early initialization of CANN's RTS component.
Ensure that ASCEND_RT_VISIBLE_DEVICES can be dynamically modified before torch.npu.set_device().
"""
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
from vllm_ascend.batch_invariant import vllm_is_batch_invariant

global _CUSTOM_OP_ENABLED

Expand Down
19 changes: 14 additions & 5 deletions vllm_ascend/worker/model_runner_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,15 @@ class ExecuteModelState(NamedTuple):


class NPUModelRunner(GPUModelRunner):
@staticmethod
def _get_device_tensor(buf):
    """Return the device-resident tensor backing *buf*.

    Compatibility shim: older vLLM versions wrap the device tensor in a
    CpuGpuBuffer that exposes a ``.gpu`` attribute, while newer versions
    pass the tensor directly. In either case the device tensor is returned
    unchanged.
    """
    # getattr with a default collapses the hasattr/ternary into one lookup.
    return getattr(buf, "gpu", buf)

def __init__(self, vllm_config: VllmConfig, device: torch.device):
# TODO(qcs): These manual pad and unpad for GPUModelRunner are
# used to expand some buffers, which need to be reverted after
Expand Down Expand Up @@ -2426,17 +2435,17 @@ def _dummy_run(
assert num_tokens_padded <= self.max_num_tokens
if self.is_multimodal_model and not self.model_config.is_encoder_decoder or self.enable_prompt_embeds:
input_ids = None
inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded]
inputs_embeds = self._get_device_tensor(self.inputs_embeds)[:num_tokens_padded]
else:
input_ids = self.input_ids.gpu[:num_tokens_padded]
input_ids = self._get_device_tensor(self.input_ids)[:num_tokens_padded]
inputs_embeds = None

if self.uses_mrope:
positions = self.mrope_positions.gpu[:, :num_tokens_padded]
positions = self._get_device_tensor(self.mrope_positions)[:, :num_tokens_padded]
elif self.uses_xdrope_dim > 0:
positions = self.xdrope_positions.gpu[:, :num_tokens_padded]
positions = self._get_device_tensor(self.xdrope_positions)[:, :num_tokens_padded]
else:
positions = self.positions.gpu[:num_tokens_padded]
positions = self._get_device_tensor(self.positions)[:num_tokens_padded]

# update global cos, sin
update_cos_sin(positions)
Expand Down
16 changes: 14 additions & 2 deletions vllm_ascend/worker/npu_input_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,20 @@ def __init__(
# Maps req_index -> tensor of shape (num_prompt_tokens, hidden_size)
self.req_prompt_embeds: dict[int, torch.Tensor] = {}
self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32)
self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32)
self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
self.num_tokens_no_spec_cpu_tensor = torch.zeros(
(max_num_reqs,),
device="cpu",
dtype=torch.int32,
pin_memory=pin_memory,
)
self.num_tokens_no_spec = self.num_tokens_no_spec_cpu_tensor.numpy()
self.num_prompt_tokens_cpu_tensor = torch.zeros(
(max_num_reqs,),
device="cpu",
dtype=torch.int32,
pin_memory=pin_memory,
)
self.num_prompt_tokens = self.num_prompt_tokens_cpu_tensor.numpy()
self.num_computed_tokens_cpu_tensor = torch.zeros(
(max_num_reqs,),
device="cpu",
Expand Down
Loading