
Commit e19f659

22dimensions and claude committed:
upgrade vllm to 0323 commit id: 35141a7eeda941a60ad5a4956670c60fd5a77029
fix: add missing num_prompt_tokens_cpu_tensor to NPUInputBatch

Adapt to upstream vLLM changes in InputBatch. vLLM v1 refactored InputBatch to use torch tensors for its CPU data structures, with numpy views over them, matching the pattern used for the other batch statistics.

- Added num_tokens_no_spec_cpu_tensor and num_tokens_no_spec
- Added num_prompt_tokens_cpu_tensor and updated num_prompt_tokens to be a numpy view
- Fixes AttributeError: 'NPUInputBatch' object has no attribute 'num_prompt_tokens_cpu_tensor'

Affects: all pooling model tests that access input batch metadata.

Co-Authored-By: Claude Code <noreply@anthropic.com>
Signed-off-by: 22dimensions <waitingwind@foxmail.com>
1 parent 114ec75 commit e19f659
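The pattern the commit message describes, a CPU torch tensor with a zero-copy numpy view over the same storage, can be sketched as follows. This is a minimal illustration, not code from the diff; `max_num_reqs` is an arbitrary value and `pin_memory` is hardcoded to False here for portability (the real class passes the runner's pin-memory flag).

```python
import numpy as np
import torch

max_num_reqs = 8  # illustrative; the real value comes from scheduler config

# Batch statistic backed by a CPU torch tensor, as in the refactored InputBatch.
num_prompt_tokens_cpu_tensor = torch.zeros(
    (max_num_reqs,),
    device="cpu",
    dtype=torch.int32,
    pin_memory=False,  # the real code forwards the runner's pin_memory flag
)
# torch.Tensor.numpy() shares memory with the tensor, so cheap per-request
# updates through the numpy array are immediately visible on the tensor side.
num_prompt_tokens = num_prompt_tokens_cpu_tensor.numpy()

num_prompt_tokens[3] = 17
print(int(num_prompt_tokens_cpu_tensor[3]))  # 17
```

This is why code that only created the numpy array (without the backing tensor) raised the AttributeError: callers expecting the upstream layout look up the `*_cpu_tensor` attribute directly.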

File tree: 7 files changed (+22, -10 lines)


.github/workflows/bot_pr_create.yaml
Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ jobs:
     steps:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=ed359c497a728f08b5b41456c07a688ccd510fbc
+          VLLM_COMMIT=35141a7eeda941a60ad5a4956670c60fd5a77029
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"

       - name: Checkout repository

.github/workflows/dockerfiles/Dockerfile.lint
Lines changed: 1 addition & 1 deletion

@@ -27,7 +27,7 @@ RUN apt-get update -y && \

 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_COMMIT=ed359c497a728f08b5b41456c07a688ccd510fbc
+ARG VLLM_COMMIT=35141a7eeda941a60ad5a4956670c60fd5a77029
 RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
     cd /vllm-workspace/vllm && \
     git checkout $VLLM_COMMIT

.github/workflows/pr_test_full.yaml
Lines changed: 1 addition & 1 deletion

@@ -75,7 +75,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [ed359c497a728f08b5b41456c07a688ccd510fbc, v0.18.0]
+        vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029, v0.18.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml

.github/workflows/pr_test_light.yaml
Lines changed: 3 additions & 3 deletions

@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: ed359c497a728f08b5b41456c07a688ccd510fbc
+      vllm: 35141a7eeda941a60ad5a4956670c60fd5a77029
   changes:
     runs-on: linux-aarch64-a2b3-0
     outputs:
@@ -90,7 +90,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [ed359c497a728f08b5b41456c07a688ccd510fbc, v0.18.0]
+        vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029, v0.18.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -102,7 +102,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [ed359c497a728f08b5b41456c07a688ccd510fbc, v0.18.0]
+        vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029, v0.18.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.

.github/workflows/schedule_codecov_refresh.yaml
Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@ jobs:
     name: refresh codecov
     strategy:
      matrix:
-        vllm_version: [ed359c497a728f08b5b41456c07a688ccd510fbc]
+        vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}

docs/source/community/versioning_policy.md
Lines changed: 1 addition & 1 deletion

@@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

 | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
 |-------------|--------------|------------------|-------------|--------------------|
-| main | ed359c497a728f08b5b41456c07a688ccd510fbc, v0.18.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
+| main | 35141a7eeda941a60ad5a4956670c60fd5a77029, v0.18.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |

 ## Release cadence

vllm_ascend/worker/npu_input_batch.py
Lines changed: 14 additions & 2 deletions

@@ -80,8 +80,20 @@ def __init__(
         # Maps req_index -> tensor of shape (num_prompt_tokens, hidden_size)
         self.req_prompt_embeds: dict[int, torch.Tensor] = {}
         self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32)
-        self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32)
-        self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
+        self.num_tokens_no_spec_cpu_tensor = torch.zeros(
+            (max_num_reqs,),
+            device="cpu",
+            dtype=torch.int32,
+            pin_memory=pin_memory,
+        )
+        self.num_tokens_no_spec = self.num_tokens_no_spec_cpu_tensor.numpy()
+        self.num_prompt_tokens_cpu_tensor = torch.zeros(
+            (max_num_reqs,),
+            device="cpu",
+            dtype=torch.int32,
+            pin_memory=pin_memory,
+        )
+        self.num_prompt_tokens = self.num_prompt_tokens_cpu_tensor.numpy()
         self.num_computed_tokens_cpu_tensor = torch.zeros(
             (max_num_reqs,),
             device="cpu",

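One practical payoff of backing these statistics with torch tensors rather than bare numpy arrays is that a (possibly pinned) CPU tensor can be shipped to the accelerator in a single call. A minimal sketch of that idea, using a hypothetical `copy_stats_to_device` helper that is not part of this diff, and `"cpu"` standing in for the real `"npu"` target:

```python
import torch

def copy_stats_to_device(cpu_tensor: torch.Tensor,
                         device: torch.device) -> torch.Tensor:
    # non_blocking=True can overlap the copy with compute when the source
    # tensor is pinned; with an unpinned source it simply behaves as a
    # synchronous copy, so the call is safe either way.
    return cpu_tensor.to(device, non_blocking=True)

stats = torch.zeros((4,), dtype=torch.int32, device="cpu")
stats[0] = 5
# A real model runner would pass torch.device("npu") here.
out = copy_stats_to_device(stats, torch.device("cpu"))
print(out.tolist())  # [5, 0, 0, 0]
```

The numpy views added in the diff keep per-request bookkeeping cheap on the host, while the `*_cpu_tensor` attributes keep the data in a form the device-transfer path expects.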