Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/_e2e_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ on:
continue_on_error:
required: false
type: boolean
default: false
default: true
env:
UV_INDEX_URL: http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
UV_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/bot_pr_create.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
steps:
- name: Get vLLM version
run: |
VLLM_COMMIT=ed359c497a728f08b5b41456c07a688ccd510fbc
VLLM_COMMIT=14acf429ac08b6d538ca6feb3e06b6d13895804d
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"

- name: Checkout repository
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/dockerfiles/Dockerfile.lint
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ RUN apt-get update -y && \

ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# For lint purposes, we actually need to do main-to-main matching here.
ARG VLLM_COMMIT=ed359c497a728f08b5b41456c07a688ccd510fbc
ARG VLLM_COMMIT=14acf429ac08b6d538ca6feb3e06b6d13895804d
RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
cd /vllm-workspace/vllm && \
git checkout $VLLM_COMMIT
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [ed359c497a728f08b5b41456c07a688ccd510fbc, v0.18.0]
vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: ed359c497a728f08b5b41456c07a688ccd510fbc
vllm: 14acf429ac08b6d538ca6feb3e06b6d13895804d
changes:
runs-on: linux-aarch64-a2b3-0
outputs:
Expand Down Expand Up @@ -90,7 +90,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy:
matrix:
vllm_version: [ed359c497a728f08b5b41456c07a688ccd510fbc, v0.18.0]
vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
Expand All @@ -102,7 +102,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [ed359c497a728f08b5b41456c07a688ccd510fbc, v0.18.0]
vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0]
# Note (yikun): If CI resources are limited, we can split this job into two chained jobs.
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/schedule_codecov_refresh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
name: refresh codecov
strategy:
matrix:
vllm_version: [ed359c497a728f08b5b41456c07a688ccd510fbc]
vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
Expand Down
2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------|
| main | ed359c497a728f08b5b41456c07a688ccd510fbc, v0.18.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
| main | 14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |

## Release cadence

Expand Down
2 changes: 1 addition & 1 deletion vllm_ascend/ascend_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def __init__(self, vllm_config: "VllmConfig"):
# When enable_async_exponential is True, AscendSampler behaves differently
# from the vLLM Sampler, which breaks batch_invariant mode.
# Therefore, disable async exponential whenever batch_invariant mode is enabled.
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
from vllm_ascend.batch_invariant import vllm_is_batch_invariant

self.enable_async_exponential = (
bool(additional_config.get("enable_async_exponential", False)) and not vllm_is_batch_invariant()
Expand Down
16 changes: 15 additions & 1 deletion vllm_ascend/batch_invariant.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,28 @@

import torch
import torch_npu
import vllm.envs as envs
from vllm.logger import logger
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
from vllm.triton_utils import HAS_TRITON

# in case recursive call in reduce_sum.
torch_sum = torch.sum


def vllm_is_batch_invariant() -> bool:
    """Check whether batch-invariant mode is enabled.

    Compatibility wrapper for ``vllm_is_batch_invariant``, which was removed
    from upstream vLLM (``vllm.model_executor.layers.batch_invariant``)
    during a recent refactoring.

    Returns:
        bool: True when the ``VLLM_BATCH_INVARIANT`` flag is set, read from
        vLLM's ``envs`` module when available, otherwise from the raw
        environment variable.
    """
    # Prefer the value parsed by vLLM's envs module when this vLLM version
    # still exposes the attribute.
    if hasattr(envs, "VLLM_BATCH_INVARIANT"):
        return bool(envs.VLLM_BATCH_INVARIANT)
    # Fallback for vLLM versions without the envs attribute.
    # Import `os` locally: it is not imported at module level in this file,
    # so the original fallback path raised NameError.
    import os

    return bool(int(os.getenv("VLLM_BATCH_INVARIANT", "0")))


if HAS_TRITON:
from vllm_ascend.ops.triton.batch_invariant.matmul import (
addmm_batch_invariant,
Expand Down
2 changes: 1 addition & 1 deletion vllm_ascend/sample/sampler.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import torch
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler
from vllm.v1.sample.sampler import Sampler

from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.batch_invariant import vllm_is_batch_invariant
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type, global_stream, npu_stream_switch

DEFAULT_LOGPROBS_MODE = "raw_logprobs"
Expand Down
3 changes: 3 additions & 0 deletions vllm_ascend/spec_decode/eagle_proposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ class SpecDecodeBaseProposer(EagleProposer):
def __init__(self, vllm_config: VllmConfig, device: torch.device, pass_hidden_states_to_model: bool, runner=None):
super().__init__(vllm_config, device, runner)

# Assign runner before it's used in the methods below
self.runner = runner

self.use_async_scheduling = self.vllm_config.scheduler_config.async_scheduling
self.pass_hidden_states_to_model = pass_hidden_states_to_model
self.decode_threshold = 1 + self.num_speculative_tokens
Expand Down
2 changes: 1 addition & 1 deletion vllm_ascend/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def enable_custom_op():
Enable lazy init for vllm_ascend_C to avoid early initialization of CANN's RTS component.
Ensure that ASCEND_RT_VISIBLE_DEVICES can be dynamically modified before torch.npu.set_device().
"""
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
from vllm_ascend.batch_invariant import vllm_is_batch_invariant

global _CUSTOM_OP_ENABLED

Expand Down
19 changes: 14 additions & 5 deletions vllm_ascend/worker/model_runner_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,15 @@ class ExecuteModelState(NamedTuple):


class NPUModelRunner(GPUModelRunner):
@staticmethod
def _get_device_tensor(buf):
    """Return the device-resident tensor backing *buf*.

    Compatibility shim: older vLLM versions wrap the device tensor in a
    CpuGpuBuffer that exposes a ``.gpu`` attribute, while newer versions
    pass the tensor directly. In either case the device tensor is returned
    unchanged.
    """
    # getattr with a default collapses the hasattr/ternary into one lookup.
    return getattr(buf, "gpu", buf)

def __init__(self, vllm_config: VllmConfig, device: torch.device):
# TODO(qcs): These manual pad and unpad for GPUModelRunner are
# used to expand some buffers, which need to be reverted after
Expand Down Expand Up @@ -2426,17 +2435,17 @@ def _dummy_run(
assert num_tokens_padded <= self.max_num_tokens
if self.is_multimodal_model and not self.model_config.is_encoder_decoder or self.enable_prompt_embeds:
input_ids = None
inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded]
inputs_embeds = self._get_device_tensor(self.inputs_embeds)[:num_tokens_padded]
else:
input_ids = self.input_ids.gpu[:num_tokens_padded]
input_ids = self._get_device_tensor(self.input_ids)[:num_tokens_padded]
inputs_embeds = None

if self.uses_mrope:
positions = self.mrope_positions.gpu[:, :num_tokens_padded]
positions = self._get_device_tensor(self.mrope_positions)[:, :num_tokens_padded]
elif self.uses_xdrope_dim > 0:
positions = self.xdrope_positions.gpu[:, :num_tokens_padded]
positions = self._get_device_tensor(self.xdrope_positions)[:, :num_tokens_padded]
else:
positions = self.positions.gpu[:num_tokens_padded]
positions = self._get_device_tensor(self.positions)[:num_tokens_padded]

# update global cos, sin
update_cos_sin(positions)
Expand Down
16 changes: 14 additions & 2 deletions vllm_ascend/worker/npu_input_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,20 @@ def __init__(
# Maps req_index -> tensor of shape (num_prompt_tokens, hidden_size)
self.req_prompt_embeds: dict[int, torch.Tensor] = {}
self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32)
self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32)
self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
self.num_tokens_no_spec_cpu_tensor = torch.zeros(
(max_num_reqs,),
device="cpu",
dtype=torch.int32,
pin_memory=pin_memory,
)
self.num_tokens_no_spec = self.num_tokens_no_spec_cpu_tensor.numpy()
self.num_prompt_tokens_cpu_tensor = torch.zeros(
(max_num_reqs,),
device="cpu",
dtype=torch.int32,
pin_memory=pin_memory,
)
self.num_prompt_tokens = self.num_prompt_tokens_cpu_tensor.numpy()
self.num_computed_tokens_cpu_tensor = torch.zeros(
(max_num_reqs,),
device="cpu",
Expand Down
Loading