12 changes: 3 additions & 9 deletions .github/workflows/vllm_ascend_test.yaml

@@ -81,7 +81,7 @@ jobs:
       VLLM_USE_MODELSCOPE: True
     strategy:
       matrix:
-        vllm_version: [main, v0.10.0]
+        vllm_version: [main]
     steps:
       - name: Install packages
         run: |
@@ -137,7 +137,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-aarch64-a2-1]
-        vllm_version: [main, v0.10.0]
+        vllm_version: [main]
     name: singlecard e2e test
     runs-on: ${{ matrix.os }}
     container:
@@ -185,9 +185,6 @@ jobs:
         run: |
           pip install -r requirements-dev.txt
           pip install -v -e .
-          if [[ "${{ matrix.vllm_version }}" == "v0.10.0" ]]; then
-            pip install "transformers<4.54.0"
-          fi

       - name: Run e2e test
         env:
@@ -222,7 +219,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-aarch64-a2-2]
-        vllm_version: [main, v0.10.0]
+        vllm_version: [main]
     name: multicard e2e test
     runs-on: ${{ matrix.os }}
     container:
@@ -270,9 +267,6 @@ jobs:
         run: |
           pip install -r requirements-dev.txt
           pip install -v -e .
-          if [[ "${{ matrix.vllm_version }}" == "v0.10.0" ]]; then
-            pip install "transformers<4.54.0"
-          fi

       - name: Run vllm-project/vllm-ascend test
         env:
2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test_310p.yaml

@@ -53,7 +53,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-aarch64-310p-1, linux-aarch64-310p-4]
-        vllm_version: [main, v0.10.0]
+        vllm_version: [main]
     name: 310p e2e test
     runs-on: ${{ matrix.os }}
     container:
2 changes: 1 addition & 1 deletion tests/ut/core/test_scheduler.py

@@ -50,7 +50,7 @@ def create_requests(
         request_id=f"{i}",
         prompt_token_ids=[i] * num_tokens,
         sampling_params=sampling_params,
-        multi_modal_inputs=mm_inputs,
+        multi_modal_kwargs=mm_inputs,
         multi_modal_placeholders=mm_position,
         multi_modal_hashes=None,
         eos_token_id=EOS_TOKEN_ID,
23 changes: 8 additions & 15 deletions tests/ut/kv_connector/test_remote_decode_lifecycle.py

@@ -25,7 +25,6 @@
                    create_model_runner_output,
                    create_request, create_scheduler,
                    create_vllm_config)
-from vllm_ascend.utils import vllm_version_is


 def test_basic_lifecycle():
@@ -103,13 +102,10 @@ def test_basic_lifecycle():

     # (3b): execute_model()
     model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
-    if vllm_version_is("0.10.0"):
-        model_runner_output.finished_sending = [request_id]
-    else:
-        from vllm.v1.worker.kv_connector_model_runner_mixin import \
-            KVConnectorOutput  # type: ignore # noqa
-        model_runner_output.kv_connector_output = KVConnectorOutput(
-            finished_sending=[request_id])
+    from vllm.v1.worker.kv_connector_model_runner_mixin import \
+        KVConnectorOutput  # type: ignore # noqa
+    model_runner_output.kv_connector_output = KVConnectorOutput(
+        finished_sending=[request_id])

     # (3c): update_from_output()
     scheduler.update_from_output(scheduler_output, model_runner_output)
@@ -164,13 +160,10 @@ def test_prefix_cache_lifecycle():
     scheduler_output = scheduler.schedule()
     scheduler.schedule()
     model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
-    if vllm_version_is("0.10.0"):
-        model_runner_output.finished_sending = [request_remote.request_id]
-    else:
-        from vllm.v1.worker.kv_connector_model_runner_mixin import \
-            KVConnectorOutput  # noqa
-        model_runner_output.kv_connector_output = KVConnectorOutput(
-            finished_sending=[request_remote.request_id])
+    from vllm.v1.worker.kv_connector_model_runner_mixin import \
+        KVConnectorOutput  # noqa
+    model_runner_output.kv_connector_output = KVConnectorOutput(
+        finished_sending=[request_remote.request_id])
     scheduler.update_from_output(scheduler_output, model_runner_output)
     _ = scheduler.schedule()
     assert_scheduler_empty(scheduler)
23 changes: 8 additions & 15 deletions tests/ut/kv_connector/test_remote_prefill_lifecycle.py

@@ -25,7 +25,6 @@
                    create_model_runner_output,
                    create_request, create_scheduler,
                    create_vllm_config)
-from vllm_ascend.utils import vllm_version_is


 def test_basic_lifecycle():
@@ -91,13 +90,10 @@ def test_basic_lifecycle():

     # (2b): forward(): request finishes recv.
     model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
-    if vllm_version_is("0.10.0"):
-        model_runner_output.finished_recving = [request_id]
-    else:
-        from vllm.v1.worker.kv_connector_model_runner_mixin import \
-            KVConnectorOutput  # type: ignore # noqa
-        model_runner_output.kv_connector_output = KVConnectorOutput(
-            finished_recving=[request_id])
+    from vllm.v1.worker.kv_connector_model_runner_mixin import \
+        KVConnectorOutput  # type: ignore # noqa
+    model_runner_output.kv_connector_output = KVConnectorOutput(
+        finished_recving=[request_id])

     # (2c): update_from_output():
     engine_core_outputs = scheduler.update_from_output(scheduler_output,
@@ -211,13 +207,10 @@ def test_full_block_prompt():
     # # STEP (2): Recv.
     scheduler_output = scheduler.schedule()
     model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
-    if vllm_version_is("0.10.0"):
-        model_runner_output.finished_recving = [request_id]
-    else:
-        from vllm.v1.worker.kv_connector_model_runner_mixin import \
-            KVConnectorOutput  # type: ignore # noqa
-        model_runner_output.kv_connector_output = KVConnectorOutput(
-            finished_recving=[request_id])
+    from vllm.v1.worker.kv_connector_model_runner_mixin import \
+        KVConnectorOutput  # type: ignore # noqa
+    model_runner_output.kv_connector_output = KVConnectorOutput(
+        finished_recving=[request_id])
     scheduler.update_from_output(scheduler_output, model_runner_output)
     assert len(scheduler.waiting) == 1
     assert (request_id in scheduler.finished_recving_kv_req_ids)
20 changes: 6 additions & 14 deletions tests/ut/kv_connector/utils.py

@@ -157,7 +157,7 @@ def create_request(
         request_id=f"id-{request_id}",
         prompt_token_ids=prompt_token_ids,
         sampling_params=sampling_params,
-        multi_modal_inputs=None,
+        multi_modal_kwargs=None,
         multi_modal_placeholders=None,
         multi_modal_hashes=None,
         **({
@@ -187,19 +187,11 @@ def create_model_runner_output(

     # Make output data structure.
     extra_args = {}
-    if not vllm_version_is("0.10.0"):
-        from vllm.v1.worker.kv_connector_model_runner_mixin import \
-            KVConnectorOutput  # type: ignore # noqa
-        kv_connector_output = KVConnectorOutput(
-            finished_sending=finished_sending,
-            finished_recving=finished_recving)
-        extra_args = {"kv_connector_output": kv_connector_output}
-    else:
-        extra_args = {
-            "finished_sending": finished_sending,
-            "finished_recving": finished_recving,
-        }
-
+    from vllm.v1.worker.kv_connector_model_runner_mixin import \
+        KVConnectorOutput  # type: ignore # noqa
+    kv_connector_output = KVConnectorOutput(finished_sending=finished_sending,
+                                            finished_recving=finished_recving)
+    extra_args = {"kv_connector_output": kv_connector_output}
    return ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index=req_id_to_index,
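
For reference, a minimal sketch of the construction pattern these tests now share, assuming the KVConnectorOutput import path from vLLM main shown in the diff above; the helper name and its defaults are illustrative, not part of the PR:

import copy

from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorOutput

def with_kv_transfer_progress(base_output, finished_sending=None,
                              finished_recving=None):
    # Copy a template ModelRunnerOutput and attach KV-transfer progress via
    # the single kv_connector_output field, which replaces the old top-level
    # finished_sending/finished_recving attributes removed in this PR.
    out = copy.deepcopy(base_output)
    out.kv_connector_output = KVConnectorOutput(
        finished_sending=finished_sending,
        finished_recving=finished_recving)
    return out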
2 changes: 1 addition & 1 deletion tests/ut/worker/test_input_batch.py

@@ -12,7 +12,7 @@ def mock_cached_request_state(req_id="1", prompt=[1, 2, 3], output=[4, 5, 6]):
     return CachedRequestState(
         req_id=req_id,
         prompt_token_ids=prompt,
-        mm_inputs=[],
+        mm_kwargs=[],
         mm_positions=[],
         sampling_params=SamplingParams(),
         pooling_params=None,
14 changes: 3 additions & 11 deletions vllm_ascend/models/qwen2_5_vl.py

@@ -30,8 +30,7 @@
 from vllm.config import VllmConfig
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
-from vllm.model_executor.layers.activation import (_ACTIVATION_REGISTRY,
-                                                   get_act_and_mul_fn)
+from vllm.model_executor.layers.activation import get_act_and_mul_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -43,8 +42,6 @@
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.multimodal import MULTIMODAL_REGISTRY

-from vllm_ascend.utils import vllm_version_is
-
 MIN_PAD_SIZE = 64  # min_size to pad weight
 MAX_PAD_SIZE = 128  # max_size to pad weight

@@ -202,8 +199,6 @@ def __init__(
         )

         act_fn = get_act_and_mul_fn(vision_config.hidden_act)
-        if vllm_version_is("0.10.0"):
-            act_fn = _ACTIVATION_REGISTRY[vision_config.hidden_act]
         self.blocks = nn.ModuleList([
             AscendQwen2_5_VisionBlock(
                 dim=self.hidden_size,
@@ -303,12 +298,9 @@ def load_weights(self, weights: Iterable[Tuple[str,
             ("qkv_proj", "q_proj", "q"),
             ("qkv_proj", "k_proj", "k"),
             ("qkv_proj", "v_proj", "v"),
+            ("mlp.gate_up_proj.", "mlp.gate_proj.", 0),
+            ("mlp.gate_up_proj.", "mlp.up_proj.", 1),
         ]
-        if not vllm_version_is("0.10.0"):
-            stacked_params_mapping.extend([
-                ("mlp.gate_up_proj.", "mlp.gate_proj.", 0),
-                ("mlp.gate_up_proj.", "mlp.up_proj.", 1),
-            ])
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: Set[str] = set()
         for name, loaded_weight in weights:
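
For context, a minimal sketch of how a vLLM-style load_weights loop consumes a stacked-params mapping like the one above (illustrative only; the real method in this file also handles non-stacked and quantized weights):

from typing import Iterable, Tuple

import torch

def load_stacked(params_dict: dict, stacked_params_mapping,
                 weights: Iterable[Tuple[str, torch.Tensor]]):
    # Each (fused_name, ckpt_name, shard_id) entry renames a checkpoint key
    # to its fused parameter and routes the tensor into the selected shard,
    # so the two new mlp entries let separate gate_proj/up_proj checkpoint
    # weights load into the fused gate_up_proj parameter.
    for name, loaded_weight in weights:
        for fused_name, ckpt_name, shard_id in stacked_params_mapping:
            if ckpt_name not in name:
                continue
            param = params_dict[name.replace(ckpt_name, fused_name)]
            # Fused params in vLLM expose a weight_loader that copies the
            # tensor into the shard identified by shard_id.
            param.weight_loader(param, loaded_weight, shard_id)
            break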
6 changes: 1 addition & 5 deletions vllm_ascend/models/qwen2_5_vl_without_padding.py

@@ -30,8 +30,7 @@
 from vllm.config import VllmConfig
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
-from vllm.model_executor.layers.activation import (_ACTIVATION_REGISTRY,
-                                                   get_act_and_mul_fn)
+from vllm.model_executor.layers.activation import get_act_and_mul_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.qwen2_5_vl import (
@@ -43,7 +42,6 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY

 from vllm_ascend.models.qwen2_5_vl import AscendQwen2_5_VisionRotaryEmbedding
-from vllm_ascend.utils import vllm_version_is


 class AscendQwen2_5_VisionAttention_Without_Padding(Qwen2_5_VisionAttention):
@@ -175,8 +173,6 @@ def __init__(
         )

         act_fn = get_act_and_mul_fn(vision_config.hidden_act)
-        if vllm_version_is("0.10.0"):
-            act_fn = _ACTIVATION_REGISTRY[vision_config.hidden_act]
         self.blocks = nn.ModuleList([
             AscendQwen2_5_VisionBlock_Without_Padding(
                 dim=self.hidden_size,
2 changes: 1 addition & 1 deletion vllm_ascend/multistream/ms_split.py

@@ -105,7 +105,7 @@ def model_input_split_v1_mla_attn(
     [block_table_pre,
      block_table_post] = split_attn_tensor_type(attn_metadata.block_tables,
                                                 seq_index)
-
+    assert attn_metadata.attn_mask is not None
     if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache or attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit:
         # the attn_mla kernel in torch npu only accept 128*128 attn mask
         attn_mask_pre = attn_mask_post = attn_metadata.attn_mask
11 changes: 2 additions & 9 deletions vllm_ascend/patch/platform/__init__.py

@@ -14,12 +14,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from vllm_ascend.utils import vllm_version_is
-
-# Import specific patches for different versions
-if vllm_version_is("0.10.0"):
-    from vllm_ascend.patch.platform import patch_0_10_0  # noqa: F401
-    from vllm_ascend.patch.platform import patch_common  # noqa: F401
-else:
-    from vllm_ascend.patch.platform import patch_common  # noqa: F401
-    from vllm_ascend.patch.platform import patch_main  # noqa: F401
+from vllm_ascend.patch.platform import patch_common  # noqa: F401
+from vllm_ascend.patch.platform import patch_main  # noqa: F401

Review thread on the removed `if vllm_version_is("0.10.0"):` branch:

Collaborator: This can be left; we can add it back later.
Author: OK, will add it back in the next PR then.
16 changes: 0 additions & 16 deletions vllm_ascend/patch/platform/patch_0_10_0/__init__.py

This file was deleted.

11 changes: 2 additions & 9 deletions vllm_ascend/patch/worker/__init__.py

@@ -15,12 +15,5 @@
 # limitations under the License.
 #

-from vllm_ascend.utils import vllm_version_is
-
-# Import specific patches for different versions
-if vllm_version_is("0.10.0"):
-    from vllm_ascend.patch.worker import patch_0_10_0  # noqa: F401
-    from vllm_ascend.patch.worker import patch_common  # noqa: F401
-else:
-    from vllm_ascend.patch.worker import patch_common  # noqa: F401
-    from vllm_ascend.patch.worker import patch_main  # noqa: F401
+from vllm_ascend.patch.worker import patch_common  # noqa: F401
+from vllm_ascend.patch.worker import patch_main  # noqa: F401
18 changes: 0 additions & 18 deletions vllm_ascend/patch/worker/patch_0_10_0/__init__.py

This file was deleted.
