
Commit 61866b8

[Quickfix] update CachedRequestState as NewRequestData changed (#2367)
### What this PR does / why we need it?
1. Update `CachedRequestState` to match the `NewRequestData` changes made in vllm-project/vllm#22570.
2. Drop maintenance of vLLM v0.10.0 on the main branch.

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with the existing tests.

- vLLM version: v0.10.0
- vLLM main: vllm-project/vllm@92ff41a

---------

Signed-off-by: MengqingCao <[email protected]>
1 parent 2ad7e12 commit 61866b8

18 files changed · +77 −285 lines

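The heart of the interface change is a field rename: what the old `NewRequestData`/`CachedRequestState` exposed as `multi_modal_inputs`/`mm_inputs` is now `multi_modal_kwargs`/`mm_kwargs`, and the diffs below update every call site accordingly. A minimal, self-contained sketch of the caller-side change (the class here is a stand-in defined only for illustration, not the real `CachedRequestState`):

```python
from dataclasses import dataclass, field
from typing import Any, List


@dataclass
class MockCachedRequestState:
    """Stand-in for vllm-ascend's CachedRequestState (the real class has more fields)."""
    req_id: str
    prompt_token_ids: List[int]
    # Renamed from `mm_inputs` to `mm_kwargs`, mirroring NewRequestData after
    # vllm-project/vllm#22570.
    mm_kwargs: List[Any] = field(default_factory=list)
    mm_positions: List[Any] = field(default_factory=list)


# Call sites now pass mm_kwargs / multi_modal_kwargs instead of the old names.
state = MockCachedRequestState(req_id="1", prompt_token_ids=[1, 2, 3], mm_kwargs=[])
print(state.mm_kwargs)  # []
```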
.github/workflows/vllm_ascend_test.yaml

Lines changed: 3 additions & 9 deletions

@@ -81,7 +81,7 @@ jobs:
       VLLM_USE_MODELSCOPE: True
     strategy:
       matrix:
-        vllm_version: [main, v0.10.0]
+        vllm_version: [main]
     steps:
       - name: Install packages
         run: |
@@ -137,7 +137,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-aarch64-a2-1]
-        vllm_version: [main, v0.10.0]
+        vllm_version: [main]
     name: singlecard e2e test
     runs-on: ${{ matrix.os }}
     container:
@@ -185,9 +185,6 @@ jobs:
         run: |
           pip install -r requirements-dev.txt
           pip install -v -e .
-          if [[ "${{ matrix.vllm_version }}" == "v0.10.0" ]]; then
-            pip install "transformers<4.54.0"
-          fi

       - name: Run e2e test
         env:
@@ -222,7 +219,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-aarch64-a2-2]
-        vllm_version: [main, v0.10.0]
+        vllm_version: [main]
     name: multicard e2e test
     runs-on: ${{ matrix.os }}
     container:
@@ -270,9 +267,6 @@ jobs:
         run: |
           pip install -r requirements-dev.txt
           pip install -v -e .
-          if [[ "${{ matrix.vllm_version }}" == "v0.10.0" ]]; then
-            pip install "transformers<4.54.0"
-          fi

       - name: Run vllm-project/vllm-ascend test
         env:

.github/workflows/vllm_ascend_test_310p.yaml

Lines changed: 1 addition & 1 deletion

@@ -53,7 +53,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-aarch64-310p-1, linux-aarch64-310p-4]
-        vllm_version: [main, v0.10.0]
+        vllm_version: [main]
     name: 310p e2e test
     runs-on: ${{ matrix.os }}
     container:

tests/ut/core/test_scheduler.py

Lines changed: 1 addition & 1 deletion

@@ -50,7 +50,7 @@ def create_requests(
         request_id=f"{i}",
         prompt_token_ids=[i] * num_tokens,
         sampling_params=sampling_params,
-        multi_modal_inputs=mm_inputs,
+        multi_modal_kwargs=mm_inputs,
         multi_modal_placeholders=mm_position,
         multi_modal_hashes=None,
         eos_token_id=EOS_TOKEN_ID,

tests/ut/kv_connector/test_remote_decode_lifecycle.py

Lines changed: 8 additions & 15 deletions

@@ -25,7 +25,6 @@
                    create_model_runner_output,
                    create_request, create_scheduler,
                    create_vllm_config)
-from vllm_ascend.utils import vllm_version_is


 def test_basic_lifecycle():
@@ -103,13 +102,10 @@ def test_basic_lifecycle():

     # (3b): execute_model()
     model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
-    if vllm_version_is("0.10.0"):
-        model_runner_output.finished_sending = [request_id]
-    else:
-        from vllm.v1.worker.kv_connector_model_runner_mixin import \
-            KVConnectorOutput  # type: ignore # noqa
-        model_runner_output.kv_connector_output = KVConnectorOutput(
-            finished_sending=[request_id])
+    from vllm.v1.worker.kv_connector_model_runner_mixin import \
+        KVConnectorOutput  # type: ignore # noqa
+    model_runner_output.kv_connector_output = KVConnectorOutput(
+        finished_sending=[request_id])

     # (3c): update_from_output()
     scheduler.update_from_output(scheduler_output, model_runner_output)
@@ -164,13 +160,10 @@ def test_prefix_cache_lifecycle():
     scheduler_output = scheduler.schedule()
     scheduler.schedule()
     model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
-    if vllm_version_is("0.10.0"):
-        model_runner_output.finished_sending = [request_remote.request_id]
-    else:
-        from vllm.v1.worker.kv_connector_model_runner_mixin import \
-            KVConnectorOutput  # noqa
-        model_runner_output.kv_connector_output = KVConnectorOutput(
-            finished_sending=[request_remote.request_id])
+    from vllm.v1.worker.kv_connector_model_runner_mixin import \
+        KVConnectorOutput  # noqa
+    model_runner_output.kv_connector_output = KVConnectorOutput(
+        finished_sending=[request_remote.request_id])
     scheduler.update_from_output(scheduler_output, model_runner_output)
     _ = scheduler.schedule()
     assert_scheduler_empty(scheduler)

tests/ut/kv_connector/test_remote_prefill_lifecycle.py

Lines changed: 8 additions & 15 deletions

@@ -25,7 +25,6 @@
                    create_model_runner_output,
                    create_request, create_scheduler,
                    create_vllm_config)
-from vllm_ascend.utils import vllm_version_is


 def test_basic_lifecycle():
@@ -91,13 +90,10 @@ def test_basic_lifecycle():

     # (2b): forward(): request finishes recv.
     model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
-    if vllm_version_is("0.10.0"):
-        model_runner_output.finished_recving = [request_id]
-    else:
-        from vllm.v1.worker.kv_connector_model_runner_mixin import \
-            KVConnectorOutput  # type: ignore # noqa
-        model_runner_output.kv_connector_output = KVConnectorOutput(
-            finished_recving=[request_id])
+    from vllm.v1.worker.kv_connector_model_runner_mixin import \
+        KVConnectorOutput  # type: ignore # noqa
+    model_runner_output.kv_connector_output = KVConnectorOutput(
+        finished_recving=[request_id])

     # (2c): update_from_output():
     engine_core_outputs = scheduler.update_from_output(scheduler_output,
@@ -211,13 +207,10 @@ def test_full_block_prompt():
     # # STEP (2): Recv.
     scheduler_output = scheduler.schedule()
     model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
-    if vllm_version_is("0.10.0"):
-        model_runner_output.finished_recving = [request_id]
-    else:
-        from vllm.v1.worker.kv_connector_model_runner_mixin import \
-            KVConnectorOutput  # type: ignore # noqa
-        model_runner_output.kv_connector_output = KVConnectorOutput(
-            finished_recving=[request_id])
+    from vllm.v1.worker.kv_connector_model_runner_mixin import \
+        KVConnectorOutput  # type: ignore # noqa
+    model_runner_output.kv_connector_output = KVConnectorOutput(
+        finished_recving=[request_id])
     scheduler.update_from_output(scheduler_output, model_runner_output)
     assert len(scheduler.waiting) == 1
     assert (request_id in scheduler.finished_recving_kv_req_ids)

tests/ut/kv_connector/utils.py

Lines changed: 6 additions & 14 deletions

@@ -157,7 +157,7 @@ def create_request(
         request_id=f"id-{request_id}",
         prompt_token_ids=prompt_token_ids,
         sampling_params=sampling_params,
-        multi_modal_inputs=None,
+        multi_modal_kwargs=None,
         multi_modal_placeholders=None,
         multi_modal_hashes=None,
         **({
@@ -187,19 +187,11 @@ def create_model_runner_output(

     # Make output data structure.
     extra_args = {}
-    if not vllm_version_is("0.10.0"):
-        from vllm.v1.worker.kv_connector_model_runner_mixin import \
-            KVConnectorOutput  # type: ignore # noqa
-        kv_connector_output = KVConnectorOutput(
-            finished_sending=finished_sending,
-            finished_recving=finished_recving)
-        extra_args = {"kv_connector_output": kv_connector_output}
-    else:
-        extra_args = {
-            "finished_sending": finished_sending,
-            "finished_recving": finished_recving,
-        }
-
+    from vllm.v1.worker.kv_connector_model_runner_mixin import \
+        KVConnectorOutput  # type: ignore # noqa
+    kv_connector_output = KVConnectorOutput(finished_sending=finished_sending,
+                                            finished_recving=finished_recving)
+    extra_args = {"kv_connector_output": kv_connector_output}
     return ModelRunnerOutput(
         req_ids=req_ids,
         req_id_to_index=req_id_to_index,

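With v0.10.0 support dropped, the version branch around finished KV transfers goes away: the test helpers now always report them through a `KVConnectorOutput` attached to `ModelRunnerOutput.kv_connector_output`, instead of the old `finished_sending`/`finished_recving` attributes. A rough sketch of the new-style pattern used throughout these tests (the `KVConnectorOutput` import path is the one appearing in this diff; `EMPTY_MODEL_RUNNER_OUTPUT` is assumed to live in `vllm.v1.outputs`):

```python
import copy

from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT  # assumed location of this constant
from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorOutput

model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)

# Old (v0.10.0): model_runner_output.finished_sending = ["req-0"]
# New (main): finished transfer ids are wrapped in a KVConnectorOutput.
model_runner_output.kv_connector_output = KVConnectorOutput(
    finished_sending=["req-0"])
```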
tests/ut/worker/test_input_batch.py

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ def mock_cached_request_state(req_id="1", prompt=[1, 2, 3], output=[4, 5, 6]):
     return CachedRequestState(
         req_id=req_id,
         prompt_token_ids=prompt,
-        mm_inputs=[],
+        mm_kwargs=[],
         mm_positions=[],
         sampling_params=SamplingParams(),
         pooling_params=None,

vllm_ascend/models/qwen2_5_vl.py

Lines changed: 3 additions & 11 deletions

@@ -30,8 +30,7 @@
 from vllm.config import VllmConfig
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
-from vllm.model_executor.layers.activation import (_ACTIVATION_REGISTRY,
-                                                   get_act_and_mul_fn)
+from vllm.model_executor.layers.activation import get_act_and_mul_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -43,8 +42,6 @@
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.multimodal import MULTIMODAL_REGISTRY

-from vllm_ascend.utils import vllm_version_is
-
 MIN_PAD_SIZE = 64  # min_size to pad weight
 MAX_PAD_SIZE = 128  # max_size to pad weight

@@ -202,8 +199,6 @@ def __init__(
         )

         act_fn = get_act_and_mul_fn(vision_config.hidden_act)
-        if vllm_version_is("0.10.0"):
-            act_fn = _ACTIVATION_REGISTRY[vision_config.hidden_act]
         self.blocks = nn.ModuleList([
             AscendQwen2_5_VisionBlock(
                 dim=self.hidden_size,
@@ -303,12 +298,9 @@ def load_weights(self, weights: Iterable[Tuple[str,
             ("qkv_proj", "q_proj", "q"),
             ("qkv_proj", "k_proj", "k"),
             ("qkv_proj", "v_proj", "v"),
+            ("mlp.gate_up_proj.", "mlp.gate_proj.", 0),
+            ("mlp.gate_up_proj.", "mlp.up_proj.", 1),
         ]
-        if not vllm_version_is("0.10.0"):
-            stacked_params_mapping.extend([
-                ("mlp.gate_up_proj.", "mlp.gate_proj.", 0),
-                ("mlp.gate_up_proj.", "mlp.up_proj.", 1),
-            ])
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: Set[str] = set()
         for name, loaded_weight in weights:

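Besides dropping the `vllm_version_is("0.10.0")` branches, this file tracks an upstream Qwen2.5-VL change: the vision block now takes a fused act-and-mul function from `get_act_and_mul_fn` rather than a plain activation from `_ACTIVATION_REGISTRY`, and the weight loader maps the checkpoint's separate `gate_proj`/`up_proj` weights onto a merged `gate_up_proj` parameter. That reading is an interpretation of the diff, not something stated in the commit message; a small sketch of the two pieces:

```python
from vllm.model_executor.layers.activation import get_act_and_mul_fn

# Fused activation used by the vision MLP on vLLM main ("silu" is the usual
# hidden_act for Qwen2.5-VL; in the model code this is vision_config.hidden_act).
act_fn = get_act_and_mul_fn("silu")

# Weight-loading side of the same change: the two extra stacked-params entries
# fold gate_proj/up_proj checkpoint weights into the merged gate_up_proj.
stacked_params_mapping = [
    # (param_name, shard_name, shard_id)
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
    ("mlp.gate_up_proj.", "mlp.gate_proj.", 0),
    ("mlp.gate_up_proj.", "mlp.up_proj.", 1),
]
```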
vllm_ascend/models/qwen2_5_vl_without_padding.py

Lines changed: 1 addition & 5 deletions

@@ -30,8 +30,7 @@
 from vllm.config import VllmConfig
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
-from vllm.model_executor.layers.activation import (_ACTIVATION_REGISTRY,
-                                                   get_act_and_mul_fn)
+from vllm.model_executor.layers.activation import get_act_and_mul_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.qwen2_5_vl import (
@@ -43,7 +42,6 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY

 from vllm_ascend.models.qwen2_5_vl import AscendQwen2_5_VisionRotaryEmbedding
-from vllm_ascend.utils import vllm_version_is


 class AscendQwen2_5_VisionAttention_Without_Padding(Qwen2_5_VisionAttention):
@@ -175,8 +173,6 @@ def __init__(
         )

         act_fn = get_act_and_mul_fn(vision_config.hidden_act)
-        if vllm_version_is("0.10.0"):
-            act_fn = _ACTIVATION_REGISTRY[vision_config.hidden_act]
         self.blocks = nn.ModuleList([
             AscendQwen2_5_VisionBlock_Without_Padding(
                 dim=self.hidden_size,

vllm_ascend/multistream/ms_split.py

Lines changed: 1 addition & 1 deletion

@@ -105,7 +105,7 @@ def model_input_split_v1_mla_attn(
     [block_table_pre,
      block_table_post] = split_attn_tensor_type(attn_metadata.block_tables,
                                                 seq_index)
-
+    assert attn_metadata.attn_mask is not None
     if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache or attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit:
         # the attn_mla kernel in torch npu only accept 128*128 attn mask
         attn_mask_pre = attn_mask_post = attn_metadata.attn_mask
