  16    16    from vllm.forward_context import set_forward_context
  17    17    from vllm.inputs import INPUT_REGISTRY, InputRegistry
  18    18    from vllm.logger import init_logger
        19  + from vllm.lora.request import LoRARequest
  19    20    from vllm.model_executor import SamplingMetadata
  20    21    from vllm.model_executor.layers.sampler import SamplerOutput
  21    22    from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs,

  34    35    from vllm.worker.utils import assert_enc_dec_mr_supported_scenario
  35    36
  36    37    logger = init_logger(__name__)
        38  + LORA_WARMUP_RANK = 8
  37    39
  38    40
  39    41    @dataclasses.dataclass(frozen=True)
@@ -160,7 +162,11 @@ def execute_model(
 160   162            if num_steps > 1:
 161   163                raise ValueError("num_steps > 1 is not supported in "
 162   164                                 "EncoderDecoderModelRunner")
 163         -
       165  +         if self.lora_config:
       166  +             assert model_input.lora_requests is not None
       167  +             assert model_input.lora_mapping is not None
       168  +             self.set_active_loras(model_input.lora_requests,
       169  +                                   model_input.lora_mapping)
 164   170            if (model_input.attn_metadata is not None
 165   171                    and model_input.attn_metadata.prefill_metadata is None
 166   172                    and model_input.attn_metadata.decode_metadata.use_cuda_graph):
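For context on the pair consumed here: `set_active_loras` receives the batch's set of `LoRARequest`s together with a `LoRAMapping` that routes every token and every prompt to an adapter slot. The sketch below only illustrates that pairing for a small prefill batch; the `LoRAMapping` import path and constructor fields are assumptions carried over from the GPU model runner, not part of this diff.

    # Illustrative only -- assumes LoRAMapping lives in vllm.lora.layers and
    # takes per-token / per-prompt index tuples, as in the GPU model runner.
    from vllm.lora.layers import LoRAMapping
    from vllm.lora.request import LoRARequest

    chat_adapter = LoRARequest("chat-adapter", 1, "/path/to/adapter")  # id must be > 0

    # Two prefill sequences of 3 and 2 tokens, both routed to adapter id 1:
    # index_mapping has one entry per token, prompt_mapping one per sequence.
    mapping = LoRAMapping(
        index_mapping=(1, 1, 1, 1, 1),
        prompt_mapping=(1, 1),
    )

    # The runner would then activate the adapters before the forward pass:
    # self.set_active_loras({chat_adapter}, mapping)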
@@ -268,6 +274,22 @@ def profile_run(self) -> None:
 268   274            max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
 269   275            max_num_seqs = self.scheduler_config.max_num_seqs
 270   276
       277  +         # This represents the maximum number of different requests
       278  +         # that will have unique loras, and therefore the max amount of
       279  +         # memory consumption. Create dummy lora request copies from the
       280  +         # lora request passed in, which contains a lora from the lora
       281  +         # warmup path.
       282  +         dummy_lora_requests: List[LoRARequest] = []
       283  +         dummy_lora_requests_per_seq: List[LoRARequest] = []
       284  +         if self.lora_config:
       285  +             dummy_lora_requests = self._add_dummy_loras(
       286  +                 self.lora_config.max_loras)
       287  +             assert len(dummy_lora_requests) == self.lora_config.max_loras
       288  +             dummy_lora_requests_per_seq = [
       289  +                 dummy_lora_requests[idx % len(dummy_lora_requests)]
       290  +                 for idx in range(max_num_seqs)
       291  +             ]
       292  +
 271   293            # Profile memory usage with max_num_sequences sequences and the total
 272   294            # number of tokens equal to max_num_batched_tokens.
 273   295            seqs: List[SequenceGroupMetadata] = []
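`_add_dummy_loras` itself is not shown in this hunk. Based on how the other runners build warmup adapters (one request per LoRA slot, with weights allocated at `LORA_WARMUP_RANK`), it presumably returns something along the lines of the sketch below; the names, ids, and path handling here are illustrative assumptions, not the actual helper.

    # Hypothetical sketch of what the helper above might return -- the real
    # implementation lives elsewhere in this runner and may differ.
    from typing import List
    from vllm.lora.request import LoRARequest

    def _add_dummy_loras_sketch(num_loras: int, warmup_path: str) -> List[LoRARequest]:
        """One dummy adapter request per LoRA slot, for memory profiling only."""
        return [
            LoRARequest(
                lora_name=f"warmup_{lora_id}",
                lora_int_id=lora_id,       # ids must be unique and positive
                lora_path=warmup_path,     # adapter produced by the warmup path
            )
            for lora_id in range(1, num_loras + 1)
        ]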
@@ -315,6 +337,8 @@ def profile_run(self) -> None:
 315   337                    block_tables=None,
 316   338                    encoder_seq_data=encoder_dummy_data.seq_data,
 317   339                    cross_block_table=None,
       340  +                 lora_request=dummy_lora_requests_per_seq[group_id]
       341  +                 if dummy_lora_requests_per_seq else None,
 318   342                    multi_modal_data=decoder_dummy_data.multi_modal_data
 319   343                    or encoder_dummy_data.multi_modal_data,
 320   344                    multi_modal_placeholders=decoder_dummy_data.
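As a quick sanity check of the assignment above: because the per-sequence list is built with `idx % len(dummy_lora_requests)`, the dummy sequences cycle through all warmup adapters, so (whenever max_num_seqs >= max_loras) every LoRA slot contributes to the profiled memory. A tiny standalone example with made-up sizes:

    # Standalone illustration of the round-robin distribution (values made up).
    max_loras, max_num_seqs = 2, 5
    dummy_lora_ids = list(range(1, max_loras + 1))
    per_seq = [dummy_lora_ids[idx % len(dummy_lora_ids)] for idx in range(max_num_seqs)]
    print(per_seq)  # [1, 2, 1, 2, 1] -- both adapter slots are exercised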