42 | 42 | from fastdeploy.model_executor.layers.sample.sampler import Sampler, SpeculativeSampler
43 | 43 | from fastdeploy.model_executor.model_loader import get_model_loader
44 | 44 | from fastdeploy.platforms import current_platform
   | 45 | + from fastdeploy.utils import ceil_div
45 | 46 |
46 | 47 | if current_platform.is_iluvatar():
47 | 48 |     from fastdeploy.model_executor.ops.iluvatar import set_value_by_flags_and_idx
@@ -588,17 +589,16 @@ def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decod
588 | 589 |     """Set dummy prefill inputs to share_inputs"""
589 | 590 |     # NOTE(gongshaotian): The maximum decoding length is equal to the expected decoded tokens plus the eos token
590 | 591 |     max_dec_len = expected_decode_len + 1
591 |     | -   full_length = min(
592 |     | -       num_tokens // batch_size,
    | 592 | +   input_length = min(
    | 593 | +       ceil_div(num_tokens, batch_size),
593 | 594 |         self.parallel_config.max_model_len - max_dec_len,
594 | 595 |     )
595 | 596 |
596 | 597 |     # NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer size will not be enough, causing the result to appear as NaN.
597 | 598 |     # TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP.
598 | 599 |     if self.fd_config.parallel_config.enable_expert_parallel:
599 |     | -       full_length = min(full_length, 32)
    | 600 | +       input_length = min(input_length, 32)
600 | 601 |
601 |     | -   input_length = int(full_length * self.cache_config.kv_cache_ratio)
602 | 602 |     block_num = (
603 | 603 |         input_length + self.cache_config.block_size - 1
604 | 604 |     ) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
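For readers skimming the hunk above, the sketch below walks through the resulting arithmetic. `ceil_div` is imported from `fastdeploy.utils` but its body is not part of this diff, so the usual `(a + b - 1) // b` ceiling division is assumed; every numeric config value in the snippet is an illustrative placeholder, not a FastDeploy default.

```python
# Minimal, self-contained sketch of the dummy prefill sizing after this change.
# Assumptions: ceil_div is plain ceiling division; config values are illustrative.

def ceil_div(a: int, b: int) -> int:
    """Ceiling integer division, as this diff assumes fastdeploy.utils.ceil_div behaves."""
    return (a + b - 1) // b

num_tokens = 8193            # total dummy prefill tokens to spread over the batch
batch_size = 4
max_model_len = 8192         # stand-in for parallel_config.max_model_len
expected_decode_len = 31
block_size = 64              # stand-in for cache_config.block_size
enc_dec_block_num = 2        # stand-in for cache_config.enc_dec_block_num
enable_expert_parallel = False

max_dec_len = expected_decode_len + 1                 # expected decoded tokens + eos token
input_length = min(
    ceil_div(num_tokens, batch_size),                 # 2049; old floor division gave 2048
    max_model_len - max_dec_len,                      # leave room for decoding
)
if enable_expert_parallel:
    input_length = min(input_length, 32)              # cap so DeepEP's buffer is not exceeded

# Blocks needed to hold the prefill tokens (ceiling division), plus the reserved
# encoder/decoder blocks.
block_num = (input_length + block_size - 1) // block_size + enc_dec_block_num

print(input_length, block_num)                        # 2049 35
```

Compared with the removed lines, rounding up keeps the leftover token (2049 rather than 2048 in this example), and dropping the `int(full_length * kv_cache_ratio)` scaling means `block_num` is now sized for the full per-request input length.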
|