Commit f936657

Provide default max model length (#1224)
1 parent 6f88f76 commit f936657

File tree

  vllm/config.py
  vllm/engine/arg_utils.py
  vllm/engine/llm_engine.py
  vllm/entrypoints/openai/api_server.py

4 files changed: +14 -9 lines

vllm/config.py

Lines changed: 11 additions & 7 deletions
@@ -164,9 +164,6 @@ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
         total_num_attention_heads = self.hf_config.num_attention_heads
         return total_num_attention_heads // parallel_config.tensor_parallel_size
 
-    def get_max_model_len(self) -> int:
-        return self.max_model_len
-
     def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
         total_num_hidden_layers = self.hf_config.num_hidden_layers
         return total_num_hidden_layers // parallel_config.pipeline_parallel_size
@@ -378,10 +375,17 @@ def _get_and_verify_max_len(
         if max_len_key is not None:
             derived_max_model_len = min(derived_max_model_len, max_len_key)
     if derived_max_model_len == float("inf"):
-        raise ValueError(
-            "The model's config.json must contain one of the following keys "
-            "to determine the original maximum length of the model: "
-            f"{possible_keys}")
+        if max_model_len is not None:
+            # If max_model_len is specified, we use it.
+            return max_model_len
+
+        default_max_len = 2048
+        logger.warning(
+            "The model's config.json does not contain any of the following "
+            "keys to determine the original maximum length of the model: "
+            f"{possible_keys}. Assuming the model's maximum length is "
+            f"{default_max_len}.")
+        derived_max_model_len = default_max_len
 
     rope_scaling = getattr(hf_config, "rope_scaling", None)
     if rope_scaling is not None:
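
For readers skimming the hunk above, the new behavior can be summarized by a
small standalone helper. This is an illustrative sketch only, not the actual
vllm function: it isolates the fallback branch and omits the surrounding
derivation loop and the rope_scaling handling.

import logging
from typing import Optional, Sequence

logger = logging.getLogger(__name__)

def resolve_max_model_len(derived_max_model_len: float,
                          max_model_len: Optional[int],
                          possible_keys: Sequence[str]) -> int:
    """Prefer an explicit max_model_len; otherwise fall back to 2048
    when config.json names no usable length key."""
    if derived_max_model_len == float("inf"):
        if max_model_len is not None:
            # A user-specified length overrides the missing config value.
            return max_model_len
        default_max_len = 2048
        logger.warning(
            "config.json does not contain any of the keys %s; assuming "
            "the model's maximum length is %d.",
            possible_keys, default_max_len)
        return default_max_len
    return int(derived_max_model_len)

In other words, a model whose config.json lacks every candidate length key
(such as max_position_embeddings) no longer aborts engine construction with a
ValueError; it starts with a conservative 2048-token window unless the user
overrides it.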

vllm/engine/arg_utils.py

Lines changed: 1 addition & 1 deletion
@@ -184,7 +184,7 @@ def create_engine_configs(
                                          self.worker_use_ray)
         scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                            self.max_num_seqs,
-                                           model_config.get_max_model_len())
+                                           model_config.max_model_len)
         return model_config, cache_config, parallel_config, scheduler_config
 
 
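This change, like the api_server.py change further down, is mechanical
fallout from deleting the getter in config.py. For any downstream caller, the
migration is the same one-line change (schematic; model_config here stands
for a vllm ModelConfig instance obtained elsewhere):

#   before: max_model_len = model_config.get_max_model_len()
#   after:  max_model_len = model_config.max_model_len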

vllm/engine/llm_engine.py

Lines changed: 1 addition & 0 deletions
@@ -77,6 +77,7 @@ def __init__(
             f"revision={model_config.revision}, "
             f"trust_remote_code={model_config.trust_remote_code}, "
             f"dtype={model_config.dtype}, "
+            f"max_seq_len={model_config.max_model_len}, "
             f"download_dir={model_config.download_dir!r}, "
             f"load_format={model_config.load_format}, "
             f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "

vllm/entrypoints/openai/api_server.py

Lines changed: 1 addition & 1 deletion
@@ -615,7 +615,7 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncLLMEngine.from_engine_args(engine_args)
     engine_model_config = asyncio.run(engine.get_model_config())
-    max_model_len = engine_model_config.get_max_model_len()
+    max_model_len = engine_model_config.max_model_len
 
     # A separate tokenizer to map token IDs to strings.
     tokenizer = get_tokenizer(engine_args.tokenizer,
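
The server reads max_model_len so it can validate incoming requests against
the model's context window. A hypothetical sketch of such a check follows;
the function name and message wording are illustrative, not taken from this
diff.

from typing import List, Optional

def check_request_length(prompt_token_ids: List[int], max_tokens: int,
                         max_model_len: int) -> Optional[str]:
    """Return an error message if prompt plus completion would exceed the
    model's maximum context length, else None."""
    requested = len(prompt_token_ids) + max_tokens
    if requested > max_model_len:
        return (f"This model's maximum context length is {max_model_len} "
                f"tokens, but {requested} tokens were requested "
                f"({len(prompt_token_ids)} in the prompt, {max_tokens} in "
                f"the completion).")
    return None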
