
Commit 99ef59c

sarckk and yeqcharlotte authored
[Llama4] Enable attention temperature tuning by default for long context (>32k) (#16439)
Signed-off-by: Ye (Charlotte) Qi <[email protected]>
Co-authored-by: Ye (Charlotte) Qi <[email protected]>
1 parent d544d14 commit 99ef59c

File tree

1 file changed: +6 −2 lines

vllm/model_executor/models/llama4.py

Lines changed: 6 additions & 2 deletions
@@ -467,11 +467,15 @@ class Llama4ForCausalLM(LlamaForCausalLM):
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        # Update temperature tuning config from generation config
+        # update temperature tuning config from generation config
         gen_config = vllm_config.model_config.try_get_generation_config()
         gen_config.update(vllm_config.model_config.override_generation_config)
+        # enable temperature tuning by default when max_model_len > 32K
+        default_attn_temperature_tuning = \
+            vllm_config.model_config.max_model_len > 32768
         vllm_config.model_config.hf_config.attn_temperature_tuning \
-            = gen_config.get("attn_temperature_tuning", False)
+            = gen_config.get(
+                "attn_temperature_tuning", default_attn_temperature_tuning)
 
         super().__init__(vllm_config=vllm_config,
                          prefix=prefix,
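In effect, the change turns the hard-coded `False` default into a length-dependent one, while an explicit `attn_temperature_tuning` entry in the generation config still wins. The standalone sketch below illustrates that resolution order; the plain dict and the function name are illustrative stand-ins, not vLLM's actual config objects or API:

    # Minimal sketch of the new default resolution (hypothetical names;
    # a plain dict stands in for vLLM's generation config).
    LONG_CONTEXT_THRESHOLD = 32768  # from the diff: tuning defaults on above this

    def resolve_attn_temperature_tuning(max_model_len: int,
                                        gen_config: dict) -> bool:
        # Before this commit the fallback was always False; now long-context
        # models default to True unless the generation config overrides it.
        default = max_model_len > LONG_CONTEXT_THRESHOLD
        return gen_config.get("attn_temperature_tuning", default)

    # Usage: short context stays off, long context turns on, overrides win.
    assert resolve_attn_temperature_tuning(8192, {}) is False
    assert resolve_attn_temperature_tuning(131072, {}) is True
    assert resolve_attn_temperature_tuning(
        131072, {"attn_temperature_tuning": False}) is False

Consulting the generation config first preserves any explicit user setting; the `max_model_len` check only supplies the fallback default.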
