4 changes: 4 additions & 0 deletions src/lighteval/models/vllm/vllm_model.py
@@ -105,6 +105,8 @@ class VLLMModelConfig(ModelConfig):
max_num_batched_tokens: PositiveInt = 2048 # maximum number of tokens per batch
subfolder: str | None = None
is_async: bool = False # Whether to use the async version or sync version of the model
use_dual_chunk_attention: bool = False
Member Author: What version of vllm are you using for this? I get TypeError: EngineArgs.__init__() got an unexpected keyword argument 'use_dual_chunk_attention' with vllm == 0.8.5.post1.

Member: I was on 0.9.1, I think.

Member: (changed my env to the same as yours now)

enforce_eager: bool = False


class VLLMModel(LightevalModel):
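Given the TypeError reported above, one defensive option is to forward the new flag only when the installed vllm's EngineArgs actually accepts it, so older releases such as 0.8.5.post1 keep working. This is a minimal sketch, not what the PR does; the helper name is made up, and importing EngineArgs from the vllm top level is an assumption about recent vllm releases.

import inspect

from vllm import EngineArgs  # assumption: exposed at the top level in recent vllm releases


def build_optional_engine_kwargs(config) -> dict:
    # `config` is assumed to be a VLLMModelConfig as defined in the diff above.
    kwargs = {"enforce_eager": config.enforce_eager}
    # Only forward use_dual_chunk_attention when the installed EngineArgs accepts it,
    # so vllm versions that reject the kwarg with a TypeError still work.
    if "use_dual_chunk_attention" in inspect.signature(EngineArgs).parameters:
        kwargs["use_dual_chunk_attention"] = config.use_dual_chunk_attention
    return kwargs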
@@ -187,6 +189,8 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]:
"seed": int(config.seed),
"max_num_seqs": int(config.max_num_seqs),
"max_num_batched_tokens": int(config.max_num_batched_tokens),
"enforce_eager": config.enforce_eager,
"use_dual_chunk_attention": config.use_dual_chunk_attention,
}

if config.quantization is not None:
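For completeness, a hedged sketch of how a user might enable the two new flags through the config. The model_name field name and the model string are assumptions for illustration; other VLLMModelConfig fields keep their defaults.

from lighteval.models.vllm.vllm_model import VLLMModelConfig

config = VLLMModelConfig(
    model_name="Qwen/Qwen2.5-7B-Instruct",  # hypothetical model choice
    enforce_eager=True,  # skip CUDA graph capture (always run in eager mode)
    use_dual_chunk_attention=True,  # requires a vllm release whose EngineArgs accepts this kwarg
)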