
Commit 04a74a2

Authored by SulRash and NathanHB

Added support for quantization in vLLM backend (#690)

* Added support for quantization in vllm backend
* Fixed style issues

Co-authored-by: Nathan Habib <[email protected]>

1 parent f7392fa · commit 04a74a2

1 file changed: +8 −0 lines


src/lighteval/models/vllm/vllm_model.py

Lines changed: 8 additions & 0 deletions
@@ -81,6 +81,8 @@ class VLLMModelConfig(ModelConfig):
     pipeline_parallel_size: PositiveInt = 1  # how many GPUs to use for pipeline parallelism
     gpu_memory_utilization: NonNegativeFloat = 0.9  # lower this if you are running out of memory
     max_model_length: PositiveInt | None = None  # maximum length of the model, usually inferred automatically. Reduce this if you encounter OOM issues; 4096 is usually enough
+    quantization: str | None = None
+    load_format: str | None = None
     swap_space: PositiveInt = 4  # CPU swap space size (GiB) per GPU.
     seed: NonNegativeInt = 1234
     trust_remote_code: bool = False
@@ -176,6 +178,12 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]:
             "max_num_seqs": int(config.max_num_seqs),
             "max_num_batched_tokens": int(config.max_num_batched_tokens),
         }
+
+        if config.quantization is not None:
+            self.model_args["quantization"] = config.quantization
+        if config.load_format is not None:
+            self.model_args["load_format"] = config.load_format
+
         if config.data_parallel_size > 1:
             self.model_args["distributed_executor_backend"] = "ray"
             self._batch_size = "auto"
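For context, the new branch simply forwards the two optional settings into the keyword arguments used to build the vLLM engine, so vLLM's own defaults apply when they are left unset. Below is a minimal sketch (not part of the commit) of how the forwarded options reach vLLM; the checkpoint name and the "awq" method are illustrative placeholders, and any quantization method supported by the installed vLLM build works the same way.

# Sketch: how options like VLLMModelConfig.quantization / load_format end up in vLLM.
# "TheBloke/Llama-2-7B-AWQ" and "awq" are placeholders, not values mandated by the commit.
from vllm import LLM

model_args = {
    "model": "TheBloke/Llama-2-7B-AWQ",  # placeholder: a checkpoint with pre-quantized weights
    "gpu_memory_utilization": 0.9,
}

quantization = "awq"   # corresponds to VLLMModelConfig.quantization
load_format = "auto"   # corresponds to VLLMModelConfig.load_format

# Mirror of the new branch in _create_auto_model: only add the kwargs when set,
# so vLLM keeps its default behavior otherwise.
if quantization is not None:
    model_args["quantization"] = quantization
if load_format is not None:
    model_args["load_format"] = load_format

llm = LLM(**model_args)  # vLLM loads the weights using the requested quantization method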
