
Commit 04a74a2

Authored by SulRash and NathanHB

Added support for quantization in vLLM backend (#690)

* Added support for quantization in vllm backend
* Fixed style issues

Co-authored-by: Nathan Habib <[email protected]>

1 parent f7392fa · commit 04a74a2

1 file changed: +8 −0 lines


src/lighteval/models/vllm/vllm_model.py

Lines changed: 8 additions & 0 deletions
@@ -81,6 +81,8 @@ class VLLMModelConfig(ModelConfig):
     pipeline_parallel_size: PositiveInt = 1  # how many GPUs to use for pipeline parallelism
     gpu_memory_utilization: NonNegativeFloat = 0.9  # lower this if you are running out of memory
     max_model_length: PositiveInt | None = None  # maximum length of the model, usually inferred automatically. Reduce this if you encounter OOM issues; 4096 is usually enough
+    quantization: str | None = None
+    load_format: str | None = None
     swap_space: PositiveInt = 4  # CPU swap space size (GiB) per GPU.
     seed: NonNegativeInt = 1234
     trust_remote_code: bool = False
@@ -176,6 +178,12 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]:
             "max_num_seqs": int(config.max_num_seqs),
             "max_num_batched_tokens": int(config.max_num_batched_tokens),
         }
+
+        if config.quantization is not None:
+            self.model_args["quantization"] = config.quantization
+        if config.load_format is not None:
+            self.model_args["load_format"] = config.load_format
+
         if config.data_parallel_size > 1:
             self.model_args["distributed_executor_backend"] = "ray"
             self._batch_size = "auto"
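For context, the new branch simply forwards the two optional settings into the keyword arguments used to build the vLLM engine, so vLLM's own defaults apply when they are left unset. Below is a minimal sketch (not part of the commit) of how the forwarded options reach vLLM; the checkpoint name and the "awq" method are illustrative placeholders, and any quantization method supported by the installed vLLM build works the same way.

# Sketch: how options like VLLMModelConfig.quantization / load_format end up in vLLM.
# "TheBloke/Llama-2-7B-AWQ" and "awq" are placeholders, not values mandated by the commit.
from vllm import LLM

model_args = {
    "model": "TheBloke/Llama-2-7B-AWQ",  # placeholder: a checkpoint with pre-quantized weights
    "gpu_memory_utilization": 0.9,
}

quantization = "awq"   # corresponds to VLLMModelConfig.quantization
load_format = "auto"   # corresponds to VLLMModelConfig.load_format

# Mirror of the new branch in _create_auto_model: only add the kwargs when set,
# so vLLM keeps its default behavior otherwise.
if quantization is not None:
    model_args["quantization"] = quantization
if load_format is not None:
    model_args["load_format"] = load_format

llm = LLM(**model_args)  # vLLM loads the weights using the requested quantization method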
