@@ -22,7 +22,7 @@
 
 from argparse import Namespace
 from dataclasses import dataclass
-from typing import Optional, Union
+from typing import Dict, Optional, Union
 
 import torch
 from transformers import AutoConfig, BitsAndBytesConfig, GPTQConfig, PretrainedConfig
@@ -233,11 +233,24 @@ class InferenceEndpointModelConfig:
     region: str
     instance_size: str
     instance_type: str
+    model_dtype: str
     framework: str = "pytorch"
     endpoint_type: str = "protected"
     should_reuse_existing: bool = False
     add_special_tokens: bool = True
 
+    def get_dtype_args(self) -> Dict[str, str]:
+        model_dtype = self.model_dtype.lower()
+        if model_dtype in ["awq", "eetq", "gptq"]:
+            return {"QUANTIZE": model_dtype}
+        if model_dtype == "8bit":
+            return {"QUANTIZE": "bitsandbytes"}
+        if model_dtype == "4bit":
+            return {"QUANTIZE": "bitsandbytes-nf4"}
+        if model_dtype in ["bfloat16", "float16"]:
+            return {"DTYPE": model_dtype}
+        return {}
+
 
 def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]) -> BaseModelConfig:  # noqa: C901
     """
@@ -282,6 +295,7 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]
             instance_size=args.instance_size,
             instance_type=args.instance_type,
             should_reuse_existing=args.reuse_existing,
+            model_dtype=args.model_dtype,
         )
     return InferenceModelConfig(model=args.endpoint_model_name)
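For reference, here is a minimal runnable sketch of what the new get_dtype_args mapping does. The _EndpointConfigSketch class below is a hypothetical, trimmed-down stand-in for InferenceEndpointModelConfig that reproduces only the model_dtype field and the method added in this diff; how the returned dict is consumed (e.g. merged into endpoint environment variables) is not shown in the diff itself.

# Illustrative sketch only: a hypothetical, trimmed-down stand-in for
# InferenceEndpointModelConfig, reproducing just the model_dtype field
# and the get_dtype_args mapping added in this diff.
from dataclasses import dataclass
from typing import Dict


@dataclass
class _EndpointConfigSketch:
    model_dtype: str

    def get_dtype_args(self) -> Dict[str, str]:
        # Quantization schemes map to a QUANTIZE flag, half-precision
        # dtypes map to a DTYPE flag, and anything else yields no args.
        model_dtype = self.model_dtype.lower()
        if model_dtype in ["awq", "eetq", "gptq"]:
            return {"QUANTIZE": model_dtype}
        if model_dtype == "8bit":
            return {"QUANTIZE": "bitsandbytes"}
        if model_dtype == "4bit":
            return {"QUANTIZE": "bitsandbytes-nf4"}
        if model_dtype in ["bfloat16", "float16"]:
            return {"DTYPE": model_dtype}
        return {}


for dtype in ["GPTQ", "8bit", "4bit", "bfloat16", "float32"]:
    print(dtype, "->", _EndpointConfigSketch(model_dtype=dtype).get_dtype_args())
# GPTQ -> {'QUANTIZE': 'gptq'}
# 8bit -> {'QUANTIZE': 'bitsandbytes'}
# 4bit -> {'QUANTIZE': 'bitsandbytes-nf4'}
# bfloat16 -> {'DTYPE': 'bfloat16'}
# float32 -> {}

Note that the method lowercases model_dtype before matching, and unrecognized values fall through to an empty dict, so a caller can merge the result into its argument mapping unconditionally.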