Commit 024cbe2

joerunde and njhill committed

Use exllamav2 by default with autogptq; set seq length for exllama v1
Co-authored-by: Nick Hill <[email protected]>

1 parent c1c58f2 · commit 024cbe2

11 files changed: +61 -17 lines changed
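For context before the per-file diffs: the change keys everything off the transformers GPTQConfig. A minimal sketch of the three configurations the new code chooses between (the bits and sequence-length values here are illustrative, not taken from this repo):

from transformers import GPTQConfig

# Default after this commit: exllamav2 kernels for 4-bit GPTQ checkpoints.
exllama_v2 = GPTQConfig(bits=4, exllama_config={"version": 2})

# EXLLAMA_VERSION=1: exllama v1 kernels need a maximum input length up front
# (transformers uses it to size temporary buffers), hence max_sequence_length.
exllama_v1 = GPTQConfig(bits=4, exllama_config={"version": 1}, max_input_length=2048)

# DISABLE_EXLLAMA=true: fall back to the plain AutoGPTQ CUDA kernels.
no_exllama = GPTQConfig(bits=4, use_exllama=False)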

server/text_generation_server/inference_engine/ds_inference.py
Lines changed: 2 additions & 1 deletion

@@ -21,7 +21,8 @@ def __init__(
         model_class: type[_BaseAutoModelClass],
         dtype: torch.dtype,
         quantize: Optional[str],
-        model_config: Optional[Any]
+        model_config: Optional[Any],
+        max_sequence_length: Optional[int],
     ) -> None:
         super().__init__(model_path, model_config)

server/text_generation_server/inference_engine/hf_accelerate.py
Lines changed: 8 additions & 7 deletions

@@ -9,12 +9,13 @@

 class InferenceEngine(BaseInferenceEngine):
     def __init__(
-        self,
-        model_path: str,
-        model_class: type[_BaseAutoModelClass],
-        dtype: torch.dtype,
-        quantize: Optional[str],
-        model_config: Optional[Any]
+        self,
+        model_path: str,
+        model_class: type[_BaseAutoModelClass],
+        dtype: torch.dtype,
+        quantize: Optional[str],
+        model_config: Optional[Any],
+        max_sequence_length: Optional[int],
     ) -> None:
         super().__init__(model_path, model_config)

@@ -32,7 +33,7 @@ def __init__(
             # using LLM.int8()
             kwargs["load_in_8bit"] = True
         elif quantize is not None:
-            raise ValueError(f"{quantize} quantization not supported by hf_transformers engine")
+            raise ValueError(f"{quantize} quantization not supported by hf_accelerate engine")
         else:
             kwargs["torch_dtype"] = dtype

server/text_generation_server/inference_engine/hf_custom_tp.py
Lines changed: 1 addition & 0 deletions

@@ -28,6 +28,7 @@ def __init__(
         dtype: torch.dtype,
         quantize: Optional[str],
         model_config: Optional[Any],
+        max_sequence_length: Optional[int],
     ) -> None:
         super().__init__(model_path, model_config)

server/text_generation_server/inference_engine/hf_optimum_bt.py
Lines changed: 1 addition & 0 deletions

@@ -16,6 +16,7 @@ def __init__(
         dtype: torch.dtype,
         quantize: Optional[str],
         model_config: Optional[Any],
+        max_sequence_length: Optional[int],
     ) -> None:
         super().__init__(model_path, model_config)

server/text_generation_server/inference_engine/hf_optimum_ort.py
Lines changed: 1 addition & 0 deletions

@@ -19,6 +19,7 @@ def __init__(
         dtype: torch.dtype,
         quantize: Optional[str],
         model_config: Optional[Any],
+        max_sequence_length: Optional[int],
     ) -> None:
         super().__init__(model_path, model_config)
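The engine diffs above only widen their constructors; the new argument is accepted but not used there. A sketch of the signature every engine now has to expose (the class name is hypothetical; the imports and argument order come from the diffs in this commit):

from typing import Any, Optional

import torch
from transformers.models.auto.auto_factory import _BaseAutoModelClass

from text_generation_server.inference_engine.engine import BaseInferenceEngine


class MyInferenceEngine(BaseInferenceEngine):
    def __init__(
        self,
        model_path: str,
        model_class: type[_BaseAutoModelClass],
        dtype: torch.dtype,
        quantize: Optional[str],
        model_config: Optional[Any],
        max_sequence_length: Optional[int],  # new in this commit; may go unused
    ) -> None:
        super().__init__(model_path, model_config)
        # engine-specific model loading would go here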

server/text_generation_server/inference_engine/hf_transformers.py
Lines changed: 23 additions & 1 deletion

@@ -1,5 +1,6 @@
 import os
 import torch
+from loguru import logger
 from transformers.models.auto.auto_factory import _BaseAutoModelClass

 from text_generation_server.inference_engine.engine import BaseInferenceEngine
@@ -14,7 +15,8 @@ def __init__(
         model_class: type[_BaseAutoModelClass],
         dtype: torch.dtype,
         quantize: Optional[str],
-        model_config: Optional[Any]
+        model_config: Optional[Any],
+        max_sequence_length: Optional[int] = None,
     ) -> None:
         super().__init__(model_path, model_config)

@@ -32,9 +34,29 @@ def __init__(
             model_config.init_device = str(self.device)
             kwargs["config"] = model_config

+        if quantize is None and hasattr(model_config, "quantization_config"):
+            quantize = model_config.quantization_config.get("quant_method")
+
         if quantize == "bitsandbytes":
             # using LLM.int8()
             kwargs["load_in_8bit"] = True
+
+        elif quantize == "gptq" and model_config.quantization_config.get("bits", 4) == 4:
+            from transformers import GPTQConfig
+
+            logger.info("Using AutoGPTQ to load 4-bit GPTQ model")
+            kwargs["device_map"] = "auto"
+            quantization_config = GPTQConfig(bits=4, max_input_length=max_sequence_length)
+            disable_exllama = os.getenv("DISABLE_EXLLAMA", "False").lower() == "true"
+            if disable_exllama:
+                logger.info("Exllama kernels disabled")
+                quantization_config.use_exllama = False
+            else:
+                exllama_version = int(os.getenv("EXLLAMA_VERSION", "2"))  # Use v2 as default
+                logger.info(f"Using exllama version {exllama_version}")
+                quantization_config.exllama_config = {"version": exllama_version}
+            kwargs["quantization_config"] = quantization_config
+
         elif quantize is not None:
             raise ValueError(f"{quantize} quantization not supported by hf_transformers engine")
         else:
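Condensing the branch added above, this is roughly what the hf_transformers engine now does for a 4-bit GPTQ checkpoint. A sketch with placeholder model path and sequence length; DISABLE_EXLLAMA and EXLLAMA_VERSION are the environment variables read in the diff:

import os

from transformers import AutoModelForCausalLM, GPTQConfig

model_path = "/path/to/gptq-model"  # placeholder
max_sequence_length = 4096          # placeholder; normally passed in by the caller

quantization_config = GPTQConfig(bits=4, max_input_length=max_sequence_length)
if os.getenv("DISABLE_EXLLAMA", "False").lower() == "true":
    quantization_config.use_exllama = False               # plain AutoGPTQ kernels
else:
    version = int(os.getenv("EXLLAMA_VERSION", "2"))      # exllamav2 by default
    quantization_config.exllama_config = {"version": version}

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    quantization_config=quantization_config,
)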

server/text_generation_server/models/__init__.py
Lines changed: 16 additions & 4 deletions

@@ -28,7 +28,12 @@


 def get_model(
-    model_name: str, revision: str, deployment_framework: str, dtype_str: str, quantize: Optional[str]
+    model_name: str,
+    revision: str,
+    deployment_framework: str,
+    dtype_str: str,
+    quantize: Optional[str],
+    max_sequence_length: Optional[int],
 ) -> Model:
     dtype = get_torch_dtype(dtype_str)
     model_path = get_model_path(model_name, revision)
@@ -59,7 +64,14 @@ def get_model(
         model_config = LlamaConfig.from_pretrained(model_path)

         from text_generation_server.models.flash_causal_lm import FlashCausalLM
-        return FlashCausalLM(model_name, revision, deployment_framework, dtype, quantize, model_config)
+        return FlashCausalLM(
+            model_name,
+            revision,
+            deployment_framework,
+            dtype, quantize,
+            model_config,
+            max_sequence_length=max_sequence_length,
+        )

     elif deployment_framework == "hf_transformers" and int(os.getenv("WORLD_SIZE", "1")) > 1:
         print_rank_n(
@@ -89,9 +101,9 @@ def get_model(
     )

     if supports_causal_lm:
-        return CausalLM(model_name, revision, deployment_framework, dtype, quantize, model_config)
+        return CausalLM(model_name, revision, deployment_framework, dtype, quantize, model_config, max_sequence_length)

     if supports_seq2seq_lm:
-        return Seq2SeqLM(model_name, revision, deployment_framework, dtype, quantize, model_config)
+        return Seq2SeqLM(model_name, revision, deployment_framework, dtype, quantize, model_config, max_sequence_length)

     raise NotImplementedError(f"Unsupported model type {model_type}")
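For callers of get_model, the visible change is the extra max_sequence_length argument. A hypothetical invocation (all values are illustrative; only the parameter names come from the new signature above):

from text_generation_server.models import get_model

model = get_model(
    model_name="my-org/my-gptq-model",      # placeholder model id
    revision="main",
    deployment_framework="hf_transformers",
    dtype_str="float16",
    quantize="gptq",
    max_sequence_length=2048,               # now forwarded down to the inference engine
)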

server/text_generation_server/models/causal_lm.py
Lines changed: 2 additions & 1 deletion

@@ -551,11 +551,12 @@ def __init__(
         dtype: torch.dtype,
         quantize: Optional[str],
         model_config: Union[Any] = None,
+        max_sequence_length: Optional[int] = None,
     ):
         model_path = get_model_path(model_name, revision)

         inference_engine = get_inference_engine_class(deployment_framework)(
-            model_path, AutoModelForCausalLM, dtype, quantize, model_config,
+            model_path, AutoModelForCausalLM, dtype, quantize, model_config, max_sequence_length
         )

         super(CausalLM, self).__init__(inference_engine, dtype)

server/text_generation_server/models/flash_causal_lm.py
Lines changed: 2 additions & 1 deletion

@@ -372,6 +372,7 @@ def __init__(
         quantize: Optional[str],
         model_config: Union[Any] = None,
         auto_model_class=None,
+        max_sequence_length: Optional[int] = None,
     ):
         if not torch.cuda.is_available():
             raise NotImplementedError("FlashCausalLM is only available on GPU")
@@ -381,7 +382,7 @@ def __init__(
         model_path = get_model_path(model_name, revision)

         inference_engine = get_inference_engine_class(deployment_framework)(
-            model_path, auto_model_class, dtype, quantize, model_config,
+            model_path, auto_model_class, dtype, quantize, model_config, max_sequence_length
         )

         super(FlashCausalLM, self).__init__(inference_engine, dtype)

server/text_generation_server/models/seq2seq_lm.py
Lines changed: 2 additions & 1 deletion

@@ -550,11 +550,12 @@ def __init__(
         dtype: torch.dtype,
         quantize: Optional[str],
         model_config: Union[Any] = None,
+        max_sequence_length: Optional[int] = None,
     ):
         model_path = get_model_path(model_name, revision)

         inference_engine = get_inference_engine_class(deployment_framework)(
-            model_path, AutoModelForSeq2SeqLM, dtype, quantize, model_config,
+            model_path, AutoModelForSeq2SeqLM, dtype, quantize, model_config, max_sequence_length
         )
         super(Seq2SeqLM, self).__init__(inference_engine, dtype)
