
Commit 8a4b195

feat: have hf_accelerate call to hf_transformers

tjohnson31415 authored and njhill committed
Signed-off-by: Travis Johnson <[email protected]>

1 parent 164f565 · commit 8a4b195
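
This commit collapses the hf_accelerate engine into a thin subclass of the shared hf_transformers engine: the duplicated from_pretrained setup is deleted, and accelerate-specific device placement is toggled through a new internal _use_accelerate flag. A minimal sketch of the pattern (class and attribute names here are illustrative, not the repo's):

from typing import Optional


class SharedEngine:
    # Stands in for the hf_transformers InferenceEngine, which owns all
    # model-construction logic after this commit.
    def __init__(self, model_path: str, _use_accelerate: bool = False) -> None:
        # The internal flag only toggles accelerate-style device placement.
        self.device_map: Optional[str] = "auto" if _use_accelerate else None
        self.model_path = model_path


class AccelerateEngine(SharedEngine):
    # Stands in for the hf_accelerate InferenceEngine: it forwards
    # everything and opts in to the device map.
    def __init__(self, model_path: str) -> None:
        super().__init__(model_path, _use_accelerate=True)


engine = AccelerateEngine("/models/example")  # illustrative path
assert engine.device_map == "auto"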

File tree

2 files changed (+18, -30 lines)
server/text_generation_server/inference_engine/hf_accelerate.py

Lines changed: 13 additions & 30 deletions
@@ -1,13 +1,12 @@
-import os
+from typing import Any, Optional
+
 import torch
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 
-from text_generation_server.inference_engine.engine import BaseInferenceEngine
-from text_generation_server.utils.hub import TRUST_REMOTE_CODE
-from typing import Any, Optional
+from text_generation_server.inference_engine.hf_transformers import InferenceEngine as HFTransformersInferenceEngine
 
 
-class InferenceEngine(BaseInferenceEngine):
+class InferenceEngine(HFTransformersInferenceEngine):
     def __init__(
         self,
         model_path: str,
@@ -17,28 +16,12 @@ def __init__(
         model_config: Optional[Any],
         max_sequence_length: Optional[int],
     ) -> None:
-        super().__init__(model_path, model_config)
-
-        kwargs = {
-            "pretrained_model_name_or_path": model_path,
-            "device_map": None,
-            "local_files_only": True,
-            "trust_remote_code": TRUST_REMOTE_CODE,
-        }
-
-        if self.device.type == "cuda":
-            kwargs["device_map"] = "balanced_low_0" if self.world_size > 1 else "auto"
-
-        if quantize == "bitsandbytes":
-            # using LLM.int8()
-            kwargs["load_in_8bit"] = True
-        elif quantize is not None:
-            raise ValueError(f"{quantize} quantization not supported by hf_accelerate engine")
-        else:
-            kwargs["torch_dtype"] = dtype
-
-        slow_but_exact = os.getenv('BLOOM_SLOW_BUT_EXACT', 'false').lower() == 'true'
-        if slow_but_exact:
-            kwargs["slow_but_exact"] = True
-
-        self.model = model_class.from_pretrained(**kwargs).requires_grad_(False).eval()
+        super().__init__(
+            model_path,
+            model_class,
+            dtype,
+            quantize,
+            model_config,
+            max_sequence_length,
+            _use_accelerate=True
+        )
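
Read end to end, the hf_accelerate engine after this commit is just the subclass and the forwarding call. The assembly below is reconstructed from the two hunks above; the three parameter lines between model_path and model_config are not shown in the diff and are inferred from the arguments forwarded to super().__init__(), so treat them as assumptions:

from typing import Any, Optional

import torch
from transformers.models.auto.auto_factory import _BaseAutoModelClass

from text_generation_server.inference_engine.hf_transformers import InferenceEngine as HFTransformersInferenceEngine


class InferenceEngine(HFTransformersInferenceEngine):
    def __init__(
        self,
        model_path: str,
        model_class: type[_BaseAutoModelClass],  # inferred: not shown in the hunks
        dtype: torch.dtype,                      # inferred: not shown in the hunks
        quantize: Optional[str],                 # inferred: not shown in the hunks
        model_config: Optional[Any],
        max_sequence_length: Optional[int],
    ) -> None:
        # Everything is delegated to the shared engine; only the internal
        # flag distinguishes this engine from plain hf_transformers.
        super().__init__(
            model_path,
            model_class,
            dtype,
            quantize,
            model_config,
            max_sequence_length,
            _use_accelerate=True,
        )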

server/text_generation_server/inference_engine/hf_transformers.py

Lines changed: 5 additions & 0 deletions
@@ -17,6 +17,8 @@ def __init__(
         quantize: Optional[str],
         model_config: Optional[Any],
         max_sequence_length: Optional[int] = None,
+        # internal arg only for this engine
+        _use_accelerate: bool = False,
     ) -> None:
         super().__init__(model_path, model_config)
 
@@ -26,6 +28,9 @@ def __init__(
             "trust_remote_code": TRUST_REMOTE_CODE,
         }
 
+        if _use_accelerate and self.device.type == "cuda":
+            kwargs["device_map"] = "balanced_low_0" if self.world_size > 1 else "auto"
+
         # TODO: consider if Flash Attention should be enabled based on FLASH_ATTENTION=True
         if attn_impl := os.getenv("TRANSFORMERS_ATTN_IMPL"):
             logger.info(f"Setting attn_implementation to {attn_impl}")
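
The behavioral addition on the hf_transformers side is the device_map choice, now gated on the new flag. For context, a hedged illustration (not code from this repo) of what the two values mean when transformers hands placement to accelerate: "auto" spreads layers across all visible devices, while "balanced_low_0" balances them but keeps GPU 0 lightly loaded, leaving headroom on the device that typically gathers generation state when world_size > 1:

from transformers import AutoModelForCausalLM

MODEL_ID = "bigscience/bloom-560m"  # illustrative model id

# Single process / single GPU: let accelerate place everything.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")

# Several GPUs visible: balance layers but keep GPU 0 lightly loaded,
# mirroring the world_size > 1 branch in the diff above.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="balanced_low_0")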
