
Commit f27b7bc

Rename hf_custom_tp engine to tgis_native
And retain configuration backwards compatibility
1 parent f4c1f04 commit f27b7bc

File tree

10 files changed, +26 -17 lines changed

README.md

Lines changed: 1 addition & 1 deletion
@@ -114,7 +114,7 @@ The following model types can currently be run in sharded mode where the weights
 
 1. Ensure that the model weights are in `safetensors` format (see above)
 2. Ensure that the `CUDA_VISIBLE_DEVICES` environment variable is set appropriately (e.g. "0,1" to use the first two GPUs). The number of GPUs to use will be inferred from this or else can be set explicitly with the `NUM_GPUS` environment variable.
-3. Set the environment variable `DEPLOYMENT_FRAMEWORK=hf_custom_tp`
+3. Set the environment variable `DEPLOYMENT_FRAMEWORK=tgis_native`
 
 ### TLS configuration
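For readers following those steps, a hedged illustration of the resulting environment; the values are examples only, and the Python form is just for compactness (in practice these variables are set in the shell or in the deployment YAML patches below):

import os

# Example values only; normally set in the shell or a kustomize patch.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"          # shard across the first two GPUs
os.environ["NUM_GPUS"] = "2"                        # optional; inferred from CUDA_VISIBLE_DEVICES if unset
os.environ["DEPLOYMENT_FRAMEWORK"] = "tgis_native"  # previously "hf_custom_tp"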

deployment/base/patches/flash-attention.yaml

Lines changed: 1 addition & 1 deletion
@@ -9,6 +9,6 @@ spec:
       - name: server
         env:
           - name: DEPLOYMENT_FRAMEWORK
-            value: hf_custom_tp
+            value: tgis_native
           - name: FLASH_ATTENTION
             value: "true"

deployment/models/bloom/kustomization.yaml

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ patchesStrategicMerge:
             - name: MODEL_NAME
               value: bigscience/bloom
             - name: DEPLOYMENT_FRAMEWORK
-              value: hf_custom_tp
+              value: tgis_native
 
             - name: MAX_BATCH_SIZE
               value: "16"

deployment/models/bloomchat-v1/kustomization.yaml

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ patchesStrategicMerge:
             - name: MODEL_NAME
               value: sambanovasystems/BLOOMChat-176B-v1
             - name: DEPLOYMENT_FRAMEWORK
-              value: hf_custom_tp
+              value: tgis_native
             - name: DTYPE_STR
               value: float16
             - name: MAX_BATCH_SIZE

deployment/models/bloomz/kustomization.yaml

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ patchesStrategicMerge:
             - name: MODEL_NAME
               value: bigscience/bloomz
             - name: DEPLOYMENT_FRAMEWORK
-              value: hf_custom_tp
+              value: tgis_native
             - name: DTYPE_STR
               value: float16
             - name: MAX_BATCH_SIZE

deployment/models/flan-ul2-tp/kustomization.yaml

Lines changed: 1 addition & 1 deletion
@@ -28,5 +28,5 @@ patchesStrategicMerge:
             - name: MODEL_NAME
               value: google/flan-ul2
             - name: DEPLOYMENT_FRAMEWORK
-              value: hf_custom_tp
+              value: tgis_native

launcher/src/main.rs

Lines changed: 10 additions & 1 deletion
@@ -162,6 +162,14 @@ fn main() -> ExitCode {
         Err(VarError::NotUnicode(_)) => panic!("PYTORCH_CUDA_ALLOC_CONF set to non-unicode value"),
     };
 
+    // Backwards compatibility for "hf_custom_tp" deployment engine name
+    let deployment_framework = if args.deployment_framework == "hf_custom_tp" {
+        warn!("The \"hf_custom_tp\" deployment engine name is deprecated, please use \"tgis_native\"");
+        "tgis_native"
+    } else {
+        &args.deployment_framework
+    };
+
     // Signal handler
     let running = Arc::new(AtomicBool::new(true));
     let r = running.clone();
@@ -182,6 +190,7 @@ fn main() -> ExitCode {
     // Start shard processes
     for rank in 0..num_shard {
         let args = args.clone();
+        let deployment_framework = deployment_framework.to_string();
         let status_sender = status_sender.clone();
         let shutdown = shutdown.clone();
         let shutdown_sender = shutdown_sender.clone();
@@ -190,7 +199,7 @@ fn main() -> ExitCode {
             shard_manager(
                 args.model_name,
                 args.revision,
-                args.deployment_framework,
+                deployment_framework,
                 args.dtype.or(args.dtype_str),
                 args.quantize,
                 max_sequence_length,
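For readers who don't follow Rust, the shim above boils down to roughly the following Python sketch (the function name is invented for illustration; the real logic lives in launcher/src/main.rs):

def resolve_deployment_framework(name: str) -> str:
    # Map the deprecated engine name onto its replacement, with a warning.
    if name == "hf_custom_tp":
        print('WARNING: the "hf_custom_tp" deployment engine name is deprecated, please use "tgis_native"')
        return "tgis_native"
    return name

# e.g. resolve_deployment_framework("hf_custom_tp") -> "tgis_native" (with a warning);
#      resolve_deployment_framework("tgis_native") -> "tgis_native", unchanged.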

server/text_generation_server/inference_engine/hf_custom_tp.py renamed to server/text_generation_server/inference_engine/tgis_native.py

Lines changed: 2 additions & 2 deletions
@@ -53,7 +53,7 @@ def __init__(
                 f"Flash attention currently only supported by the following model types: {NONTP_FLASH_TYPES}"
             )
         elif model_type not in NONTP_NONFLASH_TYPES:
-            raise ValueError("hf_custom_tp engine must be used with FLASH_ATTENTION, num_shards > 1 and/or BLOOM or T5")
+            raise ValueError("tgis_native engine must be used with FLASH_ATTENTION, num_shards > 1 and/or BLOOM or T5")
 
         aliases = None
 
@@ -105,7 +105,7 @@ def __init__(
             torch.distributed.barrier(group=self.process_group)
         filenames = local_weight_files(model_path, extension=".safetensors")
         if not filenames:
-            raise ValueError("No safetensors weights found - required for hf_custom_tp engine")
+            raise ValueError("No safetensors weights found - required for tgis_native engine")
 
         weights = Weights(
             filenames, device=self.device, dtype=dtype, process_group=self.process_group, aliases=aliases

server/text_generation_server/models/__init__.py

Lines changed: 7 additions & 7 deletions
@@ -45,12 +45,12 @@ def get_model(
         import text_generation_server.utils.flash_attn as flash_attn
         print(f"Using Flash Attention V2: {flash_attn.HAS_FLASH_ATTN_V2}")
 
-        if deployment_framework != "hf_custom_tp":
+        if deployment_framework != "tgis_native":
             print_rank_n(
-                f"WARNING: Using deployment engine hf_custom_tp rather than {deployment_framework} "
+                f"WARNING: Using deployment engine tgis_native rather than {deployment_framework} "
                 "because FLASH_ATTENTION is enabled"
             )
-            deployment_framework = "hf_custom_tp"
+            deployment_framework = "tgis_native"
 
     if model_type in ["RefinedWeb", "RefinedWebModel", "falcon"]:
         # Custom config type for RW models
@@ -75,10 +75,10 @@ def get_model(
 
     elif deployment_framework == "hf_transformers" and int(os.getenv("WORLD_SIZE", "1")) > 1:
         print_rank_n(
-            f"WARNING: Using deployment engine hf_custom_tp rather than {deployment_framework} "
+            f"WARNING: Using deployment engine tgis_native rather than {deployment_framework} "
             "because more than one shard is configured"
         )
-        deployment_framework = "hf_custom_tp"
+        deployment_framework = "tgis_native"
 
     supports_causal_lm = model_type in modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES \
         or type(model_config) in AutoModelForCausalLM._model_mapping \
@@ -95,9 +95,9 @@ def get_model(
     if supports_seq2seq_lm and model_type == "bart":
         supports_causal_lm = False
 
-    if deployment_framework != "hf_custom_tp" and (model_type == "bloom" or model_type == "t5"):
+    if deployment_framework != "tgis_native" and (model_type == "bloom" or model_type == "t5"):
         print_rank_n(
-            "WARNING: It's recommended to use the hf_custom_tp engine with safetensors weights for T5 and BLOOM models"
+            "WARNING: It's recommended to use the tgis_native engine with safetensors weights for T5 and BLOOM models"
         )
 
     if supports_causal_lm:
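Taken together, the renamed checks in get_model amount to roughly the following fallback rule (a condensed sketch, not the verbatim file; the helper name is invented):

def choose_framework(requested: str, flash_attention: bool, world_size: int) -> str:
    # Flash attention and multi-shard runs both require the native TGIS engine.
    if flash_attention and requested != "tgis_native":
        print(f"WARNING: Using deployment engine tgis_native rather than {requested} "
              "because FLASH_ATTENTION is enabled")
        return "tgis_native"
    if requested == "hf_transformers" and world_size > 1:
        print(f"WARNING: Using deployment engine tgis_native rather than {requested} "
              "because more than one shard is configured")
        return "tgis_native"
    return requested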

server/text_generation_server/server.py

Lines changed: 1 addition & 1 deletion
@@ -276,7 +276,7 @@ async def serve_inner(
     print(f"Using device {device}, dtype {dtype_str}, quantize {quantize}")
     print(model.config.__str__())
 
-    if quantize == "gptq" and deployment_framework == "hf_custom_tp":
+    if quantize == "gptq" and deployment_framework == "tgis_native":
         from text_generation_server.utils.layers import HAS_EXLLAMA, EXLLAMA_VERSION
         if HAS_EXLLAMA:
             try:
