
Commit 79d38a1

declark1 authored and njhill committed
fix: add falcon to supported tp flash types, fix num_groups in FlashRWLargeAttention
This PR fixes two issues to enable falcon-180b:
1. Fixes an issue with num_groups that resulted in `NotImplementedError: Tensor Parallelism is not implemented for 14 not divisible by 8` when loading falcon-180b with 8 GPUs.
2. Adds "falcon" to the supported TP flash types.
1 parent addc714 commit 79d38a1
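
For context, here is a minimal sketch of the arithmetic behind the error above. The falcon-180b head counts used below (n_head=232, n_head_kv=8) are assumed from the published model config rather than taken from this commit, but they are consistent with the `14 not divisible by 8` message:

```python
# Assumed falcon-180b config values (not part of this commit):
n_head = 232     # query attention heads
n_head_kv = 8    # key/value head groups
world_size = 8   # tensor-parallel ranks (GPUs)

# Old computation in FlashRWLargeAttention:
old_num_groups = n_head // (n_head_kv * 2)   # 232 // 16 = 14
print(old_num_groups % world_size)           # 6 -> "14 not divisible by 8"

# After this commit, n_head_kv is used directly as the group count:
new_num_groups = n_head_kv                   # 8
print(new_num_groups % world_size)           # 0 -> shards evenly across 8 GPUs
```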

File tree

2 files changed: +5 lines, -6 lines


server/text_generation_server/inference_engine/hf_custom_tp.py
Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@
 from text_generation_server.utils.dist import initialize_torch_distributed
 from text_generation_server.utils.hub import local_weight_files
 
-NONTP_FLASH_TYPES = ["RefinedWeb", "RefinedWebModel", "gpt_neox", "gpt_bigcode", "llama"]
+NONTP_FLASH_TYPES = ["RefinedWeb", "RefinedWebModel", "gpt_neox", "gpt_bigcode", "llama", "falcon"]
 TP_NONFLASH_TYPES = ["bloom", "t5", "gpt_neox"]
 TP_FLASH_TYPES = NONTP_FLASH_TYPES  # All flash types currently support TP
 NONTP_NONFLASH_TYPES = ["bloom", "t5"]
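
For illustration, a hypothetical dispatch helper (not code from this module) shows how these lists might be consulted when deciding whether a model type such as "falcon" can run with tensor parallelism and flash attention:

```python
# Hypothetical helper, for illustration only -- not part of hf_custom_tp.py.
NONTP_FLASH_TYPES = ["RefinedWeb", "RefinedWebModel", "gpt_neox", "gpt_bigcode", "llama", "falcon"]
TP_NONFLASH_TYPES = ["bloom", "t5", "gpt_neox"]
TP_FLASH_TYPES = NONTP_FLASH_TYPES  # All flash types currently support TP
NONTP_NONFLASH_TYPES = ["bloom", "t5"]

def supported(model_type: str, sharded: bool, flash: bool) -> bool:
    """Return True if the (TP, flash) combination is supported for this model type."""
    if sharded and flash:
        return model_type in TP_FLASH_TYPES
    if sharded:
        return model_type in TP_NONFLASH_TYPES
    if flash:
        return model_type in NONTP_FLASH_TYPES
    return model_type in NONTP_NONFLASH_TYPES

print(supported("falcon", sharded=True, flash=True))  # True after this change
```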

server/text_generation_server/models/custom_modeling/flash_rw_modeling.py
Lines changed: 4 additions & 5 deletions

@@ -228,24 +228,23 @@ def __init__(
 
         hidden_size = config.hidden_size
         num_heads = config.n_head
-        num_heads_kv = config.n_head_kv
+        num_groups = config.n_head_kv
 
         self.hidden_size = hidden_size
         self.head_size = hidden_size // num_heads
+        self.num_groups = num_groups
+        self.num_heads = num_heads // self.num_groups
 
         self.rotary_emb = PositionRotaryEmbedding.static(
             self.head_size, base=10000.0, device=weights.device
         )
         self.softmax_scale = self.head_size ** (-0.5)
 
-        self.num_groups = num_heads // (num_heads_kv * 2)
-        self.num_heads = num_heads // self.num_groups
-        self.num_heads_kv = num_heads_kv // self.num_groups
         process_group = weights.process_group
 
         if process_group.size() > self.num_groups:
             raise NotImplementedError(
-                f"Tensor Parallelism is not implemented for world_size > n groups"
+                "Tensor Parallelism is not implemented for world_size > n groups"
             )
         if self.num_groups % process_group.size() != 0:
             raise NotImplementedError(
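As a rough sketch of what these guards protect: once the checks pass, the key/value groups can be split evenly across tensor-parallel ranks. The values below reuse the falcon-180b numbers from the commit description; the per-rank split itself is illustrative, not code from this file:

```python
# Illustrative only: relation between the guards above and group sharding.
num_groups = 8     # e.g. falcon-180b's n_head_kv (assumed value)
world_size = 8     # process_group.size()

assert world_size <= num_groups         # mirrors the first NotImplementedError guard
assert num_groups % world_size == 0     # mirrors the second guard

groups_per_rank = num_groups // world_size
print(groups_per_rank)                  # 1 key/value group per GPU in this example
```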