Commit deb99f6

added mlp and attn bias option to flash and paged llama models (#85)
#### Motivation

The `Calico` models set the mlp and attention bias to true, but these were hard-coded to false in the flash and paged llama implementations. This change uses the config params added in huggingface/transformers#30031 to set those values properly.

#### Modifications

- added `attention_bias` and `mlp_bias` to the config for the Flash and Paged Llama implementations (default is `False`)
- set the bias in the attention and mlp layers to the config value

#### Result

Models that contain attention and mlp bias should now load properly.

---------

Signed-off-by: Joshua Rosenkranz <[email protected]>
Signed-off-by: Joe Runde <[email protected]>
Co-authored-by: Joe Runde <[email protected]>
1 parent 1b801d7 commit deb99f6
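
To make the new options concrete, a minimal sketch in plain Python is shown below. The `calico_like_config` dict and its values are hypothetical; only the `attention_bias` and `mlp_bias` field names come from this change (and from huggingface/transformers#30031).

```python
# Hypothetical config fragment for a llama-family model whose checkpoints carry
# bias terms on the attention and MLP projections. Values are made up.
calico_like_config = {
    "model_type": "llama",
    "hidden_size": 4096,
    "num_attention_heads": 32,
    "attention_bias": True,  # q/k/v/o projections have a bias term
    "mlp_bias": True,        # gate/up/down projections have a bias term
}

# Configs that omit the new keys fall back to False, which preserves the
# previous hard-coded no-bias behaviour for existing models.
attention_bias = calico_like_config.get("attention_bias", False)
mlp_bias = calico_like_config.get("mlp_bias", False)
print(attention_bias, mlp_bias)  # True True
```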

3 files changed: 23 additions & 10 deletions


server/text_generation_server/inference_engine/tgis_native.py

Lines changed: 5 additions & 0 deletions

@@ -101,6 +101,11 @@ def __init__(
                 model_class = FlashRWForCausalLM

         elif model_type == "llama":
+            # See: https://github.com/ibm-granite/vllm_granite/blob/main/vllm/model_executor/models/llama.py#L353-L354
+            if self._config.tie_word_embeddings:
+                aliases = {
+                    "lm_head.weight": ["model.embed_tokens.weight"]
+                }
             if PAGED_ATTENTION:
                 from text_generation_server.models.custom_modeling.paged_llama_modeling import PagedLlamaForCausalLM
                 model_class = PagedLlamaForCausalLM
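
For context, the sketch below shows one way an alias map like the one added above can be used when a checkpoint ties `lm_head.weight` to the embedding matrix and does not store it separately. The `resolve_weight` helper is hypothetical and only illustrates the idea; it is not the actual TGIS weight loader.

```python
# Hypothetical illustration of weight aliasing for tied word embeddings.
def resolve_weight(state_dict, name, aliases):
    """Return the tensor stored under `name`, falling back to any aliased names."""
    if name in state_dict:
        return state_dict[name]
    for alias in aliases.get(name, []):
        if alias in state_dict:
            return state_dict[alias]
    raise KeyError(f"{name} not found (tried aliases: {aliases.get(name, [])})")


aliases = {"lm_head.weight": ["model.embed_tokens.weight"]}
checkpoint = {"model.embed_tokens.weight": "<embedding tensor>"}  # stand-in for a real tensor
# With tied embeddings, lm_head.weight resolves to the embedding matrix:
print(resolve_weight(checkpoint, "lm_head.weight", aliases))  # <embedding tensor>
```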

server/text_generation_server/models/custom_modeling/flash_llama_modeling.py

Lines changed: 9 additions & 5 deletions

@@ -64,6 +64,8 @@ def __init__(
         tie_word_embeddings=False,
         rope_scaling=None,
         rope_theta=10000.0,
+        attention_bias=False,
+        mlp_bias=False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -85,6 +87,8 @@ def __init__(
         self.use_cache = use_cache
         self.rope_scaling = rope_scaling
         self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.mlp_bias = mlp_bias

         super().__init__(
             pad_token_id=pad_token_id,
@@ -169,7 +173,7 @@ def _load_gqa(config, prefix: str, weights):
         config.hidden_size,
     ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"

-    return TensorParallelColumnLinear(get_linear(weight, bias=None, quantize=config.quantize))
+    return TensorParallelColumnLinear(get_linear(weight, bias=config.attention_bias, quantize=config.quantize))


 class FlashLlamaAttention(torch.nn.Module):
@@ -220,13 +224,13 @@ def __init__(
             prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
             dim=0,
             weights=weights,
-            bias=False,
+            bias=config.attention_bias,
         )
         self.o_proj = TensorParallelRowLinear.load(
             config,
             prefix=f"{prefix}.o_proj",
             weights=weights,
-            bias=False,
+            bias=config.attention_bias,
         )

     def forward(
@@ -309,13 +313,13 @@ def __init__(self, prefix, config, weights):
             prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
             weights=weights,
             dim=0,
-            bias=False,
+            bias=config.mlp_bias,
         )
         self.down_proj = TensorParallelRowLinear.load(
             config,
             prefix=f"{prefix}.down_proj",
             weights=weights,
-            bias=False,
+            bias=config.mlp_bias,
         )
         self.intermediate_size = (
             config.intermediate_size // weights.process_group.size()
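
To make the effect of `mlp_bias` concrete, here is a minimal, self-contained sketch of a LLaMA-style gated MLP built from plain `torch.nn.Linear` layers. It only illustrates the bias wiring; it is not the tensor-parallel implementation from the diff, and the layer sizes are made up.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class GatedMLPSketch(nn.Module):
    """LLaMA-style MLP: down_proj(silu(gate_proj(x)) * up_proj(x)), with optional bias."""

    def __init__(self, hidden_size: int, intermediate_size: int, mlp_bias: bool = False):
        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=mlp_bias)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=mlp_bias)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=mlp_bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))


# mlp_bias=False reproduces the old hard-coded behaviour; mlp_bias=True adds bias
# terms to all three projections, as configs that set the flag expect.
mlp = GatedMLPSketch(hidden_size=128, intermediate_size=512, mlp_bias=True)
out = mlp(torch.randn(2, 128))  # shape: (batch, hidden_size)
```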

server/text_generation_server/models/custom_modeling/paged_llama_modeling.py

Lines changed: 9 additions & 5 deletions

@@ -64,6 +64,8 @@ def __init__(
         tie_word_embeddings=False,
         rope_scaling=None,
         rope_theta=10000.0,
+        attention_bias=False,
+        mlp_bias=False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -85,6 +87,8 @@ def __init__(
         self.use_cache = use_cache
         self.rope_scaling = rope_scaling
         self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.mlp_bias = mlp_bias

         super().__init__(
             pad_token_id=pad_token_id,
@@ -169,7 +173,7 @@ def _load_gqa(config, prefix: str, weights):
         config.hidden_size,
     ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"

-    return TensorParallelColumnLinear(get_linear(weight, bias=None, quantize=config.quantize))
+    return TensorParallelColumnLinear(get_linear(weight, bias=config.attention_bias, quantize=config.quantize))


 class PagedLlamaAttention(torch.nn.Module):
@@ -207,13 +211,13 @@ def __init__(
             prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
             dim=0,
             weights=weights,
-            bias=False,
+            bias=config.attention_bias,
         )
         self.o_proj = TensorParallelRowLinear.load(
             config,
             prefix=f"{prefix}.o_proj",
             weights=weights,
-            bias=False,
+            bias=config.attention_bias,
         )

     def forward(
@@ -280,13 +284,13 @@ def __init__(self, prefix, config, weights):
             prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
             weights=weights,
             dim=0,
-            bias=False,
+            bias=config.mlp_bias,
         )
         self.down_proj = TensorParallelRowLinear.load(
             config,
             prefix=f"{prefix}.down_proj",
             weights=weights,
-            bias=False,
+            bias=config.mlp_bias,
        )
         self.intermediate_size = (
             config.intermediate_size // weights.process_group.size()
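
Similarly, a minimal single-device sketch of how `attention_bias` would control the q/k/v/o projections. The class, sizes, and GQA head counts are hypothetical, and there is no tensor parallelism or paged KV cache here; it only shows where the bias flag lands.

```python
import torch.nn as nn


class AttentionProjectionsSketch(nn.Module):
    """Only the projection layers of a LLaMA-style attention block, to show the bias wiring."""

    def __init__(self, hidden_size: int, num_heads: int, num_kv_heads: int,
                 attention_bias: bool = False):
        super().__init__()
        head_dim = hidden_size // num_heads
        self.q_proj = nn.Linear(hidden_size, num_heads * head_dim, bias=attention_bias)
        self.k_proj = nn.Linear(hidden_size, num_kv_heads * head_dim, bias=attention_bias)
        self.v_proj = nn.Linear(hidden_size, num_kv_heads * head_dim, bias=attention_bias)
        self.o_proj = nn.Linear(num_heads * head_dim, hidden_size, bias=attention_bias)


# attention_bias=False keeps the previous behaviour; attention_bias=True adds bias
# terms to all four projections, matching configs that set the flag.
attn = AttentionProjectionsSketch(hidden_size=128, num_heads=8, num_kv_heads=2, attention_bias=True)
print(attn.q_proj.bias is not None)  # True
```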
