
Commit 2d78aed

Merge pull request #74 from IBM/main
[pull] main from IBM:main
2 parents 102f77d + deb99f6 commit 2d78aed

5 files changed (+29, −16 lines)


server/poetry.lock

Lines changed: 5 additions & 5 deletions
Some generated files are not rendered by default.

server/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ onnxruntime-gpu = { version = "^1.17.1", optional = true }
 onnx = { version = "^1.16.0", optional = true }
 einops = "^0.7.0"
 ibm-fms = { version = "^0.0", optional = true }
-fms-extras = { git = "https://github.com/foundation-model-stack/fms-extras", rev = "a010516ff2c938c206b9b342b16bd747ef07d43c", optional = true }
+fms-extras = { git = "https://github.com/foundation-model-stack/fms-extras", rev = "d41f8a34c9841aa3c4c59f17b5e7f3cb365f49de", optional = true }
 
 # Explicitly install some transitive dependencies to avoid CVEs
 jinja2 = ">=3.1.3"

server/text_generation_server/inference_engine/tgis_native.py

Lines changed: 5 additions & 0 deletions
@@ -101,6 +101,11 @@ def __init__(
             model_class = FlashRWForCausalLM
 
         elif model_type == "llama":
+            # See: https://github.com/ibm-granite/vllm_granite/blob/main/vllm/model_executor/models/llama.py#L353-L354
+            if self._config.tie_word_embeddings:
+                aliases = {
+                    "lm_head.weight": ["model.embed_tokens.weight"]
+                }
             if PAGED_ATTENTION:
                 from text_generation_server.models.custom_modeling.paged_llama_modeling import PagedLlamaForCausalLM
                 model_class = PagedLlamaForCausalLM
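
The added aliases mapping lets the weight loader fall back to the embedding matrix when a checkpoint with tied word embeddings does not ship a separate lm_head.weight tensor. Below is a minimal sketch of how such an alias map can be resolved at load time; the resolve_tensor helper and the in-memory checkpoint dict are illustrative assumptions, not the actual TGIS weight-loading API.

from typing import Dict, List

import torch


def resolve_tensor(
    checkpoint: Dict[str, torch.Tensor],
    name: str,
    aliases: Dict[str, List[str]],
) -> torch.Tensor:
    # Return the named tensor, falling back to any alias present in the checkpoint.
    if name in checkpoint:
        return checkpoint[name]
    for alias in aliases.get(name, []):
        if alias in checkpoint:
            # Tied embeddings: lm_head.weight reuses model.embed_tokens.weight.
            return checkpoint[alias]
    raise KeyError(f"{name} not found (aliases tried: {aliases.get(name, [])})")


# Usage mirroring the mapping added in this commit:
aliases = {"lm_head.weight": ["model.embed_tokens.weight"]}
checkpoint = {"model.embed_tokens.weight": torch.zeros(32000, 4096)}
lm_head_weight = resolve_tensor(checkpoint, "lm_head.weight", aliases)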

server/text_generation_server/models/custom_modeling/flash_llama_modeling.py

Lines changed: 9 additions & 5 deletions
@@ -64,6 +64,8 @@ def __init__(
         tie_word_embeddings=False,
         rope_scaling=None,
         rope_theta=10000.0,
+        attention_bias=False,
+        mlp_bias=False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -85,6 +87,8 @@ def __init__(
         self.use_cache = use_cache
         self.rope_scaling = rope_scaling
         self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.mlp_bias = mlp_bias
 
         super().__init__(
             pad_token_id=pad_token_id,
@@ -169,7 +173,7 @@ def _load_gqa(config, prefix: str, weights):
         config.hidden_size,
     ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"
 
-    return TensorParallelColumnLinear(get_linear(weight, bias=None, quantize=config.quantize))
+    return TensorParallelColumnLinear(get_linear(weight, bias=config.attention_bias, quantize=config.quantize))
 
 
 class FlashLlamaAttention(torch.nn.Module):
@@ -220,13 +224,13 @@ def __init__(
             prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
             dim=0,
             weights=weights,
-            bias=False,
+            bias=config.attention_bias,
         )
         self.o_proj = TensorParallelRowLinear.load(
             config,
             prefix=f"{prefix}.o_proj",
             weights=weights,
-            bias=False,
+            bias=config.attention_bias,
         )
 
     def forward(
@@ -309,13 +313,13 @@ def __init__(self, prefix, config, weights):
             prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
             weights=weights,
             dim=0,
-            bias=False,
+            bias=config.mlp_bias,
         )
         self.down_proj = TensorParallelRowLinear.load(
             config,
             prefix=f"{prefix}.down_proj",
             weights=weights,
-            bias=False,
+            bias=config.mlp_bias,
         )
         self.intermediate_size = (
             config.intermediate_size // weights.process_group.size()
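
The two new config fields thread through to the projection layers: attention_bias governs the q/k/v and output projections, and mlp_bias governs the gate/up/down projections, both defaulting to False so existing Llama checkpoints behave as before. Below is a minimal sketch of that idea using plain nn.Linear; the simplified config class is an illustrative stand-in, not the TensorParallel layers from this file.

import torch.nn as nn


class LlamaConfigSketch:
    # Simplified stand-in for the config defined in this file.
    def __init__(self, hidden_size=4096, attention_bias=False, mlp_bias=False):
        self.hidden_size = hidden_size
        self.attention_bias = attention_bias  # bias on q/k/v and o_proj
        self.mlp_bias = mlp_bias              # bias on gate/up/down projections


def build_o_proj(config: LlamaConfigSketch) -> nn.Linear:
    # Previously the bias was hard-coded to False; now it follows the config flag,
    # so checkpoints whose projections carry bias terms can be loaded.
    return nn.Linear(config.hidden_size, config.hidden_size, bias=config.attention_bias)


o_proj = build_o_proj(LlamaConfigSketch(attention_bias=True))
assert o_proj.bias is not None  # the bias tensor exists when the flag is set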

server/text_generation_server/models/custom_modeling/paged_llama_modeling.py

Lines changed: 9 additions & 5 deletions
@@ -64,6 +64,8 @@ def __init__(
         tie_word_embeddings=False,
         rope_scaling=None,
         rope_theta=10000.0,
+        attention_bias=False,
+        mlp_bias=False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -85,6 +87,8 @@ def __init__(
         self.use_cache = use_cache
         self.rope_scaling = rope_scaling
         self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.mlp_bias = mlp_bias
 
         super().__init__(
             pad_token_id=pad_token_id,
@@ -169,7 +173,7 @@ def _load_gqa(config, prefix: str, weights):
         config.hidden_size,
     ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"
 
-    return TensorParallelColumnLinear(get_linear(weight, bias=None, quantize=config.quantize))
+    return TensorParallelColumnLinear(get_linear(weight, bias=config.attention_bias, quantize=config.quantize))
 
 
 class PagedLlamaAttention(torch.nn.Module):
@@ -207,13 +211,13 @@ def __init__(
             prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
             dim=0,
             weights=weights,
-            bias=False,
+            bias=config.attention_bias,
         )
         self.o_proj = TensorParallelRowLinear.load(
             config,
             prefix=f"{prefix}.o_proj",
             weights=weights,
-            bias=False,
+            bias=config.attention_bias,
         )
 
     def forward(
@@ -280,13 +284,13 @@ def __init__(self, prefix, config, weights):
             prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
             weights=weights,
             dim=0,
-            bias=False,
+            bias=config.mlp_bias,
         )
         self.down_proj = TensorParallelRowLinear.load(
             config,
             prefix=f"{prefix}.down_proj",
             weights=weights,
-            bias=False,
+            bias=config.mlp_bias,
        )
         self.intermediate_size = (
             config.intermediate_size // weights.process_group.size()
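
The same flags are mirrored here for the paged-attention path. Because both fields default to False and arrive through the config constructor, a model's config.json only needs to set them when its projections actually carry bias terms; older configs that omit the keys keep the previous behavior. A small illustrative check of that default handling follows, using a simplified stand-in config class rather than the one in this file.

import json


class PagedLlamaConfigSketch:
    # Simplified stand-in: only the fields relevant to this change.
    def __init__(self, attention_bias=False, mlp_bias=False, **kwargs):
        self.attention_bias = attention_bias
        self.mlp_bias = mlp_bias


# A config that sets the new keys enables the bias terms...
cfg_new = PagedLlamaConfigSketch(**json.loads('{"attention_bias": true, "mlp_bias": false}'))
assert cfg_new.attention_bias is True and cfg_new.mlp_bias is False

# ...while an older config that omits them falls back to the previous behavior.
cfg_old = PagedLlamaConfigSketch(**json.loads('{"hidden_size": 4096}'))
assert cfg_old.attention_bias is False and cfg_old.mlp_bias is False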
