34 | 34 | try: |
35 | 35 | # TODO: remove this after we figure out where in torchtune an `evaluate` module |
36 | 36 | # is being imported, which is being confused with huggingface's `evaluate`. |
37 | | - import lm_eval # noqa |
| 37 | + import lm_eval # noqa |
38 | 38 | except Exception: |
39 | 39 | pass |
40 | 40 |
@@ -278,6 +278,9 @@ class TransformerArgs: |
278 | 278 | # For pipeline parallel |
279 | 279 | n_stages: int = 1 |
280 | 280 | stage_idx: int = 0 |
| 281 | + # Optional biases |
| 282 | + attention_bias: bool = False |
| 283 | + feed_forward_bias: bool = False |
281 | 284 |
282 | 285 | def __post_init__(self): |
283 | 286 | if self.n_local_heads == -1: |
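The two new dataclass fields default to False, so existing model configurations keep their bias-free projections. A model family whose checkpoints do carry projection biases could opt in from its params dict; a minimal sketch, assuming `TransformerArgs.from_params` maps dict keys onto these fields the same way it is used in `build_model` below (the keys' values here are illustrative, not taken from any real params file):

params = {
    "dim": 4096,
    "n_heads": 32,
    "n_local_heads": 8,
    "attention_bias": True,       # wq/wk/wv/wo gain a learnable bias
    "feed_forward_bias": False,   # w1/w2/w3 stay bias-free (the default)
}
args = TransformerArgs.from_params(params)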
@@ -394,7 +397,7 @@ def from_name(cls, name: str): |
394 | 397 | config = [ |
395 | 398 | config |
396 | 399 | for config in known_model_params |
397 | | - if config in str(name).upper() or config in str(name) |
| 400 | + if config.upper() in str(name).upper() or config in str(name) |
398 | 401 | ] |
399 | 402 |
400 | 403 | # We may have two or more configs matched (e.g., "7B" and |
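The predicate change makes the lookup case-insensitive on the config key as well: previously the key was tested verbatim against the upper-cased name, so any key containing lowercase characters could never match. A standalone illustration of the old and new predicates (the keys and name below are made up):

known_model_params = ["Meta-Llama-3-8B", "7B"]   # hypothetical keys
name = "meta-llama-3-8b"

# Old predicate: the mixed-case key is searched inside the upper-cased
# name without being upper-cased itself, so it never matches.
old = [k for k in known_model_params if k in str(name).upper() or k in str(name)]

# New predicate: both sides are upper-cased, so matching is case-insensitive.
new = [k for k in known_model_params if k.upper() in str(name).upper() or k in str(name)]

print(old)  # []
print(new)  # ['Meta-Llama-3-8B']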
@@ -471,7 +474,7 @@ def build_model(self) -> nn.Module: |
471 | 474 | modules[name] = module_class(TransformerArgs.from_params(config_args)) |
472 | 475 | else: |
473 | 476 | modules[name] = module_class(**config_args) |
474 | | - |
| 477 | + |
475 | 478 | # Temporary add extra params to the DeepFusionModel. |
476 | 479 | # TODO: Remove it once we can make fusion model configurable in model_param. |
477 | 480 | if recipe.fusion_class == DeepFusionModel: |
@@ -730,16 +733,16 @@ def __init__(self, config: TransformerArgs): |
730 | 733 |
731 | 734 | # key, query, value projections for all heads, but in a batch |
732 | 735 | # total_head_dim = (config.n_heads + 2 * config.n_local_heads) * config.head_dim |
733 | | - # self.wqkv = nn.Linear(config.dim, total_head_dim, bias=False) |
734 | | - self.wq = nn.Linear(config.dim, config.n_heads * config.head_dim, bias=False) |
| 736 | + # self.wqkv = nn.Linear(config.dim, total_head_dim, bias=config.attention_bias) |
| 737 | + self.wq = nn.Linear(config.dim, config.n_heads * config.head_dim, bias=config.attention_bias) |
735 | 738 | self.wk = nn.Linear( |
736 | | - config.dim, config.n_local_heads * config.head_dim, bias=False |
| 739 | + config.dim, config.n_local_heads * config.head_dim, bias=config.attention_bias |
737 | 740 | ) |
738 | 741 | self.wv = nn.Linear( |
739 | | - config.dim, config.n_local_heads * config.head_dim, bias=False |
| 742 | + config.dim, config.n_local_heads * config.head_dim, bias=config.attention_bias |
740 | 743 | ) |
741 | 744 |
742 | | - self.wo = nn.Linear(config.dim, config.dim, bias=False) |
| 745 | + self.wo = nn.Linear(config.dim, config.dim, bias=config.attention_bias) |
743 | 746 | self.kv_cache = None |
744 | 747 |
745 | 748 | self.n_heads = config.n_heads |
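With the flag threaded through, all four attention projections gain a learnable bias vector when `attention_bias=True`, while the default `False` reproduces the previous layout exactly. A small sketch of the effect on a single projection, using `torch.nn.Linear` directly rather than the `Attention` class (sizes are illustrative):

import torch.nn as nn

dim, n_heads, head_dim = 4096, 32, 128  # illustrative sizes

# bias=False (the default) keeps the old state_dict layout: weight only.
wq_old = nn.Linear(dim, n_heads * head_dim, bias=False)
assert wq_old.bias is None

# bias=True adds a (n_heads * head_dim,) bias vector, which is what
# checkpoints trained with biased QKV projections expect to load into.
wq_new = nn.Linear(dim, n_heads * head_dim, bias=True)
assert wq_new.bias.shape == (n_heads * head_dim,)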
@@ -852,9 +855,9 @@ def forward( |
852 | 855 | class FeedForward(nn.Module): |
853 | 856 | def __init__(self, config: TransformerArgs) -> None: |
854 | 857 | super().__init__() |
855 | | - self.w1 = nn.Linear(config.dim, config.hidden_dim, bias=False) |
856 | | - self.w2 = nn.Linear(config.hidden_dim, config.dim, bias=False) |
857 | | - self.w3 = nn.Linear(config.dim, config.hidden_dim, bias=False) |
| 858 | + self.w1 = nn.Linear(config.dim, config.hidden_dim, bias=config.feed_forward_bias) |
| 859 | + self.w2 = nn.Linear(config.hidden_dim, config.dim, bias=config.feed_forward_bias) |
| 860 | + self.w3 = nn.Linear(config.dim, config.hidden_dim, bias=config.feed_forward_bias) |
858 | 861 |
859 | 862 | def distribute(self, device_mesh: DeviceMesh): |
860 | 863 | parallelize_module(self.w1, device_mesh, ColwiseParallel()) |
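The same wiring applies to the MLP: with `feed_forward_bias=True` the three projections carry bias vectors, so matching checkpoints must provide the corresponding keys, while the default leaves loading behavior unchanged. A sketch of the resulting state_dict keys, built with plain `torch.nn` modules instead of the `FeedForward` class (sizes are illustrative):

import torch.nn as nn

dim, hidden_dim = 4096, 14336  # illustrative sizes

# Layers equivalent to what FeedForward builds when feed_forward_bias=True.
ffn = nn.ModuleDict({
    "w1": nn.Linear(dim, hidden_dim, bias=True),
    "w2": nn.Linear(hidden_dim, dim, bias=True),
    "w3": nn.Linear(dim, hidden_dim, bias=True),
})

print(sorted(ffn.state_dict().keys()))
# ['w1.bias', 'w1.weight', 'w2.bias', 'w2.weight', 'w3.bias', 'w3.weight']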