Commit 96b567c

rtp-llm committed

feat - support tie_word_embeddings option in hf config.json, to fix qwen1.5 0.5b finetune load failure
1 parent 51a1184 commit 96b567c

File tree

14 files changed: +58 -15 lines changed
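
The change follows one pattern across the model loaders below: read an optional tie_word_embeddings key from the Hugging Face config.json and default to False when it is absent, so older configs keep their previous behavior. As context, small Qwen1.5 checkpoints such as qwen1.5 0.5b ship with "tie_word_embeddings": true in their config.json, which is what the commit message refers to. The helper below is a minimal sketch of that read pattern; the function name and path handling are illustrative, not part of this repo.

import json
import os

def read_tie_word_embeddings(ckpt_path: str) -> bool:
    # Illustrative helper (not from the commit): read the optional flag from
    # a Hugging Face config.json, defaulting to False when the key is absent.
    with open(os.path.join(ckpt_path, "config.json")) as f:
        config_json = json.load(f)
    # Same fallback pattern as the per-model loaders in this commit.
    return config_json.get("tie_word_embeddings", False)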

maga_transformer/config/gpt_init_model_parameters.py
Lines changed: 3 additions & 1 deletion

@@ -100,7 +100,8 @@ class GptInitModelParameters:
         "normalize_lm_head_weight",
         "ref_model",
         "is_quant_mode",
-        "model_type"
+        "model_type",
+        "tie_word_embeddings"
     }
 
     def __init__(self,
@@ -134,6 +135,7 @@ def __init__(self,
         self.ref_model: Optional[torch.nn.Module] = None
 
         self.model_type = ModelType.NORMAL
+        self.tie_word_embeddings = False
 
         for k, v in kwargs.items():
             setattr(self, k, v)
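
With the new key added to the class-level name set (its exact role is not shown in this hunk) and given a False default in __init__, tie_word_embeddings can be overridden like any other keyword argument via the setattr loop above. A simplified sketch of that pattern, where ParamsSketch is a stand-in and not the real GptInitModelParameters class:

class ParamsSketch:
    def __init__(self, **kwargs):
        self.model_type = "normal"
        self.tie_word_embeddings = False  # default when config.json omits the key
        # Same loop as in the diff: keyword arguments override the defaults.
        for k, v in kwargs.items():
            setattr(self, k, v)

p = ParamsSketch(tie_word_embeddings=True)
assert p.tie_word_embeddings is True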

maga_transformer/models/bloom.py
Lines changed: 1 addition & 0 deletions

@@ -107,6 +107,7 @@ def from_huggingface(config_json: Dict[str, Any]):
         config.layernorm_eps = config_json['layer_norm_epsilon']
         config.inter_size = hidden_size * 4
         config.special_tokens.eos_token_id = config_json['eos_token_id']
+        config.tie_word_embeddings = config_json.get('tie_word_embeddings', False)
         return config
 
     @classmethod

maga_transformer/models/chat_glm.py
Lines changed: 1 addition & 0 deletions

@@ -84,6 +84,7 @@ def from_huggingface(cls, config_json: Dict[str, Any]):
         config.special_tokens.bos_token_id = config_json.get('bos_token_id', config.special_tokens.bos_token_id)
         config.special_tokens.eos_token_id = config_json.get('eos_token_id', config.special_tokens.eos_token_id)
         config.src_quantization_bit = config_json.get('quantization_bit', 0)
+        config.tie_word_embeddings = config_json.get('tie_word_embeddings', False)
         return config
 
     # override

maga_transformer/models/chat_glm_v2.py
Lines changed: 1 addition & 0 deletions

@@ -52,6 +52,7 @@ def from_huggingface(cls, config_json: Dict[str, Any]):
         config.special_tokens.eos_token_id = config_json['eos_token_id']
         config.src_quantization_bit = config_json.get('quantization_bit', 0)
         config.rotary_embedding_dim = config.size_per_head
+        config.tie_word_embeddings = config_json.get('tie_word_embeddings', False)
         config = cls.get_rotary_embedding_scale(config, config_json)
         return config

maga_transformer/models/falcon.py
Lines changed: 1 addition & 0 deletions

@@ -81,6 +81,7 @@ def _create_config(cls, ckpt_path: str):
         config.special_tokens.bos_token_id = config_json['bos_token_id']
         config.special_tokens.eos_token_id = config_json['eos_token_id']
         config.rotary_embedding_dim = config.size_per_head
+        config.tie_word_embeddings = config_json.get('tie_word_embeddings', False)
         return config
 
 register_model('falcon', Falcon, ["FalconForCausalLM"])

maga_transformer/models/gpt_neox.py
Lines changed: 1 addition & 0 deletions

@@ -64,6 +64,7 @@ def from_huggingface(config_json: Dict[str, Any]):
         config.has_post_decoder_layernorm = True
         config.norm_type = 'layernorm'
         config.use_norm_input_residual = True
+        config.tie_word_embeddings = config_json.get('tie_word_embeddings', False)
 
         return config

maga_transformer/models/llama.py
Lines changed: 2 additions & 0 deletions

@@ -75,6 +75,7 @@ def from_huggingface(config, config_json: Dict[str, Any]):
         config.inter_size = config_json['intermediate_size']
         config.rotary_embedding_base = int(config_json.get('rope_theta', 10000))
         config.rotary_embedding_dim = config.size_per_head
+        config.tie_word_embeddings = config_json.get('tie_word_embeddings', False)
         if config_json.get('rope_scaling', None):
             if config_json['rope_scaling']['type'] == 'dynamic':
                 config.dynamic_embedding_scalar = config_json['rope_scaling']['factor']
@@ -105,6 +106,7 @@ def from_params(config: GptInitModelParameters, params_json: Dict[str, Any]):
                                              params_json['multiple_of'])
         config.special_tokens.eos_token_id = 2
         config.rotary_embedding_dim = config.size_per_head
+        config.tie_word_embeddings = params_json.get('tie_word_embeddings', False)
         return config
 
     @classmethod

maga_transformer/models/phi.py
Lines changed: 2 additions & 1 deletion

@@ -50,7 +50,8 @@ def _create_config(cls, ckpt_path: str):
             activation_type='gelu',
             has_positional_encoding=False,
             has_post_decoder_layernorm=True,
-            has_lm_head_bias=True)
+            has_lm_head_bias=True,
+            tie_word_embeddings = config_dict.get('tie_word_embeddings', False))
         config.head_num_kv = config.head_num
         return config

maga_transformer/models/qwen.py
Lines changed: 1 addition & 0 deletions

@@ -267,6 +267,7 @@ def _from_hf(config: GptInitModelParameters, ckpt_path: str):
         config.rotary_embedding_base = int(config_json.get('rotary_emb_base', 10000))
         config.rotary_embedding_dim = config.size_per_head
         config.special_tokens.eos_token_id = config_json.get("eos_token_id", config.special_tokens.eos_token_id)
+        config.tie_word_embeddings = config_json.get('tie_word_embeddings', False)
 
         quant_config = config_json.get("quantization_config", None)
         if quant_config is not None:

maga_transformer/models/qwen_v2.py
Lines changed: 1 addition & 0 deletions

@@ -193,6 +193,7 @@ def _from_hf(config: GptInitModelParameters, ckpt_path: str):
         config.vocab_size = config_json["vocab_size"]
         config.rotary_embedding_dim = config.size_per_head
         config.layernorm_eps = config_json.get("rms_norm_eps", 1e-06)
+        config.tie_word_embeddings = config_json.get('tie_word_embeddings', False)
 
         quant_config = config_json.get("quantization_config", None)
         if quant_config is not None:
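
Why the flag matters for the qwen1.5 0.5b failure in the commit message: with tied word embeddings, the output projection shares its weight tensor with the input token embedding, so a fine-tuned checkpoint may not store a separate lm_head weight at all, and a loader that expects one fails. The snippet below only illustrates the tying itself; how rtp-llm consumes the flag during weight loading is not shown in this commit's visible hunks, so treat the dimensions and wiring as assumptions.

import torch.nn as nn

vocab_size, hidden_size = 151936, 1024  # Qwen1.5-0.5B-like sizes (illustrative)

embedding = nn.Embedding(vocab_size, hidden_size)
lm_head = nn.Linear(hidden_size, vocab_size, bias=False)

tie_word_embeddings = True  # as read from config.json by the loaders above
if tie_word_embeddings:
    # Tie the LM head to the embedding matrix: both point at the same tensor,
    # so the checkpoint does not need a standalone lm_head weight.
    lm_head.weight = embedding.weight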
