@@ -79,7 +79,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         if not self.is_safetensors:
             self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
         if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -2710,6 +2710,167 @@ def write_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("ChatGLMModel")
+class ChatGLMModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHATGLM
+
+    def set_vocab(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytearray] = []
+        toktypes: list[int] = []
+        scores: list[float] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()}
+
+        for token_id in range(vocab_size):
+            piece = tokenizer._convert_id_to_token(token_id)
+            if token_id == 0:
+                piece = "<unk>"
+            elif token_id == 1:
+                piece = "<bos>"
+            elif token_id == 2:
+                piece = "<eos>"
+
+            text = piece.encode("utf-8")
+            score = 0.0
+            if len(piece) != 0 and token_id < 64789:
+                score = tokenizer.tokenizer.sp_model.get_score(token_id)
+
+            if len(piece) == 0:
+                text = f"[PAD{token_id}]".encode("utf-8")
+
+            if token_id >= 64789:
+                toktype = SentencePieceTokenTypes.UNKNOWN
+                tokens.append(text)
+                scores.append(score)
+                toktypes.append(toktype)
+                continue
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.tokenizer.sp_model.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.tokenizer.sp_model.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.tokenizer.sp_model.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.tokenizer.sp_model.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name("ChatGLM-6b-chat")
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_head_kv = self.hparams.get("multi_query_group_num", n_head)
+        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
+        self.gguf_writer.add_embedding_length(n_embed)
+        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_rope_dimension_count(64)
+        self.gguf_writer.add_add_bos_token(False)
+
+    def write_tensors(self):
+        block_count = self.hparams["num_layers"]
+        tensors = dict(self.get_tensors())
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        has_lm_head = True
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+
+        for name, data_torch in tensors.items():
+            if name.endswith(".rotary_pos_emb.inv_freq"):
+                continue
+
+            if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys():
+                has_lm_head = False
+
+            name = re.sub(r'transformer\.', '', name)
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
+                # Map bloom-style qkv_linear to gpt-style qkv_linear
+                # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
+                # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
+                qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
+                data = np.concatenate(
+                    (
+                        qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
+                        qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
+                        qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
+                    ),
+                    axis=0,
+                )
+                print("re-format attention.linear_qkv.weight")
+            elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
+                qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
+                data = np.concatenate(
+                    (
+                        qkv_bias[:, 0, :].reshape((n_embed,)),
+                        qkv_bias[:, 1, :].reshape((n_embed,)),
+                        qkv_bias[:, 2, :].reshape((n_embed,)),
+                    ),
+                    axis=0,
+                )
+                print("re-format attention.linear_qkv.bias")
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+            if not has_lm_head and name == "word_embeddings.weight":
+                self.gguf_writer.add_tensor("output.weight", data)
+                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
+
 ###### CONVERSION LOGIC ######


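A note on the vocabulary handling in `set_vocab` above: scores and token types come from the SentencePiece model that ships with the ChatGLM tokenizer (reached through `tokenizer.tokenizer.sp_model`), while the tail of the table (IDs >= 64789, the added special tokens) is marked UNKNOWN. The snippet below is a minimal, self-contained sketch of that same mapping using the sentencepiece package directly; the "tokenizer.model" path is hypothetical and the local enum simply mirrors the SentencePieceTokenTypes values used by the converter, so treat it as an illustration rather than part of the commit.

import sentencepiece as spm
from enum import IntEnum

class TokType(IntEnum):
    # mirrors the converter's SentencePieceTokenTypes enum
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    UNUSED = 5
    BYTE = 6

sp = spm.SentencePieceProcessor()
sp.load("tokenizer.model")  # hypothetical path to the ChatGLM SentencePiece model

tokens, scores, toktypes = [], [], []
for token_id in range(sp.vocab_size()):
    piece = sp.id_to_piece(token_id)
    tokens.append(piece.encode("utf-8"))
    scores.append(sp.get_score(token_id))

    # classify the piece the same way the converter does
    if sp.is_unknown(token_id):
        toktypes.append(TokType.UNKNOWN)
    elif sp.is_control(token_id):
        toktypes.append(TokType.CONTROL)
    elif sp.is_unused(token_id):
        toktypes.append(TokType.UNUSED)
    elif sp.is_byte(token_id):
        toktypes.append(TokType.BYTE)
    else:
        toktypes.append(TokType.NORMAL)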
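The fused query_key_value re-layout in `write_tensors` is easier to see with toy numbers. The sketch below (not part of the commit; sizes and fill values are made up purely to make the row grouping visible) applies the same reshape-and-concatenate step to turn a head-interleaved [q_h0, k_h0, v_h0, q_h1, ...] weight into the [all Q, all K, all V] layout expected downstream.

import numpy as np

n_head, head_dim = 2, 3
n_embed = n_head * head_dim  # 6

# Fused weight in the interleaved layout: rows are [q_h0, k_h0, v_h0, q_h1, k_h1, v_h1],
# where each block is filled with 10 * head + kind so the grouping stays visible.
rows = []
for h in range(n_head):
    for kind in range(3):  # 0 = Q, 1 = K, 2 = V
        rows.append(np.full((head_dim, n_embed), 10 * h + kind, dtype=np.float32))
fused = np.concatenate(rows, axis=0)  # shape (3 * n_embed, n_embed)

# Same transform as in ChatGLMModel.write_tensors above.
qkv = fused.reshape((n_head, 3, n_embed // n_head, n_embed))
out = np.concatenate(
    (
        qkv[:, 0, :, :].reshape((-1, n_embed)),  # all Q heads first
        qkv[:, 1, :, :].reshape((-1, n_embed)),  # then all K heads
        qkv[:, 2, :, :].reshape((-1, n_embed)),  # then all V heads
    ),
    axis=0,
)

# Rows are now grouped by kind: Q(h0), Q(h1), K(h0), K(h1), V(h0), V(h1).
assert (out[:head_dim] == 0).all() and (out[head_dim:2 * head_dim] == 10).all()
print(out[:, 0])  # [ 0.  0.  0. 10. 10. 10.  1.  1.  1. 11. 11. 11.  2.  2.  2. 12. 12. 12.]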