@@ -529,9 +529,19 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
             else:
                 token: str = reverse_vocab[i]
                 if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not tokenizer.added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
                     if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
                         toktypes.append(gguf.TokenType.CONTROL)
                     else:
+                        # NOTE: this was added for Gemma.
+                        # Encoding and decoding the tokens above isn't sufficient for this case.
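+                        # (b"\xe2\x96\x81" is the UTF-8 encoding of U+2581, the marker SentencePiece uses for a leading space)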
                         token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
                         toktypes.append(gguf.TokenType.USER_DEFINED)
                 else:
@@ -575,6 +585,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
@@ -671,6 +684,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
             # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
             res = "gigachat"
+        if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
+            # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
+            res = "megrez"

         if res is None:
             logger.warning("\n")
@@ -1679,6 +1695,184 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("DeciLMForCausalLM")
+class DeciModel(Model):
+    model_arch = gguf.MODEL_ARCH.DECI
+
+    @staticmethod
+    def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
+        # DeciLM-specific code
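+        # e.g. with illustrative values ffn_mult=1.3 and n_embd=8192:
+        #   int(2 * 1.3 * 8192 / 3) = 7099, which _find_multiple() rounds up to 7168 (the next multiple of 256)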
+        intermediate_size = int(2 * ffn_mult * n_embd / 3)
+        return DeciModel._find_multiple(intermediate_size, 256)
+
+    @staticmethod
+    def _find_multiple(n: int, k: int) -> int:
+        # DeciLM-specific code
+        if n % k == 0:
+            return n
+        return n + k - (n % k)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if "block_configs" in self.hparams:  # Llama-3_1-Nemotron-51B
+            _block_configs: list[dict[str, Any]] = self.hparams["block_configs"]
+            assert self.block_count == len(_block_configs)
+            self._num_kv_heads = list()
+            self._num_heads = list()
+            _ffn_multipliers = list()
+            # ***linear attention layer***
+            # if n_heads_in_group is None and replace_with_linear is True,
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads
+            # ***attention-free layer***
+            # if n_heads_in_group is None and replace_with_linear is False,
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is 0
+            # ***normal attention layer***
+            # if n_heads_in_group is not None,
+            # then _num_kv_heads[il] is num_attention_heads // n_heads_in_group
+            # and _num_heads[il] is num_attention_heads
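+            # Illustrative example (not taken from a real config): with num_attention_heads=64, a block entry
+            #   {"attention": {"n_heads_in_group": 8, "replace_with_linear": False}, "ffn": {"ffn_mult": 1.3}}
+            # yields _num_kv_heads[il] = 64 // 8 = 8 and _num_heads[il] = 64 in the loop below.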
+            for il in range(len(_block_configs)):
+                if _block_configs[il]["attention"]["n_heads_in_group"] is None:
+                    if _block_configs[il]["attention"]["replace_with_linear"] is True:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(self.hparams["num_attention_heads"])
+                    else:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(0)
+                else:
+                    self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
+                    self._num_heads.append(self.hparams["num_attention_heads"])
+                _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(_ffn_multipliers)
+            assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
+            assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int)
+            assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float)
+            self._ffn_dims: list[int] = [
+                DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"])
+                for multiplier in _ffn_multipliers
+            ]
+
+    def set_vocab(self):
+        # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's
+        # eos_token from '|eot_id|' to '|end_of_text|'
+        if self.hparams.get("vocab_size", 128256) == 128256:
+            tokens, toktypes, tokpre = self.get_vocab_base()
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+
+            special_vocab = gguf.SpecialVocab(
+                self.dir_model, load_merges=True,
+                special_token_types = ['bos', 'eos', 'eom', 'eot']
+            )
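+            # The hard-coded ids below correspond to the Llama-3 tokenizer's special tokens:
+            # 128000 <|begin_of_text|>, 128001 <|end_of_text|>, 128008 <|eom_id|>, 128009 <|eot_id|>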
+            special_vocab._set_special_token("bos", 128000)
+            special_vocab._set_special_token("eos", 128001)
+            special_vocab._set_special_token("eom", 128008)
+            special_vocab._set_special_token("eot", 128009)
+            special_vocab.add_to_gguf(self.gguf_writer)
+        else:
+            # DeciLM-7B
+            self._set_vocab_llama_hf()
+            # self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        if "block_configs" in self.hparams:  # Llama-3_1-Nemotron-51B
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(self._ffn_dims)
+            self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+            self.gguf_writer.add_head_count(self._num_heads)
+            self.gguf_writer.add_feed_forward_length(self._ffn_dims)
+            self.gguf_writer.add_block_count(self.block_count)
+            self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+            self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+            self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+            self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_file_type(self.ftype)
+        else:  # DeciLM-7B
+            super().set_gguf_parameters()
+            if "num_key_value_heads_per_layer" in self.hparams:  # DeciLM-7B
+                self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"]
+                assert self.block_count == len(self._num_kv_heads)
+                self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        if bid is not None:
+            if "num_key_value_heads_per_layer" in self.hparams:
+                n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid]
+            elif "block_configs" in self.hparams:
+                n_kv_head = self._num_kv_heads[bid]
+                n_head = self._num_heads[bid]
+            else:
+                n_kv_head = self.hparams.get("num_key_value_heads")
+        else:
+            n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_kv_head)
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
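+                # Rule implemented below (llama3-style rope scaling): dimensions whose wavelength is
+                # shorter than high_freq_wavelen keep a factor of 1, those longer than low_freq_wavelen
+                # get `factor`, and the band in between is smoothly interpolated between the two.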
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+
 @Model.register("BitnetForCausalLM")
 class BitnetModel(Model):
     model_arch = gguf.MODEL_ARCH.BITNET
@@ -2628,7 +2822,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]


-@Model.register("BertModel", "CamembertModel")
+@Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT

@@ -2694,10 +2888,25 @@ def phantom(tok):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused

+        if name.startswith("bert."):
+            name = name[5:]
+
+        if name.endswith(".gamma"):
+            name = name[:-6] + ".weight"
+
+        if name.endswith(".beta"):
+            name = name[:-5] + ".bias"
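+        # e.g. (illustrative) an old-style checkpoint name "bert.embeddings.LayerNorm.gamma" becomes "embeddings.LayerNorm.weight"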
+
         # we are only using BERT for embeddings so we don't need the pooling layer
         if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
             return []  # we don't need these

+        if name.startswith("cls.predictions"):
+            return []
+
+        if name.startswith("cls.seq_relationship"):
+            return []
+
         return [(self.map_tensor_name(name), data_torch)]

