@@ -714,6 +714,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224":
             # ref: https://huggingface.co/inclusionAI/Ling-lite
             res = "bailingmoe"
+        if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
+            # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
+            res = "llama4"
 
         if res is None:
             logger.warning("\n")
@@ -1608,6 +1611,7 @@ def prepare_tensors(self):
 @Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA
+    undo_permute = True
 
     def set_vocab(self):
         try:
@@ -1672,10 +1676,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
 
-        if name.endswith(("q_proj.weight", "q_proj.bias")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith(("k_proj.weight", "k_proj.bias")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        if self.undo_permute:
+            if name.endswith(("q_proj.weight", "q_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+            if name.endswith(("k_proj.weight", "k_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 
         # process the experts separately
         if name.find("block_sparse_moe.experts") != -1:
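
The undo_permute flag added above gates the Q/K head reordering that the Llama converter applies so the exported weights match the RoPE layout expected on the llama.cpp side. As a rough standalone illustration of what a reordering of this shape does (the actual LlamaModel.permute implementation is not shown in this diff, so treat the function below as an assumption, not the file's code):

import torch

def permute_heads(weights: torch.Tensor, n_head: int) -> torch.Tensor:
    # Within each head, interleave the rows of the first and second half,
    # e.g. [0, 1, 2, 3] -> [0, 2, 1, 3]; the overall shape is unchanged.
    head_rows = weights.shape[0] // n_head
    return (weights.reshape(n_head, 2, head_rows // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))

w = torch.arange(8 * 8, dtype=torch.float32).reshape(8, 8)  # toy: 2 heads, 4 rows per head
print(permute_heads(w, n_head=2)[:4, 0])                    # tensor([ 0., 16.,  8., 24.]), i.e. rows 0, 2, 1, 3

Llama4Model, introduced in the next hunk, sets undo_permute = False, so its Q/K tensors skip this reordering entirely.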
@@ -1752,6 +1757,61 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("Llama4ForConditionalGeneration")
+class Llama4Model(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA4
+    has_vision: bool = False
+    undo_permute = False
+
+    # TODO @ngxson : avoid duplicating this code everywhere by at least supporting "text_config"
+    # same as llama, but we need to merge the text_config into the root level of hparams
+    def __init__(self, *args, **kwargs):
+        hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
+        if "text_config" in hparams:
+            hparams = {**hparams, **hparams["text_config"]}
+        kwargs["hparams"] = hparams
+        super().__init__(*args, **kwargs)
+        if "vision_config" in hparams:
+            logger.info("Has vision encoder, but it will be ignored")
+            self.has_vision = True
+        # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp"; we need to undo this
+        self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"]
+        self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"]
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+        self.gguf_writer.add_add_bos_token(True)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"])
+        self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        name = name.replace("language_model.", "")
+        name = name.replace("feed_forward.", "mlp.")  # a bit hacky for now
+        name = name.replace(".router.weight", ".gate.weight")  # a bit hacky for now
+
+        # split the fused gate_up tensor into gate and up
+        if "gate_up_proj" in name:
+            name_up = name.replace("gate_up_proj", "up_proj.weight")
+            name_gate = name.replace("gate_up_proj", "gate_proj.weight")
+            dim_half = data_torch.shape[-1] // 2
+            gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split(dim_half, dim=-2)
+            return [
+                (self.map_tensor_name(name_gate), gate_proj_weight),
+                (self.map_tensor_name(name_up), up_proj_weight)
+            ]
+
+        if name.endswith("down_proj"):
+            name += ".weight"
+            data_torch = data_torch.transpose(-1, -2)
+
+        if "multi_modal_projector" in name or "vision_model" in name:
+            return []
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @Model.register("Mistral3ForConditionalGeneration")
 class Mistral3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
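
The gate_up_proj branch above splits one fused expert tensor into separate gate and up projections, transposing first. A small self-contained check of that slicing with made-up sizes (the real dimensions come from the Llama 4 checkpoint, and the fused [gate | up] ordering along the last axis is assumed for illustration):

import torch

n_experts, hidden, intermediate = 4, 8, 16                  # toy sizes only
gate_up = torch.randn(n_experts, hidden, 2 * intermediate)  # assumed fused layout

dim_half = gate_up.shape[-1] // 2
gate_proj, up_proj = gate_up.transpose(-1, -2).split(dim_half, dim=-2)

assert gate_proj.shape == (n_experts, intermediate, hidden)
assert up_proj.shape == (n_experts, intermediate, hidden)

The transpose here, like the one applied to down_proj, suggests the HF checkpoint stores these expert weights with the input dimension first; splitting after the transpose yields per-expert gate and up matrices in the orientation the converter's tensor map expects.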
@@ -2399,6 +2459,16 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("Qwen3ForCausalLM")
+class Qwen3Model(Qwen2Model):
+    model_arch = gguf.MODEL_ARCH.QWEN3
+
+
+@Model.register("Qwen3MoeForCausalLM")
+class Qwen3MoeModel(Qwen2MoeModel):
+    model_arch = gguf.MODEL_ARCH.QWEN3MOE
+
+
 @Model.register("GPT2LMHeadModel")
 class GPT2Model(Model):
     model_arch = gguf.MODEL_ARCH.GPT2
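
Stepping back to Llama4Model.__init__ above, a quick illustration of the hparams flattening it performs (the numbers below are placeholders, not real Llama 4 config values): text_config is merged into the root of the dict, then intermediate_size_mlp takes over the intermediate_size slot while the original MoE value is kept under intermediate_size_moe.

hparams = {
    "vision_config": {},                 # present in the HF config but ignored by the converter
    "text_config": {
        "intermediate_size": 1024,       # placeholder: per-expert FFN size
        "intermediate_size_mlp": 4096,   # placeholder: dense FFN size
    },
}

if "text_config" in hparams:
    hparams = {**hparams, **hparams["text_config"]}

hparams["intermediate_size_moe"] = hparams["intermediate_size"]
hparams["intermediate_size"] = hparams["intermediate_size_mlp"]

print(hparams["intermediate_size"], hparams["intermediate_size_moe"])  # 4096 1024

This is why set_gguf_parameters can pass intermediate_size_moe to add_expert_feed_forward_length while the inherited Llama parameters still read the dense size from intermediate_size.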