@@ -7589,6 +7589,88 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]


+@ModelBase.register("SmallThinkerForCausalLM")
+class SmallThinkerModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.SMALLTHINKER
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (n_experts := self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+        if (n_experts_used := self.hparams.get("num_experts_per_tok", self.hparams.get("moe_num_active_primary_experts"))) is not None:
+            self.gguf_writer.add_expert_used_count(n_experts_used)
+        if (moe_intermediate_size := self.hparams.get("moe_ffn_hidden_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+            self.gguf_writer.add_feed_forward_length(moe_intermediate_size)
+            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
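+        # pick the expert-routing gating activation according to the model config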
+        if self.hparams.get("moe_primary_router_apply_softmax"):
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+        else:
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        # YaRN is not enabled by default
+        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
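+        # a non-zero entry in the per-layer layout marks a sliding-window attention layer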
+        sliding_window_layout = self.hparams.get("sliding_window_layout")
+        if sliding_window_layout:
+            for i in sliding_window_layout:
+                if i != 0:
+                    sliding_window = self.hparams.get("sliding_window_size")
+                    if sliding_window:
+                        self.gguf_writer.add_sliding_window(sliding_window)
+                    break
+
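+    # per-layer buffers that accumulate expert tensors until a full set can be stacked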
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("experts") != -1:
+            n_experts = self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
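+            # hold everything back until all three weight types have arrived for every expert in this block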
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down", "gate", "up"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
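+        # non-expert tensors are renamed and passed through unchanged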
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
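+        # sanity check: every buffered expert tensor should have been merged in modify_tensors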
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
 ###### CONVERSION LOGIC ######

