@@ -2717,27 +2717,56 @@ def set_gguf_parameters(self):
27172717 if (rope_dim := self .hparams .get ("head_dim" )) is None :
27182718 rope_dim = self .hparams ["hidden_size" ] // self .hparams ["num_attention_heads" ]
27192719
2720+ # Treat "original" as "yarn", seems to have been a mistake
2721+ if self .hparams .get ("rope_type" ) in ("yarn" , "original" ):
2722+ # config.json values differ from standard, we may have to add metadata for these:
2723+ # extrapolation_factor = 1.0
2724+ # attn_factor = 1.0
2725+ # beta_fast = 8
2726+ # beta_slow = 1
2727+ self .gguf_writer .add_rope_scaling_type (gguf .RopeScalingType .YARN )
2728+ self .gguf_writer .add_rope_scaling_factor (self .hparams ["scaling_factor" ])
2729+ self .gguf_writer .add_rope_scaling_orig_ctx_len (self .hparams ["original_max_position_embeddings" ])
2730+
2731+ if temp_len := self .hparams .get ("attn_temperature_len" ):
2732+ self .gguf_writer .add_attn_temperature_length (temp_len )
2733+
27202734 self .gguf_writer .add_attn_output_scale (self .hparams .get ("attn_output_multiplier" , rope_dim ** - 0.5 ))
27212735 self .gguf_writer .add_embedding_scale (self .hparams ["embedding_multiplier_scale" ])
27222736 self .gguf_writer .add_logit_scale (self .hparams ["output_multiplier_scale" ])
27232737
    # Per-layer staging buffers for MoE expert weights: one dict per decoder
    # layer (allocated lazily in modify_tensors), mapping a source tensor name
    # to the list of shards seen so far.  Shards of a split expert tensor are
    # appended here and later merged with torch.hstack — presumably the split
    # is along columns; confirm against the producing checkpoint format.
    _experts: list[dict[str, list[Tensor]]] | None = None
    # Name of the expert tensor currently being accumulated ("" when none).
    # Lets modify_tensors keep routing follow-up shards of the same tensor
    # into the buffer and detect when a split tensor's shard run has ended.
    _cur_expert = ""
27252740
27262741 def modify_tensors (self , data_torch : Tensor , name : str , bid : int | None ) -> Iterable [tuple [str , Tensor ]]:
2742+ tensors : list [tuple [str , Tensor ]] = []
2743+ is_expert = ".moe." in name or ".block_sparse_moe.experts." in name
2744+
2745+ if not is_expert :
2746+ tensors .append ((self .map_tensor_name (name ), data_torch ))
2747+
27272748 # process the experts separately
2728- if name . find ( ".moe." ) != - 1 or name . find ( ".block_sparse_moe.experts." ) != - 1 :
2749+ if is_expert or self . _cur_expert :
27292750 n_experts = self .hparams ["num_local_experts" ]
27302751
27312752 assert bid is not None
27322753
27332754 if self ._experts is None :
27342755 self ._experts = [{} for _ in range (self .block_count )]
27352756
2736- self ._experts [bid ][name ] = data_torch
2757+ # concatenate split tensors
2758+ if name in self ._experts [bid ]:
2759+ self ._cur_expert = name
2760+ self ._experts [bid ][name ].append (data_torch )
2761+ return []
2762+ elif is_expert :
2763+ self ._cur_expert = name
2764+ self ._experts [bid ][name ] = [data_torch ]
2765+ return []
2766+ else :
2767+ self ._cur_expert = ""
27372768
27382769 if len (self ._experts [bid ]) >= n_experts * 3 :
2739- tensors : list [tuple [str , Tensor ]] = []
2740-
27412770 # merge the experts into a single 3d tensor
27422771 for wid in [("linear" , "w1" ), ("linear_1" , "w2" ), ("linear_v" , "w3" )]:
27432772 datas : list [Tensor ] = []
@@ -2746,7 +2775,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
27462775 ename = f"transformer.decoder_layer.{ bid } .moe.{ xid } .{ wid [0 ]} .weight"
27472776 if ename not in self ._experts [bid ]:
27482777 ename = f"model.layers.{ bid } .block_sparse_moe.experts.{ xid } .{ wid [1 ]} .weight"
2749- datas .append (self ._experts [bid ][ename ])
2778+ tensor_list = self ._experts [bid ][ename ]
2779+ datas .append (torch .hstack (tensor_list ) if len (tensor_list ) > 1 else tensor_list [0 ])
27502780 del self ._experts [bid ][ename ]
27512781
27522782 data_torch = torch .stack (datas , dim = 0 )
@@ -2756,11 +2786,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
27562786 new_name = self .map_tensor_name (merged_name )
27572787
27582788 tensors .append ((new_name , data_torch ))
2759- return tensors
2760- else :
2761- return []
27622789
2763- return [( self . map_tensor_name ( name ), data_torch )]
2790+ return tensors
27642791
27652792
27662793@ModelBase .register ("DbrxForCausalLM" )
0 commit comments