@@ -5746,11 +5746,20 @@ def set_gguf_parameters(self):
57465746 logger .info ("gguf: (granite) logits_scale = %s" , logits_scale )
57475747
57485748
@ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM")
class GraniteMoeModel(GraniteModel):
    """Conversion for IBM's GraniteMoeForCausalLM, covering both the plain
    MoE checkpoints and the GraniteMoeShared variant that adds a shared
    expert MLP alongside the routed experts."""
    model_arch = gguf.MODEL_ARCH.GRANITE_MOE

    def set_gguf_parameters(self):
        """Write the GraniteMoe hyperparameters, plus the one extra field
        used by GraniteMoeShared when the checkpoint provides it:
        - shared_intermediate_size -> expert_shared_feed_forward_length
        """
        super().set_gguf_parameters()
        shexp_ffn_len = self.hparams.get("shared_intermediate_size")
        if shexp_ffn_len:
            self.gguf_writer.add_expert_shared_feed_forward_length(shexp_ffn_len)
            logger.info("gguf: (granitemoeshared) shared_feed_forward_length = %s", shexp_ffn_len)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Split fused gate/up (w1/w3) projection tensors into two tensors.

        modeling_granitemoe follows the JetMoe parallel-experts layout, where
        w1 and w3 are stored merged into a single tensor of twice the FFN
        width (split again at forward time); the shared-expert MLP of
        GraniteMoeShared uses the same fusion. Pull each fused tensor apart
        into separate gate/up tensors so the output matches the existing
        mixtral-style expert layout; everything else defers to GraniteModel.
        """
        # suffix -> (hparams key for the FFN width, gate target, up target, assert message)
        fused_specs = {
            "block_sparse_moe.input_linear.weight": (
                "intermediate_size",
                gguf.MODEL_TENSOR.FFN_GATE_EXP,
                gguf.MODEL_TENSOR.FFN_UP_EXP,
                "Merged FFN tensor size must be 2 * intermediate_size",
            ),
            "shared_mlp.input_linear.weight": (
                "shared_intermediate_size",
                gguf.MODEL_TENSOR.FFN_GATE_SHEXP,
                gguf.MODEL_TENSOR.FFN_UP_SHEXP,
                "Merged FFN tensor size must be 2 * shared_intermediate_size",
            ),
        }
        for suffix, (hparam_key, gate_target, up_target, err_msg) in fused_specs.items():
            if not name.endswith(suffix):
                continue
            ffn_dim = self.hparams[hparam_key]
            # The fused tensor stacks gate (w1) on top of up (w3) along dim -2.
            assert data_torch.shape[-2] == 2 * ffn_dim, err_msg
            gate_proj = data_torch[..., :ffn_dim, :]
            up_proj = data_torch[..., ffn_dim:, :]
            return [
                (self.format_tensor_name(gate_target, bid), gate_proj),
                (self.format_tensor_name(up_target, bid), up_proj),
            ]

        return super().modify_tensors(data_torch, name, bid)
57715789
57725790
0 commit comments