@@ -209,6 +209,8 @@ class ModelType(Enum):
 
     JiuTian = 0x2900
 
+    GPTOSS = 0x2A00
+
     BCE_Embedding = 0x10000100
     BCE_ReRanker = 0x10000101
     BGE_M3 = 0x10000102
@@ -7242,7 +7244,6 @@ def get_block(prefix: str):
 
         return weights + dac_weights
 
-
 class JiuTianConverter(BaseConverter):
     MODEL_TYPE = ModelType.JiuTian
 
@@ -7262,6 +7263,166 @@ def dump_config(f, config, ggml_type):
     def get_weight_names(config):
         return QWen2Converter.get_weight_names(config)
 
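+# GPT-OSS checkpoints store the MoE expert weights packed in MXFP4: each `*_blocks`
+# tensor holds two 4-bit codes per byte, and the matching `*_scales` tensor holds
+# per-block power-of-two exponents biased by 127; gate and up projections are
+# interleaved row-wise. The converter below unpacks them into per-expert
+# gate_proj / up_proj / down_proj tensors.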
+class GptOssConverter(BaseConverter):
+    MODEL_TYPE = ModelType.GPTOSS
+
+    @classmethod
+    def state_dict_pp(cls, config, state_dict):
+        def convert_moe_packed_tensors(
+            blocks,
+            scales,
+            *,
+            dtype: torch.dtype = torch.bfloat16,
+            rows_per_chunk: int = 32768 * 1024,
+        ) -> torch.Tensor:
+            # the 16 values representable by an MXFP4 nibble
+            FP4_VALUES = [
+                +0.0, +0.5, +1.0, +1.5, +2.0, +3.0, +4.0, +6.0,
+                -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0,
+            ]
+            # scales are stored with a bias of 127
+            scales = scales.to(torch.int32) - 127
+            assert blocks.shape[:-1] == scales.shape, f"{blocks.shape=} does not match {scales.shape=}"
+
+            lut = torch.tensor(FP4_VALUES, dtype=dtype, device=blocks.device)
+
+            *prefix_shape, G, B = blocks.shape
+            rows_total = math.prod(prefix_shape) * G
+
+            blocks = blocks.reshape(rows_total, B)
+            scales = scales.reshape(rows_total, 1)
+
+            out = torch.empty(rows_total, B * 2, dtype=dtype, device=blocks.device)
+
+            # dequantize in chunks to bound peak memory usage
+            for r0 in range(0, rows_total, rows_per_chunk):
+                r1 = min(r0 + rows_per_chunk, rows_total)
+
+                blk = blocks[r0:r1]
+                exp = scales[r0:r1]
+
+                # nibble indices -> int64
+                idx_lo = (blk & 0x0F).to(torch.long)
+                idx_hi = (blk >> 4).to(torch.long)
+
+                sub = out[r0:r1]
+                sub[:, 0::2] = lut[idx_lo]
+                sub[:, 1::2] = lut[idx_hi]
+
+                # apply the per-block power-of-two scale: sub * 2**exp
+                torch.ldexp(sub, exp, out=sub)
+                del idx_lo, idx_hi, blk, exp
+
+            out = out.reshape(*prefix_shape, G, B * 2).view(*prefix_shape, G * B * 2)
+            # converted to float8_e5m2 to match the existing implementation for now
+            return out.to(torch.float8_e5m2)
+
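+        # Worked example of the decode above (hypothetical values, not executed):
+        #   packed byte 0x21 with scale byte 128 -> exponent = 128 - 127 = 1
+        #   low nibble 0x1 -> FP4_VALUES[1] = 0.5, high nibble 0x2 -> FP4_VALUES[2] = 1.0
+        #   dequantized pair = (0.5 * 2**1, 1.0 * 2**1) = (1.0, 2.0)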
+        r = {}
+
+        for name in state_dict:
+            t = state_dict[name]
+            if name.endswith('mlp.experts.gate_up_proj_blocks'):
+                unpacked = convert_moe_packed_tensors(t, state_dict[name.replace('gate_up_proj_blocks', 'gate_up_proj_scales')])
+                for j in range(config.num_local_experts):
+                    # gate and up rows are interleaved: even rows -> gate_proj, odd rows -> up_proj
+                    gate_up = unpacked[j]
+                    new_name = name.replace('experts.gate_up_proj_blocks', f'experts.{j}.gate_proj.weight')
+                    r[new_name] = gate_up[0::2, ...]
+                    new_name = name.replace('experts.gate_up_proj_blocks', f'experts.{j}.up_proj.weight')
+                    r[new_name] = gate_up[1::2, ...]
+
+            elif name.endswith('mlp.experts.gate_up_proj_bias'):
+                for j in range(config.num_local_experts):
+                    gate_up = t[j]
+                    new_name = name.replace('experts.gate_up_proj_bias', f'experts.{j}.gate_proj.bias')
+                    r[new_name] = gate_up[0::2]
+                    new_name = name.replace('experts.gate_up_proj_bias', f'experts.{j}.up_proj.bias')
+                    r[new_name] = gate_up[1::2]
+            elif name.endswith('mlp.experts.down_proj_blocks'):
+                unpacked = convert_moe_packed_tensors(t, state_dict[name.replace('down_proj_blocks', 'down_proj_scales')])
+                for j in range(config.num_local_experts):
+                    new_name = name.replace('experts.down_proj_blocks', f'experts.{j}.down_proj.weight')
+                    r[new_name] = unpacked[j]
+            elif name.endswith('mlp.experts.down_proj_bias'):
+                for j in range(config.num_local_experts):
+                    new_name = name.replace('experts.down_proj_bias', f'experts.{j}.down_proj.bias')
+                    r[new_name] = t[j]
+            elif name.endswith('mlp.experts.gate_up_proj_scales') or name.endswith('mlp.experts.down_proj_scales'):
+                # scales are consumed together with the matching *_blocks tensors above
+                pass
+            else:
+                r[name] = t
+
+        return r
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        MAX_LAYERS = 128
+        assert not config.tie_word_embeddings
+        assert len(config.layer_types) <= MAX_LAYERS
+        assert config.num_hidden_layers <= MAX_LAYERS
+        assert config.rope_scaling['rope_type'] == 'yarn'
+
+        dump_llama_like_config(f, config, ggml_type)
+
+        # 1 marks a sliding-window attention layer, 0 a full-attention layer
+        layer_types = [0] * MAX_LAYERS
+        for i in range(len(config.layer_types)):
+            layer_types[i] = 1 if config.layer_types[i] == 'sliding_attention' else 0
+
+        config_values = [
+            config.num_key_value_heads,
+            config.head_dim,
+            config.experts_per_token,
+            config.num_experts_per_tok,
+            config.num_local_experts,
+            config.sliding_window,
+        ] + layer_types
+        f.write(struct.pack("<" + "i" * len(config_values), *config_values))
+
+        config_values = [
+            config.router_aux_loss_coef,
+            config.swiglu_limit,
+            config.rope_theta,
+            config.rope_scaling['original_max_position_embeddings'],
+            config.rope_scaling['beta_fast'],
+            config.rope_scaling['beta_slow'],
+            config.rope_scaling['factor'],
+        ]
+        f.write(struct.pack("<" + "f" * len(config_values), *config_values))
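+        # Layout of the extra config record written above (little-endian):
+        #   int32 x (6 + MAX_LAYERS): num_key_value_heads, head_dim, experts_per_token,
+        #       num_experts_per_tok, num_local_experts, sliding_window, layer_types[0..MAX_LAYERS-1]
+        #   float32 x 7: router_aux_loss_coef, swiglu_limit, rope_theta,
+        #       original_max_position_embeddings, beta_fast, beta_slow, factor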
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = ["model.embed_tokens.weight"]
+        for i in range(config.num_hidden_layers):
+            weight_names += [
+                f"model.layers.{i}.input_layernorm.weight",
+            ]
+
+            for j in range(config.num_local_experts):
+                weight_names += [
+                    f"model.layers.{i}.mlp.experts.{j}.down_proj.weight",
+                    f"model.layers.{i}.mlp.experts.{j}.down_proj.bias",
+                    f"model.layers.{i}.mlp.experts.{j}.gate_proj.weight",
+                    f"model.layers.{i}.mlp.experts.{j}.gate_proj.bias",
+                    f"model.layers.{i}.mlp.experts.{j}.up_proj.weight",
+                    f"model.layers.{i}.mlp.experts.{j}.up_proj.bias",
+                ]
+
+            weight_names += [
+                f"model.layers.{i}.mlp.router.weight",
+                f"model.layers.{i}.mlp.router.bias",
+
+                f"model.layers.{i}.post_attention_layernorm.weight",
+                f"model.layers.{i}.self_attn.k_proj.weight",
+                f"model.layers.{i}.self_attn.k_proj.bias",
+                f"model.layers.{i}.self_attn.q_proj.weight",
+                f"model.layers.{i}.self_attn.q_proj.bias",
+                f"model.layers.{i}.self_attn.v_proj.weight",
+                f"model.layers.{i}.self_attn.v_proj.bias",
+                f"model.layers.{i}.self_attn.o_proj.weight",
+                f"model.layers.{i}.self_attn.o_proj.bias",
+                f"model.layers.{i}.self_attn.sinks",
+            ]
+
+        weight_names += [
+            "model.norm.weight",
+            "lm_head.weight",
+        ]
+
+        return weight_names
+
 def convert_grok_1_base(args, vocab, ggml_type):
     def ffn_size(emb_size, widening_factor):
         _ffn_size = int(widening_factor * emb_size) * 2 // 3
@@ -7857,6 +8018,8 @@ def main():
             PanguEmbeddedConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
         elif arch == 'JiutianForCausalLM':
             JiuTianConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+        elif arch == 'GptOssForCausalLM':
+            GptOssConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
         elif arch == 'deepseek-r1-distill-qwen3':
             QWen3Converter.MODEL_TYPE = ModelType.DeepSeek_R1_Distill_QWen3
             QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)