@@ -456,7 +456,7 @@ def load_hparams(dir_model: Path, is_mistral_format: bool):
         try:
             # for security reason, we don't allow loading remote code by default
             # if a model need remote code, we will fallback to config.json
-            config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
+            config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True).to_dict()
         except Exception as e:
             logger.warning(f"Failed to load model config from {dir_model}: {e}")
             logger.warning("Trying to load config.json instead")
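Note: per the warning messages, the `except` branch falls back to reading `config.json` straight from disk. A minimal sketch of that fallback, assuming nothing beyond a plain JSON read; `load_config_fallback` is a hypothetical helper, not the upstream function:

```python
import json
from pathlib import Path

def load_config_fallback(dir_model: Path) -> dict:
    # Plain JSON read of the model's config.json; no remote code runs on this path.
    with open(dir_model / "config.json", "r", encoding="utf-8") as f:
        return json.load(f)
```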
@@ -7905,16 +7905,19 @@ def __init__(self, *args, **kwargs):
         self._transformer_model_class = LlamaModel
 
         # Nemotron-H specific parameters
-        self.n_group = self.find_hparam(["n_groups"])
-        self.d_inner = self.find_hparam(["mamba_num_heads"]) * self.find_hparam(["mamba_head_dim"])
-        self.d_head = self.find_hparam(["mamba_head_dim"])
-
-        # Store hybrid pattern for layer type determination
-        self.hybrid_pattern = self.find_hparam(["hybrid_override_pattern"])
-
+        self.n_group = self.find_hparam(["n_groups"], optional=True) or self.find_hparam(["num_groups"], optional=True) or 8
+        # Prefer explicit inner dims if present, else derive from heads
+        self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or (
+            self.find_hparam(["mamba_num_heads"]) * self.find_hparam(["mamba_head_dim"]))
+        self.d_head = self.find_hparam(["mamba_head_dim"], optional=True) or (self.d_inner // max(1, self.find_hparam(["mamba_num_heads"], optional=True) or 1))
+        self.d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
+
         # Initialize hybrid model attributes
         self.has_attention = True
 
+        # Determine attention layers
+        self._attn_layers = self._get_attn_layers()
+
     def set_gguf_parameters(self):
         """Override to skip Mamba2 parameter validation that doesn't apply to hybrid architecture"""
         d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
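The hparam fallback chains above work because `find_hparam(..., optional=True)` yields `None` for a missing key, letting `or` move on to the next candidate or the hard-coded default. A self-contained illustration with a stand-in helper and invented config values (not taken from any real Nemotron-H config):

```python
def find_hparam(hparams: dict, keys: list, optional: bool = False):
    # Stand-in for ModelBase.find_hparam: first present key wins, else None or KeyError.
    for k in keys:
        if k in hparams:
            return hparams[k]
    if optional:
        return None
    raise KeyError(f"none of {keys} found")

hparams = {"mamba_num_heads": 128, "mamba_head_dim": 64}  # no n_groups / d_inner keys

n_group = (find_hparam(hparams, ["n_groups"], optional=True)
           or find_hparam(hparams, ["num_groups"], optional=True) or 8)
d_inner = (find_hparam(hparams, ["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True)
           or find_hparam(hparams, ["mamba_num_heads"]) * find_hparam(hparams, ["mamba_head_dim"]))
print(n_group, d_inner)  # -> 8 8192
```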
@@ -7940,6 +7943,14 @@ def set_gguf_parameters(self):
         self.has_mamba = True
         self.has_mlp = True
 
+        # Emit layer schedule: 0=SSM, 1=ATTN, 2=FFN (default FFN none here)
+        layer_types = np.zeros((self.block_count,), dtype=np.uint8)
+        for i in self._attn_layers:
+            if 0 <= i < self.block_count:
+                layer_types[i] = 1
+        # store schedule array
+        self.gguf_writer.add_array(f"{gguf.MODEL_ARCH_NAMES[self.model_arch]}.layer_types", layer_types)
+
     def set_vocab(self):
         self._set_vocab_gpt2()
 
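For reference, the schedule written by `add_array` above is just a per-block flag vector (0 = SSM, 1 = attention). A tiny example with made-up dimensions:

```python
import numpy as np

block_count = 8          # hypothetical depth
attn_layers = [2, 5]     # hypothetical attention positions

layer_types = np.zeros((block_count,), dtype=np.uint8)
for i in attn_layers:
    if 0 <= i < block_count:
        layer_types[i] = 1

print(layer_types.tolist())  # -> [0, 0, 1, 0, 0, 1, 0, 0]
```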
@@ -7971,6 +7982,51 @@ def modify_tensors(self, data_torch, name, bid):
             # Special handling for conv1d: reshape from 3D to 2D
             if "conv1d.weight" in layer_component and len(data_torch.shape) == 3:
                 data_torch = data_torch.squeeze(1)  # Remove middle dimension: {4,1,12288} -> {4,12288}
+            # A_log -> A = -exp(A_log) and reshape from [128,1,1,1] to [1,128]
+            if layer_component.endswith("A_log"):
+                data_torch = -torch.exp(data_torch)
+                if len(data_torch.shape) == 4 and data_torch.shape[1:] == (1, 1, 1):
+                    data_torch = data_torch.reshape(1, data_torch.shape[0])  # [128,1,1,1] -> [1,128]
+            # D tensor also needs reshaping from [128,1,1,1] to [1,128]
+            if layer_component.endswith("D"):
+                if len(data_torch.shape) == 4 and data_torch.shape[1:] == (1, 1, 1):
+                    data_torch = data_torch.reshape(1, data_torch.shape[0])  # [128,1,1,1] -> [1,128]
+            # Grouped RMSNorm reshape to [actual_size/n_group, n_group]
+            if layer_component == "mixer.norm.weight":
+                actual_size = data_torch.numel()
+                data_torch = data_torch.reshape(actual_size // self.n_group, self.n_group)
+            # in_proj needs split order expected by llama.cpp mamba2 builder: [z, xBC, dt]
+            if layer_component == "mixer.in_proj.weight":
+                W = data_torch
+                # Expected logical sizes
+                d_x_part = self.d_inner + 2 * self.n_group * self.d_state
+                n_head = max(1, self.d_inner // max(1, self.d_head))
+                exp_d_in_proj = 2 * self.d_inner + 2 * self.n_group * self.d_state + n_head
+                # Detect orientation: [n_embd, d_in_proj] or [d_in_proj, n_embd]
+                if W.shape[1] == self.d_model and W.shape[0] == exp_d_in_proj:
+                    W = W.t().contiguous()
+                n_embd, d_in_proj = W.shape
+                # Validate
+                if d_in_proj < (self.d_inner + d_x_part + n_head):
+                    # Can't reliably repack; keep original mapping
+                    return [(self._map_mamba_tensor(layer_component, bid), data_torch)]
+                # Assume dt at the end
+                dt = W[:, -n_head:]
+                body = W[:, :d_in_proj - n_head]
+                # Two common packings: [z, xBC] or [xBC, z]
+                # Prefer moving z to the front: [z, xBC, dt]
+                # Heuristic: pick the split that yields xBC width == d_x_part
+                z_first = False
+                # Try xBC first
+                xbc = body[:, :d_x_part]
+                z = body[:, d_x_part:d_x_part + self.d_inner]
+                if z.shape[1] != self.d_inner:
+                    # Try z first
+                    z_first = True
+                    z = body[:, :self.d_inner]
+                    xbc = body[:, self.d_inner:self.d_inner + d_x_part]
+                repacked = torch.cat([z, xbc, dt], dim=1)
+                data_torch = repacked
         elif any(x in layer_component for x in ["q_proj", "k_proj", "v_proj", "o_proj"]):
             # Attention layer tensors
             new_name = self._map_attention_tensor(layer_component, bid)
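The in_proj repack above is mostly column bookkeeping; with small invented dimensions it is easy to check that the [xBC, z, dt] -> [z, xBC, dt] reorder preserves the total width. A toy sketch, not the converter code itself:

```python
import torch

d_inner, n_group, d_state, d_head, n_embd = 8, 2, 4, 4, 6
n_head = d_inner // d_head                                  # 2 heads
d_x_part = d_inner + 2 * n_group * d_state                  # x + B + C = 24 columns
d_in_proj = 2 * d_inner + 2 * n_group * d_state + n_head    # z + xBC + dt = 34 columns

# Pretend checkpoint layout: [xBC, z, dt] along the output dimension
W = torch.arange(n_embd * d_in_proj, dtype=torch.float32).reshape(n_embd, d_in_proj)
xbc = W[:, :d_x_part]
z = W[:, d_x_part:d_x_part + d_inner]
dt = W[:, -n_head:]

repacked = torch.cat([z, xbc, dt], dim=1)   # [z, xBC, dt] order the mamba2 builder expects
assert repacked.shape == (n_embd, d_in_proj)
```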
@@ -7999,6 +8055,36 @@ def _map_mamba_tensor(self, component, bid):
             "mixer.out_proj.weight": f"blk.{bid}.ssm_out.weight",
         }
         return mapping.get(component, f"blk.{bid}.{component}")
+
+    def _get_attn_layers(self) -> list[int]:
+        # 1) explicit layer types list
+        lt = self.hparams.get("layer_types")
+        if isinstance(lt, list):
+            # support string or int types
+            attn = []
+            for i, t in enumerate(lt):
+                if isinstance(t, str) and t.lower().startswith("attn"):
+                    attn.append(i)
+                elif isinstance(t, (int, np.integer)) and int(t) == 1:
+                    attn.append(i)
+            return attn
+        # 2) indices list
+        if (idx := self.hparams.get("attn_layer_indices")):
+            return list(map(int, idx))
+        # 3) periodic schedule
+        period = self.hparams.get("attn_layer_period")
+        if period:
+            offset = int(self.hparams.get("attn_layer_offset", 0))
+            return [i for i in range(self.block_count) if i % int(period) == offset]
+        # 4) fallback: Nemotron-H 9B default or evenly spaced ~8%
+        if self.block_count == 56:
+            return [14, 21, 30, 39]
+        # evenly spaced n ~ max(1, round(0.08 * L))
+        n = max(1, round(0.08 * self.block_count))
+        if n >= self.block_count:
+            return list(range(self.block_count))
+        step = self.block_count / n
+        return sorted({int(round(k * step)) for k in range(n)} - {self.block_count})
 
     def _map_attention_tensor(self, component, bid):
         """Map attention layer tensor names to standard llama.cpp names"""
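The fallback branch of `_get_attn_layers` is the part worth sanity-checking; restated standalone with example depths only, using the same arithmetic as the hunk above:

```python
def fallback_attn_layers(block_count: int) -> list:
    # Mirrors branch 4 of _get_attn_layers above.
    if block_count == 56:
        return [14, 21, 30, 39]
    n = max(1, round(0.08 * block_count))
    if n >= block_count:
        return list(range(block_count))
    step = block_count / n
    return sorted({int(round(k * step)) for k in range(n)} - {block_count})

print(fallback_attn_layers(56))  # -> [14, 21, 30, 39]
print(fallback_attn_layers(36))  # -> [0, 12, 24]
```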