@@ -296,9 +296,17 @@ def prepare_tensors(self):
                     break
 
             for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
+                # Debug tensor shape tracking
+                if any(x in new_name for x in ["ssm_a", "ssm_d", "ssm_conv1d.weight"]):
+                    print(f"DEBUG: Pre-numpy {new_name} torch shape: {data_torch.shape}")
+
                 # TODO: why do we squeeze here?
                 # data = data_torch.squeeze().numpy()
                 data = data_torch.numpy()
+
+                # Debug numpy shape
+                if any(x in new_name for x in ["ssm_a", "ssm_d", "ssm_conv1d.weight"]):
+                    print(f"DEBUG: Post-numpy {new_name} numpy shape: {data.shape}")
 
                 # if data ends up empty, it means data_torch was a scalar tensor -> restore
                 if len(data.shape) == 0:
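
A quick aside on the two debug prints added above: Tensor.numpy() shares the tensor's storage and keeps the dimension order, so the pre- and post-numpy shapes should always agree; the prints just make that easy to confirm for the SSM tensors. A minimal standalone sketch (not part of the patch), using a dummy tensor whose (12288, 1, 4) shape is assumed from the conv1d comments later in this diff:

    import torch

    t = torch.zeros(12288, 1, 4)      # dummy tensor; sizes assumed from the conv1d comments below
    print(t.shape)                    # torch.Size([12288, 1, 4])
    print(t.numpy().shape)            # (12288, 1, 4) -- same order, just a plain tuple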
@@ -384,6 +392,11 @@ def prepare_tensors(self):
 
                 shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
 
+                # Debug shape before and after reversal
+                if any(x in new_name for x in ["ssm_a", "ssm_d", "ssm_conv1d.weight"]):
+                    print(f"DEBUG: {new_name} raw shape: {shape}")
+                    print(f"DEBUG: {new_name} reversed: {list(reversed(shape))}")
+
                 # reverse shape to make it similar to the internal ggml dimension order
                 shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
 
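
As a sanity check on what those reversal prints will show: the converter builds the metadata string by reversing the row-major numpy shape into ggml's dimension order (fastest-varying dimension first). A minimal standalone sketch (not part of the patch), using the (12288, 4) conv1d shape discussed later in this diff as an assumed example:

    import numpy as np

    data = np.zeros((12288, 4), dtype=np.float32)   # row-major numpy/torch order
    shape_str = "{" + ", ".join(str(n) for n in reversed(data.shape)) + "}"
    print(shape_str)                                # {4, 12288} -- ggml/GGUF dimension order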
@@ -7919,6 +7932,41 @@ def __init__(self, *args, **kwargs):
         # Determine attention layers
         self._attn_layers = self._get_attn_layers()
 
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """Override Mamba2 tensor transformation with Nemotron-H specific logic"""
+
+        if name.startswith("model.backbone") or name.startswith("model.lm_head"):
+            # map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2
+            name = name.removeprefix("model.")
+
+        if name.endswith(".dt_bias"):
+            name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
+
+        new_name = self.map_tensor_name(name)
+
+        if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
+            # For conv1d weights: [12288, 1, 4] -> squeeze -> [12288, 4] -> transpose -> [4, 12288]
+            data_torch = data_torch.squeeze()  # Remove dim 1
+            if len(data_torch.shape) == 2:
+                data_torch = data_torch.t().contiguous()  # [12288, 4] -> [4, 12288]
+        elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [
+            gguf.MODEL_TENSOR.SSM_A,
+            gguf.MODEL_TENSOR.SSM_D,
+        ]):
+            # For SSM A/D: NVIDIA [128] -> llama.cpp expects [128, 1]
+            # But ensure exactly [128, 1] not [1, 128] to avoid GGML reversal issues
+            if len(data_torch.shape) == 1:  # [128]
+                data_torch = data_torch.unsqueeze(1)  # -> [128, 1] explicitly
+        elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
+            data_torch = data_torch.reshape((self.n_group, self.d_inner // self.n_group))
+
+        # Apply A_log transformation
+        if name.endswith(".A_log"):
+            logger.debug("A_log --> A ==> " + new_name)
+            data_torch = -torch.exp(data_torch)
+
+        yield (new_name, data_torch)
+
     def set_gguf_parameters(self):
         """Override to skip Mamba2 parameter validation that doesn't apply to hybrid architecture"""
         d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
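
A shape-only sketch (not part of the patch) of the transforms in the modify_tensors override above, using dummy tensors with the sizes quoted in its comments; 12288 inner channels, kernel size 4, and 128 heads are assumptions taken from those comments, not read from a checkpoint:

    import torch

    conv1d = torch.zeros(12288, 1, 4)
    conv1d = conv1d.squeeze()             # -> torch.Size([12288, 4])
    conv1d = conv1d.t().contiguous()      # -> torch.Size([4, 12288])

    a_log = torch.zeros(128)
    a = -torch.exp(a_log)                 # A_log -> A (every entry becomes -1.0 here)
    a = a.unsqueeze(1)                    # -> torch.Size([128, 1])
    print(conv1d.shape, a.shape)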
@@ -7983,27 +8031,34 @@ def modify_tensors(self, data_torch, name, bid):
             # NVIDIA GROUND TRUTH TENSOR TRANSFORMATIONS
 
             # Conv1d: NVIDIA [12288, 1, 4] -> llama.cpp [4, 12288]
+            # IMPORTANT: GGUF reverses dimensions, so we need [12288, 4] to get {4, 12288} in metadata
             if "conv1d.weight" in layer_component:
                 original_shape = data_torch.shape
                 if len(data_torch.shape) == 3:  # [12288, 1, 4]
-                    # Remove middle dimension and transpose: [12288, 1, 4] -> [12288, 4] -> [4, 12288]
-                    data_torch = data_torch.squeeze(1).t().contiguous()  # -> [4, 12288]
+                    # Remove middle dimension: [12288, 1, 4] -> [12288, 4] (no transpose for GGUF reversal)
+                    data_torch = data_torch.squeeze(1).contiguous()  # -> [12288, 4]
                 elif len(data_torch.shape) == 2:  # [12288, 4]
-                    data_torch = data_torch.t().contiguous()  # [12288, 4] -> [4, 12288]
-                # Ensure final shape is exactly [4, 12288]
-                assert data_torch.shape == (4, 12288), f"Conv1d wrong final shape: {data_torch.shape}"
+                    data_torch = data_torch.contiguous()  # Keep [12288, 4] (no transpose for GGUF reversal)
+                # Ensure final shape is exactly [12288, 4] (will become {4, 12288} after GGUF reversal)
+                assert data_torch.shape == (12288, 4), f"Conv1d wrong final shape: {data_torch.shape}"
                 print(f"DEBUG: Conv1d {layer_component} {original_shape} -> {data_torch.shape}")
 
             # A_log: NVIDIA [128] -> llama.cpp [128, 1] with -exp transform
+            # IMPORTANT: GGUF reverses dimensions, so we need [1, 128] to get {128, 1} in metadata
             if layer_component.endswith("A_log"):
+                original_shape = data_torch.shape
                 data_torch = -torch.exp(data_torch)  # Apply -exp transformation
                 if len(data_torch.shape) == 1:  # [128]
-                    data_torch = data_torch.reshape(128, 1)  # -> [128, 1] explicitly
+                    data_torch = data_torch.reshape(1, 128)  # -> [1, 128] for GGUF reversal
+                print(f"DEBUG: A_log {layer_component} {original_shape} -> {data_torch.shape}")
 
             # D: NVIDIA [128] -> llama.cpp [128, 1]
+            # IMPORTANT: GGUF reverses dimensions, so we need [1, 128] to get {128, 1} in metadata
             if layer_component.endswith("D"):
+                original_shape = data_torch.shape
                 if len(data_torch.shape) == 1:  # [128]
-                    data_torch = data_torch.reshape(128, 1)  # -> [128, 1] explicitly
+                    data_torch = data_torch.reshape(1, 128)  # -> [1, 128] for GGUF reversal
+                print(f"DEBUG: D {layer_component} {original_shape} -> {data_torch.shape}")
 
             # Grouped RMSNorm: NVIDIA [10240] -> llama.cpp [1280, 8]
             if layer_component == "mixer.norm.weight":
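
To see why this hunk switches the 1-D SSM tensors from reshape(128, 1) to reshape(1, 128), here is a small standalone sketch (not part of the patch) of the metadata string each variant produces once the converter applies reversed(shape); 128 is the head count assumed in the comments above:

    import torch

    d = torch.zeros(128)
    for t in (d.reshape(128, 1), d.reshape(1, 128)):
        print("{" + ", ".join(str(n) for n in reversed(t.shape)) + "}")
    # {1, 128}   <- old reshape(128, 1)
    # {128, 1}   <- new reshape(1, 128), the layout the comments above target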
@@ -8052,6 +8107,9 @@ def modify_tensors(self, data_torch, name, bid):
                 # Fallback to default mapping
                 return super().modify_tensors(data_torch, name, bid)
 
+            # Debug: verify final tensor shape before returning (accounting for GGUF reversal)
+            if any(x in layer_component for x in ["A_log", "D", "conv1d.weight"]):
+                print(f"DEBUG: Final tensor {new_name} shape: {data_torch.shape} (will reverse to GGUF metadata)")
             return [(new_name, data_torch)]
 
         # Default to parent processing
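
For completeness, the grouped RMSNorm reshape referenced in both versions of modify_tensors follows the same pattern; a shape-only sketch (not part of the patch), assuming n_group = 8 and d_inner = 10240 as implied by the [10240] -> [1280, 8] comment:

    import torch

    n_group, d_inner = 8, 10240                   # assumed from the [10240] -> [1280, 8] comment
    w = torch.zeros(d_inner)
    w = w.reshape(n_group, d_inner // n_group)    # -> torch.Size([8, 1280])
    print("{" + ", ".join(str(n) for n in reversed(w.shape)) + "}")   # {1280, 8}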