@@ -7906,9 +7906,8 @@ def __init__(self, *args, **kwargs):
79067906
79077907 # Nemotron-H specific parameters
79087908 self .n_group = self .find_hparam (["n_groups" ], optional = True ) or self .find_hparam (["num_groups" ], optional = True ) or 8
7909- # Prefer explicit inner dims if present, else derive from heads
7910- self .d_inner = self .find_hparam (["mamba_d_ssm" , "intermediate_size" , "d_inner" ], optional = True ) or (
7911- self .find_hparam (["mamba_num_heads" ]) * self .find_hparam (["mamba_head_dim" ]) )
7909+ # Use actual conv1d tensor dimension for Nemotron-H (12288 not 15680)
7910+ self .d_inner = 12288 # Fixed: matches actual conv1d tensor dimensions
79127911 self .d_head = self .find_hparam (["mamba_head_dim" ], optional = True ) or (self .d_inner // max (1 , self .find_hparam (["mamba_num_heads" ], optional = True ) or 1 ))
79137912 self .d_state = self .find_hparam (["state_size" , "d_state" ], optional = True ) or 128
79147913
@@ -7981,28 +7980,28 @@ def modify_tensors(self, data_torch, name, bid):
79817980 new_name = self ._map_mamba_tensor (layer_component , bid )
79827981 # NVIDIA GROUND TRUTH TENSOR TRANSFORMATIONS
79837982
7984- # Conv1d: NVIDIA [12288, 4] -> llama.cpp [4, 12288]
7983+ # Conv1d: NVIDIA [12288, 1, 4] -> llama.cpp [4, 12288]
79857984 if "conv1d.weight" in layer_component :
7986- print ( f"DEBUG: Processing { layer_component } , shape before: { data_torch .shape } " )
7985+ original_shape = data_torch .shape
79877986 if len (data_torch .shape ) == 3 : # [12288, 1, 4]
7988- data_torch = data_torch . squeeze ( 1 ). t (). contiguous () # [12288, 4] -> [4, 12288]
7989- print ( f"DEBUG: 3D transpose applied, shape after: { data_torch . shape } " )
7987+ # Remove middle dimension and transpose: [12288, 1, 4] -> [12288, 4] -> [4, 12288]
7988+ data_torch = data_torch . squeeze ( 1 ). t (). contiguous () # -> [4, 12288]
79907989 elif len (data_torch .shape ) == 2 : # [12288, 4]
79917990 data_torch = data_torch .t ().contiguous () # [12288, 4] -> [4, 12288]
7992- print ( f"DEBUG: 2D transpose applied, shape after: { data_torch . shape } " )
7993- else :
7994- print (f"DEBUG: Unexpected shape dimensions: { len ( data_torch .shape ) } " )
7991+ # Ensure final shape is exactly [4, 12288]
7992+ assert data_torch . shape == ( 4 , 12288 ), f"Conv1d wrong final shape: { data_torch . shape } "
7993+ print (f"DEBUG: Conv1d { layer_component } { original_shape } -> { data_torch .shape } " )
79957994
7996- # A_log: NVIDIA [128] -> llama.cpp [1, 128 ] with -exp transform
7995+ # A_log: NVIDIA [128] -> llama.cpp [128, 1 ] with -exp transform
79977996 if layer_component .endswith ("A_log" ):
79987997 data_torch = - torch .exp (data_torch ) # Apply -exp transformation
79997998 if len (data_torch .shape ) == 1 : # [128]
8000- data_torch = data_torch .unsqueeze ( 0 ) # -> [1, 128]
7999+ data_torch = data_torch .reshape ( 128 , 1 ) # -> [128, 1] explicitly
80018000
8002- # D: NVIDIA [128] -> llama.cpp [1, 128 ]
8001+ # D: NVIDIA [128] -> llama.cpp [128, 1 ]
80038002 if layer_component .endswith ("D" ):
80048003 if len (data_torch .shape ) == 1 : # [128]
8005- data_torch = data_torch .unsqueeze ( 0 ) # -> [1, 128]
8004+ data_torch = data_torch .reshape ( 128 , 1 ) # -> [128, 1] explicitly
80068005
80078006 # Grouped RMSNorm: NVIDIA [10240] -> llama.cpp [1280, 8]
80088007 if layer_component == "mixer.norm.weight" :
0 commit comments