@@ -2371,11 +2371,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             hidden_size_per_head = self.hparams.get("hidden_size_per_head", 128)
             d_inner = mamba_num_heads * hidden_size_per_head  # 64 * 128 = 8192

-            # Create tensor with correct shape {d_state, d_inner} = {64, 8192}
-            # Each row of the matrix should contain the same value from the original 1D tensor
-            new_tensor = data_torch.new_zeros((d_state, d_inner))
+            # Create tensor with correct shape {d_inner, d_state} = {8192, 64}
+            # Each column of the matrix should contain the same value from the original 1D tensor
+            new_tensor = data_torch.new_zeros((d_inner, d_state))
             for i in range(d_state):
-                new_tensor[i, :] = data_torch[i]  # Broadcast the single value across the inner dimension
+                new_tensor[:, i] = data_torch[i]  # Broadcast the single value across the inner dimension
             data_torch = new_tensor
             logger.info(f"Expanded A tensor from {d_state} to shape: {data_torch.shape}")
         elif len(data_torch.shape) == 2:
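For reference, a minimal standalone sketch of what the corrected A-tensor expansion does, using the sizes from the comments above (`d_state = 64`, `d_inner = 8192`). The loop form matches the patch; the `expand` line is an equivalent vectorized formulation, not what the patch itself uses:

```python
import torch

d_state, d_inner = 64, 8192
a_1d = torch.randn(d_state)  # stands in for the original 1D A tensor, shape {64}

# Loop form as in the patch: column i holds a_1d[i] in every row
expanded = a_1d.new_zeros((d_inner, d_state))
for i in range(d_state):
    expanded[:, i] = a_1d[i]

# Equivalent broadcast: view a_1d as a (1, d_state) row and expand it down d_inner rows
assert torch.equal(expanded, a_1d.unsqueeze(0).expand(d_inner, d_state))
```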
@@ -2391,13 +2391,22 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

             return [(new_name, data_torch)]

-        # Handle Mamba D tensor - ensure .weight suffix
+        # Handle Mamba D tensor - ensure .weight suffix and expand shape
         if name.endswith("mixer.D") or name.endswith("ssm.D"):
             new_name = self.map_tensor_name(name)
             # Add .weight suffix if not present
             if not new_name.endswith(".weight"):
                 new_name += ".weight"
-            logger.debug(f"D tensor ==> {new_name}")
+            logger.debug(f"D tensor ==> {new_name}, original shape: {data_torch.shape}")
+
+            # PLaMo2 D is shape {64} but llama.cpp expects {8192}
+            # Expand D to broadcast across d_inner dimension
+            if len(data_torch.shape) == 1 and data_torch.shape[0] == 64:
+                d_inner = 8192  # SSM inner size for PLaMo2
+                # Repeat D values across inner dimension
+                data_torch = data_torch.repeat(d_inner // data_torch.shape[0])
+                logger.debug(f"Expanded D tensor from 64 to shape: {data_torch.shape}")
+
             return [(new_name, data_torch)]

         # Handle Mamba conv1d tensor shape adjustment
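Likewise, a small hypothetical sketch (not part of the patch) showing what `Tensor.repeat` produces in the D expansion: on a 1D tensor it tiles the whole 64-value sequence end to end, here 8192 // 64 = 128 times:

```python
import torch

d_inner = 8192
d_1d = torch.arange(64, dtype=torch.float32)  # stands in for the original D tensor, shape {64}

# Tile the full sequence: result is [d_0..d_63, d_0..d_63, ...] with 128 copies
expanded = d_1d.repeat(d_inner // d_1d.shape[0])

assert expanded.shape == (d_inner,)
assert torch.equal(expanded[:64], d_1d) and torch.equal(expanded[8128:], d_1d)
```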