
Commit 40d0717

Add mamba part of plamo2

Merge commit (2 parents: e99335b + f7d51a5)

File tree: 7 files changed, +15 -4018 lines


convert_hf_to_gguf.py

Lines changed: 15 additions & 6 deletions
@@ -2371,11 +2371,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 hidden_size_per_head = self.hparams.get("hidden_size_per_head", 128)
                 d_inner = mamba_num_heads * hidden_size_per_head  # 64 * 128 = 8192
 
-                # Create tensor with correct shape {d_state, d_inner} = {64, 8192}
-                # Each row of the matrix should contain the same value from the original 1D tensor
-                new_tensor = data_torch.new_zeros((d_state, d_inner))
+                # Create tensor with correct shape {d_inner, d_state} = {8192, 64}
+                # Each column of the matrix should contain the same value from the original 1D tensor
+                new_tensor = data_torch.new_zeros((d_inner, d_state))
                 for i in range(d_state):
-                    new_tensor[i, :] = data_torch[i]  # Broadcast the single value across the inner dimension
+                    new_tensor[:, i] = data_torch[i]  # Broadcast the single value across the inner dimension
                 data_torch = new_tensor
                 logger.info(f"Expanded A tensor from {d_state} to shape: {data_torch.shape}")
             elif len(data_torch.shape) == 2:
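
The rewritten loop fills column i with the scalar data_torch[i], so every row of the new (d_inner, d_state) matrix is a copy of the original 1D tensor. A minimal standalone sketch of that equivalence, using a vectorized repeat in place of the loop (tensor names and random values are hypothetical, not from the commit):

import torch

d_state, d_inner = 64, 8192
a_1d = torch.randn(d_state)  # stand-in for the original 1D A tensor

# Loop version from the diff: column i holds a_1d[i], so every row equals a_1d
looped = a_1d.new_zeros((d_inner, d_state))
for i in range(d_state):
    looped[:, i] = a_1d[i]

# Vectorized equivalent: tile the 1D tensor across the d_inner rows
vectorized = a_1d.repeat(d_inner, 1)  # shape (8192, 64)

assert torch.equal(looped, vectorized)
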
@@ -2391,13 +2391,22 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
                 return [(new_name, data_torch)]
 
-        # Handle Mamba D tensor - ensure .weight suffix
+        # Handle Mamba D tensor - ensure .weight suffix and expand shape
         if name.endswith("mixer.D") or name.endswith("ssm.D"):
             new_name = self.map_tensor_name(name)
             # Add .weight suffix if not present
             if not new_name.endswith(".weight"):
                 new_name += ".weight"
-            logger.debug(f"D tensor ==> {new_name}")
+            logger.debug(f"D tensor ==> {new_name}, original shape: {data_torch.shape}")
+
+            # PLaMo2 D is shape {64} but llama.cpp expects {8192}
+            # Expand D to broadcast across d_inner dimension
+            if len(data_torch.shape) == 1 and data_torch.shape[0] == 64:
+                d_inner = 8192  # SSM inner size for PLaMo2
+                # Repeat D values across inner dimension
+                data_torch = data_torch.repeat(d_inner // data_torch.shape[0])
+                logger.debug(f"Expanded D tensor from 64 to shape: {data_torch.shape}")
+
             return [(new_name, data_torch)]
 
         # Handle Mamba conv1d tensor shape adjustment
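
For the D expansion, note that Tensor.repeat tiles the 64 values end to end ([d0..d63, d0..d63, ...]) rather than interleaving them. A small standalone sketch of the resulting layout (names and values hypothetical, not from the commit):

import torch

d = torch.arange(64, dtype=torch.float32)  # stand-in for the 64-element D tensor
d_inner = 8192

expanded = d.repeat(d_inner // d.shape[0])  # shape (8192,)

# Layout is tiled, not interleaved: the first two 64-blocks both equal d
assert expanded.shape == (d_inner,)
assert torch.equal(expanded[:64], d)
assert torch.equal(expanded[64:128], d)

If a consumer instead expected each value grouped contiguously ([d0, d0, ..., d63, d63]), repeat_interleave would produce that ordering; the commit uses plain tiling via repeat.
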

src/llama-arch.cpp

Lines changed: 0 additions & 1795 deletions
Large diffs are not rendered by default.
