Commit 8fcfba1

Add more mappings for mamba layers in plamo2
1 parent: 6b54dc8

2 files changed (+7 -19 lines)

convert_hf_to_gguf.py

19 deletions, 0 additions:

@@ -2191,25 +2191,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
 class Plamo2Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.PLAMO2

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        # Add custom mappings for Plamo2's unique structure
-        # Plamo2 uses "mixer" for Mamba layers instead of standard attention
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
-
-        # Add Mamba-specific mappings
-        for i in range(self.block_count):
-            # SSM/Mamba tensors
-            tensor_map[f"model.layers.{i}.mixer.in_proj"] = f"blk.{i}.ssm_in"
-            tensor_map[f"model.layers.{i}.mixer.conv1d"] = f"blk.{i}.ssm_conv1d"
-            tensor_map[f"model.layers.{i}.mixer.x_proj"] = f"blk.{i}.ssm_x"
-            tensor_map[f"model.layers.{i}.mixer.dt_proj"] = f"blk.{i}.ssm_dt"
-            tensor_map[f"model.layers.{i}.mixer.A_log"] = f"blk.{i}.ssm_a"
-            tensor_map[f"model.layers.{i}.mixer.D"] = f"blk.{i}.ssm_d"
-            tensor_map[f"model.layers.{i}.mixer.out_proj"] = f"blk.{i}.ssm_out"
-
-        self.tensor_map = tensor_map

     def set_vocab(self):
         # Plamo2 uses sentencepiece tokenizer similar to Llama
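With the override removed, plamo2 conversion relies entirely on the shared map that gguf.get_tensor_name_map builds (both names appear in the deleted code above). A minimal sketch of the lookup this enables, assuming this commit's gguf-py with the PLAMO2 arch registered and its tensor list including the SSM entries; the try_suffixes parameter follows my reading of TensorNameMap.get_name in tensor_mapping.py, and the printed name is expected, not verified output:

    import gguf

    # Build the shared name map for plamo2 (block count is illustrative).
    tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.PLAMO2, 32)

    # After this commit the "mixer" names resolve with no per-model hack;
    # get_name tries the bare key first, then retries with each suffix
    # stripped and re-appends it to the mapped result.
    name = tensor_map.get_name("model.layers.0.mixer.in_proj.weight",
                               try_suffixes=(".weight", ".bias"))
    print(name)  # expected: blk.0.ssm_in.weight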

gguf-py/gguf/tensor_mapping.py

7 additions, 0 deletions:

@@ -464,36 +464,43 @@ class TensorNameMap:
         MODEL_TENSOR.SSM_IN: (
             "model.layers.{bid}.in_proj",
             "backbone.layers.{bid}.mixer.in_proj",
+            "model.layers.{bid}.mixer.in_proj", # plamo2
         ),

         MODEL_TENSOR.SSM_CONV1D: (
             "model.layers.{bid}.conv1d",
             "backbone.layers.{bid}.mixer.conv1d",
+            "model.layers.{bid}.mixer.conv1d", # plamo2
         ),

         MODEL_TENSOR.SSM_X: (
             "model.layers.{bid}.x_proj",
             "backbone.layers.{bid}.mixer.x_proj",
+            "model.layers.{bid}.mixer.x_proj", # plamo2
         ),

         MODEL_TENSOR.SSM_DT: (
             "model.layers.{bid}.dt_proj",
             "backbone.layers.{bid}.mixer.dt_proj",
+            "model.layers.{bid}.mixer.dt_proj", # plamo2
         ),

         MODEL_TENSOR.SSM_A: (
             "model.layers.{bid}.A_log",
             "backbone.layers.{bid}.mixer.A_log",
+            "model.layers.{bid}.mixer.A_log", # plamo2
         ),

         MODEL_TENSOR.SSM_D: (
             "model.layers.{bid}.D",
             "backbone.layers.{bid}.mixer.D",
+            "model.layers.{bid}.mixer.D", # plamo2
         ),

         MODEL_TENSOR.SSM_OUT: (
             "model.layers.{bid}.out_proj",
             "backbone.layers.{bid}.mixer.out_proj",
+            "model.layers.{bid}.mixer.out_proj", # plamo2
         ),

         MODEL_TENSOR.TIME_MIX_W0: (
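For context, TensorNameMap expands these "{bid}" templates once per block index when it is constructed, so each new tuple entry fans out across every layer of the model. A self-contained sketch of that expansion under illustrative names (the two-tensor table is an excerpt standing in for the real constants, and build_mapping is a hypothetical helper mirroring what I understand TensorNameMap.__init__ to do):

    # Sketch: how "{bid}" templates become concrete per-block mappings.
    GGUF_NAMES = {
        "SSM_IN":  "blk.{bid}.ssm_in",
        "SSM_OUT": "blk.{bid}.ssm_out",
    }
    HF_TEMPLATES = {
        "SSM_IN": (
            "model.layers.{bid}.in_proj",
            "backbone.layers.{bid}.mixer.in_proj",
            "model.layers.{bid}.mixer.in_proj",   # plamo2, added here
        ),
        "SSM_OUT": (
            "model.layers.{bid}.out_proj",
            "backbone.layers.{bid}.mixer.out_proj",
            "model.layers.{bid}.mixer.out_proj",  # plamo2, added here
        ),
    }

    def build_mapping(n_blocks: int) -> dict[str, str]:
        """Expand every template for every block index."""
        mapping = {}
        for tensor, templates in HF_TEMPLATES.items():
            for bid in range(n_blocks):
                target = GGUF_NAMES[tensor].format(bid=bid)
                for tmpl in templates:
                    mapping[tmpl.format(bid=bid)] = target
        return mapping

    m = build_mapping(2)
    assert m["model.layers.1.mixer.in_proj"] == "blk.1.ssm_in"
    assert m["model.layers.0.mixer.out_proj"] == "blk.0.ssm_out"

Because the expansion is table-driven, adding the "mixer" spellings here covers all plamo2 layers at once, which is what makes the hand-rolled loop deleted from convert_hf_to_gguf.py redundant.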
