Commit 62accf9

working on the ssm tensors sizing
1 parent 1f55ace commit 62accf9

File tree

8 files changed: +196, -100 lines

convert_hf_to_gguf.py

Lines changed: 94 additions & 8 deletions
@@ -456,7 +456,7 @@ def load_hparams(dir_model: Path, is_mistral_format: bool):
         try:
             # for security reason, we don't allow loading remote code by default
             # if a model need remote code, we will fallback to config.json
-            config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
+            config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True).to_dict()
         except Exception as e:
             logger.warning(f"Failed to load model config from {dir_model}: {e}")
             logger.warning("Trying to load config.json instead")
@@ -7905,16 +7905,19 @@ def __init__(self, *args, **kwargs):
         self._transformer_model_class = LlamaModel

         # Nemotron-H specific parameters
-        self.n_group = self.find_hparam(["n_groups"])
-        self.d_inner = self.find_hparam(["mamba_num_heads"]) * self.find_hparam(["mamba_head_dim"])
-        self.d_head = self.find_hparam(["mamba_head_dim"])
-
-        # Store hybrid pattern for layer type determination
-        self.hybrid_pattern = self.find_hparam(["hybrid_override_pattern"])
-
+        self.n_group = self.find_hparam(["n_groups"], optional=True) or self.find_hparam(["num_groups"], optional=True) or 8
+        # Prefer explicit inner dims if present, else derive from heads
+        self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or (
+            self.find_hparam(["mamba_num_heads"]) * self.find_hparam(["mamba_head_dim"]))
+        self.d_head = self.find_hparam(["mamba_head_dim"], optional=True) or (self.d_inner // max(1, self.find_hparam(["mamba_num_heads"], optional=True) or 1))
+        self.d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
+
         # Initialize hybrid model attributes
         self.has_attention = True

+        # Determine attention layers
+        self._attn_layers = self._get_attn_layers()
+
     def set_gguf_parameters(self):
         """Override to skip Mamba2 parameter validation that doesn't apply to hybrid architecture"""
         d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
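
For reference, a minimal standalone sketch of the fallback chain above, run against an illustrative hparams dict (the key names mirror the find_hparam calls; the numbers are made up, not taken from a real Nemotron-H checkpoint):

# Illustrative only: mirrors the SSM-size fallbacks in __init__ above.
hparams = {"mamba_num_heads": 128, "mamba_head_dim": 64}  # hypothetical config

def find(keys, optional=False):
    for k in keys:
        if k in hparams:
            return hparams[k]
    if not optional:
        raise KeyError(keys)
    return None

n_group = find(["n_groups"], optional=True) or find(["num_groups"], optional=True) or 8
d_inner = find(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or (
    find(["mamba_num_heads"]) * find(["mamba_head_dim"]))
d_head = find(["mamba_head_dim"], optional=True) or (d_inner // max(1, find(["mamba_num_heads"], optional=True) or 1))
d_state = find(["state_size", "d_state"], optional=True) or 128

print(n_group, d_inner, d_head, d_state)  # 8 8192 64 128
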
@@ -7940,6 +7943,14 @@ def set_gguf_parameters(self):
         self.has_mamba = True
         self.has_mlp = True

+        # Emit layer schedule: 0=SSM, 1=ATTN, 2=FFN (default FFN none here)
+        layer_types = np.zeros((self.block_count,), dtype=np.uint8)
+        for i in self._attn_layers:
+            if 0 <= i < self.block_count:
+                layer_types[i] = 1
+        # store schedule array
+        self.gguf_writer.add_array(f"{gguf.MODEL_ARCH_NAMES[self.model_arch]}.layer_types", layer_types)
+
     def set_vocab(self):
         self._set_vocab_gpt2()

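As an illustration of the schedule written above, this is what the array looks like for a hypothetical 8-block model with attention at blocks 2 and 5 (0 = SSM, 1 = attention; the block count and positions are made up):

import numpy as np

block_count = 8       # hypothetical
attn_layers = [2, 5]  # hypothetical attention positions

layer_types = np.zeros((block_count,), dtype=np.uint8)  # everything defaults to 0 = SSM
for i in attn_layers:
    if 0 <= i < block_count:
        layer_types[i] = 1                              # mark attention blocks
print(layer_types.tolist())  # [0, 0, 1, 0, 0, 1, 0, 0]
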
@@ -7971,6 +7982,51 @@ def modify_tensors(self, data_torch, name, bid):
             # Special handling for conv1d: reshape from 3D to 2D
             if "conv1d.weight" in layer_component and len(data_torch.shape) == 3:
                 data_torch = data_torch.squeeze(1)  # Remove middle dimension: {4,1,12288} -> {4,12288}
+            # A_log -> A = -exp(A_log) and reshape from [128,1,1,1] to [1,128]
+            if layer_component.endswith("A_log"):
+                data_torch = -torch.exp(data_torch)
+                if len(data_torch.shape) == 4 and data_torch.shape[1:] == (1, 1, 1):
+                    data_torch = data_torch.reshape(1, data_torch.shape[0])  # [128,1,1,1] -> [1,128]
+            # D tensor also needs reshaping from [128,1,1,1] to [1,128]
+            if layer_component.endswith("D"):
+                if len(data_torch.shape) == 4 and data_torch.shape[1:] == (1, 1, 1):
+                    data_torch = data_torch.reshape(1, data_torch.shape[0])  # [128,1,1,1] -> [1,128]
+            # Grouped RMSNorm reshape to [actual_size/n_group, n_group]
+            if layer_component == "mixer.norm.weight":
+                actual_size = data_torch.numel()
+                data_torch = data_torch.reshape(actual_size // self.n_group, self.n_group)
+            # in_proj needs split order expected by llama.cpp mamba2 builder: [z, xBC, dt]
+            if layer_component == "mixer.in_proj.weight":
+                W = data_torch
+                # Expected logical sizes
+                d_x_part = self.d_inner + 2 * self.n_group * self.d_state
+                n_head = max(1, self.d_inner // max(1, self.d_head))
+                exp_d_in_proj = 2 * self.d_inner + 2 * self.n_group * self.d_state + n_head
+                # Detect orientation: [n_embd, d_in_proj] or [d_in_proj, n_embd]
+                if W.shape[1] == self.d_model and W.shape[0] == exp_d_in_proj:
+                    W = W.t().contiguous()
+                n_embd, d_in_proj = W.shape
+                # Validate
+                if d_in_proj < (self.d_inner + d_x_part + n_head):
+                    # Can't reliably repack; keep original mapping
+                    return [(self._map_mamba_tensor(layer_component, bid), data_torch)]
+                # Assume dt at the end
+                dt = W[:, -n_head:]
+                body = W[:, : d_in_proj - n_head]
+                # Two common packings: [z, xBC] or [xBC, z]
+                # Prefer moving z to the front: [z, xBC, dt]
+                # Heuristic: pick the split that yields xBC width == d_x_part
+                z_first = False
+                # Try xBC first
+                xbc = body[:, :d_x_part]
+                z = body[:, d_x_part : d_x_part + self.d_inner]
+                if z.shape[1] != self.d_inner:
+                    # Try z first
+                    z_first = True
+                    z = body[:, : self.d_inner]
+                    xbc = body[:, self.d_inner : self.d_inner + d_x_part]
+                repacked = torch.cat([z, xbc, dt], dim=1)
+                data_torch = repacked
         elif any(x in layer_component for x in ["q_proj", "k_proj", "v_proj", "o_proj"]):
             # Attention layer tensors
             new_name = self._map_attention_tensor(layer_component, bid)
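
To sanity-check the in_proj size arithmetic and the [z, xBC, dt] repack, here is a toy example with made-up dimensions (all sizes are illustrative; it mirrors the branch above, assuming the [xBC, z, dt] source packing that the heuristic tries first and the [n_embd, d_in_proj] orientation):

import torch

# Hypothetical toy sizes (not a real model):
d_model, d_inner, n_group, d_state, d_head = 16, 8, 2, 4, 4

n_head = d_inner // d_head                                 # 2 heads
d_x_part = d_inner + 2 * n_group * d_state                 # xBC width: 8 + 16 = 24
d_in_proj = 2 * d_inner + 2 * n_group * d_state + n_head   # 8 + 24 + 2 = 34

W = torch.randn(d_model, d_in_proj)   # already in [n_embd, d_in_proj] orientation

dt = W[:, -n_head:]                   # dt assumed to sit at the end
body = W[:, : d_in_proj - n_head]     # width 32 = d_x_part + d_inner
xbc = body[:, :d_x_part]              # try the [xBC, z] packing first
z = body[:, d_x_part : d_x_part + d_inner]

repacked = torch.cat([z, xbc, dt], dim=1)   # reorder to [z, xBC, dt]
assert repacked.shape == (d_model, d_in_proj)
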
@@ -7999,6 +8055,36 @@ def _map_mamba_tensor(self, component, bid):
             "mixer.out_proj.weight": f"blk.{bid}.ssm_out.weight",
         }
         return mapping.get(component, f"blk.{bid}.{component}")
+
+    def _get_attn_layers(self) -> list[int]:
+        # 1) explicit layer types list
+        lt = self.hparams.get("layer_types")
+        if isinstance(lt, list):
+            # support string or int types
+            attn = []
+            for i, t in enumerate(lt):
+                if isinstance(t, str) and t.lower().startswith("attn"):
+                    attn.append(i)
+                elif isinstance(t, (int, np.integer)) and int(t) == 1:
+                    attn.append(i)
+            return attn
+        # 2) indices list
+        if (idx := self.hparams.get("attn_layer_indices")):
+            return list(map(int, idx))
+        # 3) periodic schedule
+        period = self.hparams.get("attn_layer_period")
+        if period:
+            offset = int(self.hparams.get("attn_layer_offset", 0))
+            return [i for i in range(self.block_count) if i % int(period) == offset]
+        # 4) fallback: Nemotron-H 9B default or evenly spaced ~8%
+        if self.block_count == 56:
+            return [14, 21, 30, 39]
+        # evenly spaced n ~ max(1, round(0.08 * L))
+        n = max(1, round(0.08 * self.block_count))
+        if n >= self.block_count:
+            return list(range(self.block_count))
+        step = self.block_count / n
+        return sorted({int(round(k * step)) for k in range(n)} - {self.block_count})

     def _map_attention_tensor(self, component, bid):
         """Map attention layer tensor names to standard llama.cpp names"""

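The step-4 fallback can be exercised in isolation; a small sketch with hypothetical block counts (note that a 56-block model never reaches this path, since the hard-coded case above returns [14, 21, 30, 39] first):

def evenly_spaced_attn_layers(block_count: int) -> list[int]:
    # Mirrors the final fallback above: roughly 8% of blocks, spread uniformly.
    n = max(1, round(0.08 * block_count))
    if n >= block_count:
        return list(range(block_count))
    step = block_count / n
    return sorted({int(round(k * step)) for k in range(n)} - {block_count})

print(evenly_spaced_attn_layers(36))  # [0, 12, 24]
print(evenly_spaced_attn_layers(52))  # [0, 13, 26, 39]
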
src/llama-arch.cpp

Lines changed: 3 additions & 0 deletions
@@ -201,6 +201,9 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {

     { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },

+    // Nemotron-H specific
+    { LLM_KV_LAYER_TYPES, "%s.layer_types" },
+
     { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },

     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },

src/llama-arch.h

Lines changed: 3 additions & 0 deletions
@@ -240,6 +240,9 @@ enum llm_kv {

     LLM_KV_CLASSIFIER_OUTPUT_LABELS,

+    // Nemotron-H specific
+    LLM_KV_LAYER_TYPES,
+
     LLM_KV_SHORTCONV_L_CACHE,

     // deprecated:

src/llama-graph.cpp

Lines changed: 0 additions & 6 deletions
@@ -240,7 +240,6 @@ void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) {
     if (s_copy) {
         // Check if buffer was allocated - skip if not
         if (s_copy->buffer == nullptr) {
-            fprintf(stderr, "[DEBUG] RS s_copy buffer is NULL, skipping copy operations\n");
             return;
         }
         GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer));
@@ -397,16 +396,11 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
 }

 void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
-    fprintf(stderr, "[DEBUG] hybrid set_input: inp_attn=%p, inp_rs=%p\n", (void*)inp_attn.get(), (void*)inp_rs.get());
     if (inp_attn) {
         inp_attn->set_input(ubatch);
-    } else {
-        fprintf(stderr, "[ERROR] inp_attn is null!\n");
     }
     if (inp_rs) {
         inp_rs->set_input(ubatch);
-    } else {
-        fprintf(stderr, "[ERROR] inp_rs is null!\n");
     }
 }

src/llama-model-loader.cpp

Lines changed: 2 additions & 0 deletions
@@ -377,6 +377,8 @@ namespace GGUFMeta {
     }

     template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required);
+    template bool llama_model_loader::get_arr<std::vector<unsigned char>>(enum llm_kv kid, std::vector<unsigned char> & result, bool required);
+    template bool llama_model_loader::get_arr<std::vector<int32_t>>(enum llm_kv kid, std::vector<int32_t> & result, bool required);

     template<typename T>
     bool llama_model_loader::get_key(const std::string & key, T & result, bool required) {
