Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
cce8cb1
attempt at implementing nemotron_h architecture.
jwjohns Aug 23, 2025
423d890
fix(nemotron-h): Fix KV cache over-allocation for hybrid architecture
jwjohns Aug 23, 2025
f1acd11
update
jwjohns Aug 24, 2025
175d60e
Merge branch 'ggml-org:master' into feature/nemotron-h-support-working
jwjohns Aug 24, 2025
3a99e79
Merge branch 'ggml-org:master' into feature/nemotron-h-support-working
jwjohns Aug 24, 2025
1f55ace
Merge branch 'ggml-org:master' into feature/nemotron-h-support-working
jwjohns Aug 25, 2025
62accf9
working on the ssm tensors sizing
jwjohns Aug 25, 2025
cc9b929
still isnt working though progress is being made
jwjohns Aug 25, 2025
36dc3eb
fix nemotron-h tensor dimensions and gguf conversion
jwjohns Aug 25, 2025
657903a
Merge branch 'ggml-org:master' into feature/nemotron-h-support-working
jwjohns Aug 25, 2025
3df06e6
cleanup docs
jwjohns Aug 25, 2025
154459a
Merge branch 'ggml-org:master' into feature/nemotron-h-support-working
jwjohns Aug 25, 2025
ca4c978
resolving tensor dimensions
jwjohns Aug 25, 2025
a556953
Merge branch 'ggml-org:master' into feature/nemotron-h-support-working
jwjohns Aug 25, 2025
e2b0dda
implement a custom tensor creation
jwjohns Aug 25, 2025
0d9725c
update shapes to nvidia safetensors ground truth
jwjohns Aug 26, 2025
3efbb74
code review cleanup
jwjohns Aug 26, 2025
743681b
Merge branch 'ggml-org:master' into feature/nemotron-h-support-working
jwjohns Aug 27, 2025
bfc234d
convert_hf_to_gguf.py
jwjohns Aug 27, 2025
2ebaa43
cleanup debug logs and hardcoded portions
jwjohns Aug 27, 2025
497d73b
cleanup
jwjohns Aug 27, 2025
7c668fd
Applying the SSM_SCAN fix for n_groups > 1
jwjohns Aug 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
232 changes: 231 additions & 1 deletion convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,9 +296,17 @@ def prepare_tensors(self):
break

for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
# Debug tensor shape tracking
if any(x in new_name for x in ["ssm_a", "ssm_d", "ssm_conv1d.weight"]):
print(f"DEBUG: Pre-numpy {new_name} torch shape: {data_torch.shape}")

# TODO: why do we squeeze here?
# data = data_torch.squeeze().numpy()
data = data_torch.numpy()

# Debug numpy shape
if any(x in new_name for x in ["ssm_a", "ssm_d", "ssm_conv1d.weight"]):
print(f"DEBUG: Post-numpy {new_name} numpy shape: {data.shape}")

# if data ends up empty, it means data_torch was a scalar tensor -> restore
if len(data.shape) == 0:
Expand Down Expand Up @@ -384,6 +392,11 @@ def prepare_tensors(self):

shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape

# Debug shape before and after reversal
if any(x in new_name for x in ["ssm_a", "ssm_d", "ssm_conv1d.weight"]):
print(f"DEBUG: {new_name} raw shape: {shape}")
print(f"DEBUG: {new_name} reversed: {list(reversed(shape))}")

# reverse shape to make it similar to the internal ggml dimension order
shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"

Expand Down Expand Up @@ -456,7 +469,7 @@ def load_hparams(dir_model: Path, is_mistral_format: bool):
try:
# for security reason, we don't allow loading remote code by default
# if a model need remote code, we will fallback to config.json
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True).to_dict()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I got tired of typing it. Temporary.

except Exception as e:
logger.warning(f"Failed to load model config from {dir_model}: {e}")
logger.warning("Trying to load config.json instead")
Expand Down Expand Up @@ -7894,6 +7907,223 @@ def set_gguf_parameters(self):
self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))


@ModelBase.register("NemotronHForCausalLM")
class NemotronHModel(Mamba2Model):
"""Nemotron-H is a hybrid SSM + Attention model with Mamba2 layers and attention layers"""
model_arch = gguf.MODEL_ARCH.NEMOTRON_H

def __init__(self, *args, **kwargs):
# Initialize the base Mamba2Model
super().__init__(*args, **kwargs)

# Nemotron-H specific parameters with Gabe's fixes
self.n_group = self.find_hparam(["n_groups"], optional=True) or self.find_hparam(["num_groups"], optional=True) or 8
# Use actual conv1d tensor dimension for Nemotron-H (12288 not 15680)
self.d_inner = 12288 # Fixed: matches actual conv1d tensor dimensions
self.d_head = self.find_hparam(["mamba_head_dim"], optional=True) or (self.d_inner // max(1, self.find_hparam(["mamba_num_heads"], optional=True) or 1))
self.d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128

# Initialize hybrid model attributes
self.has_attention = True
self._attn_layers = self._get_attn_layers()

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
"""Consolidated Nemotron-H tensor transformation with Gabe's fixes applied"""

# Handle backbone prefix mapping
if name.startswith("model.backbone") or name.startswith("model.lm_head"):
name = name.removeprefix("model.")

# Handle token embeddings and output tensors
if "backbone.embeddings.weight" in name:
yield (self.map_tensor_name("token_embd.weight"), data_torch)
return
elif "backbone.norm.weight" in name:
yield (self.map_tensor_name("output_norm.weight"), data_torch)
return
elif "backbone.lm_head.weight" in name:
yield (self.map_tensor_name("output.weight"), data_torch)
return

# Handle layer-specific tensors with improved logic
if "backbone.layers." in name and bid is not None:
parts = name.split(".")
if len(parts) >= 4:
layer_component = ".".join(parts[3:])

# Detect and map layer types
if layer_component == "norm.weight":
new_name = f"blk.{bid}.attn_norm.weight"
elif any(x in layer_component for x in ["A_log", "D", "conv1d", "dt_bias", "in_proj", "mixer.norm", "out_proj"]):
new_name = self._map_mamba_tensor(layer_component, bid)

# Apply Gabe's tensor transformations with specific fixes
if layer_component == "mixer.conv1d.weight":
# Conv1d: NVIDIA [12288, 1, 4] -> llama.cpp [4, 12288] with BOS alignment fix
if len(data_torch.shape) == 3: # [12288, 1, 4]
data_torch = data_torch.squeeze(1) # -> [12288, 4]
if len(data_torch.shape) == 2:
data_torch = data_torch.t().contiguous() # -> [4, 12288] for BOS alignment
logger.debug(f"Conv1d BOS alignment: {data_torch.shape}")
elif layer_component.endswith("A_log"):
# A_log transformation with proper dimensions
data_torch = -torch.exp(data_torch)
if len(data_torch.shape) == 1:
data_torch = data_torch.unsqueeze(1) # -> [128, 1] explicitly
logger.debug(f"A_log transformation: {data_torch.shape}")
elif layer_component.endswith("D"):
# D tensor proper dimensions
if len(data_torch.shape) == 1:
data_torch = data_torch.unsqueeze(1) # -> [128, 1] explicitly
logger.debug(f"D tensor shape: {data_torch.shape}")
elif layer_component == "mixer.norm.weight":
# Apply Gabe's flattened RMS norm fix for n_groups=8
if len(data_torch.shape) == 1: # [10240]
# Calculate correct dimensions: 10240 elements with n_groups=8 -> [1280, 8]
elements_per_group = data_torch.numel() // self.n_group
data_torch = data_torch.reshape((elements_per_group, self.n_group))
logger.debug(f"SSM norm reshape for n_groups={self.n_group}: {data_torch.shape}")

elif any(x in layer_component for x in ["q_proj", "k_proj", "v_proj", "o_proj"]):
new_name = self._map_attention_tensor(layer_component, bid)
elif any(x in layer_component for x in ["down_proj", "up_proj"]):
new_name = self._map_mlp_tensor(layer_component, bid)
else:
# If we can't map it in the layer-specific logic, fall back to parent mapping
if name.endswith(".dt_bias"):
name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
new_name = self.map_tensor_name(name)
else:
# For non-layer tensors, apply standard mapping
if name.endswith(".dt_bias"):
name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
new_name = self.map_tensor_name(name)

# Handle base Mamba2 tensor transformations for backward compatibility
if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
if len(data_torch.shape) == 3: # [12288, 1, 4]
data_torch = data_torch.squeeze(1) # -> [12288, 4]
if len(data_torch.shape) == 2:
data_torch = data_torch.t().contiguous() # -> [4, 12288]
elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [
gguf.MODEL_TENSOR.SSM_A, gguf.MODEL_TENSOR.SSM_D,
]):
if len(data_torch.shape) == 1:
data_torch = data_torch.unsqueeze(1) # -> [128, 1] explicitly
elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
if len(data_torch.shape) == 1: # [10240]
elements_per_group = data_torch.numel() // self.n_group
data_torch = data_torch.reshape((elements_per_group, self.n_group))

# Apply A_log transformation for base cases
if name.endswith(".A_log"):
data_torch = -torch.exp(data_torch)

yield (new_name, data_torch)

def set_gguf_parameters(self):
"""Override to skip Mamba2 parameter validation that doesn't apply to hybrid architecture"""
d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5

self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
self.gguf_writer.add_embedding_length(self.d_model)
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_ssm_conv_kernel(d_conv)
self.gguf_writer.add_ssm_inner_size(self.d_inner)
self.gguf_writer.add_ssm_state_size(d_state)
self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim)
self.gguf_writer.add_ssm_group_count(self.n_group)
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
self.gguf_writer.add_file_type(self.ftype)
self.has_mamba = True
self.has_mlp = True

# Emit layer schedule: 0=SSM, 1=ATTN, 2=FFN (default FFN none here)
layer_types = np.zeros((self.block_count,), dtype=np.uint8)
for i in self._attn_layers:
if 0 <= i < self.block_count:
layer_types[i] = 1
# store schedule array
self.gguf_writer.add_array(f"{gguf.MODEL_ARCH_NAMES[self.model_arch]}.layer_types", layer_types)

def set_vocab(self):
# BOS token handling fix from Gabe's findings - ensures tensor alignment through first conv1d
self._set_vocab_gpt2()

# Nemotron-H specific BOS token configuration
try:
# Force BOS token ID to align with model expectations
self.gguf_writer.add_bos_token_id(1) # Standard GPT-2 style BOS token
logger.info("Applied Nemotron-H BOS token fix for conv1d alignment")
except Exception as e:
logger.debug(f"BOS token already set or unavailable: {e}")

def _map_mamba_tensor(self, component, bid):
"""Map Mamba layer tensor names"""
mapping = {
"mixer.A_log": f"blk.{bid}.ssm_a",
"mixer.D": f"blk.{bid}.ssm_d",
"mixer.conv1d.weight": f"blk.{bid}.ssm_conv1d.weight",
"mixer.conv1d.bias": f"blk.{bid}.ssm_conv1d.bias",
"mixer.dt_bias": f"blk.{bid}.ssm_dt.bias",
"mixer.in_proj.weight": f"blk.{bid}.ssm_in.weight",
"mixer.norm.weight": f"blk.{bid}.ssm_norm.weight",
"mixer.out_proj.weight": f"blk.{bid}.ssm_out.weight",
}
return mapping.get(component, f"blk.{bid}.{component}")

def _get_attn_layers(self) -> list[int]:
# 1) explicit layer types list
lt = self.hparams.get("layer_types")
if isinstance(lt, list):
attn = []
for i, t in enumerate(lt):
if isinstance(t, str) and t.lower().startswith("attn"):
attn.append(i)
elif isinstance(t, (int, np.integer)) and int(t) == 1:
attn.append(i)
return attn
# 2) indices list
if (idx := self.hparams.get("attn_layer_indices")):
return list(map(int, idx))
# 3) periodic schedule
period = self.hparams.get("attn_layer_period")
if period:
offset = int(self.hparams.get("attn_layer_offset", 0))
return [i for i in range(self.block_count) if i % int(period) == offset]
# 4) fallback: Nemotron-H 9B default or evenly spaced ~8%
if self.block_count == 56:
return [14, 21, 30, 39]
n = max(1, round(0.08 * self.block_count))
if n >= self.block_count:
return list(range(self.block_count))
step = self.block_count / n
return sorted({int(round(k*step)) for k in range(n)} - {self.block_count})

def _map_attention_tensor(self, component, bid):
"""Map attention layer tensor names to standard llama.cpp names"""
mapping = {
"mixer.q_proj.weight": f"blk.{bid}.wq.weight",
"mixer.k_proj.weight": f"blk.{bid}.wk.weight",
"mixer.v_proj.weight": f"blk.{bid}.wv.weight",
"mixer.o_proj.weight": f"blk.{bid}.wo.weight",
}
return mapping.get(component, f"blk.{bid}.{component}")

def _map_mlp_tensor(self, component, bid):
"""Map MLP layer tensor names"""
mapping = {
"mixer.down_proj.weight": f"blk.{bid}.ffn_down.weight",
"mixer.up_proj.weight": f"blk.{bid}.ffn_up.weight",
}
return mapping.get(component, f"blk.{bid}.{component}")


@ModelBase.register("HunYuanMoEV1ForCausalLM")
class HunYuanMoEModel(TextModel):
model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
Expand Down
21 changes: 11 additions & 10 deletions ggml/src/ggml-cpu/ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9003,8 +9003,7 @@ static void ggml_compute_forward_ssm_scan_f32(
GGML_ASSERT(src4->nb[0] == sizeof(float));
GGML_ASSERT(src5->nb[0] == sizeof(float));
GGML_ASSERT(src6->nb[0] == sizeof(int32_t));
// allows optimizing the modulo since n_group should be a power of 2
GGML_ASSERT((ng & -ng) == ng);
GGML_ASSERT(nh % ng == 0);

// heads per thread
const int dh = (nh + nth - 1)/nth;
Expand Down Expand Up @@ -9035,6 +9034,7 @@ static void ggml_compute_forward_ssm_scan_f32(
// ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
const float dA = expf(dt_soft_plus * A[h]);
const int g = h / (nh / ng); // repeat_interleave

// dim
for (int i1 = 0; i1 < nr; ++i1) {
Expand All @@ -9057,8 +9057,8 @@ static void ggml_compute_forward_ssm_scan_f32(
// TODO: maybe unroll more?
for (int j = 0; j < 1; j++) {
GGML_F32_VEC t0 = GGML_F32_VEC_LOAD(s0 + i + j*ggml_f32_epr + ii*nc);
GGML_F32_VEC t1 = GGML_F32_VEC_LOAD(B + i + j*ggml_f32_epr + (h & (ng - 1))*nc);
GGML_F32_VEC t2 = GGML_F32_VEC_LOAD(C + i + j*ggml_f32_epr + (h & (ng - 1))*nc);
GGML_F32_VEC t1 = GGML_F32_VEC_LOAD(B + i + j*ggml_f32_epr + g*nc);
GGML_F32_VEC t2 = GGML_F32_VEC_LOAD(C + i + j*ggml_f32_epr + g*nc);

t0 = GGML_F32_VEC_MUL(t0, adA);
t1 = GGML_F32_VEC_MUL(t1, axdt);
Expand Down Expand Up @@ -9090,8 +9090,8 @@ static void ggml_compute_forward_ssm_scan_f32(
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ax[j] = GGML_F32_VEC_LOAD(s0 + i + j*GGML_F32_EPR + ii*nc);
ay[j] = GGML_F32_VEC_LOAD(B + i + j*GGML_F32_EPR + (h & (ng - 1))*nc);
az[j] = GGML_F32_VEC_LOAD(C + i + j*GGML_F32_EPR + (h & (ng - 1))*nc);
ay[j] = GGML_F32_VEC_LOAD(B + i + j*GGML_F32_EPR + g*nc);
az[j] = GGML_F32_VEC_LOAD(C + i + j*GGML_F32_EPR + g*nc);

ax[j] = GGML_F32_VEC_MUL(ax[j], adA);
ay[j] = GGML_F32_VEC_MUL(ay[j], axdt);
Expand All @@ -9113,7 +9113,7 @@ static void ggml_compute_forward_ssm_scan_f32(
// d_state
for (int i0 = np; i0 < nc; ++i0) {
const int i = i0 + ii*nc;
const int ig = i0 + (h & (ng - 1))*nc;
const int ig = i0 + g*nc;
// state = prev_state * dA + dB * x
const float state = (s0[i] * dA) + (B[ig] * x_dt);
// y = rowwise_dotprod(state, C)
Expand All @@ -9130,6 +9130,7 @@ static void ggml_compute_forward_ssm_scan_f32(
for (int h = ih0; h < ih1; ++h) {
// ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
const int g = h / (nh / ng); // repeat_interleave

// dim
for (int i1 = 0; i1 < nr; ++i1) {
Expand All @@ -9144,8 +9145,8 @@ static void ggml_compute_forward_ssm_scan_f32(
// TODO: what happens when (d_state % svcntw()) != 0?
for (int64_t k = 0; k < nc; k += svcntw()) {
svfloat32_t vA = GGML_F32_VEC_LOAD(&A[h*nc + k]);
svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k + (h & (ng - 1))*nc]);
svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k + (h & (ng - 1))*nc]);
svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k + g*nc]);
svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k + g*nc]);
svfloat32_t vs0 = GGML_F32_VEC_LOAD(&s0[ii*nc + k]);

svfloat32_t t1 = GGML_F32_VEC_MUL(vdt_soft_plus, vA);
Expand All @@ -9165,7 +9166,7 @@ static void ggml_compute_forward_ssm_scan_f32(
// d_state
for (int i0 = 0; i0 < nc; ++i0) {
const int i = i0 + ii*nc;
const int ig = i0 + (h & (ng - 1))*nc;
const int ig = i0 + g*nc;
// state = prev_state * dA + dB * x
const float state = (s0[i] * expf(dt_soft_plus * A[i0 + h*nc])) + (B[ig] * x_dt);
// y = rowwise_dotprod(state, C)
Expand Down
Loading