Closed

Commits (showing changes from 16 of 22 commits)
cce8cb1  attempt at implementing nemotron_h architecture. (jwjohns, Aug 23, 2025)
423d890  fix(nemotron-h): Fix KV cache over-allocation for hybrid architecture (jwjohns, Aug 23, 2025)
f1acd11  update (jwjohns, Aug 24, 2025)
175d60e  Merge branch 'ggml-org:master' into feature/nemotron-h-support-working (jwjohns, Aug 24, 2025)
3a99e79  Merge branch 'ggml-org:master' into feature/nemotron-h-support-working (jwjohns, Aug 24, 2025)
1f55ace  Merge branch 'ggml-org:master' into feature/nemotron-h-support-working (jwjohns, Aug 25, 2025)
62accf9  working on the ssm tensors sizing (jwjohns, Aug 25, 2025)
cc9b929  still isnt working though progress is being made (jwjohns, Aug 25, 2025)
36dc3eb  fix nemotron-h tensor dimensions and gguf conversion (jwjohns, Aug 25, 2025)
657903a  Merge branch 'ggml-org:master' into feature/nemotron-h-support-working (jwjohns, Aug 25, 2025)
3df06e6  cleanup docs (jwjohns, Aug 25, 2025)
154459a  Merge branch 'ggml-org:master' into feature/nemotron-h-support-working (jwjohns, Aug 25, 2025)
ca4c978  resolving tensor dimensions (jwjohns, Aug 25, 2025)
a556953  Merge branch 'ggml-org:master' into feature/nemotron-h-support-working (jwjohns, Aug 25, 2025)
e2b0dda  implement a custom tensor creation (jwjohns, Aug 25, 2025)
0d9725c  update shapes to nvidia safetensors ground truth (jwjohns, Aug 26, 2025)
3efbb74  code review cleanup (jwjohns, Aug 26, 2025)
743681b  Merge branch 'ggml-org:master' into feature/nemotron-h-support-working (jwjohns, Aug 27, 2025)
bfc234d  convert_hf_to_gguf.py (jwjohns, Aug 27, 2025)
2ebaa43  cleanup debug logs and hardcoded portions (jwjohns, Aug 27, 2025)
497d73b  cleanup (jwjohns, Aug 27, 2025)
7c668fd  Applying the SSM_SCAN fix for n_groups > 1 (jwjohns, Aug 27, 2025)
228 changes: 227 additions & 1 deletion convert_hf_to_gguf.py
@@ -456,7 +456,7 @@ def load_hparams(dir_model: Path, is_mistral_format: bool):
try:
# for security reason, we don't allow loading remote code by default
# if a model need remote code, we will fallback to config.json
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True).to_dict()
Contributor Author (jwjohns) commented:
I got tired of typing it. Temporary.

except Exception as e:
logger.warning(f"Failed to load model config from {dir_model}: {e}")
logger.warning("Trying to load config.json instead")
@@ -7892,6 +7892,232 @@ def set_gguf_parameters(self):
self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))


@ModelBase.register("NemotronHForCausalLM")
class NemotronHModel(Mamba2Model):
"""Nemotron-H is a hybrid SSM + Attention model with Mamba2 layers and attention layers"""
model_arch = gguf.MODEL_ARCH.NEMOTRON_H

def __init__(self, *args, **kwargs):
# Initialize the base Mamba2Model
super().__init__(*args, **kwargs)

# Use Llama conversion for attention layers
self._transformer_model_class = LlamaModel

# Nemotron-H specific parameters
self.n_group = self.find_hparam(["n_groups"], optional=True) or self.find_hparam(["num_groups"], optional=True) or 8
# Use actual conv1d tensor dimension for Nemotron-H (12288 not 15680)
self.d_inner = 12288 # Fixed: matches actual conv1d tensor dimensions
self.d_head = self.find_hparam(["mamba_head_dim"], optional=True) or (self.d_inner // max(1, self.find_hparam(["mamba_num_heads"], optional=True) or 1))
self.d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128

# Initialize hybrid model attributes
self.has_attention = True

# Determine attention layers
self._attn_layers = self._get_attn_layers()

def set_gguf_parameters(self):
"""Override to skip Mamba2 parameter validation that doesn't apply to hybrid architecture"""
d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5

# Skip the d_inner == 2 * d_model assertion for hybrid architectures
# Nemotron-H has a different inner dimension calculation based on mamba_num_heads * mamba_head_dim

self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
self.gguf_writer.add_embedding_length(self.d_model)
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_ssm_conv_kernel(d_conv)
self.gguf_writer.add_ssm_inner_size(self.d_inner)
self.gguf_writer.add_ssm_state_size(d_state)
self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim)
self.gguf_writer.add_ssm_group_count(self.n_group)
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
self.gguf_writer.add_file_type(self.ftype)
self.has_mamba = True
self.has_mlp = True

# Emit layer schedule: 0=SSM, 1=ATTN, 2=FFN (default FFN none here)
layer_types = np.zeros((self.block_count,), dtype=np.uint8)
for i in self._attn_layers:
if 0 <= i < self.block_count:
layer_types[i] = 1
# store schedule array
self.gguf_writer.add_array(f"{gguf.MODEL_ARCH_NAMES[self.model_arch]}.layer_types", layer_types)

def set_vocab(self):
self._set_vocab_gpt2()

def modify_tensors(self, data_torch, name, bid):
# Custom tensor name mapping for Nemotron-H hybrid architecture

# Handle token embeddings and output tensors
if "backbone.embeddings.weight" in name:
return [(self.map_tensor_name("token_embd.weight"), data_torch)]
elif "backbone.norm.weight" in name:
return [(self.map_tensor_name("output_norm.weight"), data_torch)]
elif "backbone.lm_head.weight" in name:
return [(self.map_tensor_name("output.weight"), data_torch)]

# Handle layer-specific tensors
if "backbone.layers." in name and bid is not None:
# Extract the actual layer component name
parts = name.split(".")
if len(parts) >= 4:
layer_component = ".".join(parts[3:]) # Everything after "backbone.layers.X"

# Detect layer type based on tensor names and map accordingly
if layer_component == "norm.weight":
# Layer norm (not mixer norm) - all layers use attn_norm in llama.cpp
new_name = f"blk.{bid}.attn_norm.weight"
elif any(x in layer_component for x in ["A_log", "D", "conv1d", "dt_bias", "in_proj", "mixer.norm", "out_proj"]):
# Mamba layer tensors (note: mixer.norm, not just norm.weight)
new_name = self._map_mamba_tensor(layer_component, bid)
# NVIDIA GROUND TRUTH TENSOR TRANSFORMATIONS

# Conv1d: NVIDIA [12288, 1, 4] -> llama.cpp [4, 12288]
if "conv1d.weight" in layer_component:
original_shape = data_torch.shape
if len(data_torch.shape) == 3: # [12288, 1, 4]
# Remove middle dimension and transpose: [12288, 1, 4] -> [12288, 4] -> [4, 12288]
data_torch = data_torch.squeeze(1).t().contiguous() # -> [4, 12288]
elif len(data_torch.shape) == 2: # [12288, 4]
data_torch = data_torch.t().contiguous() # [12288, 4] -> [4, 12288]
# Ensure final shape is exactly [4, 12288]
assert data_torch.shape == (4, 12288), f"Conv1d wrong final shape: {data_torch.shape}"
print(f"DEBUG: Conv1d {layer_component} {original_shape} -> {data_torch.shape}")

# A_log: NVIDIA [128] -> llama.cpp [128, 1] with -exp transform
if layer_component.endswith("A_log"):
data_torch = -torch.exp(data_torch) # Apply -exp transformation
if len(data_torch.shape) == 1: # [128]
data_torch = data_torch.reshape(128, 1) # -> [128, 1] explicitly

# D: NVIDIA [128] -> llama.cpp [128, 1]
if layer_component.endswith("D"):
if len(data_torch.shape) == 1: # [128]
data_torch = data_torch.reshape(128, 1) # -> [128, 1] explicitly

# Grouped RMSNorm: NVIDIA [10240] -> llama.cpp [1280, 8]
if layer_component == "mixer.norm.weight":
if len(data_torch.shape) == 1: # [10240]
# 10240 elements = 1280 * 8 groups
data_torch = data_torch.reshape(1280, 8)
# in_proj needs split order expected by llama.cpp mamba2 builder: [z, xBC, dt]
if layer_component == "mixer.in_proj.weight":
W = data_torch
# Expected logical sizes
d_x_part = self.d_inner + 2 * self.n_group * self.d_state
n_head = max(1, self.d_inner // max(1, self.d_head))
exp_d_in_proj = 2 * self.d_inner + 2 * self.n_group * self.d_state + n_head
# Detect orientation: [n_embd, d_in_proj] or [d_in_proj, n_embd]
if W.shape[1] == self.d_model and W.shape[0] == exp_d_in_proj:
W = W.t().contiguous()
n_embd, d_in_proj = W.shape
# Validate
if d_in_proj < (self.d_inner + d_x_part + n_head):
# Can't reliably repack; keep original mapping
return [(self._map_mamba_tensor(layer_component, bid), data_torch)]
# Assume dt at the end
dt = W[:, -n_head:]
body = W[:, : d_in_proj - n_head]
# Two common packings: [z, xBC] or [xBC, z]
# Prefer moving z to the front: [z, xBC, dt]
# Heuristic: pick the split that yields xBC width == d_x_part
z_first = False
# Try xBC first
xbc = body[:, : d_x_part]
z = body[:, d_x_part: d_x_part + self.d_inner]
if z.shape[1] != self.d_inner:
# Try z first
z_first = True
z = body[:, : self.d_inner]
xbc = body[:, self.d_inner: self.d_inner + d_x_part]
repacked = torch.cat([z, xbc, dt], dim=1)
data_torch = repacked
elif any(x in layer_component for x in ["q_proj", "k_proj", "v_proj", "o_proj"]):
# Attention layer tensors
new_name = self._map_attention_tensor(layer_component, bid)
elif any(x in layer_component for x in ["down_proj", "up_proj"]):
# MLP layer tensors
new_name = self._map_mlp_tensor(layer_component, bid)
else:
# Fallback to default mapping
return super().modify_tensors(data_torch, name, bid)

return [(new_name, data_torch)]

# Default to parent processing
return super().modify_tensors(data_torch, name, bid)

def _map_mamba_tensor(self, component, bid):
"""Map Mamba layer tensor names"""
mapping = {
"mixer.A_log": f"blk.{bid}.ssm_a", # No .weight suffix for ssm_a and ssm_d
"mixer.D": f"blk.{bid}.ssm_d", # No .weight suffix for ssm_a and ssm_d
"mixer.conv1d.weight": f"blk.{bid}.ssm_conv1d.weight",
"mixer.conv1d.bias": f"blk.{bid}.ssm_conv1d.bias",
"mixer.dt_bias": f"blk.{bid}.ssm_dt.bias",
"mixer.in_proj.weight": f"blk.{bid}.ssm_in.weight",
"mixer.norm.weight": f"blk.{bid}.ssm_norm.weight",
"mixer.out_proj.weight": f"blk.{bid}.ssm_out.weight",
}
return mapping.get(component, f"blk.{bid}.{component}")

def _get_attn_layers(self) -> list[int]:
# 1) explicit layer types list
lt = self.hparams.get("layer_types")
if isinstance(lt, list):
# support string or int types
attn = []
for i, t in enumerate(lt):
if isinstance(t, str) and t.lower().startswith("attn"):
attn.append(i)
elif isinstance(t, (int, np.integer)) and int(t) == 1:
attn.append(i)
return attn
# 2) indices list
if (idx := self.hparams.get("attn_layer_indices")):
return list(map(int, idx))
# 3) periodic schedule
period = self.hparams.get("attn_layer_period")
if period:
offset = int(self.hparams.get("attn_layer_offset", 0))
return [i for i in range(self.block_count) if i % int(period) == offset]
# 4) fallback: Nemotron-H 9B default or evenly spaced ~8%
if self.block_count == 56:
return [14, 21, 30, 39]
# evenly spaced n ~ max(1, round(0.08 * L))
n = max(1, round(0.08 * self.block_count))
if n >= self.block_count:
return list(range(self.block_count))
step = self.block_count / n
return sorted({int(round(k*step)) for k in range(n)} - {self.block_count})

def _map_attention_tensor(self, component, bid):
"""Map attention layer tensor names to standard llama.cpp names"""
mapping = {
"mixer.q_proj.weight": f"blk.{bid}.wq.weight",
"mixer.k_proj.weight": f"blk.{bid}.wk.weight",
"mixer.v_proj.weight": f"blk.{bid}.wv.weight",
"mixer.o_proj.weight": f"blk.{bid}.wo.weight",
}
return mapping.get(component, f"blk.{bid}.{component}")

def _map_mlp_tensor(self, component, bid):
"""Map MLP layer tensor names"""
mapping = {
"mixer.down_proj.weight": f"blk.{bid}.ffn_down.weight",
"mixer.up_proj.weight": f"blk.{bid}.ffn_up.weight",
}
return mapping.get(component, f"blk.{bid}.{component}")


@ModelBase.register("HunYuanMoEV1ForCausalLM")
class HunYuanMoEModel(TextModel):
model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
26 changes: 26 additions & 0 deletions gguf-py/gguf/constants.py
@@ -345,6 +345,7 @@ class MODEL_ARCH(IntEnum):
MAMBA = auto()
MAMBA2 = auto()
JAMBA = auto()
NEMOTRON_H = auto()
XVERSE = auto()
COMMAND_R = auto()
COHERE2 = auto()
@@ -677,6 +678,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.MAMBA: "mamba",
MODEL_ARCH.MAMBA2: "mamba2",
MODEL_ARCH.JAMBA: "jamba",
MODEL_ARCH.NEMOTRON_H: "nemotron_h",
MODEL_ARCH.XVERSE: "xverse",
MODEL_ARCH.COMMAND_R: "command-r",
MODEL_ARCH.COHERE2: "cohere2",
@@ -1893,6 +1895,30 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.NEMOTRON_H: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
# Mamba2 layers
MODEL_TENSOR.SSM_IN,
MODEL_TENSOR.SSM_CONV1D,
MODEL_TENSOR.SSM_X,
MODEL_TENSOR.SSM_DT,
MODEL_TENSOR.SSM_A,
MODEL_TENSOR.SSM_D,
MODEL_TENSOR.SSM_OUT,
MODEL_TENSOR.SSM_NORM,
# Attention layers
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
# MLP layers
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.XVERSE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
10 changes: 8 additions & 2 deletions gguf-py/gguf/gguf_writer.py
@@ -1076,9 +1076,15 @@ def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool, sub_type: G
kv_data += self._pack("Q", len(encoded_val))
kv_data += encoded_val
elif vtype == GGUFValueType.ARRAY:

# Convert numpy arrays to lists for serialization
if hasattr(val, 'tolist'):
val = val.tolist()

if not isinstance(val, Sequence):
raise ValueError("Invalid GGUF metadata array, expecting sequence")
print(f"DEBUG: Failed metadata key type: {type(val)}")
print(f"DEBUG: Failed metadata value: {val}")
print(f"DEBUG: Caller info available in stack trace")
raise ValueError(f"Invalid GGUF metadata array, expecting sequence but got {type(val)}: {val}")
Contributor Author (jwjohns) commented:
more debug, didnt mean to commit. will clean up.


if len(val) == 0:
raise ValueError("Invalid GGUF metadata array. Empty array")
30 changes: 30 additions & 0 deletions src/llama-arch.cpp
@@ -48,6 +48,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_MAMBA2, "mamba2" },
{ LLM_ARCH_JAMBA, "jamba" },
{ LLM_ARCH_NEMOTRON_H, "nemotron_h" },
{ LLM_ARCH_FALCON_H1, "falcon-h1" },
{ LLM_ARCH_XVERSE, "xverse" },
{ LLM_ARCH_COMMAND_R, "command-r" },
@@ -200,6 +201,9 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {

{ LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },

// Nemotron-H specific
{ LLM_KV_LAYER_TYPES, "%s.layer_types" },
Collaborator @gabe-l-hart commented (Aug 26, 2025):
I think we can get away with not adding this new hparam. This is similar to a piece of feedback I got during #13550 (it's a looong PR, but it's in there somewhere). I had introduced a new array hparam similar to this one (mine was a bool), but @compilade pointed out that we could extract the same information by setting n_head_kv to an array value during conversion and then reading it per-layer (here). In this case, we can leverage n_ff in the same way so that the layer types are determined as:

  1. n_head_kv == 0 && n_ff == 0 => recurrent
  2. n_head_kv == 0 && n_ff > 0 => MLP
  3. n_head_kv > 0 && n_ff == 0 => attention
  4. n_head_kv > 0 && n_ff > 0 => INVALID (or maybe valid for a future architecture??)
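
A conversion-side sketch of that idea (illustrative only: the schedule and sizes below are made-up values for a toy 6-block hybrid, and the writer calls are left commented out on the assumption that add_head_count_kv and add_feed_forward_length accept per-layer sequences, as the comment describes):

# Toy 6-block hybrid: blocks 2 and 4 are attention, blocks 1, 3, 5 are MLP,
# and the remaining blocks are recurrent (Mamba2/SSM).
block_count = 6
attn_layers = {2, 4}
mlp_layers = {1, 3, 5}

n_head_kv = [8 if i in attn_layers else 0 for i in range(block_count)]
n_ff = [3072 if i in mlp_layers else 0 for i in range(block_count)]

# During conversion (assuming per-layer sequences are accepted):
#   self.gguf_writer.add_head_count_kv(n_head_kv)
#   self.gguf_writer.add_feed_forward_length(n_ff)

# Loader-side classification per block, following the rules above:
for i in range(block_count):
    if n_head_kv[i] == 0 and n_ff[i] == 0:
        kind = "recurrent (SSM)"
    elif n_head_kv[i] == 0:
        kind = "mlp"
    else:
        kind = "attention"
    print(f"block {i}: {kind}")
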


{ LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },

{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
@@ -1101,6 +1105,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{
LLM_ARCH_NEMOTRON_H,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
// Mamba2 layers
{ LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
{ LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
{ LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
{ LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
{ LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
{ LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
{ LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
// Attention layers
{ LLM_TENSOR_ATTN_Q, "blk.%d.wq" },
{ LLM_TENSOR_ATTN_K, "blk.%d.wk" },
{ LLM_TENSOR_ATTN_V, "blk.%d.wv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.wo" },
// MLP layers
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_FALCON_H1,
{
@@ -2334,6 +2363,7 @@ bool llm_arch_is_recurrent(const llm_arch & arch) {
bool llm_arch_is_hybrid(const llm_arch & arch) {
switch (arch) {
case LLM_ARCH_JAMBA:
case LLM_ARCH_NEMOTRON_H:
case LLM_ARCH_FALCON_H1:
case LLM_ARCH_PLAMO2:
case LLM_ARCH_GRANITE_HYBRID:
4 changes: 4 additions & 0 deletions src/llama-arch.h
@@ -52,6 +52,7 @@ enum llm_arch {
LLM_ARCH_MAMBA,
LLM_ARCH_MAMBA2,
LLM_ARCH_JAMBA,
LLM_ARCH_NEMOTRON_H,
LLM_ARCH_FALCON_H1,
LLM_ARCH_XVERSE,
LLM_ARCH_COMMAND_R,
@@ -239,6 +240,9 @@ enum llm_kv {

LLM_KV_CLASSIFIER_OUTPUT_LABELS,

// Nemotron-H specific
LLM_KV_LAYER_TYPES,

LLM_KV_SHORTCONV_L_CACHE,

// deprecated: