diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index fddfc4a3be2bc..c7c6072ef2443 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -7648,7 +7648,7 @@ def set_vocab(self):
 @ModelBase.register("NemotronHForCausalLM")
 class NemotronHModel(GraniteHybridModel):
     """Hybrid mamba2/attention model from NVIDIA"""
-    model_arch = gguf.MODEL_ARCH.NEMOTRONH
+    model_arch = gguf.MODEL_ARCH.NEMOTRON_H
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 821577810c0ba..6156d35c2ad67 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -367,7 +367,7 @@ class MODEL_ARCH(IntEnum):
     T5ENCODER = auto()
     JAIS = auto()
     NEMOTRON = auto()
-    NEMOTRONH = auto()
+    NEMOTRON_H = auto()
     EXAONE = auto()
     EXAONE4 = auto()
     GRANITE = auto()
@@ -701,7 +701,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.T5ENCODER: "t5encoder",
     MODEL_ARCH.JAIS: "jais",
     MODEL_ARCH.NEMOTRON: "nemotron",
-    MODEL_ARCH.NEMOTRONH: "nemotronh",
+    MODEL_ARCH.NEMOTRON_H: "nemotron_h",
     MODEL_ARCH.EXAONE: "exaone",
     MODEL_ARCH.EXAONE4: "exaone4",
     MODEL_ARCH.GRANITE: "granite",
@@ -2299,7 +2299,7 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
-    MODEL_ARCH.NEMOTRONH: [
+    MODEL_ARCH.NEMOTRON_H: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.OUTPUT,
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index c5f51e1048e88..d5c8477f4aa39 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -69,7 +69,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_T5ENCODER, "t5encoder" },
     { LLM_ARCH_JAIS, "jais" },
     { LLM_ARCH_NEMOTRON, "nemotron" },
-    { LLM_ARCH_NEMOTRONH, "nemotronh" },
+    { LLM_ARCH_NEMOTRON_H, "nemotron_h" },
     { LLM_ARCH_EXAONE, "exaone" },
     { LLM_ARCH_EXAONE4, "exaone4" },
     { LLM_ARCH_RWKV6, "rwkv6" },
@@ -1552,7 +1552,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
         },
     },
     {
-        LLM_ARCH_NEMOTRONH,
+        LLM_ARCH_NEMOTRON_H,
         {
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
@@ -2381,7 +2381,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
         case LLM_ARCH_PLAMO2:
         case LLM_ARCH_GRANITE_HYBRID:
         case LLM_ARCH_LFM2:
-        case LLM_ARCH_NEMOTRONH:
+        case LLM_ARCH_NEMOTRON_H:
             return true;
         default:
             return false;
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 701c2a9c392af..86c119692d8cc 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -73,7 +73,7 @@ enum llm_arch {
     LLM_ARCH_T5ENCODER,
     LLM_ARCH_JAIS,
     LLM_ARCH_NEMOTRON,
-    LLM_ARCH_NEMOTRONH,
+    LLM_ARCH_NEMOTRON_H,
     LLM_ARCH_EXAONE,
     LLM_ARCH_EXAONE4,
     LLM_ARCH_RWKV6,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 71d9a3b60d9b5..f3e0e9ac64b0d 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1570,7 +1570,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_NEMOTRONH:
+        case LLM_ARCH_NEMOTRON_H:
             {
                 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
                 ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
@@ -4709,7 +4709,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                     }
                 } break;
-            case LLM_ARCH_NEMOTRONH:
+            case LLM_ARCH_NEMOTRON_H:
                 {
                     // mamba2 Mixer SSM params
                     // NOTE: int64_t for tensor dimensions
@@ -5953,7 +5953,7 @@ void llama_model::print_info() const {
         arch == LLM_ARCH_FALCON_H1 ||
         arch == LLM_ARCH_PLAMO2 ||
         arch == LLM_ARCH_GRANITE_HYBRID ||
-        arch == LLM_ARCH_NEMOTRONH) {
+        arch == LLM_ARCH_NEMOTRON_H) {
         LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
         LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
         LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@@ -14220,8 +14220,8 @@ struct llm_build_nemotron : public llm_graph_context {
     }
 };
 
-struct llm_build_nemotronh : public llm_graph_context_mamba {
-    llm_build_nemotronh(
+struct llm_build_nemotron_h : public llm_graph_context_mamba {
+    llm_build_nemotron_h(
         const llama_model & model,
         const llm_graph_params & params) :
         llm_graph_context_mamba(params) {
@@ -18508,7 +18508,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
             if (arch == LLM_ARCH_FALCON_H1) {
                 filter_attn = [&](int32_t) { return true; };
                 filter_recr = [&](int32_t) { return true; };
-            } else if (arch == LLM_ARCH_NEMOTRONH) {
+            } else if (arch == LLM_ARCH_NEMOTRON_H) {
                 filter_attn = [&](int32_t il) {
                     return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
                 };
@@ -18865,9 +18865,9 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_nemotron>(*this, params);
             } break;
-        case LLM_ARCH_NEMOTRONH:
+        case LLM_ARCH_NEMOTRON_H:
            {
-                llm = std::make_unique<llm_build_nemotronh>(*this, params);
+                llm = std::make_unique<llm_build_nemotron_h>(*this, params);
             } break;
         case LLM_ARCH_EXAONE:
             {
@@ -19104,7 +19104,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_RWKV7:
         case LLM_ARCH_ARWKV7:
         case LLM_ARCH_WAVTOKENIZER_DEC:
-        case LLM_ARCH_NEMOTRONH:
+        case LLM_ARCH_NEMOTRON_H:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values