Commit a2d0778

Merge branch 'master' into model/a.x-4.0

2 parents b6fcaf7 + 1055545
8 files changed: +595 -9 lines
convert_hf_to_gguf.py

Lines changed: 136 additions & 5 deletions
@@ -821,6 +821,18 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "b0a6b1c0bd5998ebd9df08611efde34a4ff03faed45ae09c43e6b31ebd4b94cf":
             # ref: https://huggingface.co/skt/A.X-4.0
             res = "a.x-4.0"
+        if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
+            res = "falcon-h1"
+        if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base
+            res = "falcon-h1"
+        if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base
+            res = "falcon-h1"
+        if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base
+            res = "falcon-h1"
 
         if res is None:
             logger.warning("\n")
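For readers unfamiliar with this mechanism: chkhsh fingerprints the tokenizer's behaviour, not its files. A minimal sketch of how the hash is derived, mirroring the logic in convert_hf_to_gguf_update.py (the real probe string is long and multilingual; the "..." placeholder below is not the actual value):

from hashlib import sha256
from transformers import AutoTokenizer

# Sketch only: the probe text is elided and must match the update script's.
chktxt = "..."
tokenizer = AutoTokenizer.from_pretrained("tiiuae/Falcon-H1-0.5B-Base")
chkhsh = sha256(str(tokenizer.encode(chktxt)).encode()).hexdigest()
print(chkhsh)  # compared against the table above to select res = "falcon-h1"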
@@ -4902,17 +4914,19 @@ def set_vocab(self):
     def set_gguf_parameters(self):
         d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
         d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
-        d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
+        d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * d_model
         d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
-        head_dim = self.find_hparam(["head_dim"], optional=True) or 64
+        head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64
         n_group = self.find_hparam(["n_groups"], optional=True) or 1
 
         rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
 
         # Fail early for models which don't have a block expansion factor of 2
         # TODO: does this really matter?
-        assert d_inner == 2 * d_model
-        assert d_inner % head_dim == 0
+        # skip the assertion for the FalconH1 model
+        if self.model_arch != gguf.MODEL_ARCH.FALCON_H1:
+            assert d_inner == 2 * d_model
+            assert d_inner % head_dim == 0
 
         self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
         self.gguf_writer.add_embedding_length(d_model)
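The key lists above are ordered fallbacks: the first name present in the checkpoint's config wins, and the trailing "or" supplies a default when an optional lookup misses. A minimal stand-in for that pattern, with a hypothetical Falcon-H1-style config:

def find_hparam(hparams: dict, keys: list[str], optional: bool = False):
    # Simplified stand-in for ModelBase.find_hparam: first matching key wins.
    for key in keys:
        if key in hparams:
            return hparams[key]
    if optional:
        return None
    raise KeyError(f"could not find any of: {keys}")

hparams = {"hidden_size": 2048, "mamba_d_ssm": 3072}  # hypothetical values
d_model = find_hparam(hparams, ["hidden_size", "d_model", "dim"])
d_inner = find_hparam(hparams, ["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * d_model
assert d_inner == 3072  # mamba_d_ssm wins over the 2 * d_model fallback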
@@ -4949,7 +4963,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
             data_torch = data_torch.reshape((*data_torch.shape, 1))
         elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
             d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
-            d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
+            d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * d_model
             n_group = self.hparams.get("n_groups", 1)
             data_torch = data_torch.reshape((n_group, d_inner // n_group))
 
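The reshape above regroups the flat SSM norm weight so each of the n_group groups owns one contiguous row; a quick shape check with hypothetical sizes:

import torch

d_inner, n_group = 3072, 2  # hypothetical sizes
w = torch.ones(d_inner)     # flat norm weight as stored in the checkpoint
w = w.reshape(n_group, d_inner // n_group)
print(w.shape)              # torch.Size([2, 1536])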
@@ -6542,6 +6556,113 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
 
 
+@ModelBase.register("FalconH1ForCausalLM")
+class FalconH1Model(Mamba2Model):
+    model_arch = gguf.MODEL_ARCH.FALCON_H1
+
+    def __init__(self, *args, **kwargs):
+        # Set the hparam prefixes for Falcon Mamba2
+        self.hparam_prefixes = ["mamba"]
+
+        # Initialize the base Mamba2Model
+        super().__init__(*args, **kwargs)
+
+        # Use Llama conversion for attention
+        self._transformer_model_class = LlamaModel
+
+        # n_group and d_inner are used during reshape_tensors for mamba2
+        self.n_group = self.find_hparam(["n_groups"])
+        self.d_inner = self.find_hparam(["mamba_d_ssm"])
+        self.d_head = self.find_hparam(["d_head"])
+
+        # Initialize any Falcon Mamba2 specific attributes
+        self.has_attention = True  # Falcon Mamba2 has attention components
+
+        # Load Falcon-H1 multipliers from hyperparameters
+        self.attention_in_multiplier = self.find_hparam(["attention_in_multiplier"], optional=True)
+        self.attention_out_multiplier = self.find_hparam(["attention_out_multiplier"], optional=True)
+        self.ssm_in_multiplier = self.find_hparam(["ssm_in_multiplier"], optional=True)
+        self.ssm_out_multiplier = self.find_hparam(["ssm_out_multiplier"], optional=True)
+        self.mlp_multipliers = self.find_hparam(["mlp_multipliers"], optional=True)
+        self.ssm_multipliers = self.find_hparam(["ssm_multipliers"], optional=True)
+        self.intermediate_size = self.find_hparam(["intermediate_size"])
+        self.key_multiplier = self.find_hparam(["key_multiplier"], optional=True)
+
+    def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
+        prefixed = []
+        for pfx in self.hparam_prefixes:
+            prefixed.extend(
+                "_".join([pfx, k])
+                for k in keys
+            )
+        keys = list(keys) + prefixed
+        return super().find_hparam(keys, *args, **kwargs)
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        tensors = list(super().modify_tensors(data_torch, name, bid))
+        tensor = tensors[0][1]
+
+        if "down_proj" in name:
+            tensor = tensor * self.mlp_multipliers[1]
+        elif "gate_proj" in name:
+            tensor = tensor * self.mlp_multipliers[0]
+        elif "k_proj" in name:
+            tensor = tensor * self.key_multiplier * self.attention_in_multiplier
+        elif "q_proj" in name:
+            tensor = tensor * self.attention_in_multiplier
+        elif "v_proj" in name:
+            tensor = tensor * self.attention_in_multiplier
+        elif "o_proj" in name:
+            tensor = tensor * self.attention_out_multiplier
+        elif "out_proj" in name:
+            tensor = tensor * self.ssm_out_multiplier
+        elif "in_proj" in name:
+            tensor = tensor * self.ssm_in_multiplier
+            zxbcdt_multipliers = self.hparams["ssm_multipliers"]
+            intermediate_size = self.hparams["mamba_d_ssm"]
+            groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"]
+            tensor[:intermediate_size, :] *= zxbcdt_multipliers[0]
+            tensor[intermediate_size:2 * intermediate_size, :] *= zxbcdt_multipliers[1]
+            tensor[2 * intermediate_size:2 * intermediate_size + groups_time_state_size, :] *= zxbcdt_multipliers[2]
+            tensor[2 * intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size, :] *= zxbcdt_multipliers[3]
+            tensor[2 * intermediate_size + 2 * groups_time_state_size:, :] *= zxbcdt_multipliers[4]
+        elif "lm_head" in name:
+            tensor = tensor * self.hparams["lm_head_multiplier"]
+        elif "embed_tokens" in name:
+            tensor = tensor * self.hparams["embedding_multiplier"]
+        elif "mamba.norm" in name:
+            tensor = tensor.reshape(self.n_group, self.d_inner // self.n_group)
+
+        tensors = [(tensors[0][0], tensor)]
+        return tensors
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        ## General Params ##
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        # Override some Mamba2 defaults
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0))
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+
+        ## Attention params ##
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])  # Override value 0 from Mamba2
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
+        self.gguf_writer.add_key_length(self.hparams["head_dim"])
+        self.gguf_writer.add_value_length(self.hparams["head_dim"])
+
+        ## Validation ##
+        assert self.hparams.get("hidden_act") in [None, "silu"], "Only SiLU activation supported"
+        assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}"
+
+        # Add any other Falcon Mamba2 specific configuration
+        self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
+
+
 @ModelBase.register("HunYuanMoEV1ForCausalLM")
 class HunYuanMoEModel(TextModel):
     model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
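The in_proj branch above leans on Mamba2's fused projection layout: the weight's rows concatenate the z, x, B, C, and dt blocks, and ssm_multipliers scales each block independently. A worked sketch of the slice boundaries used by that code, with hypothetical sizes chosen only to make the arithmetic concrete:

d_ssm = 1536                # hparams["mamba_d_ssm"], hypothetical
n_groups, d_state = 1, 128  # hparams["mamba_n_groups"], hparams["mamba_d_state"]
gss = n_groups * d_state    # groups_time_state_size in the code above

# Row ranges scaled by ssm_multipliers[0..4], in order: z, x, B, C, dt
segments = {
    "z":  (0, d_ssm),
    "x":  (d_ssm, 2 * d_ssm),
    "B":  (2 * d_ssm, 2 * d_ssm + gss),
    "C":  (2 * d_ssm + gss, 2 * d_ssm + 2 * gss),
    "dt": (2 * d_ssm + 2 * gss, None),  # runs to the end of the tensor
}
for name, (start, stop) in segments.items():
    print(f"{name}: rows [{start}:{'' if stop is None else stop}]")
# z: rows [0:1536]   x: rows [1536:3072]   B: rows [3072:3200]
# C: rows [3200:3328]   dt: rows [3328:]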
@@ -6695,6 +6816,16 @@ def prepare_tensors(self):
 class SmolLM3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.SMOLLM3
 
+    def set_vocab(self):
+        super().set_vocab()
+        # remove unsupported array slicing in chat template
+        # ref: https://huggingface.co/ggml-org/SmolLM3-3B-GGUF/discussions/1
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        if tokenizer.chat_template is not None:
+            chat_template = tokenizer.chat_template.replace("[:]", "")
+            self.gguf_writer.add_chat_template(chat_template)
+
 ###### CONVERSION LOGIC ######
 
 
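The "[:]" strip works around llama.cpp's template engine, which (per the comment above) does not support Jinja's full-slice syntax used by the upstream SmolLM3 template. A hypothetical fragment showing what the replacement changes (not the real template, which is much longer):

before = "{%- for message in messages[:] %}{{ message.content }}{%- endfor %}"
after = before.replace("[:]", "")
print(after)  # {%- for message in messages %}{{ message.content }}{%- endfor %}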
convert_hf_to_gguf_update.py

Lines changed: 5 additions & 0 deletions
@@ -139,6 +139,11 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
     {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
     {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
+    # falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
 ]
 

gguf-py/gguf/constants.py

Lines changed: 36 additions & 0 deletions
@@ -288,6 +288,7 @@ class MODEL_ARCH(IntEnum):
     LLAMA4 = auto()
     DECI = auto()
     FALCON = auto()
+    FALCON_H1 = auto()
     BAICHUAN = auto()
     GROK = auto()
     GPT2 = auto()
@@ -662,6 +663,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.DOTS1: "dots1",
     MODEL_ARCH.ARCEE: "arcee",
     MODEL_ARCH.ERNIE4_5: "ernie4_5",
+    MODEL_ARCH.FALCON_H1: "falcon-h1",
     MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
     MODEL_ARCH.SMOLLM3: "smollm3",
 }
@@ -2215,6 +2217,40 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.FALCON_H1: [
+        # Token embedding
+        MODEL_TENSOR.TOKEN_EMBD,
+
+        # Input layernorm
+        MODEL_TENSOR.ATTN_NORM,
+
+        # Attention components
+        MODEL_TENSOR.ATTN_Q,    # Query projection
+        MODEL_TENSOR.ATTN_K,    # Key projection
+        MODEL_TENSOR.ATTN_V,    # Value projection
+        MODEL_TENSOR.ATTN_OUT,  # Output projection
+
+        # SSM components (Mamba2 specific)
+        MODEL_TENSOR.SSM_IN,      # Input projection for SSM
+        MODEL_TENSOR.SSM_CONV1D,  # Convolution layer
+        MODEL_TENSOR.SSM_DT,      # Delta time projection
+        MODEL_TENSOR.SSM_A,       # A parameter (log form)
+        MODEL_TENSOR.SSM_D,       # D parameter
+        MODEL_TENSOR.SSM_NORM,    # Normalization in SSM
+        MODEL_TENSOR.SSM_OUT,     # Output projection
+
+        # Pre-feedforward layernorm
+        MODEL_TENSOR.FFN_PRE_NORM,
+
+        # Feed-forward network components
+        MODEL_TENSOR.FFN_GATE,  # Gate projection (SwiGLU)
+        MODEL_TENSOR.FFN_DOWN,  # Down projection
+        MODEL_TENSOR.FFN_UP,    # Up projection
+
+        # Post-feedforward layernorm
+        MODEL_TENSOR.OUTPUT_NORM,  # Final layer norm
+        MODEL_TENSOR.OUTPUT,       # Output projection (lm_head)
+    ],
     MODEL_ARCH.HUNYUAN_MOE: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
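Taken together, the three hunks register the architecture end to end: the enum member, its serialized name, and its tensor inventory. A quick sketch of how those tables would be consulted (assuming the gguf-py package is importable from this tree; simplified):

from gguf.constants import MODEL_ARCH, MODEL_ARCH_NAMES, MODEL_TENSORS

arch = MODEL_ARCH.FALCON_H1
print(MODEL_ARCH_NAMES[arch])    # "falcon-h1", written as general.architecture
print(len(MODEL_TENSORS[arch]))  # number of tensor kinds declared for the arch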

gguf-py/gguf/tensor_mapping.py

Lines changed: 10 additions & 0 deletions
@@ -286,12 +286,14 @@ class TensorNameMap:
         # Pre feed-forward norm
         MODEL_TENSOR.FFN_PRE_NORM: (
             "model.layers.{bid}.pre_feedforward_layernorm", # gemma2
+            "model.layers.{bid}.pre_ff_layernorm.weight",
         ),
 
         # Post feed-forward norm
         MODEL_TENSOR.FFN_POST_NORM: (
             "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
             "model.layers.{bid}.post_mlp_layernorm", # glm-4-0414
+            "model.layers.{bid}.feed_forward.up_proj",
         ),
 
         MODEL_TENSOR.FFN_GATE_INP: (
@@ -363,6 +365,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
             "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
             "model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4
+            "model.layers.{bid}.feed_forward.down_proj",
             "model.layers.{bid}.mlp.shared_mlp.up_proj", # hunyuan
         ),
 
@@ -553,11 +556,13 @@ class TensorNameMap:
         MODEL_TENSOR.SSM_IN: (
             "model.layers.{bid}.in_proj",
             "backbone.layers.{bid}.mixer.in_proj",
+            "model.layers.{bid}.mamba.in_proj",
         ),
 
         MODEL_TENSOR.SSM_CONV1D: (
             "model.layers.{bid}.conv1d",
             "backbone.layers.{bid}.mixer.conv1d",
+            "model.layers.{bid}.mamba.conv1d",
         ),
 
         MODEL_TENSOR.SSM_X: (
@@ -568,25 +573,30 @@ class TensorNameMap:
         MODEL_TENSOR.SSM_DT: (
             "model.layers.{bid}.dt_proj",
             "backbone.layers.{bid}.mixer.dt_proj",
+            "model.layers.{bid}.mamba.dt_proj",
         ),
 
         MODEL_TENSOR.SSM_A: (
             "model.layers.{bid}.A_log",
             "backbone.layers.{bid}.mixer.A_log",
+            "model.layers.{bid}.mamba.A_log",
         ),
 
         MODEL_TENSOR.SSM_D: (
             "model.layers.{bid}.D",
             "backbone.layers.{bid}.mixer.D",
+            "model.layers.{bid}.mamba.D",
         ),
 
         MODEL_TENSOR.SSM_NORM: (
+            "model.layers.{bid}.mamba.norm", # falcon-h1
             "backbone.layers.{bid}.mixer.norm", # mamba2
         ),
 
         MODEL_TENSOR.SSM_OUT: (
             "model.layers.{bid}.out_proj",
             "backbone.layers.{bid}.mixer.out_proj",
+            "model.layers.{bid}.mamba.out_proj", # falcon-h1
         ),
 
         MODEL_TENSOR.TIME_MIX_W0: (
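Each tuple entry is an HF-side name template that, once formatted with the block id, resolves to a canonical GGUF tensor name; the new model.layers.{bid}.mamba.* entries are what let the converter recognize Falcon-H1 checkpoints. A simplified sketch of that resolution (the real TensorNameMap also handles .weight/.bias suffixes and filters by architecture):

SSM_IN_TEMPLATES = (
    "model.layers.{bid}.in_proj",           # mamba
    "backbone.layers.{bid}.mixer.in_proj",  # mamba2
    "model.layers.{bid}.mamba.in_proj",     # falcon-h1 (added here)
)

def to_gguf_name(hf_name: str, n_blocks: int = 64) -> str | None:
    for bid in range(n_blocks):
        for template in SSM_IN_TEMPLATES:
            if hf_name == template.format(bid=bid):
                return f"blk.{bid}.ssm_in"
    return None

print(to_gguf_name("model.layers.7.mamba.in_proj"))  # blk.7.ssm_in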

src/llama-arch.cpp

Lines changed: 28 additions & 2 deletions
@@ -46,6 +46,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_STARCODER2, "starcoder2" },
     { LLM_ARCH_MAMBA, "mamba" },
     { LLM_ARCH_MAMBA2, "mamba2" },
+    { LLM_ARCH_FALCON_H1, "falcon-h1" },
     { LLM_ARCH_XVERSE, "xverse" },
     { LLM_ARCH_COMMAND_R, "command-r" },
     { LLM_ARCH_COHERE2, "cohere2" },
@@ -1024,6 +1025,30 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
         },
     },
+    {
+        LLM_ARCH_FALCON_H1,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+            { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+            { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+            { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+            { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
+            { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_XVERSE,
         {
@@ -1967,9 +1992,10 @@ bool llm_arch_is_recurrent(const llm_arch & arch) {
 }
 
 bool llm_arch_is_hybrid(const llm_arch & arch) {
-    // TODO: There are currently no hybrid models! Once there are, this will be
-    // the place to identify them
+    // List all mamba-attention hybrid models here
     switch (arch) {
+        case LLM_ARCH_FALCON_H1:
+            return true;
         default:
             return false;
     }

src/llama-arch.h

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@ enum llm_arch {
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
     LLM_ARCH_MAMBA2,
+    LLM_ARCH_FALCON_H1,
     LLM_ARCH_XVERSE,
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_COHERE2,
