Skip to content
Merged
55 changes: 55 additions & 0 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ def prepare_tensors(self):
gguf.MODEL_TENSOR.POS_EMBD,
gguf.MODEL_TENSOR.TOKEN_TYPES,
gguf.MODEL_TENSOR.SSM_CONV1D,
gguf.MODEL_TENSOR.SHORTCONV_CONV,
gguf.MODEL_TENSOR.TIME_MIX_FIRST,
gguf.MODEL_TENSOR.TIME_MIX_W1,
gguf.MODEL_TENSOR.TIME_MIX_W2,
Expand Down Expand Up @@ -833,6 +834,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b":
# ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base
res = "falcon-h1"
if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
# ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
res = "lfm2"

if res is None:
logger.warning("\n")
Expand Down Expand Up @@ -6943,6 +6947,57 @@ def set_vocab(self):
chat_template = tokenizer.chat_template.replace("[:]", "")
self.gguf_writer.add_chat_template(chat_template)


@ModelBase.register("LFM2ForCausalLM")
class LFM2Model(TextModel):
model_arch = gguf.MODEL_ARCH.LFM2

def _add_feed_forward_length(self):
ff_dim = self.hparams["block_ff_dim"]

auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"]
ff_dim = self.hparams["block_ff_dim"]
ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"]
multiple_of = self.hparams["block_multiple_of"]

if auto_adjust_ff_dim:
ff_dim = int(2 * ff_dim / 3)
# custom dim factor multiplier
if ffn_dim_multiplier is not None:
ff_dim = int(ffn_dim_multiplier * ff_dim)
ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)

self.gguf_writer.add_feed_forward_length(ff_dim)

def set_gguf_parameters(self):
# set num_key_value_heads only for attention layers
self.hparams["num_key_value_heads"] = [
self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
for layer_type in self.hparams["layer_types"]
]

super().set_gguf_parameters()
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"])
self._add_feed_forward_length()

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if 'operator_norm' in name:
name = name.replace('operator_norm', 'norm')
elif 'attention.k_layernorm' in name or 'attention.q_layernorm' in name:
name = name.replace('attention', 'self_attn')
elif name.startswith("model.embedding_norm"):
name = name.replace("model.embedding_norm", 'word_embeddings_layernorm')
elif 'conv.conv' in name:
# conv op requires 2d tensor
data_torch = data_torch.squeeze(1)
elif 'self_attn.out_proj' in name:
name = name.replace('out_proj', 'o_proj')

return [(self.map_tensor_name(name), data_torch)]


###### CONVERSION LOGIC ######


Expand Down
1 change: 1 addition & 0 deletions convert_hf_to_gguf_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
{"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
{"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
]

# some models are known to be broken upstream, so we will skip them as exceptions
Expand Down
8 changes: 8 additions & 0 deletions ggml/src/ggml-cuda/ssm-conv.cu
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,9 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
if (nc == 4) {
ssm_conv_f32<threads, 4><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
dst, dst_nb0, dst_nb1, dst_nb2, n_t);
} else if (nc == 3) {
ssm_conv_f32<threads, 3><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
dst, dst_nb0, dst_nb1, dst_nb2, n_t);
} else {
GGML_ABORT("Only support kernel size = 4 now.");
}
Expand All @@ -116,6 +119,11 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
ssm_conv_long_token_f32<threads, 4, split_n_t><<<blocks, threads, 0, stream>>>(
src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
} else if (nc == 3) {
const int64_t split_n_t = 32;
dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
ssm_conv_long_token_f32<threads, 3, split_n_t><<<blocks, threads, 0, stream>>>(
src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
} else {
GGML_ABORT("Only support kernel size = 4 right now.");
}
Expand Down
29 changes: 29 additions & 0 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,9 @@ class ConvNext:
class Classifier:
OUTPUT_LABELS = "{arch}.classifier.output_labels"

class ShortConv:
L_CACHE = "{arch}.shortconv.l_cache"

class Tokenizer:
MODEL = "tokenizer.ggml.model"
PRE = "tokenizer.ggml.pre"
Expand Down Expand Up @@ -361,6 +364,7 @@ class MODEL_ARCH(IntEnum):
ERNIE4_5 = auto()
HUNYUAN_MOE = auto()
SMOLLM3 = auto()
LFM2 = auto()


class VISION_PROJECTOR_TYPE(IntEnum):
Expand Down Expand Up @@ -532,6 +536,9 @@ class MODEL_TENSOR(IntEnum):
POSNET_ATTN_K = auto()
POSNET_ATTN_V = auto()
POSNET_ATTN_OUT = auto()
SHORTCONV_CONV = auto()
SHORTCONV_INPROJ = auto()
SHORTCONV_OUTPROJ = auto()
# vision
V_MMPROJ = auto()
V_MMPROJ_FC = auto()
Expand Down Expand Up @@ -671,6 +678,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.FALCON_H1: "falcon-h1",
MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
MODEL_ARCH.SMOLLM3: "smollm3",
MODEL_ARCH.LFM2: "lfm2",
}

VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
Expand Down Expand Up @@ -842,6 +850,9 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k",
MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v",
MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output",
MODEL_TENSOR.SHORTCONV_CONV: "blk.{bid}.shortconv.conv",
MODEL_TENSOR.SHORTCONV_INPROJ: "blk.{bid}.shortconv.in_proj",
MODEL_TENSOR.SHORTCONV_OUTPROJ: "blk.{bid}.shortconv.out_proj",
# vision
MODEL_TENSOR.V_MMPROJ: "mm.{bid}",
MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc",
Expand Down Expand Up @@ -2324,6 +2335,24 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.LFM2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.SHORTCONV_CONV,
MODEL_TENSOR.SHORTCONV_INPROJ,
MODEL_TENSOR.SHORTCONV_OUTPROJ,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.ATTN_NORM, # operator_norm
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
],
# TODO
}

Expand Down
3 changes: 3 additions & 0 deletions gguf-py/gguf/gguf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,9 @@ def add_convnext_embedding_length(self, length: int) -> None:
def add_convnext_block_count(self, length: int) -> None:
self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length)

def add_shortconv_l_cache(self, length: int) -> None:
self.add_uint32(Keys.ShortConv.L_CACHE.format(arch=self.arch), length)

def add_block_count(self, length: int) -> None:
self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)

Expand Down
12 changes: 12 additions & 0 deletions gguf-py/gguf/tensor_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -1015,6 +1015,18 @@ class TensorNameMap:
"backbone.posnet.{bid}.proj_out", # wavtokenizer
),

MODEL_TENSOR.SHORTCONV_CONV: (
"model.layers.{bid}.conv.conv",
),

MODEL_TENSOR.SHORTCONV_INPROJ: (
"model.layers.{bid}.conv.in_proj",
),

MODEL_TENSOR.SHORTCONV_OUTPROJ: (
"model.layers.{bid}.conv.out_proj",
),

#############################################################################
## Vision encoder

Expand Down
28 changes: 28 additions & 0 deletions src/llama-arch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_ERNIE4_5, "ernie4_5" },
{ LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
{ LLM_ARCH_SMOLLM3, "smollm3" },
{ LLM_ARCH_LFM2, "lfm2" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};

Expand Down Expand Up @@ -188,6 +189,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {

{ LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },

{ LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },

{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
Expand Down Expand Up @@ -1793,6 +1796,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_LFM2,
{
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_SHORTCONV_CONV, "blk.%d.shortconv.conv" },
{ LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
{ LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
}
},
{
LLM_ARCH_UNKNOWN,
{
Expand Down Expand Up @@ -1960,6 +1984,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
};

LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
Expand Down Expand Up @@ -2031,6 +2058,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
switch (arch) {
case LLM_ARCH_JAMBA:
case LLM_ARCH_FALCON_H1:
case LLM_ARCH_LFM2:
return true;
default:
return false;
Expand Down
6 changes: 6 additions & 0 deletions src/llama-arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ enum llm_arch {
LLM_ARCH_ERNIE4_5,
LLM_ARCH_HUNYUAN_MOE,
LLM_ARCH_SMOLLM3,
LLM_ARCH_LFM2,
LLM_ARCH_UNKNOWN,
};

Expand Down Expand Up @@ -227,6 +228,8 @@ enum llm_kv {

LLM_KV_CLASSIFIER_OUTPUT_LABELS,

LLM_KV_SHORTCONV_L_CACHE,

// deprecated:
LLM_KV_TOKENIZER_PREFIX_ID,
LLM_KV_TOKENIZER_SUFFIX_ID,
Expand Down Expand Up @@ -396,6 +399,9 @@ enum llm_tensor {
LLM_TENSOR_POS_NET_ATTN_K,
LLM_TENSOR_POS_NET_ATTN_V,
LLM_TENSOR_POS_NET_ATTN_OUT,
LLM_TENSOR_SHORTCONV_CONV,
LLM_TENSOR_SHORTCONV_INPROJ,
LLM_TENSOR_SHORTCONV_OUTPROJ,
};

enum llm_tensor_layer {
Expand Down
5 changes: 5 additions & 0 deletions src/llama-hparams.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,11 @@ uint32_t llama_hparams::n_embd_r() const {
return token_shift_count * n_embd;
}

if (n_shortconv_l_cache != 0) {
// for LFM2 models
return n_embd * (n_shortconv_l_cache - 1);
}

// TODO: maybe support other convolution strides than 1
// NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
// Corresponds to Mamba's conv_states size
Expand Down
2 changes: 2 additions & 0 deletions src/llama-hparams.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ struct llama_hparams {
struct llama_hparams_posnet posnet;
struct llama_hparams_convnext convnext;

uint32_t n_shortconv_l_cache = 0;

std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
Expand Down
Loading