Commit 103e894

update glm_asr convert script & use build_ffn for glm_asr clip & use build_stack for padding and review

1 parent e8a1ec5
2 files changed (+50, -28 lines)

convert_hf_to_gguf.py

Lines changed: 17 additions & 11 deletions
@@ -2359,7 +2359,6 @@ def prepare_tensors(self):
     "VLlama3ForCausalLM",
     "LlavaForConditionalGeneration",
     "VoxtralForConditionalGeneration",
-    "GlmasrModel",
     "LlamaModel")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
@@ -2411,16 +2410,6 @@ def set_vocab(self):
         # Apply to granite small models only
         if self.hparams.get("vocab_size", 32000) == 49152:
             self.gguf_writer.add_add_bos_token(False)
-        if isinstance(self.hparams.get("eos_token_id"), list):
-            from transformers import AutoTokenizer
-            tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
-            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
-            special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
-            special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
-            special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
-            special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
-            special_vocab.add_to_gguf(self.gguf_writer)
-            special_vocab.chat_template = "glmedge"

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -2575,6 +2564,22 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])


+@ModelBase.register("GlmasrModel")
+class GlmasrModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA
+
+    def set_vocab(self):
+        super().set_vocab()
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+
 @ModelBase.register("AfmoeForCausalLM")
 class AfmoeModel(LlamaModel):
     model_arch = gguf.MODEL_ARCH.AFMOE
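Review note: GlmasrModel now registers separately and inherits the full Llama conversion, overriding only set_vocab to pin GLM's special tokens from the tokenizer's added vocab. A minimal sketch of that role-to-id resolution, with a made-up dict standing in for tokenizer.get_added_vocab() (the ids are illustrative, not from a real checkpoint):

    # Toy stand-in for tokenizer.get_added_vocab(); real ids come from the GLM tokenizer.
    added_vocab = {"<|endoftext|>": 151329, "<|user|>": 151336}

    # Same role -> token mapping as GlmasrModel.set_vocab() above:
    # eos/unk/bos all resolve to <|endoftext|>, eot resolves to <|user|>.
    special_tokens = {role: added_vocab[tok] for role, tok in {
        "eos": "<|endoftext|>",
        "eot": "<|user|>",
        "unk": "<|endoftext|>",
        "bos": "<|endoftext|>",
    }.items()}
    print(special_tokens)  # {'eos': 151329, 'eot': 151336, 'unk': 151329, 'bos': 151329}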
@@ -9031,6 +9036,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLMA)
         self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
         self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
+        self.gguf_writer.add_audio_stack_factor(self.global_config["merge_factor"])

     def tensor_force_quant(self, name, new_name, bid, n_dims):
         if ".conv" in name and ".weight" in name:

tools/mtmd/clip.cpp

Lines changed: 33 additions & 17 deletions
@@ -1848,15 +1848,7 @@ struct clip_graph {
         if (model.audio_has_stack_frames()) {
             // StackAudioFrames
             // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
-            int64_t stride = n_embd * hparams.proj_stack_factor;
-            int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
-            int64_t pad = padded_len - ggml_nelements(cur);
-            if (pad > 0) {
-                cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
-                cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
-            }
-            cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
-                               ggml_row_size(cur->type, stride), 0);
+            cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
             cb(cur, "after_stacked", -1);
         }
@@ -1895,12 +1887,8 @@
             cur = ggml_norm(ctx0, cur, hparams.eps);
             cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
             cur = ggml_add(ctx0, cur, model.mm_norm_pre_b);
-            cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * 4, cur->ne[1] / 4);
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu_erf(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_2_b);
+            cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
+            cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_2_w, model.mm_2_b, hparams.ffn_op, 0);
             cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
             cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
         } else {
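Review note: the removed lines spelled out linear -> GELU -> linear by hand; build_ffn computes the same two-layer projection with the activation selected by hparams.ffn_op. A numpy sketch of that MLP under assumed shapes (not the real GLM-ASR dimensions), with mm_1_* as the up projection, mm_2_* as the down projection, and GELU assumed as the activation:

    import math
    import numpy as np

    def gelu_erf(x):
        # exact GELU, i.e. what ggml_gelu_erf computes
        return 0.5 * x * (1.0 + np.vectorize(math.erf)(x / math.sqrt(2.0)))

    rng = np.random.default_rng(0)
    n_tok, d_in, d_out = 25, 4096, 4096  # assumed projector shapes
    x  = rng.standard_normal((n_tok, d_in)).astype(np.float32)
    w1 = rng.standard_normal((d_out, d_in)).astype(np.float32) * 0.02   # mm_1_w
    b1 = np.zeros(d_out, dtype=np.float32)                              # mm_1_b
    w2 = rng.standard_normal((d_out, d_out)).astype(np.float32) * 0.02  # mm_2_w
    b2 = np.zeros(d_out, dtype=np.float32)                              # mm_2_b

    # build_ffn(cur, mm_1_w, mm_1_b, ..., mm_2_w, mm_2_b, ffn_op, 0) with a
    # GELU ffn_op reduces to: down(gelu(up(x)))
    y = gelu_erf(x @ w1.T + b1) @ w2.T + b2
    print(y.shape)  # (25, 4096)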
@@ -2486,6 +2474,32 @@
         return cur;
     }

+    // Generic function to stack frames for audio processing
+    // Abstracts out the StackAudioFrames logic used by ultravox
+    ggml_tensor * build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) {
+        if (stack_factor <= 1) {
+            return cur;
+        }
+
+        int64_t total_elements = ggml_nelements(cur);
+        int64_t stride = n_embed * stack_factor;
+
+        // Calculate padded length
+        int64_t padded_len = GGML_PAD(total_elements, stride);
+        int64_t pad = padded_len - total_elements;
+
+        if (pad > 0) {
+            // Pad the tensor to make it divisible by stride
+            cur = ggml_view_1d(ctx0, cur, total_elements, 0);
+            cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
+        }
+
+        // Reshape to [stride, padded_len / stride]
+        cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
+                           ggml_row_size(cur->type, stride), 0);
+        return cur;
+    }
+
 };

 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
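Review note: to sanity-check the padding arithmetic, here is the same stack-and-pad logic in numpy with illustrative sizes. GGML_PAD(x, n) rounds x up to a multiple of n, which the ceil division below reproduces:

    import numpy as np

    def stack_frames(x: np.ndarray, stack_factor: int) -> np.ndarray:
        """Mirror of build_stack(): flatten, zero-pad to a multiple of
        n_embd * stack_factor, then regroup so each output row holds
        stack_factor consecutive frames."""
        if stack_factor <= 1:
            return x
        n_embd = x.shape[1]
        stride = n_embd * stack_factor
        flat = x.reshape(-1)
        padded_len = -(-flat.size // stride) * stride  # GGML_PAD equivalent
        flat = np.pad(flat, (0, padded_len - flat.size))
        return flat.reshape(padded_len // stride, stride)

    x = np.arange(10 * 6, dtype=np.float32).reshape(10, 6)  # 10 frames, n_embd = 6
    y = stack_frames(x, 4)
    print(y.shape)  # (3, 24): ceil(10 / 4) = 3 rows, the last one zero-padded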
@@ -2864,10 +2878,12 @@
             } break;
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_QWEN2A:
+        case PROJECTOR_TYPE_GLMA:
         case PROJECTOR_TYPE_VOXTRAL:
             {
                 bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
-                                     model.proj_type == PROJECTOR_TYPE_VOXTRAL;
+                                     model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
+                                     model.proj_type == PROJECTOR_TYPE_GLMA;
                 get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
                 if (hparams.n_mel_bins != 128) {
                     throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
@@ -4640,7 +4656,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                 // whisper downscales input token by half after conv1d
                 n_patches /= 2;
                 // reshape by merge_factor
-                n_patches /= 4;
+                n_patches /= ctx->model.hparams.proj_stack_factor;
                 // for BOI and EOI token embeddings
                 n_patches += 2;
             } break;
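Review note: with the hardcoded 4 replaced by proj_stack_factor, the GLMA output token count follows the same pipeline as the graph above. A worked example, assuming 3000 mel frames (roughly 30 s of audio under Whisper-style framing) and a stack factor of 4:

    n_patches = 3000  # assumed mel frame count
    n_patches //= 2   # whisper conv1d downscales by half -> 1500
    n_patches //= 4   # proj_stack_factor (previously hardcoded as 4) -> 375
    n_patches += 2    # BOI + EOI embeddings -> 377
    print(n_patches)  # 377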
