
Commit 7590a0e

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.github/workflows/build.yml
#	ggml/CMakeLists.txt
#	ggml/cmake/ggml-config.cmake.in
#	ggml/src/CMakeLists.txt
#	models/templates/README.md
#	tools/imatrix/imatrix.cpp
2 parents: 428a074 + ee3a9fc

22 files changed: +1124 -423 lines changed

common/arg.cpp

Lines changed: 26 additions & 10 deletions
@@ -26,6 +26,7 @@
 #include <cstdarg>
 #include <filesystem>
 #include <fstream>
+#include <list>
 #include <regex>
 #include <set>
 #include <string>
@@ -2377,20 +2378,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 }
                 throw std::invalid_argument("unknown buffer type");
             }
-            // FIXME: this leaks memory
-            params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+            // keep strings alive and avoid leaking memory by storing them in a static vector
+            static std::list<std::string> buft_overrides;
+            buft_overrides.push_back(tensor_name);
+            params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
         }
     }
 ));
 add_opt(common_arg(
-    {"--cpu-moe"},
-    "use CPU for Mixture of Experts (MoE) weights",
+    {"--cpu-moe", "-cmoe"},
+    "keep all Mixture of Experts (MoE) weights in the CPU",
     [](common_params & params) {
-        params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-        params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-        params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+        params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
     }
 ).set_env("LLAMA_ARG_CPU_MOE"));
+add_opt(common_arg(
+    {"--n-cpu-moe", "-ncmoe"}, "N",
+    "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
+    [](common_params & params, int value) {
+        if (value < 0) {
+            throw std::invalid_argument("invalid value");
+        }
+        for (int i = 0; i < value; ++i) {
+            // keep strings alive and avoid leaking memory by storing them in a static vector
+            static std::list<std::string> buft_overrides;
+            buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+            params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+        }
+    }
+).set_env("LLAMA_ARG_N_CPU_MOE"));
 add_opt(common_arg(
     {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
     "number of layers to store in VRAM",
@@ -2651,10 +2667,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
 add_opt(common_arg(
     {"--output-format"}, "{gguf,dat}",
-    string_format("output format for imatrix file (default: %s)", params.imat_dat ? "dat" : "gguf"),
+    string_format("output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
     [](common_params & params, const std::string & value) {
-        /**/ if (value == "gguf") { params.imat_dat = false; }
-        else if (value == "dat") { params.imat_dat = true; }
+        /**/ if (value == "gguf") { params.imat_dat = -1; }
+        else if (value == "dat") { params.imat_dat = 1; }
         else { throw std::invalid_argument("invalid output format"); }
     }
 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
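
Note on the lifetime fix above: both new code paths park each regex string in a function-local static std::list<std::string> and hand the .back().c_str() pointer to tensor_buft_overrides, so the C string stays valid for the life of the process instead of being strdup'd and leaked. A std::list (despite the comment saying "vector") is the right container because push_back never relocates existing elements, so earlier c_str() pointers remain stable. A minimal standalone sketch of that pattern (illustrative only, not code from the patch):

    #include <cstdio>
    #include <list>
    #include <string>
    #include <vector>

    int main() {
        // std::list::push_back never relocates existing nodes, so pointers taken
        // from earlier elements stay valid after later insertions.
        static std::list<std::string> storage;
        std::vector<const char *> patterns;

        for (int i = 0; i < 3; ++i) {
            storage.push_back("blk." + std::to_string(i) + ".ffn_up_exps");
            patterns.push_back(storage.back().c_str()); // remains valid below
        }

        for (const char * p : patterns) {
            std::printf("%s\n", p);
        }
        return 0;
    }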
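The --cpu-moe override also collapses the three per-projection patterns into a single regex, \.ffn_(up|down|gate)_exps, and --n-cpu-moe N emits one such pattern per layer, blk\.<i>\.ffn_(up|down|gate)_exps for i in [0, N). These override patterns are applied as regexes against tensor names; the standalone check below uses std::regex_search purely to illustrate which tensors end up pinned to the CPU buffer type (it is not the loader's actual matching code):

    #include <cstdio>
    #include <regex>
    #include <string>

    int main() {
        // The collapsed pattern from --cpu-moe; unanchored, so a substring match suffices.
        const std::regex cpu_moe("\\.ffn_(up|down|gate)_exps");

        const char * names[] = {
            "blk.0.ffn_up_exps.weight",   // matches -> CPU
            "blk.7.ffn_gate_exps.weight", // matches -> CPU
            "blk.7.ffn_gate_inp.weight",  // no match -> default placement
        };

        for (const char * name : names) {
            std::printf("%-28s -> %s\n", name,
                        std::regex_search(std::string(name), cpu_moe) ? "CPU" : "default");
        }
        return 0;
    }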

common/common.h

Lines changed: 1 addition & 1 deletion
@@ -435,7 +435,7 @@ struct common_params {
     int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
     int32_t n_save_freq = 0;  // save the imatrix every n_save_freq iterations
     int32_t i_chunk     = 0;  // start processing from this chunk
-    bool    imat_dat    = false; // whether the legacy imatrix.dat format should be output
+    int8_t  imat_dat    = 0;  // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)

     bool process_output = false; // collect data for the output tensor
     bool compute_ppl    = true;  // whether to compute perplexity
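
imat_dat is now a signed tri-state rather than a bool: --output-format dat stores 1, --output-format gguf stores -1, and 0 means the user made no explicit choice (the header comment reads "gguf <= 0 < dat"). A small sketch of how a consumer might interpret the field; the helper name and the fallback rule are assumptions for illustration, since tools/imatrix/imatrix.cpp is not part of this diff:

    #include <cstdint>
    #include <cstdio>

    // Hypothetical helper mirroring the "gguf <= 0 < dat" convention from common.h.
    // How the unset case (0) is resolved is assumed here; the real imatrix tool may
    // use a different fallback (e.g. the output file extension).
    static const char * imatrix_format(int8_t imat_dat, const char * fallback) {
        if (imat_dat > 0) { return "dat";  }  // --output-format dat
        if (imat_dat < 0) { return "gguf"; }  // --output-format gguf
        return fallback;                      // 0: no explicit choice
    }

    int main() {
        std::printf("%s\n", imatrix_format( 1, "gguf")); // dat
        std::printf("%s\n", imatrix_format(-1, "dat"));  // gguf
        std::printf("%s\n", imatrix_format( 0, "gguf")); // gguf (fallback)
        return 0;
    }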

convert_hf_to_gguf.py

Lines changed: 136 additions & 0 deletions
@@ -678,6 +678,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
             # ref: https://huggingface.co/THUDM/glm-4-9b-hf
             res = "glm4"
+        if chkhsh == "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902":
+            # ref: https://huggingface.co/zai-org/GLM-4.5-Air
+            res = "glm4"
         if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
             # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
             res = "minerva-7b"
@@ -6696,6 +6699,139 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("Glm4MoeForCausalLM")
+class Glm4MoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.GLM4_MOE
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # GLM4_MOE has num_hidden_layers + 1 actual layers (including NextN layer)
+        self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        # Special tokens
+        # Note: Using <|endoftext|> (151329) for eot causes endless generation
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"])  # 151331
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])  # 151336
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])  # 151329
+        special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"])  # 151338
+
+        # Patch broken chat template
+        if isinstance(special_vocab.chat_template, str) and "visible_text(m.content).endswith" in special_vocab.chat_template:
+            special_vocab.chat_template = special_vocab.chat_template.replace(
+                """{{ visible_text(m.content) }}\n{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothink")) else '' -}}""",
+                """{% set content = visible_text(m.content) %}{{ content }}\n{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not content.endswith("/nothink")) else '' -}}""")
+
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (rope_dim := self.hparams.get("head_dim")) is None:
+            rope_dim = (
+                self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+            )
+        self.gguf_writer.add_rope_dimension_count(
+            int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
+        )
+
+        # MoE parameters - Use only routed expert count (shared experts handled separately)
+        if (n_routed_experts := self.hparams.get("n_routed_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_routed_experts)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+        if (n_shared_experts := self.hparams.get("n_shared_experts")) is not None:
+            self.gguf_writer.add_expert_shared_count(n_shared_experts)
+        if (first_k_dense_replace := self.hparams.get("first_k_dense_replace")) is not None:
+            self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
+
+        # Expert gating function (sigmoid for GLM4_MOE)
+        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+
+        # Routed scaling factor
+        if (routed_scaling_factor := self.hparams.get("routed_scaling_factor")) is not None:
+            self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
+
+        # Normalise topk probabilities
+        if (norm_topk_prob := self.hparams.get("norm_topk_prob")) is not None:
+            self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
+
+        # NextN/MTP prediction layers
+        if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
+            self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(
+        self, data_torch: Tensor, name: str, bid: int | None
+    ) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("model.visual."):  # ignore visual part
+            return []
+        elif name.startswith("model.language_model."):
+            name = name.replace("language_model.", "")  # for multimodal variants
+
+        # Handle main token embedding (but not layer-specific NextN embeddings)
+        if name == "model.embed_tokens.weight" and ".layers." not in name:
+            return [(self.map_tensor_name("token_embd.weight"), data_torch)]
+
+        # Handle routed experts
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        new_name = self.map_tensor_name(name)
+
+        return [(new_name, data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
 class ChatGLMModel(TextModel):
     model_arch = gguf.MODEL_ARCH.CHATGLM

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -147,6 +147,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b"},
     {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
     {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
+    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.5-Air", "chkhsh": "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902"},
     {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
     {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
     {"name": "hunyuan-dense", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-4B-Instruct", "chkhsh": "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6"},

ggml/src/ggml-backend-reg.cpp

Lines changed: 3 additions & 0 deletions
@@ -499,6 +499,9 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,

     std::vector<fs::path> search_paths;
     if (user_search_path == nullptr) {
+#ifdef GGML_BACKEND_DIR
+        search_paths.push_back(fs::u8path(GGML_BACKEND_DIR));
+#endif
         // default search paths: executable directory, current directory
         search_paths.push_back(get_executable_path());
         search_paths.push_back(fs::current_path());
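
The backend loader now consults a compile-time install directory first: when the build defines GGML_BACKEND_DIR (presumably set by the CMake files touched in this merge to the backends' install location), that path is prepended to the default search list, ahead of the executable and current directories. A minimal sketch of the same pattern with a hypothetical macro name, to show how a -D define becomes the highest-priority search path (illustrative, not ggml's code):

    // Compile with e.g.: c++ -std=c++17 -DEXAMPLE_BACKEND_DIR="\"/usr/lib/example/backends\"" demo.cpp
    #include <filesystem>
    #include <iostream>
    #include <vector>

    namespace fs = std::filesystem;

    static std::vector<fs::path> default_search_paths() {
        std::vector<fs::path> paths;
    #ifdef EXAMPLE_BACKEND_DIR
        paths.push_back(fs::u8path(EXAMPLE_BACKEND_DIR)); // install dir baked in at build time
    #endif
        paths.push_back(fs::current_path());               // then runtime locations
        return paths;
    }

    int main() {
        for (const auto & p : default_search_paths()) {
            std::cout << p << '\n'; // paths are tried in this order
        }
        return 0;
    }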
