Commit ba49bb3

Merge branch 'ggml-org:master' into mradermacher
2 parents: f368820 + db97837


59 files changed: +21112 / -531 lines

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ The project differentiates between 3 levels of contributors:
 - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
 - Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
-- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
+- If your PR becomes stale, rebase it on top of latest `master` to get maintainers attention
 - Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
 - Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs
 - Using AI to generate PRs is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before publishing the PR. Note that trivial tab autocompletions do not require disclosure.

README.md

Lines changed: 1 addition & 0 deletions
@@ -276,6 +276,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [MUSA](docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
 | [HIP](docs/build.md#hip) | AMD GPU |
+| [ZenDNN](docs/build.md#zendnn) | AMD CPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |

convert_hf_to_gguf.py

Lines changed: 177 additions & 83 deletions
@@ -1534,6 +1534,79 @@ def _set_vocab_interns1(self):
         special_vocab._set_special_token("bos", 151643)
         special_vocab.add_to_gguf(self.gguf_writer)

+    def _set_vocab_mistral(self):
+        if not _mistral_common_installed:
+            raise ImportError(_mistral_import_error_msg)
+
+        vocab = MistralVocab(self.dir_model)
+        logger.info(
+            f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
+        )
+
+        self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
+
+        tokens = []
+        scores = []
+        toktypes = []
+
+        for text, score, toktype in vocab.all_tokens():
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        assert len(tokens) == vocab.vocab_size, (
+            f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
+        )
+
+        if vocab.tokenizer_type == MistralTokenizerType.tekken:
+            self.gguf_writer.add_tokenizer_pre("tekken")
+            self.gguf_writer.add_token_merges(
+                vocab.extract_vocab_merges_from_model()
+            )
+
+        logger.info(
+            f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
+        )
+
+        self.gguf_writer.add_bos_token_id(vocab.bos_id)
+        self.gguf_writer.add_eos_token_id(vocab.eos_id)
+        self.gguf_writer.add_unk_token_id(vocab.unk_id)
+        self.gguf_writer.add_pad_token_id(vocab.pad_id)
+
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_vocab_size(vocab.vocab_size)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(False)
+
+        local_template_file_path = self.dir_model / "chat_template.jinja"
+
+        if self.is_mistral_format and local_template_file_path.is_file():
+            # Ministral-3 and other new Mistral models come with chat templates.
+            # ref: https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512/tree/main
+            logger.info("Using an existing Mistral local chat template.")
+
+            with open(local_template_file_path, "r", encoding="utf-8") as f:
+                template = f.read()
+        elif not self.is_mistral_format or not self.disable_mistral_community_chat_template:
+            template_dir = Path(__file__).parent / "models/templates/"
+
+            # Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`.
+            if self.is_mistral_format:
+                logger.info(
+                    "Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. "
+                    "Mistral recommends to use `mistral-common` to perform tokenization and detokenization."
+                )
+            template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format)
+        else:
+            logger.info("Not using a Mistral local or community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.")
+            template = None
+
+        if template is not None:
+            self.gguf_writer.add_chat_template(template)
+

 class MmprojModel(ModelBase):
     model_type = ModelType.MMPROJ
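Note (illustrative, not part of the diff): the new `_set_vocab_mistral` resolves the chat template with a three-way precedence: a local `chat_template.jinja` shipped with the model wins, otherwise a community template is used unless explicitly disabled, otherwise no template is embedded and `mistral-common` must handle tokenization. A minimal standalone Python sketch of that precedence follows; `load_community_template` is a hypothetical stand-in for `MistralModel.get_community_chat_template`, not a real helper in the converter.

from pathlib import Path
from typing import Optional


def load_community_template(model_dir: Path) -> str:
    # hypothetical stand-in for MistralModel.get_community_chat_template(...)
    return "{# community chat template would be loaded here #}"


def resolve_chat_template(model_dir: Path, is_mistral_format: bool,
                          disable_community_template: bool) -> Optional[str]:
    local = model_dir / "chat_template.jinja"
    if is_mistral_format and local.is_file():
        # newer Mistral releases ship their own template next to the weights
        return local.read_text(encoding="utf-8")
    if not is_mistral_format or not disable_community_template:
        # fall back to a community-maintained template
        return load_community_template(model_dir)
    # no template at all: tokenize/detokenize via mistral-common instead
    return None


print(resolve_chat_template(Path("."), is_mistral_format=True,
                            disable_community_template=True))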
@@ -2304,79 +2377,6 @@ def __init__(self, *args, **kwargs):
         if self.hf_arch == "VLlama3ForCausalLM":
             self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)

-    def _set_vocab_mistral(self):
-        if not _mistral_common_installed:
-            raise ImportError(_mistral_import_error_msg)
-
-        vocab = MistralVocab(self.dir_model)
-        logger.info(
-            f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
-        )
-
-        self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
-
-        tokens = []
-        scores = []
-        toktypes = []
-
-        for text, score, toktype in vocab.all_tokens():
-            tokens.append(text)
-            scores.append(score)
-            toktypes.append(toktype)
-
-        assert len(tokens) == vocab.vocab_size, (
-            f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
-        )
-
-        if vocab.tokenizer_type == MistralTokenizerType.tekken:
-            self.gguf_writer.add_tokenizer_pre("tekken")
-            self.gguf_writer.add_token_merges(
-                vocab.extract_vocab_merges_from_model()
-            )
-
-        logger.info(
-            f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
-        )
-
-        self.gguf_writer.add_bos_token_id(vocab.bos_id)
-        self.gguf_writer.add_eos_token_id(vocab.eos_id)
-        self.gguf_writer.add_unk_token_id(vocab.unk_id)
-        self.gguf_writer.add_pad_token_id(vocab.pad_id)
-
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-        self.gguf_writer.add_vocab_size(vocab.vocab_size)
-
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(False)
-
-        local_template_file_path = self.dir_model / "chat_template.jinja"
-
-        if self.is_mistral_format and local_template_file_path.is_file():
-            # Ministral-3 and other new Mistral models come with chat templates.
-            # ref: https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512/tree/main
-            logger.info("Using an existing Mistral local chat template.")
-
-            with open(local_template_file_path, "r", encoding="utf-8") as f:
-                template = f.read()
-        elif not self.is_mistral_format or not self.disable_mistral_community_chat_template:
-            template_dir = Path(__file__).parent / "models/templates/"
-
-            # Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`.
-            if self.is_mistral_format:
-                logger.info(
-                    "Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. "
-                    "Mistral recommends to use `mistral-common` to perform tokenization and detokenization."
-                )
-            template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format)
-        else:
-            logger.info("Not using a Mistral local or community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.")
-            template = None
-
-        if template is not None:
-            self.gguf_writer.add_chat_template(template)
-
     def set_vocab(self):
         if self.is_mistral_format:
             return self._set_vocab_mistral()
@@ -9934,17 +9934,109 @@ def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mis

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        if "yarn" in self.hparams:
-            yarn_params = self.hparams["yarn"]
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(yarn_params["factor"])
-            self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"])
-            self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"])
-            self.gguf_writer.add_rope_scaling_yarn_log_mul(1.0) # mscale_all_dim
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])
+        MistralModel.set_mistral_config(self.gguf_writer, self.hparams)

-        if "llama_4_scaling" in self.hparams:
-            self.gguf_writer.add_attn_temperature_scale(self.hparams["llama_4_scaling"]["beta"])
+    @staticmethod
+    def set_mistral_config(gguf_writer: gguf.GGUFWriter, hparams: dict):
+        if "yarn" in hparams:
+            yarn_params = hparams["yarn"]
+            gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            gguf_writer.add_rope_scaling_factor(yarn_params["factor"])
+            gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"])
+            gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"])
+            gguf_writer.add_rope_scaling_yarn_log_mul(1.0) # mscale_all_dim
+            gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])
+
+        if "llama_4_scaling" in hparams:
+            gguf_writer.add_attn_temperature_scale(hparams["llama_4_scaling"]["beta"])
+
+
+class MistralMoeModel(DeepseekV2Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
+    model_name = "Mistral"
+    hf_arch = ""
+    is_mistral_format = True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        logger.info("Using MistralMoeModel")
+        # remap hparams from Mistral MoE format to DeepseekV2 format
+        # we do this way to be able to reuse DeepseekV2Model set_gguf_parameters logic
+        # ref: https://github.com/vllm-project/vllm/blob/b294e28db2c5dee61bc25157664edcada8b90b31/vllm/transformers_utils/configs/mistral.py
+        config = self.hparams
+        # Mistral key -> HF key
+        config_mapping = {
+            "dim": "hidden_size",
+            "norm_eps": "rms_norm_eps",
+            "n_kv_heads": "num_key_value_heads",
+            "n_layers": "num_hidden_layers",
+            "n_heads": "num_attention_heads",
+            "hidden_dim": "intermediate_size",
+        }
+        # HF key -> (Mistral key, default value)
+        top_level_mapping_with_default = {
+            "model_type": ("model_type", "transformer"),
+            "hidden_act": ("activation", "silu"),
+            "tie_word_embeddings": ("tied_embeddings", False),
+            "max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)),
+            "max_position_embeddings": ("max_position_embeddings", 128_000),
+        }
+        # mapping top-level keys
+        for key, new_key in config_mapping.items():
+            if key in config:
+                config[new_key] = config[key]
+        for new_key, (key, default_value) in top_level_mapping_with_default.items():
+            config[new_key] = config.get(key, default_value)
+        # mapping MoE-specific keys
+        moe_config_map = {
+            "route_every_n": "moe_layer_freq",
+            "first_k_dense_replace": "first_k_dense_replace",
+            "num_experts_per_tok": "num_experts_per_tok",
+            "num_experts": "n_routed_experts",
+            "expert_hidden_dim": "moe_intermediate_size",
+            "routed_scale": "routed_scaling_factor",
+            "num_shared_experts": "n_shared_experts",
+            "num_expert_groups": "n_group",
+            "num_expert_groups_per_tok": "topk_group",
+        }
+        moe = config["moe"]
+        for key, new_key in moe_config_map.items():
+            if key in moe:
+                config[new_key] = moe[key]
+        # provide missing values
+        config["topk_method"] = None
+        config["norm_topk_prob"] = True
+        config["scoring_func"] = "softmax"
+
+    def set_vocab(self):
+        self._set_vocab_mistral()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
+        yarn_params = self.hparams["yarn"]
+        self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"])
+        self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.startswith("vision_") or name.startswith("patch_merger.") or "mm_projector" in name:
+            return []
+
+        # rename certain tensors so that we can reuse DeepseekV2Model modify_tensors logic
+        if name.endswith(".qscale_act"):
+            name = name.replace(".qscale_act", ".input_scale")
+        if name.endswith(".qscale_weight"):
+            name = name.replace(".qscale_weight", ".weight_scale")
+        if ".wkv_b." in name:
+            name = name.replace(".wkv_b.", ".kv_b_proj.")
+        if ".experts." in name:
+            name = name.replace(".experts.", ".mlp.experts.")
+        name = name.replace(".w1.", ".gate_proj.")
+        name = name.replace(".w2.", ".down_proj.")
+        name = name.replace(".w3.", ".up_proj.")
+        name = "model." + name
+
+        return super().modify_tensors(data_torch, name, bid)


 class PixtralModel(LlavaVisionModel):
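Note (illustrative, not part of the diff): the `MistralMoeModel.__init__` above only aliases Mistral-named hparams onto the HF/DeepseekV2-style names that the inherited `DeepseekV2Model` logic reads; the original keys stay in place. A small runnable sketch of that remapping on a made-up config; the values and the reduced key subset below are hypothetical, not taken from any real checkpoint.

# toy Mistral-style params.json content (hypothetical values)
config = {
    "dim": 6144,
    "n_layers": 60,
    "n_heads": 48,
    "n_kv_heads": 8,
    "moe": {"num_experts": 128, "num_experts_per_tok": 4},
}

# Mistral key -> HF key, as in the diff above (subset)
config_mapping = {
    "dim": "hidden_size",
    "n_layers": "num_hidden_layers",
    "n_heads": "num_attention_heads",
    "n_kv_heads": "num_key_value_heads",
}
for key, new_key in config_mapping.items():
    if key in config:
        config[new_key] = config[key]  # original keys stay, HF aliases are added

# the MoE block is flattened onto the top level under DeepseekV2-style names (subset)
moe_config_map = {
    "num_experts": "n_routed_experts",
    "num_experts_per_tok": "num_experts_per_tok",
}
for key, new_key in moe_config_map.items():
    if key in config["moe"]:
        config[new_key] = config["moe"][key]

print(config["hidden_size"], config["n_routed_experts"])  # 6144 128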
@@ -10501,6 +10593,8 @@ def main() -> None:
        elif args.mmproj:
            assert hparams.get("vision_encoder") is not None, "This model does not support multimodal"
            model_class = PixtralModel
+        elif "moe" in hparams:
+            model_class = MistralMoeModel
        else:
            model_class = MistralModel
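Note (illustrative, not part of the diff): with this change, Mistral-format conversion selects the model class from the loaded hparams: multimodal projector conversion uses PixtralModel, checkpoints with a `moe` section use the new MistralMoeModel, and everything else falls back to MistralModel. A reduced sketch of that selection; `pick_mistral_model_class` and its string return values are hypothetical, not the converter's API.

def pick_mistral_model_class(hparams: dict, mmproj: bool) -> str:
    if mmproj:
        assert hparams.get("vision_encoder") is not None, "This model does not support multimodal"
        return "PixtralModel"
    if "moe" in hparams:
        return "MistralMoeModel"  # new: MoE checkpoints take the DeepseekV2-based path
    return "MistralModel"


print(pick_mistral_model_class({"moe": {"num_experts": 128}}, mmproj=False))  # MistralMoeModel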
