Commit 42ffcba

Merge branch 'ggml-org:master' into master
2 parents: d149f04 + 8ad7b3e

96 files changed: +60060, -31819 lines changed


.devops/musa.Dockerfile

Lines changed: 3 additions & 3 deletions
@@ -1,10 +1,10 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc4.0.1
+ARG MUSA_VERSION=rc4.2.0
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
 
-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
 
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
 
.devops/rocm.Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,8 @@
 ARG UBUNTU_VERSION=24.04
 
 # This needs to generally match the container host's environment.
-ARG ROCM_VERSION=6.3
-ARG AMDGPU_VERSION=6.3
+ARG ROCM_VERSION=6.4
+ARG AMDGPU_VERSION=6.4
 
 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -515,7 +515,7 @@ jobs:
 
   ubuntu-22-cmake-musa:
     runs-on: ubuntu-22.04
-    container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
+    container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
 
     steps:
       - name: Clone

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -82,6 +82,7 @@ models/*
 models-mnt
 !models/.editorconfig
 !models/ggml-vocab-*.gguf*
+!models/templates
 
 # Zig
 zig-out/

ci/README.md

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ docker run --privileged -it \
     -v $HOME/llama.cpp/ci-cache:/ci-cache \
     -v $HOME/llama.cpp/ci-results:/ci-results \
     -v $PWD:/ws -w /ws \
-    mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
+    mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
 ```
 
 Inside the container, execute the following commands:

convert_hf_to_gguf.py

Lines changed: 165 additions & 9 deletions
@@ -1900,6 +1900,7 @@ def prepare_tensors(self):
     "MixtralForCausalLM",
     "VLlama3ForCausalLM",
     "LlavaForConditionalGeneration",
+    "VoxtralForConditionalGeneration",
     "LlamaModel")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
@@ -1912,6 +1913,11 @@ def __init__(self, *args, **kwargs):
         self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
 
     def set_vocab(self):
+        path_tekken_json = self.dir_model / "tekken.json"
+        path_tokenizer_json = self.dir_model / "tokenizer.json"
+        if path_tekken_json.is_file() and not path_tokenizer_json.is_file():
+            return self.set_vocab_tekken()
+
         try:
             self._set_vocab_sentencepiece()
         except FileNotFoundError:
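For readers following the vocab changes: the new check routes a model to the tekken path only when `tekken.json` is present and `tokenizer.json` is not. A minimal standalone sketch of that routing rule (the model directory name below is hypothetical):

```python
from pathlib import Path

# Hypothetical model directory, for illustration only.
dir_model = Path("Devstral-Small-2507")

# Mirror of the check added to LlamaModel.set_vocab(): use the tekken tokenizer
# only when tekken.json exists and no Hugging Face tokenizer.json is present.
use_tekken = (dir_model / "tekken.json").is_file() and not (dir_model / "tokenizer.json").is_file()
print("tekken vocab" if use_tekken else "sentencepiece/BPE fallback")
```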
@@ -1944,6 +1950,52 @@ def set_vocab(self):
         if self.hparams.get("vocab_size", 32000) == 49152:
             self.gguf_writer.add_add_bos_token(False)
 
+    def set_vocab_tekken(self):
+        vocab = gguf.vocab.MistralVocab(self.dir_model)
+        self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
+
+        tokens = []
+        scores = []
+        toktypes = []
+
+        for text, score, toktype in vocab.all_tokens():
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        assert len(tokens) == vocab.vocab_size, (
+            f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
+        )
+
+        if vocab.tokenizer_type == gguf.vocab.MistralTokenizerType.tekken:
+            self.gguf_writer.add_tokenizer_pre("tekken")
+            self.gguf_writer.add_token_merges(
+                vocab.extract_vocab_merges_from_model()
+            )
+
+        logger.info(
+            f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
+        )
+
+        self.gguf_writer.add_bos_token_id(vocab.bos_id)
+        self.gguf_writer.add_eos_token_id(vocab.eos_id)
+        self.gguf_writer.add_unk_token_id(vocab.unk_id)
+        self.gguf_writer.add_pad_token_id(vocab.pad_id)
+
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_vocab_size(vocab.vocab_size)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(False)
+
+        script_dir = Path(__file__).parent
+        template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja"
+        with open(template_path, "r", encoding="utf-8") as f:
+            template = f.read()
+        self.gguf_writer.add_chat_template(template)
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
@@ -1971,12 +2023,13 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
-        is_vision_tensor = "vision_tower" in name \
+        is_multimodal_tensor = "vision_tower" in name \
             or "vision_model" in name \
+            or "audio_tower" in name \
             or "model.connector" in name \
             or "multi_modal_projector" in name
 
-        if is_vision_tensor:
+        if is_multimodal_tensor:
             return [] # skip vision tensors
         elif self.hf_arch == "LlamaModel":
             name = "model." + name
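To illustrate what the renamed predicate now covers (the tensor names below are hypothetical examples, not taken from a specific checkpoint):

```python
def is_multimodal_tensor(name: str) -> bool:
    # Same condition as in LlamaModel.modify_tensors() above, extracted for illustration.
    return ("vision_tower" in name
            or "vision_model" in name
            or "audio_tower" in name
            or "model.connector" in name
            or "multi_modal_projector" in name)

# Hypothetical tensor names:
print(is_multimodal_tensor("audio_tower.layers.0.self_attn.k_proj.weight"))  # True  -> skipped by the text converter
print(is_multimodal_tensor("model.layers.0.self_attn.q_proj.weight"))        # False -> converted as a text tensor
```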
@@ -3791,7 +3844,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
         self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
-        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1000000.0))
+        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))
 
         # Mamba parameters
         self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
@@ -3802,7 +3855,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_ssm_group_count(0)
 
         # MLP feed forward parameters (for attention layers)
-        self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 16384))
+        self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 13312))
         self.gguf_writer.add_file_type(self.ftype)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
@@ -6486,7 +6539,7 @@ def prepare_tensors(self):
         self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
 
 
-@ModelBase.register("Glm4ForCausalLM")
+@ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration")
 class Glm4Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GLM4
 
@@ -6508,14 +6561,22 @@ def set_vocab(self):
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        rope_dim = self.hparams["head_dim"]
+        if (rope_dim := self.hparams.get("head_dim")) is None:
+            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
         rope_scaling = self.hparams.get("rope_scaling") or {}
         if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
             self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
             self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
 
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("model.visual."): # ignore visual part of Glm4v
+            return []
+        elif name.startswith("model.language_model."):
+            name = name.replace("language_model.", "") # for Glm4v
+        return super().modify_tensors(data_torch, name, bid)
+
 
 @ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
 class ChatGLMModel(TextModel):
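A worked sketch of the new head-dim fallback, using hypothetical hyperparameter values (not from any real Glm4v config):

```python
# Hypothetical hparams for a config that omits "head_dim".
hparams = {"hidden_size": 4096, "num_attention_heads": 32, "partial_rotary_factor": 0.5}

if (rope_dim := hparams.get("head_dim")) is None:
    rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]  # 4096 // 32 = 128

rope_dimension_count = int(rope_dim * hparams.get("partial_rotary_factor", 0.5))  # int(128 * 0.5) = 64
print(rope_dimension_count)  # 64
```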
@@ -7223,9 +7284,10 @@ class WhisperEncoderModel(MmprojModel):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.hparams["hidden_size"] = self.hparams["d_model"]
-        self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
-        self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
+        if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:
+            self.hparams["hidden_size"] = self.hparams["d_model"]
+            self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
+            self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -7264,9 +7326,21 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
         self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
 
 
+@ModelBase.register("VoxtralForConditionalGeneration")
+class VoxtralWhisperEncoderModel(WhisperEncoderModel):
+    has_vision_encoder = False # no vision encoder
+    has_audio_encoder = True
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.VOXTRAL)
+        self.gguf_writer.add_audio_stack_factor(4) # == intermediate_size // hidden_size
+
+
 @ModelBase.register("FalconH1ForCausalLM")
 class FalconH1Model(Mamba2Model):
     model_arch = gguf.MODEL_ARCH.FALCON_H1
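The hard-coded stack factor of 4 is annotated as `intermediate_size // hidden_size`; a small sketch with hypothetical Whisper-style encoder dimensions shows how that ratio comes out (actual Voxtral configs may differ):

```python
# Hypothetical Whisper-style encoder dimensions, for illustration only.
hidden_size = 1280        # d_model
intermediate_size = 5120  # encoder_ffn_dim

stack_factor = intermediate_size // hidden_size
print(stack_factor)  # 4 -- matches the value passed to add_audio_stack_factor(4) above
```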
@@ -7581,6 +7655,88 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@ModelBase.register("SmallThinkerForCausalLM")
+class SmallThinkerModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.SMALLTHINKER
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (n_experts := self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+        if (n_experts_used := self.hparams.get("num_experts_per_tok", self.hparams.get("moe_num_active_primary_experts"))) is not None:
+            self.gguf_writer.add_expert_used_count(n_experts_used)
+        if (moe_intermediate_size := self.hparams.get("moe_ffn_hidden_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+            self.gguf_writer.add_feed_forward_length(moe_intermediate_size)
+            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
+        if (self.hparams.get('moe_primary_router_apply_softmax')):
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+        else:
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        # YaRN is not enabled by default
+        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+        sliding_window_layout = self.hparams.get("sliding_window_layout")
+        if sliding_window_layout:
+            for i in sliding_window_layout:
+                if i != 0:
+                    sliding_window = self.hparams.get("sliding_window_size")
+                    if sliding_window:
+                        self.gguf_writer.add_sliding_window(sliding_window)
+                    break
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("experts") != -1:
+            n_experts = self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down", "gate", "up"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
 ###### CONVERSION LOGIC ######
 
 
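To make the expert-merging step concrete, here is a minimal standalone sketch of the stacking that SmallThinkerModel.modify_tensors() performs once all experts of a layer have arrived; the expert count and matrix shapes below are hypothetical:

```python
import torch

# Hypothetical sizes, for illustration only.
n_experts, n_ff, n_embd = 4, 768, 256

# One 2D weight per expert, as they arrive from the HF checkpoint
# (e.g. model.layers.0.block_sparse_moe.experts.<xid>.up.weight).
per_expert = [torch.randn(n_ff, n_embd) for _ in range(n_experts)]

# Same call as in modify_tensors(): stack along a new leading expert axis,
# producing one 3D tensor per (layer, projection) pair.
merged = torch.stack(per_expert, dim=0)
print(merged.shape)  # torch.Size([4, 768, 256])
```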
