Commit 1749cf1

Merge pull request #191 from menloresearch/update-dev-from-master-2025-08-02-09-09
Sync master with upstream release b6062
2 parents 454b987 + f906275 commit 1749cf1

File tree

22 files changed: +1033 -207 lines changed

convert_hf_to_gguf.py

Lines changed: 95 additions & 8 deletions
@@ -684,6 +684,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
             # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
             res = "hunyuan"
+        if chkhsh == "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6":
+            # ref: https://huggingface.co/tencent/Hunyuan-4B-Instruct
+            res = "hunyuan-dense"
         if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
             # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
             res = "falcon-h1"
@@ -7553,11 +7556,6 @@ def set_gguf_parameters(self):
 class HunYuanMoEModel(TextModel):
     model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        # For handling tied embeddings
-        self._tok_embd = None
-
     def set_vocab(self):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
@@ -7651,9 +7649,6 @@ def set_gguf_parameters(self):
     _experts: list[dict[str, Tensor]] | None = None
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if name == "model.embed_tokens.weight":
-            self._tok_embd = data_torch.clone()
-
         if name == "lm_head.weight":
             if self.hparams.get("tie_word_embeddings", False):
                 logger.info("Skipping tied output layer 'lm_head.weight'")
@@ -7698,6 +7693,98 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("HunYuanDenseV1ForCausalLM")
+class HunYuanModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
+
+    def set_vocab(self):
+        if (self.dir_model / "tokenizer.json").is_file():
+            self._set_vocab_gpt2()
+        else:
+            from transformers import AutoTokenizer
+            tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+
+            # 1. Get the pre-tokenizer identifier hash
+            tokpre = self.get_vocab_base_pre(tokenizer)
+
+            # 2. Reverse-engineer the merges list from mergeable_ranks
+            merges = []
+            vocab = {}
+            mergeable_ranks = tokenizer.mergeable_ranks
+            for token, rank in mergeable_ranks.items():
+                vocab[QwenModel.token_bytes_to_string(token)] = rank
+                if len(token) == 1:
+                    continue
+                merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+                if len(merged) == 2:
+                    merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+            # 3. Generate the tokens and toktypes lists
+            vocab_size = self.hparams["vocab_size"]
+            assert tokenizer.vocab_size == vocab_size
+            special_tokens = tokenizer.special_tokens
+            reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
+            tokens: list[str] = []
+            toktypes: list[int] = []
+            for i in range(vocab_size):
+                if i not in reverse_vocab:
+                    tokens.append(f"[PAD{i}]")
+                    toktypes.append(gguf.TokenType.UNUSED)
+                else:
+                    token = reverse_vocab[i]
+                    tokens.append(token)
+                    if i in special_tokens.values():
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.NORMAL)
+
+            # 4. Write all vocab-related fields to the GGUF writer
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+            self.gguf_writer.add_token_merges(merges)
+
+            # 5. Add special tokens and chat templates
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+            special_vocab.add_to_gguf(self.gguf_writer)
+        # FIX for BOS token: Overwrite incorrect id read from config.json
+        if self.hparams['hidden_size'] == 4096:
+            self.gguf_writer.add_bos_token_id(127958) # only for 7b dense, fix <|bos|> token
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+
+        # Rope
+        rope_scaling = hparams.get("rope_scaling", {})
+        if rope_scaling.get("type") == "dynamic":
+            # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
+            # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
+            alpha = rope_scaling.get("alpha", 50)
+            base = hparams.get("rope_theta", 10000.0)
+            dim = hparams["head_dim"]
+            scaled_base = base * (alpha ** (dim / (dim - 2)))
+            self.gguf_writer.add_rope_freq_base(scaled_base)
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+            self.gguf_writer.add_rope_scaling_factor(1)
+            # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
+            self.gguf_writer.add_context_length(256 * 1024) # 256k context length
+
+            # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
+            assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024], \
+                "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name == "lm_head.weight":
+            if self.hparams.get("tie_word_embeddings", False):
+                logger.info("Skipping tied output layer 'lm_head.weight'")
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @ModelBase.register("SmolLM3ForCausalLM")
 class SmolLM3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.SMOLLM3
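
Note: the dynamic-RoPE branch in HunYuanModel.set_gguf_parameters folds the NTK-aware alpha into a plain frequency base, so the GGUF carries no runtime rope scaling (scaling type NONE, factor 1). A quick worked example of scaled_base = base * alpha ** (dim / (dim - 2)), using illustrative values (base 10000, alpha 1000, head_dim 128 are assumptions for the example, not values read from any particular config.json):

# Worked example of the NTK-aware alpha -> rope_freq_base conversion used above.
# The concrete numbers are illustrative assumptions, not values from a shipped config.
base  = 10000.0   # rope_theta
alpha = 1000.0    # rope_scaling["alpha"]; the code comments tie 1000 to ~256k usable context
dim   = 128       # head_dim

scaled_base = base * (alpha ** (dim / (dim - 2)))
print(f"{scaled_base:,.0f}")   # ~11,160,000, written via add_rope_freq_base() with scaling type NONE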

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -140,6 +140,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
     {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
     {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
+    {"name": "hunyuan-dense", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-4B-Instruct", "chkhsh": "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6"},
     # falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
     {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
     {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},

ggml/src/ggml-cuda/mmq.cuh

Lines changed: 7 additions & 11 deletions
@@ -251,25 +251,21 @@ static constexpr __device__ int mmq_get_granularity_device(const int /*mmq_x*/)
 #endif // AMD_MFMA_AVAILABLE
 
 #if defined(GGML_USE_HIP)
-static int mmq_get_nwarps_host(const int cc) {
-    return amd_mfma_available(cc) ? 8 : 4;
+static int mmq_get_nwarps_host(const int cc, const int warp_size) {
+    return amd_mfma_available(cc) ? 8 : 256/warp_size;
 }
 #else
-static int mmq_get_nwarps_host(const int /*cc*/) {
-    return 8;
+static int mmq_get_nwarps_host(const int /*cc*/, const int warp_size) {
+    return 256/warp_size;
 }
 #endif // (GGML_USE_HIP)
 
 static constexpr __device__ int mmq_get_nwarps_device() {
-#if defined(GGML_USE_HIP)
 #if defined(AMD_MFMA_AVAILABLE)
     return 8;
 #else
-    return 4;
+    return 256/ggml_cuda_get_physical_warp_size();
 #endif // AMD_MFMA_AVAILABLE
-#else
-    return 8;
-#endif // defined(GGML_USE_HIP)
 }
 
 // ------------------------------------------------------------
@@ -3472,7 +3468,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
     const int cc        = ggml_cuda_info().devices[id].cc;
     const int nsm       = ggml_cuda_info().devices[id].nsm;
     const int warp_size = ggml_cuda_info().devices[id].warp_size;
-    const int nwarps    = mmq_get_nwarps_host(cc);
+    const int nwarps    = mmq_get_nwarps_host(cc, warp_size);
     const int mmq_y     = get_mmq_y_host(cc);
 
     const dim3 block_dims(warp_size, nwarps, 1);
@@ -3559,7 +3555,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
     const int cc           = ggml_cuda_info().devices[id].cc;
     const size_t smpbo     = ggml_cuda_info().devices[id].smpbo;
     const int warp_size    = ggml_cuda_info().devices[id].warp_size;
-    const int nwarps       = mmq_get_nwarps_host(cc);
+    const int nwarps       = mmq_get_nwarps_host(cc, warp_size);
 
     const int mmq_x_max = get_mmq_x_max_host(cc);
     const int mmq_y     = get_mmq_y_host(cc);
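
Note: the nwarps helpers above now size the launch from a fixed 256-thread block instead of hard-coding the warp count: 256/32 gives 8 warps on 32-wide warps and 256/64 gives 4 on 64-wide AMD warps, while MFMA-capable AMD devices keep the fixed 8. A small Python sketch of that host-side selection (an illustrative mirror of mmq_get_nwarps_host, not the CUDA source):

# Illustrative mirror of the reworked mmq_get_nwarps_host() logic.
def nwarps_host(warp_size: int, hip: bool = False, mfma: bool = False) -> int:
    # On HIP with MFMA the kernel keeps 8 warps; otherwise fill a 256-thread block.
    if hip and mfma:
        return 8
    return 256 // warp_size

print(nwarps_host(32))             # 8 warps  -> 32 * 8 = 256 threads per block
print(nwarps_host(64, hip=True))   # 4 warps  -> 64 * 4 = 256 threads per block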
