
Commit 46c8b70

ntk alpha generic
1 parent 5471f5a commit 46c8b70

File tree

convert_hf_to_gguf.py
src/llama-model.cpp

2 files changed: +16 -10 lines changed


convert_hf_to_gguf.py

Lines changed: 16 additions & 5 deletions
@@ -6477,11 +6477,22 @@ def set_gguf_parameters(self):
         # Rope
         rope_scaling = hparams.get("rope_scaling", {})
         if rope_scaling.get("type") == "dynamic":
-            # Not sure if YARN is correct here, and the factor in the config is only 1 anyway
-            # but the release claims to scale to 256k, which would be a factor of 8
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["max_position_embeddings"])
+            # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
+            # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
+            alpha = rope_scaling.get("alpha", 1000)
+            base = hparams.get("rope_theta", 10000.0)
+            dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128
+            scaled_base = base * (alpha ** (dim / (dim-2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251
+            self.gguf_writer.add_rope_freq_base(scaled_base)
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+            self.gguf_writer.add_rope_scaling_factor(1)
+            # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
+            self.gguf_writer.add_context_length(256 * 1024) # 256k context length
+
+            # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
+            assert alpha == 1000 and base == 10000.0 and dim == 128 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024], \
+                "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
 
     _experts: list[dict[str, Tensor]] | None = None
 
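For clarity, here is a minimal standalone sketch of the NTK-Aware Alpha computation the converter now performs. The hparams values below are assumed examples (chosen so the head dimension comes out to 128, matching the inline comment in the diff); a real conversion reads them from the model's config.json.

# Minimal sketch (not part of this commit): reproduce the NTK-Aware Alpha base
# scaling that convert_hf_to_gguf.py now bakes into the GGUF metadata.
# hidden_size and num_attention_heads are assumed example values; a real run
# takes them from the checkpoint's config.
hparams = {
    "hidden_size": 4096,           # assumed; gives dim = 4096 // 32 = 128
    "num_attention_heads": 32,     # assumed
    "rope_theta": 10000.0,
}
rope_scaling = {"type": "dynamic", "alpha": 1000}

alpha = rope_scaling.get("alpha", 1000)
base = hparams.get("rope_theta", 10000.0)
dim = hparams["hidden_size"] // hparams["num_attention_heads"]

# scaled base = base * alpha^(dim / (dim - 2))
scaled_base = base * (alpha ** (dim / (dim - 2)))
print(f"{scaled_base:.4f}")  # ~11158839.9251, the value written via add_rope_freq_base

Because the scaled base is written directly via add_rope_freq_base, the GGUF advertises RopeScalingType.NONE with a factor of 1, so consumers apply plain RoPE with the larger theta instead of YARN-style scaling at load time.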
src/llama-model.cpp

Lines changed: 0 additions & 5 deletions
@@ -1511,11 +1511,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
 
-                // TODO: read from gguf
-                float n_dim = hparams.n_embd_head_k;
-                float alpha = 1000.0f; // NTK-Aware
-                hparams.rope_freq_base_train = 10000.0f * std::powf(alpha, n_dim / (n_dim - 2.0f));
-
                 switch (hparams.n_layer) {
                     case 32: type = LLM_TYPE_A13B; break;
                     default: type = LLM_TYPE_UNKNOWN;
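With the converter now writing the scaled base into the GGUF metadata, the loader no longer needs this hard-coded NTK-Aware formula. A hypothetical spot-check of the baked-in value with gguf-py is sketched below; the GGUFReader field-access pattern, the "hunyuan-moe" key prefix, and the file path are assumptions, not part of this commit.

# Hypothetical spot-check (not part of the commit): read back the rope frequency
# base the converter wrote, which is why the hard-coded C++ value could go away.
# Assumes the gguf-py GGUFReader API and the "hunyuan-moe.rope.freq_base" key.
from gguf import GGUFReader

reader = GGUFReader("hunyuan-a13b.gguf")                 # placeholder path
field = reader.get_field("hunyuan-moe.rope.freq_base")   # key name assumed
if field is not None:
    freq_base = float(field.parts[field.data[0]][0])     # scalar field holds one value
    print(f"rope.freq_base = {freq_base:.4f}")           # expected ~11158839.9251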
