Commit d28c31a

Merge branch 'master' into add-fh1-rebased
2 parents: 58e3866 + 8f22dc0

63 files changed (+2880, -676 lines)

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -342,7 +342,7 @@ jobs:
           cd build
           export GGML_VK_VISIBLE_DEVICES=0
           # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout 3600
+          ctest -L main --verbose --timeout 4200

   ubuntu-22-cmake-hip:
     runs-on: ubuntu-22.04

convert_hf_to_gguf.py

Lines changed: 153 additions & 0 deletions
@@ -815,6 +815,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
             # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
             res = "minerva-7b"
+        if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
+            # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
+            res = "hunyuan"

         if res is None:
             logger.warning("\n")

@@ -6666,6 +6669,156 @@ def set_gguf_parameters(self):
         # Add any other Falcon Mamba2 specific configuration
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))

+
+@ModelBase.register("HunYuanMoEV1ForCausalLM")
+class HunYuanMoEModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # For handling tied embeddings
+        self._tok_embd = None
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+
+        # 1. Get the pre-tokenizer identifier hash
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        # 2. Reverse-engineer the merges list from mergeable_ranks
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[QwenModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+            if len(merged) == 2:  # todo this is an assert in Qwen, why?
+                merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+        # 3. Generate the tokens and toktypes lists
+        vocab_size = self.hparams["vocab_size"]
+        assert tokenizer.vocab_size == vocab_size
+        special_tokens = tokenizer.special_tokens
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
+        tokens: list[str] = []
+        toktypes: list[int] = []
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token = reverse_vocab[i]
+                tokens.append(token)
+                if i in special_tokens.values():
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+
+        # 4. Write all vocab-related fields to the GGUF writer
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_token_merges(merges)
+
+        # 5. Add special tokens and chat templates
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.add_to_gguf(self.gguf_writer)
+        # FIX for BOS token: Overwrite incorrect id read from config.json
+        self.gguf_writer.add_bos_token_id(127959)  # <|bos|>
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+
+        self.gguf_writer.add_expert_count(hparams["num_experts"])
+        self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"])
+
+        moe_intermediate_size = hparams["moe_intermediate_size"]
+        assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size)
+        self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0])
+
+        moe_topk = hparams["moe_topk"]
+        assert all(topk == moe_topk[0] for topk in moe_topk)
+        self.gguf_writer.add_expert_used_count(moe_topk[0])
+
+        moe_shared_expert = hparams["num_shared_expert"]
+        assert all(n == moe_shared_expert[0] for n in moe_shared_expert)
+        self.gguf_writer.add_expert_shared_count(moe_shared_expert[0])
+
+        # Rope
+        rope_scaling = hparams.get("rope_scaling", {})
+        if rope_scaling.get("type") == "dynamic":
+            # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
+            # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
+            alpha = rope_scaling.get("alpha", 1000)
+            base = hparams.get("rope_theta", 10000.0)
+            dim = (hparams["hidden_size"] // hparams["num_attention_heads"])  # 128
+            scaled_base = base * (alpha ** (dim / (dim - 2)))  # 10000 * (1000 ** (128 / 126)) = 11158839.9251
+            self.gguf_writer.add_rope_freq_base(scaled_base)
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+            self.gguf_writer.add_rope_scaling_factor(1)
+            # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024)  # 256k context length
+            self.gguf_writer.add_context_length(256 * 1024)  # 256k context length
+
+            # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
+            assert alpha == 1000 and base == 10000.0 and dim == 128 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024], \
+                "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name == "model.embed_tokens.weight":
+            self._tok_embd = data_torch.clone()
+
+        if name == "lm_head.weight":
+            if self.hparams.get("tie_word_embeddings", False):
+                logger.info("Skipping tied output layer 'lm_head.weight'")
+                return []
+
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                tensors: list[tuple[str, Tensor]] = []
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                    new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        if self._experts is not None:
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 ###### CONVERSION LOGIC ######


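Note on the RoPE handling above: rather than emitting a RoPE scaling mode, the converter folds HunYuan's NTK-aware "alpha" scaling straight into the frequency base and then declares the scaling type as NONE with factor 1. The snippet below is a minimal, standalone recap of that arithmetic using the values the assert pins down (alpha = 1000, base = 10000.0, head dim = 128); it only reproduces the 11158839.9251 figure quoted in the diff's comment and is not part of the conversion script.

    # Standalone check of the NTK-aware alpha scaling used in HunYuanMoEModel.set_gguf_parameters()
    alpha = 1000      # rope_scaling["alpha"]
    base = 10000.0    # rope_theta
    dim = 128         # hidden_size // num_attention_heads

    scaled_base = base * (alpha ** (dim / (dim - 2)))
    print(f"{scaled_base:.4f}")  # 11158839.9251 -> value written via add_rope_freq_base()
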
convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -138,6 +138,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
     {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
     {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
+    {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
 ]


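For context, each "chkhsh" in this table fingerprints a tokenizer's pre-tokenization behaviour: convert_hf_to_gguf_update.py encodes a long, fixed test string with every listed tokenizer and hashes the resulting token ids, and get_vocab_base_pre() in convert_hf_to_gguf.py recomputes the same hash at conversion time to select the matching pre-tokenizer (here, "hunyuan"). A rough sketch of that fingerprinting follows; the test string is abbreviated, so the printed hash will not match the registered value.

    # Rough sketch of how a "chkhsh" fingerprint is computed (see get_vocab_base_pre()).
    # The real chktxt is much longer; this abbreviated stand-in yields a different hash.
    from hashlib import sha256
    from transformers import AutoTokenizer

    chktxt = "\n \n\n \t Hello world 🚀 ..."  # abbreviated stand-in for the real test string

    tokenizer = AutoTokenizer.from_pretrained("tencent/Hunyuan-A13B-Instruct", trust_remote_code=True)
    chktok = tokenizer.encode(chktxt)
    chkhsh = sha256(str(chktok).encode()).hexdigest()
    print(chkhsh)  # compared against the "chkhsh" field of the entries above
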
examples/eval-callback/eval-callback.cpp

Lines changed: 5 additions & 0 deletions
@@ -136,6 +136,11 @@ static bool run(llama_context * ctx, const common_params & params) {

     std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);

+    if (tokens.empty()) {
+        LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
+        return false;
+    }
+
     if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
         LOG_ERR("%s : failed to eval\n", __func__);
         return false;

ggml/include/ggml.h

Lines changed: 29 additions & 1 deletion
@@ -495,7 +495,7 @@ extern "C" {
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
         GGML_OP_POOL_2D_BACK,
-        GGML_OP_UPSCALE, // nearest interpolate
+        GGML_OP_UPSCALE,
         GGML_OP_PAD,
         GGML_OP_PAD_REFLECT_1D,
         GGML_OP_ROLL,

@@ -557,6 +557,8 @@ extern "C" {
         GGML_GLU_OP_REGLU,
         GGML_GLU_OP_GEGLU,
         GGML_GLU_OP_SWIGLU,
+        GGML_GLU_OP_GEGLU_ERF,
+        GGML_GLU_OP_GEGLU_QUICK,

         GGML_GLU_OP_COUNT,
     };

@@ -1147,6 +1149,22 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_geglu_erf(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_erf_swapped(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_quick(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_quick_swapped(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // A: n columns, r rows,
     // B: n columns, r rows,
     GGML_API struct ggml_tensor * ggml_glu_split(

@@ -1170,6 +1188,16 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);

+    GGML_API struct ggml_tensor * ggml_geglu_erf_split(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    GGML_API struct ggml_tensor * ggml_geglu_quick_split(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // normalize along rows
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,

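The two GLU variants added above follow the same shape as the existing REGLU/GEGLU/SWIGLU ops: the input (or, for the *_split forms, the pair of tensors) provides two halves a and b, and the result is act(a) * b. For GEGLU_ERF the gate activation is the exact erf-based GELU; for GEGLU_QUICK it is the sigmoid approximation x * sigmoid(1.702 * x) used for gelu_quick. The NumPy sketch below shows only the intended math under that convention (the *_swapped variants exchange which half is gated); it is not the ggml kernel.

    # Reference-only sketch of the math behind GGML_GLU_OP_GEGLU_ERF / GEGLU_QUICK.
    # Assumes the usual GLU convention: split the last dimension into (a, b), return act(a) * b.
    import numpy as np
    from scipy.special import erf

    def geglu_erf(x: np.ndarray) -> np.ndarray:
        a, b = np.split(x, 2, axis=-1)
        return 0.5 * a * (1.0 + erf(a / np.sqrt(2.0))) * b   # exact GELU gate

    def geglu_quick(x: np.ndarray) -> np.ndarray:
        a, b = np.split(x, 2, axis=-1)
        return a * (1.0 / (1.0 + np.exp(-1.702 * a))) * b    # gelu_quick-style gate

    x = np.random.randn(4, 8).astype(np.float32)
    print(geglu_erf(x).shape, geglu_quick(x).shape)  # (4, 4) (4, 4)
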
ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 3 additions & 1 deletion
@@ -67,6 +67,7 @@
 #include <aclnnop/aclnn_pow.h>
 #include <aclnnop/aclnn_grouped_matmul_v3.h>
 #include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
+#include <aclnnop/aclnn_zero.h>
 #include <float.h>

 #include <cmath>

@@ -804,10 +805,11 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
         nb[i] = nb[i - 1] * ne[i - 1];
     }

-    ggml_cann_async_memset(ctx, buffer, n_bytes, 0);
     aclTensor* zero =
         ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
+    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, zero);
     return zero;
+    GGML_UNUSED(n_bytes);
 }

 /**

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 2 additions & 0 deletions
@@ -2172,6 +2172,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 case GGML_GLU_OP_REGLU:
                 case GGML_GLU_OP_GEGLU:
                 case GGML_GLU_OP_SWIGLU:
+                case GGML_GLU_OP_GEGLU_ERF:
+                case GGML_GLU_OP_GEGLU_QUICK:
                     {
                         n_tasks = n_threads;
                     } break;
