 #include <io.h>
 #endif

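+// in C++20 a u8"..." literal is const char8_t*, so cast it back to const char* to keep it usable with the existing char-based string handling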
+#if __cplusplus >= 202000L
+#define LU8(x) (const char*)(u8##x)
+#else
+#define LU8(x) u8##x
+#endif
+
 #include <algorithm>
 #include <array>
 #include <cassert>
@@ -4437,16 +4443,6 @@ static void llm_load_hparams(

     // non-transformer models do not have attention heads
     if (hparams.n_head() > 0) {
-        // sanity check for n_rot (optional)
-        hparams.n_rot = hparams.n_embd / hparams.n_head();
-
-        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
-
-        if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
-            if (hparams.n_rot != hparams.n_embd / hparams.n_head()) {
-                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head()));
-            }
-        }
         // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
         // gpt-j n_rot = rotary_dim

@@ -4455,6 +4451,17 @@ static void llm_load_hparams(

         hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
         ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
+
+        // sanity check for n_rot (optional)
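+        // n_embd_head_k may differ from n_embd/n_head (e.g. when attention.key_length is set), so use it as the default for n_rot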
+        hparams.n_rot = hparams.n_embd_head_k;
+
+        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
+
+        if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+            if (hparams.n_rot != hparams.n_embd_head_k) {
+                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
+            }
+        }
     } else {
         hparams.n_rot = 0;
         hparams.n_embd_head_k = 0;
@@ -5232,6 +5239,9 @@ static void llm_load_vocab(
         } else if (
                 tokenizer_pre == "jais") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
+        } else if (
+                tokenizer_pre == "tekken") { // K-KINGUUU?!
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -5782,6 +5792,13 @@ static bool llm_load_tensors(
     const int64_t n_ff     = hparams.n_ff();
     const int64_t n_expert = hparams.n_expert;

+    const int64_t n_head        = hparams.n_head();
+    const int64_t n_head_kv     = hparams.n_head_kv();
+    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_embd_head_v = hparams.n_embd_head_v;
+    const int64_t n_expert_used = hparams.n_expert_used;
+    const int64_t n_ctx_train   = hparams.n_ctx_train;
+
     if (n_expert > 0 && hparams.n_expert_used == 0) {
         throw std::runtime_error("model has expert layers but no expert layers are used");
     }
@@ -5820,10 +5837,15 @@ static bool llm_load_tensors(

             layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

-            layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-            layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-            layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-            layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
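+            // use the per-head dimensions here: n_embd_head_k * n_head can differ from n_embd for models that set an explicit head_dim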
+            layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head});
+            layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
+            layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
+            layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+            // new vs old
+            // layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
+            // layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
+            // layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
+            // layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

             // optional bias tensors
             layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
@@ -12922,6 +12944,8 @@ struct llm_build_context {
                     LLM_NORM_RMS, cb, -1);
             cb(cur, "result_norm", -1);
         } else {
+            GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
+
             struct ggml_tensor * embd_enc       = llm_build_inp_embd_enc();
             struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);

@@ -15133,6 +15157,13 @@ struct llm_tokenizer_bpe {
1513315157 "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
1513415158 };
1513515159 break;
15160+ case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
15161+ // original regex from tokenizer.json
15162+ // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                regex_exprs = {
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
             case LLAMA_VOCAB_PRE_TYPE_QWEN2:
                 regex_exprs = {