
Commit cd97310

Nemo support, Vulkan shader changes, generalized links in config

* {{{filename}}} for text files
* {{json string key}} for character profiles

1 parent 59f1996 commit cd97310
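Note: per the message, config fields can now link out to external content — {{{filename}}} splices in the contents of a text file, while {{json string key}} pulls a string out of a character-profile JSON. The config code itself is not among the diffs rendered on this page, so the following is only a minimal sketch of how such placeholder expansion could work; the function name, the flat key→value map standing in for the parsed profile, and the example strings are illustrative, not from this repo.

    #include <fstream>
    #include <map>
    #include <sstream>
    #include <string>

    // Expand {{{file}}} from disk, then {{key}} from a profile map.
    static std::string expand_links(std::string s, const std::map<std::string, std::string> & profile) {
        auto replace_between = [&s](const std::string & open, const std::string & close, auto lookup) {
            size_t pos = 0;
            while ((pos = s.find(open, pos)) != std::string::npos) {
                size_t end = s.find(close, pos + open.size());
                if (end == std::string::npos) break;
                const std::string key = s.substr(pos + open.size(), end - (pos + open.size()));
                const std::string val = lookup(key);
                s.replace(pos, end + close.size() - pos, val);
                pos += val.size(); // skip past the substitution; no re-expansion
            }
        };
        // triple braces first, so the {{...}} pass cannot match inside {{{...}}}
        replace_between("{{{", "}}}", [](const std::string & fname) {
            std::ifstream in(fname);
            std::stringstream ss;
            ss << in.rdbuf();
            return ss.str();
        });
        replace_between("{{", "}}", [&profile](const std::string & key) {
            auto it = profile.find(key);
            return it != profile.end() ? it->second : std::string();
        });
        return s;
    }

For instance, expand_links("You are {{name}}. {{{lore.txt}}}", profile) would inline lore.txt from disk and look up "name" in the profile.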

File tree

9 files changed: +145797 −144803 lines changed

Makefile

Lines changed: 8 additions & 40 deletions
@@ -430,29 +430,9 @@ $(TMP)old_cl_grammar-parser.o: GGML/grammar-parser.cpp GGML/grammar-parser.h
 
 #VULKAN
 
-PYTHON_CMD = python
-GLSLC_CMD = glslc
-_llama_vk_genshaders_cmd = $(PYTHON_CMD) ggml/ggml_vk_generate_shaders.py
-_llama_vk_header = ggml/src/ggml-vulkan-shaders.hpp
-_llama_vk_source = ggml/src/ggml-vulkan-shaders.cpp
-_llama_vk_input_dir = ggml/src/vulkan-shaders
-_llama_vk_shader_deps = $(echo $(_llama_vk_input_dir)/*.comp)
-
-ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_llama_vk_header) $(_llama_vk_source)
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-$(_llama_vk_header): $(_llama_vk_source)
-
-$(_llama_vk_source): $(_llama_vk_shader_deps)
-	$(_llama_vk_genshaders_cmd) \
-		--glslc $(GLSLC_CMD) \
-		--input-dir $(_llama_vk_input_dir) \
-		--target-hpp $(_llama_vk_header) \
-		--target-cpp $(_llama_vk_source)
-
 #CXXFLAGS_VK += -I$(VULKAN_DIR)/include
 
-OBJS_VK = $(TMP)vk_ggml.o $(TMP)vk_ggml-alloc.o $(TMP)vk_ggml-backend.o $(TMP)vk_llama.o $(TMP)vk_llama-addon.o $(TMP)vk_sampling.o $(TMP)vk_common.o $(TMP)vk_ggml-quants.o $(TMP)vk_grammar-parser.o $(TMP)vk_ggml-vulkan.o $(TMP)vk_unicode.o $(TMP)vk_unicode-data.o $(TMP)vk_sgemm.o
+OBJS_VK = $(TMP)vk_ggml.o $(TMP)vk_ggml-alloc.o $(TMP)vk_ggml-backend.o $(TMP)vk_llama.o $(TMP)vk_llama-addon.o $(TMP)vk_sampling.o $(TMP)vk_common.o $(TMP)vk_ggml-quants.o $(TMP)vk_grammar-parser.o $(TMP)vk_ggml-vulkan.o $(TMP)vk_ggml-vulkan-shaders.o $(TMP)vk_unicode.o $(TMP)vk_unicode-data.o $(TMP)vk_sgemm.o
 
 $(TMP)vk_ggml.o: base/ggml.c base/ggml.h
 	$(CC) $(CFLAGS_VK) -c $< -o $@
@@ -482,7 +462,7 @@ $(TMP)vk_common.o: base/common.cpp $(VK_COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS_VK) -c $< -o $@
 
 $(TMP)vk_llama-addon.o: base/llama-addon.cpp $(COMMON_H_DEPS)
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+	$(CXX) $(CXXFLAGS_VK) -c $< -o $@
 
 $(TMP)vk_sampling.o: base/sampling.cpp $(VK_COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS_VK) -c $< -o $@
@@ -494,26 +474,14 @@ $(TMP)vk_sgemm.o: base/sgemm.cpp base/sgemm.h base/ggml.h
 $(TMP)vk_grammar-parser.o: base/grammar-parser.cpp base/grammar-parser.h
 	$(CXX) $(CXXFLAGS_VK) -c $< -o $@
 
-PYTHON_CMD = python
-GLSLC_CMD = glslc
-_llama_vk_genshaders_cmd = $(PYTHON_CMD) base/ggml_vk_generate_shaders.py
-_llama_vk_header = base/ggml-vulkan-shaders.hpp
-_llama_vk_source = base/ggml-vulkan-shaders.cpp
-_llama_vk_input_dir = base/vulkan-shaders
-_llama_vk_shader_deps = $(echo $(_llama_vk_input_dir)/*.comp)
-
+vulkan-shaders-gen: base/vulkan-shaders-gen.cpp
+	$(CXX) $(CXXFLAGS_VK) -o $@ $(LDFLAGS_VK) base/vulkan-shaders-gen.cpp
 
-$(TMP)vk_ggml-vulkan.o: base/ggml-vulkan.cpp base/ggml-vulkan.h
+$(TMP)vk_ggml-vulkan-shaders.o: base/ggml-vulkan-shaders.cpp base/ggml-vulkan-shaders.hpp
+	$(CXX) $(CXXFLAGS_VK) $(LDFLAGS_VK) -c $< -o $@
+
+$(TMP)vk_ggml-vulkan.o: base/ggml-vulkan.cpp base/ggml-vulkan.h base/ggml-vulkan-shaders.hpp base/ggml-vulkan-shaders.cpp
 	$(CXX) $(CXXFLAGS_VK) $(LDFLAGS_VK) -c $< -o $@
-
-$(_llama_vk_header): $(_llama_vk_source)
-
-$(_llama_vk_source): $(_llama_vk_shader_deps)
-	$(_llama_vk_genshaders_cmd) \
-		--glslc $(GLSLC_CMD) \
-		--input-dir $(_llama_vk_input_dir) \
-		--target-hpp $(_llama_vk_header) \
-		--target-cpp $(_llama_vk_source)
 
 
 #####################################
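Note: the Vulkan shader pipeline changes in two ways here: the Python generator (ggml_vk_generate_shaders.py) is replaced by a small C++ tool built from base/vulkan-shaders-gen.cpp, and the generated ggml-vulkan-shaders.{hpp,cpp} pair is now checked in (see the +144758-line file below) and compiled into its own object, $(TMP)vk_ggml-vulkan-shaders.o, rather than regenerated on every build. The sketch below shows the general shape of such a generator — compile each .comp file with glslc, then embed the SPIR-V as byte arrays — under the assumption that it mirrors the old script's job; the file names and output layout are illustrative, not read from the actual tool.

    #include <cstdlib>
    #include <filesystem>
    #include <fstream>
    #include <iterator>
    #include <string>
    #include <vector>

    namespace fs = std::filesystem;

    int main() {
        std::ofstream hpp("ggml-vulkan-shaders.hpp");
        std::ofstream cpp("ggml-vulkan-shaders.cpp");
        hpp << "#pragma once\n#include <cstdint>\n";
        cpp << "#include \"ggml-vulkan-shaders.hpp\"\n";
        for (const auto & entry : fs::directory_iterator("vulkan-shaders")) {
            if (entry.path().extension() != ".comp") continue;
            // assumes each shader's stem is a valid C identifier (no '-' etc.)
            const std::string name = entry.path().stem().string();
            const std::string spv  = name + ".spv";
            // compile the GLSL compute shader to SPIR-V with glslc
            const std::string cmd = "glslc -fshader-stage=compute " + entry.path().string() + " -o " + spv;
            if (std::system(cmd.c_str()) != 0) return 1;
            // embed the SPIR-V blob as a byte array
            std::ifstream in(spv, std::ios::binary);
            const std::vector<unsigned char> bytes(
                (std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());
            hpp << "extern const uint64_t "      << name << "_len;\n"
                << "extern const unsigned char " << name << "_data[];\n";
            cpp << "const uint64_t " << name << "_len = " << bytes.size() << ";\n";
            cpp << "const unsigned char " << name << "_data[] = {";
            for (unsigned char b : bytes) cpp << (unsigned) b << ",";
            cpp << "};\n";
        }
        return 0;
    }

Committing the generated pair trades repo size for a simpler build: plain compiles with no Python or glslc needed at build time.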

base/ggml-vulkan-shaders.cpp

Lines changed: 144758 additions & 0 deletions
Large diffs are not rendered by default.

base/ggml-vulkan-shaders.hpp

Lines changed: 384 additions & 144742 deletions
Large diffs are not rendered by default.

base/llama.cpp

Lines changed: 45 additions & 14 deletions
@@ -59,6 +59,12 @@
 #include <io.h>
 #endif
 
+#if __cplusplus >= 202000L
+#define LU8(x) (const char*)(u8##x)
+#else
+#define LU8(x) u8##x
+#endif
+
 #include <algorithm>
 #include <array>
 #include <cassert>
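Note: the LU8 macro papers over a C++20 change: u8"..." literals switched from const char[] to const char8_t[], which no longer converts to const char*. A minimal standalone illustration (the printf usage is mine, not from the commit):

    #include <cstdio>

    // same macro as in the diff above
    #if __cplusplus >= 202000L
    #define LU8(x) (const char*)(u8##x)
    #else
    #define LU8(x) u8##x
    #endif

    int main() {
        // under C++20 the cast restores const char*, so UTF-8 literals keep
        // working with printf-style APIs; pre-C++20 the macro is a no-op
        std::printf("%s\n", LU8("naïve déjà vu"));
        return 0;
    }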
@@ -4437,16 +4443,6 @@ static void llm_load_hparams(
 
     // non-transformer models do not have attention heads
     if (hparams.n_head() > 0) {
-        // sanity check for n_rot (optional)
-        hparams.n_rot = hparams.n_embd / hparams.n_head();
-
-        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
-
-        if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
-            if (hparams.n_rot != hparams.n_embd / hparams.n_head()) {
-                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head()));
-            }
-        }
         // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
         // gpt-j n_rot = rotary_dim
 
@@ -4455,6 +4451,17 @@ static void llm_load_hparams(
 
         hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
         ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
+
+        // sanity check for n_rot (optional)
+        hparams.n_rot = hparams.n_embd_head_k;
+
+        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
+
+        if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+            if (hparams.n_rot != hparams.n_embd_head_k) {
+                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
+            }
+        }
     } else {
         hparams.n_rot = 0;
         hparams.n_embd_head_k = 0;
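Note: this relocation, and the switch from n_embd / n_head() to n_embd_head_k, is what makes Mistral Nemo loadable: Nemo's published config (treat these numbers as assumptions, they are not in this diff) has n_embd = 5120 and n_head = 32 but head_dim = 128, so the old derivation would insist on n_rot = 160 and the sanity check would throw. A tiny standalone check:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // assumed Nemo-style hyperparameters (see note above)
        const uint32_t n_embd = 5120, n_head = 32, n_embd_head_k = 128;
        std::printf("old n_rot default: %u\n", n_embd / n_head);  // 160 -> sanity check throws
        std::printf("new n_rot default: %u\n", n_embd_head_k);    // 128 -> matches head_dim
        return 0;
    }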
@@ -5232,6 +5239,9 @@ static void llm_load_vocab(
         } else if (
             tokenizer_pre == "jais") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
+        } else if (
+            tokenizer_pre == "tekken") {// K-KINGUUU?!
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -5782,6 +5792,13 @@ static bool llm_load_tensors(
     const int64_t n_ff = hparams.n_ff();
     const int64_t n_expert = hparams.n_expert;
 
+    const int64_t n_head = hparams.n_head();
+    const int64_t n_head_kv = hparams.n_head_kv();
+    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_embd_head_v = hparams.n_embd_head_v;
+    const int64_t n_expert_used = hparams.n_expert_used;
+    const int64_t n_ctx_train = hparams.n_ctx_train;
+
     if (n_expert > 0 && hparams.n_expert_used == 0) {
         throw std::runtime_error("model has expert layers but no expert layers are used");
     }
@@ -5820,10 +5837,15 @@ static bool llm_load_tensors(
 
         layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 
-        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
-        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
-        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
-        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+        // new vs old
+        // layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+        // layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+        // layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+        // layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
 
         // optional bias tensors
         layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
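Note: the projection tensors follow suit — Q and the attention output are sized from n_embd_head_k * n_head instead of being assumed square, and K/V use the head-dim-aware n_embd_k_gqa / n_embd_v_gqa. A quick shape check with the same assumed Nemo-style figures (n_head_kv = 8 is likewise from Nemo's published config, not this diff):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // assumed Nemo-style figures (see note above)
        const int64_t n_embd = 5120, n_head = 32, n_head_kv = 8, n_embd_head_k = 128;
        const int64_t n_embd_k_gqa = n_embd_head_k * n_head_kv;
        // old sizing assumed wq is square; new sizing follows head_dim
        std::printf("wq old: %lld x %lld\n", (long long) n_embd, (long long) n_embd);                   // 5120 x 5120
        std::printf("wq new: %lld x %lld\n", (long long) n_embd, (long long) (n_embd_head_k * n_head)); // 5120 x 4096
        std::printf("wk new: %lld x %lld\n", (long long) n_embd, (long long) n_embd_k_gqa);             // 5120 x 1024
        return 0;
    }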
@@ -12922,6 +12944,8 @@ struct llm_build_context {
                 LLM_NORM_RMS, cb, -1);
             cb(cur, "result_norm", -1);
         } else {
+            GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
+
             struct ggml_tensor * embd_enc = llm_build_inp_embd_enc();
             struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);
 
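Note: the new assert turns a confusing crash into a clear message when the decoder side of an encoder–decoder graph (e.g. T5) is built before any encoder output exists. A minimal sketch of the intended call order, assuming the four-argument llama_batch_get_one(tokens, n_tokens, pos_0, seq_id) signature this code base used at the time:

    #include "llama.h"
    #include <cstdio>

    // sketch: for T5-style models, run the encoder before any decoding
    static bool encode_then_decode(llama_context * ctx,
                                   llama_token * src, int32_t n_src,
                                   llama_token dec_start) {
        llama_batch enc = llama_batch_get_one(src, n_src, /*pos_0=*/0, /*seq_id=*/0);
        if (llama_encode(ctx, enc) != 0) {   // fills the encoder output (n_outputs_enc)
            std::fprintf(stderr, "llama_encode failed\n");
            return false;
        }
        llama_batch dec = llama_batch_get_one(&dec_start, 1, 0, 0);
        return llama_decode(ctx, dec) == 0;  // the assert above no longer fires
    }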
@@ -15133,6 +15157,13 @@ struct llm_tokenizer_bpe {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
+                // original regex from tokenizer.json
+                // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                regex_exprs = {
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
             case LLAMA_VOCAB_PRE_TYPE_QWEN2:
                 regex_exprs = {
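Note: the shipped pattern rewrites the tokenizer.json original quoted in the comment, presumably because the regex handling here lacks the fine-grained Unicode letter categories (\p{Lu}, \p{Lt}, \p{Ll}, ...). The building block ((?=[\p{L}])([^a-z])) means "a letter that is not ASCII lowercase" — an over-approximation of "uppercase-ish letter", since non-ASCII lowercase letters like é also match. Expressed as standalone predicates (helper names are mine, not from the code):

    #include <cwctype>

    // "((?=[\p{L}])([^a-z]))"  ~  a letter that is not ASCII lowercase
    static bool is_upperish(wchar_t c) {
        return std::iswalpha((wint_t) c) && !(c >= L'a' && c <= L'z');
    }

    // "((?=[\p{L}])([^A-Z]))"  ~  a letter that is not ASCII uppercase
    static bool is_lowerish(wchar_t c) {
        return std::iswalpha((wint_t) c) && !(c >= L'A' && c <= L'Z');
    }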

base/llama.h

Lines changed: 1 addition & 0 deletions
@@ -92,6 +92,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
         LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
         LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
+        LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
     };
 
     // note: these values should be synchronized with ggml_rope

base/sampling.cpp

Lines changed: 4 additions & 2 deletions
@@ -216,10 +216,12 @@ llama_token llama_sampling_sample(
         llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
     }
 
-    cur.clear();
+    //cur.clear();
+    cur.resize(n_vocab);
 
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        //cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
     }
 
     llama_token_data_array cur_p = { cur.data(), cur.size(), false };
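Note: the candidate buffer is now sized once per sampling step and written in place, instead of being cleared and regrown token by token. A standalone rendering of the two variants (llama_token_data reproduced from llama.h; the function names are mine):

    #include <cstdint>
    #include <vector>

    typedef int32_t llama_token;
    struct llama_token_data { llama_token id; float logit; float p; };

    // old: clear() + emplace_back() pays push-back bookkeeping on every token
    static void fill_old(std::vector<llama_token_data> & cur, const float * logits, int n_vocab) {
        cur.clear();
        for (llama_token id = 0; id < n_vocab; id++) {
            cur.emplace_back(llama_token_data{id, logits[id], 0.0f});
        }
    }

    // new: one resize(), then plain indexed stores into already-live slots;
    // across sampling steps the vector also keeps its capacity
    static void fill_new(std::vector<llama_token_data> & cur, const float * logits, int n_vocab) {
        cur.resize(n_vocab);
        for (llama_token id = 0; id < n_vocab; id++) {
            cur[id] = llama_token_data{id, logits[id], 0.0f};
        }
    }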
