Commit e1fc07a

llama : arch (cont)

ggml-ci

1 parent 7ab08d5 commit e1fc07a

7 files changed (+1349, -1325 lines)

src/llama-adapter.h

Lines changed: 1 addition & 6 deletions
@@ -68,12 +68,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     cvec.tensors.reserve(model.hparams.n_layer);
     cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < model.hparams.n_layer; il++) {
-        ggml_backend_buffer_type_t buft = select_buft(*model.dev_layer.at(il).buft_list,
-            [&](ggml_context * ctx) {
-                ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
-                ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
-                return ggml_add(ctx, cur, layer_dir);
-            });
+        ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
         ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
             LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);

src/llama-arch.cpp

Lines changed: 1247 additions & 0 deletions
Large diffs are not rendered by default.

src/llama-arch.h

Lines changed: 43 additions & 1266 deletions
Large diffs are not rendered by default.
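The llama-arch.cpp and llama-arch.h diffs are too large to render inline. Judging by the deltas (+1247 lines in the new source file, -1266 in the header) and the call sites updated below, the architecture tables appear to move out of the header and behind small accessors such as llm_arch_name. A minimal sketch of what llm_arch_name presumably looks like, assuming the LLM_ARCH_NAMES map now lives in llama-arch.cpp (this is not the actual diff):

const char * llm_arch_name(llm_arch arch) {
    // assumed: fall back to "unknown" for architectures missing from the map
    auto it = LLM_ARCH_NAMES.find(arch);
    if (it == LLM_ARCH_NAMES.end()) {
        return "unknown";
    }
    return it->second;
}

A const char * return is consistent with the %s logging call site in llama.cpp below and converts cleanly to std::string in llama-context.cpp.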

src/llama-context.cpp

Lines changed: 3 additions & 2 deletions
@@ -40,7 +40,7 @@ struct llama_data_write {
    }

    void write_model_info(const struct llama_context * ctx) {
-        std::string arch_str = LLM_ARCH_NAMES.at(ctx->model.arch);
+        const std::string arch_str = llm_arch_name(ctx->model.arch);
        write_string(arch_str);
        // TODO: add more model-specific info which should prevent loading the session file if not identical
    }

@@ -263,7 +263,8 @@ struct llama_data_read {

    // validate model information
    void read_model_info(const struct llama_context * ctx) {
-        std::string cur_arch_str = LLM_ARCH_NAMES.at(ctx->model.arch);
+        const std::string cur_arch_str = llm_arch_name(ctx->model.arch);
+
        std::string arch_str;
        read_string(arch_str);
        if (cur_arch_str != arch_str) {

src/llama-model.cpp

Lines changed: 48 additions & 0 deletions
@@ -1,5 +1,7 @@
 #include "llama-model.h"

+#include "llama-impl.h"
+
 std::string llama_model_ftype_name(llama_ftype ftype) {
     if (ftype & LLAMA_FTYPE_GUESSED) {
         return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";

@@ -42,3 +44,49 @@ std::string llama_model_ftype_name(llama_ftype ftype) {
         default: return "unknown, may not work";
     }
 }
+
+template<typename F>
+static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
+    ggml_init_params params = {
+        /*.mem_size   =*/ ggml_tensor_overhead()*8,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_context_ptr ctx { ggml_init(params) };
+    if (!ctx) {
+        throw std::runtime_error(format("failed to create ggml context"));
+    }
+
+    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
+    ggml_tensor * op_tensor = fn(ctx.get());
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (op_tensor->src[i] != nullptr) {
+            assert(op_tensor->src[i]->buffer == nullptr);
+            op_tensor->src[i]->buffer = buf.get();
+        }
+    }
+    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+
+    return op_supported;
+}
+
+template<typename F>
+static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
+    for (const auto & cur : buft_list) {
+        ggml_backend_dev_t cur_dev = cur.first;
+        ggml_backend_buffer_type_t cur_buft = cur.second;
+        if (buft_supported(cur_buft, cur_dev, fn)) {
+            return cur_buft;
+        }
+    }
+    throw std::runtime_error(format("no suitable buffer type found"));
+}
+
+ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il) {
+    return select_buft(*model.dev_layer.at(il).buft_list,
+        [&](ggml_context * ctx) {
+            ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+            ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+            return ggml_add(ctx, cur, layer_dir);
+        });
+}
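The buffer-type helpers that used to be header templates (see llama-model.h below) are now internal to llama-model.cpp, with llama_model_select_buft as the single exported entry point. buft_supported builds the candidate op in a no_alloc context, attaches a zero-size buffer of the candidate type to the op's source tensors, and asks the device via ggml_backend_dev_supports_op; select_buft returns the first buffer type in the layer's list that passes, or throws if none does. A minimal usage sketch, mirroring the llama-adapter.h change above (variable names are illustrative only):

// per layer, pick a buffer type that can run an F32 add of size n_embd
for (size_t il = 1; il < model.hparams.n_layer; il++) {
    ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
    ggml_context * ctx = ctx_for_buft(buft); // ctx_for_buft as in llama-adapter.h
    // ... allocate the per-layer control-vector tensors in ctx
}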

src/llama-model.h

Lines changed: 1 addition & 39 deletions
@@ -5,8 +5,6 @@
 #include "llama-vocab.h"
 #include "llama-mmap.h"

-#include "llama-impl.h"
-
 #include "ggml-cpp.h"

 #include <array>

@@ -613,42 +611,6 @@ struct llama_model {
    }
};

-template<typename F>
-static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
-    ggml_init_params params = {
-        /*.mem_size   =*/ ggml_tensor_overhead()*8,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true,
-    };
-    ggml_context_ptr ctx { ggml_init(params) };
-    if (!ctx) {
-        throw std::runtime_error(format("failed to create ggml context"));
-    }
-
-    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
-    ggml_tensor * op_tensor = fn(ctx.get());
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (op_tensor->src[i] != nullptr) {
-            assert(op_tensor->src[i]->buffer == nullptr);
-            op_tensor->src[i]->buffer = buf.get();
-        }
-    }
-    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
-
-    return op_supported;
-}
-
-template<typename F>
-static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
-    for (const auto & cur : buft_list) {
-        ggml_backend_dev_t cur_dev = cur.first;
-        ggml_backend_buffer_type_t cur_buft = cur.second;
-        if (buft_supported(cur_buft, cur_dev, fn)) {
-            return cur_buft;
-        }
-    }
-    throw std::runtime_error(format("no suitable buffer type found"));
-}
-
+ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);

 std::string llama_model_ftype_name(llama_ftype ftype);

src/llama.cpp

Lines changed: 6 additions & 12 deletions
@@ -2962,7 +2962,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {

    // hparams
    LLAMA_LOG_INFO("%s: format           = %s\n", __func__, llama_file_version_name(ml.fver));
-    LLAMA_LOG_INFO("%s: arch             = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch));
+    LLAMA_LOG_INFO("%s: arch             = %s\n", __func__, llm_arch_name(model.arch));
    LLAMA_LOG_INFO("%s: vocab type       = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
    LLAMA_LOG_INFO("%s: n_vocab          = %u\n", __func__, hparams.n_vocab);
    LLAMA_LOG_INFO("%s: n_merges         = %u\n", __func__, (int) vocab.bpe_ranks.size());

@@ -17042,9 +17042,12 @@ int32_t llama_detokenize(
 //

 static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
-    if (LLM_CHAT_TEMPLATES.find(tmpl) != LLM_CHAT_TEMPLATES.end()) {
-        return LLM_CHAT_TEMPLATES.at(tmpl);
+    try {
+        return llm_chat_template_from_str(tmpl);
+    } catch (const std::out_of_range &) {
+        // ignore
     }
+
     auto tmpl_contains = [&tmpl](const char * haystack) -> bool {
         return tmpl.find(haystack) != std::string::npos;
     };

@@ -17535,15 +17538,6 @@ int32_t llama_chat_apply_template(
     return res;
 }

-int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
-    auto it = LLM_CHAT_TEMPLATES.begin();
-    for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) {
-        output[i] = it->first.c_str();
-        std::advance(it, 1);
-    }
-    return (int32_t) LLM_CHAT_TEMPLATES.size();
-}
-
 //
 // sampling
 //
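llm_chat_template_from_str is another accessor whose definition sits in the unrendered llama-arch.cpp diff. Since llama_chat_detect_template now catches std::out_of_range and falls through to its substring heuristics, the helper plausibly forwards to the relocated LLM_CHAT_TEMPLATES map via at(); a sketch under that assumption:

llm_chat_template llm_chat_template_from_str(const std::string & name) {
    // assumed: at() throws std::out_of_range for unknown template names,
    // which the caller above treats as "not a known template id"
    return LLM_CHAT_TEMPLATES.at(name);
}

The llama_chat_builtin_templates body removed here is presumably relocated alongside the LLM_CHAT_TEMPLATES table rather than dropped.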
