Commit 35d0e02

talk-llama : sync llama.cpp (#2709)

1 parent 45d3faf · commit 35d0e02

36 files changed (+20456 −18837 lines)
examples/talk-llama/CMakeLists.txt

Lines changed: 17 additions & 1 deletion

@@ -1,10 +1,26 @@
 if (WHISPER_SDL2)
+    set(CMAKE_CXX_STANDARD 17)
+    set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
     set(TARGET whisper-talk-llama)
     add_executable(${TARGET} talk-llama.cpp
         llama.cpp
-        llama-vocab.cpp
+        llama-adapter.cpp
+        llama-arch.cpp
+        llama-batch.cpp
+        llama-chat.cpp
+        llama-context.cpp
+        llama-cparams.cpp
         llama-grammar.cpp
+        llama-hparams.cpp
+        llama-impl.cpp
+        llama-kv-cache.cpp
+        llama-mmap.cpp
+        llama-model-loader.cpp
+        llama-model.cpp
+        llama-quant.cpp
         llama-sampling.cpp
+        llama-vocab.cpp
         unicode.cpp
         unicode-data.cpp)
     target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
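The synced llama.cpp sources evidently rely on C++17, which is why the build now sets CMAKE_CXX_STANDARD 17 and marks it required. As a minimal sketch (not part of the commit), a translation unit can assert that the toolchain really is in C++17 mode:

// Minimal sketch, not from the commit: fail early if the compiler is not
// actually compiling as C++17 (note: MSVC needs /Zc:__cplusplus for this).
static_assert(__cplusplus >= 201703L, "whisper-talk-llama requires C++17");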
examples/talk-llama/llama-adapter.cpp (new file)

Lines changed: 334 additions & 0 deletions

#include "llama-adapter.h"

#include "llama-model.h"

#include <algorithm>
#include <map>
#include <cassert>
#include <stdexcept>
// vec

struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
        return nullptr;
    }

    return tensors[il];
}

struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
    ggml_tensor * layer_dir = tensor_for(il);
    if (layer_dir != nullptr) {
        cur = ggml_add(ctx, cur, layer_dir);
    }

    return cur;
}
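apply_to() adds the per-layer direction tensor into the hidden state whenever tensor_for() finds one inside [layer_start, layer_end]. A self-contained sketch of the same ggml op, with hypothetical dimensions (not part of the commit):

// Hypothetical sketch of the op that apply_to() inserts per layer.
#include "ggml.h"

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // cur stands in for a layer's hidden state, dir for the control direction
    struct ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);
    struct ggml_tensor * dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);

    cur = ggml_add(ctx, cur, dir); // same op as apply_to()

    ggml_free(ctx);
    return 0;
}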
static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
    const auto & hparams = model.hparams;

    GGML_ASSERT(cvec.tensors.empty());
    GGML_ASSERT(cvec.ctxs.empty());
    GGML_ASSERT(cvec.bufs.empty());

    // create a context for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            struct ggml_init_params params = {
                /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };

            ggml_context * ctx = ggml_init(params);
            if (!ctx) {
                return nullptr;
            }

            ctx_map[buft] = ctx;
            cvec.ctxs.emplace_back(ctx);

            return ctx;
        }

        return it->second;
    };

    // make tensors
    cvec.tensors.reserve(hparams.n_layer);
    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
    for (size_t il = 1; il < hparams.n_layer; il++) {
        ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
        ggml_context * ctx = ctx_for_buft(buft);
        if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
            return false;
        }
        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
        cvec.tensors.push_back(tensor);
    }

    // allocate tensors / buffers and zero
    cvec.bufs.reserve(ctx_map.size());
    for (auto it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context * ctx = it.second;
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (!buf) {
            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
            return false;
        }
        ggml_backend_buffer_clear(buf, 0);
        cvec.bufs.emplace_back(buf);
    }

    return true;
}
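llama_control_vector_init lazily creates one ggml_context per backend buffer type, so all tensors destined for the same device are grouped into a single allocation. The memoizing shape of ctx_for_buft, reduced to a generic sketch with hypothetical types (not from the commit):

// Generic sketch of the ctx_for_buft pattern: create a resource the first
// time a key is seen, return the cached one on every later call.
#include <map>
#include <string>

int main() {
    std::map<int, std::string *> cache;
    auto get_or_create = [&](int key) -> std::string * {
        auto it = cache.find(key);
        if (it == cache.end()) {
            std::string * v = new std::string("resource"); // real code: ggml_init(...)
            cache[key] = v;
            return v;
        }
        return it->second;
    };
    get_or_create(0); // creates
    get_or_create(0); // returns the cached pointer
    for (auto & it : cache) delete it.second;
    return 0;
}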
int32_t llama_control_vector_apply(
        struct llama_control_vector & cvec,
        const llama_model & model,
        const float * data,
        size_t len,
        int32_t n_embd,
        int32_t il_start,
        int32_t il_end) {
    const auto & hparams = model.hparams;

    if (data == nullptr) {
        // disable the current control vector (but leave allocated for later)
        cvec.layer_start = -1;
        cvec.layer_end   = -1;
        return 0;
    }

    if (n_embd != (int) hparams.n_embd) {
        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
        return 1;
    }

    if (cvec.tensors.empty()) {
        if (!llama_control_vector_init(cvec, model)) {
            return 1;
        }
    }

    cvec.layer_start = il_start;
    cvec.layer_end   = il_end;

    for (size_t il = 1; il < hparams.n_layer; il++) {
        assert(cvec.tensors[il] != nullptr);

        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
        if (off + n_embd <= len) {
            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
        }
    }

    return 0;
}
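The data buffer passed to llama_control_vector_apply is a flat array of directions, n_embd floats per layer, with layer 0 omitted; hence the off = n_embd * (il - 1) indexing above. A small sketch with hypothetical dimensions (not from the commit) showing how such a buffer lines up:

// Hypothetical layout of a control-vector buffer matching the offset math.
#include <cstddef>
#include <vector>

int main() {
    const int n_embd  = 4096; // hypothetical model width
    const int n_layer = 32;   // hypothetical layer count

    // one direction per layer 1..n_layer-1; layer 0 has no entry
    std::vector<float> data((size_t)(n_layer - 1) * n_embd, 0.0f);

    for (int il = 1; il < n_layer; il++) {
        const size_t off = (size_t) n_embd * (il - 1); // same offset math as above
        // data[off] .. data[off + n_embd - 1] is the direction for layer il
        data[off] = 1.0f;
    }
    return 0;
}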
// lora

llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
    const std::string name(w->name);

    const auto pos = ab_map.find(name);
    if (pos != ab_map.end()) {
        return &pos->second;
    }

    return nullptr;
}

void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
    delete adapter;
}
static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

    ggml_context * ctx_init;
    struct gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ true,
        /* .ctx      = */ &ctx_init,
    };

    gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
    if (!ctx_gguf) {
        throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
    }

    ggml_context_ptr ctx { ctx_init };

    // check metadata
    {
        auto get_kv_str = [&](const std::string & key) -> std::string {
            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id));
        };
        auto get_kv_f32 = [&](const std::string & key) -> float {
            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
            return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id);
        };
        LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

        auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
        if (general_type != "adapter") {
            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
        }

        auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
        auto general_arch = llm_arch_from_string(general_arch_str);
        if (general_arch != model.arch) {
            throw std::runtime_error("model arch and LoRA arch mismatch");
        }

        auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
        if (adapter_type != "lora") {
            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
        }

        adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
    }

    int n_tensors = gguf_get_n_tensors(ctx_gguf.get());

    // contexts for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            // add a new context
            struct ggml_init_params params = {
                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
            ggml_context * buft_ctx = ggml_init(params);
            if (!buft_ctx) {
                return nullptr;
            }
            ctx_map[buft] = buft_ctx;
            adapter.ctxs.emplace_back(buft_ctx);
            return buft_ctx;
        };
        return it->second;
    };

    // bundle lora_a and lora_b into pairs
    std::map<std::string, llama_lora_weight> ab_map;
    auto str_endswith = [](const std::string & str, const std::string & suffix) {
        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
    };

    for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
        std::string name(cur->name);
        if (str_endswith(name, ".lora_a")) {
            replace_all(name, ".lora_a", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_lora_weight(cur, nullptr);
            } else {
                ab_map[name].a = cur;
            }
        } else if (str_endswith(name, ".lora_b")) {
            replace_all(name, ".lora_b", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_lora_weight(nullptr, cur);
            } else {
                ab_map[name].b = cur;
            }
        } else {
            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
        }
    }

    // add tensors
    for (auto & it : ab_map) {
        const std::string & name = it.first;
        llama_lora_weight & w = it.second;

        if (!w.a || !w.b) {
            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
        }

        // device buft and device ctx
        auto * model_tensor = llama_model_get_tensor(model, name.c_str());
        if (!model_tensor) {
            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
        }

        struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
        // validate tensor shape
        if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
        }
        if (w.a->ne[1] != w.b->ne[0]) {
            throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
        }

        // save tensor to adapter
        struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
        ggml_set_name(tensor_a, w.a->name);
        ggml_set_name(tensor_b, w.b->name);
        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
    }

    // allocate tensors / buffers and zero
    {
        adapter.ctxs.reserve(ctx_map.size());
        adapter.bufs.reserve(ctx_map.size());
        for (auto & it : ctx_map) {
            ggml_backend_buffer_type_t buft = it.first;
            ggml_context * ctx_dev = it.second;
            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
            if (!buf) {
                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
            }
            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
            adapter.bufs.emplace_back(std::move(buf));
        }
    }

    // set tensor data
    {
        llama_file gguf_file(path_lora, "rb");
        std::vector<uint8_t> read_buf;
        auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
            size_t size = ggml_nbytes(orig);
            read_buf.resize(size);
            gguf_file.seek(offs, SEEK_SET);
            gguf_file.read_raw(read_buf.data(), size);
            ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
        };
        for (auto & it : adapter.ab_map) {
            auto orig = ab_map[it.first];
            auto dev  = it.second;
            set_tensor(orig.a, dev.a);
            set_tensor(orig.b, dev.b);
        }
    }

    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}
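The shape checks above encode the usual LoRA factorization: the adapter contributes scale * B·A on top of a base weight W, so in ggml's ne[] order lora_a must share ne[0] with W, lora_b must share ne[1] with W, and the inner rank dimensions must agree. A sketch with hypothetical dimensions (not from the commit) that satisfies all three checks:

// Hypothetical dimensions illustrating the shape checks in
// llama_lora_adapter_init_impl (ggml ne[] order: ne[0] is the fastest dim).
#include <cassert>

int main() {
    const int n_in = 4096, n_out = 11008, rank = 16; // hypothetical

    int w_ne[2] = { n_in, n_out }; // base model tensor
    int a_ne[2] = { n_in, rank  }; // lora_a
    int b_ne[2] = { rank, n_out }; // lora_b

    assert(w_ne[0] == a_ne[0]); // model_tensor->ne[0] == w.a->ne[0]
    assert(w_ne[1] == b_ne[1]); // model_tensor->ne[1] == w.b->ne[1]
    assert(a_ne[1] == b_ne[0]); // inner rank dims agree (the "transposed" check)
    return 0;
}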
struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
    struct llama_lora_adapter * adapter = new llama_lora_adapter();

    try {
        llama_lora_adapter_init_impl(*model, path_lora, *adapter);
        return adapter;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());

        delete adapter;
    }

    return nullptr;
}
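A hedged usage sketch of the entry points defined in this file; the loaded llama_model and the path "adapter.gguf" are hypothetical, not from the commit:

// Hypothetical usage; assumes an already-loaded llama_model * model.
struct llama_lora_adapter * adapter = llama_lora_adapter_init(model, "adapter.gguf");
if (adapter == nullptr) {
    // init already logged the error and freed the partially built adapter
} else {
    // ... use the adapter during inference ...
    llama_lora_adapter_free(adapter);
}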
