Commit 99b011a

talk-llama : sync llama.cpp

1 parent: 19d95f9
26 files changed, +5768 -5073 lines

examples/talk-llama/llama-adapter.cpp

Lines changed: 57 additions & 44 deletions
@@ -1,5 +1,7 @@
 #include "llama-adapter.h"
 
+#include "llama-impl.h"
+#include "llama-mmap.h"
 #include "llama-model.h"
 
 #include <algorithm>
@@ -9,15 +11,15 @@
 
 // vec
 
-struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
+struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
         return nullptr;
     }
 
     return tensors[il];
 }
 
-struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
+struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
     ggml_tensor * layer_dir = tensor_for(il);
     if (layer_dir != nullptr) {
         cur = ggml_add(ctx, cur, layer_dir);
@@ -26,12 +28,12 @@ struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, s
     return cur;
 }
 
-static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+bool llama_adapter_cvec::init(const llama_model & model) {
     const auto & hparams = model.hparams;
 
-    GGML_ASSERT(cvec.tensors.empty());
-    GGML_ASSERT(cvec.ctxs.empty());
-    GGML_ASSERT(cvec.bufs.empty());
+    GGML_ASSERT(tensors.empty());
+    GGML_ASSERT(ctxs.empty());
+    GGML_ASSERT(bufs.empty());
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -50,7 +52,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
         }
 
         ctx_map[buft] = ctx;
-        cvec.ctxs.emplace_back(ctx);
+        ctxs.emplace_back(ctx);
 
         return ctx;
     }
@@ -59,21 +61,21 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     };
 
     // make tensors
-    cvec.tensors.reserve(hparams.n_layer);
-    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+    tensors.reserve(hparams.n_layer);
+    tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < hparams.n_layer; il++) {
-        ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
+        ggml_backend_buffer_type_t buft = model.select_buft(il);
         ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
             LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
             return false;
         }
         ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
-        cvec.tensors.push_back(tensor);
+        tensors.push_back(tensor);
     }
 
     // allocate tensors / buffers and zero
-    cvec.bufs.reserve(ctx_map.size());
+    bufs.reserve(ctx_map.size());
     for (auto it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
@@ -83,14 +85,13 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
             return false;
         }
         ggml_backend_buffer_clear(buf, 0);
-        cvec.bufs.emplace_back(buf);
+        bufs.emplace_back(buf);
     }
 
     return true;
 }
 
-int32_t llama_control_vector_apply(
-        struct llama_control_vector & cvec,
+int32_t llama_adapter_cvec::apply(
         const llama_model & model,
         const float * data,
         size_t len,
@@ -101,8 +102,8 @@ int32_t llama_control_vector_apply(
 
     if (data == nullptr) {
         // disable the current control vector (but leave allocated for later)
-        cvec.layer_start = -1;
-        cvec.layer_end = -1;
+        layer_start = -1;
+        layer_end = -1;
         return 0;
     }
 
@@ -111,21 +112,21 @@
         return 1;
     }
 
-    if (cvec.tensors.empty()) {
-        if (!llama_control_vector_init(cvec, model)) {
+    if (tensors.empty()) {
+        if (!init(model)) {
            return 1;
         }
     }
 
-    cvec.layer_start = il_start;
-    cvec.layer_end = il_end;
+    layer_start = il_start;
+    layer_end = il_end;
 
     for (size_t il = 1; il < hparams.n_layer; il++) {
-        assert(cvec.tensors[il] != nullptr);
+        assert(tensors[il] != nullptr);
 
         const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
        if (off + n_embd <= len) {
-            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
+            ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
         }
     }
 
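The `apply` hunk above also documents the expected layout of the caller-supplied control-vector buffer: one block of `n_embd` floats per layer, starting at layer 1, so the offset for layer `il` is `n_embd * (il - 1)` and layer 0 is never written. A minimal caller-side sketch of that layout follows; the helper name and the fill values are illustrative, not part of this commit:

```cpp
#include <cstdint>
#include <vector>

// Build a buffer in the layout llama_adapter_cvec::apply() expects:
// n_embd floats per layer, beginning with layer 1 (layer 0 has no tensor).
static std::vector<float> make_cvec_data(int32_t n_layer, int32_t n_embd) {
    std::vector<float> data((size_t) (n_layer - 1) * n_embd, 0.0f);
    for (int32_t il = 1; il < n_layer; il++) {
        const size_t off = (size_t) n_embd * (il - 1); // same offset formula used in apply()
        data[off] = 0.1f; // illustrative per-layer direction value
    }
    return data;
}
```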

@@ -134,7 +135,7 @@
 
 // lora
 
-llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
     const std::string name(w->name);
 
     const auto pos = ab_map.find(name);
@@ -145,11 +146,7 @@ llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
     return nullptr;
 }
 
-void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
-    delete adapter;
-}
-
-static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
+static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
     ggml_context * ctx_init;
@@ -221,7 +218,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     };
 
     // bundle lora_a and lora_b into pairs
-    std::map<std::string, llama_lora_weight> ab_map;
+    std::map<std::string, llama_adapter_lora_weight> ab_map;
     auto str_endswith = [](const std::string & str, const std::string & suffix) {
         return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
     };
@@ -231,17 +228,21 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         if (str_endswith(name, ".lora_a")) {
             replace_all(name, ".lora_a", "");
             if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(cur, nullptr);
+                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
             } else {
                 ab_map[name].a = cur;
             }
         } else if (str_endswith(name, ".lora_b")) {
             replace_all(name, ".lora_b", "");
             if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(nullptr, cur);
+                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
             } else {
                 ab_map[name].b = cur;
             }
+        } else if (str_endswith(name, "_norm.weight")) {
+            // TODO: add support for norm vector
+            // for now, we don't really care because most adapters still work fine without it
+            continue;
         } else {
             throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
         }
@@ -250,33 +251,41 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     // add tensors
     for (auto & it : ab_map) {
         const std::string & name = it.first;
-        llama_lora_weight & w = it.second;
+        llama_adapter_lora_weight & w = it.second;
+        bool is_token_embd = str_endswith(name, "token_embd.weight");
 
         if (!w.a || !w.b) {
             throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
         }
 
         // device buft and device ctx
-        auto * model_tensor = llama_model_get_tensor(model, name.c_str());
+        const auto * model_tensor = model.get_tensor(name.c_str());
         if (!model_tensor) {
-            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
+            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
         }
 
         struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
         // validate tensor shape
-        if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
-            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
-        }
-        if (w.a->ne[1] != w.b->ne[0]) {
-            throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+        if (is_token_embd) {
+            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
+            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+        } else {
+            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+            if (w.a->ne[1] != w.b->ne[0]) {
+                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+            }
         }
 
         // save tensor to adapter
         struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
         struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
         ggml_set_name(tensor_a, w.a->name);
         ggml_set_name(tensor_b, w.b->name);
-        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
+        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
     }
 
     // allocate tensors / buffers and zero
@@ -318,11 +327,11 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
 
-struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
-    struct llama_lora_adapter * adapter = new llama_lora_adapter();
+struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
+    struct llama_adapter_lora * adapter = new llama_adapter_lora();
 
     try {
-        llama_lora_adapter_init_impl(*model, path_lora, *adapter);
+        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
         return adapter;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -332,3 +341,7 @@ struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model,
 
     return nullptr;
 }
+
+void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+    delete adapter;
+}
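For callers, the user-visible change in this file is the rename of the public LoRA entry points: `llama_lora_adapter_init` becomes `llama_adapter_lora_init` and the free function is added back at the end of the file as `llama_adapter_lora_free`. A minimal usage sketch against the renamed API, assuming the declarations are exported through `llama.h` as the previous `llama_lora_*` functions were; the adapter path and error handling are illustrative:

```cpp
#include "llama.h"

// Load a LoRA adapter with the renamed API and release it when done.
static bool try_load_lora(struct llama_model * model) {
    struct llama_adapter_lora * adapter = llama_adapter_lora_init(model, "adapter.gguf");
    if (adapter == nullptr) {
        // llama_adapter_lora_init logs the error and returns nullptr instead of throwing
        return false;
    }

    // ... attach the adapter to a context and run inference ...

    llama_adapter_lora_free(adapter); // replaces the old llama_lora_adapter_free()
    return true;
}
```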
examples/talk-llama/llama-adapter.h

Lines changed: 36 additions & 28 deletions
@@ -1,66 +1,74 @@
 #pragma once
 
-#include "llama-impl.h"
-#include "llama-hparams.h"
+#include "llama.h"
 
 #include "ggml-cpp.h"
 
+#include <string>
 #include <unordered_map>
 #include <vector>
 
+// TODO: pimpl
+
 //
 // llama_adapter_cvec
 //
 
-// TODO: rename to llama_adapter_cvec
-struct llama_control_vector {
-    std::vector<ggml_context_ptr> ctxs;
-    std::vector<ggml_backend_buffer_ptr> bufs;
+struct llama_adapter_cvec {
+    struct ggml_tensor * tensor_for(int il) const;
 
-    std::vector<struct ggml_tensor *> tensors; // per layer
+    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
+
+    int32_t apply(
+            const llama_model & model,
+            const float * data,
+            size_t len,
+            int32_t n_embd,
+            int32_t il_start,
+            int32_t il_end);
+
+private:
+    bool init(const llama_model & model);
 
     int32_t layer_start = -1;
     int32_t layer_end = -1;
 
-    struct ggml_tensor * tensor_for(int il) const;
+    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
 
-    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
+    std::vector<struct ggml_tensor *> tensors; // per layer
 };
 
-int32_t llama_control_vector_apply(
-        struct llama_control_vector & cvec,
-        const llama_model & model,
-        const float * data,
-        size_t len,
-        int32_t n_embd,
-        int32_t il_start,
-        int32_t il_end);
-
 //
 // llama_adapter_lora
 //
 
-// TODO: rename to llama_adapter_lora_weight
-struct llama_lora_weight {
+struct llama_adapter_lora_weight {
     struct ggml_tensor * a = nullptr;
     struct ggml_tensor * b = nullptr;
 
-    llama_lora_weight() = default;
-    llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
+    // get actual scale based on rank and alpha
+    float get_scale(float alpha, float adapter_scale) const {
+        const float rank = (float) b->ne[0];
+        const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
+        return scale;
+    }
+
+    llama_adapter_lora_weight() = default;
+    llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
 };
 
-// TODO: rename to llama_adapter_lora
-struct llama_lora_adapter {
+struct llama_adapter_lora {
     // map tensor name to lora_a_b
-    std::unordered_map<std::string, struct llama_lora_weight> ab_map;
+    std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;
 
     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;
 
     float alpha;
 
-    llama_lora_adapter() = default;
-    ~llama_lora_adapter() = default;
+    llama_adapter_lora() = default;
+    ~llama_adapter_lora() = default;
 
-    llama_lora_weight * get_weight(struct ggml_tensor * w);
+    llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
 };
