Commit e444b8e

support mergekit-extract-lora

1 parent 93fbfd0 · commit e444b8e

File tree (4 files changed, +39 −72 lines):
  convert_lora_to_gguf.py
  src/llama-adapter.cpp
  src/llama-adapter.h
  src/llama.cpp

convert_lora_to_gguf.py

Lines changed: 9 additions & 8 deletions
@@ -382,13 +382,13 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                 if self.lazy:
                     tensor = LazyTorchTensor.from_eager(tensor)
                 base_name = get_base_tensor_name(name)
-                # note: lora_embedding is transposed by mergekit-extract-lora, so it's reversed here
-                is_lora_a = ".lora_A.weight" in name or ".lora_embedding_B" in name
-                is_lora_b = ".lora_B.weight" in name or ".lora_embedding_A" in name
+                # note: mergekit-extract-lora also adds token embeddings to the adapter
+                is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
+                is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
                 if not is_lora_a and not is_lora_b:
                     if ".base_layer.weight" in name:
                         continue
-                    # mergekit-extract-lora add these layernorm to the adapter
+                    # mergekit-extract-lora add these layernorm to the adapter, we need to keep them
                     if ".layernorm" or ".norm" in name:
                         yield (base_name, tensor)
                         continue
@@ -398,10 +398,6 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                     logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
                     sys.exit(1)

-                # mergekit-extract-lora transposes this tensor, we need to transpose it back
-                if ".lora_embedding" in name:
-                    tensor = tensor.T
-
                 if base_name in tensor_map:
                     if is_lora_a:
                         tensor_map[base_name].A = tensor
@@ -437,6 +433,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 assert isinstance(dest_data, LoraTorchTensor)
                 lora_a, lora_b = dest_data.get_lora_A_B()

+                # token_embd A and B are already transposed by mergekit-extract-lora
+                # we transpose A back again because it is used by llm_build_inp_embd()
+                if "token_embd.weight" in dest_name:
+                    lora_a = lora_a.T
+
                 yield (dest_name + ".lora_a", lora_a)
                 yield (dest_name + ".lora_b", lora_b)

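Note: the converter change boils down to two things: the mergekit-extract-lora embedding tensors (.lora_embedding_A / .lora_embedding_B) are now routed to the same A/B slots as ordinary .lora_A.weight / .lora_B.weight tensors, and only the token-embedding A matrix is transposed back before writing, so the GGUF comes out in the orientation llm_build_inp_embd() expects. A minimal Python sketch of that routing; classify_lora_tensor and transpose_token_embd_a are hypothetical helpers, not functions from convert_lora_to_gguf.py:

    def classify_lora_tensor(name: str):
        # mergekit-extract-lora also adds token embeddings to the adapter,
        # stored as .lora_embedding_A / .lora_embedding_B
        if ".lora_A.weight" in name or ".lora_embedding_A" in name:
            return "A"
        if ".lora_B.weight" in name or ".lora_embedding_B" in name:
            return "B"
        return None  # base weights, layernorms, etc. are handled separately

    def transpose_token_embd_a(dest_name: str, lora_a):
        # token_embd A/B come pre-transposed from mergekit-extract-lora;
        # A is transposed back because llm_build_inp_embd() gathers its rows
        return lora_a.T if "token_embd.weight" in dest_name else lora_a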
src/llama-adapter.cpp

Lines changed: 17 additions & 31 deletions
@@ -243,8 +243,9 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
                 ab_map[name].b = cur;
             }
         } else if (str_endswith(name, "_norm.weight")) {
-            // norm only has 1 dim, so tensor b == nullptr
-            ab_map[name] = llama_lora_weight(cur);
+            // TODO: add support for norm vector
+            // for now, we don't really care because most adapters still work fine without it
+            continue;
         } else {
             throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
         }
@@ -254,9 +255,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     for (auto & it : ab_map) {
         const std::string & name = it.first;
         llama_lora_weight & w = it.second;
-        if (w.is_norm) {
-            continue;
-        }
+        bool is_token_embd = str_endswith(name, "token_embd.weight");

        if (!w.a || !w.b) {
            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
@@ -270,11 +269,18 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char

         struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
         // validate tensor shape
-        if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
-            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
-        }
-        if (w.a->ne[1] != w.b->ne[0]) {
-            throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+        if (is_token_embd) {
+            // expect B to be transposed, see llm_build_inp_embd()
+            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape");
+            }
+        } else {
+            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape");
+            }
+            if (w.a->ne[1] != w.b->ne[0]) {
+                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+            }
         }

         // save tensor to adapter
@@ -285,24 +291,6 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
     }

-    // add norm vectors
-    for (auto & it : ab_map) {
-        const std::string & name = it.first;
-        llama_lora_weight & w = it.second;
-        if (w.is_norm) {
-            GGML_ASSERT(w.a != nullptr);
-            // device buft and device ctx
-            auto * model_tensor = llama_model_get_tensor(model, name.c_str());
-            if (!model_tensor) {
-                throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
-            }
-            struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
-            struct ggml_tensor * tensor_norm = ggml_dup_tensor(dev_ctx, w.a);
-            ggml_set_name(tensor_norm, w.a->name);
-            adapter.ab_map[it.first] = llama_lora_weight(tensor_norm);
-        }
-    }
-
     // allocate tensors / buffers and zero
     {
         adapter.ctxs.reserve(ctx_map.size());
@@ -335,9 +323,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
             auto orig = ab_map[it.first];
             auto dev = it.second;
             set_tensor(orig.a, dev.a);
-            if (!dev.is_norm) {
-                set_tensor(orig.b, dev.b);
-            }
+            set_tensor(orig.b, dev.b);
         }
     }

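Note: the adapter loader now special-cases token_embd.weight when validating shapes, because its B matrix is stored transposed relative to ordinary LoRA pairs. A rough Python sketch of the two code paths using ggml-style (ne0, ne1) dimension pairs; validate_lora_shapes and the example sizes are illustrative, not part of llama-adapter.cpp:

    def validate_lora_shapes(model_ne, a_ne, b_ne, is_token_embd: bool) -> None:
        if is_token_embd:
            # token_embd: B is expected transposed, see llm_build_inp_embd()
            if model_ne[0] != b_ne[1] or model_ne[1] != a_ne[1]:
                raise ValueError("tensor has incorrect shape")
        else:
            if model_ne[0] != a_ne[0] or model_ne[1] != b_ne[1]:
                raise ValueError("tensor has incorrect shape")
            if a_ne[1] != b_ne[0]:
                raise ValueError("lora_a tensor is not transposed")

    # e.g. n_embd = 8, n_vocab = 32, rank = 4 (illustrative sizes)
    validate_lora_shapes((8, 32), (4, 32), (4, 8), is_token_embd=True)   # token_embd pair
    validate_lora_shapes((8, 8), (8, 4), (4, 8), is_token_embd=False)    # ordinary pair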
src/llama-adapter.h

Lines changed: 6 additions & 3 deletions
@@ -45,11 +45,14 @@ struct llama_lora_weight {
     struct ggml_tensor * a = nullptr;
     struct ggml_tensor * b = nullptr;

-    // note: norm only has 1 dim, so tensor b == nullptr
-    bool is_norm = false; // is this a norm vector? (e.g. _norm.weight)
+    // get actual scale based on rank and alpha
+    float get_scale(float alpha, float adapter_scale) {
+        const float rank = (float) b->ne[0];
+        const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
+        return scale;
+    }

     llama_lora_weight() = default;
-    llama_lora_weight(struct ggml_tensor * a) : a(a), is_norm(true) {}
     llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
 };

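Note: the per-adapter scale formula now lives in the new llama_lora_weight::get_scale() helper: the effective scale is adapter_scale * alpha / rank when a non-zero alpha is stored, otherwise just adapter_scale, with rank read from b->ne[0]. A quick Python sketch of the same formula; lora_scale and the example numbers are illustrative:

    def lora_scale(alpha: float, adapter_scale: float, rank: int) -> float:
        # mirrors llama_lora_weight::get_scale(): alpha == 0 means "no alpha stored",
        # in which case the user-provided adapter scale is used as-is
        return adapter_scale * alpha / rank if alpha else adapter_scale

    assert lora_scale(alpha=32.0, adapter_scale=1.0, rank=16) == 2.0
    assert lora_scale(alpha=0.0, adapter_scale=0.75, rank=16) == 0.75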
src/llama.cpp

Lines changed: 7 additions & 30 deletions
@@ -2545,27 +2545,20 @@ static struct ggml_tensor * llm_build_inp_embd(
         ggml_set_input(lctx.inp_tokens);

         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
-        //printf("tok_embd shape: %d x %d\n", tok_embd->ne[0], tok_embd->ne[1]);
-        //printf("inpL shape: %d x %d\n", inpL->ne[0], inpL->ne[1]);

         // apply lora for embedding tokens if needed
         for (auto & it : lctx.lora_adapters) {
             struct llama_lora_weight * lora = it.first->get_weight(tok_embd);
             if (lora == nullptr) {
                 continue;
             }
-            const float alpha = it.first->alpha;
-            const float rank = (float) lora->b->ne[0];
-            const float scale = alpha ? it.second * alpha / rank : it.second;
-            auto ss = ggml_get_rows(ctx, lora->b, lctx.inp_tokens);
-            //printf("a shape: %d x %d\n", lora->a->ne[0], lora->a->ne[1]);
-            //printf("b shape: %d x %d\n", lora->b->ne[0], lora->b->ne[1]);
-            //printf("ss shape: %d x %d\n", ss->ne[0], ss->ne[1]);
+            const float adapter_scale = it.second;
+            const float scale = lora->get_scale(it.first->alpha, adapter_scale);
             struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat(
-                ctx, ss, ggml_transpose(ctx, lora->a)
+                ctx, lora->b, // non-transposed lora_b
+                ggml_get_rows(ctx, lora->a, lctx.inp_tokens)
             ), scale);
-            //printf("inpL_delta shape: %d x %d\n", inpL_delta->ne[0], inpL_delta->ne[1]);
-            inpL = ggml_add(ctx, inpL, ggml_cont(ctx, ggml_transpose(ctx, inpL_delta)));
+            inpL = ggml_add(ctx, inpL, inpL_delta);
         }
     } else {
         lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
@@ -3919,17 +3912,9 @@ struct llm_build_context {
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

-            struct ggml_tensor * attn_norm = model.layers[il].attn_norm;
-            for (auto & it : lctx.lora_adapters) {
-                struct llama_lora_weight * lora = it.first->get_weight(model.layers[il].attn_norm);
-                if (lora && lora->is_norm) {
-                    attn_norm = ggml_add(ctx0, attn_norm, ggml_scale(ctx0, lora->a, 0.5));
-                }
-            }
-
             // norm
             cur = llm_build_norm(ctx0, inpL, hparams,
-                    attn_norm, NULL,
+                    model.layers[il].attn_norm, NULL,
                     LLM_NORM_RMS, cb, il);
             cb(cur, "attn_norm", il);

@@ -3998,16 +3983,8 @@ struct llm_build_context {
             // feed-forward network
             if (model.layers[il].ffn_gate_inp == nullptr) {

-                struct ggml_tensor * ffn_norm = model.layers[il].ffn_norm;
-                // for (auto & it : lctx.lora_adapters) {
-                //     struct llama_lora_weight * lora = it.first->get_weight(ffn_norm);
-                //     if (lora && lora->is_norm) {
-                //         ffn_norm = ggml_add(ctx0, ffn_norm, lora->a);
-                //     }
-                // }
-
                 cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        ffn_norm, NULL,
+                        model.layers[il].ffn_norm, NULL,
                         LLM_NORM_RMS, cb, il);
                 cb(cur, "ffn_norm", il);

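Note: the rewritten embedding path gathers the per-token rows of lora_a and multiplies them by the non-transposed lora_b, replacing the earlier get_rows-on-B plus double-transpose construction. A numpy sketch of the underlying math, ignoring ggml's dimension-order conventions and assuming PEFT-style shapes (A: rank x n_vocab, B: n_embd x rank), shows the gathered form matches materializing the full delta; all names and sizes are illustrative:

    import numpy as np

    rng = np.random.default_rng(0)
    n_vocab, n_embd, rank, scale = 32, 8, 4, 0.5
    W = rng.standard_normal((n_vocab, n_embd))   # base token embedding matrix
    A = rng.standard_normal((rank, n_vocab))     # lora_a (PEFT-style orientation)
    B = rng.standard_normal((n_embd, rank))      # lora_b

    tokens = np.array([3, 7, 7, 1])

    # reference: materialize the full LoRA delta over the vocab, then index it
    full = W + scale * (B @ A).T                 # (n_vocab, n_embd)
    ref = full[tokens]

    # what the new graph does conceptually: gather per-token slices of A first,
    # then apply B -- no full-vocab delta tensor is ever built
    out = W[tokens] + scale * (B @ A[:, tokens]).T

    assert np.allclose(ref, out)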