From f8d33f3f08f21dec60280fde73b45d25aaa3d086 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Tue, 10 Dec 2024 20:53:51 +0100 Subject: [PATCH 01/12] Flux Lora: support _A _B and merge split qkv to linear1 --- lora.hpp | 361 ++++++++++++++++++++++++++++++++++--------- stable-diffusion.cpp | 5 +- 2 files changed, 288 insertions(+), 78 deletions(-) diff --git a/lora.hpp b/lora.hpp index 5f458faee..720b41c74 100644 --- a/lora.hpp +++ b/lora.hpp @@ -6,6 +6,41 @@ #define LORA_GRAPH_SIZE 10240 struct LoraModel : public GGMLRunner { + static enum lora_t { + REGULAR = 0, + DIFFUSERS = 1, + DIFFUSERS_2 = 2, + DIFFUSERS_3 = 3, + TRANSFORMERS = 4, + LORA_TYPE_COUNT + }; + + const std::string lora_ups[LORA_TYPE_COUNT] = { + ".lora_up", + "_lora.up", + ".lora_B", + ".lora.up", + ".lora_linear_layer.up", + }; + + const std::string lora_downs[LORA_TYPE_COUNT] = { + ".lora_down", + "_lora.down", + ".lora_A", + ".lora.down", + ".lora_linear_layer.down", + }; + + const std::string lora_pre[LORA_TYPE_COUNT] = { + "lora.", + "", + "", + "", + "", + }; + + const std::string* type_fingerprints = lora_ups; + float multiplier = 1.0f; std::map lora_tensors; std::string file_path; @@ -14,6 +49,7 @@ struct LoraModel : public GGMLRunner { bool applied = false; std::vector zero_index_vec = {0}; ggml_tensor* zero_index = NULL; + enum lora_t type = REGULAR; LoraModel(ggml_backend_t backend, const std::string& file_path = "", @@ -44,6 +80,13 @@ struct LoraModel : public GGMLRunner { // LOG_INFO("skipping LoRA tesnor '%s'", name.c_str()); return true; } + // LOG_INFO("%s", name.c_str()); + for (int i = 0; i < LORA_TYPE_COUNT; i++) { + if (name.find(type_fingerprints[i]) != std::string::npos) { + type = (lora_t)i; + break; + } + } if (dry_run) { struct ggml_tensor* real = ggml_new_tensor(params_ctx, @@ -76,7 +119,38 @@ struct LoraModel : public GGMLRunner { return out; } - struct ggml_cgraph* build_lora_graph(std::map model_tensors) { + std::vector to_lora_keys(std::string blk_name, SDVersion version) { + std::vector keys; + size_t k_pos = blk_name.find(".weight"); + if (k_pos == std::string::npos) { + return keys; + } + blk_name = blk_name.substr(0, k_pos); + if (type == REGULAR) { + keys.push_back(blk_name); + // blk_name = blk_name.substr(sizeof("diffusion_model.")); + replace_all_chars(blk_name, '.', '_'); + keys.push_back(blk_name); + return keys; + } else if (type == DIFFUSERS || type == DIFFUSERS_2 || DIFFUSERS_3) { + // if (sd_version_is_Flux(version)) { + if (blk_name.find("model.diffusion_model") != std::string::npos) { + blk_name.replace(blk_name.find("model.diffusion_model"), sizeof("model.diffusion_model") - 1, "transformer"); + } + if (blk_name.find(".single_blocks") != std::string::npos) { + blk_name.replace(blk_name.find(".single_blocks"), sizeof(".single_blocks") - 1, ".single_transformer_blocks"); + } + if (blk_name.find(".double_blocks") != std::string::npos) { + blk_name.replace(blk_name.find(".double_blocks"), sizeof(".double_blocks") - 1, ".transformer_blocks"); + } + keys.push_back(blk_name); + // } + } + // LOG_DEBUG("k_tensor %s", k_tensor.c_str()); + return keys; + } + + struct ggml_cgraph* build_lora_graph(std::map model_tensors, SDVersion version) { struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, LORA_GRAPH_SIZE, false); zero_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1); @@ -88,84 +162,217 @@ struct LoraModel : public GGMLRunner { std::string k_tensor = it.first; struct ggml_tensor* weight = model_tensors[it.first]; - size_t k_pos = 
k_tensor.find(".weight"); - if (k_pos == std::string::npos) { + // LOG_INFO("%s", k_tensor.c_str()); + std::vector keys = to_lora_keys(k_tensor, version); + if (keys.size() == 0) continue; - } - k_tensor = k_tensor.substr(0, k_pos); - replace_all_chars(k_tensor, '.', '_'); - // LOG_DEBUG("k_tensor %s", k_tensor.c_str()); - std::string lora_up_name = "lora." + k_tensor + ".lora_up.weight"; - if (lora_tensors.find(lora_up_name) == lora_tensors.end()) { - if (k_tensor == "model_diffusion_model_output_blocks_2_2_conv") { - // fix for some sdxl lora, like lcm-lora-xl - k_tensor = "model_diffusion_model_output_blocks_2_1_conv"; - lora_up_name = "lora." + k_tensor + ".lora_up.weight"; + for (auto& key : keys) { + ggml_tensor* lora_up = NULL; + ggml_tensor* lora_down = NULL; + // LOG_DEBUG("k_tensor %s", k_tensor.c_str()); + if (sd_version_is_flux(version)) { + size_t l1 = key.find("linear1"); + if (l1 != std::string::npos) { + l1 -= 1; + auto split_q_u_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_q" + lora_ups[type] + ".weight"; + if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) { + // print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1] + // find qkv and mlp up parts in LoRA model + auto split_k_u_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_k" + lora_ups[type] + ".weight"; + auto split_v_u_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_v" + lora_ups[type] + ".weight"; + + auto split_q_d_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_q" + lora_downs[type] + ".weight"; + auto split_k_d_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_k" + lora_downs[type] + ".weight"; + auto split_v_d_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_v" + lora_downs[type] + ".weight"; + + auto split_m_u_name = lora_pre[type] + key.substr(0, l1) + ".proj_mlp" + lora_ups[type] + ".weight"; + auto split_m_d_name = lora_pre[type] + key.substr(0, l1) + ".proj_mlp" + lora_downs[type] + ".weight"; + + ggml_tensor* lora_q_up = NULL; + ggml_tensor* lora_q_down = NULL; + ggml_tensor* lora_k_up = NULL; + ggml_tensor* lora_k_down = NULL; + ggml_tensor* lora_v_up = NULL; + ggml_tensor* lora_v_down = NULL; + + ggml_tensor* lora_m_up = NULL; + ggml_tensor* lora_m_down = NULL; + + lora_q_up = lora_tensors[split_q_u_name]; + + if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) { + lora_q_down = lora_tensors[split_q_d_name]; + } + + if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) { + lora_k_up = lora_tensors[split_k_u_name]; + } + + if (lora_tensors.find(split_k_d_name) != lora_tensors.end()) { + lora_k_down = lora_tensors[split_k_d_name]; + } + + if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) { + lora_v_up = lora_tensors[split_v_u_name]; + } + + if (lora_tensors.find(split_v_d_name) != lora_tensors.end()) { + lora_v_down = lora_tensors[split_v_d_name]; + } + + if (lora_tensors.find(split_m_u_name) != lora_tensors.end()) { + lora_m_up = lora_tensors[split_m_u_name]; + } + + if (lora_tensors.find(split_m_d_name) != lora_tensors.end()) { + lora_m_down = lora_tensors[split_m_d_name]; + } + + // print_ggml_tensor(lora_q_down, true); //[3072, R, 1, 1] + // print_ggml_tensor(lora_k_down, true); //[3072, R, 1, 1] + // print_ggml_tensor(lora_v_down, true); //[3072, R, 1, 1] + // print_ggml_tensor(lora_m_down, true); //[3072, R, 1, 1] + // print_ggml_tensor(lora_q_up, true); //[R, 3072, 1, 1] + // print_ggml_tensor(lora_k_up, true); //[R, 3072, 1, 1] + // print_ggml_tensor(lora_v_up, true); //[R, 3072, 1, 1] + // print_ggml_tensor(lora_m_up, 
true); //[R, 12288, 1, 1] + + // these need to be stitched together this way: + // |q_up,0 ,0 ,0 | + // |0 ,k_up,0 ,0 | + // |0 ,0 ,v_up,0 | + // |0 ,0 ,0 ,m_up| + // (q_down,k_down,v_down,m_down) . (q ,k ,v ,m) + + // up_concat will be [21504, R*4, 1, 1] + // down_concat will be [R*4, 3072, 1, 1] + + ggml_tensor* lora_down_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_down, lora_k_down, 1), ggml_concat(compute_ctx, lora_v_down, lora_m_down, 1), 1); + // print_ggml_tensor(lora_down_concat, true); //[3072, R*4, 1, 1] + + // this also means that if rank is bigger than 672, it is less memory efficient to do it this way (should be fine) + // print_ggml_tensor(lora_q_up, true); //[3072, R, 1, 1] + ggml_tensor* z = ggml_dup_tensor(compute_ctx, lora_q_up); + ggml_tensor* mlp_z = ggml_dup_tensor(compute_ctx, lora_m_up); + ggml_scale(compute_ctx, z, 0); + ggml_scale(compute_ctx, mlp_z, 0); + ggml_tensor* zz = ggml_concat(compute_ctx, z, z, 1); + + ggml_tensor* q_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_up, zz, 1), mlp_z, 1); + ggml_tensor* k_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, z, lora_k_up, 1), ggml_concat(compute_ctx, z, mlp_z, 1), 1); + ggml_tensor* v_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, zz, lora_v_up, 1), mlp_z, 1); + ggml_tensor* m_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, zz, z, 1), lora_m_up, 1); + // print_ggml_tensor(q_up, true); //[R, 21504, 1, 1] + // print_ggml_tensor(k_up, true); //[R, 21504, 1, 1] + // print_ggml_tensor(v_up, true); //[R, 21504, 1, 1] + // print_ggml_tensor(m_up, true); //[R, 21504, 1, 1] + + ggml_tensor* lora_up_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, q_up, k_up, 0), ggml_concat(compute_ctx, v_up, m_up, 0), 0); + // print_ggml_tensor(lora_up_concat, true); //[R*4, 21504, 1, 1] + + lora_down = lora_down_concat; + lora_up = lora_up_concat; + + std::string lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; + std::string lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; + + lora_tensors[lora_down_name] = lora_down; + lora_tensors[lora_up_name] = lora_up; + + lora_tensors.erase(split_q_u_name); + lora_tensors.erase(split_k_u_name); + lora_tensors.erase(split_v_u_name); + lora_tensors.erase(split_m_u_name); + + lora_tensors.erase(split_q_d_name); + lora_tensors.erase(split_k_d_name); + lora_tensors.erase(split_v_d_name); + lora_tensors.erase(split_m_d_name); + + applied_lora_tensors.insert(lora_down_name); + applied_lora_tensors.insert(lora_up_name); + + } else { + // std::string lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; + // std::string lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; + // if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { + // // print_ggml_tensor(lora_tensors[lora_down_name], true); // [3072, R, 1, 1] + // // print_ggml_tensor(lora_tensors[lora_up_name], true); // [R, 21504, 1, 1] + // // print_ggml_tensor(it.second, true); // [3072, 21504, 1, 1] + // } + } + } } - } - - std::string lora_down_name = "lora." + k_tensor + ".lora_down.weight"; - std::string alpha_name = "lora." + k_tensor + ".alpha"; - std::string scale_name = "lora." 
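For the fused linear1 weight, the construction above amounts to concatenating the four down matrices along the rank dimension and placing the four up matrices on a block diagonal, so that the product equals the vertical stack of the per-projection deltas up_i . down_i. A standalone toy check of that identity, independent of ggml (the dimensions, random fill, and helper names below are illustrative assumptions, not values from this patch):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

using Mat = std::vector<double>;  // row-major, rows*cols entries

static Mat matmul(const Mat& a, const Mat& b, int n, int k, int m) {
    // (n x k) * (k x m) -> (n x m)
    Mat c(n * m, 0.0);
    for (int i = 0; i < n; i++)
        for (int p = 0; p < k; p++)
            for (int j = 0; j < m; j++)
                c[i * m + j] += a[i * k + p] * b[p * m + j];
    return c;
}

int main() {
    const int R = 4, IN = 8;           // toy rank and input width
    const int OUT[4] = {8, 8, 8, 16};  // q, k, v widths plus a wider mlp
    std::mt19937 rng(0);
    std::uniform_real_distribution<double> u(-1.0, 1.0);
    auto rnd = [&](int n) { Mat m(n); for (double& x : m) x = u(rng); return m; };

    int total_out = 0, total_rank = 4 * R;
    for (int o : OUT) total_out += o;

    std::vector<Mat> up(4), down(4);
    for (int i = 0; i < 4; i++) {
        up[i]   = rnd(OUT[i] * R);  // OUT[i] x R
        down[i] = rnd(R * IN);      // R x IN
    }

    // Reference: stack the per-projection deltas up_i * down_i vertically.
    Mat ref(total_out * IN, 0.0);
    int row0 = 0;
    for (int i = 0; i < 4; i++) {
        Mat d = matmul(up[i], down[i], OUT[i], R, IN);
        std::copy(d.begin(), d.end(), ref.begin() + row0 * IN);
        row0 += OUT[i];
    }

    // Stitched version: downs concatenated along the rank dim, ups block-diagonal.
    Mat down_concat(total_rank * IN, 0.0);
    for (int i = 0; i < 4; i++)
        std::copy(down[i].begin(), down[i].end(), down_concat.begin() + i * R * IN);

    Mat up_concat(total_out * total_rank, 0.0);
    row0 = 0;
    for (int i = 0; i < 4; i++) {
        for (int r = 0; r < OUT[i]; r++)
            for (int c = 0; c < R; c++)
                up_concat[(row0 + r) * total_rank + (i * R + c)] = up[i][r * R + c];
        row0 += OUT[i];
    }

    Mat got = matmul(up_concat, down_concat, total_out, total_rank, IN);
    double max_diff = 0.0;
    for (size_t i = 0; i < got.size(); i++)
        max_diff = std::max(max_diff, std::fabs(got[i] - ref[i]));
    std::printf("max |stitched - stacked| = %g\n", max_diff);  // expect ~0
    return 0;
}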
+ k_tensor + ".scale"; - ggml_tensor* lora_up = NULL; - ggml_tensor* lora_down = NULL; - - if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { - lora_up = lora_tensors[lora_up_name]; - } - - if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { - lora_down = lora_tensors[lora_down_name]; - } - - if (lora_up == NULL || lora_down == NULL) { - continue; - } - - applied_lora_tensors.insert(lora_up_name); - applied_lora_tensors.insert(lora_down_name); - applied_lora_tensors.insert(alpha_name); - applied_lora_tensors.insert(scale_name); - - // calc_cale - int64_t dim = lora_down->ne[ggml_n_dims(lora_down) - 1]; - float scale_value = 1.0f; - if (lora_tensors.find(scale_name) != lora_tensors.end()) { - scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]); - } else if (lora_tensors.find(alpha_name) != lora_tensors.end()) { - float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]); - scale_value = alpha / dim; - } - scale_value *= multiplier; - - // flat lora tensors to multiply it - int64_t lora_up_rows = lora_up->ne[ggml_n_dims(lora_up) - 1]; - lora_up = ggml_reshape_2d(compute_ctx, lora_up, ggml_nelements(lora_up) / lora_up_rows, lora_up_rows); - int64_t lora_down_rows = lora_down->ne[ggml_n_dims(lora_down) - 1]; - lora_down = ggml_reshape_2d(compute_ctx, lora_down, ggml_nelements(lora_down) / lora_down_rows, lora_down_rows); - - // ggml_mul_mat requires tensor b transposed - lora_down = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, lora_down)); - struct ggml_tensor* updown = ggml_mul_mat(compute_ctx, lora_up, lora_down); - updown = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, updown)); - updown = ggml_reshape(compute_ctx, updown, weight); - GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight)); - updown = ggml_scale_inplace(compute_ctx, updown, scale_value); - ggml_tensor* final_weight; - if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) { - // final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, ggml_n_dims(weight), weight->ne); - // final_weight = ggml_cpy(compute_ctx, weight, final_weight); - final_weight = to_f32(compute_ctx, weight); - final_weight = ggml_add_inplace(compute_ctx, final_weight, updown); - final_weight = ggml_cpy(compute_ctx, final_weight, weight); - } else { - final_weight = ggml_add_inplace(compute_ctx, weight, updown); + if (lora_up == NULL || lora_down == NULL) { + std::string lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; + if (lora_tensors.find(lora_up_name) == lora_tensors.end()) { + if (key == "model_diffusion_model_output_blocks_2_2_conv") { + // fix for some sdxl lora, like lcm-lora-xl + key = "model_diffusion_model_output_blocks_2_1_conv"; + lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; + } + } + + std::string lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; + std::string alpha_name = lora_pre[type] + key + ".alpha"; + std::string scale_name = lora_pre[type] + key + ".scale"; + + if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { + lora_up = lora_tensors[lora_up_name]; + } + + if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { + lora_down = lora_tensors[lora_down_name]; + } + applied_lora_tensors.insert(lora_up_name); + applied_lora_tensors.insert(lora_down_name); + applied_lora_tensors.insert(alpha_name); + applied_lora_tensors.insert(scale_name); + + if (lora_up == NULL || lora_down == NULL) { + continue; + } + + // calc_scale + int64_t dim = lora_down->ne[ggml_n_dims(lora_down) - 1]; + float 
scale_value = 1.0f; + if (lora_tensors.find(scale_name) != lora_tensors.end()) { + scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]); + } else if (lora_tensors.find(alpha_name) != lora_tensors.end()) { + float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]); + scale_value = alpha / dim; + } + scale_value *= multiplier; + + // flat lora tensors to multiply it + int64_t lora_up_rows = lora_up->ne[ggml_n_dims(lora_up) - 1]; + lora_up = ggml_reshape_2d(compute_ctx, lora_up, ggml_nelements(lora_up) / lora_up_rows, lora_up_rows); + int64_t lora_down_rows = lora_down->ne[ggml_n_dims(lora_down) - 1]; + lora_down = ggml_reshape_2d(compute_ctx, lora_down, ggml_nelements(lora_down) / lora_down_rows, lora_down_rows); + + // ggml_mul_mat requires tensor b transposed + lora_down = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, lora_down)); + struct ggml_tensor* updown = ggml_mul_mat(compute_ctx, lora_up, lora_down); + updown = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, updown)); + updown = ggml_reshape(compute_ctx, updown, weight); + GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight)); + updown = ggml_scale_inplace(compute_ctx, updown, scale_value); + ggml_tensor* final_weight; + if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) { + // final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, ggml_n_dims(weight), weight->ne); + // final_weight = ggml_cpy(compute_ctx, weight, final_weight); + final_weight = to_f32(compute_ctx, weight); + final_weight = ggml_add_inplace(compute_ctx, final_weight, updown); + final_weight = ggml_cpy(compute_ctx, final_weight, weight); + } else { + final_weight = ggml_add_inplace(compute_ctx, weight, updown); + } + // final_weight = ggml_add_inplace(compute_ctx, weight, updown); // apply directly + ggml_build_forward_expand(gf, final_weight); + } } - // final_weight = ggml_add_inplace(compute_ctx, weight, updown); // apply directly - ggml_build_forward_expand(gf, final_weight); } - size_t total_lora_tensors_count = 0; size_t applied_lora_tensors_count = 0; @@ -173,6 +380,8 @@ struct LoraModel : public GGMLRunner { total_lora_tensors_count++; if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) { LOG_WARN("unused lora tensor %s", kv.first.c_str()); + exit(0); + } else { applied_lora_tensors_count++; } @@ -191,9 +400,9 @@ struct LoraModel : public GGMLRunner { return gf; } - void apply(std::map model_tensors, int n_threads) { + void apply(std::map model_tensors, SDVersion version, int n_threads) { auto get_graph = [&]() -> struct ggml_cgraph* { - return build_lora_graph(model_tensors); + return build_lora_graph(model_tensors, version); }; GGMLRunner::compute(get_graph, n_threads, true); } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 5abc29507..4d5a7d9b6 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -642,7 +642,8 @@ class StableDiffusionGGML { } lora.multiplier = multiplier; - lora.apply(tensors, n_threads); + // TODO: send version? 
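For reference, the graph built above reduces to W' = W + multiplier * (alpha / rank) * (up . down), with the scale falling back to a stored .scale tensor or to 1.0 when no alpha is present, and with quantized weights converted to f32 before the add and copied back afterwards. A minimal sketch of that arithmetic outside ggml (the shapes, row-major layout, and the negative-alpha convention are assumptions for illustration; the dequantize/copy-back path is omitted):

#include <vector>

// Sketch of the merge math in build_lora_graph: up is [out x r], down is
// [r x in], w is [out x in], all row-major floats. alpha <= 0 stands in for
// "no alpha/scale tensor found", matching the 1.0f default above.
void merge_lora(std::vector<float>& w,
                const std::vector<float>& up,
                const std::vector<float>& down,
                int out, int in, int r,
                float alpha, float multiplier) {
    float scale = (alpha > 0.0f ? alpha / r : 1.0f) * multiplier;
    for (int i = 0; i < out; i++)
        for (int j = 0; j < in; j++) {
            float d = 0.0f;
            for (int k = 0; k < r; k++)
                d += up[i * r + k] * down[k * in + j];
            w[i * in + j] += scale * d;  // W' = W + scale * (up . down)
        }
}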
+ lora.apply(tensors, version, n_threads); lora.free_params_buffer(); int64_t t1 = ggml_time_ms(); @@ -1206,7 +1207,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, if (sd_ctx->sd->stacked_id) { if (!sd_ctx->sd->pmid_lora->applied) { t0 = ggml_time_ms(); - sd_ctx->sd->pmid_lora->apply(sd_ctx->sd->tensors, sd_ctx->sd->n_threads); + sd_ctx->sd->pmid_lora->apply(sd_ctx->sd->tensors, sd_ctx->sd->version, sd_ctx->sd->n_threads); t1 = ggml_time_ms(); sd_ctx->sd->pmid_lora->applied = true; LOG_INFO("pmid_lora apply completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); From 0b600d7174bec87e2fc28d146ec4399617fa43d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 11 Dec 2024 00:23:41 +0100 Subject: [PATCH 02/12] Flux Lora: single_block --- lora.hpp | 71 +++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 16 deletions(-) diff --git a/lora.hpp b/lora.hpp index 720b41c74..1743e860a 100644 --- a/lora.hpp +++ b/lora.hpp @@ -87,6 +87,9 @@ struct LoraModel : public GGMLRunner { break; } } + // if (name.find(".transformer_blocks.0") != std::string::npos) { + // LOG_INFO("%s", name.c_str()); + // } if (dry_run) { struct ggml_tensor* real = ggml_new_tensor(params_ctx, @@ -104,7 +107,7 @@ struct LoraModel : public GGMLRunner { model_loader.load_tensors(on_new_tensor_cb, backend); alloc_params_buffer(); - + // exit(0); dry_run = false; model_loader.load_tensors(on_new_tensor_cb, backend); @@ -171,32 +174,34 @@ struct LoraModel : public GGMLRunner { ggml_tensor* lora_down = NULL; // LOG_DEBUG("k_tensor %s", k_tensor.c_str()); if (sd_version_is_flux(version)) { - size_t l1 = key.find("linear1"); + size_t l1 = key.find("linear1"); + size_t l2 = key.find("linear2"); + size_t mod = key.find("modulation.lin"); if (l1 != std::string::npos) { - l1 -= 1; - auto split_q_u_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_q" + lora_ups[type] + ".weight"; - if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) { + l1--; + auto split_q_d_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_q" + lora_downs[type] + ".weight"; + if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) { // print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1] // find qkv and mlp up parts in LoRA model - auto split_k_u_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_k" + lora_ups[type] + ".weight"; - auto split_v_u_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_v" + lora_ups[type] + ".weight"; - - auto split_q_d_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_q" + lora_downs[type] + ".weight"; auto split_k_d_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_k" + lora_downs[type] + ".weight"; auto split_v_d_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_v" + lora_downs[type] + ".weight"; - auto split_m_u_name = lora_pre[type] + key.substr(0, l1) + ".proj_mlp" + lora_ups[type] + ".weight"; + auto split_q_u_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_q" + lora_ups[type] + ".weight"; + auto split_k_u_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_k" + lora_ups[type] + ".weight"; + auto split_v_u_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_v" + lora_ups[type] + ".weight"; + auto split_m_d_name = lora_pre[type] + key.substr(0, l1) + ".proj_mlp" + lora_downs[type] + ".weight"; + auto split_m_u_name = lora_pre[type] + key.substr(0, l1) + ".proj_mlp" + lora_ups[type] + ".weight"; - ggml_tensor* lora_q_up = NULL; ggml_tensor* lora_q_down = NULL; - ggml_tensor* lora_k_up = NULL; + ggml_tensor* lora_q_up = 
NULL; ggml_tensor* lora_k_down = NULL; - ggml_tensor* lora_v_up = NULL; + ggml_tensor* lora_k_up = NULL; ggml_tensor* lora_v_down = NULL; + ggml_tensor* lora_v_up = NULL; - ggml_tensor* lora_m_up = NULL; ggml_tensor* lora_m_down = NULL; + ggml_tensor* lora_m_up = NULL; lora_q_up = lora_tensors[split_q_u_name]; @@ -301,6 +306,38 @@ struct LoraModel : public GGMLRunner { // // print_ggml_tensor(it.second, true); // [3072, 21504, 1, 1] // } } + } else if (l2 != std::string::npos) { + l2--; + std::string lora_down_name = lora_pre[type] + key.substr(0, l2) + ".proj_out" + lora_downs[type] + ".weight"; + if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { + std::string lora_up_name = lora_pre[type] + key.substr(0, l2) + ".proj_out" + lora_ups[type] + ".weight"; + if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { + lora_up = lora_tensors[lora_up_name]; + } + + if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { + lora_down = lora_tensors[lora_down_name]; + } + + applied_lora_tensors.insert(lora_up_name); + applied_lora_tensors.insert(lora_down_name); + } + } else if (mod != std::string::npos) { + mod--; + std::string lora_down_name = lora_pre[type] + key.substr(0, mod) + ".norm.linear" + lora_downs[type] + ".weight"; + if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { + std::string lora_up_name = lora_pre[type] + key.substr(0, mod) + ".norm.linear" + lora_ups[type] + ".weight"; + if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { + lora_up = lora_tensors[lora_up_name]; + } + + if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { + lora_down = lora_tensors[lora_down_name]; + } + + applied_lora_tensors.insert(lora_up_name); + applied_lora_tensors.insert(lora_down_name); + } } } @@ -380,8 +417,10 @@ struct LoraModel : public GGMLRunner { total_lora_tensors_count++; if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) { LOG_WARN("unused lora tensor %s", kv.first.c_str()); - exit(0); - + print_ggml_tensor(kv.second, true); + if (kv.first.find("B") != std::string::npos) { + exit(0); + } } else { applied_lora_tensors_count++; } From f8db4fa4a9f2d521fbb83558305015927bb6997a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 11 Dec 2024 01:39:59 +0100 Subject: [PATCH 03/12] Flux lora loading (crashes) --- lora.hpp | 341 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 273 insertions(+), 68 deletions(-) diff --git a/lora.hpp b/lora.hpp index 1743e860a..6c77d93ec 100644 --- a/lora.hpp +++ b/lora.hpp @@ -172,26 +172,44 @@ struct LoraModel : public GGMLRunner { for (auto& key : keys) { ggml_tensor* lora_up = NULL; ggml_tensor* lora_down = NULL; + + std::string alpha_name = ""; + std::string scale_name = ""; // LOG_DEBUG("k_tensor %s", k_tensor.c_str()); if (sd_version_is_flux(version)) { - size_t l1 = key.find("linear1"); - size_t l2 = key.find("linear2"); - size_t mod = key.find("modulation.lin"); - if (l1 != std::string::npos) { - l1--; - auto split_q_d_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_q" + lora_downs[type] + ".weight"; + size_t linear1 = key.find("linear1"); + size_t linear2 = key.find("linear2"); + size_t modulation = key.find("modulation.lin"); + + size_t txt_attn_qkv = key.find("txt_attn.qkv"); + size_t img_attn_qkv = key.find("img_attn.qkv"); + + size_t txt_attn_proj = key.find("txt_attn.proj"); + size_t img_attn_proj = key.find("img_attn.proj"); + + size_t txt_mlp_0 = key.find("txt_mlp.0"); + size_t txt_mlp_2 = key.find("txt_mlp.2"); + size_t 
img_mlp_0 = key.find("img_mlp.0"); + size_t img_mlp_2 = key.find("img_mlp.2"); + + size_t txt_mod_lin = key.find("txt_mod.lin"); + size_t img_mod_lin = key.find("img_mod.lin"); + + if (linear1 != std::string::npos) { + linear1--; + auto split_q_d_name = lora_pre[type] + key.substr(0, linear1) + ".attn.to_q" + lora_downs[type] + ".weight"; if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) { // print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1] // find qkv and mlp up parts in LoRA model - auto split_k_d_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_k" + lora_downs[type] + ".weight"; - auto split_v_d_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_v" + lora_downs[type] + ".weight"; + auto split_k_d_name = lora_pre[type] + key.substr(0, linear1) + ".attn.to_k" + lora_downs[type] + ".weight"; + auto split_v_d_name = lora_pre[type] + key.substr(0, linear1) + ".attn.to_v" + lora_downs[type] + ".weight"; - auto split_q_u_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_q" + lora_ups[type] + ".weight"; - auto split_k_u_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_k" + lora_ups[type] + ".weight"; - auto split_v_u_name = lora_pre[type] + key.substr(0, l1) + ".attn.to_v" + lora_ups[type] + ".weight"; + auto split_q_u_name = lora_pre[type] + key.substr(0, linear1) + ".attn.to_q" + lora_ups[type] + ".weight"; + auto split_k_u_name = lora_pre[type] + key.substr(0, linear1) + ".attn.to_k" + lora_ups[type] + ".weight"; + auto split_v_u_name = lora_pre[type] + key.substr(0, linear1) + ".attn.to_v" + lora_ups[type] + ".weight"; - auto split_m_d_name = lora_pre[type] + key.substr(0, l1) + ".proj_mlp" + lora_downs[type] + ".weight"; - auto split_m_u_name = lora_pre[type] + key.substr(0, l1) + ".proj_mlp" + lora_ups[type] + ".weight"; + auto split_m_d_name = lora_pre[type] + key.substr(0, linear1) + ".proj_mlp" + lora_downs[type] + ".weight"; + auto split_m_u_name = lora_pre[type] + key.substr(0, linear1) + ".proj_mlp" + lora_ups[type] + ".weight"; ggml_tensor* lora_q_down = NULL; ggml_tensor* lora_q_up = NULL; @@ -209,30 +227,34 @@ struct LoraModel : public GGMLRunner { lora_q_down = lora_tensors[split_q_d_name]; } - if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) { - lora_k_up = lora_tensors[split_k_u_name]; + if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) { + lora_q_up = lora_tensors[split_q_u_name]; } if (lora_tensors.find(split_k_d_name) != lora_tensors.end()) { lora_k_down = lora_tensors[split_k_d_name]; } - if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) { - lora_v_up = lora_tensors[split_v_u_name]; + if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) { + lora_k_up = lora_tensors[split_k_u_name]; } if (lora_tensors.find(split_v_d_name) != lora_tensors.end()) { lora_v_down = lora_tensors[split_v_d_name]; } - if (lora_tensors.find(split_m_u_name) != lora_tensors.end()) { - lora_m_up = lora_tensors[split_m_u_name]; + if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) { + lora_v_up = lora_tensors[split_v_u_name]; } if (lora_tensors.find(split_m_d_name) != lora_tensors.end()) { lora_m_down = lora_tensors[split_m_d_name]; } + if (lora_tensors.find(split_m_u_name) != lora_tensors.end()) { + lora_m_up = lora_tensors[split_m_u_name]; + } + // print_ggml_tensor(lora_q_down, true); //[3072, R, 1, 1] // print_ggml_tensor(lora_k_down, true); //[3072, R, 1, 1] // print_ggml_tensor(lora_v_down, true); //[3072, R, 1, 1] @@ -275,8 +297,8 @@ struct LoraModel : public GGMLRunner { ggml_tensor* lora_up_concat = 
ggml_concat(compute_ctx, ggml_concat(compute_ctx, q_up, k_up, 0), ggml_concat(compute_ctx, v_up, m_up, 0), 0); // print_ggml_tensor(lora_up_concat, true); //[R*4, 21504, 1, 1] - lora_down = lora_down_concat; - lora_up = lora_up_concat; + lora_down = ggml_cont(compute_ctx, lora_down_concat); + lora_up = ggml_cont(compute_ctx, lora_up_concat); std::string lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; std::string lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; @@ -306,11 +328,155 @@ struct LoraModel : public GGMLRunner { // // print_ggml_tensor(it.second, true); // [3072, 21504, 1, 1] // } } - } else if (l2 != std::string::npos) { - l2--; - std::string lora_down_name = lora_pre[type] + key.substr(0, l2) + ".proj_out" + lora_downs[type] + ".weight"; + } else if (linear2 != std::string::npos) { + linear2--; + std::string lora_down_name = lora_pre[type] + key.substr(0, linear2) + ".proj_out" + lora_downs[type] + ".weight"; + if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { + std::string lora_up_name = lora_pre[type] + key.substr(0, linear2) + ".proj_out" + lora_ups[type] + ".weight"; + if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { + lora_up = lora_tensors[lora_up_name]; + } + + if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { + lora_down = lora_tensors[lora_down_name]; + } + + applied_lora_tensors.insert(lora_up_name); + applied_lora_tensors.insert(lora_down_name); + } + } else if (modulation != std::string::npos) { + modulation--; + std::string lora_down_name = lora_pre[type] + key.substr(0, modulation) + ".norm.linear" + lora_downs[type] + ".weight"; + if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { + std::string lora_up_name = lora_pre[type] + key.substr(0, modulation) + ".norm.linear" + lora_ups[type] + ".weight"; + if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { + lora_up = lora_tensors[lora_up_name]; + } + + if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { + lora_down = lora_tensors[lora_down_name]; + } + + applied_lora_tensors.insert(lora_up_name); + applied_lora_tensors.insert(lora_down_name); + } + } + // Double blocks + else if (txt_attn_qkv != std::string::npos || img_attn_qkv != std::string::npos) { + size_t match = txt_attn_qkv; + std::string prefix = ".attn.add_"; + std::string suffix = "_proj"; + if (img_attn_qkv != std::string::npos) { + match = img_attn_qkv; + prefix = ".attn.to_"; + suffix = ""; + } + match--; + + auto split_q_d_name = lora_pre[type] + key.substr(0, match) + prefix + "q" + suffix + lora_downs[type] + ".weight"; + if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) { + // print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1] + // find qkv and mlp up parts in LoRA model + auto split_k_d_name = lora_pre[type] + key.substr(0, match) + prefix + "k" + suffix + lora_downs[type] + ".weight"; + auto split_v_d_name = lora_pre[type] + key.substr(0, match) + prefix + "v" + suffix + lora_downs[type] + ".weight"; + + auto split_q_u_name = lora_pre[type] + key.substr(0, match) + prefix + "q" + suffix + lora_ups[type] + ".weight"; + auto split_k_u_name = lora_pre[type] + key.substr(0, match) + prefix + "k" + suffix + lora_ups[type] + ".weight"; + auto split_v_u_name = lora_pre[type] + key.substr(0, match) + prefix + "v" + suffix + lora_ups[type] + ".weight"; + + ggml_tensor* lora_q_down = NULL; + ggml_tensor* lora_q_up = NULL; + ggml_tensor* lora_k_down = NULL; + ggml_tensor* lora_k_up = NULL; + ggml_tensor* lora_v_down = NULL; + 
ggml_tensor* lora_v_up = NULL; + + if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) { + lora_q_down = lora_tensors[split_q_d_name]; + } + + if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) { + lora_q_up = lora_tensors[split_q_u_name]; + } + + if (lora_tensors.find(split_k_d_name) != lora_tensors.end()) { + lora_k_down = lora_tensors[split_k_d_name]; + } + + if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) { + lora_k_up = lora_tensors[split_k_u_name]; + } + + if (lora_tensors.find(split_v_d_name) != lora_tensors.end()) { + lora_v_down = lora_tensors[split_v_d_name]; + } + + if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) { + lora_v_up = lora_tensors[split_v_u_name]; + } + + // print_ggml_tensor(lora_q_down, true); //[3072, R, 1, 1] + // print_ggml_tensor(lora_k_down, true); //[3072, R, 1, 1] + // print_ggml_tensor(lora_v_down, true); //[3072, R, 1, 1] + // print_ggml_tensor(lora_q_up, true); //[R, 3072, 1, 1] + // print_ggml_tensor(lora_k_up, true); //[R, 3072, 1, 1] + // print_ggml_tensor(lora_v_up, true); //[R, 3072, 1, 1] + + // these need to be stitched together this way: + // |q_up,0 ,0 | + // |0 ,k_up,0 | + // |0 ,0 ,v_up| + // (q_down,k_down,v_down) . (q ,k ,v) + + // up_concat will be [9216, R*3, 1, 1] + // down_concat will be [R*3, 3072, 1, 1] + ggml_tensor* lora_down_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_down, lora_k_down, 1), lora_v_down, 1); + + ggml_tensor* z = ggml_dup_tensor(compute_ctx, lora_q_up); + ggml_scale(compute_ctx, z, 0); + ggml_tensor* zz = ggml_concat(compute_ctx, z, z, 1); + + ggml_tensor* q_up = ggml_concat(compute_ctx, lora_q_up, zz, 1); + ggml_tensor* k_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, z, lora_k_up, 1), z, 1); + ggml_tensor* v_up = ggml_concat(compute_ctx, zz, lora_v_up, 1); + // print_ggml_tensor(q_up, true); //[R, 9216, 1, 1] + // print_ggml_tensor(k_up, true); //[R, 9216, 1, 1] + // print_ggml_tensor(v_up, true); //[R, 9216, 1, 1] + ggml_tensor* lora_up_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, q_up, k_up, 0), v_up, 0); + // print_ggml_tensor(lora_up_concat, true); //[R*3, 9216, 1, 1] + + lora_down = ggml_cont(compute_ctx, lora_down_concat); + lora_up = ggml_cont(compute_ctx, lora_up_concat); + + std::string lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; + std::string lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; + + lora_tensors[lora_down_name] = lora_down; + lora_tensors[lora_up_name] = lora_up; + + lora_tensors.erase(split_q_u_name); + lora_tensors.erase(split_k_u_name); + lora_tensors.erase(split_v_u_name); + + lora_tensors.erase(split_q_d_name); + lora_tensors.erase(split_k_d_name); + lora_tensors.erase(split_v_d_name); + + applied_lora_tensors.insert(lora_down_name); + applied_lora_tensors.insert(lora_up_name); + } + } else if (txt_attn_proj != std::string::npos || img_attn_proj != std::string::npos) { + size_t match = txt_attn_proj; + std::string new_name = ".attn.to_add_out"; + if (img_attn_proj != std::string::npos) { + match = img_attn_proj; + new_name = ".attn.to_out.0"; + } + match--; + + std::string lora_down_name = lora_pre[type] + key.substr(0, match) + new_name + lora_downs[type] + ".weight"; if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { - std::string lora_up_name = lora_pre[type] + key.substr(0, l2) + ".proj_out" + lora_ups[type] + ".weight"; + std::string lora_up_name = lora_pre[type] + key.substr(0, match) + new_name + lora_ups[type] + ".weight"; if 
(lora_tensors.find(lora_up_name) != lora_tensors.end()) { lora_up = lora_tensors[lora_up_name]; } @@ -322,11 +488,51 @@ struct LoraModel : public GGMLRunner { applied_lora_tensors.insert(lora_up_name); applied_lora_tensors.insert(lora_down_name); } - } else if (mod != std::string::npos) { - mod--; - std::string lora_down_name = lora_pre[type] + key.substr(0, mod) + ".norm.linear" + lora_downs[type] + ".weight"; + } else if (txt_mlp_0 != std::string::npos || txt_mlp_2 != std::string::npos || img_mlp_0 != std::string::npos || img_mlp_2 != std::string::npos) { + bool has_two = txt_mlp_2 != std::string::npos || img_mlp_2 != std::string::npos; + std::string prefix = ".ff_context.net."; + std::string suffix = "0.proj"; + if (img_mlp_0 != std::string::npos || img_mlp_2 != std::string::npos) { + prefix = ".ff.net."; + } + if (has_two) { + suffix = "2"; + } + size_t match = txt_mlp_0; + if (txt_mlp_2 != std::string::npos) { + match = txt_mlp_2; + } else if (img_mlp_0 != std::string::npos) { + match = img_mlp_0; + } else if (img_mlp_2 != std::string::npos) { + match = img_mlp_2; + } + match--; + std::string lora_down_name = lora_pre[type] + key.substr(0, match) + prefix + suffix + lora_downs[type] + ".weight"; if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { - std::string lora_up_name = lora_pre[type] + key.substr(0, mod) + ".norm.linear" + lora_ups[type] + ".weight"; + std::string lora_up_name = lora_pre[type] + key.substr(0, match) + prefix + suffix + lora_ups[type] + ".weight"; + if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { + lora_up = lora_tensors[lora_up_name]; + } + + if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { + lora_down = lora_tensors[lora_down_name]; + } + + applied_lora_tensors.insert(lora_up_name); + applied_lora_tensors.insert(lora_down_name); + } + } else if (txt_mod_lin != std::string::npos || img_mod_lin != std::string::npos) { + size_t match = txt_mod_lin; + std::string new_name = ".norm1_context.linear"; + if (img_mod_lin != std::string::npos) { + match = img_mod_lin; + new_name = ".norm1.linear"; + } + match--; + + std::string lora_down_name = lora_pre[type] + key.substr(0, match) + new_name + lora_downs[type] + ".weight"; + if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { + std::string lora_up_name = lora_pre[type] + key.substr(0, match) + new_name + lora_ups[type] + ".weight"; if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { lora_up = lora_tensors[lora_up_name]; } @@ -352,8 +558,8 @@ struct LoraModel : public GGMLRunner { } std::string lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; - std::string alpha_name = lora_pre[type] + key + ".alpha"; - std::string scale_name = lora_pre[type] + key + ".scale"; + alpha_name = lora_pre[type] + key + ".alpha"; + scale_name = lora_pre[type] + key + ".scale"; if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { lora_up = lora_tensors[lora_up_name]; @@ -370,44 +576,43 @@ struct LoraModel : public GGMLRunner { if (lora_up == NULL || lora_down == NULL) { continue; } - - // calc_scale - int64_t dim = lora_down->ne[ggml_n_dims(lora_down) - 1]; - float scale_value = 1.0f; - if (lora_tensors.find(scale_name) != lora_tensors.end()) { - scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]); - } else if (lora_tensors.find(alpha_name) != lora_tensors.end()) { - float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]); - scale_value = alpha / dim; - } - scale_value *= multiplier; - - // flat lora tensors to multiply it - 
int64_t lora_up_rows = lora_up->ne[ggml_n_dims(lora_up) - 1]; - lora_up = ggml_reshape_2d(compute_ctx, lora_up, ggml_nelements(lora_up) / lora_up_rows, lora_up_rows); - int64_t lora_down_rows = lora_down->ne[ggml_n_dims(lora_down) - 1]; - lora_down = ggml_reshape_2d(compute_ctx, lora_down, ggml_nelements(lora_down) / lora_down_rows, lora_down_rows); - - // ggml_mul_mat requires tensor b transposed - lora_down = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, lora_down)); - struct ggml_tensor* updown = ggml_mul_mat(compute_ctx, lora_up, lora_down); - updown = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, updown)); - updown = ggml_reshape(compute_ctx, updown, weight); - GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight)); - updown = ggml_scale_inplace(compute_ctx, updown, scale_value); - ggml_tensor* final_weight; - if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) { - // final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, ggml_n_dims(weight), weight->ne); - // final_weight = ggml_cpy(compute_ctx, weight, final_weight); - final_weight = to_f32(compute_ctx, weight); - final_weight = ggml_add_inplace(compute_ctx, final_weight, updown); - final_weight = ggml_cpy(compute_ctx, final_weight, weight); - } else { - final_weight = ggml_add_inplace(compute_ctx, weight, updown); - } - // final_weight = ggml_add_inplace(compute_ctx, weight, updown); // apply directly - ggml_build_forward_expand(gf, final_weight); } + // calc_scale + int64_t dim = lora_down->ne[ggml_n_dims(lora_down) - 1]; + float scale_value = 1.0f; + if (lora_tensors.find(scale_name) != lora_tensors.end()) { + scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]); + } else if (lora_tensors.find(alpha_name) != lora_tensors.end()) { + float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]); + scale_value = alpha / dim; + } + scale_value *= multiplier; + + // flat lora tensors to multiply it + int64_t lora_up_rows = lora_up->ne[ggml_n_dims(lora_up) - 1]; + lora_up = ggml_reshape_2d(compute_ctx, lora_up, ggml_nelements(lora_up) / lora_up_rows, lora_up_rows); + int64_t lora_down_rows = lora_down->ne[ggml_n_dims(lora_down) - 1]; + lora_down = ggml_reshape_2d(compute_ctx, lora_down, ggml_nelements(lora_down) / lora_down_rows, lora_down_rows); + + // ggml_mul_mat requires tensor b transposed + lora_down = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, lora_down)); + struct ggml_tensor* updown = ggml_mul_mat(compute_ctx, lora_up, lora_down); + updown = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, updown)); + updown = ggml_reshape(compute_ctx, updown, weight); + GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight)); + updown = ggml_scale_inplace(compute_ctx, updown, scale_value); + ggml_tensor* final_weight; + if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) { + // final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, ggml_n_dims(weight), weight->ne); + // final_weight = ggml_cpy(compute_ctx, weight, final_weight); + final_weight = to_f32(compute_ctx, weight); + final_weight = ggml_add_inplace(compute_ctx, final_weight, updown); + final_weight = ggml_cpy(compute_ctx, final_weight, weight); + } else { + final_weight = ggml_add_inplace(compute_ctx, weight, updown); + } + // final_weight = ggml_add_inplace(compute_ctx, weight, updown); // apply directly + ggml_build_forward_expand(gf, final_weight); } } size_t total_lora_tensors_count = 0; From d22f18348e0bf150896a3a2e07b912e54b3b76b7 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 11 Dec 2024 02:02:30 +0100 Subject: [PATCH 04/12] Flux Lora working! --- lora.hpp | 132 ++++++++++++++++++++++++++----------------------------- 1 file changed, 62 insertions(+), 70 deletions(-) diff --git a/lora.hpp b/lora.hpp index 6c77d93ec..202b8ab3a 100644 --- a/lora.hpp +++ b/lora.hpp @@ -173,8 +173,10 @@ struct LoraModel : public GGMLRunner { ggml_tensor* lora_up = NULL; ggml_tensor* lora_down = NULL; - std::string alpha_name = ""; - std::string scale_name = ""; + std::string alpha_name = ""; + std::string scale_name = ""; + std::string lora_down_name = ""; + std::string lora_up_name = ""; // LOG_DEBUG("k_tensor %s", k_tensor.c_str()); if (sd_version_is_flux(version)) { size_t linear1 = key.find("linear1"); @@ -221,38 +223,38 @@ struct LoraModel : public GGMLRunner { ggml_tensor* lora_m_down = NULL; ggml_tensor* lora_m_up = NULL; - lora_q_up = lora_tensors[split_q_u_name]; + lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]); if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) { - lora_q_down = lora_tensors[split_q_d_name]; + lora_q_down = to_f32(compute_ctx, lora_tensors[split_q_d_name]); } if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) { - lora_q_up = lora_tensors[split_q_u_name]; + lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]); } if (lora_tensors.find(split_k_d_name) != lora_tensors.end()) { - lora_k_down = lora_tensors[split_k_d_name]; + lora_k_down = to_f32(compute_ctx, lora_tensors[split_k_d_name]); } if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) { - lora_k_up = lora_tensors[split_k_u_name]; + lora_k_up = to_f32(compute_ctx, lora_tensors[split_k_u_name]); } if (lora_tensors.find(split_v_d_name) != lora_tensors.end()) { - lora_v_down = lora_tensors[split_v_d_name]; + lora_v_down = to_f32(compute_ctx, lora_tensors[split_v_d_name]); } if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) { - lora_v_up = lora_tensors[split_v_u_name]; + lora_v_up = to_f32(compute_ctx, lora_tensors[split_v_u_name]); } if (lora_tensors.find(split_m_d_name) != lora_tensors.end()) { - lora_m_down = lora_tensors[split_m_d_name]; + lora_m_down = to_f32(compute_ctx, lora_tensors[split_m_d_name]); } if (lora_tensors.find(split_m_u_name) != lora_tensors.end()) { - lora_m_up = lora_tensors[split_m_u_name]; + lora_m_up = to_f32(compute_ctx, lora_tensors[split_m_u_name]); } // print_ggml_tensor(lora_q_down, true); //[3072, R, 1, 1] @@ -300,28 +302,26 @@ struct LoraModel : public GGMLRunner { lora_down = ggml_cont(compute_ctx, lora_down_concat); lora_up = ggml_cont(compute_ctx, lora_up_concat); - std::string lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; - std::string lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; + lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; + lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; lora_tensors[lora_down_name] = lora_down; lora_tensors[lora_up_name] = lora_up; - lora_tensors.erase(split_q_u_name); - lora_tensors.erase(split_k_u_name); - lora_tensors.erase(split_v_u_name); - lora_tensors.erase(split_m_u_name); + // Would be nice to be able to clean up lora_tensors, but it breaks because this is called twice :/ + // lora_tensors.erase(split_q_u_name); + // lora_tensors.erase(split_k_u_name); + // lora_tensors.erase(split_v_u_name); + // lora_tensors.erase(split_m_u_name); - lora_tensors.erase(split_q_d_name); - lora_tensors.erase(split_k_d_name); - lora_tensors.erase(split_v_d_name); 
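Taken together, the Flux branches in build_lora_graph remap the checkpoint-style names used by stable-diffusion.cpp to the diffusers-style keys these LoRAs are trained against, with the fused qkv (and qkv+mlp) weights reassembled from their split projections. The table below is only an illustrative summary of the remappings implemented above ("N" stands for a block index); the exact strings, including the per-convention up/down suffixes, come from the code itself:

#include <utility>

// Illustrative summary of the Flux -> diffusers key remapping handled above.
// Up/down suffixes (".lora_up"/".lora_down", ".lora_B"/".lora_A", ...) are
// appended according to the detected lora_t convention.
static const std::pair<const char*, const char*> kFluxToDiffusers[] = {
    {"model.diffusion_model.",          "transformer."},
    {"single_blocks.N.linear1",         "single_transformer_blocks.N.attn.to_q/to_k/to_v + proj_mlp (fused)"},
    {"single_blocks.N.linear2",         "single_transformer_blocks.N.proj_out"},
    {"single_blocks.N.modulation.lin",  "single_transformer_blocks.N.norm.linear"},
    {"double_blocks.N.img_attn.qkv",    "transformer_blocks.N.attn.to_q/to_k/to_v (fused)"},
    {"double_blocks.N.txt_attn.qkv",    "transformer_blocks.N.attn.add_q_proj/add_k_proj/add_v_proj (fused)"},
    {"double_blocks.N.img_attn.proj",   "transformer_blocks.N.attn.to_out.0"},
    {"double_blocks.N.txt_attn.proj",   "transformer_blocks.N.attn.to_add_out"},
    {"double_blocks.N.img_mlp.0",       "transformer_blocks.N.ff.net.0.proj"},
    {"double_blocks.N.img_mlp.2",       "transformer_blocks.N.ff.net.2"},
    {"double_blocks.N.txt_mlp.0",       "transformer_blocks.N.ff_context.net.0.proj"},
    {"double_blocks.N.txt_mlp.2",       "transformer_blocks.N.ff_context.net.2"},
    {"double_blocks.N.img_mod.lin",     "transformer_blocks.N.norm1.linear"},
    {"double_blocks.N.txt_mod.lin",     "transformer_blocks.N.norm1_context.linear"},
};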
- lora_tensors.erase(split_m_d_name); - - applied_lora_tensors.insert(lora_down_name); - applied_lora_tensors.insert(lora_up_name); + // lora_tensors.erase(split_q_d_name); + // lora_tensors.erase(split_k_d_name); + // lora_tensors.erase(split_v_d_name); + // lora_tensors.erase(split_m_d_name); } else { - // std::string lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; - // std::string lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; + // lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; + // lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; // if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { // // print_ggml_tensor(lora_tensors[lora_down_name], true); // [3072, R, 1, 1] // // print_ggml_tensor(lora_tensors[lora_up_name], true); // [R, 21504, 1, 1] @@ -330,9 +330,9 @@ struct LoraModel : public GGMLRunner { } } else if (linear2 != std::string::npos) { linear2--; - std::string lora_down_name = lora_pre[type] + key.substr(0, linear2) + ".proj_out" + lora_downs[type] + ".weight"; + lora_down_name = lora_pre[type] + key.substr(0, linear2) + ".proj_out" + lora_downs[type] + ".weight"; if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { - std::string lora_up_name = lora_pre[type] + key.substr(0, linear2) + ".proj_out" + lora_ups[type] + ".weight"; + lora_up_name = lora_pre[type] + key.substr(0, linear2) + ".proj_out" + lora_ups[type] + ".weight"; if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { lora_up = lora_tensors[lora_up_name]; } @@ -346,9 +346,9 @@ struct LoraModel : public GGMLRunner { } } else if (modulation != std::string::npos) { modulation--; - std::string lora_down_name = lora_pre[type] + key.substr(0, modulation) + ".norm.linear" + lora_downs[type] + ".weight"; + lora_down_name = lora_pre[type] + key.substr(0, modulation) + ".norm.linear" + lora_downs[type] + ".weight"; if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { - std::string lora_up_name = lora_pre[type] + key.substr(0, modulation) + ".norm.linear" + lora_ups[type] + ".weight"; + lora_up_name = lora_pre[type] + key.substr(0, modulation) + ".norm.linear" + lora_ups[type] + ".weight"; if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { lora_up = lora_tensors[lora_up_name]; } @@ -391,28 +391,26 @@ struct LoraModel : public GGMLRunner { ggml_tensor* lora_v_down = NULL; ggml_tensor* lora_v_up = NULL; - if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) { - lora_q_down = lora_tensors[split_q_d_name]; - } + lora_q_down = to_f32(compute_ctx, lora_tensors[split_q_d_name]); if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) { - lora_q_up = lora_tensors[split_q_u_name]; + lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]); } if (lora_tensors.find(split_k_d_name) != lora_tensors.end()) { - lora_k_down = lora_tensors[split_k_d_name]; + lora_k_down = to_f32(compute_ctx, lora_tensors[split_k_d_name]); } if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) { - lora_k_up = lora_tensors[split_k_u_name]; + lora_k_up = to_f32(compute_ctx, lora_tensors[split_k_u_name]); } if (lora_tensors.find(split_v_d_name) != lora_tensors.end()) { - lora_v_down = lora_tensors[split_v_d_name]; + lora_v_down = to_f32(compute_ctx, lora_tensors[split_v_d_name]); } if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) { - lora_v_up = lora_tensors[split_v_u_name]; + lora_v_up = to_f32(compute_ctx, lora_tensors[split_v_u_name]); } // print_ggml_tensor(lora_q_down, true); //[3072, R, 1, 1] @@ 
-448,22 +446,20 @@ struct LoraModel : public GGMLRunner { lora_down = ggml_cont(compute_ctx, lora_down_concat); lora_up = ggml_cont(compute_ctx, lora_up_concat); - std::string lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; - std::string lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; + lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; + lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; lora_tensors[lora_down_name] = lora_down; lora_tensors[lora_up_name] = lora_up; - lora_tensors.erase(split_q_u_name); - lora_tensors.erase(split_k_u_name); - lora_tensors.erase(split_v_u_name); - - lora_tensors.erase(split_q_d_name); - lora_tensors.erase(split_k_d_name); - lora_tensors.erase(split_v_d_name); + // Would be nice to be able to clean up lora_tensors, but it breaks because this is called twice :/ + // lora_tensors.erase(split_q_u_name); + // lora_tensors.erase(split_k_u_name); + // lora_tensors.erase(split_v_u_name); - applied_lora_tensors.insert(lora_down_name); - applied_lora_tensors.insert(lora_up_name); + // lora_tensors.erase(split_q_d_name); + // lora_tensors.erase(split_k_d_name); + // lora_tensors.erase(split_v_d_name); } } else if (txt_attn_proj != std::string::npos || img_attn_proj != std::string::npos) { size_t match = txt_attn_proj; @@ -474,9 +470,9 @@ struct LoraModel : public GGMLRunner { } match--; - std::string lora_down_name = lora_pre[type] + key.substr(0, match) + new_name + lora_downs[type] + ".weight"; + lora_down_name = lora_pre[type] + key.substr(0, match) + new_name + lora_downs[type] + ".weight"; if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { - std::string lora_up_name = lora_pre[type] + key.substr(0, match) + new_name + lora_ups[type] + ".weight"; + lora_up_name = lora_pre[type] + key.substr(0, match) + new_name + lora_ups[type] + ".weight"; if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { lora_up = lora_tensors[lora_up_name]; } @@ -507,9 +503,9 @@ struct LoraModel : public GGMLRunner { match = img_mlp_2; } match--; - std::string lora_down_name = lora_pre[type] + key.substr(0, match) + prefix + suffix + lora_downs[type] + ".weight"; + lora_down_name = lora_pre[type] + key.substr(0, match) + prefix + suffix + lora_downs[type] + ".weight"; if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { - std::string lora_up_name = lora_pre[type] + key.substr(0, match) + prefix + suffix + lora_ups[type] + ".weight"; + lora_up_name = lora_pre[type] + key.substr(0, match) + prefix + suffix + lora_ups[type] + ".weight"; if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { lora_up = lora_tensors[lora_up_name]; } @@ -530,9 +526,9 @@ struct LoraModel : public GGMLRunner { } match--; - std::string lora_down_name = lora_pre[type] + key.substr(0, match) + new_name + lora_downs[type] + ".weight"; + lora_down_name = lora_pre[type] + key.substr(0, match) + new_name + lora_downs[type] + ".weight"; if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { - std::string lora_up_name = lora_pre[type] + key.substr(0, match) + new_name + lora_ups[type] + ".weight"; + lora_up_name = lora_pre[type] + key.substr(0, match) + new_name + lora_ups[type] + ".weight"; if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { lora_up = lora_tensors[lora_up_name]; } @@ -548,7 +544,7 @@ struct LoraModel : public GGMLRunner { } if (lora_up == NULL || lora_down == NULL) { - std::string lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; + lora_up_name = lora_pre[type] + key 
+ lora_ups[type] + ".weight"; if (lora_tensors.find(lora_up_name) == lora_tensors.end()) { if (key == "model_diffusion_model_output_blocks_2_2_conv") { // fix for some sdxl lora, like lcm-lora-xl @@ -557,9 +553,9 @@ struct LoraModel : public GGMLRunner { } } - std::string lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; - alpha_name = lora_pre[type] + key + ".alpha"; - scale_name = lora_pre[type] + key + ".scale"; + lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; + alpha_name = lora_pre[type] + key + ".alpha"; + scale_name = lora_pre[type] + key + ".scale"; if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { lora_up = lora_tensors[lora_up_name]; @@ -568,14 +564,14 @@ struct LoraModel : public GGMLRunner { if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { lora_down = lora_tensors[lora_down_name]; } - applied_lora_tensors.insert(lora_up_name); - applied_lora_tensors.insert(lora_down_name); - applied_lora_tensors.insert(alpha_name); - applied_lora_tensors.insert(scale_name); + } + applied_lora_tensors.insert(lora_up_name); + applied_lora_tensors.insert(lora_down_name); + applied_lora_tensors.insert(alpha_name); + applied_lora_tensors.insert(scale_name); - if (lora_up == NULL || lora_down == NULL) { - continue; - } + if (lora_up == NULL || lora_down == NULL) { + continue; } // calc_scale int64_t dim = lora_down->ne[ggml_n_dims(lora_down) - 1]; @@ -622,10 +618,6 @@ struct LoraModel : public GGMLRunner { total_lora_tensors_count++; if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) { LOG_WARN("unused lora tensor %s", kv.first.c_str()); - print_ggml_tensor(kv.second, true); - if (kv.first.find("B") != std::string::npos) { - exit(0); - } } else { applied_lora_tensors_count++; } From f17c9f586f8c82bd46e1176382b068ff580877c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 11 Dec 2024 13:15:51 +0100 Subject: [PATCH 05/12] Fix erroneous "unused lora tensors" --- lora.hpp | 73 +++++++++++++++++++++++++------------------------------- 1 file changed, 32 insertions(+), 41 deletions(-) diff --git a/lora.hpp b/lora.hpp index 202b8ab3a..8490bf56d 100644 --- a/lora.hpp +++ b/lora.hpp @@ -302,31 +302,22 @@ struct LoraModel : public GGMLRunner { lora_down = ggml_cont(compute_ctx, lora_down_concat); lora_up = ggml_cont(compute_ctx, lora_up_concat); - lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; - lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; + // lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; + // lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; - lora_tensors[lora_down_name] = lora_down; - lora_tensors[lora_up_name] = lora_up; + // lora_tensors[lora_down_name] = lora_down; + // lora_tensors[lora_up_name] = lora_up; // Would be nice to be able to clean up lora_tensors, but it breaks because this is called twice :/ - // lora_tensors.erase(split_q_u_name); - // lora_tensors.erase(split_k_u_name); - // lora_tensors.erase(split_v_u_name); - // lora_tensors.erase(split_m_u_name); - - // lora_tensors.erase(split_q_d_name); - // lora_tensors.erase(split_k_d_name); - // lora_tensors.erase(split_v_d_name); - // lora_tensors.erase(split_m_d_name); - - } else { - // lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; - // lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; - // if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { - // // print_ggml_tensor(lora_tensors[lora_down_name], 
true); // [3072, R, 1, 1] - // // print_ggml_tensor(lora_tensors[lora_up_name], true); // [R, 21504, 1, 1] - // // print_ggml_tensor(it.second, true); // [3072, 21504, 1, 1] - // } + applied_lora_tensors.insert(split_q_u_name); + applied_lora_tensors.insert(split_k_u_name); + applied_lora_tensors.insert(split_v_u_name); + applied_lora_tensors.insert(split_m_u_name); + + applied_lora_tensors.insert(split_q_d_name); + applied_lora_tensors.insert(split_k_d_name); + applied_lora_tensors.insert(split_v_d_name); + applied_lora_tensors.insert(split_m_d_name); } } else if (linear2 != std::string::npos) { linear2--; @@ -341,8 +332,8 @@ struct LoraModel : public GGMLRunner { lora_down = lora_tensors[lora_down_name]; } - applied_lora_tensors.insert(lora_up_name); applied_lora_tensors.insert(lora_down_name); + applied_lora_tensors.insert(lora_up_name); } } else if (modulation != std::string::npos) { modulation--; @@ -357,8 +348,8 @@ struct LoraModel : public GGMLRunner { lora_down = lora_tensors[lora_down_name]; } - applied_lora_tensors.insert(lora_up_name); applied_lora_tensors.insert(lora_down_name); + applied_lora_tensors.insert(lora_up_name); } } // Double blocks @@ -446,20 +437,20 @@ struct LoraModel : public GGMLRunner { lora_down = ggml_cont(compute_ctx, lora_down_concat); lora_up = ggml_cont(compute_ctx, lora_up_concat); - lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; - lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; + // lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; + // lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; - lora_tensors[lora_down_name] = lora_down; - lora_tensors[lora_up_name] = lora_up; + // lora_tensors[lora_down_name] = lora_down; + // lora_tensors[lora_up_name] = lora_up; // Would be nice to be able to clean up lora_tensors, but it breaks because this is called twice :/ - // lora_tensors.erase(split_q_u_name); - // lora_tensors.erase(split_k_u_name); - // lora_tensors.erase(split_v_u_name); + applied_lora_tensors.insert(split_q_u_name); + applied_lora_tensors.insert(split_k_u_name); + applied_lora_tensors.insert(split_v_u_name); - // lora_tensors.erase(split_q_d_name); - // lora_tensors.erase(split_k_d_name); - // lora_tensors.erase(split_v_d_name); + applied_lora_tensors.insert(split_q_d_name); + applied_lora_tensors.insert(split_k_d_name); + applied_lora_tensors.insert(split_v_d_name); } } else if (txt_attn_proj != std::string::npos || img_attn_proj != std::string::npos) { size_t match = txt_attn_proj; @@ -481,8 +472,8 @@ struct LoraModel : public GGMLRunner { lora_down = lora_tensors[lora_down_name]; } - applied_lora_tensors.insert(lora_up_name); applied_lora_tensors.insert(lora_down_name); + applied_lora_tensors.insert(lora_up_name); } } else if (txt_mlp_0 != std::string::npos || txt_mlp_2 != std::string::npos || img_mlp_0 != std::string::npos || img_mlp_2 != std::string::npos) { bool has_two = txt_mlp_2 != std::string::npos || img_mlp_2 != std::string::npos; @@ -514,8 +505,8 @@ struct LoraModel : public GGMLRunner { lora_down = lora_tensors[lora_down_name]; } - applied_lora_tensors.insert(lora_up_name); applied_lora_tensors.insert(lora_down_name); + applied_lora_tensors.insert(lora_up_name); } } else if (txt_mod_lin != std::string::npos || img_mod_lin != std::string::npos) { size_t match = txt_mod_lin; @@ -537,8 +528,8 @@ struct LoraModel : public GGMLRunner { lora_down = lora_tensors[lora_down_name]; } - applied_lora_tensors.insert(lora_up_name); 
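The split q/k/v (and proj_mlp) names being inserted into applied_lora_tensors here are assembled from the per-convention tables declared at the top of lora.hpp, so this bookkeeping only works once the naming convention has been detected. A minimal standalone sketch of that detection, mirroring the fingerprint loop in load_from_file (the sample tensor names are hypothetical, for illustration only):

#include <cstdio>
#include <string>

// Same ordering as the lora_t enum and lora_ups fingerprint table above.
enum lora_t { REGULAR, DIFFUSERS, DIFFUSERS_2, DIFFUSERS_3, TRANSFORMERS, LORA_TYPE_COUNT };

static const char* kFingerprints[LORA_TYPE_COUNT] = {
    ".lora_up", "_lora.up", ".lora_B", ".lora.up", ".lora_linear_layer.up",
};

// Return the first convention whose "up" fingerprint appears in the name,
// defaulting to REGULAR as the loader does.
lora_t detect_type(const std::string& tensor_name) {
    for (int i = 0; i < LORA_TYPE_COUNT; i++)
        if (tensor_name.find(kFingerprints[i]) != std::string::npos)
            return (lora_t)i;
    return REGULAR;
}

int main() {
    // Hypothetical tensor names, not taken from any particular LoRA file.
    std::printf("%d\n", detect_type("lora.model_diffusion_model_x.lora_up.weight"));                      // 0 = REGULAR
    std::printf("%d\n", detect_type("transformer.single_transformer_blocks.0.attn.to_q.lora_B.weight"));  // 2 = DIFFUSERS_2
    return 0;
}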
applied_lora_tensors.insert(lora_down_name); + applied_lora_tensors.insert(lora_up_name); } } } @@ -564,11 +555,11 @@ struct LoraModel : public GGMLRunner { if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { lora_down = lora_tensors[lora_down_name]; } + applied_lora_tensors.insert(lora_up_name); + applied_lora_tensors.insert(lora_down_name); + applied_lora_tensors.insert(alpha_name); + applied_lora_tensors.insert(scale_name); } - applied_lora_tensors.insert(lora_up_name); - applied_lora_tensors.insert(lora_down_name); - applied_lora_tensors.insert(alpha_name); - applied_lora_tensors.insert(scale_name); if (lora_up == NULL || lora_down == NULL) { continue; From 486d42c2e0b4f9be355f4f6ec433095241dddc46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 11 Dec 2024 14:36:23 +0100 Subject: [PATCH 06/12] Fix linux build --- lora.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lora.hpp b/lora.hpp index 8490bf56d..4568a6c05 100644 --- a/lora.hpp +++ b/lora.hpp @@ -6,7 +6,7 @@ #define LORA_GRAPH_SIZE 10240 struct LoraModel : public GGMLRunner { - static enum lora_t { + enum lora_t { REGULAR = 0, DIFFUSERS = 1, DIFFUSERS_2 = 2, From bc3b23bb6fe1fe7ab6b7cfe064d3bed5c67f5f9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 11 Dec 2024 14:38:54 +0100 Subject: [PATCH 07/12] Remove deprecated comments --- lora.hpp | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/lora.hpp b/lora.hpp index 4568a6c05..efed3679f 100644 --- a/lora.hpp +++ b/lora.hpp @@ -87,9 +87,6 @@ struct LoraModel : public GGMLRunner { break; } } - // if (name.find(".transformer_blocks.0") != std::string::npos) { - // LOG_INFO("%s", name.c_str()); - // } if (dry_run) { struct ggml_tensor* real = ggml_new_tensor(params_ctx, @@ -107,7 +104,6 @@ struct LoraModel : public GGMLRunner { model_loader.load_tensors(on_new_tensor_cb, backend); alloc_params_buffer(); - // exit(0); dry_run = false; model_loader.load_tensors(on_new_tensor_cb, backend); @@ -131,7 +127,6 @@ struct LoraModel : public GGMLRunner { blk_name = blk_name.substr(0, k_pos); if (type == REGULAR) { keys.push_back(blk_name); - // blk_name = blk_name.substr(sizeof("diffusion_model.")); replace_all_chars(blk_name, '.', '_'); keys.push_back(blk_name); return keys; @@ -149,7 +144,6 @@ struct LoraModel : public GGMLRunner { keys.push_back(blk_name); // } } - // LOG_DEBUG("k_tensor %s", k_tensor.c_str()); return keys; } @@ -165,7 +159,6 @@ struct LoraModel : public GGMLRunner { std::string k_tensor = it.first; struct ggml_tensor* weight = model_tensors[it.first]; - // LOG_INFO("%s", k_tensor.c_str()); std::vector keys = to_lora_keys(k_tensor, version); if (keys.size() == 0) continue; @@ -177,7 +170,6 @@ struct LoraModel : public GGMLRunner { std::string scale_name = ""; std::string lora_down_name = ""; std::string lora_up_name = ""; - // LOG_DEBUG("k_tensor %s", k_tensor.c_str()); if (sd_version_is_flux(version)) { size_t linear1 = key.find("linear1"); size_t linear2 = key.find("linear2"); @@ -302,13 +294,6 @@ struct LoraModel : public GGMLRunner { lora_down = ggml_cont(compute_ctx, lora_down_concat); lora_up = ggml_cont(compute_ctx, lora_up_concat); - // lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; - // lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; - - // lora_tensors[lora_down_name] = lora_down; - // lora_tensors[lora_up_name] = lora_up; - - // Would be nice to be able to clean up lora_tensors, but it breaks because this 
is called twice :/ applied_lora_tensors.insert(split_q_u_name); applied_lora_tensors.insert(split_k_u_name); applied_lora_tensors.insert(split_v_u_name); @@ -437,13 +422,6 @@ struct LoraModel : public GGMLRunner { lora_down = ggml_cont(compute_ctx, lora_down_concat); lora_up = ggml_cont(compute_ctx, lora_up_concat); - // lora_down_name = lora_pre[type] + key + lora_downs[type] + ".weight"; - // lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; - - // lora_tensors[lora_down_name] = lora_down; - // lora_tensors[lora_up_name] = lora_up; - - // Would be nice to be able to clean up lora_tensors, but it breaks because this is called twice :/ applied_lora_tensors.insert(split_q_u_name); applied_lora_tensors.insert(split_k_u_name); applied_lora_tensors.insert(split_v_u_name); From 7993de0d790f7337d90f3fc2439cdcb2bd024162 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 11 Dec 2024 17:51:20 +0100 Subject: [PATCH 08/12] mmdit loras --- lora.hpp | 211 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 206 insertions(+), 5 deletions(-) diff --git a/lora.hpp b/lora.hpp index efed3679f..54fd5ff96 100644 --- a/lora.hpp +++ b/lora.hpp @@ -107,6 +107,8 @@ struct LoraModel : public GGMLRunner { dry_run = false; model_loader.load_tensors(on_new_tensor_cb, backend); + LOG_DEBUG("lora type: \"%s\"/\"%s\"", lora_downs[type].c_str(), lora_ups[type].c_str()); + LOG_DEBUG("finished loaded lora"); return true; } @@ -120,29 +122,36 @@ struct LoraModel : public GGMLRunner { std::vector to_lora_keys(std::string blk_name, SDVersion version) { std::vector keys; + // if (!sd_version_is_sd3(version) || blk_name != "model.diffusion_model.pos_embed") { size_t k_pos = blk_name.find(".weight"); if (k_pos == std::string::npos) { return keys; } blk_name = blk_name.substr(0, k_pos); + // } if (type == REGULAR) { keys.push_back(blk_name); replace_all_chars(blk_name, '.', '_'); keys.push_back(blk_name); return keys; } else if (type == DIFFUSERS || type == DIFFUSERS_2 || DIFFUSERS_3) { - // if (sd_version_is_Flux(version)) { - if (blk_name.find("model.diffusion_model") != std::string::npos) { - blk_name.replace(blk_name.find("model.diffusion_model"), sizeof("model.diffusion_model") - 1, "transformer"); + if (sd_version_is_dit(version)) { + if (blk_name.find("model.diffusion_model") != std::string::npos) { + blk_name.replace(blk_name.find("model.diffusion_model"), sizeof("model.diffusion_model") - 1, "transformer"); + } } + if (blk_name.find(".single_blocks") != std::string::npos) { blk_name.replace(blk_name.find(".single_blocks"), sizeof(".single_blocks") - 1, ".single_transformer_blocks"); } if (blk_name.find(".double_blocks") != std::string::npos) { blk_name.replace(blk_name.find(".double_blocks"), sizeof(".double_blocks") - 1, ".transformer_blocks"); } + + if (blk_name.find(".joint_blocks") != std::string::npos) { + blk_name.replace(blk_name.find(".joint_blocks"), sizeof(".joint_blocks") - 1, ".transformer_blocks"); + } keys.push_back(blk_name); - // } } return keys; } @@ -510,8 +519,199 @@ struct LoraModel : public GGMLRunner { applied_lora_tensors.insert(lora_up_name); } } - } + } else if (sd_version_is_sd3(version)) { + size_t final_layer_adaLN_modulation = key.find("final_layer.adaLN_modulation.1"); + size_t pos_embed = key.find("pos_embed"); + size_t final_layer_linear = key.find("final_layer.linear"); + size_t y_embedder_mlp_0 = key.find("y_embedder.mlp.0"); + size_t y_embedder_mlp_2 = key.find("y_embedder.mlp.2"); + size_t t_embedder_mlp_0 = 
key.find("t_embedder.mlp.0"); + size_t t_embedder_mlp_2 = key.find("t_embedder.mlp.2"); + size_t x_block_mlp_fc1 = key.find("x_block.mlp.fc1"); + size_t x_block_mlp_fc2 = key.find("x_block.mlp.fc2"); + size_t context_block_mlp_fc1 = key.find("context_block.mlp.fc1"); + size_t context_block_mlp_fc2 = key.find("context_block.mlp.fc2"); + size_t x_block_adaLN_modulation_1 = key.find("x_block.adaLN_modulation.1"); + size_t context_block_adaLN_modulation_1 = key.find("context_block.adaLN_modulation.1"); + + size_t context_block_attn_proj = key.find("context_block.attn.proj"); + size_t x_block_attn_proj = key.find("x_block.attn.proj"); + size_t x_block_attn2_proj = key.find("x_block.attn2.proj"); + + size_t context_block_attn_qkv = key.find("context_block.attn.qkv"); + size_t x_block_attn_qkv = key.find("x_block.attn.qkv"); + size_t x_block_attn2_qkv = key.find("x_block.attn2.qkv"); + + size_t match = std::string::npos; + std::string new_name = ""; + if (final_layer_adaLN_modulation != std::string::npos) { + new_name = ".norm_out.linear"; + match = final_layer_adaLN_modulation; + } else if (pos_embed != std::string::npos) { + match = pos_embed; + new_name = ".pos_embed.proj"; + } else if (final_layer_linear != std::string::npos) { + match = final_layer_linear; + new_name = ".proj_out"; + } else if (y_embedder_mlp_0 != std::string::npos) { + match = y_embedder_mlp_0; + new_name = ".time_text_embed.text_embedder.linear_1"; + } else if (y_embedder_mlp_2 != std::string::npos) { + match = y_embedder_mlp_2; + new_name = ".time_text_embed.text_embedder.linear_2"; + } else if (t_embedder_mlp_0 != std::string::npos) { + match = t_embedder_mlp_0; + new_name = ".time_text_embed.timestep_embedder.linear_1"; + } else if (t_embedder_mlp_2 != std::string::npos) { + match = t_embedder_mlp_2; + new_name = ".time_text_embed.timestep_embedder.linear_2"; + } else if (x_block_mlp_fc1 != std::string::npos) { + match = x_block_mlp_fc1; + new_name = ".ff.net.0.proj"; + } else if (x_block_mlp_fc2 != std::string::npos) { + match = x_block_mlp_fc2; + new_name = ".ff.net.2"; + } else if (context_block_mlp_fc1 != std::string::npos) { + match = context_block_mlp_fc1; + new_name = ".ff_context.net.0.proj"; + } else if (context_block_mlp_fc2 != std::string::npos) { + match = context_block_mlp_fc2; + new_name = ".ff_context.net.2"; + } else if (x_block_adaLN_modulation_1 != std::string::npos) { + match = x_block_adaLN_modulation_1; + new_name = ".norm1.linear"; + } else if (context_block_adaLN_modulation_1 != std::string::npos) { + match = context_block_adaLN_modulation_1; + new_name = ".norm1_context.linear"; + } else if (context_block_attn_proj != std::string::npos) { + match = context_block_attn_proj; + new_name = ".attn.to_add_out"; + } else if (x_block_attn_proj != std::string::npos) { + match = x_block_attn_proj; + new_name = ".attn.to_out.0"; + } else if (x_block_attn2_proj != std::string::npos) { + match = x_block_attn2_proj; + new_name = ".attn2.to_out.0"; + } + + if (match != std::string::npos) { + match--; + lora_down_name = lora_pre[type] + key.substr(0, match) + new_name + lora_downs[type] + ".weight"; + if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { + lora_up_name = lora_pre[type] + key.substr(0, match) + new_name + lora_ups[type] + ".weight"; + if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { + lora_up = lora_tensors[lora_up_name]; + } + + if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { + lora_down = lora_tensors[lora_down_name]; + } + + 
applied_lora_tensors.insert(lora_down_name); + applied_lora_tensors.insert(lora_up_name); + } + } + std::string prefix = ""; + std::string suffix = ""; + + if (context_block_attn_qkv != std::string::npos) { + match = context_block_attn_qkv; + prefix = ".attn.add_"; + suffix = "_proj"; + } else if (x_block_attn_qkv != std::string::npos) { + match = x_block_attn_qkv; + prefix = ".attn.to_"; + suffix = ""; + } else if (x_block_attn2_qkv != std::string::npos) { + match = x_block_attn2_qkv; + prefix = ".attn2.to_"; + suffix = ""; + } + if (match != std::string::npos) { + match--; + auto split_q_d_name = lora_pre[type] + key.substr(0, match) + prefix + "q" + suffix + lora_downs[type] + ".weight"; + if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) { + // print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1] + // find qkv and mlp up parts in LoRA model + auto split_k_d_name = lora_pre[type] + key.substr(0, match) + prefix + "k" + suffix + lora_downs[type] + ".weight"; + auto split_v_d_name = lora_pre[type] + key.substr(0, match) + prefix + "v" + suffix + lora_downs[type] + ".weight"; + + auto split_q_u_name = lora_pre[type] + key.substr(0, match) + prefix + "q" + suffix + lora_ups[type] + ".weight"; + auto split_k_u_name = lora_pre[type] + key.substr(0, match) + prefix + "k" + suffix + lora_ups[type] + ".weight"; + auto split_v_u_name = lora_pre[type] + key.substr(0, match) + prefix + "v" + suffix + lora_ups[type] + ".weight"; + + ggml_tensor* lora_q_down = NULL; + ggml_tensor* lora_q_up = NULL; + ggml_tensor* lora_k_down = NULL; + ggml_tensor* lora_k_up = NULL; + ggml_tensor* lora_v_down = NULL; + ggml_tensor* lora_v_up = NULL; + + lora_q_down = to_f32(compute_ctx, lora_tensors[split_q_d_name]); + + if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) { + lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]); + } + + if (lora_tensors.find(split_k_d_name) != lora_tensors.end()) { + lora_k_down = to_f32(compute_ctx, lora_tensors[split_k_d_name]); + } + + if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) { + lora_k_up = to_f32(compute_ctx, lora_tensors[split_k_u_name]); + } + + if (lora_tensors.find(split_v_d_name) != lora_tensors.end()) { + lora_v_down = to_f32(compute_ctx, lora_tensors[split_v_d_name]); + } + + if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) { + lora_v_up = to_f32(compute_ctx, lora_tensors[split_v_u_name]); + } + + // print_ggml_tensor(lora_q_down, true); //[hidden_size, R, 1, 1] + // print_ggml_tensor(lora_k_down, true); //[hidden_size, R, 1, 1] + // print_ggml_tensor(lora_v_down, true); //[hidden_size, R, 1, 1] + // print_ggml_tensor(lora_q_up, true); //[R, hidden_size, 1, 1] + // print_ggml_tensor(lora_k_up, true); //[R, hidden_size, 1, 1] + // print_ggml_tensor(lora_v_up, true); //[R, hidden_size, 1, 1] + + // these need to be stitched together this way: + // |q_up,0 ,0 | + // |0 ,k_up,0 | + // |0 ,0 ,v_up| + // (q_down,k_down,v_down) . 
(q ,k ,v) + // up_concat will be [4608, R*3, 1, 1] + // down_concat will be [R*3, hidden_size, 1, 1] + ggml_tensor* lora_down_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_down, lora_k_down, 1), lora_v_down, 1); + + ggml_tensor* z = ggml_dup_tensor(compute_ctx, lora_q_up); + ggml_scale(compute_ctx, z, 0); + ggml_tensor* zz = ggml_concat(compute_ctx, z, z, 1); + + ggml_tensor* q_up = ggml_concat(compute_ctx, lora_q_up, zz, 1); + ggml_tensor* k_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, z, lora_k_up, 1), z, 1); + ggml_tensor* v_up = ggml_concat(compute_ctx, zz, lora_v_up, 1); + // print_ggml_tensor(q_up, true); //[R, hidden_size * 3, 1, 1] + // print_ggml_tensor(k_up, true); //[R, hidden_size * 3, 1, 1] + // print_ggml_tensor(v_up, true); //[R, hidden_size * 3, 1, 1] + ggml_tensor* lora_up_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, q_up, k_up, 0), v_up, 0); + // print_ggml_tensor(lora_up_concat, true); //[R*3, hidden_size * 3, 1, 1] + + lora_down = ggml_cont(compute_ctx, lora_down_concat); + lora_up = ggml_cont(compute_ctx, lora_up_concat); + + applied_lora_tensors.insert(split_q_u_name); + applied_lora_tensors.insert(split_k_u_name); + applied_lora_tensors.insert(split_v_u_name); + + applied_lora_tensors.insert(split_q_d_name); + applied_lora_tensors.insert(split_k_d_name); + applied_lora_tensors.insert(split_v_d_name); + } + } + } if (lora_up == NULL || lora_down == NULL) { lora_up_name = lora_pre[type] + key + lora_ups[type] + ".weight"; if (lora_tensors.find(lora_up_name) == lora_tensors.end()) { @@ -587,6 +787,7 @@ struct LoraModel : public GGMLRunner { total_lora_tensors_count++; if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) { LOG_WARN("unused lora tensor %s", kv.first.c_str()); + print_ggml_tensor(kv.second, true); } else { applied_lora_tensors_count++; } From a1ead4f4fec64bcbc01a181d5863dc44a0980153 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 12 Dec 2024 22:37:14 +0100 Subject: [PATCH 09/12] Refactor Lora loading --- lora.hpp | 783 +++++++++++++++++++------------------------------------ 1 file changed, 266 insertions(+), 517 deletions(-) diff --git a/lora.hpp b/lora.hpp index 54fd5ff96..ae8d5bfaf 100644 --- a/lora.hpp +++ b/lora.hpp @@ -39,6 +39,55 @@ struct LoraModel : public GGMLRunner { "", }; + const std::map alt_names = { + // mmdit + {"final_layer.adaLN_modulation.1", "norm_out.linear"}, + {"pos_embed", "pos_embed.proj"}, + {"final_layer.linear", "proj_out"}, + {"y_embedder.mlp.0", "time_text_embed.text_embedder.linear_1"}, + {"y_embedder.mlp.2", "time_text_embed.text_embedder.linear_2"}, + {"t_embedder.mlp.0", "time_text_embed.timestep_embedder.linear_1"}, + {"t_embedder.mlp.2", "time_text_embed.timestep_embedder.linear_2"}, + {"x_block.mlp.fc1", "ff.net.0.proj"}, + {"x_block.mlp.fc2", "ff.net.2"}, + {"context_block.mlp.fc1", "ff_context.net.0.proj"}, + {"context_block.mlp.fc2", "ff_context.net.2"}, + {"x_block.adaLN_modulation.1", "norm1.linear"}, + {"context_block.adaLN_modulation.1", "norm1_context.linear"}, + {"context_block.attn.proj", "attn.to_add_out"}, + {"x_block.attn.proj", "attn.to_out.0"}, + {"x_block.attn2.proj", "attn2.to_out.0"}, + // flux + // singlestream + {"linear2", "proj_out"}, + {"modulation.lin", "norm.linear"}, + // doublestream + {"txt_attn.proj", "attn.to_add_out"}, + {"img_attn.proj", "attn.to_out.0"}, + {"txt_mlp.0", "ff_context.net.0.proj"}, + {"txt_mlp.2", "ff_context.net.2"}, + {"img_mlp.0", "ff.net.0.proj"}, + {"img_mlp.2", "ff.net.2"}, 
+ {"txt_mod.lin", "norm1_context.linear"}, + {"img_mod.lin", "norm1.linear"}, + }; + + const std::map qkv_prefixes = { + // mmdit + {"context_block.attn.qkv", "attn.add_"}, // suffix "_proj" + {"x_block.attn.qkv", "attn.to_"}, + {"x_block.attn2.qkv", "attn2.to_"}, + // flux + // doublestream + {"txt_attn.qkv", "attn.add_"}, // suffix "_proj" + {"img_attn.qkv", "attn.to_"}, + }; + const std::map qkvm_prefixes = { + // flux + // singlestream + {"linear1", ""}, + }; + const std::string* type_fingerprints = lora_ups; float multiplier = 1.0f; @@ -104,6 +153,7 @@ struct LoraModel : public GGMLRunner { model_loader.load_tensors(on_new_tensor_cb, backend); alloc_params_buffer(); + // exit(0); dry_run = false; model_loader.load_tensors(on_new_tensor_cb, backend); @@ -129,16 +179,10 @@ struct LoraModel : public GGMLRunner { } blk_name = blk_name.substr(0, k_pos); // } - if (type == REGULAR) { - keys.push_back(blk_name); - replace_all_chars(blk_name, '.', '_'); - keys.push_back(blk_name); - return keys; - } else if (type == DIFFUSERS || type == DIFFUSERS_2 || DIFFUSERS_3) { - if (sd_version_is_dit(version)) { - if (blk_name.find("model.diffusion_model") != std::string::npos) { - blk_name.replace(blk_name.find("model.diffusion_model"), sizeof("model.diffusion_model") - 1, "transformer"); - } + keys.push_back(blk_name); + if (sd_version_is_dit(version)) { + if (blk_name.find("model.diffusion_model") != std::string::npos) { + blk_name.replace(blk_name.find("model.diffusion_model"), sizeof("model.diffusion_model") - 1, "transformer"); } if (blk_name.find(".single_blocks") != std::string::npos) { @@ -151,9 +195,37 @@ struct LoraModel : public GGMLRunner { if (blk_name.find(".joint_blocks") != std::string::npos) { blk_name.replace(blk_name.find(".joint_blocks"), sizeof(".joint_blocks") - 1, ".transformer_blocks"); } - keys.push_back(blk_name); + + for (const auto& item : alt_names) { + size_t match = blk_name.find(item.first); + if (match != std::string::npos) { + blk_name = blk_name.substr(0, match) + item.second; + } + } + for (const auto& prefix : qkv_prefixes) { + size_t match = blk_name.find(prefix.first); + if (match != std::string::npos) { + std::string split_blk = "SPLIT|" + blk_name.substr(0, match) + prefix.second; + keys.push_back(split_blk); + } + } + for (const auto& prefix : qkvm_prefixes) { + size_t match = blk_name.find(prefix.first); + if (match != std::string::npos) { + std::string split_blk = "SPLIT_L|" + blk_name.substr(0, match) + prefix.second; + keys.push_back(split_blk); + } + } } - return keys; + keys.push_back(blk_name); + + std::vector ret; + for (std::string& key : keys) { + ret.push_back(key); + replace_all_chars(key, '.', '_'); + ret.push_back(key); + } + return ret; } struct ggml_cgraph* build_lora_graph(std::map model_tensors, SDVersion version) { @@ -179,537 +251,213 @@ struct LoraModel : public GGMLRunner { std::string scale_name = ""; std::string lora_down_name = ""; std::string lora_up_name = ""; - if (sd_version_is_flux(version)) { - size_t linear1 = key.find("linear1"); - size_t linear2 = key.find("linear2"); - size_t modulation = key.find("modulation.lin"); - size_t txt_attn_qkv = key.find("txt_attn.qkv"); - size_t img_attn_qkv = key.find("img_attn.qkv"); + if (starts_with(key, "SPLIT|")) { + key = key.substr(sizeof("SPLIT|") - 1); + // TODO: Handle alphas + std::string suffix = ""; + auto split_q_d_name = lora_pre[type] + key + "q" + suffix + lora_downs[type] + ".weight"; - size_t txt_attn_proj = key.find("txt_attn.proj"); - size_t img_attn_proj = 
key.find("img_attn.proj"); + if (lora_tensors.find(split_q_d_name) == lora_tensors.end()) { + suffix = "_proj"; + split_q_d_name = lora_pre[type] + key + "q" + suffix + lora_downs[type] + ".weight"; + } + if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) { + // print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1] + // find qkv and mlp up parts in LoRA model + auto split_k_d_name = lora_pre[type] + key + "k" + suffix + lora_downs[type] + ".weight"; + auto split_v_d_name = lora_pre[type] + key + "v" + suffix + lora_downs[type] + ".weight"; + + auto split_q_u_name = lora_pre[type] + key + "q" + suffix + lora_ups[type] + ".weight"; + auto split_k_u_name = lora_pre[type] + key + "k" + suffix + lora_ups[type] + ".weight"; + auto split_v_u_name = lora_pre[type] + key + "v" + suffix + lora_ups[type] + ".weight"; + + ggml_tensor* lora_q_down = NULL; + ggml_tensor* lora_q_up = NULL; + ggml_tensor* lora_k_down = NULL; + ggml_tensor* lora_k_up = NULL; + ggml_tensor* lora_v_down = NULL; + ggml_tensor* lora_v_up = NULL; + + lora_q_down = to_f32(compute_ctx, lora_tensors[split_q_d_name]); + + if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) { + lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]); + } - size_t txt_mlp_0 = key.find("txt_mlp.0"); - size_t txt_mlp_2 = key.find("txt_mlp.2"); - size_t img_mlp_0 = key.find("img_mlp.0"); - size_t img_mlp_2 = key.find("img_mlp.2"); + if (lora_tensors.find(split_k_d_name) != lora_tensors.end()) { + lora_k_down = to_f32(compute_ctx, lora_tensors[split_k_d_name]); + } - size_t txt_mod_lin = key.find("txt_mod.lin"); - size_t img_mod_lin = key.find("img_mod.lin"); + if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) { + lora_k_up = to_f32(compute_ctx, lora_tensors[split_k_u_name]); + } - if (linear1 != std::string::npos) { - linear1--; - auto split_q_d_name = lora_pre[type] + key.substr(0, linear1) + ".attn.to_q" + lora_downs[type] + ".weight"; - if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) { - // print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1] - // find qkv and mlp up parts in LoRA model - auto split_k_d_name = lora_pre[type] + key.substr(0, linear1) + ".attn.to_k" + lora_downs[type] + ".weight"; - auto split_v_d_name = lora_pre[type] + key.substr(0, linear1) + ".attn.to_v" + lora_downs[type] + ".weight"; + if (lora_tensors.find(split_v_d_name) != lora_tensors.end()) { + lora_v_down = to_f32(compute_ctx, lora_tensors[split_v_d_name]); + } - auto split_q_u_name = lora_pre[type] + key.substr(0, linear1) + ".attn.to_q" + lora_ups[type] + ".weight"; - auto split_k_u_name = lora_pre[type] + key.substr(0, linear1) + ".attn.to_k" + lora_ups[type] + ".weight"; - auto split_v_u_name = lora_pre[type] + key.substr(0, linear1) + ".attn.to_v" + lora_ups[type] + ".weight"; + if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) { + lora_v_up = to_f32(compute_ctx, lora_tensors[split_v_u_name]); + } - auto split_m_d_name = lora_pre[type] + key.substr(0, linear1) + ".proj_mlp" + lora_downs[type] + ".weight"; - auto split_m_u_name = lora_pre[type] + key.substr(0, linear1) + ".proj_mlp" + lora_ups[type] + ".weight"; + // print_ggml_tensor(lora_q_down, true); //[3072, R, 1, 1] + // print_ggml_tensor(lora_k_down, true); //[3072, R, 1, 1] + // print_ggml_tensor(lora_v_down, true); //[3072, R, 1, 1] + // print_ggml_tensor(lora_q_up, true); //[R, 3072, 1, 1] + // print_ggml_tensor(lora_k_up, true); //[R, 3072, 1, 1] + // print_ggml_tensor(lora_v_up, true); //[R, 3072, 1, 1] + + // these need to be stitched 
together this way: + // |q_up,0 ,0 | + // |0 ,k_up,0 | + // |0 ,0 ,v_up| + // (q_down,k_down,v_down) . (q ,k ,v) + + // up_concat will be [9216, R*3, 1, 1] + // down_concat will be [R*3, 3072, 1, 1] + ggml_tensor* lora_down_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_down, lora_k_down, 1), lora_v_down, 1); + + ggml_tensor* z = ggml_dup_tensor(compute_ctx, lora_q_up); + ggml_scale(compute_ctx, z, 0); + ggml_tensor* zz = ggml_concat(compute_ctx, z, z, 1); + + ggml_tensor* q_up = ggml_concat(compute_ctx, lora_q_up, zz, 1); + ggml_tensor* k_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, z, lora_k_up, 1), z, 1); + ggml_tensor* v_up = ggml_concat(compute_ctx, zz, lora_v_up, 1); + // print_ggml_tensor(q_up, true); //[R, 9216, 1, 1] + // print_ggml_tensor(k_up, true); //[R, 9216, 1, 1] + // print_ggml_tensor(v_up, true); //[R, 9216, 1, 1] + ggml_tensor* lora_up_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, q_up, k_up, 0), v_up, 0); + // print_ggml_tensor(lora_up_concat, true); //[R*3, 9216, 1, 1] + + lora_down = ggml_cont(compute_ctx, lora_down_concat); + lora_up = ggml_cont(compute_ctx, lora_up_concat); + + applied_lora_tensors.insert(split_q_u_name); + applied_lora_tensors.insert(split_k_u_name); + applied_lora_tensors.insert(split_v_u_name); + + applied_lora_tensors.insert(split_q_d_name); + applied_lora_tensors.insert(split_k_d_name); + applied_lora_tensors.insert(split_v_d_name); + } + } + if (starts_with(key, "SPLIT_L|")) { + key = key.substr(sizeof("SPLIT_L|") - 1); - ggml_tensor* lora_q_down = NULL; - ggml_tensor* lora_q_up = NULL; - ggml_tensor* lora_k_down = NULL; - ggml_tensor* lora_k_up = NULL; - ggml_tensor* lora_v_down = NULL; - ggml_tensor* lora_v_up = NULL; + auto split_q_d_name = lora_pre[type] + key + "attn.to_q" + lora_downs[type] + ".weight"; + if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) { + // print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1] + // find qkv and mlp up parts in LoRA model + auto split_k_d_name = lora_pre[type] + key + "attn.to_k" + lora_downs[type] + ".weight"; + auto split_v_d_name = lora_pre[type] + key + "attn.to_v" + lora_downs[type] + ".weight"; - ggml_tensor* lora_m_down = NULL; - ggml_tensor* lora_m_up = NULL; + auto split_q_u_name = lora_pre[type] + key + "attn.to_q" + lora_ups[type] + ".weight"; + auto split_k_u_name = lora_pre[type] + key + "attn.to_k" + lora_ups[type] + ".weight"; + auto split_v_u_name = lora_pre[type] + key + "attn.to_v" + lora_ups[type] + ".weight"; - lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]); + auto split_m_d_name = lora_pre[type] + key + "proj_mlp" + lora_downs[type] + ".weight"; + auto split_m_u_name = lora_pre[type] + key + "proj_mlp" + lora_ups[type] + ".weight"; - if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) { - lora_q_down = to_f32(compute_ctx, lora_tensors[split_q_d_name]); - } - - if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) { - lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]); - } - - if (lora_tensors.find(split_k_d_name) != lora_tensors.end()) { - lora_k_down = to_f32(compute_ctx, lora_tensors[split_k_d_name]); - } - - if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) { - lora_k_up = to_f32(compute_ctx, lora_tensors[split_k_u_name]); - } - - if (lora_tensors.find(split_v_d_name) != lora_tensors.end()) { - lora_v_down = to_f32(compute_ctx, lora_tensors[split_v_d_name]); - } - - if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) { - lora_v_up = to_f32(compute_ctx, 
lora_tensors[split_v_u_name]); - } - - if (lora_tensors.find(split_m_d_name) != lora_tensors.end()) { - lora_m_down = to_f32(compute_ctx, lora_tensors[split_m_d_name]); - } - - if (lora_tensors.find(split_m_u_name) != lora_tensors.end()) { - lora_m_up = to_f32(compute_ctx, lora_tensors[split_m_u_name]); - } - - // print_ggml_tensor(lora_q_down, true); //[3072, R, 1, 1] - // print_ggml_tensor(lora_k_down, true); //[3072, R, 1, 1] - // print_ggml_tensor(lora_v_down, true); //[3072, R, 1, 1] - // print_ggml_tensor(lora_m_down, true); //[3072, R, 1, 1] - // print_ggml_tensor(lora_q_up, true); //[R, 3072, 1, 1] - // print_ggml_tensor(lora_k_up, true); //[R, 3072, 1, 1] - // print_ggml_tensor(lora_v_up, true); //[R, 3072, 1, 1] - // print_ggml_tensor(lora_m_up, true); //[R, 12288, 1, 1] - - // these need to be stitched together this way: - // |q_up,0 ,0 ,0 | - // |0 ,k_up,0 ,0 | - // |0 ,0 ,v_up,0 | - // |0 ,0 ,0 ,m_up| - // (q_down,k_down,v_down,m_down) . (q ,k ,v ,m) - - // up_concat will be [21504, R*4, 1, 1] - // down_concat will be [R*4, 3072, 1, 1] - - ggml_tensor* lora_down_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_down, lora_k_down, 1), ggml_concat(compute_ctx, lora_v_down, lora_m_down, 1), 1); - // print_ggml_tensor(lora_down_concat, true); //[3072, R*4, 1, 1] - - // this also means that if rank is bigger than 672, it is less memory efficient to do it this way (should be fine) - // print_ggml_tensor(lora_q_up, true); //[3072, R, 1, 1] - ggml_tensor* z = ggml_dup_tensor(compute_ctx, lora_q_up); - ggml_tensor* mlp_z = ggml_dup_tensor(compute_ctx, lora_m_up); - ggml_scale(compute_ctx, z, 0); - ggml_scale(compute_ctx, mlp_z, 0); - ggml_tensor* zz = ggml_concat(compute_ctx, z, z, 1); - - ggml_tensor* q_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_up, zz, 1), mlp_z, 1); - ggml_tensor* k_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, z, lora_k_up, 1), ggml_concat(compute_ctx, z, mlp_z, 1), 1); - ggml_tensor* v_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, zz, lora_v_up, 1), mlp_z, 1); - ggml_tensor* m_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, zz, z, 1), lora_m_up, 1); - // print_ggml_tensor(q_up, true); //[R, 21504, 1, 1] - // print_ggml_tensor(k_up, true); //[R, 21504, 1, 1] - // print_ggml_tensor(v_up, true); //[R, 21504, 1, 1] - // print_ggml_tensor(m_up, true); //[R, 21504, 1, 1] - - ggml_tensor* lora_up_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, q_up, k_up, 0), ggml_concat(compute_ctx, v_up, m_up, 0), 0); - // print_ggml_tensor(lora_up_concat, true); //[R*4, 21504, 1, 1] - - lora_down = ggml_cont(compute_ctx, lora_down_concat); - lora_up = ggml_cont(compute_ctx, lora_up_concat); - - applied_lora_tensors.insert(split_q_u_name); - applied_lora_tensors.insert(split_k_u_name); - applied_lora_tensors.insert(split_v_u_name); - applied_lora_tensors.insert(split_m_u_name); - - applied_lora_tensors.insert(split_q_d_name); - applied_lora_tensors.insert(split_k_d_name); - applied_lora_tensors.insert(split_v_d_name); - applied_lora_tensors.insert(split_m_d_name); - } - } else if (linear2 != std::string::npos) { - linear2--; - lora_down_name = lora_pre[type] + key.substr(0, linear2) + ".proj_out" + lora_downs[type] + ".weight"; - if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { - lora_up_name = lora_pre[type] + key.substr(0, linear2) + ".proj_out" + lora_ups[type] + ".weight"; - if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { - lora_up = lora_tensors[lora_up_name]; - } - - if 
(lora_tensors.find(lora_down_name) != lora_tensors.end()) { - lora_down = lora_tensors[lora_down_name]; - } - - applied_lora_tensors.insert(lora_down_name); - applied_lora_tensors.insert(lora_up_name); - } - } else if (modulation != std::string::npos) { - modulation--; - lora_down_name = lora_pre[type] + key.substr(0, modulation) + ".norm.linear" + lora_downs[type] + ".weight"; - if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { - lora_up_name = lora_pre[type] + key.substr(0, modulation) + ".norm.linear" + lora_ups[type] + ".weight"; - if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { - lora_up = lora_tensors[lora_up_name]; - } - - if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { - lora_down = lora_tensors[lora_down_name]; - } - - applied_lora_tensors.insert(lora_down_name); - applied_lora_tensors.insert(lora_up_name); - } - } - // Double blocks - else if (txt_attn_qkv != std::string::npos || img_attn_qkv != std::string::npos) { - size_t match = txt_attn_qkv; - std::string prefix = ".attn.add_"; - std::string suffix = "_proj"; - if (img_attn_qkv != std::string::npos) { - match = img_attn_qkv; - prefix = ".attn.to_"; - suffix = ""; - } - match--; + ggml_tensor* lora_q_down = NULL; + ggml_tensor* lora_q_up = NULL; + ggml_tensor* lora_k_down = NULL; + ggml_tensor* lora_k_up = NULL; + ggml_tensor* lora_v_down = NULL; + ggml_tensor* lora_v_up = NULL; - auto split_q_d_name = lora_pre[type] + key.substr(0, match) + prefix + "q" + suffix + lora_downs[type] + ".weight"; - if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) { - // print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1] - // find qkv and mlp up parts in LoRA model - auto split_k_d_name = lora_pre[type] + key.substr(0, match) + prefix + "k" + suffix + lora_downs[type] + ".weight"; - auto split_v_d_name = lora_pre[type] + key.substr(0, match) + prefix + "v" + suffix + lora_downs[type] + ".weight"; - - auto split_q_u_name = lora_pre[type] + key.substr(0, match) + prefix + "q" + suffix + lora_ups[type] + ".weight"; - auto split_k_u_name = lora_pre[type] + key.substr(0, match) + prefix + "k" + suffix + lora_ups[type] + ".weight"; - auto split_v_u_name = lora_pre[type] + key.substr(0, match) + prefix + "v" + suffix + lora_ups[type] + ".weight"; - - ggml_tensor* lora_q_down = NULL; - ggml_tensor* lora_q_up = NULL; - ggml_tensor* lora_k_down = NULL; - ggml_tensor* lora_k_up = NULL; - ggml_tensor* lora_v_down = NULL; - ggml_tensor* lora_v_up = NULL; + ggml_tensor* lora_m_down = NULL; + ggml_tensor* lora_m_up = NULL; - lora_q_down = to_f32(compute_ctx, lora_tensors[split_q_d_name]); + lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]); - if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) { - lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]); - } - - if (lora_tensors.find(split_k_d_name) != lora_tensors.end()) { - lora_k_down = to_f32(compute_ctx, lora_tensors[split_k_d_name]); - } - - if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) { - lora_k_up = to_f32(compute_ctx, lora_tensors[split_k_u_name]); - } - - if (lora_tensors.find(split_v_d_name) != lora_tensors.end()) { - lora_v_down = to_f32(compute_ctx, lora_tensors[split_v_d_name]); - } - - if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) { - lora_v_up = to_f32(compute_ctx, lora_tensors[split_v_u_name]); - } - - // print_ggml_tensor(lora_q_down, true); //[3072, R, 1, 1] - // print_ggml_tensor(lora_k_down, true); //[3072, R, 1, 1] - // print_ggml_tensor(lora_v_down, true); //[3072, R, 1, 
1] - // print_ggml_tensor(lora_q_up, true); //[R, 3072, 1, 1] - // print_ggml_tensor(lora_k_up, true); //[R, 3072, 1, 1] - // print_ggml_tensor(lora_v_up, true); //[R, 3072, 1, 1] - - // these need to be stitched together this way: - // |q_up,0 ,0 | - // |0 ,k_up,0 | - // |0 ,0 ,v_up| - // (q_down,k_down,v_down) . (q ,k ,v) - - // up_concat will be [9216, R*3, 1, 1] - // down_concat will be [R*3, 3072, 1, 1] - ggml_tensor* lora_down_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_down, lora_k_down, 1), lora_v_down, 1); - - ggml_tensor* z = ggml_dup_tensor(compute_ctx, lora_q_up); - ggml_scale(compute_ctx, z, 0); - ggml_tensor* zz = ggml_concat(compute_ctx, z, z, 1); - - ggml_tensor* q_up = ggml_concat(compute_ctx, lora_q_up, zz, 1); - ggml_tensor* k_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, z, lora_k_up, 1), z, 1); - ggml_tensor* v_up = ggml_concat(compute_ctx, zz, lora_v_up, 1); - // print_ggml_tensor(q_up, true); //[R, 9216, 1, 1] - // print_ggml_tensor(k_up, true); //[R, 9216, 1, 1] - // print_ggml_tensor(v_up, true); //[R, 9216, 1, 1] - ggml_tensor* lora_up_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, q_up, k_up, 0), v_up, 0); - // print_ggml_tensor(lora_up_concat, true); //[R*3, 9216, 1, 1] - - lora_down = ggml_cont(compute_ctx, lora_down_concat); - lora_up = ggml_cont(compute_ctx, lora_up_concat); - - applied_lora_tensors.insert(split_q_u_name); - applied_lora_tensors.insert(split_k_u_name); - applied_lora_tensors.insert(split_v_u_name); - - applied_lora_tensors.insert(split_q_d_name); - applied_lora_tensors.insert(split_k_d_name); - applied_lora_tensors.insert(split_v_d_name); - } - } else if (txt_attn_proj != std::string::npos || img_attn_proj != std::string::npos) { - size_t match = txt_attn_proj; - std::string new_name = ".attn.to_add_out"; - if (img_attn_proj != std::string::npos) { - match = img_attn_proj; - new_name = ".attn.to_out.0"; + if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) { + lora_q_down = to_f32(compute_ctx, lora_tensors[split_q_d_name]); } - match--; - - lora_down_name = lora_pre[type] + key.substr(0, match) + new_name + lora_downs[type] + ".weight"; - if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { - lora_up_name = lora_pre[type] + key.substr(0, match) + new_name + lora_ups[type] + ".weight"; - if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { - lora_up = lora_tensors[lora_up_name]; - } - - if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { - lora_down = lora_tensors[lora_down_name]; - } - applied_lora_tensors.insert(lora_down_name); - applied_lora_tensors.insert(lora_up_name); - } - } else if (txt_mlp_0 != std::string::npos || txt_mlp_2 != std::string::npos || img_mlp_0 != std::string::npos || img_mlp_2 != std::string::npos) { - bool has_two = txt_mlp_2 != std::string::npos || img_mlp_2 != std::string::npos; - std::string prefix = ".ff_context.net."; - std::string suffix = "0.proj"; - if (img_mlp_0 != std::string::npos || img_mlp_2 != std::string::npos) { - prefix = ".ff.net."; - } - if (has_two) { - suffix = "2"; - } - size_t match = txt_mlp_0; - if (txt_mlp_2 != std::string::npos) { - match = txt_mlp_2; - } else if (img_mlp_0 != std::string::npos) { - match = img_mlp_0; - } else if (img_mlp_2 != std::string::npos) { - match = img_mlp_2; - } - match--; - lora_down_name = lora_pre[type] + key.substr(0, match) + prefix + suffix + lora_downs[type] + ".weight"; - if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { - lora_up_name = lora_pre[type] + 
key.substr(0, match) + prefix + suffix + lora_ups[type] + ".weight"; - if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { - lora_up = lora_tensors[lora_up_name]; - } - - if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { - lora_down = lora_tensors[lora_down_name]; - } - - applied_lora_tensors.insert(lora_down_name); - applied_lora_tensors.insert(lora_up_name); - } - } else if (txt_mod_lin != std::string::npos || img_mod_lin != std::string::npos) { - size_t match = txt_mod_lin; - std::string new_name = ".norm1_context.linear"; - if (img_mod_lin != std::string::npos) { - match = img_mod_lin; - new_name = ".norm1.linear"; + if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) { + lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]); } - match--; - lora_down_name = lora_pre[type] + key.substr(0, match) + new_name + lora_downs[type] + ".weight"; - if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { - lora_up_name = lora_pre[type] + key.substr(0, match) + new_name + lora_ups[type] + ".weight"; - if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { - lora_up = lora_tensors[lora_up_name]; - } + if (lora_tensors.find(split_k_d_name) != lora_tensors.end()) { + lora_k_down = to_f32(compute_ctx, lora_tensors[split_k_d_name]); + } - if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { - lora_down = lora_tensors[lora_down_name]; - } + if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) { + lora_k_up = to_f32(compute_ctx, lora_tensors[split_k_u_name]); + } - applied_lora_tensors.insert(lora_down_name); - applied_lora_tensors.insert(lora_up_name); + if (lora_tensors.find(split_v_d_name) != lora_tensors.end()) { + lora_v_down = to_f32(compute_ctx, lora_tensors[split_v_d_name]); } - } - } else if (sd_version_is_sd3(version)) { - size_t final_layer_adaLN_modulation = key.find("final_layer.adaLN_modulation.1"); - size_t pos_embed = key.find("pos_embed"); - size_t final_layer_linear = key.find("final_layer.linear"); - size_t y_embedder_mlp_0 = key.find("y_embedder.mlp.0"); - size_t y_embedder_mlp_2 = key.find("y_embedder.mlp.2"); - size_t t_embedder_mlp_0 = key.find("t_embedder.mlp.0"); - size_t t_embedder_mlp_2 = key.find("t_embedder.mlp.2"); - size_t x_block_mlp_fc1 = key.find("x_block.mlp.fc1"); - size_t x_block_mlp_fc2 = key.find("x_block.mlp.fc2"); - size_t context_block_mlp_fc1 = key.find("context_block.mlp.fc1"); - size_t context_block_mlp_fc2 = key.find("context_block.mlp.fc2"); - size_t x_block_adaLN_modulation_1 = key.find("x_block.adaLN_modulation.1"); - size_t context_block_adaLN_modulation_1 = key.find("context_block.adaLN_modulation.1"); - - size_t context_block_attn_proj = key.find("context_block.attn.proj"); - size_t x_block_attn_proj = key.find("x_block.attn.proj"); - size_t x_block_attn2_proj = key.find("x_block.attn2.proj"); - - size_t context_block_attn_qkv = key.find("context_block.attn.qkv"); - size_t x_block_attn_qkv = key.find("x_block.attn.qkv"); - size_t x_block_attn2_qkv = key.find("x_block.attn2.qkv"); - - size_t match = std::string::npos; - std::string new_name = ""; - if (final_layer_adaLN_modulation != std::string::npos) { - new_name = ".norm_out.linear"; - match = final_layer_adaLN_modulation; - } else if (pos_embed != std::string::npos) { - match = pos_embed; - new_name = ".pos_embed.proj"; - } else if (final_layer_linear != std::string::npos) { - match = final_layer_linear; - new_name = ".proj_out"; - } else if (y_embedder_mlp_0 != std::string::npos) { - match = y_embedder_mlp_0; - new_name = 
".time_text_embed.text_embedder.linear_1"; - } else if (y_embedder_mlp_2 != std::string::npos) { - match = y_embedder_mlp_2; - new_name = ".time_text_embed.text_embedder.linear_2"; - } else if (t_embedder_mlp_0 != std::string::npos) { - match = t_embedder_mlp_0; - new_name = ".time_text_embed.timestep_embedder.linear_1"; - } else if (t_embedder_mlp_2 != std::string::npos) { - match = t_embedder_mlp_2; - new_name = ".time_text_embed.timestep_embedder.linear_2"; - } else if (x_block_mlp_fc1 != std::string::npos) { - match = x_block_mlp_fc1; - new_name = ".ff.net.0.proj"; - } else if (x_block_mlp_fc2 != std::string::npos) { - match = x_block_mlp_fc2; - new_name = ".ff.net.2"; - } else if (context_block_mlp_fc1 != std::string::npos) { - match = context_block_mlp_fc1; - new_name = ".ff_context.net.0.proj"; - } else if (context_block_mlp_fc2 != std::string::npos) { - match = context_block_mlp_fc2; - new_name = ".ff_context.net.2"; - } else if (x_block_adaLN_modulation_1 != std::string::npos) { - match = x_block_adaLN_modulation_1; - new_name = ".norm1.linear"; - } else if (context_block_adaLN_modulation_1 != std::string::npos) { - match = context_block_adaLN_modulation_1; - new_name = ".norm1_context.linear"; - } else if (context_block_attn_proj != std::string::npos) { - match = context_block_attn_proj; - new_name = ".attn.to_add_out"; - } else if (x_block_attn_proj != std::string::npos) { - match = x_block_attn_proj; - new_name = ".attn.to_out.0"; - } else if (x_block_attn2_proj != std::string::npos) { - match = x_block_attn2_proj; - new_name = ".attn2.to_out.0"; - } - if (match != std::string::npos) { - match--; - lora_down_name = lora_pre[type] + key.substr(0, match) + new_name + lora_downs[type] + ".weight"; - if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { - lora_up_name = lora_pre[type] + key.substr(0, match) + new_name + lora_ups[type] + ".weight"; - if (lora_tensors.find(lora_up_name) != lora_tensors.end()) { - lora_up = lora_tensors[lora_up_name]; - } - - if (lora_tensors.find(lora_down_name) != lora_tensors.end()) { - lora_down = lora_tensors[lora_down_name]; - } - - applied_lora_tensors.insert(lora_down_name); - applied_lora_tensors.insert(lora_up_name); + if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) { + lora_v_up = to_f32(compute_ctx, lora_tensors[split_v_u_name]); } - } - std::string prefix = ""; - std::string suffix = ""; - - if (context_block_attn_qkv != std::string::npos) { - match = context_block_attn_qkv; - prefix = ".attn.add_"; - suffix = "_proj"; - } else if (x_block_attn_qkv != std::string::npos) { - match = x_block_attn_qkv; - prefix = ".attn.to_"; - suffix = ""; - } else if (x_block_attn2_qkv != std::string::npos) { - match = x_block_attn2_qkv; - prefix = ".attn2.to_"; - suffix = ""; - } - if (match != std::string::npos) { - match--; - auto split_q_d_name = lora_pre[type] + key.substr(0, match) + prefix + "q" + suffix + lora_downs[type] + ".weight"; - if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) { - // print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1] - // find qkv and mlp up parts in LoRA model - auto split_k_d_name = lora_pre[type] + key.substr(0, match) + prefix + "k" + suffix + lora_downs[type] + ".weight"; - auto split_v_d_name = lora_pre[type] + key.substr(0, match) + prefix + "v" + suffix + lora_downs[type] + ".weight"; - - auto split_q_u_name = lora_pre[type] + key.substr(0, match) + prefix + "q" + suffix + lora_ups[type] + ".weight"; - auto split_k_u_name = lora_pre[type] + key.substr(0, match) + prefix + "k" 
+ suffix + lora_ups[type] + ".weight"; - auto split_v_u_name = lora_pre[type] + key.substr(0, match) + prefix + "v" + suffix + lora_ups[type] + ".weight"; - - ggml_tensor* lora_q_down = NULL; - ggml_tensor* lora_q_up = NULL; - ggml_tensor* lora_k_down = NULL; - ggml_tensor* lora_k_up = NULL; - ggml_tensor* lora_v_down = NULL; - ggml_tensor* lora_v_up = NULL; - lora_q_down = to_f32(compute_ctx, lora_tensors[split_q_d_name]); + if (lora_tensors.find(split_m_d_name) != lora_tensors.end()) { + lora_m_down = to_f32(compute_ctx, lora_tensors[split_m_d_name]); + } - if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) { - lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]); - } - - if (lora_tensors.find(split_k_d_name) != lora_tensors.end()) { - lora_k_down = to_f32(compute_ctx, lora_tensors[split_k_d_name]); - } - - if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) { - lora_k_up = to_f32(compute_ctx, lora_tensors[split_k_u_name]); - } - - if (lora_tensors.find(split_v_d_name) != lora_tensors.end()) { - lora_v_down = to_f32(compute_ctx, lora_tensors[split_v_d_name]); - } - - if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) { - lora_v_up = to_f32(compute_ctx, lora_tensors[split_v_u_name]); - } - - // print_ggml_tensor(lora_q_down, true); //[hidden_size, R, 1, 1] - // print_ggml_tensor(lora_k_down, true); //[hidden_size, R, 1, 1] - // print_ggml_tensor(lora_v_down, true); //[hidden_size, R, 1, 1] - // print_ggml_tensor(lora_q_up, true); //[R, hidden_size, 1, 1] - // print_ggml_tensor(lora_k_up, true); //[R, hidden_size, 1, 1] - // print_ggml_tensor(lora_v_up, true); //[R, hidden_size, 1, 1] - - // these need to be stitched together this way: - // |q_up,0 ,0 | - // |0 ,k_up,0 | - // |0 ,0 ,v_up| - // (q_down,k_down,v_down) . 
(q ,k ,v) - - // up_concat will be [4608, R*3, 1, 1] - // down_concat will be [R*3, hidden_size, 1, 1] - ggml_tensor* lora_down_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_down, lora_k_down, 1), lora_v_down, 1); - - ggml_tensor* z = ggml_dup_tensor(compute_ctx, lora_q_up); - ggml_scale(compute_ctx, z, 0); - ggml_tensor* zz = ggml_concat(compute_ctx, z, z, 1); - - ggml_tensor* q_up = ggml_concat(compute_ctx, lora_q_up, zz, 1); - ggml_tensor* k_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, z, lora_k_up, 1), z, 1); - ggml_tensor* v_up = ggml_concat(compute_ctx, zz, lora_v_up, 1); - // print_ggml_tensor(q_up, true); //[R, hidden_size * 3, 1, 1] - // print_ggml_tensor(k_up, true); //[R, hidden_size * 3, 1, 1] - // print_ggml_tensor(v_up, true); //[R, hidden_size * 3, 1, 1] - ggml_tensor* lora_up_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, q_up, k_up, 0), v_up, 0); - // print_ggml_tensor(lora_up_concat, true); //[R*3, hidden_size * 3, 1, 1] - - lora_down = ggml_cont(compute_ctx, lora_down_concat); - lora_up = ggml_cont(compute_ctx, lora_up_concat); - - applied_lora_tensors.insert(split_q_u_name); - applied_lora_tensors.insert(split_k_u_name); - applied_lora_tensors.insert(split_v_u_name); - - applied_lora_tensors.insert(split_q_d_name); - applied_lora_tensors.insert(split_k_d_name); - applied_lora_tensors.insert(split_v_d_name); + if (lora_tensors.find(split_m_u_name) != lora_tensors.end()) { + lora_m_up = to_f32(compute_ctx, lora_tensors[split_m_u_name]); } + + // print_ggml_tensor(lora_q_down, true); //[3072, R, 1, 1] + // print_ggml_tensor(lora_k_down, true); //[3072, R, 1, 1] + // print_ggml_tensor(lora_v_down, true); //[3072, R, 1, 1] + // print_ggml_tensor(lora_m_down, true); //[3072, R, 1, 1] + // print_ggml_tensor(lora_q_up, true); //[R, 3072, 1, 1] + // print_ggml_tensor(lora_k_up, true); //[R, 3072, 1, 1] + // print_ggml_tensor(lora_v_up, true); //[R, 3072, 1, 1] + // print_ggml_tensor(lora_m_up, true); //[R, 12288, 1, 1] + + // these need to be stitched together this way: + // |q_up,0 ,0 ,0 | + // |0 ,k_up,0 ,0 | + // |0 ,0 ,v_up,0 | + // |0 ,0 ,0 ,m_up| + // (q_down,k_down,v_down,m_down) . 
(q ,k ,v ,m) + + // up_concat will be [21504, R*4, 1, 1] + // down_concat will be [R*4, 3072, 1, 1] + + ggml_tensor* lora_down_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_down, lora_k_down, 1), ggml_concat(compute_ctx, lora_v_down, lora_m_down, 1), 1); + // print_ggml_tensor(lora_down_concat, true); //[3072, R*4, 1, 1] + + // this also means that if rank is bigger than 672, it is less memory efficient to do it this way (should be fine) + // print_ggml_tensor(lora_q_up, true); //[3072, R, 1, 1] + ggml_tensor* z = ggml_dup_tensor(compute_ctx, lora_q_up); + ggml_tensor* mlp_z = ggml_dup_tensor(compute_ctx, lora_m_up); + ggml_scale(compute_ctx, z, 0); + ggml_scale(compute_ctx, mlp_z, 0); + ggml_tensor* zz = ggml_concat(compute_ctx, z, z, 1); + + ggml_tensor* q_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_up, zz, 1), mlp_z, 1); + ggml_tensor* k_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, z, lora_k_up, 1), ggml_concat(compute_ctx, z, mlp_z, 1), 1); + ggml_tensor* v_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, zz, lora_v_up, 1), mlp_z, 1); + ggml_tensor* m_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, zz, z, 1), lora_m_up, 1); + // print_ggml_tensor(q_up, true); //[R, 21504, 1, 1] + // print_ggml_tensor(k_up, true); //[R, 21504, 1, 1] + // print_ggml_tensor(v_up, true); //[R, 21504, 1, 1] + // print_ggml_tensor(m_up, true); //[R, 21504, 1, 1] + + ggml_tensor* lora_up_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, q_up, k_up, 0), ggml_concat(compute_ctx, v_up, m_up, 0), 0); + // print_ggml_tensor(lora_up_concat, true); //[R*4, 21504, 1, 1] + + lora_down = ggml_cont(compute_ctx, lora_down_concat); + lora_up = ggml_cont(compute_ctx, lora_up_concat); + + applied_lora_tensors.insert(split_q_u_name); + applied_lora_tensors.insert(split_k_u_name); + applied_lora_tensors.insert(split_v_u_name); + applied_lora_tensors.insert(split_m_u_name); + + applied_lora_tensors.insert(split_q_d_name); + applied_lora_tensors.insert(split_k_d_name); + applied_lora_tensors.insert(split_v_d_name); + applied_lora_tensors.insert(split_m_d_name); } } if (lora_up == NULL || lora_down == NULL) { @@ -788,6 +536,7 @@ struct LoraModel : public GGMLRunner { if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) { LOG_WARN("unused lora tensor %s", kv.first.c_str()); print_ggml_tensor(kv.second, true); + // exit(0); } else { applied_lora_tensors_count++; } From 1caefec6e1a0ac8806d4ad094a2db4c8dece25a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 13 Dec 2024 01:38:59 +0100 Subject: [PATCH 10/12] split qkv scales --- lora.hpp | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 118 insertions(+), 5 deletions(-) diff --git a/lora.hpp b/lora.hpp index ae8d5bfaf..012851a4e 100644 --- a/lora.hpp +++ b/lora.hpp @@ -247,10 +247,11 @@ struct LoraModel : public GGMLRunner { ggml_tensor* lora_up = NULL; ggml_tensor* lora_down = NULL; - std::string alpha_name = ""; - std::string scale_name = ""; - std::string lora_down_name = ""; - std::string lora_up_name = ""; + std::string alpha_name = ""; + std::string scale_name = ""; + std::string split_q_scale_name = ""; + std::string lora_down_name = ""; + std::string lora_up_name = ""; if (starts_with(key, "SPLIT|")) { key = key.substr(sizeof("SPLIT|") - 1); @@ -272,6 +273,14 @@ struct LoraModel : public GGMLRunner { auto split_k_u_name = lora_pre[type] + key + "k" + suffix + lora_ups[type] + ".weight"; auto split_v_u_name = lora_pre[type] + key 
+ "v" + suffix + lora_ups[type] + ".weight"; + auto split_q_scale_name = lora_pre[type] + key + "q" + suffix + ".scale"; + auto split_k_scale_name = lora_pre[type] + key + "k" + suffix + ".scale"; + auto split_v_scale_name = lora_pre[type] + key + "v" + suffix + ".scale"; + + auto split_q_alpha_name = lora_pre[type] + key + "q" + suffix + ".alpha"; + auto split_k_alpha_name = lora_pre[type] + key + "k" + suffix + ".alpha"; + auto split_v_alpha_name = lora_pre[type] + key + "v" + suffix + ".alpha"; + ggml_tensor* lora_q_down = NULL; ggml_tensor* lora_q_up = NULL; ggml_tensor* lora_k_down = NULL; @@ -301,6 +310,47 @@ struct LoraModel : public GGMLRunner { lora_v_up = to_f32(compute_ctx, lora_tensors[split_v_u_name]); } + float q_rank = lora_q_up->ne[0]; + float k_rank = lora_k_up->ne[0]; + float v_rank = lora_v_up->ne[0]; + + float lora_q_scale = 1; + float lora_k_scale = 1; + float lora_v_scale = 1; + + if (lora_tensors.find(split_q_scale_name) != lora_tensors.end()) { + lora_q_scale = ggml_backend_tensor_get_f32(lora_tensors[split_q_scale_name]); + applied_lora_tensors.insert(split_q_scale_name); + } + if (lora_tensors.find(split_k_scale_name) != lora_tensors.end()) { + lora_k_scale = ggml_backend_tensor_get_f32(lora_tensors[split_k_scale_name]); + applied_lora_tensors.insert(split_k_scale_name); + } + if (lora_tensors.find(split_v_scale_name) != lora_tensors.end()) { + lora_v_scale = ggml_backend_tensor_get_f32(lora_tensors[split_v_scale_name]); + applied_lora_tensors.insert(split_v_scale_name); + } + + if (lora_tensors.find(split_q_alpha_name) != lora_tensors.end()) { + float lora_q_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_q_alpha_name]); + applied_lora_tensors.insert(split_q_alpha_name); + lora_q_scale = lora_q_alpha / q_rank; + } + if (lora_tensors.find(split_k_alpha_name) != lora_tensors.end()) { + float lora_k_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_k_alpha_name]); + applied_lora_tensors.insert(split_k_alpha_name); + lora_k_scale = lora_k_alpha / k_rank; + } + if (lora_tensors.find(split_v_alpha_name) != lora_tensors.end()) { + float lora_v_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_v_alpha_name]); + applied_lora_tensors.insert(split_v_alpha_name); + lora_v_scale = lora_v_alpha / v_rank; + } + + ggml_scale_inplace(compute_ctx, lora_q_down, lora_q_scale); + ggml_scale_inplace(compute_ctx, lora_k_down, lora_k_scale); + ggml_scale_inplace(compute_ctx, lora_v_down, lora_v_scale); + // print_ggml_tensor(lora_q_down, true); //[3072, R, 1, 1] // print_ggml_tensor(lora_k_down, true); //[3072, R, 1, 1] // print_ggml_tensor(lora_v_down, true); //[3072, R, 1, 1] @@ -360,6 +410,16 @@ struct LoraModel : public GGMLRunner { auto split_m_d_name = lora_pre[type] + key + "proj_mlp" + lora_downs[type] + ".weight"; auto split_m_u_name = lora_pre[type] + key + "proj_mlp" + lora_ups[type] + ".weight"; + auto split_q_scale_name = lora_pre[type] + key + "attn.to_q" + ".scale"; + auto split_k_scale_name = lora_pre[type] + key + "attn.to_k" + ".scale"; + auto split_v_scale_name = lora_pre[type] + key + "attn.to_v" + ".scale"; + auto split_m_scale_name = lora_pre[type] + key + "proj_mlp" + ".scale"; + + auto split_q_alpha_name = lora_pre[type] + key + "attn.to_q" + ".alpha"; + auto split_k_alpha_name = lora_pre[type] + key + "attn.to_k" + ".alpha"; + auto split_v_alpha_name = lora_pre[type] + key + "attn.to_v" + ".alpha"; + auto split_m_alpha_name = lora_pre[type] + key + "proj_mlp" + ".alpha"; + ggml_tensor* lora_q_down = NULL; ggml_tensor* lora_q_up = NULL; 
ggml_tensor* lora_k_down = NULL; @@ -404,6 +464,59 @@ struct LoraModel : public GGMLRunner { lora_m_up = to_f32(compute_ctx, lora_tensors[split_m_u_name]); } + float q_rank = lora_q_up->ne[0]; + float k_rank = lora_k_up->ne[0]; + float v_rank = lora_v_up->ne[0]; + float m_rank = lora_v_up->ne[0]; + + float lora_q_scale = 1; + float lora_k_scale = 1; + float lora_v_scale = 1; + float lora_m_scale = 1; + + if (lora_tensors.find(split_q_scale_name) != lora_tensors.end()) { + lora_q_scale = ggml_backend_tensor_get_f32(lora_tensors[split_q_scale_name]); + applied_lora_tensors.insert(split_q_scale_name); + } + if (lora_tensors.find(split_k_scale_name) != lora_tensors.end()) { + lora_k_scale = ggml_backend_tensor_get_f32(lora_tensors[split_k_scale_name]); + applied_lora_tensors.insert(split_k_scale_name); + } + if (lora_tensors.find(split_v_scale_name) != lora_tensors.end()) { + lora_v_scale = ggml_backend_tensor_get_f32(lora_tensors[split_v_scale_name]); + applied_lora_tensors.insert(split_v_scale_name); + } + if (lora_tensors.find(split_m_scale_name) != lora_tensors.end()) { + lora_m_scale = ggml_backend_tensor_get_f32(lora_tensors[split_m_scale_name]); + applied_lora_tensors.insert(split_m_scale_name); + } + + if (lora_tensors.find(split_q_alpha_name) != lora_tensors.end()) { + float lora_q_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_q_alpha_name]); + applied_lora_tensors.insert(split_q_alpha_name); + lora_q_scale = lora_q_alpha / q_rank; + } + if (lora_tensors.find(split_k_alpha_name) != lora_tensors.end()) { + float lora_k_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_k_alpha_name]); + applied_lora_tensors.insert(split_k_alpha_name); + lora_k_scale = lora_k_alpha / k_rank; + } + if (lora_tensors.find(split_v_alpha_name) != lora_tensors.end()) { + float lora_v_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_v_alpha_name]); + applied_lora_tensors.insert(split_v_alpha_name); + lora_v_scale = lora_v_alpha / v_rank; + } + if (lora_tensors.find(split_m_alpha_name) != lora_tensors.end()) { + float lora_m_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_m_alpha_name]); + applied_lora_tensors.insert(split_m_alpha_name); + lora_m_scale = lora_m_alpha / m_rank; + } + + ggml_scale_inplace(compute_ctx, lora_q_down, lora_q_scale); + ggml_scale_inplace(compute_ctx, lora_k_down, lora_k_scale); + ggml_scale_inplace(compute_ctx, lora_v_down, lora_v_scale); + ggml_scale_inplace(compute_ctx, lora_m_down, lora_m_scale); + // print_ggml_tensor(lora_q_down, true); //[3072, R, 1, 1] // print_ggml_tensor(lora_k_down, true); //[3072, R, 1, 1] // print_ggml_tensor(lora_v_down, true); //[3072, R, 1, 1] @@ -534,7 +647,7 @@ struct LoraModel : public GGMLRunner { for (auto& kv : lora_tensors) { total_lora_tensors_count++; if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) { - LOG_WARN("unused lora tensor %s", kv.first.c_str()); + LOG_WARN("unused lora tensor |%s|", kv.first.c_str()); print_ggml_tensor(kv.second, true); // exit(0); } else { From 68c23f3e2919d8b06bc2e8b20ee6b8b43efa1d63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 13 Dec 2024 20:06:31 +0100 Subject: [PATCH 11/12] Make sure lora tensors are only applied once -_- --- lora.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lora.hpp b/lora.hpp index 012851a4e..1ca781f07 100644 --- a/lora.hpp +++ b/lora.hpp @@ -243,10 +243,9 @@ struct LoraModel : public GGMLRunner { std::vector keys = to_lora_keys(k_tensor, version); if (keys.size() == 0) continue; + 
ggml_tensor* lora_up = NULL; + ggml_tensor* lora_down = NULL; for (auto& key : keys) { - ggml_tensor* lora_up = NULL; - ggml_tensor* lora_down = NULL; - std::string alpha_name = ""; std::string scale_name = ""; std::string split_q_scale_name = ""; @@ -639,6 +638,7 @@ struct LoraModel : public GGMLRunner { } // final_weight = ggml_add_inplace(compute_ctx, weight, updown); // apply directly ggml_build_forward_expand(gf, final_weight); + break; } } size_t total_lora_tensors_count = 0; From e54d604968ed9ae8960d77b4d174319ee3715a20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 14 Dec 2024 01:53:08 +0100 Subject: [PATCH 12/12] lora: better photomaker support? --- lora.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lora.hpp b/lora.hpp index 1ca781f07..83181837c 100644 --- a/lora.hpp +++ b/lora.hpp @@ -180,6 +180,7 @@ struct LoraModel : public GGMLRunner { blk_name = blk_name.substr(0, k_pos); // } keys.push_back(blk_name); + keys.push_back("lora." + blk_name); if (sd_version_is_dit(version)) { if (blk_name.find("model.diffusion_model") != std::string::npos) { blk_name.replace(blk_name.find("model.diffusion_model"), sizeof("model.diffusion_model") - 1, "transformer");
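A minimal standalone sketch of the block-diagonal stitching described in the comments above (not part of any patch; plain C++ with row-major float buffers instead of ggml tensors, and the dimensions, file name and helper names are illustrative only): the split q/k/v down matrices are stacked along the rank dimension, the up matrices are placed on the diagonal of a zero matrix, and up_cat * down_cat then reproduces each per-projection LoRA delta inside the fused qkv weight.

// qkv_merge_sketch.cpp -- illustration of the math only, not the ggml code path in lora.hpp.
#include <cstdio>
#include <vector>

using Mat = std::vector<float>; // row-major, rows * cols

// C(m x n) = A(m x k) * B(k x n)
static Mat matmul(const Mat& A, const Mat& B, int m, int k, int n) {
    Mat C(m * n, 0.0f);
    for (int i = 0; i < m; i++)
        for (int p = 0; p < k; p++)
            for (int j = 0; j < n; j++)
                C[i * n + j] += A[i * k + p] * B[p * n + j];
    return C;
}

int main() {
    const int R = 2, IN = 4, OUT = 4; // rank, input dim, per-projection output dim

    // Per-projection LoRA factors: down is (R x IN), up is (OUT x R).
    Mat dq(R * IN, 0.01f), dk(R * IN, 0.02f), dv(R * IN, 0.03f);
    Mat uq(OUT * R, 1.0f), uk(OUT * R, 2.0f), uv(OUT * R, 3.0f);

    // down_cat: stack the three down matrices along the rank dimension -> (3R x IN).
    Mat down_cat;
    for (const Mat* d : {&dq, &dk, &dv})
        down_cat.insert(down_cat.end(), d->begin(), d->end());

    // up_cat: block-diagonal placement of the up matrices -> (3*OUT x 3R),
    //   | uq  0   0  |
    //   | 0   uk  0  |
    //   | 0   0   uv |
    Mat up_cat(3 * OUT * 3 * R, 0.0f);
    const Mat* ups[3] = {&uq, &uk, &uv};
    for (int b = 0; b < 3; b++)
        for (int i = 0; i < OUT; i++)
            for (int r = 0; r < R; r++)
                up_cat[(b * OUT + i) * (3 * R) + (b * R + r)] = (*ups[b])[i * R + r];

    // Delta for the fused qkv weight: (3*OUT x IN) = up_cat * down_cat.
    // Rows [0, OUT) only see uq*dq, rows [OUT, 2*OUT) only uk*dk, and so on,
    // which is exactly the per-projection LoRA update applied to the fused weight.
    Mat delta = matmul(up_cat, down_cat, 3 * OUT, 3 * R, IN);
    printf("delta[0][0]=%g  delta[%d][0]=%g  delta[%d][0]=%g\n",
           delta[0], OUT, delta[OUT * IN], 2 * OUT, delta[2 * OUT * IN]);
    return 0;
}

The same construction extends to the single-stream linear1 case by adding a fourth proj_mlp block, at the cost of growing the effective rank to 3R or 4R; that is the trade-off the patch comments flag for very large ranks.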