@@ -211,8 +211,9 @@ struct lora_merge_ctx {
             }
         }
 
-        // if true, this tensor can be lora-merged. if false, we skip merging and just copy data to outfile
-        std::vector<std::pair<struct ggml_tensor *, bool>> base_tensors;
+        // mapping base tensor to out tensor (same shape with base, but different type)
+        // if out_tensor == nullptr, we only copy it
+        std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
         for (auto & it : base_model.tensors) {
             bool t_a = true;
             bool t_b = true;
@@ -221,22 +222,22 @@ struct lora_merge_ctx {
                 t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
             }
             auto base_tensor = it.second;
-            struct ggml_tensor * out_tensor;
             if (!t_a && !t_b) {
                 // only copy
-                out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
-                ggml_set_name(out_tensor, base_tensor->name);
-                base_tensors.push_back(std::make_pair(out_tensor, false));
+                struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
+                ggml_set_name(cpy_tensor, base_tensor->name);
+                base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
+                gguf_add_tensor(ctx_out, cpy_tensor);
             } else if (t_a && t_b) {
                 // need merging
-                out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
-                out_tensor->type = get_out_tensor_type(base_tensor);
+                struct ggml_tensor * out_tensor = ggml_new_tensor(
+                    ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
                 ggml_set_name(out_tensor, base_tensor->name);
-                base_tensors.push_back(std::make_pair(out_tensor, true));
+                base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
+                gguf_add_tensor(ctx_out, out_tensor);
             } else {
                 throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
             }
-            gguf_add_tensor(ctx_out, out_tensor);
         }
 
         // placeholder for the meta data
@@ -247,9 +248,9 @@ struct lora_merge_ctx {
 
         // process base model tensors
         size_t n_merged = 0;
-        for (auto & it : base_tensors) {
-            if (it.second) {
-                merge_tensor(it.first);
+        for (auto & it : base_to_out_tensors) {
+            if (it.second != nullptr) {
+                merge_tensor(it.first, it.second);
                 n_merged++;
             } else {
                 copy_tensor(it.first);
@@ -265,7 +266,7 @@ struct lora_merge_ctx {
         }
 
         printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
-        printf("%s : wrote %ld tensors to output file\n", __func__, base_tensors.size());
+        printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
     }
 
     void copy_tensor(struct ggml_tensor * base) {
@@ -276,7 +277,7 @@ struct lora_merge_ctx {
         zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
     }
 
-    void merge_tensor(struct ggml_tensor * base) {
+    void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) {
         std::string name_base(base->name);
         std::string name_lora_a = name_base + ".lora_a";
         std::string name_lora_b = name_base + ".lora_b";
@@ -287,14 +288,14 @@ struct lora_merge_ctx {
         std::vector<struct ggml_tensor *> inp_a(adapters.size());
         std::vector<struct ggml_tensor *> inp_b(adapters.size());
         struct ggml_init_params params {
-            /*.mem_size   =*/ ggml_tensor_overhead()*(1+adapters.size()*2),
+            /*.mem_size   =*/ ggml_tensor_overhead()*(2+adapters.size()*2),
             /*.mem_buffer =*/ NULL,
             /*.no_alloc   =*/ true,
         };
         struct ggml_context * ctx = ggml_init(params);
 
         // alloc tensors
-        struct ggml_tensor * inp = ggml_dup_tensor(ctx, base);
+        struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne);
         for (size_t i = 0; i < adapters.size(); ++i) {
             auto t_a = adapters[i]->get_tensor(name_lora_a);
             auto t_b = adapters[i]->get_tensor(name_lora_b);
@@ -303,9 +304,21 @@ struct lora_merge_ctx {
         }
         ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
 
-        // load data to backend buffer
+        // load base tensor to backend buffer
         base_model.read_tensor_data(name_base, read_buf);
-        ggml_backend_tensor_set(inp, read_buf.data(), 0, ggml_nbytes(inp));
+        if (base->type != GGML_TYPE_F32) {
+            // optionally dequantize it
+            printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
+            auto nels = ggml_nelements(inp_base);
+            ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
+            std::vector<uint8_t> dequant_buf(nels * sizeof(float));
+            qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
+            ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
+        } else {
+            ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
+        }
+
+        // load lora tensors to backend buffer
         for (size_t i = 0; i < adapters.size(); ++i) {
             adapters[i]->read_tensor_data(name_lora_a, read_buf);
             ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
@@ -325,20 +338,21 @@ struct lora_merge_ctx {
             };
             struct ggml_context * ctx0 = ggml_init(params0);
             gf = ggml_new_graph(ctx0);
-            struct ggml_tensor * cur = inp;
+            struct ggml_tensor * cur = inp_base;
             for (size_t i = 0; i < adapters.size(); ++i) {
-                struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, inp_a[i]));
-                struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, inp_b[i]);
+                struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
+                struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
                 // scale
                 const float alpha = adapters[i]->alpha;
                 const float rank = (float) inp_b[i]->ne[0];
                 const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
                 delta = ggml_scale(ctx0, delta, scale);
-                cur = ggml_add(ctx0, cur, delta);
-                printf("%s : + merging from adapter[%ld]\n", __func__, i);
+                cur = ggml_add(ctx0, delta, cur);
+                printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
                 printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
             }
-            cur = ggml_cast(ctx0, cur, get_out_tensor_type(base));
+            cur = ggml_cast(ctx0, cur, out->type);
+            printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type));
             ggml_build_forward_expand(gf, cur);
             ggml_free(ctx0);
         }
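
Note: for reference, the arithmetic that the graph above builds per adapter is W_out = W_base + scale * (B * A^T), with scale = adapter_scale * alpha / rank when alpha is non-zero. Below is a minimal plain-C++ sketch of that formula, not part of the patch; the matrix layout, dimensions, and the helper name add_lora_delta are illustrative assumptions (the real code does this through ggml tensors on a backend).

// standalone illustration of the LoRA merge formula (assumed row-major layout)
#include <cstdio>
#include <vector>

// hypothetical helper: W (n_out x n_in) += scale * B (n_out x rank) * A^T,
// where A is stored as (n_in x rank), i.e. already transposed relative to B
static void add_lora_delta(std::vector<float> & W,
                           const std::vector<float> & A,
                           const std::vector<float> & B,
                           int n_out, int n_in, int rank, float scale) {
    for (int r = 0; r < n_out; ++r) {
        for (int c = 0; c < n_in; ++c) {
            float delta = 0.0f;
            for (int k = 0; k < rank; ++k) {
                delta += B[r * rank + k] * A[c * rank + k]; // (B * A^T)[r][c]
            }
            W[r * n_in + c] += scale * delta;
        }
    }
}

int main() {
    const int n_out = 4, n_in = 3, rank = 2;
    const float alpha = 16.0f, adapter_scale = 1.0f;
    const float scale = adapter_scale * alpha / rank;  // same formula as in merge_tensor()

    std::vector<float> W(n_out * n_in, 1.0f);   // stands in for the (dequantized) base tensor
    std::vector<float> A(n_in * rank, 0.5f);    // stands in for inp_a[i]
    std::vector<float> B(n_out * rank, 0.25f);  // stands in for inp_b[i]

    add_lora_delta(W, A, B, n_out, n_in, rank, scale);
    printf("W[0][0] after merge = %f\n", W[0]); // 1.0 + 8.0 * (0.25*0.5 + 0.25*0.5) = 3.0
    return 0;
}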