
Commit 966d0e0

export separate lora ggufs instead
1 parent 9a39ccb commit 966d0e0

13 files changed, +85 -82 lines changed


common/arg.cpp

Lines changed: 2 additions & 2 deletions
@@ -2460,15 +2460,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & value) {
-            params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
+            params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
         {"--lora-scaled"}, "FNAME", "SCALE",
         "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & fname, const std::string & scale) {
-            params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
+            params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));

common/common.cpp

Lines changed: 2 additions & 0 deletions
@@ -993,6 +993,8 @@ struct common_init_result common_init_from_params(common_params & params) {
         }

         la.ptr = lora.get();
+        la.task_name = llama_adapter_lora_task_name(la.ptr);
+        la.prompt_prefix = llama_adapter_lora_prompt_prefix(la.ptr);
         iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }

common/common.h

Lines changed: 3 additions & 0 deletions
@@ -31,6 +31,9 @@ struct common_adapter_lora_info {
     std::string path;
     float scale;

+    std::string task_name;
+    std::string prompt_prefix;
+
     struct llama_adapter_lora * ptr;
 };
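
For reference, the two empty strings added to the brace-initializers in common/arg.cpp above line up with this new field order. A minimal sketch of the mapping (the adapter path is a hypothetical example, not taken from the commit):

    // Aggregate initialization order: path, scale, task_name, prompt_prefix, ptr
    common_adapter_lora_info la = {
        "my-adapter.gguf", // path (hypothetical example)
        1.0,               // scale
        "",                // task_name     -- filled in from GGUF metadata after the adapter is loaded
        "",                // prompt_prefix -- likewise, see common_init_from_params above
        nullptr,           // ptr           -- set once the adapter is loaded
    };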

convert_hf_to_gguf.py

Lines changed: 49 additions & 18 deletions
@@ -64,6 +64,7 @@ class ModelBase:
     endianess: gguf.GGUFEndian
     use_temp_file: bool
     lazy: bool
+    dry_run: bool
     part_names: list[str]
     is_safetensors: bool
     hparams: dict[str, Any]
@@ -98,6 +99,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.use_temp_file = use_temp_file
         self.lazy = not eager or (remote_hf_model_id is not None)
+        self.dry_run = dry_run
         self.remote_hf_model_id = remote_hf_model_id
         if remote_hf_model_id is not None:
             self.is_safetensors = True
@@ -4153,18 +4155,31 @@ def modify_tensors(self, data_torch, name, bid):
 @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
 class XLMRobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT
+    _lora_files = {}

     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
         hparams = kwargs.pop("hparams", None)
         if hparams is None:
             hparams = ModelBase.load_hparams(dir_model)

-        if hparams.get("lora_adaptations"):
+        if lora_names := hparams.get("lora_adaptations"):
             self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3

         super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
+
+        if lora_names:
+            for name in lora_names:
+                fname = self.add_prefix_to_filename(self.fname_out, f"lora-{name}-")
+                self._lora_files[name] = gguf.GGUFWriter(fname, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, dry_run=self.dry_run)
+
         self._xlmroberta_tokenizer_init()

+    def set_type(self):
+        for lora_writer in self._lora_files.values():
+            lora_writer.add_type(gguf.GGUFType.ADAPTER)
+            lora_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
+        super().set_type()
+
     def set_vocab(self):
         self._xlmroberta_set_vocab()

@@ -4185,36 +4200,52 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if self._position_offset is not None:
             data_torch = data_torch[self._position_offset:,:]

-        if name.endswith(".weight.0.lora_A") or name.endswith(".weight.0.lora_B"):
+        if name.endswith(".0.lora_A") or name.endswith(".0.lora_B"):
             if name.startswith("pooler.dense"):
-                return
+                return []

-            lora_name = self.hparams["lora_adaptations"]
             num_loras = data_torch.size(0)
-            assert num_loras == len(lora_name)
+            assert num_loras == len(self._lora_files)
+
+            # Split out each LoRA in their own GGUF
+            for i, lora_writer in enumerate(self._lora_files.values()):
+                new_name = self.map_tensor_name(name[:-9]) + name[-7:].lower()
+                data_qtype = gguf.GGMLQuantizationType.F32
+                data = data_torch[i, :, :]
+                # Transpose/flip token_embd/types into correct shape
+                if new_name == "token_embd.weight.lora_b":
+                    data = data.T
+                elif new_name.startswith("token_types.weight."):
+                    new_name = new_name[:-1] + ("a" if new_name[-1:] == "b" else "b")
+                data = gguf.quants.quantize(data.numpy(), data_qtype)
+                lora_writer.add_tensor(new_name, data, raw_dtype=data_qtype)

-            # Split out each LoRA in their own named tensors
-            # Remove "weight" from the name to not confuse quantize
-            for i in range(num_loras):
-                data_lora = data_torch[i, :, :]
-                yield (self.map_tensor_name(name[:-16]) + name[-16:].lower().replace("weight.0.", f"<{lora_name[i]}>"), data_lora)
-            return
+            return []

-        yield from super().modify_tensors(data_torch, name, bid)
+        return super().modify_tensors(data_torch, name, bid)

     def set_gguf_parameters(self):
         super().set_gguf_parameters()

         # jina-embeddings-v3
         if rotary_emb_base := self.hparams.get("rotary_emb_base"):
             self.gguf_writer.add_rope_freq_base(rotary_emb_base)
-        if lora_alpha := self.hparams.get("lora_alpha"):
-            self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, lora_alpha)
-        if lora_names := self.hparams.get("lora_adaptations"):
-            self.gguf_writer.add_array(gguf.Keys.Adapter.LORA_NAMES, lora_names)
+        lora_alpha = self.hparams.get("lora_alpha")
         if lora_prompt_prefixes := self.hparams.get("task_instructions"):
-            assert lora_names and all(lora_name in lora_prompt_prefixes for lora_name in lora_names)
-            self.gguf_writer.add_array(gguf.Keys.Adapter.LORA_PROMPT_PREFIXES, [lora_prompt_prefixes[lora_name] for lora_name in lora_names])
+            assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys())
+        for lora_name, lora_writer in self._lora_files.items():
+            lora_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, lora_alpha if lora_alpha is not None else 1.0)
+            lora_writer.add_string(gguf.Keys.Adapter.LORA_TASK_NAME, lora_name)
+            if lora_prompt_prefixes:
+                lora_writer.add_string(gguf.Keys.Adapter.LORA_PROMPT_PREFIX, lora_prompt_prefixes[lora_name])
+
+    def write(self):
+        super().write()
+        for lora_writer in self._lora_files.values():
+            lora_writer.write_header_to_file()
+            lora_writer.write_kv_data_to_file()
+            lora_writer.write_tensors_to_file(progress=True)
+            lora_writer.close()


 @ModelBase.register("GemmaForCausalLM")

gguf-py/gguf/constants.py

Lines changed: 4 additions & 4 deletions
@@ -227,10 +227,10 @@ class Tokenizer:
         MIDDLE_ID = "tokenizer.ggml.middle_token_id"

     class Adapter:
-        TYPE = "adapter.type"
-        LORA_ALPHA = "adapter.lora.alpha"
-        LORA_NAMES = "adapter.lora.names"
-        LORA_PROMPT_PREFIXES = "adapter.lora.prompt_prefixes"
+        TYPE = "adapter.type"
+        LORA_ALPHA = "adapter.lora.alpha"
+        LORA_TASK_NAME = "adapter.lora.task_name"
+        LORA_PROMPT_PREFIX = "adapter.lora.prompt_prefix"

     class Clip:
         PROJECTOR_TYPE = "clip.projector_type"

include/llama.h

Lines changed: 6 additions & 0 deletions
@@ -588,6 +588,12 @@ extern "C" {
             struct llama_model * model,
             const char * path_lora);

+    // Get the LoRA task name. Returns a blank string if not applicable
+    LLAMA_API const char * llama_adapter_lora_task_name(struct llama_adapter_lora * adapter);
+
+    // Get the required LoRA prompt prefix. Returns a blank string if not applicable
+    LLAMA_API const char * llama_adapter_lora_prompt_prefix(struct llama_adapter_lora * adapter);
+
     // Manually free a LoRA adapter
     // Note: loaded adapters will be free when the associated model is deleted
     LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
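
A minimal usage sketch for the two new accessors, assuming the usual llama.h model-loading entry points; the file paths are placeholders and error handling is kept to the essentials:

    #include <cstdio>
    #include "llama.h"

    int main(void) {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_model_load_from_file("base-model.gguf", mparams); // placeholder path
        if (model == nullptr) {
            return 1;
        }

        llama_adapter_lora * adapter = llama_adapter_lora_init(model, "lora-adapter.gguf"); // placeholder path
        if (adapter != nullptr) {
            // Both accessors return a blank string when the adapter GGUF carries no such metadata.
            printf("task name    : %s\n", llama_adapter_lora_task_name(adapter));
            printf("prompt prefix: %s\n", llama_adapter_lora_prompt_prefix(adapter));
            llama_adapter_lora_free(adapter);
        }

        llama_model_free(model);
        llama_backend_free();
        return 0;
    }

This mirrors what common_init_from_params now does when it copies the two strings into common_adapter_lora_info (see common/common.cpp above).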

src/llama-adapter.cpp

Lines changed: 10 additions & 0 deletions
@@ -190,6 +190,8 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
         }

         adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
+        adapter.task_name = get_kv_str(llm_kv(LLM_KV_ADAPTER_LORA_TASK_NAME));
+        adapter.prompt_prefix = get_kv_str(llm_kv(LLM_KV_ADAPTER_LORA_PROMPT_PREFIX));
     }

     int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
@@ -383,6 +385,14 @@ llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * p
     return nullptr;
 }

+const char * llama_adapter_lora_task_name(llama_adapter_lora * adapter) {
+    return adapter->task_name.c_str();
+}
+
+const char * llama_adapter_lora_prompt_prefix(llama_adapter_lora * adapter) {
+    return adapter->prompt_prefix.c_str();
+}
+
 void llama_adapter_lora_free(llama_adapter_lora * adapter) {
     delete adapter;
 }

src/llama-adapter.h

Lines changed: 1 addition & 0 deletions
@@ -66,6 +66,7 @@ struct llama_adapter_lora {
     std::vector<ggml_backend_buffer_ptr> bufs;

     float alpha;
+    std::string task_name;
     std::string prompt_prefix;

     llama_adapter_lora() = default;

src/llama-arch.cpp

Lines changed: 4 additions & 4 deletions
@@ -217,10 +217,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
     { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },

-    { LLM_KV_ADAPTER_TYPE, "adapter.type" },
-    { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
-    { LLM_KV_ADAPTER_LORA_NAMES, "adapter.lora.names" },
-    { LLM_KV_ADAPTER_LORA_PROMPT_PREFIXES, "adapter.lora.prompt_prefixes" },
+    { LLM_KV_ADAPTER_TYPE, "adapter.type" },
+    { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
+    { LLM_KV_ADAPTER_LORA_TASK_NAME, "adapter.lora.task_name" },
+    { LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, "adapter.lora.prompt_prefix" },

     // deprecated
     { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },

src/llama-arch.h

Lines changed: 2 additions & 2 deletions
@@ -215,8 +215,8 @@ enum llm_kv {

     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,
-    LLM_KV_ADAPTER_LORA_NAMES,
-    LLM_KV_ADAPTER_LORA_PROMPT_PREFIXES,
+    LLM_KV_ADAPTER_LORA_TASK_NAME,
+    LLM_KV_ADAPTER_LORA_PROMPT_PREFIX,

     LLM_KV_POSNET_EMBEDDING_LENGTH,
     LLM_KV_POSNET_BLOCK_COUNT,
