Commit ceeddd4

Merge branch 'eso_b6311' into crokeso

2 parents 9602109 + 02f4240

31 files changed: +1412 -434 lines changed

common/arg.cpp

Lines changed: 19 additions & 3 deletions
@@ -1109,7 +1109,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
     printf("\"\n\n");
 
     printf("    case \"$prev\" in\n");
-    printf("        --model)\n");
+    printf("        --model|-m)\n");
     printf("            COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
     printf("            return 0\n");
     printf("            ;;\n");
@@ -2556,15 +2556,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & value) {
-            params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
+            params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
         {"--lora-scaled"}, "FNAME", "SCALE",
         "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & fname, const std::string & scale) {
-            params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
+            params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
@@ -3539,6 +3539,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
+    add_opt(common_arg(
+        {"--fim-qwen-30b-default"},
+        string_format("use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
+            params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
     add_opt(common_arg(
         { "--diffusion-steps" }, "N",
         string_format("number of diffusion steps (default: %d)", params.diffusion.steps),

common/common.cpp

Lines changed: 5 additions & 0 deletions
@@ -1010,7 +1010,12 @@ struct common_init_result common_init_from_params(common_params & params) {
             return iparams;
         }
 
+        char buf[1024];
         la.ptr = lora.get();
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
+        la.task_name = buf;
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
+        la.prompt_prefix = buf;
         iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
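
Both metadata reads above share one stack buffer and assign it unconditionally. A minimal defensive variant, assuming llama_adapter_meta_val_str follows the llama_model_meta_val_str convention of returning a negative value when the key is absent (that convention is an assumption, not shown in this diff):

    char buf[1024];
    // assign the field only when the key exists, so a missing key does not
    // pick up leftover contents from the previous read
    if (llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf)) >= 0) {
        la.task_name = buf;
    }
    if (llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf)) >= 0) {
        la.prompt_prefix = buf;
    }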

common/common.h

Lines changed: 3 additions & 0 deletions
@@ -34,6 +34,9 @@ struct common_adapter_lora_info {
     std::string path;
     float scale;
 
+    std::string task_name;
+    std::string prompt_prefix;
+
     struct llama_adapter_lora * ptr;
 };
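
The two new fields travel with every loaded adapter, so callers can read the task name and prompt prefix without touching GGUF metadata again. An illustrative sketch only; the helper below is hypothetical and not part of this commit:

    #include <string>
    #include "common.h"   // common_params, common_adapter_lora_info

    // hypothetical helper: prepend the prompt prefix of each active adapter
    // (e.g. a jina-embeddings-v3 task instruction) before tokenization
    static std::string apply_lora_prompt_prefix(const common_params & params, std::string input) {
        for (const common_adapter_lora_info & la : params.lora_adapters) {
            if (!la.prompt_prefix.empty()) {
                input = la.prompt_prefix + input;
            }
        }
        return input;
    }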

convert_hf_to_gguf.py

Lines changed: 77 additions & 2 deletions
@@ -72,6 +72,7 @@ class ModelBase:
     endianess: gguf.GGUFEndian
     use_temp_file: bool
     lazy: bool
+    dry_run: bool
     part_names: list[str]
     is_safetensors: bool
     hparams: dict[str, Any]
@@ -109,6 +110,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.use_temp_file = use_temp_file
         self.lazy = not eager or (remote_hf_model_id is not None)
+        self.dry_run = dry_run
         self.remote_hf_model_id = remote_hf_model_id
         if remote_hf_model_id is not None:
             self.is_safetensors = True
@@ -5188,11 +5190,35 @@ def modify_tensors(self, data_torch, name, bid):
 @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
 class XLMRobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT
+    _lora_files = {}
+    _lora_names = []
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
+        hparams = kwargs.pop("hparams", None)
+        if hparams is None:
+            hparams = ModelBase.load_hparams(dir_model, False)
+
+        if lora_names := hparams.get("lora_adaptations"):
+            self._lora_names = lora_names
+            self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3
+
+        super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
         self._xlmroberta_tokenizer_init()
 
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if self._lora_names:
+            for name in self._lora_names:
+                fname = self.add_prefix_to_filename(self.fname_out, f"lora-{name}-")
+                self._lora_files[name] = gguf.GGUFWriter(fname, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, dry_run=self.dry_run)
+
+        return super().generate_extra_tensors()
+
+    def set_type(self):
+        for lora_writer in self._lora_files.values():
+            lora_writer.add_type(gguf.GGUFType.ADAPTER)
+            lora_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
+        super().set_type()
+
     def set_vocab(self):
         self._xlmroberta_set_vocab()
 
@@ -5202,13 +5228,62 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if name.startswith("roberta."):
             name = name[8:]
 
+        # jina-embeddings-v3
+        if ".parametrizations." in name:
+            name = name.replace(".parametrizations.", ".")
+            if name.endswith(".original"):
+                name = name[:-9]
+
         # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
         if name == "embeddings.position_embeddings.weight":
             if self._position_offset is not None:
                 data_torch = data_torch[self._position_offset:,:]
 
+        if name.endswith(".0.lora_A") or name.endswith(".0.lora_B"):
+            if name.startswith("pooler.dense"):
+                return []
+
+            num_loras = data_torch.size(0)
+            assert num_loras == len(self._lora_names)
+
+            # Split out each LoRA in their own GGUF
+            for i, lora_writer in enumerate(self._lora_files.values()):
+                new_name = self.map_tensor_name(name[:-9]) + name[-7:].lower()
+                data = data_torch[i, :, :]
+                # Transpose/flip token_embd/types into correct shape
+                if new_name == "token_embd.weight.lora_b":
+                    data = data.T
+                elif new_name.startswith("token_types.weight."):
+                    new_name = new_name[:-1] + ("a" if new_name[-1:] == "b" else "b")
+                lora_writer.add_tensor(new_name, data.float().numpy(), raw_dtype=gguf.GGMLQuantizationType.F32)
+
+            return []
+
         return super().modify_tensors(data_torch, name, bid)
 
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # jina-embeddings-v3
+        if rotary_emb_base := self.hparams.get("rotary_emb_base"):
+            self.gguf_writer.add_rope_freq_base(rotary_emb_base)
+        lora_alpha = self.hparams.get("lora_alpha")
+        if lora_prompt_prefixes := self.hparams.get("task_instructions"):
+            assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys())
+        for lora_name, lora_writer in self._lora_files.items():
+            lora_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, lora_alpha if lora_alpha is not None else 1.0)
+            lora_writer.add_string(gguf.Keys.Adapter.LORA_TASK_NAME, lora_name)
+            if lora_prompt_prefixes:
+                lora_writer.add_string(gguf.Keys.Adapter.LORA_PROMPT_PREFIX, lora_prompt_prefixes[lora_name])
+
+    def write(self):
+        super().write()
+        for lora_writer in self._lora_files.values():
+            lora_writer.write_header_to_file()
+            lora_writer.write_kv_data_to_file()
+            lora_writer.write_tensors_to_file(progress=True)
+            lora_writer.close()
+
 
 @ModelBase.register("GemmaForCausalLM")
 class GemmaModel(TextModel):
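
For reference, a standalone sketch of consuming one of the per-task adapter GGUFs written by generate_extra_tensors() above. It assumes the current llama.h entry points (llama_backend_init, llama_model_load_from_file, llama_adapter_lora_init, llama_adapter_lora_free) and that llama_adapter_meta_val_str returns a negative value when a key is missing; both file names are placeholders. The metadata keys match the ones read in common/common.cpp above.

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_model_load_from_file("jina-embeddings-v3-f16.gguf", mparams);

        // one of the adapters split out by the converter (placeholder name)
        llama_adapter_lora * adapter = llama_adapter_lora_init(model, "lora-retrieval.query-jina-embeddings-v3-f16.gguf");

        if (adapter != nullptr) {
            char buf[1024];
            if (llama_adapter_meta_val_str(adapter, "adapter.lora.task_name", buf, sizeof(buf)) >= 0) {
                printf("task name    : %s\n", buf);
            }
            if (llama_adapter_meta_val_str(adapter, "adapter.lora.prompt_prefix", buf, sizeof(buf)) >= 0) {
                printf("prompt prefix: %s\n", buf);
            }
            llama_adapter_lora_free(adapter);
        }

        llama_model_free(model);
        llama_backend_free();
        return 0;
    }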
