
Commit 1b49dc3

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/docker.yml
#	.github/workflows/editorconfig.yml
#	examples/run/run.cpp
#	examples/server/README.md
#	scripts/sync-ggml.last
2 parents 5cce8a5 + 8d59d91 commit 1b49dc3

18 files changed: 891 additions, 110 deletions


common/arg.cpp

Lines changed: 13 additions & 4 deletions
@@ -23,6 +23,11 @@ common_arg & common_arg::set_examples(std::initializer_list<enum llama_example>
     return *this;
 }
 
+common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) {
+    this->excludes = std::move(excludes);
+    return *this;
+}
+
 common_arg & common_arg::set_env(const char * env) {
     help = help + "\n(env: " + env + ")";
     this->env = env;
@@ -38,6 +43,10 @@ bool common_arg::in_example(enum llama_example ex) {
     return examples.find(ex) != examples.end();
 }
 
+bool common_arg::is_exclude(enum llama_example ex) {
+    return excludes.find(ex) != excludes.end();
+}
+
 bool common_arg::get_value_from_env(std::string & output) {
     if (env == nullptr) return false;
     char * value = std::getenv(env);
@@ -421,7 +430,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
      * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
      */
     auto add_opt = [&](common_arg arg) {
-        if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) {
+        if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
             ctx_arg.options.push_back(std::move(arg));
         }
     };
@@ -650,7 +659,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.prompt = value;
         }
-    ));
+    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -674,7 +683,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.prompt.pop_back();
             }
         }
-    ));
+    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--in-file"}, "FNAME",
         "an input file (repeat to specify multiple files)",
@@ -701,7 +710,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.prompt = ss.str();
            fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
         }
-    ));
+    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-e", "--escape"},
         string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),

common/arg.h

Lines changed: 3 additions & 0 deletions
@@ -12,6 +12,7 @@
 
 struct common_arg {
     std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
+    std::set<enum llama_example> excludes = {};
     std::vector<const char *> args;
     const char * value_hint   = nullptr; // help text or example for arg value
     const char * value_hint_2 = nullptr; // for second arg value
@@ -53,9 +54,11 @@ struct common_arg {
     ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
 
     common_arg & set_examples(std::initializer_list<enum llama_example> examples);
+    common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
     common_arg & set_env(const char * env);
     common_arg & set_sparam();
     bool in_example(enum llama_example ex);
+    bool is_exclude(enum llama_example ex);
     bool get_value_from_env(std::string & output);
     bool has_value_from_env();
     std::string to_string();
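
Taken together, common/arg.cpp and common/arg.h give each option an opt-out list: a common_arg now carries an excludes set alongside its examples set, set_excludes()/is_exclude() manage it, and add_opt() only registers an option when it matches the current example (or LLAMA_EXAMPLE_COMMON) and that example is not excluded. The commit uses this to keep the prompt-related options out of the server's argument table. A self-contained sketch of the check (illustrative only: the enum values are placeholders and only the members needed for the demo are reproduced):

    // demo of the include/exclude test added to add_opt() in common/arg.cpp
    #include <cstdio>
    #include <set>

    enum llama_example { LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER };

    struct common_arg {
        std::set<llama_example> examples = {LLAMA_EXAMPLE_COMMON};
        std::set<llama_example> excludes = {};

        common_arg & set_excludes(std::initializer_list<llama_example> ex) { excludes = ex; return *this; }
        bool in_example(llama_example ex) { return examples.count(ex) > 0; }
        bool is_exclude(llama_example ex) { return excludes.count(ex) > 0; }
    };

    int main() {
        common_arg prompt_arg;                            // a COMMON option...
        prompt_arg.set_excludes({LLAMA_EXAMPLE_SERVER});  // ...that opts out of the server

        for (llama_example ex : {LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}) {
            // same condition as the updated add_opt() lambda
            bool registered = (prompt_arg.in_example(ex) || prompt_arg.in_example(LLAMA_EXAMPLE_COMMON))
                              && !prompt_arg.is_exclude(ex);
            printf("example %d: %s\n", (int) ex, registered ? "registered" : "skipped");
        }
    }

Run as-is, this prints "registered" for the main example and "skipped" for the server, which is why the options marked with set_excludes({LLAMA_EXAMPLE_SERVER}) above no longer appear in the server's option list.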

convert_lora_to_gguf.py

Lines changed: 31 additions & 3 deletions
@@ -226,6 +226,9 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
     base_name = lora_tensor_name.replace("base_model.model.", "")
     base_name = base_name.replace(".lora_A.weight", ".weight")
     base_name = base_name.replace(".lora_B.weight", ".weight")
+    # models produced by mergekit-extract-lora have token embeddings in the adapter
+    base_name = base_name.replace(".lora_embedding_A", ".weight")
+    base_name = base_name.replace(".lora_embedding_B", ".weight")
     return base_name
 
 
@@ -260,6 +263,10 @@ def parse_args() -> argparse.Namespace:
         "--base", type=Path,
         help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
     )
+    parser.add_argument(
+        "--base-model-id", type=str,
+        help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
+    )
     parser.add_argument(
         "lora_path", type=Path,
         help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
@@ -290,6 +297,7 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
 
     dir_base_model: Path | None = args.base
     dir_lora: Path = args.lora_path
+    base_model_id: str | None = args.base_model_id
     lora_config = dir_lora / "adapter_config.json"
     input_model = dir_lora / "adapter_model.safetensors"
 
@@ -313,7 +321,10 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
         lparams: dict[str, Any] = json.load(f)
 
     # load base model
-    if dir_base_model is None:
+    if base_model_id is not None:
+        logger.info(f"Loading base model from Hugging Face: {base_model_id}")
+        hparams = load_hparams_from_hf(base_model_id)
+    elif dir_base_model is None:
         if "base_model_name_or_path" in lparams:
             model_id = lparams["base_model_name_or_path"]
             logger.info(f"Loading base model from Hugging Face: {model_id}")
@@ -371,11 +382,16 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                 if self.lazy:
                     tensor = LazyTorchTensor.from_eager(tensor)
                 base_name = get_base_tensor_name(name)
-                is_lora_a = ".lora_A.weight" in name
-                is_lora_b = ".lora_B.weight" in name
+                # note: mergekit-extract-lora also adds token embeddings to the adapter
+                is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
+                is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
                 if not is_lora_a and not is_lora_b:
                     if ".base_layer.weight" in name:
                         continue
+                    # mergekit-extract-lora add these layernorm to the adapter, we need to keep them
+                    if "_layernorm" in name or ".norm" in name:
+                        yield (base_name, tensor)
+                        continue
                     logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
                     if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
                         logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
@@ -407,9 +423,21 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 if name == "lm_head.weight" and len(dest) == 0:
                     raise ValueError("lm_head is present in adapter, but is ignored in base model")
                 for dest_name, dest_data in dest:
+                    # mergekit-extract-lora add these layernorm to the adapter
+                    if "_norm" in dest_name:
+                        assert dest_data.dim() == 1
+                        yield (dest_name, dest_data)
+                        continue
+
+                    # otherwise, we must get the lora_A and lora_B tensors
                     assert isinstance(dest_data, LoraTorchTensor)
                     lora_a, lora_b = dest_data.get_lora_A_B()
 
+                    # note: mergekit-extract-lora flip and transpose A and B
+                    # here we only need to transpose token_embd.lora_a, see llm_build_inp_embd()
+                    if "token_embd.weight" in dest_name:
+                        lora_a = lora_a.T
+
                     yield (dest_name + ".lora_a", lora_a)
                     yield (dest_name + ".lora_b", lora_b)
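
With these changes an adapter produced by mergekit-extract-lora, which ships token-embedding and layernorm tensors alongside the usual lora_A/lora_B pairs, converts like any other PEFT adapter, and a base model that is neither local nor named in adapter_config.json can now be given by ID. A hypothetical invocation (the adapter path is a placeholder; the model ID is the example from the new help text):

    python convert_lora_to_gguf.py --base-model-id meta-llama/Llama-3.2-1B-Instruct /path/to/lora_adapter

When --base-model-id is set it takes precedence over --base and over the adapter's base_model_name_or_path, and only the base model's config is fetched from the Hugging Face Hub, not its weights.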

ggml/src/ggml-backend-reg.cpp

Lines changed: 5 additions & 0 deletions
@@ -575,4 +575,9 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
     ggml_backend_load_best("opencl", silent, dir_path);
     ggml_backend_load_best("musa", silent, dir_path);
     ggml_backend_load_best("cpu", silent, dir_path);
+    // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
+    const char * backend_path = std::getenv("GGML_BACKEND_PATH");
+    if (backend_path) {
+        ggml_backend_load(backend_path);
+    }
 }
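
This makes ggml_backend_load_all_from_path() honour a GGML_BACKEND_PATH environment variable: if the variable points at a backend shared library, that library is handed to ggml_backend_load() in addition to the backends found in the regular search path, so an out-of-tree backend can be used without copying it into the build tree. A minimal sketch of observing the effect, assuming the public registry API from ggml-backend.h (and that ggml_backend_load_all() routes through the function patched above, which is how I read the registry code); the library path below is a placeholder:

    // lists every backend that ended up in the registry after loading
    #include "ggml-backend.h"
    #include <cstdio>

    int main() {
        // loads the bundled backends and, with this change, also the library
        // named by GGML_BACKEND_PATH when that variable is set
        ggml_backend_load_all();

        for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
            printf("backend: %s\n", ggml_backend_reg_name(ggml_backend_reg_get(i)));
        }
        return 0;
    }

Running it as GGML_BACKEND_PATH=/path/to/libggml-mybackend.so ./list-backends (path and binary name hypothetical) should print the extra backend alongside the built-in ones, provided the library exports a valid ggml backend.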
