
Commit 9e2b2d7: Merge b3535
2 parents: 12c4918 + 1e6f655

40 files changed: +525 -221 lines

CONTRIBUTING.md

Lines changed: 1 addition & 0 deletions

@@ -5,6 +5,7 @@
 - Execute [the full CI locally on your machine](ci/README.md) before publishing
 - Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
 - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
+- Consider allowing write access to your branch for faster review
 - If your PR becomes stale, don't hesitate to ping the maintainers in the comments
 
 # Pull requests (for collaborators)

common/common.cpp

Lines changed: 52 additions & 26 deletions

@@ -684,14 +684,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "--lora") {
         CHECK_ARG
-        params.lora_adapter.emplace_back(argv[i], 1.0f);
+        params.lora_adapters.push_back({
+            std::string(argv[i]),
+            1.0,
+        });
         return true;
     }
     if (arg == "--lora-scaled") {
         CHECK_ARG
-        const char* lora_adapter = argv[i];
+        std::string lora_adapter = argv[i];
         CHECK_ARG
-        params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
+        params.lora_adapters.push_back({
+            lora_adapter,
+            std::stof(argv[i]),
+        });
+        return true;
+    }
+    if (arg == "--lora-init-without-apply") {
+        params.lora_init_without_apply = true;
         return true;
     }
     if (arg == "--control-vector") {
@@ -1654,6 +1664,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
     options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
         "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
+    options.push_back({ "server", "       --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"});
 
 #ifndef LOG_DISABLE_LOGS
     options.push_back({ "logging" });
@@ -2039,8 +2050,8 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
-
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+    llama_init_result iparams;
     auto mparams = llama_model_params_from_gpt_params(params);
 
     llama_model * model = nullptr;
@@ -2055,7 +2066,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }
 
     auto cparams = llama_context_params_from_gpt_params(params);
@@ -2064,7 +2075,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     if (lctx == NULL) {
         fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_free_model(model);
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }
 
     if (!params.control_vectors.empty()) {
@@ -2075,7 +2086,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (cvec.n_embd == -1) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
 
         int err = llama_control_vector_apply(lctx,
@@ -2087,21 +2098,26 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (err) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
     }
 
-    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
-        const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
-        float lora_scale = std::get<1>(params.lora_adapter[i]);
-        auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
-        if (adapter == nullptr) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+    // load and optionally apply lora adapters
+    for (auto & la : params.lora_adapters) {
+        llama_lora_adapter_container loaded_la;
+        loaded_la.path = la.path;
+        loaded_la.scale = la.scale;
+        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+        if (loaded_la.adapter == nullptr) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
-        llama_lora_adapter_set(lctx, adapter, lora_scale);
+        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+    }
+    if (!params.lora_init_without_apply) {
+        llama_lora_adapters_apply(lctx, iparams.lora_adapters);
     }
 
     if (params.ignore_eos) {
@@ -2135,7 +2151,18 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         llama_reset_timings(lctx);
     }
 
-    return std::make_tuple(model, lctx);
+    iparams.model = model;
+    iparams.context = lctx;
+    return iparams;
+}
+
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+    llama_lora_adapter_clear(ctx);
+    for (auto & la : lora_adapters) {
+        if (la.scale != 0.0f) {
+            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+        }
+    }
 }
 
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
@@ -3160,19 +3187,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     }
 
     fprintf(stream, "lora:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) != 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale == 1.0f) {
+            fprintf(stream, "  - %s\n", la.path.c_str());
         }
-        fprintf(stream, "  - %s\n", std::get<0>(la).c_str());
     }
     fprintf(stream, "lora_scaled:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) == 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale != 1.0f) {
+            fprintf(stream, "  - %s: %f\n", la.path.c_str(), la.scale);
         }
-        fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
+    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);

common/common.h

Lines changed: 21 additions & 4 deletions

@@ -33,6 +33,15 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
+struct llama_lora_adapter_info {
+    std::string path;
+    float scale;
+};
+
+struct llama_lora_adapter_container : llama_lora_adapter_info {
+    struct llama_lora_adapter * adapter;
+};
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
@@ -126,8 +135,8 @@ struct gpt_params {
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
-    // TODO: avoid tuple, use struct
-    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
+    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
 
     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -308,15 +317,23 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
-// TODO: avoid tuplue, use struct
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+struct llama_init_result {
+    struct llama_model * model = nullptr;
+    struct llama_context * context = nullptr;
+    std::vector<llama_lora_adapter_container> lora_adapters;
+};
+
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
 
 struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
 
 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 
+// clear LoRA adapters from context, then apply new list of adapters
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+
 // Batch utils
 
 void llama_batch_clear(struct llama_batch & batch);
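
For code outside this repository that still consumes the old tuple-returning API, the update is mechanical and follows the same pattern applied to the in-tree examples below (a sketch, not code from this commit):

    // before: the tuple-based API
    //   llama_model * model;
    //   llama_context * ctx;
    //   std::tie(model, ctx) = llama_init_from_gpt_params(params);

    // after: the struct-based API
    llama_init_result llama_init = llama_init_from_gpt_params(params);

    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;
    if (model == nullptr || ctx == nullptr) {
        // initialization failed; llama_init_from_gpt_params leaves both pointers null
        return 1;
    }
    // any loaded adapters are available in llama_init.lora_adapters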

convert_hf_to_gguf.py

Lines changed: 106 additions & 0 deletions

@@ -2506,6 +2506,112 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
 
 
+@Model.register("XLMRobertaModel")
+class XLMRobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        # realign tokens (see HF tokenizer code)
+        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
+        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
+        toktypes = [
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.UNKNOWN,
+        ] + toktypes[3:-1]
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_token_type_count(1)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+        if name == "embeddings.position_embeddings.weight":
+            if self._position_offset is not None:
+                data_torch = data_torch[self._position_offset:,:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @Model.register("GemmaForCausalLM")
 class GemmaModel(Model):
     model_arch = gguf.MODEL_ARCH.GEMMA

examples/cvector-generator/cvector-generator.cpp

Lines changed: 4 additions & 3 deletions

@@ -414,9 +414,10 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);
 
     // load the model to get hparams
-    llama_model * model;
-    llama_context * ctx;
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
 
     // int n_ctx = llama_n_ctx(ctx);
     int n_layers = llama_n_layer(model);

examples/embedding/embedding.cpp

Lines changed: 4 additions & 4 deletions

@@ -79,11 +79,11 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model;
-    llama_context * ctx;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;

examples/eval-callback/eval-callback.cpp

Lines changed: 4 additions & 3 deletions

@@ -163,9 +163,10 @@ int main(int argc, char ** argv) {
     params.warmup = false;
 
     // init
-    llama_model * model;
-    llama_context * ctx;
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     if (model == nullptr || ctx == nullptr) {
         fprintf(stderr, "%s : failed to init\n", __func__);
         return 1;
