Commit c03d5cc

Merge branch 'master' into xsn/tag_based_hf_repo
2 parents ff484f7 + 924518e commit c03d5cc

70 files changed: 5836 additions & 5401 deletions


common/common.cpp

Lines changed: 63 additions & 53 deletions
```diff
@@ -852,21 +852,23 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     if (params.reranking) {
         bool ok = true;
 
-        if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
+        if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
+        if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
             ok = false;
         }
 
@@ -879,7 +881,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     auto cparams = common_context_params_to_llama(params);
 
-    llama_context * lctx = llama_new_context_with_model(model, cparams);
+    llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_model_free(model);
@@ -893,7 +895,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     if (!params.control_vectors.empty()) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
-        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
+        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);
 
         const auto cvec = common_control_vector_load(params.control_vectors);
         if (cvec.n_embd == -1) {
@@ -903,12 +905,13 @@ struct common_init_result common_init_from_params(common_params & params) {
             return iparams;
         }
 
-        int err = llama_control_vector_apply(lctx,
-                                             cvec.data.data(),
-                                             cvec.data.size(),
-                                             cvec.n_embd,
-                                             params.control_vector_layer_start,
-                                             params.control_vector_layer_end);
+        int err = llama_apply_adapter_cvec(
+                lctx,
+                cvec.data.data(),
+                cvec.data.size(),
+                cvec.n_embd,
+                params.control_vector_layer_start,
+                params.control_vector_layer_end);
         if (err) {
             llama_free(lctx);
             llama_model_free(model);
@@ -919,8 +922,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        llama_lora_adapter_ptr lora;
-        lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
         if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
@@ -933,17 +936,17 @@ struct common_init_result common_init_from_params(common_params & params) {
     }
 
     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, params.lora_adapters);
+        common_set_adapter_lora(lctx, params.lora_adapters);
     }
 
-    if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
+    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
         params.sampling.ignore_eos = false;
     }
 
     if (params.sampling.ignore_eos) {
-        for (llama_token i = 0; i < llama_n_vocab(model); i++) {
-            if (llama_token_is_eog(model, i)) {
+        for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+            if (llama_vocab_is_eog(vocab, i)) {
                 LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
                 params.sampling.logit_bias.push_back({i, -INFINITY});
             }
@@ -964,8 +967,9 @@ struct common_init_result common_init_from_params(common_params & params) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
         std::vector<llama_token> tmp;
-        llama_token bos = llama_token_bos(model);
-        llama_token eos = llama_token_eos(model);
+        llama_token bos = llama_vocab_bos(vocab);
+        llama_token eos = llama_vocab_eos(vocab);
+
         // some models (e.g. T5) don't have a BOS token
         if (bos != LLAMA_TOKEN_NULL) {
             tmp.push_back(bos);
@@ -1000,11 +1004,11 @@ struct common_init_result common_init_from_params(common_params & params) {
     return iparams;
 }
 
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
-    llama_lora_adapter_clear(ctx);
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
+    llama_clear_adapter_lora(ctx);
     for (auto & la : lora) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.ptr, la.scale);
+            llama_set_adapter_lora(ctx, la.ptr, la.scale);
         }
     }
 }
@@ -1553,21 +1557,23 @@ std::vector<llama_token> common_tokenize(
         const std::string & text,
         bool add_special,
         bool parse_special) {
-    return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_tokenize(vocab, text, add_special, parse_special);
 }
 
 std::vector<llama_token> common_tokenize(
-        const struct llama_model * model,
+        const struct llama_vocab * vocab,
         const std::string & text,
         bool add_special,
         bool parse_special) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -1576,12 +1582,18 @@ std::vector<llama_token> common_tokenize(
 }
 
 std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_token_to_piece(vocab, token, special);
+}
+
+std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
-    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+    const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
     if (n_chars < 0) {
         piece.resize(-n_chars);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+        int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
         GGML_ASSERT(check == -n_chars);
     }
     else {
@@ -1591,13 +1603,19 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
     return piece;
 }
 
-std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_detokenize(vocab, tokens, special);
+}
+
+std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
         GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
     }
 
@@ -1612,20 +1630,13 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 //
 
 std::string common_get_builtin_chat_template(const struct llama_model * model) {
-    static const char * template_key = "tokenizer.chat_template";
-    // call with NULL buffer to get the total size of the string
-    int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
-    if (res > 0) {
-        std::vector<char> model_template(res + 1, 0);
-        llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
-        return std::string(model_template.data(), model_template.size() - 1);
-    }
-    return "";
+    const char * ptr_tmpl = llama_model_chat_template(model);
+    return ptr_tmpl == nullptr ? "" : ptr_tmpl;
 }
 
 bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
-    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
+    const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
     return res >= 0;
 }
 
@@ -1636,35 +1647,34 @@ std::string common_chat_apply_template(const struct llama_model * model,
     int alloc_size = 0;
     bool fallback = false; // indicate if we must fallback to default chatml
     std::vector<llama_chat_message> chat;
-    for (auto & msg : msgs) {
+    for (const auto & msg : msgs) {
         chat.push_back({msg.role.c_str(), msg.content.c_str()});
         alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
     }
 
-    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
+    const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model) : tmpl.c_str();
     std::vector<char> buf(alloc_size);
 
     // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+    int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 
     // error: chat template is not supported
     if (res < 0) {
         if (ptr_tmpl != nullptr) {
             // if the custom "tmpl" is not supported, we throw an error
            // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
             throw std::runtime_error("this custom template is not supported");
-        } else {
-            // If the built-in template is not supported, we default to chatml
-            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-            fallback = true;
        }
+
+        // If the built-in template is not supported, we default to chatml
+        res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+        fallback = true;
     }
 
     // if it turns out that our buffer is too small, we resize it
    if ((size_t) res > buf.size()) {
        buf.resize(res);
        res = llama_chat_apply_template(
-            fallback ? nullptr : model,
            fallback ? "chatml" : ptr_tmpl,
            chat.data(), chat.size(), add_ass, buf.data(), buf.size());
    }
```
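
The caller-side pattern is the same throughout this file: fetch the `llama_vocab` handle from the model once, then query token metadata through it instead of through the `llama_model`. The following sketch is not part of the commit; it assumes a model has already been loaded elsewhere, and the helper name `collect_eog_tokens` is purely illustrative — it only uses API calls that appear in the diff above.

```cpp
#include "llama.h"

#include <vector>

// collect every end-of-generation token id exposed by the model's vocabulary,
// mirroring the ignore-eos logic above but with the new vocab-based calls
static std::vector<llama_token> collect_eog_tokens(const llama_model * model) {
    // token metadata now lives on the vocab handle, not on the model
    const llama_vocab * vocab = llama_model_get_vocab(model);

    std::vector<llama_token> eog;

    // was: llama_token_eos(model) == LLAMA_TOKEN_NULL
    if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
        return eog; // this vocab defines no EOS token
    }

    // was: llama_n_vocab(model) and llama_token_is_eog(model, i)
    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
        if (llama_vocab_is_eog(vocab, i)) {
            eog.push_back(i);
        }
    }

    return eog;
}
```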

common/common.h

Lines changed: 18 additions & 8 deletions
```diff
@@ -30,11 +30,11 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
-struct common_lora_adapter_info {
+struct common_adapter_lora_info {
     std::string path;
     float scale;
 
-    struct llama_lora_adapter * ptr;
+    struct llama_adapter_lora * ptr;
 };
 
 using llama_tokens = std::vector<llama_token>;
@@ -252,8 +252,8 @@ struct common_params {
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
-    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
-    std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
+    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
 
     std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -492,7 +492,7 @@ struct common_init_result {
     llama_model_ptr model;
     llama_context_ptr context;
 
-    std::vector<llama_lora_adapter_ptr> lora;
+    std::vector<llama_adapter_lora_ptr> lora;
 };
 
 struct common_init_result common_init_from_params(common_params & params);
@@ -514,7 +514,7 @@ struct llama_model * common_load_model_from_hf(
     const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 
 //
 // Batch utils
@@ -552,7 +552,7 @@ std::vector<llama_token> common_tokenize(
         bool parse_special = false);
 
 std::vector<llama_token> common_tokenize(
-        const struct llama_model * model,
+        const struct llama_vocab * vocab,
         const std::string & text,
         bool add_special,
         bool parse_special = false);
@@ -564,11 +564,21 @@ std::string common_token_to_piece(
         llama_token token,
         bool special = true);
 
+std::string common_token_to_piece(
+        const struct llama_vocab * vocab,
+        llama_token token,
+        bool special = true);
+
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
 std::string common_detokenize(
-        llama_context * ctx,
+        const struct llama_context * ctx,
+        const std::vector<llama_token> & tokens,
+        bool special = true);
+
+std::string common_detokenize(
+        const struct llama_vocab * vocab,
         const std::vector<llama_token> & tokens,
         bool special = true);
 
```
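
For reference, a short usage sketch of the new vocab-based overloads declared above. It is not part of the commit: it assumes a `llama_model` loaded elsewhere, and the function name `tokenize_roundtrip` is illustrative; only the signatures shown in the header diff are used.

```cpp
#include "common.h"
#include "llama.h"

#include <string>
#include <vector>

// tokenize a string and detokenize it again using only the vocab handle,
// without needing a llama_context
static std::string tokenize_roundtrip(const llama_model * model, const std::string & text) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    // previously these helpers took a llama_model * (or went through a llama_context *)
    const std::vector<llama_token> tokens =
        common_tokenize(vocab, text, /*add_special=*/true, /*parse_special=*/false);

    return common_detokenize(vocab, tokens, /*special=*/false);
}
```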
