@@ -852,21 +852,23 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     if (params.reranking) {
         bool ok = true;
 
-        if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
+        if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
+        if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
            ok = false;
         }
 
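For illustration only (not part of this commit): a minimal sketch of how a caller could perform the same special-token checks with the renamed accessors. It assumes nothing beyond the functions visible in this hunk plus llama.h.

#include "llama.h"

#include <cstdio>

// Report which special tokens a loaded model's vocab provides.
// Returns true if BOS, EOS and SEP are all present (needed e.g. for reranking).
static bool check_rerank_tokens(const llama_model * model) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    const llama_token bos = llama_vocab_bos(vocab);
    const llama_token eos = llama_vocab_eos(vocab);
    const llama_token sep = llama_vocab_sep(vocab);

    if (bos == LLAMA_TOKEN_NULL) { fprintf(stderr, "vocab has no BOS token\n"); }
    if (eos == LLAMA_TOKEN_NULL) { fprintf(stderr, "vocab has no EOS token\n"); }
    if (sep == LLAMA_TOKEN_NULL) { fprintf(stderr, "vocab has no SEP token\n"); }

    return bos != LLAMA_TOKEN_NULL && eos != LLAMA_TOKEN_NULL && sep != LLAMA_TOKEN_NULL;
}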
@@ -879,7 +881,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     auto cparams = common_context_params_to_llama(params);
 
-    llama_context * lctx = llama_new_context_with_model(model, cparams);
+    llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_model_free(model);
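A hedged sketch of the context-creation flow around the renamed entry point. Only llama_init_from_model(), llama_free() and llama_model_free() are confirmed by this diff; llama_model_load_from_file() and the *_default_params() helpers are assumed to exist in llama.h.

#include "llama.h"

#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    // assumed helpers from llama.h (not shown in this diff)
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file(argv[1], mparams);
    if (model == NULL) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context * lctx = llama_init_from_model(model, cparams); // was llama_new_context_with_model()
    if (lctx == NULL) {
        llama_model_free(model);
        return 1;
    }

    // ... use the context ...

    llama_free(lctx);
    llama_model_free(model);
    return 0;
}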
@@ -893,7 +895,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     if (!params.control_vectors.empty()) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
-        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
+        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);
 
         const auto cvec = common_control_vector_load(params.control_vectors);
         if (cvec.n_embd == -1) {
@@ -903,12 +905,13 @@ struct common_init_result common_init_from_params(common_params & params) {
             return iparams;
         }
 
-        int err = llama_control_vector_apply(lctx,
-                                             cvec.data.data(),
-                                             cvec.data.size(),
-                                             cvec.n_embd,
-                                             params.control_vector_layer_start,
-                                             params.control_vector_layer_end);
+        int err = llama_apply_adapter_cvec(
+                lctx,
+                cvec.data.data(),
+                cvec.data.size(),
+                cvec.n_embd,
+                params.control_vector_layer_start,
+                params.control_vector_layer_end);
         if (err) {
             llama_free(lctx);
             llama_model_free(model);
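A sketch of calling the renamed llama_apply_adapter_cvec() directly, under the assumption that llama_model_n_embd() exists alongside the llama_model_n_layer() call above and that the buffer holds one n_embd-sized block per layer; the all-zero buffer makes the call a harmless no-op.

#include "llama.h"

#include <vector>

// Apply a zero (no-op) control vector over layers 1..n_layer,
// mirroring the default start/end values used in the hunk above.
static bool apply_zero_cvec(llama_context * lctx, const llama_model * model) {
    const int32_t n_embd  = llama_model_n_embd(model);  // assumed accessor
    const int32_t n_layer = llama_model_n_layer(model);

    std::vector<float> data((size_t) n_embd * n_layer, 0.0f);

    const int32_t err = llama_apply_adapter_cvec(
            lctx,
            data.data(),
            data.size(),
            n_embd,
            /*il_start=*/1,
            /*il_end  =*/n_layer);

    return err == 0;
}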
@@ -919,8 +922,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        llama_lora_adapter_ptr lora;
-        lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
         if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
@@ -933,17 +936,17 @@ struct common_init_result common_init_from_params(common_params & params) {
     }
 
     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, params.lora_adapters);
+        common_set_adapter_lora(lctx, params.lora_adapters);
     }
 
-    if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
+    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
         params.sampling.ignore_eos = false;
     }
 
     if (params.sampling.ignore_eos) {
-        for (llama_token i = 0; i < llama_n_vocab(model); i++) {
-            if (llama_token_is_eog(model, i)) {
+        for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+            if (llama_vocab_is_eog(vocab, i)) {
                 LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
                 params.sampling.logit_bias.push_back({i, -INFINITY});
             }
@@ -964,8 +967,9 @@ struct common_init_result common_init_from_params(common_params & params) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
         std::vector<llama_token> tmp;
-        llama_token bos = llama_token_bos(model);
-        llama_token eos = llama_token_eos(model);
+        llama_token bos = llama_vocab_bos(vocab);
+        llama_token eos = llama_vocab_eos(vocab);
+
         // some models (e.g. T5) don't have a BOS token
         if (bos != LLAMA_TOKEN_NULL) {
             tmp.push_back(bos);
@@ -1000,11 +1004,11 @@ struct common_init_result common_init_from_params(common_params & params) {
     return iparams;
 }
 
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
-    llama_lora_adapter_clear(ctx);
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
+    llama_clear_adapter_lora(ctx);
     for (auto & la : lora) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.ptr, la.scale);
+            llama_set_adapter_lora(ctx, la.ptr, la.scale);
         }
     }
 }
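A sketch of the renamed LoRA adapter lifecycle outside of common_init_from_params(). llama_adapter_lora_init(), llama_set_adapter_lora() and llama_clear_adapter_lora() appear verbatim above; the raw llama_adapter_lora pointer type and llama_adapter_lora_free() are assumptions inferred from the llama_adapter_lora_ptr wrapper used in this diff.

#include "llama.h"

// Load a LoRA adapter, attach it with a scale, then detach and release it.
static bool run_with_lora(llama_context * lctx, llama_model * model, const char * lora_path) {
    llama_adapter_lora * adapter = llama_adapter_lora_init(model, lora_path);
    if (adapter == nullptr) {
        return false;  // failed to load the adapter file
    }

    llama_set_adapter_lora(lctx, adapter, 1.0f);  // attach at full strength

    // ... run decoding with the adapter active ...

    llama_clear_adapter_lora(lctx);    // detach all adapters from the context
    llama_adapter_lora_free(adapter);  // assumed cleanup call; releases the adapter itself
    return true;
}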
@@ -1553,21 +1557,23 @@ std::vector<llama_token> common_tokenize(
         const std::string & text,
         bool add_special,
         bool parse_special) {
-    return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_tokenize(vocab, text, add_special, parse_special);
 }
 
 std::vector<llama_token> common_tokenize(
-        const struct llama_model * model,
+        const struct llama_vocab * vocab,
         const std::string & text,
         bool add_special,
         bool parse_special) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
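A hedged usage sketch for the new vocab-based overloads at a call site (the common_token_to_piece(vocab, ...) overload is added further down in this diff); it assumes common.h declares them exactly as they are defined here.

#include "common.h"
#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

// Tokenize a prompt through the vocab-based overload and print each piece.
static void dump_tokens(const llama_context * lctx, const std::string & prompt) {
    const llama_model * model = llama_get_model(lctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    // add_special = true prepends BOS (if the vocab has one),
    // parse_special = true lets control tokens in the text be matched
    const std::vector<llama_token> tokens = common_tokenize(vocab, prompt, /*add_special=*/true, /*parse_special=*/true);

    for (const llama_token tok : tokens) {
        printf("%6d -> '%s'\n", tok, common_token_to_piece(vocab, tok, /*special=*/true).c_str());
    }
}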
@@ -1576,12 +1582,18 @@ std::vector<llama_token> common_tokenize(
 }
 
 std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_token_to_piece(vocab, token, special);
+}
+
+std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
-    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+    const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
     if (n_chars < 0) {
         piece.resize(-n_chars);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+        int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
         GGML_ASSERT(check == -n_chars);
     }
     else {
@@ -1591,13 +1603,19 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
     return piece;
 }
 
-std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_detokenize(vocab, tokens, special);
+}
+
+std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
         GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
     }
 
@@ -1612,20 +1630,13 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 //
 
 std::string common_get_builtin_chat_template(const struct llama_model * model) {
-    static const char * template_key = "tokenizer.chat_template";
-    // call with NULL buffer to get the total size of the string
-    int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
-    if (res > 0) {
-        std::vector<char> model_template(res + 1, 0);
-        llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
-        return std::string(model_template.data(), model_template.size() - 1);
-    }
-    return "";
+    const char * ptr_tmpl = llama_model_chat_template(model);
+    return ptr_tmpl == nullptr ? "" : ptr_tmpl;
 }
 
 bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
-    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
+    const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
     return res >= 0;
 }
 
@@ -1636,35 +1647,34 @@ std::string common_chat_apply_template(const struct llama_model * model,
     int alloc_size = 0;
     bool fallback = false; // indicate if we must fallback to default chatml
     std::vector<llama_chat_message> chat;
-    for (auto & msg : msgs) {
+    for (const auto & msg : msgs) {
         chat.push_back({msg.role.c_str(), msg.content.c_str()});
         alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
     }
 
-    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
+    const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model) : tmpl.c_str();
     std::vector<char> buf(alloc_size);
 
     // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+    int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 
     // error: chat template is not supported
     if (res < 0) {
         if (ptr_tmpl != nullptr) {
             // if the custom "tmpl" is not supported, we throw an error
             // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
             throw std::runtime_error("this custom template is not supported");
-        } else {
-            // If the built-in template is not supported, we default to chatml
-            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-            fallback = true;
         }
+
+        // If the built-in template is not supported, we default to chatml
+        res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+        fallback = true;
     }
 
     // if it turns out that our buffer is too small, we resize it
     if ((size_t) res > buf.size()) {
         buf.resize(res);
         res = llama_chat_apply_template(
-            fallback ? nullptr : model,
             fallback ? "chatml" : ptr_tmpl,
             chat.data(), chat.size(), add_ass, buf.data(), buf.size());
     }
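A sketch of calling the model-less llama_chat_apply_template() together with llama_model_chat_template(), mirroring the two-pass "query length, then resize" pattern above; the message contents are placeholders.

#include "llama.h"

#include <string>
#include <vector>

// Render a two-message conversation with the model's built-in template.
// Returns an empty string if the model ships no template or rendering fails.
static std::string render_chat(const llama_model * model, bool add_ass) {
    const char * tmpl = llama_model_chat_template(model);
    if (tmpl == nullptr) {
        return "";
    }

    std::vector<llama_chat_message> chat = {
        {"system", "You are a helpful assistant."},
        {"user",   "Hello!"},
    };

    std::vector<char> buf(1024);
    int32_t res = llama_chat_apply_template(tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
    if (res > (int32_t) buf.size()) {
        // first pass reported the required length; retry with a large enough buffer
        buf.resize(res);
        res = llama_chat_apply_template(tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
    }

    return res < 0 ? "" : std::string(buf.data(), res);
}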