@@ -133,7 +133,6 @@ struct slot_params {
     std::vector<std::string> response_fields;
     bool timings_per_token = false;
     bool post_sampling_probs = false;
-    bool ignore_eos = false;

     struct common_params_sampling sampling;
     struct common_params_speculative speculative;
@@ -447,7 +446,6 @@ struct server_task {

         {
             params.sampling.logit_bias.clear();
-            params.ignore_eos = json_value(data, "ignore_eos", false);

             const auto & logit_bias = data.find("logit_bias");
             if (logit_bias != data.end() && logit_bias->is_array()) {
@@ -478,6 +476,13 @@ struct server_task {
                     }
                 }
             }
+
+            params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos);
+            if (params.sampling.ignore_eos) {
+                params.sampling.logit_bias.insert(
+                        params.sampling.logit_bias.end(),
+                        defaults.sampling.logit_bias_eog.begin(), defaults.sampling.logit_bias_eog.end());
+            }
         }

         {
@@ -1906,7 +1911,6 @@ struct server_context {

     bool clean_kv_cache = true;
     bool add_bos_token = true;
-    bool has_eos_token = false;

     int32_t n_ctx; // total context for all clients / slots

@@ -1965,7 +1969,6 @@ struct server_context {
         n_ctx = llama_n_ctx(ctx);

         add_bos_token = llama_vocab_get_add_bos(vocab);
-        has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;

         if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
             SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());
@@ -2225,10 +2228,6 @@ struct server_context {
             slot.params.n_predict = slot.n_predict;
         }

-        if (slot.params.ignore_eos && has_eos_token) {
-            slot.params.sampling.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
-        }
-
         {
             if (slot.smpl != nullptr) {
                 common_sampler_free(slot.smpl);
@@ -3894,12 +3893,11 @@ int main(int argc, char ** argv) {

         return false;
     };
-
+
     auto middleware_server_state = [&res_error, &state](const httplib::Request & req, httplib::Response & res) {
         server_state current_state = state.load();
         if (current_state == SERVER_STATE_LOADING_MODEL) {
             auto tmp = string_split<std::string>(req.path, '.');
-
             if (req.path == "/" || tmp.back() == "html") {
                 // mmojo-server START
                 // res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
@@ -3912,7 +3910,6 @@ int main(int argc, char ** argv) {
             } else {
                 res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
             }
-
             return false;
         }
         return true;
@@ -4612,9 +4609,10 @@ int main(int argc, char ** argv) {
         json tokens_response = json::array();
         if (body.count("content") != 0) {
             const bool add_special = json_value(body, "add_special", false);
+            const bool parse_special = json_value(body, "parse_special", true);
             const bool with_pieces = json_value(body, "with_pieces", false);

-            llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, true);
+            llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, parse_special);

             if (with_pieces) {
                 for (const auto & token : tokens) {
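Not part of the diff above, just a hedged client-side sketch of the two request-body fields it touches: `parse_special` on `/tokenize` (new, defaults to `true`) and `ignore_eos`, which is now resolved through the sampling parameters. The endpoint paths and field names come from the diff; the server address and the use of cpp-httplib plus nlohmann::json (both vendored by the server) are assumptions.

```cpp
// Illustrative client only; not part of the patch. Assumes llama-server is
// listening on localhost:8080 and that httplib.h / nlohmann/json.hpp are on
// the include path.
#include <iostream>
#include "httplib.h"
#include "nlohmann/json.hpp"

using json = nlohmann::json;

int main() {
    httplib::Client cli("http://localhost:8080"); // assumed server address

    // /tokenize: "parse_special" is now read from the request body (default: true)
    json tok_req = {
        {"content",       "<|im_start|>user\nhello<|im_end|>"},
        {"add_special",   false},
        {"parse_special", true},
    };
    if (auto res = cli.Post("/tokenize", tok_req.dump(), "application/json")) {
        std::cout << "/tokenize -> " << res->body << "\n";
    }

    // /completion: "ignore_eos" is now a sampling parameter; per the diff, the
    // server extends logit_bias with the preconfigured EOG biases when it is set
    json cmpl_req = {
        {"prompt",     "Once upon a time"},
        {"n_predict",  16},
        {"ignore_eos", true},
    };
    if (auto res = cli.Post("/completion", cmpl_req.dump(), "application/json")) {
        std::cout << "/completion -> " << res->body << "\n";
    }
    return 0;
}
```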