@@ -432,6 +432,7 @@ struct llama_server_context
         }
 
         default_generation_settings_for_props = get_formated_generation(slots.front());
+        default_generation_settings_for_props["num_slots"] = params.n_parallel;
         default_generation_settings_for_props["seed"] = -1;
 
         batch = llama_batch_init(n_ctx, 0, params.n_parallel);
@@ -524,27 +525,29 @@ struct llama_server_context
             slot->oaicompat_model = "";
         }
 
-        slot->params.stream           = json_value(data, "stream", false);
-        slot->params.cache_prompt     = json_value(data, "cache_prompt", false);
-        slot->params.n_predict        = json_value(data, "n_predict", default_params.n_predict);
-        slot->sparams.top_k           = json_value(data, "top_k", default_sparams.top_k);
-        slot->sparams.top_p           = json_value(data, "top_p", default_sparams.top_p);
-        slot->sparams.min_p           = json_value(data, "min_p", default_sparams.min_p);
-        slot->sparams.tfs_z           = json_value(data, "tfs_z", default_sparams.tfs_z);
-        slot->sparams.typical_p       = json_value(data, "typical_p", default_sparams.typical_p);
-        slot->sparams.temp            = json_value(data, "temperature", default_sparams.temp);
-        slot->sparams.penalty_last_n  = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
-        slot->sparams.penalty_repeat  = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
-        slot->sparams.penalty_freq    = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
-        slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
-        slot->sparams.mirostat        = json_value(data, "mirostat", default_sparams.mirostat);
-        slot->sparams.mirostat_tau    = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
-        slot->sparams.mirostat_eta    = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
-        slot->sparams.penalize_nl     = json_value(data, "penalize_nl", default_sparams.penalize_nl);
-        slot->params.n_keep           = json_value(data, "n_keep", slot->params.n_keep);
-        slot->params.seed             = json_value(data, "seed", default_params.seed);
-        slot->sparams.grammar         = json_value(data, "grammar", default_sparams.grammar);
-        slot->sparams.n_probs         = json_value(data, "n_probs", default_sparams.n_probs);
+        slot->params.stream             = json_value(data, "stream", false);
+        slot->params.cache_prompt       = json_value(data, "cache_prompt", false);
+        slot->params.n_predict          = json_value(data, "n_predict", default_params.n_predict);
+        slot->sparams.top_k             = json_value(data, "top_k", default_sparams.top_k);
+        slot->sparams.top_p             = json_value(data, "top_p", default_sparams.top_p);
+        slot->sparams.min_p             = json_value(data, "min_p", default_sparams.min_p);
+        slot->sparams.tfs_z             = json_value(data, "tfs_z", default_sparams.tfs_z);
+        slot->sparams.typical_p         = json_value(data, "typical_p", default_sparams.typical_p);
+        slot->sparams.temp              = json_value(data, "temperature", default_sparams.temp);
+        slot->sparams.dynatemp_range    = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
+        slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
+        slot->sparams.penalty_last_n    = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
+        slot->sparams.penalty_repeat    = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
+        slot->sparams.penalty_freq      = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
+        slot->sparams.penalty_present   = json_value(data, "presence_penalty", default_sparams.penalty_present);
+        slot->sparams.mirostat          = json_value(data, "mirostat", default_sparams.mirostat);
+        slot->sparams.mirostat_tau      = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
+        slot->sparams.mirostat_eta      = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
+        slot->sparams.penalize_nl       = json_value(data, "penalize_nl", default_sparams.penalize_nl);
+        slot->params.n_keep             = json_value(data, "n_keep", slot->params.n_keep);
+        slot->params.seed               = json_value(data, "seed", default_params.seed);
+        slot->sparams.grammar           = json_value(data, "grammar", default_sparams.grammar);
+        slot->sparams.n_probs           = json_value(data, "n_probs", default_sparams.n_probs);
 
         // infill
         if (data.count("input_prefix") != 0)
@@ -1002,6 +1005,8 @@ struct llama_server_context
             {"model",             params.model_alias},
             {"seed",              slot.params.seed},
             {"temperature",       slot.sparams.temp},
+            {"dynatemp_range",    slot.sparams.dynatemp_range},
+            {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
             {"top_k",             slot.sparams.top_k},
             {"top_p",             slot.sparams.top_p},
             {"min_p",             slot.sparams.min_p},
@@ -1163,13 +1168,30 @@ struct llama_server_context
         task.multitask_id = multitask_id;
 
         // when a completion task's prompt array is not a singleton, we split it into multiple requests
-        if (task.data.count("prompt") && task.data.at("prompt").size() > 1)
-        {
-            split_multiprompt_task(task_id, task);
-        }
-
         // otherwise, it's a single-prompt task, we actually queue it
-        queue_tasks.post(task);
+        // if there's numbers in the prompt array it will be treated as an array of tokens
+        if (task.data.count("prompt") != 0 && task.data.at("prompt").size() > 1) {
+            bool numbers = false;
+            for (const auto & e : task.data.at("prompt")) {
+                if (e.is_number()) {
+                    numbers = true;
+                    break;
+                }
+            }
+
+            // NOTE: split_multiprompt_task() does not handle a mix of strings and numbers,
+            // it will completely stall the server. I don't know where the bug for this is.
+            //
+            // if there are numbers, it needs to be treated like a single prompt,
+            // queue_tasks handles a mix of strings and numbers just fine.
+            if (numbers) {
+                queue_tasks.post(task);
+            } else {
+                split_multiprompt_task(task_id, task);
+            }
+        } else {
+            queue_tasks.post(task);
+        }
     }
 
     // for multiple images processing
@@ -1251,7 +1273,10 @@ struct llama_server_context
     void split_multiprompt_task(int multitask_id, task_server& multiprompt_task)
     {
         int prompt_count = multiprompt_task.data.at("prompt").size();
-        assert(prompt_count > 1);
+        if (prompt_count <= 1) {
+            send_error(multiprompt_task, "error while handling multiple prompts");
+            return;
+        }
 
         // generate all the ID for subtask
         std::vector<int> subtask_ids(prompt_count);
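
A minimal standalone sketch (not part of the patch) of the dispatch rule introduced in request_completion() above: a "prompt" array that contains any numeric element is treated as a list of token IDs and kept as a single task, while an all-string array is split into one subtask per prompt. It assumes only nlohmann::json, which the server already uses for request data; the helper name prompt_array_has_numbers is made up for illustration.

// Sketch only: mirrors the check added in request_completion(), not actual server code.
#include <nlohmann/json.hpp>
#include <cstdio>

using json = nlohmann::json;

// True when the "prompt" array contains a number, i.e. it holds token IDs
// and must be queued as a single task rather than split per prompt.
static bool prompt_array_has_numbers(const json & prompt) {
    for (const auto & e : prompt) {
        if (e.is_number()) {
            return true;
        }
    }
    return false;
}

int main() {
    const json tokens  = json::parse(R"([12, 345, 6789])");    // array of token IDs
    const json strings = json::parse(R"(["hello", "world"])"); // array of text prompts

    std::printf("tokens : %s\n", prompt_array_has_numbers(tokens)  ? "single task" : "split into subtasks");
    std::printf("strings: %s\n", prompt_array_has_numbers(strings) ? "single task" : "split into subtasks");
    return 0;
}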