@@ -1321,17 +1321,29 @@ struct server_slot {
13211321 && are_lora_equal (lora, other_slot.lora );
13221322 }
13231323
1324- bool has_budget (const common_params & global_params) {
1324+ bool has_budget (const common_params & global_params, int32_t slot_n_ctx ) {
13251325 if (params.n_predict == -1 && global_params.n_predict == -1 ) {
13261326 return true ; // limitless
13271327 }
13281328
13291329 n_remaining = -1 ;
1330-
1331- if (params.n_predict != -1 ) {
1332- n_remaining = params.n_predict - n_decoded;
1333- } else if (global_params.n_predict != -1 ) {
1334- n_remaining = global_params.n_predict - n_decoded;
1330+ if (global_params.n_predict == -1 ) {
1331+ if (params.n_predict == -2 )
1332+ n_remaining = slot_n_ctx - n_decoded;
1333+ else
1334+ n_remaining = params.n_predict - n_decoded;
1335+ } else if (global_params.n_predict == -2 ) {
1336+ if (params.n_predict == -1 || params.n_predict == -2 )
1337+ n_remaining = slot_n_ctx - n_decoded;
1338+ else
1339+ n_remaining = std::min (params.n_predict - n_decoded, slot_n_ctx - n_decoded);
1340+ } else {
1341+ if (params.n_predict == -1 )
1342+ n_remaining = global_params.n_predict - n_decoded;
1343+ else if (params.n_predict == -2 )
1344+ n_remaining = std::min (global_params.n_predict - n_decoded, slot_n_ctx - n_decoded);
1345+ else
1346+ n_remaining = std::min (params.n_predict - n_decoded, global_params.n_predict - n_decoded);
13351347 }
13361348
13371349 return n_remaining > 0 ; // no budget
@@ -2153,7 +2165,7 @@ struct server_context {
21532165 }
21542166
21552167 // check the limits
2156- if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget (params_base)) {
2168+ if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget (params_base, slot. n_ctx )) {
21572169 slot.stop = STOP_TYPE_LIMIT;
21582170 slot.has_next_token = false ;
21592171
0 commit comments