@@ -1585,23 +1585,31 @@ struct server_prompt_cache {
15851585 }
15861586 }
15871587
1588+ // average size per token
1589+ const float size_per_token = std::max<float >(1 .0f , float (size ()) / (std::max<size_t >(1 , n_tokens ())));
1590+
1591+ // dynamically increase the token limit if it can fit in the memory limit
1592+ const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t >(limit_tokens, limit_size/size_per_token) : limit_tokens;
1593+
15881594 if (limit_tokens > 0 ) {
1589- while (states.size () > 1 && n_tokens () > limit_tokens ) {
1595+ while (states.size () > 1 && n_tokens () > limit_tokens_cur ) {
15901596 if (states.empty ()) {
15911597 break ;
15921598 }
15931599
1594- SRV_WRN (" - cache token limit reached, removing oldest entry (size = %.3f MiB)\n " , states.front ().size () / (1024.0 * 1024.0 ));
1600+ SRV_WRN (" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n " ,
1601+ limit_tokens, limit_tokens_cur, states.front ().size () / (1024.0 * 1024.0 ));
15951602
15961603 states.pop_front ();
15971604 }
15981605 }
15991606
1600- SRV_WRN (" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens)\n " ,
1601- states.size (), size () / (1024.0 * 1024.0 ), limit_size / (1024.0 * 1024.0 ), limit_tokens);
1607+ SRV_WRN (" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est )\n " ,
1608+ states.size (), size () / (1024.0 * 1024.0 ), limit_size / (1024.0 * 1024.0 ), limit_tokens, limit_tokens_cur );
16021609
16031610 for (const auto & state : states) {
1604- SRV_WRN (" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n " , (const void *)&state, state.n_tokens (), state.checkpoints .size (), state.size () / (1024.0 * 1024.0 ));
1611+ SRV_WRN (" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n " ,
1612+ (const void *)&state, state.n_tokens (), state.checkpoints .size (), state.size () / (1024.0 * 1024.0 ));
16051613 }
16061614 }
16071615};
0 commit comments