server : dynamic token limit for prompt cache (ggml-org#16560)

ggerganov · web-flow · commit bc07349a7f87 · 2025-10-14T08:48:50.000+03:00
* server : dynamic token limit for prompt cache

* cont : print estimated token limit
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
@@ -1585,23 +1585,31 @@ struct server_prompt_cache {
             }
         }
 
+        // average size per token
+        const float size_per_token = std::max<float>(1.0f, float(size()) / (std::max<size_t>(1, n_tokens())));
+
+        // dynamically increase the token limit if it can fit in the memory limit
+        const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t>(limit_tokens, limit_size/size_per_token) : limit_tokens;
+
         if (limit_tokens > 0) {
-            while (states.size() > 1 && n_tokens() > limit_tokens) {
+            while (states.size() > 1 && n_tokens() > limit_tokens_cur) {
                 if (states.empty()) {
                     break;
                 }
 
-                SRV_WRN(" - cache token limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0));
+                SRV_WRN(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n",
+                        limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0));
 
                 states.pop_front();
             }
         }
 
-        SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens)\n",
-                states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens);
+        SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n",
+                states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur);
 
         for (const auto & state : states) {
-            SRV_WRN("   - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n", (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
+            SRV_WRN("   - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",
+                    (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
         }
     }
 };

Original file line number	Diff line number	Diff line change
`@@ -1585,23 +1585,31 @@ struct server_prompt_cache {`
`1585`	`1585`	`}`
`1586`	`1586`	`}`
`1587`	`1587`
	`1588`	`+ // average size per token`
	`1589`	`+ const float size_per_token = std::max<float>(1.0f, float(size()) / (std::max<size_t>(1, n_tokens())));`
	`1590`	`+`
	`1591`	`+ // dynamically increase the token limit if it can fit in the memory limit`
	`1592`	`+ const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t>(limit_tokens, limit_size/size_per_token) : limit_tokens;`
	`1593`	`+`
`1588`	`1594`	`if (limit_tokens > 0) {`
`1589`		`- while (states.size() > 1 && n_tokens() > limit_tokens) {`
	`1595`	`+ while (states.size() > 1 && n_tokens() > limit_tokens_cur) {`
`1590`	`1596`	`if (states.empty()) {`
`1591`	`1597`	`break;`
`1592`	`1598`	`}`
`1593`	`1599`
`1594`		`- SRV_WRN(" - cache token limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0));`
	`1600`	`+ SRV_WRN(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n",`
	`1601`	`+ limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0));`
`1595`	`1602`
`1596`	`1603`	`states.pop_front();`
`1597`	`1604`	`}`
`1598`	`1605`	`}`
`1599`	`1606`
`1600`		`- SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens)\n",`
`1601`		`- states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens);`
	`1607`	`+ SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n",`
	`1608`	`+ states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur);`
`1602`	`1609`
`1603`	`1610`	`for (const auto & state : states) {`
`1604`		`- SRV_WRN("   - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n", (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));`
	`1611`	`+ SRV_WRN("   - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",`
	`1612`	`+ (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));`
`1605`	`1613`	`}`
`1606`	`1614`	`}`
`1607`	`1615`	`};`