Skip to content

Commit 9a8df14

Browse files
committed
server: Add standby-timeout
Add standby-timeout: an idle timeout, in seconds, after which the server automatically terminates if no request has been processed in the meantime.
1 parent 26a8406 commit 9a8df14

File tree

3 files changed

+37
-8
lines changed

3 files changed

+37
-8
lines changed

common/arg.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1785,6 +1785,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
17851785
params.n_cache_reuse = value;
17861786
}
17871787
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
1788+
add_opt(common_arg(
1789+
{"--standby-timeout"}, "N",
1790+
string_format("time that must pass since a request has been served, before the server stops automatically (default: %d)", params.standby_timeout),
1791+
[](common_params & params, int value) {
1792+
params.standby_timeout = value;
1793+
}
1794+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STANDBY_TIMEOUT"));
17881795
add_opt(common_arg(
17891796
{"--metrics"},
17901797
string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),

common/common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,7 @@ struct common_params {
306306
int32_t timeout_write = timeout_read; // http write timeout in seconds
307307
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
308308
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
309+
int32_t standby_timeout = 0; // idle time in seconds since the last processed request after which the server terminates to save resources; values <= 0 (the default) disable automatic termination
309310

310311
std::string hostname = "127.0.0.1";
311312
std::string public_path = ""; // NOLINT

examples/server/server.cpp

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
#include <thread>
3030
#include <unordered_map>
3131
#include <unordered_set>
32+
#include <chrono>
33+
#include <variant>
3234

3335
using json = nlohmann::ordered_json;
3436

@@ -1162,6 +1164,16 @@ struct server_metrics {
11621164
}
11631165
};
11641166

1167+
struct Signal {
1168+
int number;
1169+
};
1170+
1171+
struct StandbyTimeout {};
1172+
1173+
using ShutdownReason = std::variant<Signal, StandbyTimeout>;
1174+
1175+
std::function<void(ShutdownReason)> shutdown_handler;
1176+
11651177
struct server_queue {
11661178
int id = 0;
11671179
bool running;
@@ -1258,7 +1270,7 @@ struct server_queue {
12581270
* - Check if multitask is finished
12591271
* - Update all slots
12601272
*/
1261-
void start_loop() {
1273+
void start_loop(int standby_timeout) {
12621274
running = true;
12631275

12641276
while (true) {
@@ -1291,9 +1303,19 @@ struct server_queue {
12911303
QUE_DBG("%s", "terminate\n");
12921304
return;
12931305
}
1294-
condition_tasks.wait(lock, [&]{
1295-
return (!queue_tasks.empty() || !running);
1296-
});
1306+
const auto pred = [&] {
1307+
return (!queue_tasks.empty() || !running);
1308+
};
1309+
if (standby_timeout > 0) {
1310+
if (!condition_tasks.wait_for(lock, std::chrono::seconds(standby_timeout), pred)) {
1311+
lock.release()->unlock(); // release() detaches the mutex from the unique_lock (so its destructor will not unlock it again) and the mutex is then unlocked by hand; this must happen before calling the shutdown_handler, as it tries to lock the same mutex
1312+
QUE_INF("%s", "stand-by timeout reached\n");
1313+
shutdown_handler(StandbyTimeout{});
1314+
break;
1315+
}
1316+
} else {
1317+
condition_tasks.wait(lock, pred);
1318+
}
12971319
}
12981320
}
12991321
}
@@ -2884,7 +2906,6 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
28842906
LOG_DBG("response: %s\n", res.body.c_str());
28852907
}
28862908

2887-
std::function<void(int)> shutdown_handler;
28882909
std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
28892910

28902911
inline void signal_handler(int signal) {
@@ -2895,7 +2916,7 @@ inline void signal_handler(int signal) {
28952916
exit(1);
28962917
}
28972918

2898-
shutdown_handler(signal);
2919+
shutdown_handler(Signal{ signal });
28992920
}
29002921

29012922
int main(int argc, char ** argv) {
@@ -3935,13 +3956,13 @@ int main(int argc, char ** argv) {
39353956
ctx_server.queue_tasks.on_update_slots(std::bind(
39363957
&server_context::update_slots, &ctx_server));
39373958

3938-
shutdown_handler = [&](int) {
3959+
shutdown_handler = [&](ShutdownReason) {
39393960
ctx_server.queue_tasks.terminate();
39403961
};
39413962

39423963
LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
39433964

3944-
ctx_server.queue_tasks.start_loop();
3965+
ctx_server.queue_tasks.start_loop(params.standby_timeout);
39453966

39463967
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
39473968
struct sigaction sigint_action;

0 commit comments

Comments
 (0)