Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1785,6 +1785,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_cache_reuse = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
add_opt(common_arg(
{"--standby-timeout"}, "N",
string_format("seconds that must pass since a request has been served, before the server stops automatically (default: %d)", params.standby_timeout),
[](common_params & params, int value) {
params.standby_timeout = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STANDBY_TIMEOUT"));
add_opt(common_arg(
{"--metrics"},
string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
Expand Down
1 change: 1 addition & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,7 @@ struct common_params {
int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
int32_t standby_timeout = 0; // seconds the server may wait without receiving a new task before it terminates itself to save resources. Only values > 0 enable this; 0 (the default) or any negative value such as -1 means never terminate automatically.

std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT
Expand Down
1 change: 1 addition & 0 deletions examples/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ The project is under active development, and we are [looking for feedback and co
| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
| `--standby-timeout N` | seconds that must pass since a request has been served, before the server stops automatically; 0 or a negative value disables the automatic stop (default: 0)<br/>(env: LLAMA_ARG_STANDBY_TIMEOUT) |
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
| `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
Expand Down
37 changes: 29 additions & 8 deletions examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <chrono>
#include <variant>

using json = nlohmann::ordered_json;

Expand Down Expand Up @@ -1162,6 +1164,16 @@ struct server_metrics {
}
};

// Shutdown reason: the process received an OS signal.
// `number` carries the signal value passed to signal_handler().
struct termination_signal {
    // Default-initialised so that a bare `termination_signal s;` never holds an
    // indeterminate value; the type stays an aggregate, so `termination_signal{ sig }`
    // keeps working.
    int number = 0;
};

// Shutdown reason: the task queue saw no new work for the configured stand-by timeout.
struct standby_timeout {};

// Why the server is being asked to shut down; handed to shutdown_handler.
using shutdown_reason = std::variant<termination_signal, standby_timeout>;

// Global shutdown hook. It is invoked from signal_handler() and from
// server_queue::start_loop() when the stand-by timeout expires.
// It is empty until main() assigns the actual handler.
std::function<void(shutdown_reason)> shutdown_handler;

struct server_queue {
int id = 0;
bool running;
Expand Down Expand Up @@ -1258,7 +1270,7 @@ struct server_queue {
* - Check if multitask is finished
* - Update all slots
*/
void start_loop() {
void start_loop(int standby_timeout) {
running = true;

while (true) {
Expand Down Expand Up @@ -1291,9 +1303,19 @@ struct server_queue {
QUE_DBG("%s", "terminate\n");
return;
}
condition_tasks.wait(lock, [&]{
return (!queue_tasks.empty() || !running);
});
const auto pred = [&] {
return (!queue_tasks.empty() || !running);
};
if (standby_timeout > 0) {
if (!condition_tasks.wait_for(lock, std::chrono::seconds(standby_timeout), pred)) {
lock.release()->unlock(); // unlock the unique_lock, before calling the shutdown_handler, as it tries to lock it
QUE_INF("%s", "stand-by timeout reached\n");
shutdown_handler(::standby_timeout{});
break;
}
} else {
condition_tasks.wait(lock, pred);
}
}
}
}
Expand Down Expand Up @@ -2884,7 +2906,6 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
LOG_DBG("response: %s\n", res.body.c_str());
}

std::function<void(int)> shutdown_handler;
std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;

inline void signal_handler(int signal) {
Expand All @@ -2895,7 +2916,7 @@ inline void signal_handler(int signal) {
exit(1);
}

shutdown_handler(signal);
shutdown_handler(termination_signal{ signal });
}

int main(int argc, char ** argv) {
Expand Down Expand Up @@ -3935,13 +3956,13 @@ int main(int argc, char ** argv) {
ctx_server.queue_tasks.on_update_slots(std::bind(
&server_context::update_slots, &ctx_server));

shutdown_handler = [&](int) {
shutdown_handler = [&](shutdown_reason) {
ctx_server.queue_tasks.terminate();
};

LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);

ctx_server.queue_tasks.start_loop();
ctx_server.queue_tasks.start_loop(params.standby_timeout);

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
Expand Down
Loading