From 19a852f83a76022356ff97f1c066377986a2ffb3 Mon Sep 17 00:00:00 2001 From: "P. Varet" Date: Sat, 13 Sep 2025 23:00:45 +0100 Subject: [PATCH 1/3] Added support for systemd socket activation. --- CMakeLists.txt | 7 ++- common/CMakeLists.txt | 4 ++ common/arg.cpp | 8 +++ common/common.h | 4 ++ tools/server/CMakeLists.txt | 8 +++ tools/server/server.cpp | 110 ++++++++++++++++++++++++++++-------- 6 files changed, 116 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4bf8b2789ae7b..56cd451b0770b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,9 +91,10 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT}) # 3rd party libs -option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON) -option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" OFF) -option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF) +option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON) +option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" OFF) +option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF) +option(LLAMA_SERVER_SYSTEMD "llama-server: support systemd socket activation and readiness notification (linux only)" OFF) # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 8ab3d445104a7..f25d68bbb196f 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -169,6 +169,10 @@ if (LLAMA_LLGUIDANCE) set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS}) endif () +if (UNIX AND NOT APPLE AND LLAMA_SERVER_SYSTEMD) + target_compile_definitions(${TARGET} PRIVATE LLAMA_CPP_SYSTEMD_SUPPORT) +endif() + target_include_directories(${TARGET} PUBLIC . 
../vendor) target_compile_features (${TARGET} PUBLIC cxx_std_17) target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) diff --git a/common/arg.cpp b/common/arg.cpp index f6a775fc4a804..c5aba6334022b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3376,6 +3376,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.port = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); +#ifdef LLAMA_CPP_SYSTEMD_SUPPORT + add_opt(common_arg({ "--systemd" }, + string_format("use systemd socket and readiness notification (default: %s)", + params.use_systemd ? "enabled" : "disabled"), + [](common_params & params) { params.use_systemd = true; }) + .set_examples({ LLAMA_EXAMPLE_SERVER }) + .set_env("LLAMA_ARG_SYSTEMD")); +#endif // LLAMA_CPP_SYSTEMD_SUPPORT add_opt(common_arg( {"--path"}, "PATH", string_format("path to serve static files from (default: %s)", params.public_path.c_str()), diff --git a/common/common.h b/common/common.h index 40c6847f32ddb..25390826b6756 100644 --- a/common/common.h +++ b/common/common.h @@ -436,6 +436,10 @@ struct common_params { int reasoning_budget = -1; bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response +#ifdef LLAMA_CPP_SYSTEMD_SUPPORT + bool use_systemd = false; // use systemd socket and readiness notification +#endif + std::vector api_keys; std::string ssl_file_key = ""; // NOLINT diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt index 06df3ee49dd33..4c94acd833a85 100644 --- a/tools/server/CMakeLists.txt +++ b/tools/server/CMakeLists.txt @@ -39,4 +39,12 @@ if (WIN32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) endif() +if (UNIX AND NOT APPLE AND LLAMA_SERVER_SYSTEMD) + message(STATUS "LLAMA_SERVER_SYSTEMD is ON, enabling systemd support") + find_package(PkgConfig REQUIRED) + pkg_check_modules(SYSTEMD REQUIRED libsystemd) + target_link_libraries(${TARGET} 
PRIVATE ${SYSTEMD_LIBRARIES}) + target_compile_definitions(${TARGET} PRIVATE LLAMA_CPP_SYSTEMD_SUPPORT) +endif() + target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 6062904a8c7c0..b134f70d77210 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -31,6 +31,12 @@ #include #include +#ifdef LLAMA_CPP_SYSTEMD_SUPPORT +# include +# include +# include +#endif // LLAMA_CPP_SYSTEMD_SUPPORT + using json = nlohmann::ordered_json; constexpr int HTTP_POLLING_SECONDS = 1; @@ -4075,6 +4081,38 @@ inline void signal_handler(int signal) { shutdown_handler(signal); } +#ifdef LLAMA_CPP_SYSTEMD_SUPPORT +// Subclass of httplib::Server that adds systemd socket activation support on systems +// where that's available. +class SystemdServer : public httplib::Server { + public: + bool setup_sd_socket() { + int n = sd_listen_fds(0); + if (n != 1) { + LOG_ERR("%s: sd_listen_fds() returned %d\n", __func__, n); + return false; + } + + int fd = SD_LISTEN_FDS_START; + struct stat statbuf; + if (fstat(fd, &statbuf) == -1 || !S_ISSOCK(statbuf.st_mode)) { + LOG_ERR("%s: fstat() failed or fd is not a socket\n", __func__); + return false; + } + + LOG_INF("%s: using systemd socket fd %d\n", __func__, fd); + svr_sock_ = fd; + return true; + } +}; +#endif // LLAMA_CPP_SYSTEMD_SUPPORT + +#ifdef LLAMA_CPP_SYSTEMD_SUPPORT +# define NEW_SERVER (new SystemdServer()) +#else +# define NEW_SERVER (new httplib::Server()) +#endif // LLAMA_CPP_SYSTEMD_SUPPORT + int main(int argc, char ** argv) { // own arguments required by this example common_params params; @@ -4105,14 +4143,14 @@ int main(int argc, char ** argv) { ); } else { LOG_INF("Running without SSL\n"); - svr.reset(new httplib::Server()); + svr.reset(NEW_SERVER); } #else if (params.ssl_file_key != "" && params.ssl_file_cert != "") { LOG_ERR("Server is built without SSL support\n"); return 1; } - svr.reset(new httplib::Server()); + svr.reset(NEW_SERVER); #endif 
std::atomic state{SERVER_STATE_LOADING_MODEL}; @@ -5281,24 +5319,38 @@ int main(int argc, char ** argv) { }; bool was_bound = false; - bool is_sock = false; - if (string_ends_with(std::string(params.hostname), ".sock")) { - is_sock = true; - LOG_INF("%s: setting address family to AF_UNIX\n", __func__); - svr->set_address_family(AF_UNIX); - // bind_to_port requires a second arg, any value other than 0 should - // simply get ignored - was_bound = svr->bind_to_port(params.hostname, 8080); - } else { - LOG_INF("%s: binding port with default address family\n", __func__); - // bind HTTP listen port - if (params.port == 0) { - int bound_port = svr->bind_to_any_port(params.hostname); - if ((was_bound = (bound_port >= 0))) { - params.port = bound_port; - } + bool is_sock = false; + +#ifdef LLAMA_CPP_SYSTEMD_SUPPORT + bool using_sd_socket = false; + if (params.use_systemd) { + was_bound = static_cast(svr.get())->setup_sd_socket(); + using_sd_socket = was_bound; + if (!was_bound) { + LOG_INF("%s: couldn't set up systemd socket; falling back to opening host:port socket\n", __func__); + } + } +#endif // LLAMA_CPP_SYSTEMD_SUPPORT + + if (!was_bound) { + if (string_ends_with(std::string(params.hostname), ".sock")) { + is_sock = true; + LOG_INF("%s: setting address family to AF_UNIX\n", __func__); + svr->set_address_family(AF_UNIX); + // bind_to_port requires a second arg, any value other than 0 should + // simply get ignored + was_bound = svr->bind_to_port(params.hostname, 8080); } else { - was_bound = svr->bind_to_port(params.hostname, params.port); + LOG_INF("%s: binding port with default address family\n", __func__); + // bind HTTP listen port + if (params.port == 0) { + int bound_port = svr->bind_to_any_port(params.hostname); + if ((was_bound = (bound_port >= 0))) { + params.port = bound_port; + } + } else { + was_bound = svr->bind_to_port(params.hostname, params.port); + } } } @@ -5327,6 +5379,12 @@ int main(int argc, char ** argv) { ctx_server.init(); 
state.store(SERVER_STATE_READY); +#ifdef LLAMA_CPP_SYSTEMD_SUPPORT + if (params.use_systemd) { + sd_notify(0, "READY=1"); + } +#endif + LOG_INF("%s: model loaded\n", __func__); // print sample chat example to make it clear which template is used @@ -5361,9 +5419,17 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - LOG_INF("%s: server is listening on %s - starting the main loop\n", __func__, - is_sock ? string_format("unix://%s", params.hostname.c_str()).c_str() : - string_format("http://%s:%d", params.hostname.c_str(), params.port).c_str()); +#ifdef LLAMA_CPP_SYSTEMD_SUPPORT + if (using_sd_socket) { + LOG_INF("%s: server is listening on systemd socket - starting the main loop\n", __func__); + } else { +#endif // LLAMA_CPP_SYSTEMD_SUPPORT + LOG_INF("%s: server is listening on %s - starting the main loop\n", __func__, + is_sock ? string_format("unix://%s", params.hostname.c_str()).c_str() : + string_format("http://%s:%d", params.hostname.c_str(), params.port).c_str()); +#ifdef LLAMA_CPP_SYSTEMD_SUPPORT + } +#endif // LLAMA_CPP_SYSTEMD_SUPPORT // this call blocks the main thread until queue_tasks.terminate() is called ctx_server.queue_tasks.start_loop(); From 103e3d3c226337b537db3fbbbcf64539dd4d094e Mon Sep 17 00:00:00 2001 From: "P. Varet" Date: Mon, 15 Sep 2025 19:38:48 +0100 Subject: [PATCH 2/3] Don't shut down the socket if it was passed down to us by systemd. systemd owns the lifecycle of the socket, not us. 
--- tools/server/server.cpp | 70 +++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 16 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index b134f70d77210..91a7e94b690f9 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -32,8 +32,6 @@ #include #ifdef LLAMA_CPP_SYSTEMD_SUPPORT -# include -# include # include #endif // LLAMA_CPP_SYSTEMD_SUPPORT @@ -4093,17 +4091,35 @@ class SystemdServer : public httplib::Server { return false; } - int fd = SD_LISTEN_FDS_START; - struct stat statbuf; - if (fstat(fd, &statbuf) == -1 || !S_ISSOCK(statbuf.st_mode)) { - LOG_ERR("%s: fstat() failed or fd is not a socket\n", __func__); + int fd = SD_LISTEN_FDS_START; + if (!sd_is_socket(fd, AF_UNSPEC, SOCK_STREAM, -1)) { + LOG_ERR("%s: fd is not a socket\n", __func__); return false; } LOG_INF("%s: using systemd socket fd %d\n", __func__, fd); - svr_sock_ = fd; + svr_sock_ = fd; + socket_is_sd_ = true; + // Add a timeout to internal processing loop to enable graceful shutdown by + // just closing the socket. + set_idle_interval(1, 0); return true; } + + bool close_sd_socket() { + if (!socket_is_sd_.exchange(false)) { + return false; + } + // If we're using a systemd socket, we don't own it, so we just close it without + // shutdown. 
+ SRV_INF("%s: closing systemd socket...\n", __func__); + std::atomic sock(svr_sock_.exchange(INVALID_SOCKET)); + httplib::detail::close_socket(sock); + return true; + } + + private: + std::atomic socket_is_sd_{ false }; }; #endif // LLAMA_CPP_SYSTEMD_SUPPORT @@ -5310,14 +5326,6 @@ int main(int argc, char ** argv) { log_data["n_threads_http"] = std::to_string(params.n_threads_http); svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); }; - // clean up function, to be called before exit - auto clean_up = [&svr, &ctx_server]() { - SRV_INF("%s: cleaning up before exit...\n", __func__); - svr->stop(); - ctx_server.queue_results.terminate(); - llama_backend_free(); - }; - bool was_bound = false; bool is_sock = false; @@ -5354,6 +5362,27 @@ int main(int argc, char ** argv) { } } + // clean up function, to be called before exit + auto clean_up = [&svr, &ctx_server +#ifdef LLAMA_CPP_SYSTEMD_SUPPORT + , &using_sd_socket +#endif + ]() { + SRV_INF("%s: cleaning up before exit...\n", __func__); +#ifdef LLAMA_CPP_SYSTEMD_SUPPORT + if (!using_sd_socket) { +#endif + svr->stop(); +#ifdef LLAMA_CPP_SYSTEMD_SUPPORT + } else { + sd_notify(0, "STOPPING=1"); + static_cast(svr.get())->close_sd_socket(); + } +#endif + ctx_server.queue_results.terminate(); + llama_backend_free(); + }; + if (!was_bound) { LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port); clean_up(); @@ -5364,7 +5393,16 @@ int main(int argc, char ** argv) { std::thread t([&]() { svr->listen_after_bind(); }); svr->wait_until_ready(); - LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__, params.hostname.c_str(), params.port, params.n_threads_http); +#ifdef LLAMA_CPP_SYSTEMD_SUPPORT + if (!using_sd_socket) { +#endif + LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__, + params.hostname.c_str(), params.port, params.n_threads_http); +#ifdef 
LLAMA_CPP_SYSTEMD_SUPPORT + } else { + LOG_INF("%s: HTTP server is listening on systemd socket, http threads: %d\n", __func__, params.n_threads_http); + } +#endif // load the model LOG_INF("%s: loading model\n", __func__); From 7b29df3d2628c5b2867ed8bf5f69672275ab8571 Mon Sep 17 00:00:00 2001 From: "P. Varet" Date: Sat, 20 Sep 2025 13:11:58 +0100 Subject: [PATCH 3/3] When using systemd, load the model before accepting connections. This avoids a race condition where the client whose connection triggered the service's start would get 503 errors while the model loads. --- tools/server/server.cpp | 53 ++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 91a7e94b690f9..32e48563f9df1 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5251,6 +5251,35 @@ int main(int argc, char ** argv) { res_ok(res, result->to_json()); }; + const auto & do_load_model = [&ctx_server, &params, &state]() -> bool { + // load the model + LOG_INF("%s: loading model\n", __func__); + + if (!ctx_server.load_model(params)) { + return false; + } + + ctx_server.init(); + state.store(SERVER_STATE_READY); + + LOG_INF("%s: model loaded\n", __func__); + return true; + }; + +#ifdef LLAMA_CPP_SYSTEMD_SUPPORT + if (params.use_systemd) { + // When using systemd, load the model before starting to accept on the socket. + // This prevents a race condition where the client whose connection triggered + // this service's start will get 503 errors while the model loads. 
+ if (!do_load_model()) { + LOG_ERR("%s: exiting due to model loading error\n", __func__); + ctx_server.queue_results.terminate(); + llama_backend_free(); + return 1; + } + } +#endif + // // Router // @@ -5402,28 +5431,18 @@ int main(int argc, char ** argv) { } else { LOG_INF("%s: HTTP server is listening on systemd socket, http threads: %d\n", __func__, params.n_threads_http); } -#endif - - // load the model - LOG_INF("%s: loading model\n", __func__); - - if (!ctx_server.load_model(params)) { - clean_up(); - t.join(); - LOG_ERR("%s: exiting due to model loading error\n", __func__); - return 1; - } - - ctx_server.init(); - state.store(SERVER_STATE_READY); - -#ifdef LLAMA_CPP_SYSTEMD_SUPPORT if (params.use_systemd) { sd_notify(0, "READY=1"); } #endif - LOG_INF("%s: model loaded\n", __func__); + if (state.load() != SERVER_STATE_READY) { + if (!do_load_model()) { + clean_up(); + t.join(); + return 1; + } + } // print sample chat example to make it clear which template is used LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,