From 19a852f83a76022356ff97f1c066377986a2ffb3 Mon Sep 17 00:00:00 2001
From: "P. Varet"
Date: Sat, 13 Sep 2025 23:00:45 +0100
Subject: [PATCH 1/3] Added support for systemd socket activation.
---
CMakeLists.txt | 7 ++-
common/CMakeLists.txt | 4 ++
common/arg.cpp | 8 +++
common/common.h | 4 ++
tools/server/CMakeLists.txt | 8 +++
tools/server/server.cpp | 110 ++++++++++++++++++++++++++++--------
6 files changed, 116 insertions(+), 25 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4bf8b2789ae7b..56cd451b0770b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -91,9 +91,10 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
# 3rd party libs
-option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
-option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" OFF)
-option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
+option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
+option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" OFF)
+option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
+option(LLAMA_SERVER_SYSTEMD "llama-server: support systemd socket activation and readiness notification (linux only)" OFF)
# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 8ab3d445104a7..f25d68bbb196f 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -169,6 +169,10 @@ if (LLAMA_LLGUIDANCE)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
endif ()
+if (UNIX AND NOT APPLE AND LLAMA_SERVER_SYSTEMD)
+ target_compile_definitions(${TARGET} PRIVATE LLAMA_CPP_SYSTEMD_SUPPORT)
+endif()
+
target_include_directories(${TARGET} PUBLIC . ../vendor)
target_compile_features (${TARGET} PUBLIC cxx_std_17)
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
diff --git a/common/arg.cpp b/common/arg.cpp
index f6a775fc4a804..c5aba6334022b 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3376,6 +3376,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.port = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+ add_opt(common_arg({ "--systemd" },
+ string_format("use systemd socket and readiness notification (default: %s)",
+ params.use_systemd ? "enabled" : "disabled"),
+ [](common_params & params) { params.use_systemd = true; })
+ .set_examples({ LLAMA_EXAMPLE_SERVER })
+ .set_env("LLAMA_ARG_SYSTEMD"));
+#endif // LLAMA_CPP_SYSTEMD_SUPPORT
add_opt(common_arg(
{"--path"}, "PATH",
string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
diff --git a/common/common.h b/common/common.h
index 40c6847f32ddb..25390826b6756 100644
--- a/common/common.h
+++ b/common/common.h
@@ -436,6 +436,10 @@ struct common_params {
int reasoning_budget = -1;
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+ bool use_systemd = false; // use systemd socket and readiness notification
+#endif
+
std::vector<std::string> api_keys;
std::string ssl_file_key = ""; // NOLINT
diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt
index 06df3ee49dd33..4c94acd833a85 100644
--- a/tools/server/CMakeLists.txt
+++ b/tools/server/CMakeLists.txt
@@ -39,4 +39,12 @@ if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
+if (UNIX AND NOT APPLE AND LLAMA_SERVER_SYSTEMD)
+ message(STATUS "LLAMA_SERVER_SYSTEMD is ON, enabling systemd support")
+ find_package(PkgConfig REQUIRED)
+ pkg_check_modules(SYSTEMD REQUIRED libsystemd)
+ target_link_libraries(${TARGET} PRIVATE ${SYSTEMD_LIBRARIES})
+ target_compile_definitions(${TARGET} PRIVATE LLAMA_CPP_SYSTEMD_SUPPORT)
+endif()
+
target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 6062904a8c7c0..b134f70d77210 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -31,6 +31,12 @@
#include <unordered_map>
#include <unordered_set>
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+# include <sys/stat.h>
+# include <sys/types.h>
+# include <systemd/sd-daemon.h>
+#endif // LLAMA_CPP_SYSTEMD_SUPPORT
+
using json = nlohmann::ordered_json;
constexpr int HTTP_POLLING_SECONDS = 1;
@@ -4075,6 +4081,38 @@ inline void signal_handler(int signal) {
shutdown_handler(signal);
}
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+// Subclass of httplib::Server that adds systemd socket activation support on systems
+// where that's available.
+class SystemdServer : public httplib::Server {
+ public:
+ bool setup_sd_socket() {
+ int n = sd_listen_fds(0);
+ if (n != 1) {
+ LOG_ERR("%s: sd_listen_fds() returned %d\n", __func__, n);
+ return false;
+ }
+
+ int fd = SD_LISTEN_FDS_START;
+ struct stat statbuf;
+ if (fstat(fd, &statbuf) == -1 || !S_ISSOCK(statbuf.st_mode)) {
+ LOG_ERR("%s: fstat() failed or fd is not a socket\n", __func__);
+ return false;
+ }
+
+ LOG_INF("%s: using systemd socket fd %d\n", __func__, fd);
+ svr_sock_ = fd;
+ return true;
+ }
+};
+#endif // LLAMA_CPP_SYSTEMD_SUPPORT
+
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+# define NEW_SERVER (new SystemdServer())
+#else
+# define NEW_SERVER (new httplib::Server())
+#endif // LLAMA_CPP_SYSTEMD_SUPPORT
+
int main(int argc, char ** argv) {
// own arguments required by this example
common_params params;
@@ -4105,14 +4143,14 @@ int main(int argc, char ** argv) {
);
} else {
LOG_INF("Running without SSL\n");
- svr.reset(new httplib::Server());
+ svr.reset(NEW_SERVER);
}
#else
if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
LOG_ERR("Server is built without SSL support\n");
return 1;
}
- svr.reset(new httplib::Server());
+ svr.reset(NEW_SERVER);
#endif
std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
@@ -5281,24 +5319,38 @@ int main(int argc, char ** argv) {
};
bool was_bound = false;
- bool is_sock = false;
- if (string_ends_with(std::string(params.hostname), ".sock")) {
- is_sock = true;
- LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
- svr->set_address_family(AF_UNIX);
- // bind_to_port requires a second arg, any value other than 0 should
- // simply get ignored
- was_bound = svr->bind_to_port(params.hostname, 8080);
- } else {
- LOG_INF("%s: binding port with default address family\n", __func__);
- // bind HTTP listen port
- if (params.port == 0) {
- int bound_port = svr->bind_to_any_port(params.hostname);
- if ((was_bound = (bound_port >= 0))) {
- params.port = bound_port;
- }
+ bool is_sock = false;
+
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+ bool using_sd_socket = false;
+ if (params.use_systemd) {
+ was_bound = static_cast<SystemdServer *>(svr.get())->setup_sd_socket();
+ using_sd_socket = was_bound;
+ if (!was_bound) {
+ LOG_INF("%s: couldn't set up systemd socket; falling back to opening host:port socket\n", __func__);
+ }
+ }
+#endif // LLAMA_CPP_SYSTEMD_SUPPORT
+
+ if (!was_bound) {
+ if (string_ends_with(std::string(params.hostname), ".sock")) {
+ is_sock = true;
+ LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
+ svr->set_address_family(AF_UNIX);
+ // bind_to_port requires a second arg, any value other than 0 should
+ // simply get ignored
+ was_bound = svr->bind_to_port(params.hostname, 8080);
} else {
- was_bound = svr->bind_to_port(params.hostname, params.port);
+ LOG_INF("%s: binding port with default address family\n", __func__);
+ // bind HTTP listen port
+ if (params.port == 0) {
+ int bound_port = svr->bind_to_any_port(params.hostname);
+ if ((was_bound = (bound_port >= 0))) {
+ params.port = bound_port;
+ }
+ } else {
+ was_bound = svr->bind_to_port(params.hostname, params.port);
+ }
}
}
@@ -5327,6 +5379,12 @@ int main(int argc, char ** argv) {
ctx_server.init();
state.store(SERVER_STATE_READY);
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+ if (params.use_systemd) {
+ sd_notify(0, "READY=1");
+ }
+#endif
+
LOG_INF("%s: model loaded\n", __func__);
// print sample chat example to make it clear which template is used
@@ -5361,9 +5419,17 @@ int main(int argc, char ** argv) {
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
- LOG_INF("%s: server is listening on %s - starting the main loop\n", __func__,
- is_sock ? string_format("unix://%s", params.hostname.c_str()).c_str() :
- string_format("http://%s:%d", params.hostname.c_str(), params.port).c_str());
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+ if (using_sd_socket) {
+ LOG_INF("%s: server is listening on systemd socket - starting the main loop\n", __func__);
+ } else {
+#endif // LLAMA_CPP_SYSTEMD_SUPPORT
+ LOG_INF("%s: server is listening on %s - starting the main loop\n", __func__,
+ is_sock ? string_format("unix://%s", params.hostname.c_str()).c_str() :
+ string_format("http://%s:%d", params.hostname.c_str(), params.port).c_str());
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+ }
+#endif // LLAMA_CPP_SYSTEMD_SUPPORT
// this call blocks the main thread until queue_tasks.terminate() is called
ctx_server.queue_tasks.start_loop();
From 103e3d3c226337b537db3fbbbcf64539dd4d094e Mon Sep 17 00:00:00 2001
From: "P. Varet"
Date: Mon, 15 Sep 2025 19:38:48 +0100
Subject: [PATCH 2/3] Don't shut down the socket if it was passed down to us by
systemd. systemd owns the lifecycle of the socket, not us.
---
tools/server/server.cpp | 70 +++++++++++++++++++++++++++++++----------
1 file changed, 54 insertions(+), 16 deletions(-)
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index b134f70d77210..91a7e94b690f9 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -32,8 +32,6 @@
#include <unordered_set>
#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
-# include <sys/stat.h>
-# include <sys/types.h>
# include <systemd/sd-daemon.h>
#endif // LLAMA_CPP_SYSTEMD_SUPPORT
@@ -4093,17 +4091,35 @@ class SystemdServer : public httplib::Server {
return false;
}
- int fd = SD_LISTEN_FDS_START;
- struct stat statbuf;
- if (fstat(fd, &statbuf) == -1 || !S_ISSOCK(statbuf.st_mode)) {
- LOG_ERR("%s: fstat() failed or fd is not a socket\n", __func__);
+ int fd = SD_LISTEN_FDS_START;
+ if (!sd_is_socket(fd, AF_UNSPEC, SOCK_STREAM, -1)) {
+ LOG_ERR("%s: fd is not a socket\n", __func__);
return false;
}
LOG_INF("%s: using systemd socket fd %d\n", __func__, fd);
- svr_sock_ = fd;
+ svr_sock_ = fd;
+ socket_is_sd_ = true;
+ // Add a timeout to internal processing loop to enable graceful shutdown by
+ // just closing the socket.
+ set_idle_interval(1, 0);
return true;
}
+
+ bool close_sd_socket() {
+ if (!socket_is_sd_.exchange(false)) {
+ return false;
+ }
+ // If we're using a systemd socket, we don't own it, so we just close it without
+ // shutdown.
+ SRV_INF("%s: closing systemd socket...\n", __func__);
+ std::atomic<socket_t> sock(svr_sock_.exchange(INVALID_SOCKET));
+ httplib::detail::close_socket(sock);
+ return true;
+ }
+
+ private:
+ std::atomic<bool> socket_is_sd_{ false };
};
#endif // LLAMA_CPP_SYSTEMD_SUPPORT
@@ -5310,14 +5326,6 @@ int main(int argc, char ** argv) {
log_data["n_threads_http"] = std::to_string(params.n_threads_http);
svr->new_task_queue = [¶ms] { return new httplib::ThreadPool(params.n_threads_http); };
- // clean up function, to be called before exit
- auto clean_up = [&svr, &ctx_server]() {
- SRV_INF("%s: cleaning up before exit...\n", __func__);
- svr->stop();
- ctx_server.queue_results.terminate();
- llama_backend_free();
- };
-
bool was_bound = false;
bool is_sock = false;
@@ -5354,6 +5362,27 @@ int main(int argc, char ** argv) {
}
}
+ // clean up function, to be called before exit
+ auto clean_up = [&svr, &ctx_server
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+ , &using_sd_socket
+#endif
+ ]() {
+ SRV_INF("%s: cleaning up before exit...\n", __func__);
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+ if (!using_sd_socket) {
+#endif
+ svr->stop();
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+ } else {
+ sd_notify(0, "STOPPING=1");
+ static_cast<SystemdServer *>(svr.get())->close_sd_socket();
+ }
+#endif
+ ctx_server.queue_results.terminate();
+ llama_backend_free();
+ };
+
if (!was_bound) {
LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port);
clean_up();
@@ -5364,7 +5393,16 @@ int main(int argc, char ** argv) {
std::thread t([&]() { svr->listen_after_bind(); });
svr->wait_until_ready();
- LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__, params.hostname.c_str(), params.port, params.n_threads_http);
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+ if (!using_sd_socket) {
+#endif
+ LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__,
+ params.hostname.c_str(), params.port, params.n_threads_http);
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+ } else {
+ LOG_INF("%s: HTTP server is listening on systemd socket, http threads: %d\n", __func__, params.n_threads_http);
+ }
+#endif
// load the model
LOG_INF("%s: loading model\n", __func__);
From 7b29df3d2628c5b2867ed8bf5f69672275ab8571 Mon Sep 17 00:00:00 2001
From: "P. Varet"
Date: Sat, 20 Sep 2025 13:11:58 +0100
Subject: [PATCH 3/3] When using systemd, load the model before accepting
connections.
This avoids a race condition where the client whose connection triggered
the service's start would get errors 503 while the model loads.
---
tools/server/server.cpp | 53 ++++++++++++++++++++++++++++-------------
1 file changed, 36 insertions(+), 17 deletions(-)
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 91a7e94b690f9..32e48563f9df1 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -5251,6 +5251,35 @@ int main(int argc, char ** argv) {
res_ok(res, result->to_json());
};
+ const auto & do_load_model = [&ctx_server, &params, &state]() -> bool {
+ // load the model
+ LOG_INF("%s: loading model\n", __func__);
+
+ if (!ctx_server.load_model(params)) {
+ return false;
+ }
+
+ ctx_server.init();
+ state.store(SERVER_STATE_READY);
+
+ LOG_INF("%s: model loaded\n", __func__);
+ return true;
+ };
+
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+ if (params.use_systemd) {
+ // When using systemd, load the model before starting to accept on the socket.
+ // This prevents a race condition where the client whose connection triggered
+ // this service's start will get 503 errors while the model loads.
+ if (!do_load_model()) {
+ LOG_ERR("%s: exiting due to model loading error\n", __func__);
+ ctx_server.queue_results.terminate();
+ llama_backend_free();
+ return 1;
+ }
+ }
+#endif
+
//
// Router
//
@@ -5402,28 +5431,18 @@ int main(int argc, char ** argv) {
} else {
LOG_INF("%s: HTTP server is listening on systemd socket, http threads: %d\n", __func__, params.n_threads_http);
}
-#endif
-
- // load the model
- LOG_INF("%s: loading model\n", __func__);
-
- if (!ctx_server.load_model(params)) {
- clean_up();
- t.join();
- LOG_ERR("%s: exiting due to model loading error\n", __func__);
- return 1;
- }
-
- ctx_server.init();
- state.store(SERVER_STATE_READY);
-
-#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
if (params.use_systemd) {
sd_notify(0, "READY=1");
}
#endif
- LOG_INF("%s: model loaded\n", __func__);
+ if (state.load() != SERVER_STATE_READY) {
+ if (!do_load_model()) {
+ clean_up();
+ t.join();
+ return 1;
+ }
+ }
// print sample chat example to make it clear which template is used
LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,