From 19a852f83a76022356ff97f1c066377986a2ffb3 Mon Sep 17 00:00:00 2001
From: "P. Varet" <p.varet@gmail.com>
Date: Sat, 13 Sep 2025 23:00:45 +0100
Subject: [PATCH 1/3] Added support for systemd socket activation.

---
 CMakeLists.txt              |   7 ++-
 common/CMakeLists.txt       |   4 ++
 common/arg.cpp              |   8 +++
 common/common.h             |   4 ++
 tools/server/CMakeLists.txt |   8 +++
 tools/server/server.cpp     | 110 ++++++++++++++++++++++++++++--------
 6 files changed, 116 insertions(+), 25 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4bf8b2789ae7b..56cd451b0770b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -91,9 +91,10 @@ option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})
 option(LLAMA_TOOLS_INSTALL  "llama: install tools"        ${LLAMA_TOOLS_INSTALL_DEFAULT})
 
 # 3rd party libs
-option(LLAMA_CURL       "llama: use libcurl to download model from an URL" ON)
-option(LLAMA_OPENSSL    "llama: use openssl to support HTTPS" OFF)
-option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
+option(LLAMA_CURL           "llama: use libcurl to download model from an URL" ON)
+option(LLAMA_OPENSSL        "llama: use openssl to support HTTPS" OFF)
+option(LLAMA_LLGUIDANCE     "llama-common: include LLGuidance library for structured output in common utils" OFF)
+option(LLAMA_SERVER_SYSTEMD "llama-server: support systemd socket activation and readiness notification (linux only)" OFF)
 
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 8ab3d445104a7..f25d68bbb196f 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -169,6 +169,10 @@ if (LLAMA_LLGUIDANCE)
     set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
 endif ()
 
+if (UNIX AND NOT APPLE AND LLAMA_SERVER_SYSTEMD)
+    target_compile_definitions(${TARGET} PUBLIC LLAMA_CPP_SYSTEMD_SUPPORT)  # PUBLIC: gates a common_params member in common.h
+endif()
+
 target_include_directories(${TARGET} PUBLIC . ../vendor)
 target_compile_features   (${TARGET} PUBLIC cxx_std_17)
 target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
diff --git a/common/arg.cpp b/common/arg.cpp
index f6a775fc4a804..c5aba6334022b 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3376,6 +3376,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.port = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+    add_opt(common_arg({ "--systemd" },
+                       string_format("use systemd socket and readiness notification (default: %s)",
+                                     params.use_systemd ? "enabled" : "disabled"),
+                       [](common_params & params) { params.use_systemd = true; })
+                .set_examples({ LLAMA_EXAMPLE_SERVER })
+                .set_env("LLAMA_ARG_SYSTEMD"));
+#endif  // LLAMA_CPP_SYSTEMD_SUPPORT
     add_opt(common_arg(
         {"--path"}, "PATH",
         string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
diff --git a/common/common.h b/common/common.h
index 40c6847f32ddb..25390826b6756 100644
--- a/common/common.h
+++ b/common/common.h
@@ -436,6 +436,10 @@ struct common_params {
     int reasoning_budget = -1;
     bool prefill_assistant = true;                                                                          // if true, any trailing assistant message will be prefilled into the response
 
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+    bool use_systemd = false;  // use systemd socket and readiness notification
+#endif
+
     std::vector<std::string> api_keys;
 
     std::string ssl_file_key  = "";                                                                         // NOLINT
diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt
index 06df3ee49dd33..4c94acd833a85 100644
--- a/tools/server/CMakeLists.txt
+++ b/tools/server/CMakeLists.txt
@@ -39,4 +39,12 @@ if (WIN32)
     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
 
+if (UNIX AND NOT APPLE AND LLAMA_SERVER_SYSTEMD)
+    message(STATUS "LLAMA_SERVER_SYSTEMD is ON, enabling systemd support")
+    find_package(PkgConfig REQUIRED)
+    pkg_check_modules(SYSTEMD REQUIRED IMPORTED_TARGET libsystemd)  # imported target carries include/link dirs too
+    target_link_libraries(${TARGET} PRIVATE PkgConfig::SYSTEMD)
+    target_compile_definitions(${TARGET} PRIVATE LLAMA_CPP_SYSTEMD_SUPPORT)
+endif()
+
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 6062904a8c7c0..b134f70d77210 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -31,6 +31,12 @@
 #include <unordered_map>
 #include <unordered_set>
 
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+#    include <sys/socket.h>
+#    include <sys/stat.h>
+#    include <systemd/sd-daemon.h>
+#endif  // LLAMA_CPP_SYSTEMD_SUPPORT
+
 using json = nlohmann::ordered_json;
 
 constexpr int HTTP_POLLING_SECONDS = 1;
@@ -4075,6 +4081,38 @@ inline void signal_handler(int signal) {
     shutdown_handler(signal);
 }
 
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+// Subclass of httplib::Server that adds systemd socket activation support on systems
+// where that's available.
+class SystemdServer : public httplib::Server {
+  public:
+    bool setup_sd_socket() {
+        int n = sd_listen_fds(0);
+        if (n != 1) {
+            LOG_ERR("%s: sd_listen_fds() returned %d\n", __func__, n);
+            return false;
+        }
+
+        int         fd = SD_LISTEN_FDS_START;
+        struct stat statbuf;
+        if (fstat(fd, &statbuf) == -1 || !S_ISSOCK(statbuf.st_mode)) {
+            LOG_ERR("%s: fstat() failed or fd is not a socket\n", __func__);
+            return false;
+        }
+
+        LOG_INF("%s: using systemd socket fd %d\n", __func__, fd);
+        svr_sock_ = fd;
+        return true;
+    }
+};
+#endif  // LLAMA_CPP_SYSTEMD_SUPPORT
+
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+#    define NEW_SERVER (new SystemdServer())
+#else
+#    define NEW_SERVER (new httplib::Server())
+#endif  // LLAMA_CPP_SYSTEMD_SUPPORT
+
 int main(int argc, char ** argv) {
     // own arguments required by this example
     common_params params;
@@ -4105,14 +4143,14 @@ int main(int argc, char ** argv) {
         );
     } else {
         LOG_INF("Running without SSL\n");
-        svr.reset(new httplib::Server());
+        svr.reset(NEW_SERVER);
     }
 #else
     if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
         LOG_ERR("Server is built without SSL support\n");
         return 1;
     }
-    svr.reset(new httplib::Server());
+    svr.reset(NEW_SERVER);
 #endif
 
     std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
@@ -5281,24 +5319,38 @@ int main(int argc, char ** argv) {
     };
 
     bool was_bound = false;
-    bool is_sock = false;
-    if (string_ends_with(std::string(params.hostname), ".sock")) {
-        is_sock = true;
-        LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
-        svr->set_address_family(AF_UNIX);
-        // bind_to_port requires a second arg, any value other than 0 should
-        // simply get ignored
-        was_bound = svr->bind_to_port(params.hostname, 8080);
-    } else {
-        LOG_INF("%s: binding port with default address family\n", __func__);
-        // bind HTTP listen port
-        if (params.port == 0) {
-            int bound_port = svr->bind_to_any_port(params.hostname);
-            if ((was_bound = (bound_port >= 0))) {
-                params.port = bound_port;
-            }
+    bool is_sock   = false;
+
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+    bool using_sd_socket = false;
+    if (params.use_systemd) {
+        auto * sd_svr   = dynamic_cast<SystemdServer *>(svr.get());  // nullptr unless svr really is a SystemdServer
+        using_sd_socket = was_bound = (sd_svr != nullptr && sd_svr->setup_sd_socket());
+        if (!was_bound) {
+            LOG_INF("%s: couldn't set up systemd socket; falling back to opening host:port socket\n", __func__);
+        }
+    }
+#endif  // LLAMA_CPP_SYSTEMD_SUPPORT
+
+    if (!was_bound) {
+        if (string_ends_with(std::string(params.hostname), ".sock")) {
+            is_sock = true;
+            LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
+            svr->set_address_family(AF_UNIX);
+            // bind_to_port requires a second arg, any value other than 0 should
+            // simply get ignored
+            was_bound = svr->bind_to_port(params.hostname, 8080);
         } else {
-            was_bound = svr->bind_to_port(params.hostname, params.port);
+            LOG_INF("%s: binding port with default address family\n", __func__);
+            // bind HTTP listen port
+            if (params.port == 0) {
+                int bound_port = svr->bind_to_any_port(params.hostname);
+                if ((was_bound = (bound_port >= 0))) {
+                    params.port = bound_port;
+                }
+            } else {
+                was_bound = svr->bind_to_port(params.hostname, params.port);
+            }
         }
     }
 
@@ -5327,6 +5379,12 @@ int main(int argc, char ** argv) {
     ctx_server.init();
     state.store(SERVER_STATE_READY);
 
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+    if (params.use_systemd) {
+        sd_notify(0, "READY=1");
+    }
+#endif
+
     LOG_INF("%s: model loaded\n", __func__);
 
     // print sample chat example to make it clear which template is used
@@ -5361,9 +5419,17 @@ int main(int argc, char ** argv) {
     SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
 
-    LOG_INF("%s: server is listening on %s - starting the main loop\n", __func__,
-            is_sock ? string_format("unix://%s", params.hostname.c_str()).c_str() :
-                      string_format("http://%s:%d", params.hostname.c_str(), params.port).c_str());
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+    if (using_sd_socket) {
+        LOG_INF("%s: server is listening on systemd socket - starting the main loop\n", __func__);
+    } else {
+#endif  // LLAMA_CPP_SYSTEMD_SUPPORT
+        LOG_INF("%s: server is listening on %s - starting the main loop\n", __func__,
+                is_sock ? string_format("unix://%s", params.hostname.c_str()).c_str() :
+                          string_format("http://%s:%d", params.hostname.c_str(), params.port).c_str());
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+    }
+#endif  // LLAMA_CPP_SYSTEMD_SUPPORT
 
     // this call blocks the main thread until queue_tasks.terminate() is called
     ctx_server.queue_tasks.start_loop();

From 103e3d3c226337b537db3fbbbcf64539dd4d094e Mon Sep 17 00:00:00 2001
From: "P. Varet" <p.varet@gmail.com>
Date: Mon, 15 Sep 2025 19:38:48 +0100
Subject: [PATCH 2/3] Don't shut down the socket if it was passed down to us by
 systemd. systemd owns the lifecycle of the socket, not us.

---
 tools/server/server.cpp | 70 +++++++++++++++++++++++++++++++----------
 1 file changed, 54 insertions(+), 16 deletions(-)

diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index b134f70d77210..91a7e94b690f9 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -32,8 +32,6 @@
 #include <unordered_set>
 
 #ifdef LLAMA_CPP_SYSTEMD_SUPPORT
-#    include <sys/socket.h>
-#    include <sys/stat.h>
 #    include <systemd/sd-daemon.h>
 #endif  // LLAMA_CPP_SYSTEMD_SUPPORT
 
@@ -4093,17 +4091,35 @@ class SystemdServer : public httplib::Server {
             return false;
         }
 
-        int         fd = SD_LISTEN_FDS_START;
-        struct stat statbuf;
-        if (fstat(fd, &statbuf) == -1 || !S_ISSOCK(statbuf.st_mode)) {
-            LOG_ERR("%s: fstat() failed or fd is not a socket\n", __func__);
+        int fd = SD_LISTEN_FDS_START;
+        if (sd_is_socket(fd, AF_UNSPEC, SOCK_STREAM, -1) <= 0) {  // 0: no match, <0: negative errno on failure
+            LOG_ERR("%s: fd is not a socket\n", __func__);
             return false;
         }
 
         LOG_INF("%s: using systemd socket fd %d\n", __func__, fd);
-        svr_sock_ = fd;
+        svr_sock_     = fd;
+        socket_is_sd_ = true;
+        // Add a timeout to internal processing loop to enable graceful shutdown by
+        // just closing the socket.
+        set_idle_interval(1, 0);
         return true;
     }
+
+    bool close_sd_socket() {
+        if (!socket_is_sd_.exchange(false)) {
+            return false;
+        }
+        // If we're using a systemd socket, we don't own it, so we just close it without
+        // shutdown.
+        SRV_INF("%s: closing systemd socket...\n", __func__);
+        std::atomic<socket_t> sock(svr_sock_.exchange(INVALID_SOCKET));
+        httplib::detail::close_socket(sock);
+        return true;
+    }
+
+  private:
+    std::atomic<bool> socket_is_sd_{ false };
 };
 #endif  // LLAMA_CPP_SYSTEMD_SUPPORT
 
@@ -5310,14 +5326,6 @@ int main(int argc, char ** argv) {
     log_data["n_threads_http"] =  std::to_string(params.n_threads_http);
     svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
 
-    // clean up function, to be called before exit
-    auto clean_up = [&svr, &ctx_server]() {
-        SRV_INF("%s: cleaning up before exit...\n", __func__);
-        svr->stop();
-        ctx_server.queue_results.terminate();
-        llama_backend_free();
-    };
-
     bool was_bound = false;
     bool is_sock   = false;
 
@@ -5354,6 +5362,27 @@ int main(int argc, char ** argv) {
         }
     }
 
+    // clean up function, to be called before exit
+    auto clean_up = [&svr, &ctx_server
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+                     , &using_sd_socket
+#endif
+    ]() {
+        SRV_INF("%s: cleaning up before exit...\n", __func__);
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+        if (!using_sd_socket) {
+#endif
+            svr->stop();
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+        } else {
+            sd_notify(0, "STOPPING=1");
+            static_cast<SystemdServer *>(svr.get())->close_sd_socket();
+        }
+#endif
+        ctx_server.queue_results.terminate();
+        llama_backend_free();
+    };
+
     if (!was_bound) {
         LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port);
         clean_up();
@@ -5364,7 +5393,16 @@ int main(int argc, char ** argv) {
     std::thread t([&]() { svr->listen_after_bind(); });
     svr->wait_until_ready();
 
-    LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__, params.hostname.c_str(), params.port, params.n_threads_http);
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+    if (!using_sd_socket) {
+#endif
+        LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__,
+                params.hostname.c_str(), params.port, params.n_threads_http);
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+    } else {
+        LOG_INF("%s: HTTP server is listening on systemd socket, http threads: %d\n", __func__, params.n_threads_http);
+    }
+#endif
 
     // load the model
     LOG_INF("%s: loading model\n", __func__);

From 7b29df3d2628c5b2867ed8bf5f69672275ab8571 Mon Sep 17 00:00:00 2001
From: "P. Varet" <p.varet@gmail.com>
Date: Sat, 20 Sep 2025 13:11:58 +0100
Subject: [PATCH 3/3] When using systemd, load the model before accepting
 connections.

This avoids a race condition where the client whose connection triggered
the service's start would get 503 errors while the model loads.
---
 tools/server/server.cpp | 53 ++++++++++++++++++++++++++++-------------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 91a7e94b690f9..32e48563f9df1 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -5251,6 +5251,35 @@ int main(int argc, char ** argv) {
         res_ok(res, result->to_json());
     };
 
+    const auto & do_load_model = [&ctx_server, &params, &state]() -> bool {
+        // load the model
+        LOG_INF("%s: loading model\n", __func__);
+
+        if (!ctx_server.load_model(params)) {
+            return false;
+        }
+
+        ctx_server.init();
+        state.store(SERVER_STATE_READY);
+
+        LOG_INF("%s: model loaded\n", __func__);
+        return true;
+    };
+
+#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
+    if (params.use_systemd) {
+        // When using systemd, load the model before starting to accept on the socket.
+        // This prevents a race condition where the client whose connection triggered
+        // this service's start will get 503 errors while the model loads.
+        if (!do_load_model()) {
+            LOG_ERR("%s: exiting due to model loading error\n", __func__);
+            ctx_server.queue_results.terminate();
+            llama_backend_free();
+            return 1;
+        }
+    }
+#endif
+
     //
     // Router
     //
@@ -5402,28 +5431,18 @@ int main(int argc, char ** argv) {
     } else {
         LOG_INF("%s: HTTP server is listening on systemd socket, http threads: %d\n", __func__, params.n_threads_http);
     }
-#endif
-
-    // load the model
-    LOG_INF("%s: loading model\n", __func__);
-
-    if (!ctx_server.load_model(params)) {
-        clean_up();
-        t.join();
-        LOG_ERR("%s: exiting due to model loading error\n", __func__);
-        return 1;
-    }
-
-    ctx_server.init();
-    state.store(SERVER_STATE_READY);
-
-#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
     if (params.use_systemd) {
         sd_notify(0, "READY=1");
     }
 #endif
 
-    LOG_INF("%s: model loaded\n", __func__);
+    if (state.load() != SERVER_STATE_READY) {
+        if (!do_load_model()) {
+            clean_up();
+            t.join();
+            return 1;
+        }
+    }
 
     // print sample chat example to make it clear which template is used
     LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,