|
1 | | -{ withSystem, ... }: |
| 1 | +{ ... }: |
2 | 2 | { |
3 | 3 | flake.modules.nixos.healthcheck = |
4 | 4 | # /etc/nixos/modules/systemd-healthcheck.nix |
5 | 5 | { |
6 | 6 | config, |
7 | | - options, |
8 | 7 | lib, |
9 | 8 | pkgs, |
10 | 9 | ... |
|
13 | 12 | with lib; |
14 | 13 |
|
15 | 14 | let |
16 | | - # Options for the liveness probe (timer-based check) |
17 | | - mkLivenessProbeOptions = { |
18 | | - options = { |
19 | | - enable = mkEnableOption "the liveness probe"; |
20 | 15 |
|
21 | | - command = mkOption { |
22 | | - type = types.lines; |
23 | | - description = "The command to execute for the liveness check."; |
24 | | - }; |
| 16 | + mkProbeOptions = x: { |
| 17 | + options = |
| 18 | + { |
| 19 | + enable = mkEnableOption "the ${x} probe"; |
25 | 20 |
|
26 | | - initialDelaySeconds = mkOption { |
27 | | - type = types.int; |
28 | | - default = 15; |
29 | | - description = "Seconds to wait after the service is up before the first liveness probe."; |
30 | | - }; |
| 21 | + command = mkOption { |
| 22 | + type = types.lines; |
| 23 | + description = "The command to execute for the ${x} check."; |
| 24 | + }; |
31 | 25 |
|
32 | | - periodSeconds = mkOption { |
33 | | - type = types.int; |
34 | | - default = 30; |
35 | | - description = "How often (in seconds) to perform the probe."; |
36 | | - }; |
| 26 | + initialDelaySeconds = mkOption { |
| 27 | + type = types.int; |
| 28 | + default = 15; |
| 29 | + description = "Seconds to wait after the service is up before the first ${x} probe."; |
| 30 | + }; |
37 | 31 |
|
38 | | - timeoutSeconds = mkOption { |
39 | | - type = types.int; |
40 | | - default = 10; |
41 | | - description = "Seconds after which the liveness probe command times out."; |
42 | | - }; |
43 | | - }; |
44 | | - }; |
| 32 | + periodSeconds = mkOption { |
| 33 | + type = types.int; |
| 34 | + default = if x == "liveness" then 30 else 2; |
| 35 | + description = "How often (in seconds) to perform the ${x} probe."; |
| 36 | + }; |
45 | 37 |
|
46 | | - # Options for the readiness probe (notify-based check) |
47 | | - mkReadinessProbeOptions = { |
48 | | - options = { |
49 | | - enable = mkEnableOption "the notify readiness probe"; |
| 38 | + timeoutSeconds = mkOption { |
| 39 | + type = types.int; |
| 40 | + default = 10; |
| 41 | + description = "Seconds after which the ${x} probe command times out."; |
| 42 | + }; |
| 43 | + } |
| 44 | + // lib.optionalAttrs (x == "readiness") ({ |
| 45 | + statusWaitingMessage = mkOption { |
| 46 | + type = types.str; |
| 47 | + default = "Service starting, waiting for ready signal..."; |
| 48 | + description = "The status message to send to systemd while waiting."; |
| 49 | + }; |
50 | 50 |
|
51 | | - command = mkOption { |
52 | | - type = types.lines; |
53 | | - description = "The check command to run in a loop. It should output a value on success."; |
54 | | - example = "curl -s localhost:9091/metrics | awk '/^signing_signers_loaded_count /{print int($2)}'"; |
55 | | - }; |
56 | | - |
57 | | - periodSeconds = mkOption { |
58 | | - type = types.int; |
59 | | - default = 2; |
60 | | - description = "How often (in seconds) to perform the readiness check."; |
61 | | - }; |
| 51 | + statusReadyMessage = mkOption { |
| 52 | + type = types.str; |
| 53 | + default = "Service is ready."; |
| 54 | + description = '' |
| 55 | + The status message to send when the service is ready. |
| 56 | + It is passed to systemd-notify as written; the check command's output is discarded, so %OUTPUT% is not substituted. |
| 57 | + ''; |
| 58 | + }; |
| 59 | + }); |
| 60 | + }; |
62 | 61 |
|
63 | | - statusWaiting = mkOption { |
64 | | - type = types.str; |
65 | | - default = "Service starting, waiting for ready signal..."; |
66 | | - description = "The status message to send to systemd while waiting."; |
67 | | - }; |
| 62 | + # Options for the liveness probe (timer-based check) |
| 63 | + mkLivenessProbeOptions = mkProbeOptions "liveness"; |
68 | 64 |
|
69 | | - statusReady = mkOption { |
70 | | - type = types.str; |
71 | | - default = "Service is ready."; |
72 | | - description = '' |
73 | | - The status message to send when the service is ready. |
74 | | - Use %OUTPUT% to substitute the output of the check command. |
75 | | - ''; |
76 | | - }; |
77 | | - }; |
78 | | - }; |
| 65 | + # Options for the readiness probe (notify-based check) |
| 66 | + mkReadinessProbeOptions = mkProbeOptions "readiness"; |
79 | 67 | in |
80 | 68 | { |
81 | 69 |
|
|
91 | 79 | lib.map (serviceName: { |
92 | 80 | assertion = |
93 | 81 | servicesWithHealthcheck."${serviceName}".healthcheck.readiness-probe.enable |
94 | | - && servicesWithHealthcheck."${serviceName}".healthcheck.exec != null; |
| 82 | + -> servicesWithHealthcheck."${serviceName}".healthcheck.exec != null; |
95 | 83 | message = "When healthcheck.readiness-probe is enabled, you must define `healthcheck.exec` with the service command."; |
96 | 84 | }) (lib.attrNames servicesWithHealthcheck) |
97 | 85 | ); |
|
104 | 92 | in |
105 | 93 | { |
106 | 94 | name = "${mainServiceName}-liveness-check"; |
107 | | - value = mkIf (cfg != null && cfg.liveness-probe.enable) ( |
108 | | - let |
109 | | - probe = cfg.liveness-probe; |
110 | | - in |
111 | | - { |
112 | | - description = "Timer for ${mainServiceName} liveness probe"; |
113 | | - timerConfig = { |
114 | | - Unit = "${mainServiceName}-liveness-check.service"; |
115 | | - }; |
116 | | - wantedBy = [ "${mainServiceName}.service" ]; |
117 | | - } |
118 | | - ); |
| 95 | + value = mkIf (cfg != null && cfg.liveness-probe.enable) { |
| 96 | + description = "Timer for ${mainServiceName} liveness probe"; |
| 97 | + timerConfig = { |
| 98 | + Unit = "${mainServiceName}-liveness-check.service"; |
| 99 | + }; |
| 100 | + wantedBy = [ "${mainServiceName}.service" ]; |
| 101 | + }; |
119 | 102 | } |
120 | | - ) (servicesWithHealthcheck); |
| 103 | + ) servicesWithHealthcheck; |
121 | 104 |
|
122 | 105 | services = |
123 | 106 | (lib.mapAttrs ( |
|
145 | 128 | ++ (if cfg ? "path" then cfg.path else [ ]) |
146 | 129 | ++ (if serviceConfig ? "path" then serviceConfig.path else [ ]) |
147 | 130 | ); |
148 | | - successStatus = builtins.replaceStrings [ "%OUTPUT%" ] [ ''''${output}'' ] probeCfg.statusReady; |
149 | 131 | in |
150 | 132 | lib.mkForce ( |
151 | 133 | pkgs.writeShellScript "${mainServiceName}-readiness-check" '' |
|
156 | 138 |
|
157 | 139 | check() { |
158 | 140 | echo "Health check: starting background readiness probe for ${mainServiceName}." |
| 141 | + sleep ${toString probeCfg.initialDelaySeconds} |
159 | 142 | while true; do |
160 | | - local output |
161 | | - output=$(sh -c "${probeCfg.command}" 2>/dev/null) |
162 | | -
|
163 | | - if [[ -n "''${output}" ]]; then |
| 143 | + if (timeout ${toString probeCfg.timeoutSeconds}s ${probeCfg.command} &> /dev/null); then |
164 | 144 | echo "Health check: probe successful. Notifying systemd that service is ready." |
165 | | - systemd-notify --ready --status="${successStatus}" |
| 145 | + systemd-notify --ready --status="${probeCfg.statusReadyMessage}" |
166 | 146 | return 0 |
167 | 147 | else |
168 | 148 | echo "Health check: probe not successful. Notifying systemd that service is still waiting." |
169 | | - systemd-notify --status="${probeCfg.statusWaiting}" |
| 149 | + systemd-notify --status="${probeCfg.statusWaitingMessage}" |
170 | 150 | fi |
171 | 151 | sleep ${toString probeCfg.periodSeconds} |
172 | 152 | done |
|
176 | 156 | check & |
177 | 157 | else |
178 | 158 | echo "Health check: NOTIFY_SOCKET not set. Cannot run readiness probe." >&2 |
| 159 | + exit 1 |
179 | 160 | fi |
180 | 161 |
|
181 | 162 | ${cfg.exec} |
|
193 | 174 | name = "${mainServiceName}-liveness-check"; |
194 | 175 | value = mkIf (cfg != null && cfg.liveness-probe.enable) ( |
195 | 176 | let |
196 | | - probe = cfg.liveness-probe; |
| 177 | + probeCfg = cfg.liveness-probe; |
197 | 178 | checkScript = pkgs.writeShellScriptBin "liveness-check" '' |
198 | 179 | #!${pkgs.runtimeShell} |
199 | | - sleep ${toString probe.initialDelaySeconds} |
| 180 | + sleep ${toString probeCfg.initialDelaySeconds} |
200 | 181 | echo "Executing liveness probe for ${mainServiceName}..." |
201 | 182 | # If the command fails, explicitly restart the main service |
202 | 183 | while true; do |
203 | | - if ! (timeout ${toString probe.timeoutSeconds}s ${probe.command}); then |
204 | | - echo "(timeout ${toString probe.timeoutSeconds}s ${probe.command})" |
| 184 | + if ! (timeout ${toString probeCfg.timeoutSeconds}s ${probeCfg.command} &> /dev/null); then |
| 185 | + echo "(timeout ${toString probeCfg.timeoutSeconds}s ${probeCfg.command})" |
205 | 186 | echo "Liveness probe for ${mainServiceName} failed. Triggering restart..." |
206 | 187 | ${pkgs.systemd}/bin/systemctl restart ${mainServiceName}.service & |
207 | 188 | fi |
208 | | - sleep ${toString probe.periodSeconds} |
| 189 | + sleep ${toString probeCfg.periodSeconds} |
209 | 190 | done |
210 | 191 | ''; |
211 | 192 | in |
|
230 | 211 | with types; |
231 | 212 | attrsOf ( |
232 | 213 | submodule ( |
233 | | - { name, config, ... }: |
| 214 | + { ... }: |
234 | 215 | { |
235 | 216 | options = { |
236 | 217 | healthcheck = mkOption { |
| 218 | + default = null; |
| 219 | + description = "Declarative health checks for this systemd service."; |
237 | 220 | type = |
238 | 221 | with types; |
239 | 222 | nullOr (submodule { |
|
257 | 240 |
|
258 | 241 | # The new readiness probe that uses the notify pattern. |
259 | 242 | readiness-probe = mkOption { |
260 | | - type = types.submodule (mkReadinessProbeOptions); |
| 243 | + type = types.submodule mkReadinessProbeOptions; |
261 | 244 | default = { }; |
262 | 245 | }; |
263 | 246 |
|
264 | 247 | # The liveness probe (timer-based). |
265 | 248 | liveness-probe = mkOption { |
266 | | - type = types.submodule (mkLivenessProbeOptions); |
| 249 | + type = types.submodule mkLivenessProbeOptions; |
267 | 250 | default = { }; |
268 | 251 | }; |
269 | 252 | }; |
270 | 253 | }); |
271 | | - default = null; |
272 | | - description = "Declarative health checks for this systemd service."; |
273 | 254 | }; |
274 | 255 | }; |
275 | 256 |
|
|
0 commit comments