Skip to content

Commit 80d2881

Browse files
committed
wip
1 parent dc46722 commit 80d2881

File tree

1 file changed

+69
-88
lines changed

1 file changed

+69
-88
lines changed

modules/healthcheck/default.nix

Lines changed: 69 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
1-
{ withSystem, ... }:
1+
{ ... }:
22
{
33
flake.modules.nixos.healthcheck =
44
# /etc/nixos/modules/systemd-healthcheck.nix
55
{
66
config,
7-
options,
87
lib,
98
pkgs,
109
...
@@ -13,69 +12,58 @@
1312
with lib;
1413

1514
let
16-
# Options for the liveness probe (timer-based check)
17-
mkLivenessProbeOptions = {
18-
options = {
19-
enable = mkEnableOption "the liveness probe";
2015

21-
command = mkOption {
22-
type = types.lines;
23-
description = "The command to execute for the liveness check.";
24-
};
16+
mkProbeOptions = x: {
17+
options =
18+
{
19+
enable = mkEnableOption "the ${x} probe";
2520

26-
initialDelaySeconds = mkOption {
27-
type = types.int;
28-
default = 15;
29-
description = "Seconds to wait after the service is up before the first liveness probe.";
30-
};
21+
command = mkOption {
22+
type = types.lines;
23+
description = "The command to execute for the ${x} check.";
24+
};
3125

32-
periodSeconds = mkOption {
33-
type = types.int;
34-
default = 30;
35-
description = "How often (in seconds) to perform the probe.";
36-
};
26+
initialDelaySeconds = mkOption {
27+
type = types.int;
28+
default = 15;
29+
description = "Seconds to wait after the service is up before the first ${x} probe.";
30+
};
3731

38-
timeoutSeconds = mkOption {
39-
type = types.int;
40-
default = 10;
41-
description = "Seconds after which the liveness probe command times out.";
42-
};
43-
};
44-
};
32+
periodSeconds = mkOption {
33+
type = types.int;
34+
default = if x == "liveness" then 30 else 2;
35+
description = "How often (in seconds) to perform the ${x} probe.";
36+
};
4537

46-
# Options for the readiness probe (notify-based check)
47-
mkReadinessProbeOptions = {
48-
options = {
49-
enable = mkEnableOption "the notify readiness probe";
38+
timeoutSeconds = mkOption {
39+
type = types.int;
40+
default = 10;
41+
description = "Seconds after which the ${x} probe command times out.";
42+
};
43+
}
44+
// lib.optionalAttrs (x == "readiness") ({
45+
statusWaitingMessage = mkOption {
46+
type = types.str;
47+
default = "Service starting, waiting for ready signal...";
48+
description = "The status message to send to systemd while waiting.";
49+
};
5050

51-
command = mkOption {
52-
type = types.lines;
53-
description = "The check command to run in a loop. It should output a value on success.";
54-
example = "curl -s localhost:9091/metrics | awk '/^signing_signers_loaded_count /{print int($2)}'";
55-
};
56-
57-
periodSeconds = mkOption {
58-
type = types.int;
59-
default = 2;
60-
description = "How often (in seconds) to perform the readiness check.";
61-
};
51+
# Status text reported through `systemd-notify --ready --status=...` once the
# readiness command succeeds. The generated script discards the command's
# output, so the description must not advertise a %OUTPUT% placeholder.
statusReadyMessage = mkOption {
52+
type = types.str;
53+
default = "Service is ready.";
54+
description = ''
55+
The status message to send when the service is ready.
56+
The message is interpolated as-is into the readiness script; the output of the check command is discarded, so %OUTPUT% is not substituted.
57+
'';
58+
};
59+
});
60+
};
6261

63-
statusWaiting = mkOption {
64-
type = types.str;
65-
default = "Service starting, waiting for ready signal...";
66-
description = "The status message to send to systemd while waiting.";
67-
};
62+
# Options for the liveness probe (timer-based check)
63+
mkLivenessProbeOptions = mkProbeOptions "liveness";
6864

69-
statusReady = mkOption {
70-
type = types.str;
71-
default = "Service is ready.";
72-
description = ''
73-
The status message to send when the service is ready.
74-
Use %OUTPUT% to substitute the output of the check command.
75-
'';
76-
};
77-
};
78-
};
65+
# Options for the readiness probe (notify-based check)
66+
mkReadinessProbeOptions = mkProbeOptions "readiness";
7967
in
8068
{
8169

@@ -91,7 +79,7 @@
9179
lib.map (serviceName: {
9280
assertion =
9381
servicesWithHealthcheck."${serviceName}".healthcheck.readiness-probe.enable
94-
&& servicesWithHealthcheck."${serviceName}".healthcheck.exec != null;
82+
-> servicesWithHealthcheck."${serviceName}".healthcheck.exec != null;
9583
message = "When healthcheck.readiness-probe is enabled, you must define `healthcheck.exec` with the service command.";
9684
}) (lib.attrNames servicesWithHealthcheck)
9785
);
@@ -104,20 +92,15 @@
10492
in
10593
{
10694
name = "${mainServiceName}-liveness-check";
107-
value = mkIf (cfg != null && cfg.liveness-probe.enable) (
108-
let
109-
probe = cfg.liveness-probe;
110-
in
111-
{
112-
description = "Timer for ${mainServiceName} liveness probe";
113-
timerConfig = {
114-
Unit = "${mainServiceName}-liveness-check.service";
115-
};
116-
wantedBy = [ "${mainServiceName}.service" ];
117-
}
118-
);
95+
# Timer unit for the liveness probe. It is only defined when the service has a
# healthcheck configuration (cfg != null) and its liveness probe is enabled.
# NOTE(review): timerConfig sets only Unit= and no On*= trigger (OnActiveSec,
# OnCalendar, ...) -- confirm that systemd accepts and activates such a timer.
value = mkIf (cfg != null && cfg.liveness-probe.enable) {
96+
description = "Timer for ${mainServiceName} liveness probe";
97+
timerConfig = {
98+
# Unit the timer activates: the matching "<service>-liveness-check" service.
Unit = "${mainServiceName}-liveness-check.service";
99+
};
100+
# Pull the timer in together with the main service unit.
wantedBy = [ "${mainServiceName}.service" ];
101+
};
119102
}
120-
) (servicesWithHealthcheck);
103+
) servicesWithHealthcheck;
121104

122105
services =
123106
(lib.mapAttrs (
@@ -145,7 +128,6 @@
145128
++ (if cfg ? "path" then cfg.path else [ ])
146129
++ (if serviceConfig ? "path" then serviceConfig.path else [ ])
147130
);
148-
successStatus = builtins.replaceStrings [ "%OUTPUT%" ] [ ''''${output}'' ] probeCfg.statusReady;
149131
in
150132
lib.mkForce (
151133
pkgs.writeShellScript "${mainServiceName}-readiness-check" ''
@@ -156,17 +138,15 @@
156138
157139
check() {
158140
echo "Health check: starting background readiness probe for ${mainServiceName}."
141+
sleep ${toString probeCfg.initialDelaySeconds}
159142
while true; do
160-
local output
161-
output=$(sh -c "${probeCfg.command}" 2>/dev/null)
162-
163-
if [[ -n "''${output}" ]]; then
143+
if (timeout ${toString probeCfg.timeoutSeconds}s ${probeCfg.command} &> /dev/null); then
164144
echo "Health check: probe successful. Notifying systemd that service is ready."
165-
systemd-notify --ready --status="${successStatus}"
145+
systemd-notify --ready --status="${probeCfg.statusReadyMessage}"
166146
return 0
167147
else
168148
echo "Health check: probe not successful. Notifying systemd that service is still waiting."
169-
systemd-notify --status="${probeCfg.statusWaiting}"
149+
systemd-notify --status="${probeCfg.statusWaitingMessage}"
170150
fi
171151
sleep ${toString probeCfg.periodSeconds}
172152
done
@@ -176,6 +156,7 @@
176156
check &
177157
else
178158
echo "Health check: NOTIFY_SOCKET not set. Cannot run readiness probe." >&2
159+
exit 1
179160
fi
180161
181162
${cfg.exec}
@@ -193,19 +174,19 @@
193174
name = "${mainServiceName}-liveness-check";
194175
value = mkIf (cfg != null && cfg.liveness-probe.enable) (
195176
let
196-
probe = cfg.liveness-probe;
177+
probeCfg = cfg.liveness-probe;
197178
checkScript = pkgs.writeShellScriptBin "liveness-check" ''
198179
#!${pkgs.runtimeShell}
199-
sleep ${toString probe.initialDelaySeconds}
180+
sleep ${toString probeCfg.initialDelaySeconds}
200181
echo "Executing liveness probe for ${mainServiceName}..."
201182
# If the command fails, explicitly restart the main service
202183
while true; do
203-
if ! (timeout ${toString probe.timeoutSeconds}s ${probe.command}); then
204-
echo "(timeout ${toString probe.timeoutSeconds}s ${probe.command})"
184+
if ! (timeout ${toString probeCfg.timeoutSeconds}s ${probeCfg.command} &> /dev/null); then
185+
echo "(timeout ${toString probeCfg.timeoutSeconds}s ${probeCfg.command})"
205186
echo "Liveness probe for ${mainServiceName} failed. Triggering restart..."
206187
${pkgs.systemd}/bin/systemctl restart ${mainServiceName}.service &
207188
fi
208-
sleep ${toString probe.periodSeconds}
189+
sleep ${toString probeCfg.periodSeconds}
209190
done
210191
'';
211192
in
@@ -230,10 +211,12 @@
230211
with types;
231212
attrsOf (
232213
submodule (
233-
{ name, config, ... }:
214+
{ ... }:
234215
{
235216
options = {
236217
healthcheck = mkOption {
218+
default = null;
219+
description = "Declarative health checks for this systemd service.";
237220
type =
238221
with types;
239222
nullOr (submodule {
@@ -257,19 +240,17 @@
257240

258241
# The new readiness probe that uses the notify pattern.
259242
readiness-probe = mkOption {
260-
type = types.submodule (mkReadinessProbeOptions);
243+
type = types.submodule mkReadinessProbeOptions;
261244
default = { };
262245
};
263246

264247
# The liveness probe (timer-based).
265248
liveness-probe = mkOption {
266-
type = types.submodule (mkLivenessProbeOptions);
249+
type = types.submodule mkLivenessProbeOptions;
267250
default = { };
268251
};
269252
};
270253
});
271-
default = null;
272-
description = "Declarative health checks for this systemd service.";
273254
};
274255
};
275256

0 commit comments

Comments
 (0)