Skip to content

Commit 98f73f6

Browse files
reo101monyarmMartinNikov
committed
feat(modules): Initial healthcheck module
Co-authored-by: Simeon Armenchev <[email protected]> Co-authored-by: Martin Nikov <[email protected]>
1 parent 8517b67 commit 98f73f6

File tree

2 files changed

+249
-0
lines changed

2 files changed

+249
-0
lines changed

modules/default.nix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,6 @@
99
./secrets.nix
1010
./mcl-disko
1111
./pharos
12+
./healthcheck
1213
];
1314
}

modules/healthcheck/default.nix

Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
{ ... }:
2+
{
3+
flake.modules.nixos.healthcheck =
4+
{
5+
config,
6+
lib,
7+
pkgs,
8+
...
9+
}:
10+
let
11+
inherit (lib) types;
12+
# Builds the submodule definition shared by both probe kinds.
#
# `x` is the probe name ("liveness" or "readiness"). It is interpolated into
# the option descriptions, selects the default `interval`, and decides whether
# the readiness-only status message options are added.
mkProbeOptions = x: {
  options =
    {
      enable = lib.mkEnableOption "the ${x} probe";

      # No default: must be set whenever the probe is enabled.
      command = lib.mkOption {
        type = types.str;
        description = "The command to execute for the ${x} check. Any necessary programs should be added to the healthcheck.runtimePackages option.";
      };

      # The three durations below are interpolated verbatim into `sleep`,
      # `timeout` and systemd time spans, none of which accept negative
      # values, so negatives are rejected at evaluation time.
      initialDelay = lib.mkOption {
        type = types.ints.unsigned;
        default = 15;
        description = "Seconds to wait after the service is up before the first ${x} probe.";
      };

      interval = lib.mkOption {
        type = types.ints.unsigned;
        # Liveness probes default to a slower cadence than readiness probes.
        default = if x == "liveness" then 30 else 2;
        description = "How often (in seconds) to perform the ${x} probe.";
      };

      timeout = lib.mkOption {
        type = types.ints.unsigned;
        default = 10;
        description = "Seconds after which the ${x} probe command times out.";
      };

      # TODO: `{success,failure}_threshold`

      # Stays a plain `int` because -1 is a valid sentinel (retry forever).
      retryCount = lib.mkOption {
        type = types.int;
        default = 10;
        description = "Number of times to retry the ${x} probe before considering it failed. (-1 means infinite retries)";
      };
    }
    # Options that only exist on the readiness probe.
    # NOTE(review): neither status message is referenced by the `config`
    # section of this module as visible here - confirm whether they are
    # consumed elsewhere or still need to be wired up.
    // lib.optionalAttrs (x == "readiness") {
      statusWaitingMessage = lib.mkOption {
        type = types.str;
        default = "Service starting, waiting for ready signal...";
        description = "The status message to send to systemd while waiting.";
      };

      statusReadyMessage = lib.mkOption {
        type = types.str;
        default = "Service is ready.";
        description = ''
          The status message to send when the service is ready.
          Use %OUTPUT% to substitute the output of the check command.
        '';
      };
    };
};
65+
66+
# Options for the liveness probe (timer-based check)
67+
livenessProbeOptions = mkProbeOptions "liveness";
68+
69+
# Options for the readiness probe (notify-based check)
70+
readinessProbeOptions = mkProbeOptions "readiness";
71+
in
72+
{
73+
config =
74+
let
75+
# Subset of `mcl.services` whose `healthcheck` option is set (non-null).
servicesWithHealthcheck =
  let
    hasHealthcheck = _: svc: svc.healthcheck != null;
  in
  lib.filterAttrs hasHealthcheck config.mcl.services;
78+
in
79+
{
80+
# One assertion per service that has an *enabled readiness probe*: the probe
# is attached through `ExecStartPost`, which only makes sense for service
# types where systemd does not already wait for a readiness signal.
# Services that only use the liveness probe are not restricted.
assertions = lib.pipe config.mcl.services [
  (lib.filterAttrs (
    _: service: service.healthcheck != null && service.healthcheck.readiness-probe.enable
  ))
  (lib.mapAttrsToList (
    name: _:
    let
      # `Type` is frequently left unset in `serviceConfig`; fall back to
      # "simple" instead of failing evaluation with a missing-attribute error.
      # NOTE(review): "simple" is systemd's default when `ExecStart=` is set;
      # confirm no affected unit relies on a different implicit type.
      serviceType = config.systemd.services.${name}.serviceConfig.Type or "simple";
    in
    {
      # NOTE: as per <https://www.freedesktop.org/software/systemd/man/latest/systemd.service.html#ExecStartPost=>
      assertion = lib.elem serviceType [
        "simple"
        "idle"
      ];
      message = ''
        Service ${name} is not of type "simple" or "idle", but ${serviceType}.
        Cannot attach a readiness probe to it.
      '';
    }
  ))
];
100+
systemd = {
101+
services =
102+
let
103+
# For every service with an enabled readiness probe, extend the systemd unit
# of the same name with an `ExecStartPost` script that polls the probe command
# until it succeeds or the retry budget is exhausted.
mainServices = lib.mapAttrs (
  mainServiceName: serviceConfig:
  let
    cfg = serviceConfig.healthcheck;
    probeCfg = cfg.readiness-probe;
  in
  lib.mkIf (cfg != null && probeCfg.enable) {
    # Timeout is now handled manually by the new `ExecStartPost`
    serviceConfig.TimeoutStartSec = "infinity";

    # Add an `ExecStartPost` with a script that runs the readiness probe
    # WARN: cannot assure that there is no `ExecStartPost` in the original `serviceConfig`
    #       (in order to avoid overriding/duplication)
    serviceConfig.ExecStartPost =
      let
        # NOTE(review): `serviceConfig` here is the `mcl.services.<name>` entry,
        # not the systemd unit. The submodule declared in this file has no
        # `path` option, so this fallback looks like it always yields `[ ]` -
        # confirm whether the systemd unit's `path` was intended.
        scriptPath = lib.makeBinPath (cfg.runtimePackages ++ (serviceConfig.path or [ ]));
      in
      lib.getExe (
        pkgs.writeShellScriptBin "${mainServiceName}-readiness-check" ''
          set -o nounset

          export PATH="${scriptPath}:$PATH"

          echo "Health check: starting background readiness probe for ${mainServiceName}."
          sleep ${toString probeCfg.initialDelay}
          retryCount=${toString probeCfg.retryCount}
          while true; do
            if (timeout ${toString probeCfg.timeout}s ${probeCfg.command} &> /dev/null); then
              echo "Health check: probe successful. Notifying systemd that the service is ready."
              exit 0
            fi

            # -1 means retry forever. Otherwise give up once no retries are
            # left, and only then consume one: `retryCount = N` allows the
            # initial attempt plus exactly N retries.
            if [[ ''${retryCount} -ne -1 ]]; then
              if [[ ''${retryCount} -le 0 ]]; then
                echo "Health check: probe failed after maximum retries. Exiting."
                exit 1
              fi
              retryCount=$((retryCount - 1))
            fi

            # Logged only when another attempt will actually follow.
            echo "Health check: probe not successful. Notifying systemd that the service is still waiting. Retrying in ${toString probeCfg.interval} seconds..."
            sleep ${toString probeCfg.interval}
          done
        ''
      );
  }
) servicesWithHealthcheck;
149+
150+
# For every service with an enabled liveness probe, define a companion
# oneshot unit `<name>-liveness-check` that runs the probe command once and
# restarts the main service when the probe fails.
healthCheckServices = lib.mapAttrs' (
  mainServiceName: serviceConfig:
  let
    cfg = serviceConfig.healthcheck;
  in
  {
    name = "${mainServiceName}-liveness-check";
    value = lib.mkIf (cfg != null && cfg.liveness-probe.enable) (
      let
        probeCfg = cfg.liveness-probe;
        # `writeShellScriptBin` already emits the shebang, so the script body
        # starts directly with the probe logic. The script is named after the
        # service, mirroring the `<name>-readiness-check` script.
        checkScript = pkgs.writeShellScriptBin "${mainServiceName}-liveness-check" ''
          echo "Executing liveness probe for ${mainServiceName}..."
          if ! (timeout ${toString probeCfg.timeout}s ${probeCfg.command} &> /dev/null); then
            echo "Liveness probe for ${mainServiceName} failed. Triggering restart..."
            ${lib.getExe' pkgs.systemd "systemctl"} restart ${lib.escapeShellArg mainServiceName}.service
            # Exit non-zero so the failed probe is also visible on this unit.
            exit 1
          fi
          echo "Liveness probe for ${mainServiceName} successful."
        '';
      in
      {
        description = "Liveness check for ${mainServiceName}";
        # Programs the probe command needs at runtime.
        path = cfg.runtimePackages;
        serviceConfig = {
          Type = "oneshot";
          ExecStart = lib.getExe checkScript;
        };
      }
    );
  }
) servicesWithHealthcheck;
182+
in
183+
mainServices // healthCheckServices;
184+
185+
# One timer per service with an enabled liveness probe. The timer shares its
# name with the `<name>-liveness-check` service defined above.
timers = lib.mapAttrs' (
  mainServiceName: serviceConfig:
  let
    inherit (serviceConfig) healthcheck;
    livenessEnabled = healthcheck != null && healthcheck.liveness-probe.enable;
  in
  lib.nameValuePair "${mainServiceName}-liveness-check" (
    lib.mkIf livenessEnabled {
      description = "Timer for ${mainServiceName} liveness probe";
      wantedBy = [ "timers.target" ];
      timerConfig =
        let
          inherit (healthcheck.liveness-probe) initialDelay interval;
        in
        {
          # First run: `initialDelay` seconds after the timer is activated.
          OnActiveSec = "${toString initialDelay}s";
          # Subsequent runs: `interval` seconds after the previous check finished.
          OnUnitInactiveSec = "${toString interval}s";
        };
    }
  )
) servicesWithHealthcheck;
207+
};
208+
};
209+
210+
# Per-service healthcheck settings. Attribute names are used as systemd
# service names by the `config` section of this module.
options.mcl.services = lib.mkOption {
  default = { };
  description = "Per-service settings, keyed by systemd service name.";
  type = types.attrsOf (
    types.submodule {
      options = {
        healthcheck = lib.mkOption {
          # `null` disables all healthcheck handling for the service.
          default = null;
          description = "Declarative health checks for this systemd service.";
          type = types.nullOr (
            types.submodule {
              options = {
                # Programs to add to the PATH for the health check.
                runtimePackages = lib.mkOption {
                  type = types.listOf types.package;
                  default = [ ];
                  description = "Additional programs to add to the PATH for health checks.";
                };

                # Readiness probe: polled from the service's `ExecStartPost`.
                readiness-probe = lib.mkOption {
                  type = types.submodule readinessProbeOptions;
                  default = { };
                  description = ''
                    Readiness probe, run from the service's `ExecStartPost`
                    until it succeeds or its retries are exhausted.
                  '';
                };

                # The liveness probe (timer-based).
                liveness-probe = lib.mkOption {
                  type = types.submodule livenessProbeOptions;
                  default = { };
                  description = ''
                    Liveness probe, run periodically by a systemd timer.
                    A failing probe restarts the service.
                  '';
                };
              };
            }
          );
        };
      };
    }
  );
};
247+
};
248+
}

0 commit comments

Comments
 (0)