Skip to content

Commit 7b83b03

Browse files
reo101monyarmMartinNikov
committed
feat(modules): Initial healthcheck module
Co-authored-by: Simeon Armenchev <[email protected]> Co-authored-by: Martin Nikov <[email protected]>
1 parent 8a7271c commit 7b83b03

File tree

2 files changed

+238
-0
lines changed

2 files changed

+238
-0
lines changed

modules/default.nix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,6 @@
99
./secrets.nix
1010
./mcl-disko
1111
./pharos
12+
./healthcheck
1213
];
1314
}

modules/healthcheck/default.nix

Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
{ ... }:
2+
{
3+
flake.modules.nixos.healthcheck =
4+
{
5+
config,
6+
lib,
7+
pkgs,
8+
...
9+
}:
10+
let
11+
inherit (lib) types;
12+
# Builds the submodule definition (`{ options = ...; }`) shared by both probe
# kinds. `x` is the probe kind, "liveness" or "readiness"; it is interpolated
# into the option descriptions, selects the default `interval`, and decides
# whether the readiness-only status message options are added.
mkProbeOptions = x: {
  options =
    {
      enable = lib.mkEnableOption "the ${x} probe";

      # Spliced verbatim into the generated shell scripts after `timeout <n>s`.
      command = lib.mkOption {
        type = types.str;
        description = "The command to execute for the ${x} check. Any necessary programs should be added to the healthcheck.runtimePackages option.";
      };

      # The three durations below end up in `sleep`, `timeout` and systemd
      # time spans, none of which accept a negative number, so negative
      # values are rejected at evaluation time instead of failing at runtime.
      initialDelay = lib.mkOption {
        type = types.ints.unsigned;
        default = 15;
        description = "Seconds to wait after the service is up before the first ${x} probe.";
      };

      interval = lib.mkOption {
        type = types.ints.unsigned;
        # Liveness checks run far less often than the readiness polling loop.
        default = if x == "liveness" then 30 else 2;
        description = "How often (in seconds) to perform the ${x} probe.";
      };

      timeout = lib.mkOption {
        type = types.ints.unsigned;
        default = 10;
        description = "Seconds after which the ${x} probe command times out.";
      };

      # TODO: `{success,failure}_threshold`

      # Stays a plain int because -1 is the documented "retry forever" value.
      retryCount = lib.mkOption {
        type = types.int;
        default = 10;
        description = "Number of times to retry the ${x} probe before considering it failed. (-1 means infinite retries)";
      };
    }
    # Readiness-only options.
    # NOTE(review): nothing in this file reads these two messages yet; confirm
    # whether another module consumes them.
    // lib.optionalAttrs (x == "readiness") {
      statusWaitingMessage = lib.mkOption {
        type = types.str;
        default = "Service starting, waiting for ready signal...";
        description = "The status message to send to systemd while waiting.";
      };

      statusReadyMessage = lib.mkOption {
        type = types.str;
        default = "Service is ready.";
        description = ''
          The status message to send when the service is ready.
          Use %OUTPUT% to substitute the output of the check command.
        '';
      };
    };
};
65+
66+
# Option set for the readiness probe; that probe is run from the main unit's
# `ExecStartPost` script.
readinessProbeOptions = mkProbeOptions "readiness";

# Option set for the liveness probe; that probe is driven by a systemd timer.
livenessProbeOptions = mkProbeOptions "liveness";
71+
in
72+
{
73+
config =
74+
let
75+
# Subset of `config.mcl.services` whose `healthcheck` option is set (non-null).
servicesWithHealthcheck =
  let
    hasHealthcheck = _: svc: svc.healthcheck != null;
  in
  lib.filterAttrs hasHealthcheck config.mcl.services;
78+
in
79+
{
80+
# One assertion per service with an *enabled readiness probe*: the probe is
# attached through `ExecStartPost`, which only makes sense for units of type
# "simple" or "idle". Services that only use the liveness probe are not
# constrained, because nothing is added to their main unit.
assertions = lib.pipe config.mcl.services [
  (lib.filterAttrs (
    _: service: service.healthcheck != null && service.healthcheck.readiness-probe.enable
  ))
  (lib.mapAttrsToList (
    name: _:
    let
      # `Type` is optional in `serviceConfig`, so reading it unconditionally
      # aborts evaluation with a missing-attribute error. systemd treats a
      # unit with `ExecStart=` and no `Type=` as "simple" (unless `BusName=`
      # is set), so use that as the fallback.
      serviceType = config.systemd.services.${name}.serviceConfig.Type or "simple";
    in
    {
      # NOTE: as per <https://www.freedesktop.org/software/systemd/man/latest/systemd.service.html#ExecStartPost=>
      assertion = lib.elem serviceType [
        "simple"
        "idle"
      ];
      message = ''
        Service ${name} is not of type "simple" or "idle", but ${serviceType}.
        Cannot attach a readiness probe to it.
      '';
    }
  ))
];
93+
systemd = {
94+
services =
95+
let
96+
# Overrides for the main systemd units. For every service whose readiness
# probe is enabled, an `ExecStartPost` script is attached that polls the
# probe command and exits 0 on the first success, or 1 once the retries are
# used up.
mainServices = lib.mapAttrs (
  mainServiceName: serviceConfig:
  let
    cfg = serviceConfig.healthcheck;
    probeCfg = cfg.readiness-probe;
  in
  lib.mkIf (cfg != null && probeCfg.enable) {
    # Timeout is now handled manually by the new `ExecStartPost`
    serviceConfig.TimeoutStartSec = "infinity";

    # Add an `ExecStartPost` with a script that runs the readiness probe
    # WARN: cannot assure that there is no `ExecStartPost` in the original `serviceConfig`
    #       (in order to avoid overriding/duplication)
    serviceConfig.ExecStartPost =
      let
        # NOTE(review): `serviceConfig` here is the `mcl.services.<name>` entry
        # (the lambda argument), not the systemd unit. No `path` option is
        # declared for it in this file, so this presumably always falls back
        # to `[ ]` - confirm whether the unit's own `path` was intended.
        scriptPath = lib.makeBinPath (cfg.runtimePackages ++ (serviceConfig.path or [ ]));
      in
      lib.getExe (
        pkgs.writeShellScriptBin "${mainServiceName}-readiness-check" ''
          set -o nounset

          export PATH="${scriptPath}:$PATH"

          echo "Health check: starting background readiness probe for ${mainServiceName}."
          sleep ${toString probeCfg.initialDelay}

          # Retries still allowed after a failed attempt; -1 means unlimited.
          retriesLeft=${toString probeCfg.retryCount}
          while true; do
            if (timeout ${toString probeCfg.timeout}s ${probeCfg.command} &> /dev/null); then
              echo "Health check: probe successful. Notifying systemd that the service is ready."
              exit 0
            fi

            if [[ "$retriesLeft" -ne -1 ]]; then
              # Give up only when no retry is left, so `retryCount = N` means
              # the first attempt plus N retries. Checking before decrementing
              # also keeps the counter from ever reaching the -1 sentinel.
              if [[ "$retriesLeft" -le 0 ]]; then
                echo "Health check: probe failed after maximum retries. Exiting."
                exit 1
              fi
              retriesLeft=$((retriesLeft - 1))
            fi

            # Printed only when another attempt will actually follow.
            echo "Health check: probe not successful. Notifying systemd that the service is still waiting. Retrying in ${toString probeCfg.interval} seconds..."
            sleep ${toString probeCfg.interval}
          done
        ''
      );
  }
) servicesWithHealthcheck;
140+
141+
# One oneshot unit `<service>-liveness-check` per service with an enabled
# liveness probe. The unit runs the probe command once; if the command fails
# or times out, it restarts the main service and exits non-zero.
healthCheckServices = lib.mapAttrs' (
  mainServiceName: serviceConfig:
  let
    cfg = serviceConfig.healthcheck;
  in
  {
    name = "${mainServiceName}-liveness-check";
    value = lib.mkIf (cfg != null && cfg.liveness-probe.enable) (
      let
        probeCfg = cfg.liveness-probe;
        # `writeShellScriptBin` already emits the shebang line, so the script
        # text must not carry a second one.
        checkScript = pkgs.writeShellScriptBin "liveness-check" ''
          echo "Executing liveness probe for ${mainServiceName}..."
          if ! (timeout ${toString probeCfg.timeout}s ${probeCfg.command} &> /dev/null); then
            echo "Liveness probe for ${mainServiceName} failed. Triggering restart..."
            ${lib.getExe' pkgs.systemd "systemctl"} restart ${lib.escapeShellArg mainServiceName}.service
            exit 1
          fi
          echo "Liveness probe for ${mainServiceName} successful."
        '';
      in
      {
        description = "Liveness check for ${mainServiceName}";
        # Programs the probe command needs at runtime.
        path = cfg.runtimePackages;
        serviceConfig = {
          Type = "oneshot";
          ExecStart = lib.getExe checkScript;
        };
      }
    );
  }
) servicesWithHealthcheck;
173+
in
174+
mainServices // healthCheckServices;
175+
176+
# One timer per service with an enabled liveness probe. Each timer is named
# `<service>-liveness-check`, the same name as the liveness-check service
# unit defined in this module.
timers = lib.mapAttrs' (
  unitName: svc: {
    name = "${unitName}-liveness-check";
    value =
      let
        hc = svc.healthcheck;
        livenessEnabled = hc != null && hc.liveness-probe.enable;
      in
      lib.mkIf livenessEnabled {
        description = "Timer for ${unitName} liveness probe";
        wantedBy = [ "timers.target" ];
        timerConfig = {
          # First run: `initialDelay` seconds after the timer is activated.
          OnActiveSec = "${toString hc.liveness-probe.initialDelay}s";
          # Later runs: `interval` seconds after the check unit went inactive.
          OnUnitInactiveSec = "${toString hc.liveness-probe.interval}s";
        };
      };
  }
) servicesWithHealthcheck;
198+
};
199+
};
200+
201+
# Declares `mcl.services.<name>.healthcheck`: an optional (nullable) submodule
# holding the runtime packages and the two probe configurations.
options.mcl.services =
  let
    # Contents of a non-null `healthcheck` value.
    healthcheckModule = types.submodule {
      options = {
        # Packages whose binaries the probe commands may call.
        runtimePackages = lib.mkOption {
          type = types.listOf types.package;
          default = [ ];
          description = "Additional programs to add to the PATH for health checks.";
        };

        # Readiness probe settings (option set built by `mkProbeOptions "readiness"`).
        readiness-probe = lib.mkOption {
          type = types.submodule readinessProbeOptions;
          default = { };
        };

        # Liveness probe settings (option set built by `mkProbeOptions "liveness"`).
        liveness-probe = lib.mkOption {
          type = types.submodule livenessProbeOptions;
          default = { };
        };
      };
    };

    # A single `mcl.services.<name>` entry.
    serviceModule = types.submodule {
      options.healthcheck = lib.mkOption {
        type = types.nullOr healthcheckModule;
        # `null` means no health checks are configured for the service.
        default = null;
        description = "Declarative health checks for this systemd service.";
      };
    };
  in
  lib.mkOption {
    type = types.attrsOf serviceModule;
    default = { };
  };
236+
};
237+
}

0 commit comments

Comments
 (0)