Skip to content

Commit acb393f

Browse files
committed
feat(healthcheck): separate readiness services
1 parent c2e4db1 commit acb393f

File tree

1 file changed

+57
-51
lines changed

1 file changed

+57
-51
lines changed

modules/healthcheck/default.nix

Lines changed: 57 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@
9595

9696
services =
9797
let
98-
mainServices = lib.mapAttrs (
98+
mainServices = lib.concatMapAttrs (
9999
mainServiceName: serviceConfig:
100100
let
101101
cfg = serviceConfig.healthcheck;
@@ -104,59 +104,65 @@
104104
let
105105
probeCfg = cfg.readiness-probe;
106106
in
107-
{
108-
# We have to force it to be a notify service, in order to use systemd-notify.
109-
serviceConfig.Type = lib.mkForce "notify";
110-
# If the TimeoutStartSec is not infinity, it can cause the service to fail, because the readiness probe is considered part of the startup.
111-
serviceConfig.TimeoutStartSec = lib.mkForce "infinity";
112-
113-
# We add a ExecStartPost with a script that runs the readiness probe
114-
serviceConfig.ExecStartPre =
115-
let
116-
scriptPath = lib.makeBinPath (
117-
[
118-
pkgs.systemd
119-
pkgs.curl
120-
pkgs.gawk
121-
]
122-
++ (cfg.runtimePackages or [ ])
123-
++ (serviceConfig.path or [ ])
124-
);
125-
in
126-
pkgs.writeShellScript "${mainServiceName}-readiness-check" ''
127-
#!${pkgs.runtimeShell}
128-
set -o nounset
129-
130-
export NOTIFY_SOCKET
131-
monitor() {
132-
export PATH="${scriptPath}:$PATH"
133-
134-
echo "Health check: starting background readiness probe for ${mainServiceName}." 1>>/tmp/banica1 2>>/tmp/banica2
135-
sleep ${toString probeCfg.initialDelay}
136-
retryCount=${toString probeCfg.retryCount}
137-
while true; do
138-
if (timeout ${toString probeCfg.timeout}s ${probeCfg.command} &> /dev/null); then
139-
echo "Health check: probe successful. Notifying systemd that the service is ready." 1>>/tmp/banica1 2>>/tmp/banica2
140-
systemd-notify --ready --status="${probeCfg.statusReadyMessage}" 1>>/tmp/banica1 2>>/tmp/banica2
141-
exit 0
142-
else
143-
echo "Health check: probe not successful. Notifying systemd that the service is still waiting. Retrying in ${toString probeCfg.interval} seconds..." 1>>/tmp/banica1 2>>/tmp/banica2
144-
systemd-notify --status="${probeCfg.statusWaitingMessage}" 1>>/tmp/banica1 2>>/tmp/banica2
145-
if [[ ''${retryCount} -ne -1 ]]; then
146-
retryCount=$((retryCount - 1))
147-
if [[ ''${retryCount} -le 0 ]]; then
148-
echo "Health check: probe failed after maximum retries. Exiting." 1>>/tmp/banica1 2>>/tmp/banica2
149-
exit 1
107+
{
108+
# Named "-readiness-check" so it matches the "${mainServiceName}-readiness-check.service" unit that dependent services are made to require further down.
"${mainServiceName}-readiness-check" = {
109+
# The probe runs as its own oneshot unit: it only counts as started once the probe script has exited successfully.
110+
serviceConfig.Type = "oneshot";
111+
# If the TimeoutStartSec is not infinity, it can cause the service to fail, because the readiness probe is considered part of the startup.
112+
serviceConfig.TimeoutStartSec = "infinity";
113+
114+
# The start command of this unit is a generated script that runs the readiness probe until it succeeds or retries run out.
115+
# `script` only accepts shell text; the value built here is a store path from writeShellScript, so it is wired in as the unit's ExecStart instead.
serviceConfig.ExecStart =
116+
let
117+
scriptPath = lib.makeBinPath (
118+
[
119+
pkgs.systemd
120+
pkgs.curl
121+
pkgs.gawk
122+
]
123+
++ (cfg.runtimePackages or [ ])
124+
++ (serviceConfig.path or [ ])
125+
);
126+
in
127+
pkgs.writeShellScript "${mainServiceName}-readiness-check" ''
128+
#!${pkgs.runtimeShell}
129+
set -o nounset
130+
131+
export PATH="${scriptPath}:$PATH"
132+
133+
echo "Health check: starting background readiness probe for ${mainServiceName}."
134+
sleep ${toString probeCfg.initialDelay}
135+
retryCount=${toString probeCfg.retryCount}
136+
while true; do
137+
if (timeout ${toString probeCfg.timeout}s ${probeCfg.command} &> /dev/null); then
138+
echo "Health check: probe successful. Notifying systemd that the service is ready."
139+
exit 0
140+
else
141+
echo "Health check: probe not successful. Notifying systemd that the service is still waiting. Retrying in ${toString probeCfg.interval} seconds..."
142+
if [[ ''${retryCount} -ne -1 ]]; then
143+
retryCount=$((retryCount - 1))
144+
if [[ ''${retryCount} -le 0 ]]; then
145+
echo "Health check: probe failed after maximum retries. Exiting."
146+
exit 1
147+
fi
150148
fi
151149
fi
152-
fi
153-
sleep ${toString probeCfg.interval}
154-
done
155-
}
150+
sleep ${toString probeCfg.interval}
151+
done
152+
'';
156153

157-
monitor &
158-
'';
159-
}
154+
requires = [ "${mainServiceName}.service" ];
155+
after = [ "${mainServiceName}.service" ];
156+
};
157+
} // lib.pipe config.systemd.services [
158+
# TODO: not only `requires`, also `after` and friends
159+
(lib.filterAttrs (name: value: lib.elem "${mainServiceName}.service" value.requires))
160+
# mapAttrs calls its function with the attribute name first and the value second; the name is unused here.
(lib.mapAttrs (_name: value: lib.recursiveUpdate value {
161+
requires = value.requires ++ [
162+
"${mainServiceName}-readiness-check.service"
163+
];
164+
}))
165+
];
160166
))
161167
) servicesWithHealthcheck;
162168
healthCheckServices = lib.mapAttrs' (

0 commit comments

Comments
 (0)