Skip to content

Commit b58cade

Browse files
committed
Update documentation
Signed-off-by: Arjun Raja Yogidas <[email protected]>
1 parent 6e0a3fc commit b58cade

File tree

5 files changed

+106
-62
lines changed

5 files changed

+106
-62
lines changed

cmd/nerdctl/container/container_health_check_linux_test.go

Lines changed: 38 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package container
1919
import (
2020
"encoding/json"
2121
"errors"
22+
"fmt"
2223
"strings"
2324
"testing"
2425
"time"
@@ -37,9 +38,6 @@ import (
3738
)
3839

3940
func TestContainerHealthCheckBasic(t *testing.T) {
40-
if rootlessutil.IsRootless() {
41-
t.Skip("healthcheck tests are skipped in rootless environment")
42-
}
4341

4442
testCase := nerdtest.Setup()
4543

@@ -138,10 +136,6 @@ func TestContainerHealthCheckBasic(t *testing.T) {
138136
}
139137

140138
func TestContainerHealthCheckAdvance(t *testing.T) {
141-
if rootlessutil.IsRootless() {
142-
t.Skip("healthcheck tests are skipped in rootless environment")
143-
}
144-
145139
testCase := nerdtest.Setup()
146140

147141
// Docker CLI does not provide a standalone healthcheck command.
@@ -399,6 +393,43 @@ func TestContainerHealthCheckAdvance(t *testing.T) {
399393
}
400394
},
401395
},
396+
{
397+
Description: "Healthcheck emits large output repeatedly",
398+
Setup: func(data test.Data, helpers test.Helpers) {
399+
helpers.Ensure("run", "-d", "--name", data.Identifier(),
400+
"--health-cmd", "yes X | head -c 60000",
401+
"--health-interval", "1s", "--health-timeout", "2s",
402+
testutil.CommonImage, "sleep", nerdtest.Infinity)
403+
nerdtest.EnsureContainerStarted(helpers, data.Identifier())
404+
},
405+
Cleanup: func(data test.Data, helpers test.Helpers) {
406+
helpers.Anyhow("rm", "-f", data.Identifier())
407+
},
408+
Command: func(data test.Data, helpers test.Helpers) test.TestableCommand {
409+
for i := 0; i < 3; i++ {
410+
helpers.Ensure("container", "healthcheck", data.Identifier())
411+
time.Sleep(2 * time.Second)
412+
}
413+
return helpers.Command("inspect", data.Identifier())
414+
},
415+
Expected: func(data test.Data, helpers test.Helpers) *test.Expected {
416+
return &test.Expected{
417+
ExitCode: 0,
418+
Output: expect.All(func(_ string, t tig.T) {
419+
inspect := nerdtest.InspectContainer(helpers, data.Identifier())
420+
h := inspect.State.Health
421+
debug, _ := json.MarshalIndent(h, "", " ")
422+
t.Log(string(debug))
423+
assert.Assert(t, h != nil, "expected health state")
424+
assert.Equal(t, h.Status, healthcheck.Healthy)
425+
assert.Assert(t, len(h.Log) >= 3, "expected at least 3 health log entries")
426+
for _, log := range h.Log {
427+
assert.Assert(t, len(log.Output) >= 1024, fmt.Sprintf("each output should be >= 1024 bytes, was: %s", log.Output))
428+
}
429+
}),
430+
}
431+
},
432+
},
402433
{
403434
Description: "Health log in inspect keeps only the latest 5 entries",
404435
Setup: func(data test.Data, helpers test.Helpers) {
@@ -587,13 +618,10 @@ func TestHealthCheck_SystemdIntegration_Basic(t *testing.T) {
587618
"--health-interval", "2s",
588619
testutil.CommonImage, "sleep", "30")
589620
nerdtest.EnsureContainerStarted(helpers, data.Identifier())
590-
// Wait for a healthcheck to execute
591-
time.Sleep(2 * time.Second)
592621
},
593622
Cleanup: func(data test.Data, helpers test.Helpers) {
594623
// Ensure proper cleanup of systemd units
595624
helpers.Anyhow("stop", data.Identifier())
596-
time.Sleep(500 * time.Millisecond) // Allow systemd cleanup
597625
helpers.Anyhow("rm", "-f", data.Identifier())
598626
},
599627
Expected: func(data test.Data, helpers test.Helpers) *test.Expected {
@@ -617,9 +645,7 @@ func TestHealthCheck_SystemdIntegration_Basic(t *testing.T) {
617645
"--health-interval", "1s",
618646
testutil.CommonImage, "sleep", "30")
619647
nerdtest.EnsureContainerStarted(helpers, data.Identifier())
620-
time.Sleep(2 * time.Second) // Wait for at least one health check to execute
621648
helpers.Ensure("kill", data.Identifier()) // Kill the container
622-
time.Sleep(3 * time.Second) // Wait to allow any potential extra healthchecks (shouldn't happen)
623649
},
624650
Cleanup: func(data test.Data, helpers test.Helpers) {
625651
// Container is already killed, just remove it

docs/healthchecks.md

Lines changed: 61 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,25 +2,29 @@
22

33
`nerdctl` supports Docker-compatible health checks for containers, allowing users to monitor container health via a user-defined command.
44

5-
Currently, health checks can be triggered manually using the nerdctl container healthcheck command. Automatic orchestration (e.g., periodic checks) will be added in a future update.
5+
## Configuration Options
66

77
Health checks can be configured in multiple ways:
88

9-
1. At container creation time using nerdctl run or nerdctl create with `--health-*` flags
9+
1. At container creation time using `nerdctl run` or `nerdctl create` with these flags:
10+
- `--health-cmd`: Command to run to check health
11+
- `--health-interval`: Time between running the check (default: 30s)
12+
- `--health-timeout`: Maximum time to allow one check to run (default: 30s)
13+
- `--health-retries`: Consecutive failures needed to report unhealthy (default: 3)
14+
- `--health-start-period`: Grace period for the container to initialize; failed checks during this period do not count toward `--health-retries`
15+
- `--no-healthcheck`: Disable any container-specified HEALTHCHECK
16+
1017
2. At image build time using HEALTHCHECK in a Dockerfile
11-
3. In docker-compose.yaml files, if using nerdctl compose
1218

13-
When a container is created, nerdctl determines the health check configuration based on the following priority:
19+
**Note:** The `--health-start-interval` option is currently not supported by nerdctl.
1420

15-
1. **CLI flags** take highest precedence (e.g., `--health-cmd`, etc.)
16-
2. If no CLI flags are set, nerdctl will use any health check defined in the image.
17-
3. If neither is present, no health check will be configured
21+
## Configuration Priority
1822

19-
Example:
23+
When a container is created, nerdctl determines the health check configuration based on this priority:
2024

21-
```bash
22-
nerdctl run --name web --health-cmd="curl -f http://localhost || exit 1" --health-interval=30s --health-timeout=5s --health-retries=3 nginx
23-
```
25+
1. CLI flags take highest precedence (e.g., `--health-cmd`, etc.)
26+
2. If no CLI flags are set, nerdctl will use any health check defined in the image
27+
3. If neither is present, no health check will be configured
2428

2529
### Disabling Health Checks
2630

@@ -37,15 +41,54 @@ configured health check inside the container and reports the result. It serves a
3741
health checks, especially in scenarios where external scheduling is used.
3842

3943
Example:
40-
4144
```
4245
nerdctl container healthcheck <container-id>
4346
```
4447

45-
### Future Work (WIP)
48+
## Automatic Health Checks with systemd
49+
50+
On Linux systems with systemd, nerdctl automatically creates and manages systemd timer units to execute health checks at the configured intervals. This provides reliable scheduling and execution of health checks without requiring a persistent daemon.
51+
52+
### Requirements for Automatic Health Checks
53+
54+
- systemd must be available on the system
55+
- Container must not be running in rootless mode
56+
- Environment variable `DISABLE_HC_SYSTEMD` must not be set to "true"
57+
58+
### How It Works
4659

47-
Since nerdctl is daemonless and does not have a persistent background process, we rely on systemd (or external schedulers)
48-
to invoke nerdctl container healthcheck at configured intervals. This allows periodic health checks for containers in a
49-
systemd-based environment. We are actively working on automating health checks, where we will listen to container lifecycle
50-
events and generate appropriate systemd service and timer units. This will enable nerdctl to support automated,
51-
Docker-compatible health checks by leveraging systemd for scheduling and lifecycle integration.
60+
1. When a container with health checks is created, nerdctl:
61+
- Creates a systemd timer unit for the container
62+
- Configures the timer according to the health check interval
63+
- Starts monitoring the container's health status
64+
65+
2. The health check status can be one of:
66+
- `starting`: During container initialization
67+
- `healthy`: When health checks are passing
68+
- `unhealthy`: After the specified number of consecutive failures
69+
## Examples
70+
71+
1. Basic health check that verifies a web server:
72+
```bash
73+
nerdctl run -d --name web \
74+
--health-cmd="curl -f http://localhost/ || exit 1" \
75+
--health-interval=5s \
76+
--health-retries=3 \
77+
nginx
78+
```
79+
80+
2. Health check with initialization period:
81+
```bash
82+
nerdctl run -d --name app \
83+
--health-cmd="./health-check.sh" \
84+
--health-interval=30s \
85+
--health-timeout=10s \
86+
--health-retries=3 \
87+
--health-start-period=60s \
88+
myapp
89+
```
90+
91+
3. Disable health checks:
92+
```bash
93+
nerdctl run --no-healthcheck myapp
94+
```

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ require (
120120
github.com/santhosh-tekuri/jsonschema/v6 v6.0.1 // indirect
121121
github.com/sasha-s/go-deadlock v0.3.5 // indirect
122122
//gomodjail:unconfined
123-
github.com/sirupsen/logrus v1.9.3
123+
github.com/sirupsen/logrus v1.9.3 // indirect
124124
github.com/smallstep/pkcs7 v0.1.1 // indirect
125125
github.com/spaolacci/murmur3 v1.1.0 // indirect
126126
github.com/stefanberger/go-pkcs11uri v0.0.0-20230803200340-78284954bff6 // indirect

pkg/containerutil/containerutil.go

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -359,11 +359,6 @@ func Stop(ctx context.Context, container containerd.Container, timeout *time.Dur
359359
}
360360
}()
361361

362-
// Clean up healthcheck units if configured.
363-
// if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, container); err != nil {
364-
// return fmt.Errorf("failed to clean up healthcheck units for container %s", container.ID())
365-
// }
366-
367362
if timeout == nil {
368363
t, ok := l[labels.StopTimeout]
369364
if !ok {
@@ -505,11 +500,6 @@ func Pause(ctx context.Context, client *containerd.Client, id string) error {
505500
return err
506501
}
507502

508-
// Clean up healthcheck units if configured.
509-
// if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, container); err != nil {
510-
// return fmt.Errorf("failed to clean up healthcheck units for container %s", container.ID())
511-
// }
512-
513503
switch status.Status {
514504
case containerd.Paused:
515505
return fmt.Errorf("container %s is already paused", id)

pkg/healthcheck/healthcheck_manager_linux.go

Lines changed: 6 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ import (
2525
"strings"
2626

2727
"github.com/coreos/go-systemd/v22/dbus"
28-
"github.com/sirupsen/logrus"
2928

3029
containerd "github.com/containerd/containerd/v2/client"
3130
"github.com/containerd/log"
@@ -47,12 +46,9 @@ func CreateTimer(ctx context.Context, container containerd.Container) error {
4746

4847
containerID := container.ID()
4948
hcName := hcUnitName(containerID, true)
50-
logrus.Debugf("Creating healthcheck timer unit: %s", hcName)
49+
log.G(ctx).Debugf("Creating healthcheck timer unit: %s", hcName)
5150

5251
cmd := []string{}
53-
if rootlessutil.IsRootless() {
54-
cmd = append(cmd, "--user")
55-
}
5652
if path := os.Getenv("PATH"); path != "" {
5753
cmd = append(cmd, "--setenv=PATH="+path)
5854
}
@@ -61,7 +57,7 @@ func CreateTimer(ctx context.Context, container containerd.Container) error {
6157
cmd = append(cmd, "--unit", hcName, "--on-unit-inactive="+hc.Interval.String(), "--timer-property=AccuracySec=1s")
6258

6359
cmd = append(cmd, "nerdctl", "container", "healthcheck", containerID)
64-
if logrus.IsLevelEnabled(logrus.DebugLevel) {
60+
if log.G(ctx).Logger.IsLevelEnabled(log.DebugLevel) {
6561
cmd = append(cmd, "--debug")
6662
}
6763

@@ -71,7 +67,7 @@ func CreateTimer(ctx context.Context, container containerd.Container) error {
7167
}
7268
defer conn.Close()
7369

74-
logrus.Debugf("creating healthcheck timer with: systemd-run %s", strings.Join(cmd, " "))
70+
log.G(ctx).Debugf("creating healthcheck timer with: systemd-run %s", strings.Join(cmd, " "))
7571
run := exec.Command("systemd-run", cmd...)
7672
if out, err := run.CombinedOutput(); err != nil {
7773
return fmt.Errorf("systemd-run failed: %w\noutput: %s", err, strings.TrimSpace(string(out)))
@@ -81,7 +77,6 @@ func CreateTimer(ctx context.Context, container containerd.Container) error {
8177
}
8278

8379
// StartTimer starts the healthcheck timer unit.
84-
// TODO if we persist hcName to container state, pass that to this function.
8580
func StartTimer(ctx context.Context, container containerd.Container) error {
8681
hc := extractHealthcheck(ctx, container)
8782
if hc == nil {
@@ -124,17 +119,7 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C
124119

125120
// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID.
126121
func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error {
127-
// Don't proceed if systemd is unavailable or disabled
128-
if !defaults.IsSystemdAvailable() || os.Getenv("DISABLE_HC_SYSTEMD") == "true" {
129-
return nil
130-
}
131-
132-
// Skip healthchecks in rootless environments to avoid systemd DBUS permission issues
133-
if rootlessutil.IsRootless() {
134-
return nil
135-
}
136-
137-
logrus.Debugf("Removing healthcheck timer unit: %s", containerID)
122+
log.G(ctx).Debugf("Removing healthcheck timer unit: %s", containerID)
138123

139124
conn, err := dbus.NewSystemConnectionContext(context.Background())
140125
if err != nil {
@@ -150,15 +135,15 @@ func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string
150135
tChan := make(chan string)
151136
if _, err := conn.StopUnitContext(context.Background(), timer, "ignore-dependencies", tChan); err == nil {
152137
if msg := <-tChan; msg != "done" {
153-
logrus.Warnf("timer stop message: %s", msg)
138+
log.G(ctx).Warnf("timer stop message: %s", msg)
154139
}
155140
}
156141

157142
// Stop service
158143
sChan := make(chan string)
159144
if _, err := conn.StopUnitContext(context.Background(), service, "ignore-dependencies", sChan); err == nil {
160145
if msg := <-sChan; msg != "done" {
161-
logrus.Warnf("service stop message: %s", msg)
146+
log.G(ctx).Warnf("service stop message: %s", msg)
162147
}
163148
}
164149

0 commit comments

Comments
 (0)