Skip to content

Commit 0b8240f

Browse files
committed
support rootless healthchecks
Signed-off-by: Arjun Raja Yogidas <[email protected]>
1 parent 66f3f5e commit 0b8240f

File tree

2 files changed

+48
-51
lines changed

2 files changed

+48
-51
lines changed

hack/test-integration.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ for arg in "$@"; do
5050
done
5151

5252
if [ "$needsudo" == "true" ] || [ "$needsudo" == "yes" ] || [ "$needsudo" == "1" ]; then
53-
gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -exec sudo -v -run TestHealthCheck_SystemdIntegration_Advanced -args -test.allow-kill-daemon ./cmd/nerdctl/container/
53+
gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -exec sudo -v -run TestHealthCheck_SystemdIntegration_Basic -args -test.allow-kill-daemon ./cmd/nerdctl/container/
5454
else
55-
gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -v -run TestContainerHealthCheckAdvance -args -test.allow-kill-daemon ./cmd/nerdctl/container/
55+
gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -v -run TestHealthCheck_SystemdIntegration_Basic -args -test.allow-kill-daemon ./cmd/nerdctl/container/
5656
fi

pkg/healthcheck/healthcheck_manager_linux.go

Lines changed: 46 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,11 @@ func CreateTimer(ctx context.Context, container containerd.Container) error {
5050
log.G(ctx).Debugf("Creating healthcheck timer unit: %s", hcName)
5151

5252
cmd := []string{}
53+
if rootlessutil.IsRootless() {
54+
cmd = append(cmd, "--user")
55+
cmd = append(cmd, fmt.Sprintf("--uid=%d", rootlessutil.ParentEUID()))
56+
}
57+
5358
if path := os.Getenv("PATH"); path != "" {
5459
cmd = append(cmd, "--setenv=PATH="+path)
5560
}
@@ -62,11 +67,11 @@ func CreateTimer(ctx context.Context, container containerd.Container) error {
6267
cmd = append(cmd, "--debug")
6368
}
6469

65-
conn, err := dbus.NewSystemConnectionContext(context.Background())
66-
if err != nil {
67-
return fmt.Errorf("systemd DBUS connect error: %w", err)
68-
}
69-
defer conn.Close()
70+
// conn, err := dbus.NewSystemConnectionContext(context.Background())
71+
// if err != nil {
72+
// return fmt.Errorf("systemd DBUS connect error: %w", err)
73+
// }
74+
// defer conn.Close()
7075

7176
log.G(ctx).Debugf("creating healthcheck timer with: systemd-run %s", strings.Join(cmd, " "))
7277
run := exec.Command("systemd-run", cmd...)
@@ -79,29 +84,53 @@ func CreateTimer(ctx context.Context, container containerd.Container) error {
7984

8085
// StartTimer starts the healthcheck timer unit.
8186
func StartTimer(ctx context.Context, container containerd.Container) error {
87+
log.G(ctx).Infof("DEBUG: StartTimer called for container %s", container.ID())
88+
8289
hc := extractHealthcheck(ctx, container)
8390
if hc == nil {
91+
log.G(ctx).Infof("DEBUG: No healthcheck found, skipping StartTimer")
8492
return nil
8593
}
8694
if shouldSkipHealthCheckSystemd(hc) {
95+
log.G(ctx).Infof("DEBUG: Skipping healthcheck systemd, shouldSkip=true")
8796
return nil
8897
}
8998

9099
hcName := hcUnitName(container.ID(), true)
91-
conn, err := dbus.NewSystemConnectionContext(context.Background())
100+
log.G(ctx).Infof("DEBUG: Starting timer for unit: %s, rootless=%v", hcName, rootlessutil.IsRootless())
101+
102+
var conn *dbus.Conn
103+
var err error
104+
if rootlessutil.IsRootless() {
105+
log.G(ctx).Infof("DEBUG: Attempting user DBUS connection...")
106+
conn, err = dbus.NewUserConnectionContext(ctx)
107+
} else {
108+
log.G(ctx).Infof("DEBUG: Attempting system DBUS connection...")
109+
conn, err = dbus.NewSystemConnectionContext(ctx)
110+
}
92111
if err != nil {
112+
log.G(ctx).Errorf("DEBUG: DBUS connection failed: %v", err)
93113
return fmt.Errorf("systemd DBUS connect error: %w", err)
94114
}
95115
defer conn.Close()
116+
log.G(ctx).Infof("DEBUG: DBUS connection successful")
96117

97118
startChan := make(chan string)
98119
unit := hcName + ".service"
120+
log.G(ctx).Infof("DEBUG: About to restart unit: %s", unit)
121+
99122
if _, err := conn.RestartUnitContext(context.Background(), unit, "fail", startChan); err != nil {
123+
log.G(ctx).Errorf("DEBUG: RestartUnitContext failed: %v", err)
100124
return err
101125
}
126+
127+
log.G(ctx).Infof("DEBUG: Waiting for restart confirmation...")
102128
if msg := <-startChan; msg != "done" {
129+
log.G(ctx).Errorf("DEBUG: Unexpected restart result: %s", msg)
103130
return fmt.Errorf("unexpected systemd restart result: %s", msg)
104131
}
132+
133+
log.G(ctx).Infof("DEBUG: StartTimer completed successfully")
105134
return nil
106135
}
107136

@@ -115,44 +144,6 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C
115144
return ForceRemoveTransientHealthCheckFiles(ctx, container.ID())
116145
}
117146

118-
// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID.
119-
// This function is deprecated and no longer used. Use ForceRemoveTransientHealthCheckFiles instead.
120-
/*
121-
func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error {
122-
log.G(ctx).Debugf("Removing healthcheck timer unit: %s", containerID)
123-
124-
conn, err := dbus.NewSystemConnectionContext(context.Background())
125-
if err != nil {
126-
return fmt.Errorf("systemd DBUS connect error: %w", err)
127-
}
128-
defer conn.Close()
129-
130-
unitName := hcUnitName(containerID, true)
131-
timer := unitName + ".timer"
132-
service := unitName + ".service"
133-
134-
// Stop timer
135-
tChan := make(chan string)
136-
if _, err := conn.StopUnitContext(context.Background(), timer, "ignore-dependencies", tChan); err == nil {
137-
if msg := <-tChan; msg != "done" {
138-
log.G(ctx).Warnf("timer stop message: %s", msg)
139-
}
140-
}
141-
142-
// Stop service
143-
sChan := make(chan string)
144-
if _, err := conn.StopUnitContext(context.Background(), service, "ignore-dependencies", sChan); err == nil {
145-
if msg := <-sChan; msg != "done" {
146-
log.G(ctx).Warnf("service stop message: %s", msg)
147-
}
148-
}
149-
150-
// Reset failed units
151-
_ = conn.ResetFailedUnitContext(context.Background(), service)
152-
return nil
153-
}
154-
*/
155-
156147
// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service
157148
// using just the container ID. This function is non-blocking and uses timeouts to prevent hanging
158149
// on systemd operations. It logs errors as warnings but continues cleanup attempts.
@@ -174,7 +165,13 @@ func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID strin
174165
go func() {
175166
defer close(errChan)
176167

177-
conn, err := dbus.NewSystemConnectionContext(timeoutCtx)
168+
var conn *dbus.Conn
169+
var err error
170+
if rootlessutil.IsRootless() {
171+
conn, err = dbus.NewUserConnectionContext(ctx)
172+
} else {
173+
conn, err = dbus.NewSystemConnectionContext(ctx)
174+
}
178175
if err != nil {
179176
log.G(ctx).Warnf("systemd DBUS connect error during force cleanup: %v", err)
180177
errChan <- fmt.Errorf("systemd DBUS connect error: %w", err)
@@ -300,10 +297,10 @@ func shouldSkipHealthCheckSystemd(hc *Healthcheck) bool {
300297
return true
301298
}
302299

303-
// Skip healthchecks in rootless environments to avoid systemd DBUS permission issues
304-
if rootlessutil.IsRootless() {
305-
return true
306-
}
300+
// Skip healthchecks in environments without dbus-launch to avoid permission issues
301+
// if _, err := exec.LookPath("dbus-launch"); err != nil {
302+
// return true
303+
// }
307304

308305
// Don't proceed if health check is nil, empty, explicitly NONE or interval is 0.
309306
if hc == nil || len(hc.Test) == 0 || hc.Test[0] == "NONE" || hc.Interval == 0 {

0 commit comments

Comments
 (0)