Skip to content

Commit ee86e77

Browse files
committed
support rootless healthchecks
Signed-off-by: Arjun Raja Yogidas <[email protected]>
1 parent 2984d07 commit ee86e77

File tree

5 files changed

+195
-51
lines changed

5 files changed

+195
-51
lines changed

.github/workflows/job-test-in-container.yml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,27 @@ jobs:
151151
sudo mkdir -p /etc/docker
152152
echo '{"ipv6": true, "fixed-cidr-v6": "2001:db8:1::/64", "experimental": true, "ip6tables": true}' | sudo tee /etc/docker/daemon.json
153153
sudo systemctl restart docker
154+
- name: "Debug: DBUS availability in container"
155+
run: |
156+
echo "=== DBUS Debugging in Container ==="
157+
[ "${{ inputs.target }}" == "rootful" ] \
158+
&& args=(test-integration) \
159+
|| args=(test-integration-${{ inputs.target }})
160+
docker run -t --rm --privileged "${args[@]}" bash -c '
161+
echo "--- DBUS Tools Availability ---"
162+
which dbus-launch dbus-daemon dbus-send dbus-monitor systemd-run || true
163+
echo "--- DBUS Binaries Location ---"
164+
ls -la /usr/bin/dbus* 2>/dev/null || echo "No DBUS tools in /usr/bin"
165+
ls -la /bin/dbus* 2>/dev/null || echo "No DBUS tools in /bin"
166+
echo "--- Systemd Tools ---"
167+
which systemctl systemd-run || true
168+
echo "--- Environment ---"
169+
echo "PATH: $PATH"
170+
echo "USER: $(whoami)"
171+
echo "UID: $(id -u)"
172+
echo "--- Package Info ---"
173+
dpkg -l | grep -E "(dbus|systemd)" || true
174+
'
154175
- name: "Run: integration tests"
155176
run: |
156177
. ./hack/github/action-helpers.sh

Dockerfile.d/test-integration-rootless.sh

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,40 @@ if [[ "$(id -u)" = "0" ]]; then
3030
touch /workaround-issue-622
3131
fi
3232

33+
echo "=== DBUS Debugging as ROOT ==="
34+
echo "--- DBUS Tools Availability (root) ---"
35+
which dbus-launch dbus-daemon dbus-send dbus-monitor systemd-run || true
36+
echo "--- Environment (root) ---"
37+
echo "PATH: $PATH"
38+
echo "USER: $(whoami)"
39+
echo "UID: $(id -u)"
40+
echo "DBUS_SESSION_BUS_ADDRESS: ${DBUS_SESSION_BUS_ADDRESS:-unset}"
41+
echo "XDG_RUNTIME_DIR: ${XDG_RUNTIME_DIR:-unset}"
42+
echo "--- Systemd Status (root) ---"
43+
systemctl --user status 2>&1 || echo "systemctl --user failed as root"
44+
echo "--- DBUS Launch Test (root) ---"
45+
dbus-launch --sh-syntax 2>&1 || echo "dbus-launch failed as root"
46+
3347
# Switch to the rootless user via SSH
3448
systemctl start ssh
3549
exec ssh -o StrictHostKeyChecking=no rootless@localhost "$0" "$@"
3650
else
51+
echo "=== DBUS Debugging as ROOTLESS USER ==="
52+
echo "--- DBUS Tools Availability (rootless) ---"
53+
which dbus-launch dbus-daemon dbus-send dbus-monitor systemd-run || true
54+
echo "--- Environment (rootless) ---"
55+
echo "PATH: $PATH"
56+
echo "USER: $(whoami)"
57+
echo "UID: $(id -u)"
58+
echo "DBUS_SESSION_BUS_ADDRESS: ${DBUS_SESSION_BUS_ADDRESS:-unset}"
59+
echo "XDG_RUNTIME_DIR: ${XDG_RUNTIME_DIR:-unset}"
60+
echo "--- Systemd Status (rootless) ---"
61+
systemctl --user status 2>&1 || echo "systemctl --user failed as rootless"
62+
echo "--- DBUS Launch Test (rootless) ---"
63+
dbus-launch --sh-syntax 2>&1 || echo "dbus-launch failed as rootless"
64+
echo "--- Systemd User Environment ---"
65+
systemctl --user show-environment 2>&1 || echo "systemctl --user show-environment failed"
66+
3767
containerd-rootless-setuptool.sh install
3868
if grep -q "options use-vc" /etc/resolv.conf; then
3969
containerd-rootless-setuptool.sh nsenter -- sh -euc 'echo "options use-vc" >>/etc/resolv.conf'
@@ -60,6 +90,16 @@ EOF
6090
systemctl --user restart stargz-snapshotter.service
6191
export IPFS_PATH="/home/rootless/.local/share/ipfs"
6292
containerd-rootless-setuptool.sh install-bypass4netnsd
93+
94+
echo "=== DBUS Debugging AFTER containerd-rootless-setuptool.sh ==="
95+
echo "--- Environment After Setup ---"
96+
echo "DBUS_SESSION_BUS_ADDRESS: ${DBUS_SESSION_BUS_ADDRESS:-unset}"
97+
echo "XDG_RUNTIME_DIR: ${XDG_RUNTIME_DIR:-unset}"
98+
echo "--- Systemd Status After Setup ---"
99+
systemctl --user status 2>&1 || echo "systemctl --user still failed after setup"
100+
echo "--- DBUS Launch Test After Setup ---"
101+
dbus-launch --sh-syntax 2>&1 || echo "dbus-launch still failed after setup"
102+
63103
# Once ssh-ed, we lost the Dockerfile working dir, so, get back in the nerdctl checkout
64104
cd /go/src/github.com/containerd/nerdctl
65105
# We also lose the PATH (and SendEnv=PATH would require sshd config changes)

hack/test-integration.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ for arg in "$@"; do
5050
done
5151

5252
if [ "$needsudo" == "true" ] || [ "$needsudo" == "yes" ] || [ "$needsudo" == "1" ]; then
53-
gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -exec sudo -v -run TestHealthCheck_SystemdIntegration_Advanced -args -test.allow-kill-daemon ./cmd/nerdctl/container/
53+
gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -exec sudo -v -run TestHealthCheck_SystemdIntegration_Basic -args -test.allow-kill-daemon ./cmd/nerdctl/container/
5454
else
55-
gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -v -run TestContainerHealthCheckAdvance -args -test.allow-kill-daemon ./cmd/nerdctl/container/
55+
gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -v -run TestHealthCheck_SystemdIntegration_Basic -args -test.allow-kill-daemon ./cmd/nerdctl/container/
5656
fi

pkg/healthcheck/healthcheck_manager_linux.go

Lines changed: 46 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,11 @@ func CreateTimer(ctx context.Context, container containerd.Container) error {
5050
log.G(ctx).Debugf("Creating healthcheck timer unit: %s", hcName)
5151

5252
cmd := []string{}
53+
if rootlessutil.IsRootless() {
54+
cmd = append(cmd, "--user")
55+
cmd = append(cmd, fmt.Sprintf("--uid=%d", rootlessutil.ParentEUID()))
56+
}
57+
5358
if path := os.Getenv("PATH"); path != "" {
5459
cmd = append(cmd, "--setenv=PATH="+path)
5560
}
@@ -62,11 +67,11 @@ func CreateTimer(ctx context.Context, container containerd.Container) error {
6267
cmd = append(cmd, "--debug")
6368
}
6469

65-
conn, err := dbus.NewSystemConnectionContext(context.Background())
66-
if err != nil {
67-
return fmt.Errorf("systemd DBUS connect error: %w", err)
68-
}
69-
defer conn.Close()
70+
// conn, err := dbus.NewSystemConnectionContext(context.Background())
71+
// if err != nil {
72+
// return fmt.Errorf("systemd DBUS connect error: %w", err)
73+
// }
74+
// defer conn.Close()
7075

7176
log.G(ctx).Debugf("creating healthcheck timer with: systemd-run %s", strings.Join(cmd, " "))
7277
run := exec.Command("systemd-run", cmd...)
@@ -79,29 +84,53 @@ func CreateTimer(ctx context.Context, container containerd.Container) error {
7984

8085
// StartTimer starts the healthcheck timer unit.
8186
func StartTimer(ctx context.Context, container containerd.Container) error {
87+
log.G(ctx).Infof("DEBUG: StartTimer called for container %s", container.ID())
88+
8289
hc := extractHealthcheck(ctx, container)
8390
if hc == nil {
91+
log.G(ctx).Infof("DEBUG: No healthcheck found, skipping StartTimer")
8492
return nil
8593
}
8694
if shouldSkipHealthCheckSystemd(hc) {
95+
log.G(ctx).Infof("DEBUG: Skipping healthcheck systemd, shouldSkip=true")
8796
return nil
8897
}
8998

9099
hcName := hcUnitName(container.ID(), true)
91-
conn, err := dbus.NewSystemConnectionContext(context.Background())
100+
log.G(ctx).Infof("DEBUG: Starting timer for unit: %s, rootless=%v", hcName, rootlessutil.IsRootless())
101+
102+
var conn *dbus.Conn
103+
var err error
104+
if rootlessutil.IsRootless() {
105+
log.G(ctx).Infof("DEBUG: Attempting user DBUS connection...")
106+
conn, err = dbus.NewUserConnectionContext(ctx)
107+
} else {
108+
log.G(ctx).Infof("DEBUG: Attempting system DBUS connection...")
109+
conn, err = dbus.NewSystemConnectionContext(ctx)
110+
}
92111
if err != nil {
112+
log.G(ctx).Errorf("DEBUG: DBUS connection failed: %v", err)
93113
return fmt.Errorf("systemd DBUS connect error: %w", err)
94114
}
95115
defer conn.Close()
116+
log.G(ctx).Infof("DEBUG: DBUS connection successful")
96117

97118
startChan := make(chan string)
98119
unit := hcName + ".service"
120+
log.G(ctx).Infof("DEBUG: About to restart unit: %s", unit)
121+
99122
if _, err := conn.RestartUnitContext(context.Background(), unit, "fail", startChan); err != nil {
123+
log.G(ctx).Errorf("DEBUG: RestartUnitContext failed: %v", err)
100124
return err
101125
}
126+
127+
log.G(ctx).Infof("DEBUG: Waiting for restart confirmation...")
102128
if msg := <-startChan; msg != "done" {
129+
log.G(ctx).Errorf("DEBUG: Unexpected restart result: %s", msg)
103130
return fmt.Errorf("unexpected systemd restart result: %s", msg)
104131
}
132+
133+
log.G(ctx).Infof("DEBUG: StartTimer completed successfully")
105134
return nil
106135
}
107136

@@ -115,44 +144,6 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C
115144
return ForceRemoveTransientHealthCheckFiles(ctx, container.ID())
116145
}
117146

118-
// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID.
119-
// This function is deprecated and no longer used. Use ForceRemoveTransientHealthCheckFiles instead.
120-
/*
121-
func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error {
122-
log.G(ctx).Debugf("Removing healthcheck timer unit: %s", containerID)
123-
124-
conn, err := dbus.NewSystemConnectionContext(context.Background())
125-
if err != nil {
126-
return fmt.Errorf("systemd DBUS connect error: %w", err)
127-
}
128-
defer conn.Close()
129-
130-
unitName := hcUnitName(containerID, true)
131-
timer := unitName + ".timer"
132-
service := unitName + ".service"
133-
134-
// Stop timer
135-
tChan := make(chan string)
136-
if _, err := conn.StopUnitContext(context.Background(), timer, "ignore-dependencies", tChan); err == nil {
137-
if msg := <-tChan; msg != "done" {
138-
log.G(ctx).Warnf("timer stop message: %s", msg)
139-
}
140-
}
141-
142-
// Stop service
143-
sChan := make(chan string)
144-
if _, err := conn.StopUnitContext(context.Background(), service, "ignore-dependencies", sChan); err == nil {
145-
if msg := <-sChan; msg != "done" {
146-
log.G(ctx).Warnf("service stop message: %s", msg)
147-
}
148-
}
149-
150-
// Reset failed units
151-
_ = conn.ResetFailedUnitContext(context.Background(), service)
152-
return nil
153-
}
154-
*/
155-
156147
// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service
157148
// using just the container ID. This function is non-blocking and uses timeouts to prevent hanging
158149
// on systemd operations. It logs errors as warnings but continues cleanup attempts.
@@ -174,7 +165,13 @@ func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID strin
174165
go func() {
175166
defer close(errChan)
176167

177-
conn, err := dbus.NewSystemConnectionContext(timeoutCtx)
168+
var conn *dbus.Conn
169+
var err error
170+
if rootlessutil.IsRootless() {
171+
conn, err = dbus.NewUserConnectionContext(ctx)
172+
} else {
173+
conn, err = dbus.NewSystemConnectionContext(ctx)
174+
}
178175
if err != nil {
179176
log.G(ctx).Warnf("systemd DBUS connect error during force cleanup: %v", err)
180177
errChan <- fmt.Errorf("systemd DBUS connect error: %w", err)
@@ -300,10 +297,10 @@ func shouldSkipHealthCheckSystemd(hc *Healthcheck) bool {
300297
return true
301298
}
302299

303-
// Skip healthchecks in rootless environments to avoid systemd DBUS permission issues
304-
if rootlessutil.IsRootless() {
305-
return true
306-
}
300+
// Skip healthchecks in environments without dbus-launch to avoid permission issues
301+
// if _, err := exec.LookPath("dbus-launch"); err != nil {
302+
// return true
303+
// }
307304

308305
// Don't proceed if health check is nil, empty, explicitly NONE or interval is 0.
309306
if hc == nil || len(hc.Test) == 0 || hc.Test[0] == "NONE" || hc.Interval == 0 {

test-dbus-debug.md

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
# DBUS Debugging Test Plan
2+
3+
## What We Added
4+
5+
### 1. GitHub Workflow Debugging (.github/workflows/job-test-in-container.yml)
6+
- Added a new step "Debug: DBUS availability in container" before running integration tests
7+
- Checks DBUS tool availability, binary locations, systemd tools, environment variables, and package info
8+
- Runs in a fresh, throwaway container (`docker run --rm`) started from the same test image that the integration tests use
9+
10+
### 2. Rootless Test Script Debugging (Dockerfile.d/test-integration-rootless.sh)
11+
- Added debugging as ROOT user before switching to rootless
12+
- Added debugging as ROOTLESS user after SSH switch
13+
- Added debugging AFTER containerd-rootless-setuptool.sh setup
14+
- Checks DBUS tools, environment variables, systemd status, and DBUS connectivity
15+
16+
## Expected Output
17+
18+
When the CI runs, we should see:
19+
20+
### Container Level (from GitHub workflow):
21+
```
22+
=== DBUS Debugging in Container ===
23+
--- DBUS Tools Availability ---
24+
/usr/bin/dbus-launch
25+
/usr/bin/dbus-daemon
26+
/usr/bin/dbus-send
27+
/usr/bin/dbus-monitor
28+
/usr/bin/systemd-run
29+
--- DBUS Binaries Location ---
30+
-rwxr-xr-x 1 root root ... /usr/bin/dbus-launch
31+
-rwxr-xr-x 1 root root ... /usr/bin/dbus-daemon
32+
...
33+
--- Package Info ---
34+
dbus 1.14.10-4ubuntu4.1
35+
systemd 255.4-1ubuntu8.4
36+
...
37+
```
38+
39+
### Root User Level:
40+
```
41+
=== DBUS Debugging as ROOT ===
42+
--- DBUS Tools Availability (root) ---
43+
/usr/bin/dbus-launch
44+
/usr/bin/systemd-run
45+
...
46+
--- Environment (root) ---
47+
PATH: /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
48+
USER: root
49+
UID: 0
50+
DBUS_SESSION_BUS_ADDRESS: unset
51+
XDG_RUNTIME_DIR: unset
52+
```
53+
54+
### Rootless User Level:
55+
```
56+
=== DBUS Debugging as ROOTLESS USER ===
57+
--- DBUS Tools Availability (rootless) ---
58+
/usr/bin/dbus-launch
59+
/usr/bin/systemd-run
60+
...
61+
--- Environment (rootless) ---
62+
PATH: /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
63+
USER: rootless
64+
UID: 1000
65+
DBUS_SESSION_BUS_ADDRESS: unset
66+
XDG_RUNTIME_DIR: /run/user/1000
67+
```
68+
69+
## What This Will Tell Us
70+
71+
1. **Are DBUS tools installed?** - We'll see if dbus-launch, dbus-daemon, etc. are available
72+
2. **Are they in PATH?** - We'll see the exact paths and verify accessibility
73+
3. **Environment differences** - Compare root vs rootless environment setup
74+
4. **Systemd user session status** - See whether `systemctl --user status` succeeds, i.e. whether a systemd user instance is reachable
75+
5. **DBUS connectivity** - Test whether `dbus-launch --sh-syntax` can start a session bus and print its address
76+
6. **Setup impact** - See how containerd-rootless-setuptool.sh affects DBUS environment
77+
78+
## Next Steps After Getting Debug Output
79+
80+
Based on the debug output, we can:
81+
1. **If DBUS tools are missing**: Add them to the Dockerfile
82+
2. **If DBUS tools exist but fail**: Implement graceful fallback in healthcheck code
83+
3. **If environment is wrong**: Fix environment setup in test script
84+
4. **If systemd user session fails**: Implement proper session initialization
85+
86+
This debugging will give us the exact information needed to solve the healthcheck timer failures in CI.

0 commit comments

Comments
 (0)