Skip to content

Commit 98cf1c8

Browse files
committed
add non blocking systemd removal
Signed-off-by: Arjun Raja Yogidas <[email protected]>
1 parent d035a7f commit 98cf1c8

File tree

4 files changed

+138
-10
lines changed

4 files changed

+138
-10
lines changed

cmd/nerdctl/container/container_health_check_linux_test.go

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -695,8 +695,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) {
695695
"--health-interval", "1s",
696696
testutil.CommonImage, "sleep", "30")
697697
nerdtest.EnsureContainerStarted(helpers, data.Identifier())
698-
// Wait longer for systemd timer creation and first healthcheck execution
699-
time.Sleep(3 * time.Second)
700698
},
701699
Cleanup: func(data test.Data, helpers test.Helpers) {
702700
helpers.Anyhow("rm", "-f", data.Identifier())
@@ -724,7 +722,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) {
724722
})
725723
// Stop container and verify cleanup
726724
helpers.Ensure("stop", data.Identifier())
727-
time.Sleep(500 * time.Millisecond) // Allow cleanup to complete
728725

729726
// Check that timer is gone
730727
result = helpers.Custom("systemctl", "list-timers", "--all", "--no-pager")
@@ -748,7 +745,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) {
748745
"--health-interval", "2s",
749746
testutil.CommonImage, "sleep", "60")
750747
nerdtest.EnsureContainerStarted(helpers, data.Identifier())
751-
time.Sleep(3 * time.Second) // Wait for initial timer creation
752748
},
753749
Cleanup: func(data test.Data, helpers test.Helpers) {
754750
helpers.Anyhow("rm", "-f", data.Identifier())
@@ -770,7 +766,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) {
770766

771767
// Step 2: Stop container
772768
helpers.Ensure("stop", data.Identifier())
773-
time.Sleep(1 * time.Second) // Allow cleanup
774769

775770
// Step 3: Verify timer is removed after stop
776771
result = helpers.Custom("systemctl", "list-timers", "--all", "--no-pager")
@@ -785,7 +780,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) {
785780
// Step 4: Restart container
786781
helpers.Ensure("start", data.Identifier())
787782
nerdtest.EnsureContainerStarted(helpers, data.Identifier())
788-
time.Sleep(3 * time.Second) // Wait for timer recreation
789783

790784
// Step 5: Verify timer is recreated after restart - this is our final verification
791785
return helpers.Custom("systemctl", "list-timers", "--all", "--no-pager")

pkg/healthcheck/healthcheck_manager_darwin.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,16 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C
3838
}
3939

4040
// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID.
41+
// This function is deprecated and no longer used. Use ForceRemoveTransientHealthCheckFiles instead.
42+
/*
4143
func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error {
4244
return nil
4345
}
46+
*/
47+
48+
// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service
49+
// using just the container ID. This function is non-blocking and uses timeouts to prevent hanging
50+
// on systemd operations. On Darwin, this is a no-op since systemd is not available.
51+
func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID string) error {
52+
return nil
53+
}

pkg/healthcheck/healthcheck_manager_linux.go

Lines changed: 118 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"os"
2424
"os/exec"
2525
"strings"
26+
"time"
2627

2728
"github.com/coreos/go-systemd/v22/dbus"
2829

@@ -110,14 +111,13 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C
110111
if hc == nil {
111112
return nil
112113
}
113-
if shouldSkipHealthCheckSystemd(hc) {
114-
return nil
115-
}
116114

117-
return RemoveTransientHealthCheckFilesByID(ctx, container.ID())
115+
return ForceRemoveTransientHealthCheckFiles(ctx, container.ID())
118116
}
119117

120118
// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID.
119+
// This function is deprecated and no longer used. Use ForceRemoveTransientHealthCheckFiles instead.
120+
/*
121121
func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error {
122122
log.G(ctx).Debugf("Removing healthcheck timer unit: %s", containerID)
123123
@@ -151,6 +151,120 @@ func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string
151151
_ = conn.ResetFailedUnitContext(context.Background(), service)
152152
return nil
153153
}
154+
*/
155+
156+
// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service
157+
// using just the container ID. This function is non-blocking and uses timeouts to prevent hanging
158+
// on systemd operations. It logs errors as warnings but continues cleanup attempts.
159+
func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID string) error {
160+
log.G(ctx).Debugf("Force removing healthcheck timer unit: %s", containerID)
161+
162+
// Create a timeout context for systemd operations (5 seconds default)
163+
timeoutCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
164+
defer cancel()
165+
166+
unitName := hcUnitName(containerID, true)
167+
timer := unitName + ".timer"
168+
service := unitName + ".service"
169+
170+
// Channel to collect any critical errors (though we'll continue cleanup regardless)
171+
errChan := make(chan error, 3)
172+
173+
// Goroutine for DBUS connection and cleanup operations
174+
go func() {
175+
defer close(errChan)
176+
177+
conn, err := dbus.NewSystemConnectionContext(timeoutCtx)
178+
if err != nil {
179+
log.G(ctx).Warnf("systemd DBUS connect error during force cleanup: %v", err)
180+
errChan <- fmt.Errorf("systemd DBUS connect error: %w", err)
181+
return
182+
}
183+
defer conn.Close()
184+
185+
// Stop timer with timeout
186+
go func() {
187+
select {
188+
case <-timeoutCtx.Done():
189+
log.G(ctx).Warnf("timeout stopping timer %s during force cleanup", timer)
190+
return
191+
default:
192+
tChan := make(chan string, 1)
193+
if _, err := conn.StopUnitContext(timeoutCtx, timer, "ignore-dependencies", tChan); err == nil {
194+
select {
195+
case msg := <-tChan:
196+
if msg != "done" {
197+
log.G(ctx).Warnf("timer stop message during force cleanup: %s", msg)
198+
}
199+
case <-timeoutCtx.Done():
200+
log.G(ctx).Warnf("timeout waiting for timer stop confirmation: %s", timer)
201+
}
202+
} else {
203+
log.G(ctx).Warnf("failed to stop timer %s during force cleanup: %v", timer, err)
204+
}
205+
}
206+
}()
207+
208+
// Stop service with timeout
209+
go func() {
210+
select {
211+
case <-timeoutCtx.Done():
212+
log.G(ctx).Warnf("timeout stopping service %s during force cleanup", service)
213+
return
214+
default:
215+
sChan := make(chan string, 1)
216+
if _, err := conn.StopUnitContext(timeoutCtx, service, "ignore-dependencies", sChan); err == nil {
217+
select {
218+
case msg := <-sChan:
219+
if msg != "done" {
220+
log.G(ctx).Warnf("service stop message during force cleanup: %s", msg)
221+
}
222+
case <-timeoutCtx.Done():
223+
log.G(ctx).Warnf("timeout waiting for service stop confirmation: %s", service)
224+
}
225+
} else {
226+
log.G(ctx).Warnf("failed to stop service %s during force cleanup: %v", service, err)
227+
}
228+
}
229+
}()
230+
231+
// Reset failed units (best effort, non-blocking)
232+
go func() {
233+
select {
234+
case <-timeoutCtx.Done():
235+
log.G(ctx).Warnf("timeout resetting failed unit %s during force cleanup", service)
236+
return
237+
default:
238+
if err := conn.ResetFailedUnitContext(timeoutCtx, service); err != nil {
239+
log.G(ctx).Warnf("failed to reset failed unit %s during force cleanup: %v", service, err)
240+
}
241+
}
242+
}()
243+
244+
// Wait a short time for operations to complete, but don't block indefinitely
245+
select {
246+
case <-time.After(3 * time.Second):
247+
log.G(ctx).Debugf("force cleanup operations completed for container %s", containerID)
248+
case <-timeoutCtx.Done():
249+
log.G(ctx).Warnf("force cleanup timed out for container %s", containerID)
250+
}
251+
}()
252+
253+
// Wait for the cleanup goroutine to finish or timeout
254+
select {
255+
case err := <-errChan:
256+
if err != nil {
257+
log.G(ctx).Warnf("force cleanup encountered errors but continuing: %v", err)
258+
}
259+
case <-timeoutCtx.Done():
260+
log.G(ctx).Warnf("force cleanup timed out for container %s, but cleanup may continue in background", containerID)
261+
}
262+
263+
// Always return nil - this function should never block the caller
264+
// even if systemd operations fail or timeout
265+
log.G(ctx).Debugf("force cleanup completed (non-blocking) for container %s", containerID)
266+
return nil
267+
}
154268

155269
// hcUnitName returns a systemd unit name for a container healthcheck.
156270
func hcUnitName(containerID string, bare bool) string {

pkg/healthcheck/healthcheck_manager_windows.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,16 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C
3838
}
3939

4040
// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID.
41+
// This function is deprecated and no longer used. Use ForceRemoveTransientHealthCheckFiles instead.
42+
/*
4143
func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error {
4244
return nil
4345
}
46+
*/
47+
48+
// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service
49+
// using just the container ID. This function is non-blocking and uses timeouts to prevent hanging
50+
// on systemd operations. On Windows, this is a no-op since systemd is not available.
51+
func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID string) error {
52+
return nil
53+
}

0 commit comments

Comments
 (0)