@@ -23,6 +23,7 @@ import (
2323 "os"
2424 "os/exec"
2525 "strings"
26+ "time"
2627
2728 "github.com/coreos/go-systemd/v22/dbus"
2829
@@ -110,14 +111,13 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C
110111 if hc == nil {
111112 return nil
112113 }
113- if shouldSkipHealthCheckSystemd (hc ) {
114- return nil
115- }
116114
117- return RemoveTransientHealthCheckFilesByID (ctx , container .ID ())
115+ return ForceRemoveTransientHealthCheckFiles (ctx , container .ID ())
118116}
119117
120118// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID.
119+ // This function is deprecated and no longer used. Use ForceRemoveTransientHealthCheckFiles instead.
120+ /*
121121func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error {
122122 log.G(ctx).Debugf("Removing healthcheck timer unit: %s", containerID)
123123
@@ -151,6 +151,120 @@ func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string
151151 _ = conn.ResetFailedUnitContext(context.Background(), service)
152152 return nil
153153}
154+ */
155+
156+ // ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service
157+ // using just the container ID. This function is non-blocking and uses timeouts to prevent hanging
158+ // on systemd operations. It logs errors as warnings but continues cleanup attempts.
159+ func ForceRemoveTransientHealthCheckFiles (ctx context.Context , containerID string ) error {
160+ log .G (ctx ).Debugf ("Force removing healthcheck timer unit: %s" , containerID )
161+
162+ // Create a timeout context for systemd operations (5 seconds default)
163+ timeoutCtx , cancel := context .WithTimeout (ctx , 5 * time .Second )
164+ defer cancel ()
165+
166+ unitName := hcUnitName (containerID , true )
167+ timer := unitName + ".timer"
168+ service := unitName + ".service"
169+
170+ // Channel to collect any critical errors (though we'll continue cleanup regardless)
171+ errChan := make (chan error , 3 )
172+
173+ // Goroutine for DBUS connection and cleanup operations
174+ go func () {
175+ defer close (errChan )
176+
177+ conn , err := dbus .NewSystemConnectionContext (timeoutCtx )
178+ if err != nil {
179+ log .G (ctx ).Warnf ("systemd DBUS connect error during force cleanup: %v" , err )
180+ errChan <- fmt .Errorf ("systemd DBUS connect error: %w" , err )
181+ return
182+ }
183+ defer conn .Close ()
184+
185+ // Stop timer with timeout
186+ go func () {
187+ select {
188+ case <- timeoutCtx .Done ():
189+ log .G (ctx ).Warnf ("timeout stopping timer %s during force cleanup" , timer )
190+ return
191+ default :
192+ tChan := make (chan string , 1 )
193+ if _ , err := conn .StopUnitContext (timeoutCtx , timer , "ignore-dependencies" , tChan ); err == nil {
194+ select {
195+ case msg := <- tChan :
196+ if msg != "done" {
197+ log .G (ctx ).Warnf ("timer stop message during force cleanup: %s" , msg )
198+ }
199+ case <- timeoutCtx .Done ():
200+ log .G (ctx ).Warnf ("timeout waiting for timer stop confirmation: %s" , timer )
201+ }
202+ } else {
203+ log .G (ctx ).Warnf ("failed to stop timer %s during force cleanup: %v" , timer , err )
204+ }
205+ }
206+ }()
207+
208+ // Stop service with timeout
209+ go func () {
210+ select {
211+ case <- timeoutCtx .Done ():
212+ log .G (ctx ).Warnf ("timeout stopping service %s during force cleanup" , service )
213+ return
214+ default :
215+ sChan := make (chan string , 1 )
216+ if _ , err := conn .StopUnitContext (timeoutCtx , service , "ignore-dependencies" , sChan ); err == nil {
217+ select {
218+ case msg := <- sChan :
219+ if msg != "done" {
220+ log .G (ctx ).Warnf ("service stop message during force cleanup: %s" , msg )
221+ }
222+ case <- timeoutCtx .Done ():
223+ log .G (ctx ).Warnf ("timeout waiting for service stop confirmation: %s" , service )
224+ }
225+ } else {
226+ log .G (ctx ).Warnf ("failed to stop service %s during force cleanup: %v" , service , err )
227+ }
228+ }
229+ }()
230+
231+ // Reset failed units (best effort, non-blocking)
232+ go func () {
233+ select {
234+ case <- timeoutCtx .Done ():
235+ log .G (ctx ).Warnf ("timeout resetting failed unit %s during force cleanup" , service )
236+ return
237+ default :
238+ if err := conn .ResetFailedUnitContext (timeoutCtx , service ); err != nil {
239+ log .G (ctx ).Warnf ("failed to reset failed unit %s during force cleanup: %v" , service , err )
240+ }
241+ }
242+ }()
243+
244+ // Wait a short time for operations to complete, but don't block indefinitely
245+ select {
246+ case <- time .After (3 * time .Second ):
247+ log .G (ctx ).Debugf ("force cleanup operations completed for container %s" , containerID )
248+ case <- timeoutCtx .Done ():
249+ log .G (ctx ).Warnf ("force cleanup timed out for container %s" , containerID )
250+ }
251+ }()
252+
253+ // Wait for the cleanup goroutine to finish or timeout
254+ select {
255+ case err := <- errChan :
256+ if err != nil {
257+ log .G (ctx ).Warnf ("force cleanup encountered errors but continuing: %v" , err )
258+ }
259+ case <- timeoutCtx .Done ():
260+ log .G (ctx ).Warnf ("force cleanup timed out for container %s, but cleanup may continue in background" , containerID )
261+ }
262+
263+ // Always return nil - this function should never block the caller
264+ // even if systemd operations fail or timeout
265+ log .G (ctx ).Debugf ("force cleanup completed (non-blocking) for container %s" , containerID )
266+ return nil
267+ }
154268
155269// hcUnitName returns a systemd unit name for a container healthcheck.
156270func hcUnitName (containerID string , bare bool ) string {
0 commit comments