@@ -23,6 +23,7 @@ import (
23
23
"os"
24
24
"os/exec"
25
25
"strings"
26
+ "time"
26
27
27
28
"github.com/coreos/go-systemd/v22/dbus"
28
29
@@ -110,14 +111,13 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C
110
111
if hc == nil {
111
112
return nil
112
113
}
113
- if shouldSkipHealthCheckSystemd (hc ) {
114
- return nil
115
- }
116
114
117
- return RemoveTransientHealthCheckFilesByID (ctx , container .ID ())
115
+ return ForceRemoveTransientHealthCheckFiles (ctx , container .ID ())
118
116
}
119
117
120
118
// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID.
119
+ // This function is deprecated and no longer used. Use ForceRemoveTransientHealthCheckFiles instead.
120
+ /*
121
121
func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error {
122
122
log.G(ctx).Debugf("Removing healthcheck timer unit: %s", containerID)
123
123
@@ -151,6 +151,120 @@ func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string
151
151
_ = conn.ResetFailedUnitContext(context.Background(), service)
152
152
return nil
153
153
}
154
+ */
155
+
156
+ // ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service
157
+ // using just the container ID. This function is non-blocking and uses timeouts to prevent hanging
158
+ // on systemd operations. It logs errors as warnings but continues cleanup attempts.
159
+ func ForceRemoveTransientHealthCheckFiles (ctx context.Context , containerID string ) error {
160
+ log .G (ctx ).Debugf ("Force removing healthcheck timer unit: %s" , containerID )
161
+
162
+ // Create a timeout context for systemd operations (5 seconds default)
163
+ timeoutCtx , cancel := context .WithTimeout (ctx , 5 * time .Second )
164
+ defer cancel ()
165
+
166
+ unitName := hcUnitName (containerID , true )
167
+ timer := unitName + ".timer"
168
+ service := unitName + ".service"
169
+
170
+ // Channel to collect any critical errors (though we'll continue cleanup regardless)
171
+ errChan := make (chan error , 3 )
172
+
173
+ // Goroutine for DBUS connection and cleanup operations
174
+ go func () {
175
+ defer close (errChan )
176
+
177
+ conn , err := dbus .NewSystemConnectionContext (timeoutCtx )
178
+ if err != nil {
179
+ log .G (ctx ).Warnf ("systemd DBUS connect error during force cleanup: %v" , err )
180
+ errChan <- fmt .Errorf ("systemd DBUS connect error: %w" , err )
181
+ return
182
+ }
183
+ defer conn .Close ()
184
+
185
+ // Stop timer with timeout
186
+ go func () {
187
+ select {
188
+ case <- timeoutCtx .Done ():
189
+ log .G (ctx ).Warnf ("timeout stopping timer %s during force cleanup" , timer )
190
+ return
191
+ default :
192
+ tChan := make (chan string , 1 )
193
+ if _ , err := conn .StopUnitContext (timeoutCtx , timer , "ignore-dependencies" , tChan ); err == nil {
194
+ select {
195
+ case msg := <- tChan :
196
+ if msg != "done" {
197
+ log .G (ctx ).Warnf ("timer stop message during force cleanup: %s" , msg )
198
+ }
199
+ case <- timeoutCtx .Done ():
200
+ log .G (ctx ).Warnf ("timeout waiting for timer stop confirmation: %s" , timer )
201
+ }
202
+ } else {
203
+ log .G (ctx ).Warnf ("failed to stop timer %s during force cleanup: %v" , timer , err )
204
+ }
205
+ }
206
+ }()
207
+
208
+ // Stop service with timeout
209
+ go func () {
210
+ select {
211
+ case <- timeoutCtx .Done ():
212
+ log .G (ctx ).Warnf ("timeout stopping service %s during force cleanup" , service )
213
+ return
214
+ default :
215
+ sChan := make (chan string , 1 )
216
+ if _ , err := conn .StopUnitContext (timeoutCtx , service , "ignore-dependencies" , sChan ); err == nil {
217
+ select {
218
+ case msg := <- sChan :
219
+ if msg != "done" {
220
+ log .G (ctx ).Warnf ("service stop message during force cleanup: %s" , msg )
221
+ }
222
+ case <- timeoutCtx .Done ():
223
+ log .G (ctx ).Warnf ("timeout waiting for service stop confirmation: %s" , service )
224
+ }
225
+ } else {
226
+ log .G (ctx ).Warnf ("failed to stop service %s during force cleanup: %v" , service , err )
227
+ }
228
+ }
229
+ }()
230
+
231
+ // Reset failed units (best effort, non-blocking)
232
+ go func () {
233
+ select {
234
+ case <- timeoutCtx .Done ():
235
+ log .G (ctx ).Warnf ("timeout resetting failed unit %s during force cleanup" , service )
236
+ return
237
+ default :
238
+ if err := conn .ResetFailedUnitContext (timeoutCtx , service ); err != nil {
239
+ log .G (ctx ).Warnf ("failed to reset failed unit %s during force cleanup: %v" , service , err )
240
+ }
241
+ }
242
+ }()
243
+
244
+ // Wait a short time for operations to complete, but don't block indefinitely
245
+ select {
246
+ case <- time .After (3 * time .Second ):
247
+ log .G (ctx ).Debugf ("force cleanup operations completed for container %s" , containerID )
248
+ case <- timeoutCtx .Done ():
249
+ log .G (ctx ).Warnf ("force cleanup timed out for container %s" , containerID )
250
+ }
251
+ }()
252
+
253
+ // Wait for the cleanup goroutine to finish or timeout
254
+ select {
255
+ case err := <- errChan :
256
+ if err != nil {
257
+ log .G (ctx ).Warnf ("force cleanup encountered errors but continuing: %v" , err )
258
+ }
259
+ case <- timeoutCtx .Done ():
260
+ log .G (ctx ).Warnf ("force cleanup timed out for container %s, but cleanup may continue in background" , containerID )
261
+ }
262
+
263
+ // Always return nil - this function should never block the caller
264
+ // even if systemd operations fail or timeout
265
+ log .G (ctx ).Debugf ("force cleanup completed (non-blocking) for container %s" , containerID )
266
+ return nil
267
+ }
154
268
155
269
// hcUnitName returns a systemd unit name for a container healthcheck.
156
270
func hcUnitName (containerID string , bare bool ) string {
0 commit comments