Skip to content

Commit df90606

Browse files
Merge pull request #26086 from Honny1/hc-timeout
Fix: Ensure HealthCheck exec session terminates on timeout
2 parents e68bd89 + 077649f commit df90606

File tree

7 files changed

+38
-21
lines changed

7 files changed

+38
-21
lines changed

cmd/podman/common/create.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -627,7 +627,7 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
627627
createFlags.StringVar(
628628
&cf.HealthTimeout,
629629
healthTimeoutFlagName, define.DefaultHealthCheckTimeout,
630-
"the maximum time allowed to complete the healthcheck before an interval is considered failed",
630+
"the maximum time allowed to complete the healthcheck before an interval is considered failed and SIGKILL is sent to the healthcheck process",
631631
)
632632
_ = cmd.RegisterFlagCompletionFunc(healthTimeoutFlagName, completion.AutocompleteNone)
633633

docs/source/markdown/options/health-timeout.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,7 @@
77
The maximum time allowed to complete the healthcheck before an interval is considered failed. Like start-period, the
88
value can be expressed in a time format such as **1m22s**. The default value is **30s**.
99

10-
Note: A timeout marks the healthcheck as failed but does not terminate the running process.
11-
This ensures that a slow but eventually successful healthcheck does not disrupt the container
12-
but is still accounted for in the health status.
10+
Note: A timeout marks the healthcheck as failed. If the healthcheck command itself runs longer than the specified *timeout*,
11+
it will be sent a `SIGKILL` signal.
1312

1413
Note: This parameter will overwrite related healthcheck configuration from the image.

libpod/container_exec.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -820,7 +820,7 @@ func (c *Container) ExecResize(sessionID string, newSize resize.TerminalSize) er
820820
return c.ociRuntime.ExecAttachResize(c, sessionID, newSize)
821821
}
822822

823-
func (c *Container) healthCheckExec(config *ExecConfig, streams *define.AttachStreams) (int, error) {
823+
func (c *Container) healthCheckExec(config *ExecConfig, timeout time.Duration, streams *define.AttachStreams) (int, error) {
824824
unlock := true
825825
if !c.batched {
826826
c.lock.Lock()
@@ -868,15 +868,24 @@ func (c *Container) healthCheckExec(config *ExecConfig, streams *define.AttachSt
868868
if err != nil {
869869
return -1, err
870870
}
871+
session.PID = pid
872+
session.PIDData = getPidData(pid)
871873

872874
if !c.batched {
873875
c.lock.Unlock()
874876
unlock = false
875877
}
876878

877-
err = <-attachErrChan
878-
if err != nil {
879-
return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err)
879+
select {
880+
case err = <-attachErrChan:
881+
if err != nil {
882+
return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err)
883+
}
884+
case <-time.After(timeout):
885+
if err := c.ociRuntime.ExecStopContainer(c, session.ID(), 0); err != nil {
886+
return -1, err
887+
}
888+
return -1, fmt.Errorf("%v of %s", define.ErrHealthCheckTimeout, c.HealthCheckConfig().Timeout.String())
880889
}
881890

882891
return c.readExecExitCode(session.ID())

libpod/define/errors.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,4 +217,7 @@ var (
217217
// ErrRemovingCtrs indicates that there was an error removing all
218218
// containers from a pod.
219219
ErrRemovingCtrs = errors.New("removing pod containers")
220+
221+
// ErrHealthCheckTimeout indicates that a HealthCheck timed out.
222+
ErrHealthCheckTimeout = errors.New("healthcheck command exceeded timeout")
220223
)

libpod/healthcheck.go

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -93,19 +93,23 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
9393
streams.AttachInput = true
9494

9595
logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID())
96-
timeStart := time.Now()
9796
hcResult := define.HealthCheckSuccess
9897
config := new(ExecConfig)
9998
config.Command = newCommand
100-
exitCode, hcErr := c.healthCheckExec(config, streams)
99+
timeStart := time.Now()
100+
exitCode, hcErr := c.healthCheckExec(config, c.HealthCheckConfig().Timeout, streams)
101+
timeEnd := time.Now()
101102
if hcErr != nil {
102103
hcResult = define.HealthCheckFailure
103-
if errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
104+
switch {
105+
case errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
104106
errors.Is(hcErr, define.ErrOCIRuntimePermissionDenied) ||
105-
errors.Is(hcErr, define.ErrOCIRuntime) {
107+
errors.Is(hcErr, define.ErrOCIRuntime):
106108
returnCode = 1
107109
hcErr = nil
108-
} else {
110+
case errors.Is(hcErr, define.ErrHealthCheckTimeout):
111+
returnCode = -1
112+
default:
109113
returnCode = 125
110114
}
111115
} else if exitCode != 0 {
@@ -140,7 +144,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
140144
hcResult = define.HealthCheckContainerStopped
141145
}
142146

143-
timeEnd := time.Now()
144147
if c.HealthCheckConfig().StartPeriod > 0 {
145148
// there is a start-period we need to honor; we add startPeriod to container start time
146149
startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod)
@@ -156,12 +159,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
156159
eventLog = eventLog[:c.HealthCheckMaxLogSize()]
157160
}
158161

159-
if timeEnd.Sub(timeStart) > c.HealthCheckConfig().Timeout {
160-
returnCode = -1
161-
hcResult = define.HealthCheckFailure
162-
hcErr = fmt.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String())
163-
}
164-
165162
hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)
166163

167164
healthCheckResult, err := c.updateHealthCheckLog(hcl, hcResult, inStartPeriod, isStartup)

libpod/oci_conmon_exec_common.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,7 @@ func (r *ConmonOCIRuntime) ExecStopContainer(ctr *Container, sessionID string, t
258258

259259
// SIGTERM did not work. On to SIGKILL.
260260
logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGKILL", sessionID, pid, ctr.ID())
261-
if err := pidHandle.Kill(unix.SIGTERM); err != nil {
261+
if err := pidHandle.Kill(unix.SIGKILL); err != nil {
262262
if err == unix.ESRCH {
263263
return nil
264264
}

test/e2e/healthcheck_run_test.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,4 +404,13 @@ HEALTHCHECK CMD ls -l / 2>&1`, ALPINE)
404404
Expect(ps.OutputToStringArray()).To(HaveLen(2))
405405
Expect(ps.OutputToString()).To(ContainSubstring("hc"))
406406
})
407+
408+
It("podman healthcheck - health timeout", func() {
409+
ctrName := "c-h-" + RandomString(6)
410+
podmanTest.PodmanExitCleanly("run", "-d", "--name", ctrName, "--health-cmd", "top", "--health-timeout=3s", ALPINE, "top")
411+
412+
hc := podmanTest.Podman([]string{"healthcheck", "run", ctrName})
413+
hc.WaitWithTimeout(10)
414+
Expect(hc).Should(ExitWithError(125, "Error: healthcheck command exceeded timeout of 3s"))
415+
})
407416
})

0 commit comments

Comments
 (0)