Skip to content

Commit a2be1d8

Browse files
author
Samuel Archambault
committed
enable healthcheck arg
Signed-off-by: Samuel Archambault <[email protected]>
1 parent 14b68ba commit a2be1d8

File tree

5 files changed

+351
-9
lines changed

5 files changed

+351
-9
lines changed

libpod/oci_conmon_common.go

Lines changed: 112 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,15 @@ const (
4646
// Important: The conmon attach socket uses an extra byte at the beginning of each
4747
// message to specify the STREAM so we have to increase the buffer size by one
4848
bufferSize = conmonConfig.BufSize + 1
49+
50+
// Healthcheck message type from conmon (using negative to avoid PID conflicts)
51+
HealthCheckMsgStatusUpdate = -100
52+
53+
// Healthcheck status values sent by conmon (added to base message type -100)
54+
HealthCheckStatusNone = 0
55+
HealthCheckStatusStarting = 1
56+
HealthCheckStatusHealthy = 2
57+
HealthCheckStatusUnhealthy = 3
4958
)
5059

5160
// ConmonOCIRuntime is an OCI runtime managed by Conmon.
@@ -981,7 +990,6 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
981990
if err != nil {
982991
return 0, fmt.Errorf("creating socket pair: %w", err)
983992
}
984-
defer errorhandling.CloseQuiet(parentSyncPipe)
985993

986994
childStartPipe, parentStartPipe, err := newPipe()
987995
if err != nil {
@@ -1038,6 +1046,9 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
10381046
args = append(args, "--conmon-pidfile", ctr.config.ConmonPidFile)
10391047
}
10401048

1049+
// Add healthcheck-related arguments (build-conditional)
1050+
args = r.addHealthCheckArgs(ctr, args)
1051+
10411052
if r.noPivot {
10421053
args = append(args, "--no-pivot")
10431054
}
@@ -1199,6 +1210,8 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
11991210
// regardless of whether we errored or not, we no longer need the children pipes
12001211
childSyncPipe.Close()
12011212
childStartPipe.Close()
1213+
1214+
// Note: parentSyncPipe is NOT closed here because it's used for continuous healthcheck monitoring
12021215
if err != nil {
12031216
return 0, err
12041217
}
@@ -1219,7 +1232,7 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
12191232
return 0, fmt.Errorf("conmon failed: %w", err)
12201233
}
12211234

1222-
pid, err := readConmonPipeData(r.name, parentSyncPipe, ociLog)
1235+
pid, err := readConmonPipeData(r.name, parentSyncPipe, ociLog, ctr)
12231236
if err != nil {
12241237
if err2 := r.DeleteContainer(ctr); err2 != nil {
12251238
logrus.Errorf("Removing container %s from runtime after creation failed", ctr.ID())
@@ -1322,7 +1335,6 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p
13221335
logDriverArg = define.NoLogging
13231336
case define.PassthroughLogging, define.PassthroughTTYLogging:
13241337
logDriverArg = define.PassthroughLogging
1325-
//lint:ignore ST1015 the default case has to be here
13261338
default: //nolint:gocritic
13271339
// No case here should happen except JSONLogging, but keep this here in case the options are extended
13281340
logrus.Errorf("%s logging specified but not supported. Choosing k8s-file logging instead", ctr.LogDriver())
@@ -1390,13 +1402,15 @@ func readConmonPidFile(pidFile string) (int, error) {
13901402
return 0, nil
13911403
}
13921404

1405+
// syncInfo is the JSON message the monitor process (conmon) sends back to the
// daemon over the sync pipe, one object per line.
type syncInfo struct {
	// Data is the numeric payload, decoded from the "data" key.
	Data int `json:"data"`
	// Message is optional accompanying text, decoded from the "message" key
	// and omitted from encoded output when empty.
	Message string `json:"message,omitempty"`
}
1410+
13931411
// readConmonPipeData attempts to read a syncInfo struct from the pipe
1394-
func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int, error) {
1395-
// syncInfo is used to return data from monitor process to daemon
1396-
type syncInfo struct {
1397-
Data int `json:"data"`
1398-
Message string `json:"message,omitempty"`
1399-
}
1412+
// If ctr is provided, it will also start continuous healthcheck monitoring
1413+
func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string, ctr ...*Container) (int, error) {
14001414

14011415
// Wait to get container pid from conmon
14021416
type syncStruct struct {
@@ -1408,15 +1422,24 @@ func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int,
14081422
var si *syncInfo
14091423
rdr := bufio.NewReader(pipe)
14101424
b, err := rdr.ReadBytes('\n')
1425+
1426+
// Log the raw JSON string received from conmon
1427+
logrus.Debugf("HEALTHCHECK: Raw JSON received from conmon: %q", string(b))
1428+
logrus.Debugf("HEALTHCHECK: JSON length: %d bytes", len(b))
1429+
14111430
// ignore EOF here, error is returned even when data was read
14121431
// if it is no valid json unmarshal will fail below
14131432
if err != nil && !errors.Is(err, io.EOF) {
1433+
logrus.Debugf("HEALTHCHECK: Error reading from conmon pipe: %v", err)
14141434
ch <- syncStruct{err: err}
1435+
return
14151436
}
14161437
if err := json.Unmarshal(b, &si); err != nil {
1438+
logrus.Debugf("HEALTHCHECK: Failed to unmarshal JSON from conmon: %v", err)
14171439
ch <- syncStruct{err: fmt.Errorf("conmon bytes %q: %w", string(b), err)}
14181440
return
14191441
}
1442+
logrus.Debugf("HEALTHCHECK: Successfully parsed JSON from conmon: Data=%d, Message=%q", si.Data, si.Message)
14201443
ch <- syncStruct{si: si}
14211444
}()
14221445

@@ -1436,6 +1459,13 @@ func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int,
14361459
return -1, fmt.Errorf("container create failed (no logs from conmon): %w", ss.err)
14371460
}
14381461
logrus.Debugf("Received: %d", ss.si.Data)
1462+
1463+
// Start continuous healthcheck monitoring if container is provided and PID is valid
1464+
if len(ctr) > 0 && ctr[0] != nil && ss.si.Data > 0 {
1465+
logrus.Debugf("HEALTHCHECK: Starting continuous healthcheck monitoring for container %s (PID: %d)", ctr[0].ID(), ss.si.Data)
1466+
go readConmonHealthCheckPipeData(ctr[0], pipe)
1467+
}
1468+
14391469
if ss.si.Data < 0 {
14401470
if ociLog != "" {
14411471
ociLogData, err := os.ReadFile(ociLog)
@@ -1459,6 +1489,79 @@ func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int,
14591489
return data, nil
14601490
}
14611491

1492+
// readConmonHealthCheckPipeData continuously reads healthcheck status updates from conmon
1493+
func readConmonHealthCheckPipeData(ctr *Container, pipe *os.File) {
1494+
logrus.Debugf("HEALTHCHECK: Starting continuous healthcheck monitoring for container %s", ctr.ID())
1495+
1496+
rdr := bufio.NewReader(pipe)
1497+
for {
1498+
// Read one line from the pipe
1499+
b, err := rdr.ReadBytes('\n')
1500+
if err != nil {
1501+
if err == io.EOF {
1502+
logrus.Debugf("HEALTHCHECK: Pipe closed for container %s, stopping monitoring", ctr.ID())
1503+
return
1504+
}
1505+
logrus.Errorf("HEALTHCHECK: Error reading from pipe for container %s: %v", ctr.ID(), err)
1506+
return
1507+
}
1508+
1509+
// Log the raw JSON string received from conmon
1510+
logrus.Debugf("HEALTHCHECK: Raw JSON received from conmon for container %s: %q", ctr.ID(), string(b))
1511+
logrus.Debugf("HEALTHCHECK: JSON length: %d bytes", len(b))
1512+
1513+
// Parse the JSON
1514+
var si syncInfo
1515+
if err := json.Unmarshal(b, &si); err != nil {
1516+
logrus.Errorf("HEALTHCHECK: Failed to parse JSON from conmon for container %s: %v", ctr.ID(), err)
1517+
continue
1518+
}
1519+
1520+
logrus.Debugf("HEALTHCHECK: Parsed sync info for container %s: Data=%d, Message=%q", ctr.ID(), si.Data, si.Message)
1521+
1522+
// Handle healthcheck status updates based on your new encoding scheme
1523+
// Base message type is -100, status values are added to it:
1524+
// -100 + 0 (none) = -100
1525+
// -100 + 1 (starting) = -99
1526+
// -100 + 2 (healthy) = -98
1527+
// -100 + 3 (unhealthy) = -97
1528+
if si.Data >= HealthCheckMsgStatusUpdate && si.Data <= HealthCheckMsgStatusUpdate+HealthCheckStatusUnhealthy {
1529+
statusValue := si.Data - HealthCheckMsgStatusUpdate // Convert back to status value
1530+
var status string
1531+
1532+
switch statusValue {
1533+
case HealthCheckStatusNone:
1534+
status = define.HealthCheckReset // "reset" or "none"
1535+
case HealthCheckStatusStarting:
1536+
status = define.HealthCheckStarting // "starting"
1537+
case HealthCheckStatusHealthy:
1538+
status = define.HealthCheckHealthy // "healthy"
1539+
case HealthCheckStatusUnhealthy:
1540+
status = define.HealthCheckUnhealthy // "unhealthy"
1541+
default:
1542+
logrus.Errorf("HEALTHCHECK: Unknown status value %d for container %s", statusValue, ctr.ID())
1543+
continue
1544+
}
1545+
1546+
logrus.Infof("HEALTHCHECK: Received healthcheck status update for container %s: %s (message type: %d, status value: %d)",
1547+
ctr.ID(), status, si.Data, statusValue)
1548+
1549+
// Update the container's healthcheck status
1550+
if err := ctr.updateHealthStatus(status); err != nil {
1551+
logrus.Errorf("HEALTHCHECK: Failed to update healthcheck status for container %s: %v", ctr.ID(), err)
1552+
} else {
1553+
logrus.Infof("HEALTHCHECK: Successfully updated healthcheck status for container %s to %s", ctr.ID(), status)
1554+
}
1555+
} else if si.Data < 0 {
1556+
// Other negative message types - might be healthcheck related but not recognized
1557+
logrus.Debugf("HEALTHCHECK: Received unrecognized negative message type %d for container %s - might be healthcheck related", si.Data, ctr.ID())
1558+
} else if si.Data > 0 {
1559+
// Positive message types - not healthcheck related
1560+
logrus.Debugf("HEALTHCHECK: Received positive message type %d for container %s - not healthcheck related", si.Data, ctr.ID())
1561+
}
1562+
}
1563+
}
1564+
14621565
// writeConmonPipeData writes nonce data to a pipe
14631566
func writeConmonPipeData(pipe *os.File) error {
14641567
someData := []byte{0}

libpod/oci_conmon_nosystemd.go

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
//go:build !remote && (linux || freebsd) && !systemd
2+
3+
package libpod
4+
5+
import (
6+
"strconv"
7+
"time"
8+
9+
"github.com/sirupsen/logrus"
10+
)
11+
12+
// addHealthCheckArgs adds healthcheck-related arguments to conmon for non-systemd builds
13+
func (r *ConmonOCIRuntime) addHealthCheckArgs(ctr *Container, args []string) []string {
14+
// Add healthcheck configuration as CLI arguments if container has healthcheck config
15+
if ctr.HasHealthCheck() {
16+
healthConfig := ctr.HealthCheckConfig()
17+
if healthConfig != nil {
18+
logrus.Debugf("HEALTHCHECK: Adding healthcheck CLI args for container %s", ctr.ID())
19+
20+
// Build healthcheck command and arguments from test array
21+
healthCmd, healthArgs := r.buildHealthcheckCmdAndArgs(healthConfig.Test)
22+
if healthCmd != "" {
23+
args = append(args, "--healthcheck-cmd", healthCmd)
24+
25+
// Add all healthcheck arguments
26+
for _, arg := range healthArgs {
27+
args = append(args, "--healthcheck-arg", arg)
28+
}
29+
30+
// Add optional healthcheck parameters with validation and defaults
31+
interval := r.validateAndGetInterval(healthConfig.Interval)
32+
timeout := r.validateAndGetTimeout(healthConfig.Timeout)
33+
retries := r.validateAndGetRetries(healthConfig.Retries)
34+
startPeriod := r.validateAndGetStartPeriod(healthConfig.StartPeriod)
35+
36+
args = append(args, "--healthcheck-interval", strconv.Itoa(interval))
37+
args = append(args, "--healthcheck-timeout", strconv.Itoa(timeout))
38+
args = append(args, "--healthcheck-retries", strconv.Itoa(retries))
39+
args = append(args, "--healthcheck-start-period", strconv.Itoa(startPeriod))
40+
41+
logrus.Debugf("HEALTHCHECK: Added healthcheck args for container %s: cmd=%s, args=%v, interval=%ds, timeout=%ds, retries=%d, start-period=%ds",
42+
ctr.ID(), healthCmd, healthArgs, interval, timeout, retries, startPeriod)
43+
} else {
44+
logrus.Warnf("HEALTHCHECK: Container %s has healthcheck config but no valid command", ctr.ID())
45+
}
46+
}
47+
} else {
48+
logrus.Debugf("HEALTHCHECK: Container %s does not have healthcheck config, skipping healthcheck args", ctr.ID())
49+
}
50+
return args
51+
}
52+
53+
// buildHealthcheckCmdAndArgs converts Podman's healthcheck test array to command and arguments
54+
func (r *ConmonOCIRuntime) buildHealthcheckCmdAndArgs(test []string) (string, []string) {
55+
if len(test) == 0 {
56+
return "", nil
57+
}
58+
59+
// Handle special cases
60+
switch test[0] {
61+
case "", "NONE":
62+
return "", nil
63+
case "CMD":
64+
// CMD format: ["CMD", "curl", "-f", "http://localhost:8080/health"]
65+
// -> cmd="curl", args=["-f", "http://localhost:8080/health"]
66+
if len(test) > 1 {
67+
return test[1], test[2:]
68+
}
69+
return "", nil
70+
case "CMD-SHELL":
71+
// CMD-SHELL format: ["CMD-SHELL", "curl -f http://localhost:8080/health"]
72+
// -> cmd="/bin/sh", args=["-c", "curl -f http://localhost:8080/health"]
73+
if len(test) > 1 {
74+
return "/bin/sh", []string{"-c", test[1]}
75+
}
76+
return "", nil
77+
default:
78+
// Direct command format: ["curl", "-f", "http://localhost:8080/health"]
79+
// -> cmd="curl", args=["-f", "http://localhost:8080/health"]
80+
return test[0], test[1:]
81+
}
82+
}
83+
84+
// validateAndGetInterval validates and returns the healthcheck interval in seconds
85+
func (r *ConmonOCIRuntime) validateAndGetInterval(interval time.Duration) int {
86+
// Default interval is 30 seconds
87+
if interval <= 0 {
88+
return 30
89+
}
90+
// Ensure minimum interval of 1 second
91+
if interval < time.Second {
92+
logrus.Warnf("HEALTHCHECK: Interval %v is less than 1 second, using 1 second", interval)
93+
return 1
94+
}
95+
return int(interval.Seconds())
96+
}
97+
98+
// validateAndGetTimeout validates and returns the healthcheck timeout in seconds
99+
func (r *ConmonOCIRuntime) validateAndGetTimeout(timeout time.Duration) int {
100+
// Default timeout is 30 seconds
101+
if timeout <= 0 {
102+
return 30
103+
}
104+
// Ensure minimum timeout of 1 second
105+
if timeout < time.Second {
106+
logrus.Warnf("HEALTHCHECK: Timeout %v is less than 1 second, using 1 second", timeout)
107+
return 1
108+
}
109+
return int(timeout.Seconds())
110+
}
111+
112+
// validateAndGetRetries validates and returns the healthcheck retries count
113+
func (r *ConmonOCIRuntime) validateAndGetRetries(retries int) int {
114+
// Default retries is 3
115+
if retries <= 0 {
116+
return 3
117+
}
118+
// Ensure reasonable maximum retries (conmon should handle this too)
119+
if retries > 10 {
120+
logrus.Warnf("HEALTHCHECK: Retries %d is very high, using 10", retries)
121+
return 10
122+
}
123+
return retries
124+
}
125+
126+
// validateAndGetStartPeriod validates and returns the healthcheck start period in seconds
127+
func (r *ConmonOCIRuntime) validateAndGetStartPeriod(startPeriod time.Duration) int {
128+
// Default start period is 0 seconds
129+
if startPeriod < 0 {
130+
logrus.Warnf("HEALTHCHECK: Start period %v is negative, using 0", startPeriod)
131+
return 0
132+
}
133+
return int(startPeriod.Seconds())
134+
}

libpod/oci_conmon_systemd.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
//go:build !remote && (linux || freebsd) && systemd
2+
3+
package libpod
4+
5+
// addHealthCheckArgs adds healthcheck-related arguments to conmon for systemd builds
6+
func (r *ConmonOCIRuntime) addHealthCheckArgs(ctr *Container, args []string) []string {
7+
// For systemd builds, healthchecks are managed by systemd timers, not conmon
8+
// No healthcheck CLI arguments needed for conmon
9+
return args
10+
}

0 commit comments

Comments
 (0)