From a2be1d8de15c0e3752a1875e0c155d9b8588b5d2 Mon Sep 17 00:00:00 2001 From: Samuel Archambault Date: Thu, 11 Sep 2025 08:40:29 -0400 Subject: [PATCH] enable healthcheck arg Signed-off-by: Samuel Archambault --- libpod/oci_conmon_common.go | 121 +++++++++++++++++++++++-- libpod/oci_conmon_nosystemd.go | 134 ++++++++++++++++++++++++++++ libpod/oci_conmon_systemd.go | 10 +++ pkg/specgen/generate/oci_freebsd.go | 44 +++++++++ pkg/specgen/generate/oci_linux.go | 51 +++++++++++ 5 files changed, 351 insertions(+), 9 deletions(-) create mode 100644 libpod/oci_conmon_nosystemd.go create mode 100644 libpod/oci_conmon_systemd.go diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index 2185148a59..33e77e0e0f 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -46,6 +46,15 @@ const ( // Important: The conmon attach socket uses an extra byte at the beginning of each // message to specify the STREAM so we have to increase the buffer size by one bufferSize = conmonConfig.BufSize + 1 + + // Healthcheck message type from conmon (using negative to avoid PID conflicts) + HealthCheckMsgStatusUpdate = -100 + + // Healthcheck status values sent by conmon (added to base message type -100) + HealthCheckStatusNone = 0 + HealthCheckStatusStarting = 1 + HealthCheckStatusHealthy = 2 + HealthCheckStatusUnhealthy = 3 ) // ConmonOCIRuntime is an OCI runtime managed by Conmon. @@ -981,7 +990,6 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co if err != nil { return 0, fmt.Errorf("creating socket pair: %w", err) } - defer errorhandling.CloseQuiet(parentSyncPipe) childStartPipe, parentStartPipe, err := newPipe() if err != nil { @@ -1038,6 +1046,9 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co args = append(args, "--conmon-pidfile", ctr.config.ConmonPidFile) } + // Add healthcheck-related arguments (build-conditional) + args = r.addHealthCheckArgs(ctr, args) + if r.noPivot { args = append(args, "--no-pivot") } @@ -1199,6 +1210,8 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co // regardless of whether we errored or not, we no longer need the children pipes childSyncPipe.Close() childStartPipe.Close() + + // Note: parentSyncPipe is NOT closed here because it's used for continuous healthcheck monitoring if err != nil { return 0, err } @@ -1219,7 +1232,7 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co return 0, fmt.Errorf("conmon failed: %w", err) } - pid, err := readConmonPipeData(r.name, parentSyncPipe, ociLog) + pid, err := readConmonPipeData(r.name, parentSyncPipe, ociLog, ctr) if err != nil { if err2 := r.DeleteContainer(ctr); err2 != nil { logrus.Errorf("Removing container %s from runtime after creation failed", ctr.ID()) @@ -1322,7 +1335,6 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p logDriverArg = define.NoLogging case define.PassthroughLogging, define.PassthroughTTYLogging: logDriverArg = define.PassthroughLogging - //lint:ignore ST1015 the default case has to be here default: //nolint:gocritic // No case here should happen except JSONLogging, but keep this here in case the options are extended logrus.Errorf("%s logging specified but not supported. Choosing k8s-file logging instead", ctr.LogDriver()) @@ -1390,13 +1402,15 @@ func readConmonPidFile(pidFile string) (int, error) { return 0, nil } +// syncInfo is used to return data from monitor process to daemon +type syncInfo struct { + Data int `json:"data"` + Message string `json:"message,omitempty"` +} + // readConmonPipeData attempts to read a syncInfo struct from the pipe -func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int, error) { - // syncInfo is used to return data from monitor process to daemon - type syncInfo struct { - Data int `json:"data"` - Message string `json:"message,omitempty"` - } +// If ctr is provided, it will also start continuous healthcheck monitoring +func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string, ctr ...*Container) (int, error) { // Wait to get container pid from conmon type syncStruct struct { @@ -1408,15 +1422,24 @@ func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int, var si *syncInfo rdr := bufio.NewReader(pipe) b, err := rdr.ReadBytes('\n') + + // Log the raw JSON string received from conmon + logrus.Debugf("HEALTHCHECK: Raw JSON received from conmon: %q", string(b)) + logrus.Debugf("HEALTHCHECK: JSON length: %d bytes", len(b)) + // ignore EOF here, error is returned even when data was read // if it is no valid json unmarshal will fail below if err != nil && !errors.Is(err, io.EOF) { + logrus.Debugf("HEALTHCHECK: Error reading from conmon pipe: %v", err) ch <- syncStruct{err: err} + return } if err := json.Unmarshal(b, &si); err != nil { + logrus.Debugf("HEALTHCHECK: Failed to unmarshal JSON from conmon: %v", err) ch <- syncStruct{err: fmt.Errorf("conmon bytes %q: %w", string(b), err)} return } + logrus.Debugf("HEALTHCHECK: Successfully parsed JSON from conmon: Data=%d, Message=%q", si.Data, si.Message) ch <- syncStruct{si: si} }() @@ -1436,6 +1459,13 @@ func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int, return -1, fmt.Errorf("container create failed (no logs from conmon): %w", ss.err) } logrus.Debugf("Received: %d", ss.si.Data) + + // Start continuous healthcheck monitoring if container is provided and PID is valid + if len(ctr) > 0 && ctr[0] != nil && ss.si.Data > 0 { + logrus.Debugf("HEALTHCHECK: Starting continuous healthcheck monitoring for container %s (PID: %d)", ctr[0].ID(), ss.si.Data) + go readConmonHealthCheckPipeData(ctr[0], pipe) + } + if ss.si.Data < 0 { if ociLog != "" { ociLogData, err := os.ReadFile(ociLog) @@ -1459,6 +1489,79 @@ func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int, return data, nil } +// readConmonHealthCheckPipeData continuously reads healthcheck status updates from conmon +func readConmonHealthCheckPipeData(ctr *Container, pipe *os.File) { + logrus.Debugf("HEALTHCHECK: Starting continuous healthcheck monitoring for container %s", ctr.ID()) + + rdr := bufio.NewReader(pipe) + for { + // Read one line from the pipe + b, err := rdr.ReadBytes('\n') + if err != nil { + if err == io.EOF { + logrus.Debugf("HEALTHCHECK: Pipe closed for container %s, stopping monitoring", ctr.ID()) + return + } + logrus.Errorf("HEALTHCHECK: Error reading from pipe for container %s: %v", ctr.ID(), err) + return + } + + // Log the raw JSON string received from conmon + logrus.Debugf("HEALTHCHECK: Raw JSON received from conmon for container %s: %q", ctr.ID(), string(b)) + logrus.Debugf("HEALTHCHECK: JSON length: %d bytes", len(b)) + + // Parse the JSON + var si syncInfo + if err := json.Unmarshal(b, &si); err != nil { + logrus.Errorf("HEALTHCHECK: Failed to parse JSON from conmon for container %s: %v", ctr.ID(), err) + continue + } + + logrus.Debugf("HEALTHCHECK: Parsed sync info for container %s: Data=%d, Message=%q", ctr.ID(), si.Data, si.Message) + + // Handle healthcheck status updates based on your new encoding scheme + // Base message type is -100, status values are added to it: + // -100 + 0 (none) = -100 + // -100 + 1 (starting) = -99 + // -100 + 2 (healthy) = -98 + // -100 + 3 (unhealthy) = -97 + if si.Data >= HealthCheckMsgStatusUpdate && si.Data <= HealthCheckMsgStatusUpdate+HealthCheckStatusUnhealthy { + statusValue := si.Data - HealthCheckMsgStatusUpdate // Convert back to status value + var status string + + switch statusValue { + case HealthCheckStatusNone: + status = define.HealthCheckReset // "reset" or "none" + case HealthCheckStatusStarting: + status = define.HealthCheckStarting // "starting" + case HealthCheckStatusHealthy: + status = define.HealthCheckHealthy // "healthy" + case HealthCheckStatusUnhealthy: + status = define.HealthCheckUnhealthy // "unhealthy" + default: + logrus.Errorf("HEALTHCHECK: Unknown status value %d for container %s", statusValue, ctr.ID()) + continue + } + + logrus.Infof("HEALTHCHECK: Received healthcheck status update for container %s: %s (message type: %d, status value: %d)", + ctr.ID(), status, si.Data, statusValue) + + // Update the container's healthcheck status + if err := ctr.updateHealthStatus(status); err != nil { + logrus.Errorf("HEALTHCHECK: Failed to update healthcheck status for container %s: %v", ctr.ID(), err) + } else { + logrus.Infof("HEALTHCHECK: Successfully updated healthcheck status for container %s to %s", ctr.ID(), status) + } + } else if si.Data < 0 { + // Other negative message types - might be healthcheck related but not recognized + logrus.Debugf("HEALTHCHECK: Received unrecognized negative message type %d for container %s - might be healthcheck related", si.Data, ctr.ID()) + } else if si.Data > 0 { + // Positive message types - not healthcheck related + logrus.Debugf("HEALTHCHECK: Received positive message type %d for container %s - not healthcheck related", si.Data, ctr.ID()) + } + } +} + // writeConmonPipeData writes nonce data to a pipe func writeConmonPipeData(pipe *os.File) error { someData := []byte{0} diff --git a/libpod/oci_conmon_nosystemd.go b/libpod/oci_conmon_nosystemd.go new file mode 100644 index 0000000000..05a05b2650 --- /dev/null +++ b/libpod/oci_conmon_nosystemd.go @@ -0,0 +1,134 @@ +//go:build !remote && (linux || freebsd) && !systemd + +package libpod + +import ( + "strconv" + "time" + + "github.com/sirupsen/logrus" +) + +// addHealthCheckArgs adds healthcheck-related arguments to conmon for non-systemd builds +func (r *ConmonOCIRuntime) addHealthCheckArgs(ctr *Container, args []string) []string { + // Add healthcheck configuration as CLI arguments if container has healthcheck config + if ctr.HasHealthCheck() { + healthConfig := ctr.HealthCheckConfig() + if healthConfig != nil { + logrus.Debugf("HEALTHCHECK: Adding healthcheck CLI args for container %s", ctr.ID()) + + // Build healthcheck command and arguments from test array + healthCmd, healthArgs := r.buildHealthcheckCmdAndArgs(healthConfig.Test) + if healthCmd != "" { + args = append(args, "--healthcheck-cmd", healthCmd) + + // Add all healthcheck arguments + for _, arg := range healthArgs { + args = append(args, "--healthcheck-arg", arg) + } + + // Add optional healthcheck parameters with validation and defaults + interval := r.validateAndGetInterval(healthConfig.Interval) + timeout := r.validateAndGetTimeout(healthConfig.Timeout) + retries := r.validateAndGetRetries(healthConfig.Retries) + startPeriod := r.validateAndGetStartPeriod(healthConfig.StartPeriod) + + args = append(args, "--healthcheck-interval", strconv.Itoa(interval)) + args = append(args, "--healthcheck-timeout", strconv.Itoa(timeout)) + args = append(args, "--healthcheck-retries", strconv.Itoa(retries)) + args = append(args, "--healthcheck-start-period", strconv.Itoa(startPeriod)) + + logrus.Debugf("HEALTHCHECK: Added healthcheck args for container %s: cmd=%s, args=%v, interval=%ds, timeout=%ds, retries=%d, start-period=%ds", + ctr.ID(), healthCmd, healthArgs, interval, timeout, retries, startPeriod) + } else { + logrus.Warnf("HEALTHCHECK: Container %s has healthcheck config but no valid command", ctr.ID()) + } + } + } else { + logrus.Debugf("HEALTHCHECK: Container %s does not have healthcheck config, skipping healthcheck args", ctr.ID()) + } + return args +} + +// buildHealthcheckCmdAndArgs converts Podman's healthcheck test array to command and arguments +func (r *ConmonOCIRuntime) buildHealthcheckCmdAndArgs(test []string) (string, []string) { + if len(test) == 0 { + return "", nil + } + + // Handle special cases + switch test[0] { + case "", "NONE": + return "", nil + case "CMD": + // CMD format: ["CMD", "curl", "-f", "http://localhost:8080/health"] + // -> cmd="curl", args=["-f", "http://localhost:8080/health"] + if len(test) > 1 { + return test[1], test[2:] + } + return "", nil + case "CMD-SHELL": + // CMD-SHELL format: ["CMD-SHELL", "curl -f http://localhost:8080/health"] + // -> cmd="/bin/sh", args=["-c", "curl -f http://localhost:8080/health"] + if len(test) > 1 { + return "/bin/sh", []string{"-c", test[1]} + } + return "", nil + default: + // Direct command format: ["curl", "-f", "http://localhost:8080/health"] + // -> cmd="curl", args=["-f", "http://localhost:8080/health"] + return test[0], test[1:] + } +} + +// validateAndGetInterval validates and returns the healthcheck interval in seconds +func (r *ConmonOCIRuntime) validateAndGetInterval(interval time.Duration) int { + // Default interval is 30 seconds + if interval <= 0 { + return 30 + } + // Ensure minimum interval of 1 second + if interval < time.Second { + logrus.Warnf("HEALTHCHECK: Interval %v is less than 1 second, using 1 second", interval) + return 1 + } + return int(interval.Seconds()) +} + +// validateAndGetTimeout validates and returns the healthcheck timeout in seconds +func (r *ConmonOCIRuntime) validateAndGetTimeout(timeout time.Duration) int { + // Default timeout is 30 seconds + if timeout <= 0 { + return 30 + } + // Ensure minimum timeout of 1 second + if timeout < time.Second { + logrus.Warnf("HEALTHCHECK: Timeout %v is less than 1 second, using 1 second", timeout) + return 1 + } + return int(timeout.Seconds()) +} + +// validateAndGetRetries validates and returns the healthcheck retries count +func (r *ConmonOCIRuntime) validateAndGetRetries(retries int) int { + // Default retries is 3 + if retries <= 0 { + return 3 + } + // Ensure reasonable maximum retries (conmon should handle this too) + if retries > 10 { + logrus.Warnf("HEALTHCHECK: Retries %d is very high, using 10", retries) + return 10 + } + return retries +} + +// validateAndGetStartPeriod validates and returns the healthcheck start period in seconds +func (r *ConmonOCIRuntime) validateAndGetStartPeriod(startPeriod time.Duration) int { + // Default start period is 0 seconds + if startPeriod < 0 { + logrus.Warnf("HEALTHCHECK: Start period %v is negative, using 0", startPeriod) + return 0 + } + return int(startPeriod.Seconds()) +} diff --git a/libpod/oci_conmon_systemd.go b/libpod/oci_conmon_systemd.go new file mode 100644 index 0000000000..00005e45ac --- /dev/null +++ b/libpod/oci_conmon_systemd.go @@ -0,0 +1,10 @@ +//go:build !remote && (linux || freebsd) && systemd + +package libpod + +// addHealthCheckArgs adds healthcheck-related arguments to conmon for systemd builds +func (r *ConmonOCIRuntime) addHealthCheckArgs(ctr *Container, args []string) []string { + // For systemd builds, healthchecks are managed by systemd timers, not conmon + // No healthcheck CLI arguments needed for conmon + return args +} diff --git a/pkg/specgen/generate/oci_freebsd.go b/pkg/specgen/generate/oci_freebsd.go index 1d8f4a3016..c8ca23220d 100644 --- a/pkg/specgen/generate/oci_freebsd.go +++ b/pkg/specgen/generate/oci_freebsd.go @@ -4,6 +4,7 @@ package generate import ( "context" + "encoding/json" "fmt" "strings" @@ -162,6 +163,49 @@ func SpecGenToOCI(ctx context.Context, s *specgen.SpecGenerator, rt *libpod.Runt configSpec.Annotations[define.InspectAnnotationInit] = define.InspectResponseTrue } + // Add healthcheck configuration as JSON annotation (for conmon) + if s.ContainerHealthCheckConfig.HealthConfig != nil { + // Convert durations to seconds for easier parsing in conmon + healthCheckForConmon := struct { + Test []string `json:"test"` + Interval int `json:"interval"` // seconds + Timeout int `json:"timeout"` // seconds + Retries int `json:"retries"` + StartPeriod int `json:"start_period"` // seconds + }{ + Test: s.ContainerHealthCheckConfig.HealthConfig.Test, + Interval: int(s.ContainerHealthCheckConfig.HealthConfig.Interval.Seconds()), + Timeout: int(s.ContainerHealthCheckConfig.HealthConfig.Timeout.Seconds()), + Retries: s.ContainerHealthCheckConfig.HealthConfig.Retries, + StartPeriod: int(s.ContainerHealthCheckConfig.HealthConfig.StartPeriod.Seconds()), + } + healthCheckJSON, err := json.Marshal(healthCheckForConmon) + if err == nil { + configSpec.Annotations["io.podman.healthcheck"] = string(healthCheckJSON) + } + } + + if s.ContainerHealthCheckConfig.StartupHealthConfig != nil { + // Convert durations to seconds for easier parsing in conmon + startupHealthCheckForConmon := struct { + Test []string `json:"test"` + Interval int `json:"interval"` // seconds + Timeout int `json:"timeout"` // seconds + Retries int `json:"retries"` + StartPeriod int `json:"start_period"` // seconds + }{ + Test: s.ContainerHealthCheckConfig.StartupHealthConfig.Test, + Interval: int(s.ContainerHealthCheckConfig.StartupHealthConfig.Interval.Seconds()), + Timeout: int(s.ContainerHealthCheckConfig.StartupHealthConfig.Timeout.Seconds()), + Retries: s.ContainerHealthCheckConfig.StartupHealthConfig.Retries, + StartPeriod: int(s.ContainerHealthCheckConfig.StartupHealthConfig.StartPeriod.Seconds()), + } + startupHealthCheckJSON, err := json.Marshal(startupHealthCheckForConmon) + if err == nil { + configSpec.Annotations["io.podman.startup-healthcheck"] = string(startupHealthCheckJSON) + } + } + if s.OOMScoreAdj != nil { g.SetProcessOOMScoreAdj(*s.OOMScoreAdj) } diff --git a/pkg/specgen/generate/oci_linux.go b/pkg/specgen/generate/oci_linux.go index 335a82f2a2..3423b129fb 100644 --- a/pkg/specgen/generate/oci_linux.go +++ b/pkg/specgen/generate/oci_linux.go @@ -334,6 +334,57 @@ func SpecGenToOCI(_ context.Context, s *specgen.SpecGenerator, rt *libpod.Runtim configSpec.Annotations[define.InspectAnnotationInit] = define.InspectResponseTrue } + // Add healthcheck configuration as JSON annotation (for conmon) + if s.ContainerHealthCheckConfig.HealthConfig != nil { + logrus.Debugf("HEALTHCHECK: Adding regular healthcheck annotation to OCI spec") + // Convert durations to seconds for easier parsing in conmon + healthCheckForConmon := struct { + Test []string `json:"test"` + Interval int `json:"interval"` // seconds + Timeout int `json:"timeout"` // seconds + Retries int `json:"retries"` + StartPeriod int `json:"start_period"` // seconds + }{ + Test: s.ContainerHealthCheckConfig.HealthConfig.Test, + Interval: int(s.ContainerHealthCheckConfig.HealthConfig.Interval.Seconds()), + Timeout: int(s.ContainerHealthCheckConfig.HealthConfig.Timeout.Seconds()), + Retries: s.ContainerHealthCheckConfig.HealthConfig.Retries, + StartPeriod: int(s.ContainerHealthCheckConfig.HealthConfig.StartPeriod.Seconds()), + } + healthCheckJSON, err := json.Marshal(healthCheckForConmon) + if err == nil { + configSpec.Annotations["io.podman.healthcheck"] = string(healthCheckJSON) + logrus.Debugf("HEALTHCHECK: Added healthcheck annotation: %s", string(healthCheckJSON)) + } else { + logrus.Errorf("HEALTHCHECK: Failed to marshal healthcheck config: %v", err) + } + } + + if s.ContainerHealthCheckConfig.StartupHealthConfig != nil { + logrus.Debugf("HEALTHCHECK: Adding startup healthcheck annotation to OCI spec") + // Convert durations to seconds for easier parsing in conmon + startupHealthCheckForConmon := struct { + Test []string `json:"test"` + Interval int `json:"interval"` // seconds + Timeout int `json:"timeout"` // seconds + Retries int `json:"retries"` + StartPeriod int `json:"start_period"` // seconds + }{ + Test: s.ContainerHealthCheckConfig.StartupHealthConfig.Test, + Interval: int(s.ContainerHealthCheckConfig.StartupHealthConfig.Interval.Seconds()), + Timeout: int(s.ContainerHealthCheckConfig.StartupHealthConfig.Timeout.Seconds()), + Retries: s.ContainerHealthCheckConfig.StartupHealthConfig.Retries, + StartPeriod: int(s.ContainerHealthCheckConfig.StartupHealthConfig.StartPeriod.Seconds()), + } + startupHealthCheckJSON, err := json.Marshal(startupHealthCheckForConmon) + if err == nil { + configSpec.Annotations["io.podman.startup-healthcheck"] = string(startupHealthCheckJSON) + logrus.Debugf("HEALTHCHECK: Added startup healthcheck annotation: %s", string(startupHealthCheckJSON)) + } else { + logrus.Errorf("HEALTHCHECK: Failed to marshal startup healthcheck config: %v", err) + } + } + if s.OOMScoreAdj != nil { g.SetProcessOOMScoreAdj(*s.OOMScoreAdj) }