Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 112 additions & 9 deletions libpod/oci_conmon_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,15 @@ const (
// Important: The conmon attach socket uses an extra byte at the beginning of each
// message to specify the STREAM so we have to increase the buffer size by one
bufferSize = conmonConfig.BufSize + 1

// Healthcheck message type from conmon (using negative to avoid PID conflicts)
HealthCheckMsgStatusUpdate = -100

// Healthcheck status values sent by conmon (added to base message type -100)
HealthCheckStatusNone = 0
HealthCheckStatusStarting = 1
HealthCheckStatusHealthy = 2
HealthCheckStatusUnhealthy = 3
)

// ConmonOCIRuntime is an OCI runtime managed by Conmon.
Expand Down Expand Up @@ -981,7 +990,6 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
if err != nil {
return 0, fmt.Errorf("creating socket pair: %w", err)
}
defer errorhandling.CloseQuiet(parentSyncPipe)

childStartPipe, parentStartPipe, err := newPipe()
if err != nil {
Expand Down Expand Up @@ -1038,6 +1046,9 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
args = append(args, "--conmon-pidfile", ctr.config.ConmonPidFile)
}

// Add healthcheck-related arguments (build-conditional)
args = r.addHealthCheckArgs(ctr, args)

if r.noPivot {
args = append(args, "--no-pivot")
}
Expand Down Expand Up @@ -1199,6 +1210,8 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
// regardless of whether we errored or not, we no longer need the children pipes
childSyncPipe.Close()
childStartPipe.Close()

// Note: parentSyncPipe is NOT closed here because it's used for continuous healthcheck monitoring
if err != nil {
return 0, err
}
Expand All @@ -1219,7 +1232,7 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
return 0, fmt.Errorf("conmon failed: %w", err)
}

pid, err := readConmonPipeData(r.name, parentSyncPipe, ociLog)
pid, err := readConmonPipeData(r.name, parentSyncPipe, ociLog, ctr)
if err != nil {
if err2 := r.DeleteContainer(ctr); err2 != nil {
logrus.Errorf("Removing container %s from runtime after creation failed", ctr.ID())
Expand Down Expand Up @@ -1322,7 +1335,6 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p
logDriverArg = define.NoLogging
case define.PassthroughLogging, define.PassthroughTTYLogging:
logDriverArg = define.PassthroughLogging
//lint:ignore ST1015 the default case has to be here
default: //nolint:gocritic
// No case here should happen except JSONLogging, but keep this here in case the options are extended
logrus.Errorf("%s logging specified but not supported. Choosing k8s-file logging instead", ctr.LogDriver())
Expand Down Expand Up @@ -1390,13 +1402,15 @@ func readConmonPidFile(pidFile string) (int, error) {
return 0, nil
}

// syncInfo is used to return data from monitor process to daemon
type syncInfo struct {
Data int `json:"data"`
Message string `json:"message,omitempty"`
}

// readConmonPipeData attempts to read a syncInfo struct from the pipe
func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int, error) {
// syncInfo is used to return data from monitor process to daemon
type syncInfo struct {
Data int `json:"data"`
Message string `json:"message,omitempty"`
}
// If ctr is provided, it will also start continuous healthcheck monitoring
func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string, ctr ...*Container) (int, error) {

// Wait to get container pid from conmon
type syncStruct struct {
Expand All @@ -1408,15 +1422,24 @@ func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int,
var si *syncInfo
rdr := bufio.NewReader(pipe)
b, err := rdr.ReadBytes('\n')

// Log the raw JSON string received from conmon
logrus.Debugf("HEALTHCHECK: Raw JSON received from conmon: %q", string(b))
logrus.Debugf("HEALTHCHECK: JSON length: %d bytes", len(b))

// ignore EOF here, error is returned even when data was read
// if it is no valid json unmarshal will fail below
if err != nil && !errors.Is(err, io.EOF) {
logrus.Debugf("HEALTHCHECK: Error reading from conmon pipe: %v", err)
ch <- syncStruct{err: err}
return
}
if err := json.Unmarshal(b, &si); err != nil {
logrus.Debugf("HEALTHCHECK: Failed to unmarshal JSON from conmon: %v", err)
ch <- syncStruct{err: fmt.Errorf("conmon bytes %q: %w", string(b), err)}
return
}
logrus.Debugf("HEALTHCHECK: Successfully parsed JSON from conmon: Data=%d, Message=%q", si.Data, si.Message)
ch <- syncStruct{si: si}
}()

Expand All @@ -1436,6 +1459,13 @@ func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int,
return -1, fmt.Errorf("container create failed (no logs from conmon): %w", ss.err)
}
logrus.Debugf("Received: %d", ss.si.Data)

// Start continuous healthcheck monitoring if container is provided and PID is valid
if len(ctr) > 0 && ctr[0] != nil && ss.si.Data > 0 {
logrus.Debugf("HEALTHCHECK: Starting continuous healthcheck monitoring for container %s (PID: %d)", ctr[0].ID(), ss.si.Data)
go readConmonHealthCheckPipeData(ctr[0], pipe)
}

if ss.si.Data < 0 {
if ociLog != "" {
ociLogData, err := os.ReadFile(ociLog)
Expand All @@ -1459,6 +1489,79 @@ func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int,
return data, nil
}

// readConmonHealthCheckPipeData continuously reads healthcheck status updates
// from conmon over the sync pipe and applies them to the container.
//
// Conmon encodes a status update as the base message type
// HealthCheckMsgStatusUpdate (-100) plus the status value (0-3); any other
// message type on the pipe is logged at debug level and ignored. The loop
// terminates when the pipe is closed (conmon exited) or a read error occurs.
// Intended to run in its own goroutine.
func readConmonHealthCheckPipeData(ctr *Container, pipe *os.File) {
	logrus.Debugf("HEALTHCHECK: Starting continuous healthcheck monitoring for container %s", ctr.ID())

	rdr := bufio.NewReader(pipe)
	for {
		b, err := rdr.ReadBytes('\n')
		if err != nil {
			// EOF means conmon closed its end of the pipe; in either
			// case there is nothing more to read, so stop monitoring.
			// Use errors.Is (not ==) to also match wrapped EOFs,
			// consistent with readConmonPipeData.
			if errors.Is(err, io.EOF) {
				logrus.Debugf("HEALTHCHECK: Pipe closed for container %s, stopping monitoring", ctr.ID())
			} else {
				logrus.Errorf("HEALTHCHECK: Error reading from pipe for container %s: %v", ctr.ID(), err)
			}
			return
		}

		logrus.Debugf("HEALTHCHECK: Raw JSON received from conmon for container %s: %q (%d bytes)", ctr.ID(), string(b), len(b))

		var si syncInfo
		if err := json.Unmarshal(b, &si); err != nil {
			// Skip malformed lines rather than aborting monitoring.
			logrus.Errorf("HEALTHCHECK: Failed to parse JSON from conmon for container %s: %v", ctr.ID(), err)
			continue
		}

		logrus.Debugf("HEALTHCHECK: Parsed sync info for container %s: Data=%d, Message=%q", ctr.ID(), si.Data, si.Message)

		// Healthcheck status updates are encoded as the base message type
		// plus the status value:
		//   -100 + 0 (none)      = -100
		//   -100 + 1 (starting)  =  -99
		//   -100 + 2 (healthy)   =  -98
		//   -100 + 3 (unhealthy) =  -97
		if si.Data < HealthCheckMsgStatusUpdate || si.Data > HealthCheckMsgStatusUpdate+HealthCheckStatusUnhealthy {
			// Anything outside the status-update window is not a
			// healthcheck message for us; ignore it.
			logrus.Debugf("HEALTHCHECK: Received non-status message type %d for container %s, ignoring", si.Data, ctr.ID())
			continue
		}

		// Convert back to the raw status value (0-3). The range check
		// above guarantees the switch below is exhaustive.
		statusValue := si.Data - HealthCheckMsgStatusUpdate
		var status string
		switch statusValue {
		case HealthCheckStatusNone:
			status = define.HealthCheckReset
		case HealthCheckStatusStarting:
			status = define.HealthCheckStarting
		case HealthCheckStatusHealthy:
			status = define.HealthCheckHealthy
		case HealthCheckStatusUnhealthy:
			status = define.HealthCheckUnhealthy
		}

		logrus.Infof("HEALTHCHECK: Received healthcheck status update for container %s: %s (message type: %d, status value: %d)",
			ctr.ID(), status, si.Data, statusValue)

		// Persist the new status on the container.
		if err := ctr.updateHealthStatus(status); err != nil {
			logrus.Errorf("HEALTHCHECK: Failed to update healthcheck status for container %s: %v", ctr.ID(), err)
		} else {
			logrus.Infof("HEALTHCHECK: Successfully updated healthcheck status for container %s to %s", ctr.ID(), status)
		}
	}
}

// writeConmonPipeData writes nonce data to a pipe
func writeConmonPipeData(pipe *os.File) error {
someData := []byte{0}
Expand Down
134 changes: 134 additions & 0 deletions libpod/oci_conmon_nosystemd.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
//go:build !remote && (linux || freebsd) && !systemd

package libpod

import (
"strconv"
"time"

"github.com/sirupsen/logrus"
)

// addHealthCheckArgs adds healthcheck-related conmon CLI arguments for
// non-systemd builds, where conmon itself drives the healthchecks.
//
// The container's healthcheck test array is translated into
// --healthcheck-cmd/--healthcheck-arg flags, and the timing parameters are
// clamped to sane defaults before being appended. If the container has no
// healthcheck configuration, or the test resolves to no command, args is
// returned unchanged.
func (r *ConmonOCIRuntime) addHealthCheckArgs(ctr *Container, args []string) []string {
	// Guard clauses keep the happy path flat instead of nesting three deep.
	if !ctr.HasHealthCheck() {
		logrus.Debugf("HEALTHCHECK: Container %s does not have healthcheck config, skipping healthcheck args", ctr.ID())
		return args
	}
	healthConfig := ctr.HealthCheckConfig()
	if healthConfig == nil {
		return args
	}
	logrus.Debugf("HEALTHCHECK: Adding healthcheck CLI args for container %s", ctr.ID())

	// Build healthcheck command and arguments from the test array.
	healthCmd, healthArgs := r.buildHealthcheckCmdAndArgs(healthConfig.Test)
	if healthCmd == "" {
		logrus.Warnf("HEALTHCHECK: Container %s has healthcheck config but no valid command", ctr.ID())
		return args
	}

	args = append(args, "--healthcheck-cmd", healthCmd)
	for _, arg := range healthArgs {
		args = append(args, "--healthcheck-arg", arg)
	}

	// Validate the optional parameters, falling back to defaults where the
	// configuration is unset or out of range.
	interval := r.validateAndGetInterval(healthConfig.Interval)
	timeout := r.validateAndGetTimeout(healthConfig.Timeout)
	retries := r.validateAndGetRetries(healthConfig.Retries)
	startPeriod := r.validateAndGetStartPeriod(healthConfig.StartPeriod)

	args = append(args,
		"--healthcheck-interval", strconv.Itoa(interval),
		"--healthcheck-timeout", strconv.Itoa(timeout),
		"--healthcheck-retries", strconv.Itoa(retries),
		"--healthcheck-start-period", strconv.Itoa(startPeriod),
	)

	logrus.Debugf("HEALTHCHECK: Added healthcheck args for container %s: cmd=%s, args=%v, interval=%ds, timeout=%ds, retries=%d, start-period=%ds",
		ctr.ID(), healthCmd, healthArgs, interval, timeout, retries, startPeriod)
	return args
}

// buildHealthcheckCmdAndArgs translates a Podman healthcheck test array into
// the command and argument list handed to conmon.
//
// Supported forms:
//   - ["CMD", cmd, args...]     -> cmd, args
//   - ["CMD-SHELL", script]     -> "/bin/sh", ["-c", script]
//   - [cmd, args...]            -> cmd, args (direct command form)
//   - [], [""], ["NONE", ...]   -> "", nil (no healthcheck)
func (r *ConmonOCIRuntime) buildHealthcheckCmdAndArgs(test []string) (string, []string) {
	if len(test) == 0 {
		return "", nil
	}

	first, rest := test[0], test[1:]
	switch first {
	case "", "NONE":
		// Healthcheck explicitly disabled.
		return "", nil
	case "CMD":
		// e.g. ["CMD", "curl", "-f", "http://localhost:8080/health"]
		if len(rest) == 0 {
			return "", nil
		}
		return rest[0], rest[1:]
	case "CMD-SHELL":
		// e.g. ["CMD-SHELL", "curl -f http://localhost:8080/health"]
		// runs the whole string through a shell.
		if len(rest) == 0 {
			return "", nil
		}
		return "/bin/sh", []string{"-c", rest[0]}
	default:
		// Bare command form, e.g. ["curl", "-f", "..."].
		return first, rest
	}
}

// validateAndGetInterval clamps the healthcheck interval to a usable value and
// returns it in whole seconds: 30s when unset/non-positive, minimum 1s.
func (r *ConmonOCIRuntime) validateAndGetInterval(interval time.Duration) int {
	switch {
	case interval <= 0:
		// Unset or invalid: use the 30-second default.
		return 30
	case interval < time.Second:
		logrus.Warnf("HEALTHCHECK: Interval %v is less than 1 second, using 1 second", interval)
		return 1
	default:
		return int(interval.Seconds())
	}
}

// validateAndGetTimeout clamps the healthcheck timeout to a usable value and
// returns it in whole seconds: 30s when unset/non-positive, minimum 1s.
func (r *ConmonOCIRuntime) validateAndGetTimeout(timeout time.Duration) int {
	switch {
	case timeout <= 0:
		// Unset or invalid: use the 30-second default.
		return 30
	case timeout < time.Second:
		logrus.Warnf("HEALTHCHECK: Timeout %v is less than 1 second, using 1 second", timeout)
		return 1
	default:
		return int(timeout.Seconds())
	}
}

// validateAndGetRetries clamps the healthcheck retry count: 3 when
// unset/non-positive, capped at 10.
func (r *ConmonOCIRuntime) validateAndGetRetries(retries int) int {
	switch {
	case retries <= 0:
		// Unset or invalid: use the default of 3 retries.
		return 3
	case retries > 10:
		logrus.Warnf("HEALTHCHECK: Retries %d is very high, using 10", retries)
		return 10
	default:
		return retries
	}
}

// validateAndGetStartPeriod returns the healthcheck start period in whole
// seconds, clamping negative values to 0.
func (r *ConmonOCIRuntime) validateAndGetStartPeriod(startPeriod time.Duration) int {
	if startPeriod >= 0 {
		return int(startPeriod.Seconds())
	}
	logrus.Warnf("HEALTHCHECK: Start period %v is negative, using 0", startPeriod)
	return 0
}
10 changes: 10 additions & 0 deletions libpod/oci_conmon_systemd.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
//go:build !remote && (linux || freebsd) && systemd

package libpod

// addHealthCheckArgs is a no-op on systemd builds: healthchecks are driven by
// systemd timers rather than conmon, so no extra CLI arguments are needed.
func (r *ConmonOCIRuntime) addHealthCheckArgs(_ *Container, args []string) []string {
	return args
}
Loading