@@ -46,6 +46,15 @@ const (
46
46
// Important: The conmon attach socket uses an extra byte at the beginning of each
47
47
// message to specify the STREAM so we have to increase the buffer size by one
48
48
bufferSize = conmonConfig .BufSize + 1
49
+
50
+ // Healthcheck message type from conmon (using negative to avoid PID conflicts)
51
+ HealthCheckMsgStatusUpdate = - 100
52
+
53
+ // Healthcheck status values sent by conmon (added to base message type -100)
54
+ HealthCheckStatusNone = 0
55
+ HealthCheckStatusStarting = 1
56
+ HealthCheckStatusHealthy = 2
57
+ HealthCheckStatusUnhealthy = 3
49
58
)
50
59
51
60
// ConmonOCIRuntime is an OCI runtime managed by Conmon.
@@ -981,7 +990,6 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
981
990
if err != nil {
982
991
return 0 , fmt .Errorf ("creating socket pair: %w" , err )
983
992
}
984
- defer errorhandling .CloseQuiet (parentSyncPipe )
985
993
986
994
childStartPipe , parentStartPipe , err := newPipe ()
987
995
if err != nil {
@@ -1038,6 +1046,9 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
1038
1046
args = append (args , "--conmon-pidfile" , ctr .config .ConmonPidFile )
1039
1047
}
1040
1048
1049
+ // Add healthcheck-related arguments (build-conditional)
1050
+ args = r .addHealthCheckArgs (ctr , args )
1051
+
1041
1052
if r .noPivot {
1042
1053
args = append (args , "--no-pivot" )
1043
1054
}
@@ -1199,6 +1210,8 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
1199
1210
// regardless of whether we errored or not, we no longer need the children pipes
1200
1211
childSyncPipe .Close ()
1201
1212
childStartPipe .Close ()
1213
+
1214
+ // Note: parentSyncPipe is NOT closed here because it's used for continuous healthcheck monitoring
1202
1215
if err != nil {
1203
1216
return 0 , err
1204
1217
}
@@ -1219,7 +1232,7 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
1219
1232
return 0 , fmt .Errorf ("conmon failed: %w" , err )
1220
1233
}
1221
1234
1222
- pid , err := readConmonPipeData (r .name , parentSyncPipe , ociLog )
1235
+ pid , err := readConmonPipeData (r .name , parentSyncPipe , ociLog , ctr )
1223
1236
if err != nil {
1224
1237
if err2 := r .DeleteContainer (ctr ); err2 != nil {
1225
1238
logrus .Errorf ("Removing container %s from runtime after creation failed" , ctr .ID ())
@@ -1322,7 +1335,6 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p
1322
1335
logDriverArg = define .NoLogging
1323
1336
case define .PassthroughLogging , define .PassthroughTTYLogging :
1324
1337
logDriverArg = define .PassthroughLogging
1325
- //lint:ignore ST1015 the default case has to be here
1326
1338
default : //nolint:gocritic
1327
1339
// No case here should happen except JSONLogging, but keep this here in case the options are extended
1328
1340
logrus .Errorf ("%s logging specified but not supported. Choosing k8s-file logging instead" , ctr .LogDriver ())
@@ -1390,13 +1402,15 @@ func readConmonPidFile(pidFile string) (int, error) {
1390
1402
return 0 , nil
1391
1403
}
1392
1404
1405
+ // syncInfo is used to return data from monitor process to daemon; it is decoded from one newline-terminated JSON object read off the conmon sync pipe.
1406
+ type syncInfo struct {
1407
+ Data int `json:"data"` // numeric payload: readConmonPipeData treats a negative value as a failure; NOTE(review): positive values appear to be the container PID - confirm against conmon
1408
+ Message string `json:"message,omitempty"` // optional text accompanying Data; may be absent from the JSON
1409
+ }
1410
+
1393
1411
// readConmonPipeData attempts to read a syncInfo struct from the pipe
1394
- func readConmonPipeData (runtimeName string , pipe * os.File , ociLog string ) (int , error ) {
1395
- // syncInfo is used to return data from monitor process to daemon
1396
- type syncInfo struct {
1397
- Data int `json:"data"`
1398
- Message string `json:"message,omitempty"`
1399
- }
1412
+ // If ctr is provided, it will also start continuous healthcheck monitoring
1413
+ func readConmonPipeData (runtimeName string , pipe * os.File , ociLog string , ctr ... * Container ) (int , error ) {
1400
1414
1401
1415
// Wait to get container pid from conmon
1402
1416
type syncStruct struct {
@@ -1408,15 +1422,24 @@ func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int,
1408
1422
var si * syncInfo
1409
1423
rdr := bufio .NewReader (pipe )
1410
1424
b , err := rdr .ReadBytes ('\n' )
1425
+
1426
+ // Log the raw JSON string received from conmon
1427
+ logrus .Debugf ("HEALTHCHECK: Raw JSON received from conmon: %q" , string (b ))
1428
+ logrus .Debugf ("HEALTHCHECK: JSON length: %d bytes" , len (b ))
1429
+
1411
1430
// ignore EOF here, error is returned even when data was read
1412
1431
// if it is no valid json unmarshal will fail below
1413
1432
if err != nil && ! errors .Is (err , io .EOF ) {
1433
+ logrus .Debugf ("HEALTHCHECK: Error reading from conmon pipe: %v" , err )
1414
1434
ch <- syncStruct {err : err }
1435
+ return
1415
1436
}
1416
1437
if err := json .Unmarshal (b , & si ); err != nil {
1438
+ logrus .Debugf ("HEALTHCHECK: Failed to unmarshal JSON from conmon: %v" , err )
1417
1439
ch <- syncStruct {err : fmt .Errorf ("conmon bytes %q: %w" , string (b ), err )}
1418
1440
return
1419
1441
}
1442
+ logrus .Debugf ("HEALTHCHECK: Successfully parsed JSON from conmon: Data=%d, Message=%q" , si .Data , si .Message )
1420
1443
ch <- syncStruct {si : si }
1421
1444
}()
1422
1445
@@ -1436,6 +1459,13 @@ func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int,
1436
1459
return - 1 , fmt .Errorf ("container create failed (no logs from conmon): %w" , ss .err )
1437
1460
}
1438
1461
logrus .Debugf ("Received: %d" , ss .si .Data )
1462
+
1463
+ // Start continuous healthcheck monitoring if container is provided and PID is valid
1464
+ if len (ctr ) > 0 && ctr [0 ] != nil && ss .si .Data > 0 {
1465
+ logrus .Debugf ("HEALTHCHECK: Starting continuous healthcheck monitoring for container %s (PID: %d)" , ctr [0 ].ID (), ss .si .Data )
1466
+ go readConmonHealthCheckPipeData (ctr [0 ], pipe )
1467
+ }
1468
+
1439
1469
if ss .si .Data < 0 {
1440
1470
if ociLog != "" {
1441
1471
ociLogData , err := os .ReadFile (ociLog )
@@ -1459,6 +1489,79 @@ func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int,
1459
1489
return data , nil
1460
1490
}
1461
1491
1492
+ // readConmonHealthCheckPipeData continuously reads healthcheck status updates from conmon
1493
+ func readConmonHealthCheckPipeData (ctr * Container , pipe * os.File ) {
1494
+ logrus .Debugf ("HEALTHCHECK: Starting continuous healthcheck monitoring for container %s" , ctr .ID ())
1495
+
1496
+ rdr := bufio .NewReader (pipe )
1497
+ for {
1498
+ // Read one line from the pipe
1499
+ b , err := rdr .ReadBytes ('\n' )
1500
+ if err != nil {
1501
+ if err == io .EOF {
1502
+ logrus .Debugf ("HEALTHCHECK: Pipe closed for container %s, stopping monitoring" , ctr .ID ())
1503
+ return
1504
+ }
1505
+ logrus .Errorf ("HEALTHCHECK: Error reading from pipe for container %s: %v" , ctr .ID (), err )
1506
+ return
1507
+ }
1508
+
1509
+ // Log the raw JSON string received from conmon
1510
+ logrus .Debugf ("HEALTHCHECK: Raw JSON received from conmon for container %s: %q" , ctr .ID (), string (b ))
1511
+ logrus .Debugf ("HEALTHCHECK: JSON length: %d bytes" , len (b ))
1512
+
1513
+ // Parse the JSON
1514
+ var si syncInfo
1515
+ if err := json .Unmarshal (b , & si ); err != nil {
1516
+ logrus .Errorf ("HEALTHCHECK: Failed to parse JSON from conmon for container %s: %v" , ctr .ID (), err )
1517
+ continue
1518
+ }
1519
+
1520
+ logrus .Debugf ("HEALTHCHECK: Parsed sync info for container %s: Data=%d, Message=%q" , ctr .ID (), si .Data , si .Message )
1521
+
1522
+ // Handle healthcheck status updates based on your new encoding scheme
1523
+ // Base message type is -100, status values are added to it:
1524
+ // -100 + 0 (none) = -100
1525
+ // -100 + 1 (starting) = -99
1526
+ // -100 + 2 (healthy) = -98
1527
+ // -100 + 3 (unhealthy) = -97
1528
+ if si .Data >= HealthCheckMsgStatusUpdate && si .Data <= HealthCheckMsgStatusUpdate + HealthCheckStatusUnhealthy {
1529
+ statusValue := si .Data - HealthCheckMsgStatusUpdate // Convert back to status value
1530
+ var status string
1531
+
1532
+ switch statusValue {
1533
+ case HealthCheckStatusNone :
1534
+ status = define .HealthCheckReset // "reset" or "none"
1535
+ case HealthCheckStatusStarting :
1536
+ status = define .HealthCheckStarting // "starting"
1537
+ case HealthCheckStatusHealthy :
1538
+ status = define .HealthCheckHealthy // "healthy"
1539
+ case HealthCheckStatusUnhealthy :
1540
+ status = define .HealthCheckUnhealthy // "unhealthy"
1541
+ default :
1542
+ logrus .Errorf ("HEALTHCHECK: Unknown status value %d for container %s" , statusValue , ctr .ID ())
1543
+ continue
1544
+ }
1545
+
1546
+ logrus .Infof ("HEALTHCHECK: Received healthcheck status update for container %s: %s (message type: %d, status value: %d)" ,
1547
+ ctr .ID (), status , si .Data , statusValue )
1548
+
1549
+ // Update the container's healthcheck status
1550
+ if err := ctr .updateHealthStatus (status ); err != nil {
1551
+ logrus .Errorf ("HEALTHCHECK: Failed to update healthcheck status for container %s: %v" , ctr .ID (), err )
1552
+ } else {
1553
+ logrus .Infof ("HEALTHCHECK: Successfully updated healthcheck status for container %s to %s" , ctr .ID (), status )
1554
+ }
1555
+ } else if si .Data < 0 {
1556
+ // Other negative message types - might be healthcheck related but not recognized
1557
+ logrus .Debugf ("HEALTHCHECK: Received unrecognized negative message type %d for container %s - might be healthcheck related" , si .Data , ctr .ID ())
1558
+ } else if si .Data > 0 {
1559
+ // Positive message types - not healthcheck related
1560
+ logrus .Debugf ("HEALTHCHECK: Received positive message type %d for container %s - not healthcheck related" , si .Data , ctr .ID ())
1561
+ }
1562
+ }
1563
+ }
1564
+
1462
1565
// writeConmonPipeData writes nonce data to a pipe
1463
1566
func writeConmonPipeData (pipe * os.File ) error {
1464
1567
someData := []byte {0 }
0 commit comments